chronicle-etl 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +98 -73
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +50 -45
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +10 -8
  13. data/lib/chronicle/etl/cli/connectors.rb +9 -9
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +29 -26
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +20 -7
  20. data/lib/chronicle/etl/configurable.rb +24 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +39 -27
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +3 -3
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +117 -0
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
  45. data/lib/chronicle/etl/registry/plugins.rb +171 -0
  46. data/lib/chronicle/etl/registry/registry.rb +3 -52
  47. data/lib/chronicle/etl/registry/self_registering.rb +1 -1
  48. data/lib/chronicle/etl/runner.rb +158 -128
  49. data/lib/chronicle/etl/secrets.rb +5 -5
  50. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  51. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  52. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  53. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  54. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  55. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  56. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  57. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  58. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  60. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  61. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  62. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  63. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  64. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  65. data/lib/chronicle/etl/version.rb +1 -1
  66. data/lib/chronicle/etl.rb +6 -8
  67. metadata +91 -45
  68. data/lib/chronicle/etl/models/activity.rb +0 -15
  69. data/lib/chronicle/etl/models/attachment.rb +0 -14
  70. data/lib/chronicle/etl/models/base.rb +0 -122
  71. data/lib/chronicle/etl/models/entity.rb +0 -29
  72. data/lib/chronicle/etl/models/raw.rb +0 -26
  73. data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
  74. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  75. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  76. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  77. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  78. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  79. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,133 +1,163 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'colorize'
2
4
  require 'chronic_duration'
3
- require "tty-spinner"
4
-
5
- class Chronicle::ETL::Runner
6
- def initialize(job)
7
- @job = job
8
- @job_logger = Chronicle::ETL::JobLogger.new(@job)
9
- end
10
-
11
- def run!
12
- begin_job
13
- validate_job
14
- instantiate_connectors
15
- prepare_job
16
- prepare_ui
17
- run_extraction
18
- rescue Chronicle::ETL::ExtractionError => e
19
- @job_logger&.error
20
- raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
21
- rescue Interrupt
22
- @job_logger&.error
23
- raise(Chronicle::ETL::RunInterruptedError, "Job interrupted.")
24
- rescue StandardError => e
25
- # Just throwing this in here until we have better exception handling in
26
- # loaders, etc
27
- @job_logger&.error
28
- raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
29
- ensure
30
- finish_job
31
- end
32
-
33
- private
34
-
35
- def begin_job
36
- Chronicle::ETL::Logger.info(tty_log_job_initialize)
37
- @initialization_spinner = TTY::Spinner.new(":spinner :title", format: :dots_2)
38
- end
39
-
40
- def validate_job
41
- @initialization_spinner.update(title: "Validating job")
42
- @job.job_definition.validate!
43
- end
44
-
45
- def instantiate_connectors
46
- @initialization_spinner.update(title: "Initializing connectors")
47
- @extractor = @job.instantiate_extractor
48
- @loader = @job.instantiate_loader
49
- end
50
-
51
- def prepare_job
52
- @initialization_spinner.update(title: "Preparing job")
53
- @job_logger.start
54
- @loader.start
55
-
56
- @initialization_spinner.update(title: "Preparing extraction")
57
- @initialization_spinner.auto_spin
58
- @extractor.prepare
59
- @initialization_spinner.success("(#{'successful'.green})")
60
- Chronicle::ETL::Logger.info("\n")
61
- end
62
-
63
- def prepare_ui
64
- total = @extractor.results_count
65
- @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
66
- Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
67
- end
68
-
69
- def run_extraction
70
- @extractor.extract do |extraction|
71
- process_extraction(extraction)
72
- @progress_bar.increment
5
+ require 'tty-spinner'
6
+
7
+ module Chronicle
8
+ module ETL
9
+ class Runner
10
+ def initialize(job)
11
+ @job = job
12
+ @job_logger = Chronicle::ETL::JobLogger.new(@job)
13
+ end
14
+
15
+ def run!
16
+ begin_job
17
+ validate_job
18
+ instantiate_connectors
19
+ prepare_job
20
+ prepare_ui
21
+ run_extraction
22
+ rescue Chronicle::ETL::ExtractionError => e
23
+ @job_logger&.error
24
+ raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
25
+ rescue Interrupt
26
+ @job_logger&.error
27
+ raise(Chronicle::ETL::RunInterruptedError, 'Job interrupted.')
28
+ # rescue StandardError => e
29
+ # # Just throwing this in here until we have better exception handling in
30
+ # # loaders, etc
31
+ # @job_logger&.error
32
+ # raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
33
+ ensure
34
+ finish_job
35
+ end
36
+
37
+ private
38
+
39
+ def begin_job
40
+ Chronicle::ETL::Logger.info(tty_log_job_initialize)
41
+ @initialization_spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
42
+ end
43
+
44
+ def validate_job
45
+ @initialization_spinner.update(title: 'Validating job')
46
+ @job.job_definition.validate!
47
+ end
48
+
49
+ def instantiate_connectors
50
+ @initialization_spinner.update(title: 'Initializing connectors')
51
+ @extractor = @job.instantiate_extractor
52
+ @transformers = @job.instantiate_transformers
53
+ @loader = @job.instantiate_loader
54
+ end
55
+
56
+ def prepare_job
57
+ @initialization_spinner.update(title: 'Preparing job')
58
+ @job_logger.start
59
+ @loader.start
60
+
61
+ @initialization_spinner.update(title: 'Preparing extraction')
62
+ @initialization_spinner.auto_spin
63
+ @extractor.prepare
64
+ @initialization_spinner.success("(#{'successful'.green})")
65
+ Chronicle::ETL::Logger.info("\n")
66
+ end
67
+
68
+ def prepare_ui
69
+ total = @extractor.results_count
70
+ @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
71
+ Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
72
+ end
73
+
74
+ def run_extraction
75
+ # Pattern based on Kiba's StreamingRunner
76
+ # https://github.com/thbar/kiba/blob/master/lib/kiba/streaming_runner.rb
77
+ stream = extractor_stream
78
+ recurser = ->(s, t) { transform_stream(s, t) }
79
+ @transformers.reduce(stream, &recurser).each do |record|
80
+ Chronicle::ETL::Logger.debug(tty_log_transformation(record))
81
+ @job_logger.log_transformation(record)
82
+ @progress_bar.increment
83
+ load_record(record)
84
+ end
85
+
86
+ @progress_bar.finish
87
+
88
+ # This is typically a slow method (writing to stdout, writing a big file, etc)
89
+ # TODO: consider adding a spinner?
90
+ @loader.finish
91
+ @job_logger.finish
92
+ end
93
+
94
+ # Initial stream of extracted data, wrapped in a Record class
95
+ def extractor_stream
96
+ Enumerator.new do |y|
97
+ @extractor.extract do |extraction|
98
+ record = Chronicle::ETL::Record.new(data: extraction.data, extraction: extraction)
99
+ y << record
100
+ end
101
+ end
102
+ end
103
+
104
+ # For a given stream of records and a given transformer,
105
+ # returns a new stream of transformed records and finally
106
+ # calls the finish method on the transformer
107
+ def transform_stream(stream, transformer)
108
+ Enumerator.new do |y|
109
+ stream.each do |record|
110
+ transformer.call(record) do |transformed_record|
111
+ y << transformed_record
112
+ end
113
+ end
114
+
115
+ transformer.call_finish do |transformed_record|
116
+ y << transformed_record
117
+ end
118
+ end
119
+ end
120
+
121
+ def load_record(record)
122
+ @loader.load(record.data) unless @job.dry_run?
123
+ end
124
+
125
+ def finish_job
126
+ @job_logger.save
127
+ @progress_bar&.finish
128
+ Chronicle::ETL::Logger.detach_from_ui
129
+ Chronicle::ETL::Logger.info(tty_log_completion)
130
+ end
131
+
132
+ def tty_log_job_initialize
133
+ output = 'Beginning job '
134
+ output += "'#{@job.name}'".bold if @job.name
135
+ output
136
+ end
137
+
138
+ def tty_log_transformation(record)
139
+ output = ' ✓'.green
140
+ output + " #{record}"
141
+ end
142
+
143
+ def tty_log_transformation_failure(exception, transformer)
144
+ output = ' ✖'.red
145
+ output + " Failed to transform #{transformer}. #{exception.message}"
146
+ end
147
+
148
+ def tty_log_completion
149
+ status = @job_logger.success ? 'Success' : 'Failed'
150
+ job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
151
+ output = "\n#{job_completion} job"
152
+ output += " '#{@job.name}'".bold if @job.name
153
+ output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
154
+ output += "\n Status:\t".light_black + status
155
+ output += "\n Completed:\t".light_black + @job_logger.job_log.num_records_processed.to_s
156
+ if @job_logger.job_log.highest_timestamp
157
+ output += "\n Latest:\t".light_black + @job_logger.job_log.highest_timestamp.iso8601.to_s
158
+ end
159
+ output
160
+ end
73
161
  end
74
-
75
- @progress_bar.finish
76
-
77
- # This is typically a slow method (writing to stdout, writing a big file, etc)
78
- # TODO: consider adding a spinner?
79
- @loader.finish
80
- @job_logger.finish
81
- end
82
-
83
- def process_extraction(extraction)
84
- # For each extraction from our extractor, we create a new tarnsformer
85
- transformer = @job.instantiate_transformer(extraction)
86
-
87
- # And then transform that record, logging it if we're in debug log level
88
- record = transformer.transform
89
- Chronicle::ETL::Logger.debug(tty_log_transformation(transformer))
90
- @job_logger.log_transformation(transformer)
91
-
92
- # Then send the results to the loader
93
- @loader.load(record) unless @job.dry_run?
94
- rescue Chronicle::ETL::TransformationError => e
95
- # TODO: have an option to cancel job if we encounter an error
96
- Chronicle::ETL::Logger.error(tty_log_transformation_failure(e, transformer))
97
- end
98
-
99
- def finish_job
100
- @job_logger.save
101
- @progress_bar&.finish
102
- Chronicle::ETL::Logger.detach_from_ui
103
- Chronicle::ETL::Logger.info(tty_log_completion)
104
- end
105
-
106
- def tty_log_job_initialize
107
- output = "Beginning job "
108
- output += "'#{@job.name}'".bold if @job.name
109
- output
110
- end
111
-
112
- def tty_log_transformation(transformer)
113
- output = " ✓".green
114
- output += " #{transformer}"
115
- end
116
-
117
- def tty_log_transformation_failure(exception, transformer)
118
- output = " ✖".red
119
- output += " Failed to build #{transformer}. #{exception.message}"
120
- end
121
-
122
- def tty_log_completion
123
- status = @job_logger.success ? 'Success' : 'Failed'
124
- job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
125
- output = "\n#{job_completion} job"
126
- output += " '#{@job.name}'".bold if @job.name
127
- output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
128
- output += "\n Status:\t".light_black + status
129
- output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
130
- output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
131
- output
132
162
  end
133
163
  end
@@ -1,4 +1,4 @@
1
- require "active_support/core_ext/hash/keys"
1
+ require 'active_support/core_ext/hash/keys'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
@@ -8,7 +8,7 @@ module Chronicle
8
8
 
9
9
  # Whether a given namespace exists
10
10
  def exists?(namespace)
11
- Chronicle::ETL::Config.exists?("secrets", namespace)
11
+ Chronicle::ETL::Config.exists?('secrets', namespace)
12
12
  end
13
13
 
14
14
  # Save a setting to a namespaced config file
@@ -47,7 +47,7 @@ module Chronicle
47
47
 
48
48
  # Read secrets from a config file
49
49
  def read(namespace)
50
- definition = Chronicle::ETL::Config.load("secrets", namespace)
50
+ definition = Chronicle::ETL::Config.load('secrets', namespace)
51
51
  definition[:secrets] || {}
52
52
  end
53
53
 
@@ -56,8 +56,8 @@ module Chronicle
56
56
  data = {
57
57
  secrets: (secrets || {}).transform_keys(&:to_s),
58
58
  chronicle_etl_version: Chronicle::ETL::VERSION
59
- }.deep_stringify_keys
60
- Chronicle::ETL::Config.write("secrets", namespace, data)
59
+ }
60
+ Chronicle::ETL::Config.write('secrets', namespace, data)
61
61
  end
62
62
 
63
63
  # Which config files are available in ~/.config/chronicle/etl/secrets
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class BufferTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :buffer
8
+ r.description = 'by buffering'
9
+ end
10
+
11
+ setting :size, default: 10, description: 'The size of the buffer'
12
+
13
+ def transform(record)
14
+ stash_record(record)
15
+
16
+ # FIXME: this doesn't seem to be working with the runner
17
+ return if @stashed_records.size < @config.size
18
+
19
+ # FIXME: this will result in the wrong extraction being associated with
20
+ # the batch of flushed records
21
+ flush_stashed_records.map(&:data)
22
+ end
23
+
24
+ def finish
25
+ flush_stashed_records
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class ChronicleTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :chronicle
8
+ r.description = 'records to Chronicle schema'
9
+ end
10
+
11
+ def transform(record)
12
+ converter_klass = find_converter(record.extraction)
13
+ # TODO: handle missing converter
14
+
15
+ converter_klass.new.call(record) do |transformed_record|
16
+ yield transformed_record.data
17
+ end
18
+ end
19
+
20
+ private
21
+
22
+ def find_converter(extraction)
23
+ Chronicle::ETL::Registry::Connectors.find_converter_for_source(
24
+ source: extraction.source,
25
+ type: extraction.type,
26
+ strategy: extraction.strategy,
27
+ target: :chronicle
28
+ )&.klass
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class ChronobaseTransformer < Chronicle::ETL::Transformer
6
+ PROPERTY_MAP = {
7
+ source: :provider,
8
+ source_id: :provider_id,
9
+ url: :provider_url,
10
+ end_time: :end_at,
11
+ start_time: :start_at,
12
+
13
+ name: :title,
14
+ description: :body,
15
+ text: :body,
16
+
17
+ recipient: :consumers,
18
+ agent: :actor,
19
+ object: :involved,
20
+
21
+ # music ones
22
+ by_artist: :creators,
23
+ in_album: :containers
24
+ }.freeze
25
+
26
+ VERB_MAP = {
27
+ ListenAction: 'listened',
28
+ CommunicateAction: 'messaged'
29
+ }.freeze
30
+
31
+ ENTITY_MAP = {
32
+ MusicRecording: 'song',
33
+ MusicAlbum: 'album',
34
+ MusicGroup: 'musicartist',
35
+ Message: 'message',
36
+ Person: 'person'
37
+ }.freeze
38
+
39
+ register_connector do |r|
40
+ r.identifier = :chronobase
41
+ r.description = 'records to chronobase schema'
42
+ end
43
+
44
+ def transform(record)
45
+ deeply_convert_record(record.data)
46
+ end
47
+
48
+ private
49
+
50
+ def deeply_convert_record(record)
51
+ type = activity?(record) ? 'activity' : 'entity'
52
+
53
+ properties = record.properties.compact.each_with_object({}) do |(k, v), h|
54
+ key = PROPERTY_MAP[k.to_sym] || k
55
+ h[key] = v
56
+ end
57
+
58
+ properties[:verb] = VERB_MAP[record.type_id.to_sym] if VERB_MAP.key?(record.type_id.to_sym)
59
+ properties[:represents] = ENTITY_MAP[record.type_id.to_sym] if ENTITY_MAP.key?(record.type_id.to_sym)
60
+
61
+ properties.transform_values! do |v|
62
+ case v
63
+ when Chronicle::Models::Base
64
+ deeply_convert_record(v)
65
+ when Array
66
+ v.map { |e| e.is_a?(Chronicle::Models::Base) ? deeply_convert_record(e) : e }
67
+ else
68
+ v
69
+ end
70
+ end
71
+
72
+ Chronicle::Serialization::Record.new(
73
+ id: record.id,
74
+ type: type,
75
+ properties: properties.compact,
76
+ meta: {
77
+ dedupe_on: transform_dedupe_on(record)
78
+ },
79
+ schema: 'chronobase'
80
+ )
81
+ end
82
+
83
+ def activity?(record)
84
+ record.type_id.end_with?('Action')
85
+ end
86
+
87
+ def transform_dedupe_on(record)
88
+ property_map_with_type = PROPERTY_MAP.merge({
89
+ type: activity?(record) ? :verb : :represents
90
+ })
91
+
92
+ record.dedupe_on.map do |set|
93
+ set.map do |d|
94
+ property_map_with_type[d] || d
95
+ end.join(',')
96
+ end
97
+ end
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'chronicle/utils/hash_utils'
4
+
5
+ module Chronicle
6
+ module ETL
7
+ # A transformer that flattens a record's fields and returns a new hash containing only the first N of them.
8
+ class FieldsLimitTransformer < Chronicle::ETL::Transformer
9
+ register_connector do |r|
10
+ r.identifier = :fields_limit
11
+ r.description = 'by taking first N fields'
12
+ end
13
+
14
+ setting :limit, type: :numeric, default: 10
15
+
16
+ def transform(record)
17
+ # flatten the hash and then take the first `limit` fields
18
+
19
+ Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h).first(@config.limit).to_h
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
6
+ class FilterFieldsTransformer < Chronicle::ETL::Transformer
7
+ register_connector do |r|
8
+ r.identifier = :filter_fields
9
+ r.description = 'by taking a subset of the fields'
10
+ end
11
+
12
+ setting :fields, type: :array, default: []
13
+
14
+ def transform(record)
15
+ hash = record.data.to_h.deep_transform_keys(&:to_sym)
16
+ filter_hash(hash, @config.fields.map)
17
+ end
18
+
19
+ private
20
+
21
+ def access_nested_value(data, path)
22
+ keys = path.split('.')
23
+ keys.reduce(data) do |acc, key|
24
+ if acc.is_a?(Array)
25
+ acc.map do |item|
26
+ item[key.to_sym]
27
+ rescue StandardError
28
+ nil
29
+ end
30
+ .compact
31
+ elsif key.include?('[')
32
+ key, index = key.split(/\[|\]/).reject(&:empty?)
33
+ acc = acc[key.to_sym] if acc
34
+ acc.is_a?(Array) ? acc[index.to_i] : nil
35
+ else
36
+ acc&.dig(key.to_sym)
37
+ end
38
+ end
39
+ end
40
+
41
+ def filter_hash(original_hash, fields)
42
+ fields.each_with_object({}) do |field, result|
43
+ value = access_nested_value(original_hash, field)
44
+ keys = field.split('.')
45
+ last_key = keys.pop.to_sym
46
+
47
+ current = result
48
+ keys.each do |key|
49
+ key = key.to_sym
50
+ key, = key.to_s.split(/\[|\]/) if key.to_s.include?('[')
51
+ current[key] ||= {}
52
+ current = current[key]
53
+ end
54
+
55
+ current[last_key] = value
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # Return only records that match all the conditions of the filters
6
+ # setting.
7
+ class FilterTransformer < Chronicle::ETL::Transformer
8
+ register_connector do |r|
9
+ r.identifier = :filter
10
+ r.description = 'by only accepting records that match conditions'
11
+ end
12
+
13
+ setting :filters, type: :hash
14
+
15
+ def transform(record)
16
+ record_hash = record.data.to_h
17
+
18
+ @config.filters.each do |key, value|
19
+ path = key.split('.').map do |k|
20
+ k.match?(/^\d+$/) ? k.to_i : k.to_sym
21
+ end
22
+
23
+ return nil unless record_hash.dig(*path) == value
24
+ end
25
+
26
+ record.data
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class FormatTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :format
8
+ r.description = 'records to a differnet hash/json format'
9
+ end
10
+
11
+ setting :format, default: nil
12
+
13
+ def transform(record)
14
+ serializer = find_serializer(@config.format)
15
+ serializer.serialize(record.data)
16
+ end
17
+
18
+ private
19
+
20
+ def find_serializer(format)
21
+ case format
22
+ when 'jsonld'
23
+ Chronicle::Serialization::JSONLDSerializer
24
+ when 'jsonapi'
25
+ Chronicle::Serialization::JSONAPISerializer
26
+ else
27
+ raise 'unknown format'
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MergeMetaTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :merge_meta
8
+ r.description = 'merge extraction meta fields into the record'
9
+ end
10
+
11
+ def transform(record)
12
+ record.data unless record.extraction&.meta
13
+
14
+ record.data[:_meta] = record.extraction.meta
15
+ record.data
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MultiplyTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :multiply
8
+ r.description = 'by taking a sample'
9
+ end
10
+
11
+ setting :n, default: 2, type: :numeric
12
+
13
+ # yield the record's data n times, multiplying each record in the stream
14
+ def transform(record)
15
+ @config.n.to_i.times do
16
+ yield record.data
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end