chronicle-etl 0.5.4 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +98 -73
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +50 -45
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +10 -8
  13. data/lib/chronicle/etl/cli/connectors.rb +9 -9
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +29 -26
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +20 -7
  20. data/lib/chronicle/etl/configurable.rb +24 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +39 -27
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +3 -3
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +117 -0
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
  45. data/lib/chronicle/etl/registry/plugins.rb +171 -0
  46. data/lib/chronicle/etl/registry/registry.rb +3 -52
  47. data/lib/chronicle/etl/registry/self_registering.rb +1 -1
  48. data/lib/chronicle/etl/runner.rb +158 -128
  49. data/lib/chronicle/etl/secrets.rb +5 -5
  50. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  51. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  52. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  53. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  54. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  55. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  56. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  57. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  58. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  60. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  61. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  62. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  63. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  64. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  65. data/lib/chronicle/etl/version.rb +1 -1
  66. data/lib/chronicle/etl.rb +6 -8
  67. metadata +91 -45
  68. data/lib/chronicle/etl/models/activity.rb +0 -15
  69. data/lib/chronicle/etl/models/attachment.rb +0 -14
  70. data/lib/chronicle/etl/models/base.rb +0 -122
  71. data/lib/chronicle/etl/models/entity.rb +0 -29
  72. data/lib/chronicle/etl/models/raw.rb +0 -26
  73. data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
  74. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  75. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  76. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  77. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  78. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  79. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -1,133 +1,163 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'colorize'
2
4
  require 'chronic_duration'
3
- require "tty-spinner"
4
-
5
- class Chronicle::ETL::Runner
6
- def initialize(job)
7
- @job = job
8
- @job_logger = Chronicle::ETL::JobLogger.new(@job)
9
- end
10
-
11
- def run!
12
- begin_job
13
- validate_job
14
- instantiate_connectors
15
- prepare_job
16
- prepare_ui
17
- run_extraction
18
- rescue Chronicle::ETL::ExtractionError => e
19
- @job_logger&.error
20
- raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
21
- rescue Interrupt
22
- @job_logger&.error
23
- raise(Chronicle::ETL::RunInterruptedError, "Job interrupted.")
24
- rescue StandardError => e
25
- # Just throwing this in here until we have better exception handling in
26
- # loaders, etc
27
- @job_logger&.error
28
- raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
29
- ensure
30
- finish_job
31
- end
32
-
33
- private
34
-
35
- def begin_job
36
- Chronicle::ETL::Logger.info(tty_log_job_initialize)
37
- @initialization_spinner = TTY::Spinner.new(":spinner :title", format: :dots_2)
38
- end
39
-
40
- def validate_job
41
- @initialization_spinner.update(title: "Validating job")
42
- @job.job_definition.validate!
43
- end
44
-
45
- def instantiate_connectors
46
- @initialization_spinner.update(title: "Initializing connectors")
47
- @extractor = @job.instantiate_extractor
48
- @loader = @job.instantiate_loader
49
- end
50
-
51
- def prepare_job
52
- @initialization_spinner.update(title: "Preparing job")
53
- @job_logger.start
54
- @loader.start
55
-
56
- @initialization_spinner.update(title: "Preparing extraction")
57
- @initialization_spinner.auto_spin
58
- @extractor.prepare
59
- @initialization_spinner.success("(#{'successful'.green})")
60
- Chronicle::ETL::Logger.info("\n")
61
- end
62
-
63
- def prepare_ui
64
- total = @extractor.results_count
65
- @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
66
- Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
67
- end
68
-
69
- def run_extraction
70
- @extractor.extract do |extraction|
71
- process_extraction(extraction)
72
- @progress_bar.increment
5
+ require 'tty-spinner'
6
+
7
+ module Chronicle
8
+ module ETL
9
+ class Runner
10
+ def initialize(job)
11
+ @job = job
12
+ @job_logger = Chronicle::ETL::JobLogger.new(@job)
13
+ end
14
+
15
+ def run!
16
+ begin_job
17
+ validate_job
18
+ instantiate_connectors
19
+ prepare_job
20
+ prepare_ui
21
+ run_extraction
22
+ rescue Chronicle::ETL::ExtractionError => e
23
+ @job_logger&.error
24
+ raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
25
+ rescue Interrupt
26
+ @job_logger&.error
27
+ raise(Chronicle::ETL::RunInterruptedError, 'Job interrupted.')
28
+ # rescue StandardError => e
29
+ # # Just throwing this in here until we have better exception handling in
30
+ # # loaders, etc
31
+ # @job_logger&.error
32
+ # raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
33
+ ensure
34
+ finish_job
35
+ end
36
+
37
+ private
38
+
39
+ def begin_job
40
+ Chronicle::ETL::Logger.info(tty_log_job_initialize)
41
+ @initialization_spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
42
+ end
43
+
44
+ def validate_job
45
+ @initialization_spinner.update(title: 'Validating job')
46
+ @job.job_definition.validate!
47
+ end
48
+
49
+ def instantiate_connectors
50
+ @initialization_spinner.update(title: 'Initializing connectors')
51
+ @extractor = @job.instantiate_extractor
52
+ @transformers = @job.instantiate_transformers
53
+ @loader = @job.instantiate_loader
54
+ end
55
+
56
+ def prepare_job
57
+ @initialization_spinner.update(title: 'Preparing job')
58
+ @job_logger.start
59
+ @loader.start
60
+
61
+ @initialization_spinner.update(title: 'Preparing extraction')
62
+ @initialization_spinner.auto_spin
63
+ @extractor.prepare
64
+ @initialization_spinner.success("(#{'successful'.green})")
65
+ Chronicle::ETL::Logger.info("\n")
66
+ end
67
+
68
+ def prepare_ui
69
+ total = @extractor.results_count
70
+ @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
71
+ Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
72
+ end
73
+
74
+ def run_extraction
75
+ # Pattern based on Kiba's StreamingRunner
76
+ # https://github.com/thbar/kiba/blob/master/lib/kiba/streaming_runner.rb
77
+ stream = extractor_stream
78
+ recurser = ->(s, t) { transform_stream(s, t) }
79
+ @transformers.reduce(stream, &recurser).each do |record|
80
+ Chronicle::ETL::Logger.debug(tty_log_transformation(record))
81
+ @job_logger.log_transformation(record)
82
+ @progress_bar.increment
83
+ load_record(record)
84
+ end
85
+
86
+ @progress_bar.finish
87
+
88
+ # This is typically a slow method (writing to stdout, writing a big file, etc)
89
+ # TODO: consider adding a spinner?
90
+ @loader.finish
91
+ @job_logger.finish
92
+ end
93
+
94
+ # Initial stream of extracted data, wrapped in a Record class
95
+ def extractor_stream
96
+ Enumerator.new do |y|
97
+ @extractor.extract do |extraction|
98
+ record = Chronicle::ETL::Record.new(data: extraction.data, extraction: extraction)
99
+ y << record
100
+ end
101
+ end
102
+ end
103
+
104
+ # For a given stream of records and a given transformer,
105
+ # returns a new stream of transformed records and finally
106
+ # calls the finish method on the transformer
107
+ def transform_stream(stream, transformer)
108
+ Enumerator.new do |y|
109
+ stream.each do |record|
110
+ transformer.call(record) do |transformed_record|
111
+ y << transformed_record
112
+ end
113
+ end
114
+
115
+ transformer.call_finish do |transformed_record|
116
+ y << transformed_record
117
+ end
118
+ end
119
+ end
120
+
121
+ def load_record(record)
122
+ @loader.load(record.data) unless @job.dry_run?
123
+ end
124
+
125
+ def finish_job
126
+ @job_logger.save
127
+ @progress_bar&.finish
128
+ Chronicle::ETL::Logger.detach_from_ui
129
+ Chronicle::ETL::Logger.info(tty_log_completion)
130
+ end
131
+
132
+ def tty_log_job_initialize
133
+ output = 'Beginning job '
134
+ output += "'#{@job.name}'".bold if @job.name
135
+ output
136
+ end
137
+
138
+ def tty_log_transformation(record)
139
+ output = ' ✓'.green
140
+ output + " #{record}"
141
+ end
142
+
143
+ def tty_log_transformation_failure(exception, transformer)
144
+ output = ' ✖'.red
145
+ output + " Failed to transform #{transformer}. #{exception.message}"
146
+ end
147
+
148
+ def tty_log_completion
149
+ status = @job_logger.success ? 'Success' : 'Failed'
150
+ job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
151
+ output = "\n#{job_completion} job"
152
+ output += " '#{@job.name}'".bold if @job.name
153
+ output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
154
+ output += "\n Status:\t".light_black + status
155
+ output += "\n Completed:\t".light_black + @job_logger.job_log.num_records_processed.to_s
156
+ if @job_logger.job_log.highest_timestamp
157
+ output += "\n Latest:\t".light_black + @job_logger.job_log.highest_timestamp.iso8601.to_s
158
+ end
159
+ output
160
+ end
73
161
  end
74
-
75
- @progress_bar.finish
76
-
77
- # This is typically a slow method (writing to stdout, writing a big file, etc)
78
- # TODO: consider adding a spinner?
79
- @loader.finish
80
- @job_logger.finish
81
- end
82
-
83
- def process_extraction(extraction)
84
- # For each extraction from our extractor, we create a new tarnsformer
85
- transformer = @job.instantiate_transformer(extraction)
86
-
87
- # And then transform that record, logging it if we're in debug log level
88
- record = transformer.transform
89
- Chronicle::ETL::Logger.debug(tty_log_transformation(transformer))
90
- @job_logger.log_transformation(transformer)
91
-
92
- # Then send the results to the loader
93
- @loader.load(record) unless @job.dry_run?
94
- rescue Chronicle::ETL::TransformationError => e
95
- # TODO: have an option to cancel job if we encounter an error
96
- Chronicle::ETL::Logger.error(tty_log_transformation_failure(e, transformer))
97
- end
98
-
99
- def finish_job
100
- @job_logger.save
101
- @progress_bar&.finish
102
- Chronicle::ETL::Logger.detach_from_ui
103
- Chronicle::ETL::Logger.info(tty_log_completion)
104
- end
105
-
106
- def tty_log_job_initialize
107
- output = "Beginning job "
108
- output += "'#{@job.name}'".bold if @job.name
109
- output
110
- end
111
-
112
- def tty_log_transformation(transformer)
113
- output = " ✓".green
114
- output += " #{transformer}"
115
- end
116
-
117
- def tty_log_transformation_failure(exception, transformer)
118
- output = " ✖".red
119
- output += " Failed to build #{transformer}. #{exception.message}"
120
- end
121
-
122
- def tty_log_completion
123
- status = @job_logger.success ? 'Success' : 'Failed'
124
- job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
125
- output = "\n#{job_completion} job"
126
- output += " '#{@job.name}'".bold if @job.name
127
- output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
128
- output += "\n Status:\t".light_black + status
129
- output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
130
- output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
131
- output
132
162
  end
133
163
  end
@@ -1,4 +1,4 @@
1
- require "active_support/core_ext/hash/keys"
1
+ require 'active_support/core_ext/hash/keys'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
@@ -8,7 +8,7 @@ module Chronicle
8
8
 
9
9
  # Whether a given namespace exists
10
10
  def exists?(namespace)
11
- Chronicle::ETL::Config.exists?("secrets", namespace)
11
+ Chronicle::ETL::Config.exists?('secrets', namespace)
12
12
  end
13
13
 
14
14
  # Save a setting to a namespaced config file
@@ -47,7 +47,7 @@ module Chronicle
47
47
 
48
48
  # Read secrets from a config file
49
49
  def read(namespace)
50
- definition = Chronicle::ETL::Config.load("secrets", namespace)
50
+ definition = Chronicle::ETL::Config.load('secrets', namespace)
51
51
  definition[:secrets] || {}
52
52
  end
53
53
 
@@ -56,8 +56,8 @@ module Chronicle
56
56
  data = {
57
57
  secrets: (secrets || {}).transform_keys(&:to_s),
58
58
  chronicle_etl_version: Chronicle::ETL::VERSION
59
- }.deep_stringify_keys
60
- Chronicle::ETL::Config.write("secrets", namespace, data)
59
+ }
60
+ Chronicle::ETL::Config.write('secrets', namespace, data)
61
61
  end
62
62
 
63
63
  # Which config files are available in ~/.config/chronicle/etl/secrets
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class BufferTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :buffer
8
+ r.description = 'by buffering'
9
+ end
10
+
11
+ setting :size, default: 10, description: 'The size of the buffer'
12
+
13
+ def transform(record)
14
+ stash_record(record)
15
+
16
+ # FIXME: this doesn't seem to be working with the runner
17
+ return if @stashed_records.size < @config.size
18
+
19
+ # FIXME: this will result in the wrong extraction being associated with
20
+ # the batch of flushed records
21
+ flush_stashed_records.map(&:data)
22
+ end
23
+
24
+ def finish
25
+ flush_stashed_records
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class ChronicleTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :chronicle
8
+ r.description = 'records to Chronicle schema'
9
+ end
10
+
11
+ def transform(record)
12
+ converter_klass = find_converter(record.extraction)
13
+ # TODO: handle missing converter
14
+
15
+ converter_klass.new.call(record) do |transformed_record|
16
+ yield transformed_record.data
17
+ end
18
+ end
19
+
20
+ private
21
+
22
+ def find_converter(extraction)
23
+ Chronicle::ETL::Registry::Connectors.find_converter_for_source(
24
+ source: extraction.source,
25
+ type: extraction.type,
26
+ strategy: extraction.strategy,
27
+ target: :chronicle
28
+ )&.klass
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class ChronobaseTransformer < Chronicle::ETL::Transformer
6
+ PROPERTY_MAP = {
7
+ source: :provider,
8
+ source_id: :provider_id,
9
+ url: :provider_url,
10
+ end_time: :end_at,
11
+ start_time: :start_at,
12
+
13
+ name: :title,
14
+ description: :body,
15
+ text: :body,
16
+
17
+ recipient: :consumers,
18
+ agent: :actor,
19
+ object: :involved,
20
+
21
+ # music ones
22
+ by_artist: :creators,
23
+ in_album: :containers
24
+ }.freeze
25
+
26
+ VERB_MAP = {
27
+ ListenAction: 'listened',
28
+ CommunicateAction: 'messaged'
29
+ }.freeze
30
+
31
+ ENTITY_MAP = {
32
+ MusicRecording: 'song',
33
+ MusicAlbum: 'album',
34
+ MusicGroup: 'musicartist',
35
+ Message: 'message',
36
+ Person: 'person'
37
+ }.freeze
38
+
39
+ register_connector do |r|
40
+ r.identifier = :chronobase
41
+ r.description = 'records to chronobase schema'
42
+ end
43
+
44
+ def transform(record)
45
+ deeply_convert_record(record.data)
46
+ end
47
+
48
+ private
49
+
50
+ def deeply_convert_record(record)
51
+ type = activity?(record) ? 'activity' : 'entity'
52
+
53
+ properties = record.properties.compact.each_with_object({}) do |(k, v), h|
54
+ key = PROPERTY_MAP[k.to_sym] || k
55
+ h[key] = v
56
+ end
57
+
58
+ properties[:verb] = VERB_MAP[record.type_id.to_sym] if VERB_MAP.key?(record.type_id.to_sym)
59
+ properties[:represents] = ENTITY_MAP[record.type_id.to_sym] if ENTITY_MAP.key?(record.type_id.to_sym)
60
+
61
+ properties.transform_values! do |v|
62
+ case v
63
+ when Chronicle::Models::Base
64
+ deeply_convert_record(v)
65
+ when Array
66
+ v.map { |e| e.is_a?(Chronicle::Models::Base) ? deeply_convert_record(e) : e }
67
+ else
68
+ v
69
+ end
70
+ end
71
+
72
+ Chronicle::Serialization::Record.new(
73
+ id: record.id,
74
+ type: type,
75
+ properties: properties.compact,
76
+ meta: {
77
+ dedupe_on: transform_dedupe_on(record)
78
+ },
79
+ schema: 'chronobase'
80
+ )
81
+ end
82
+
83
+ def activity?(record)
84
+ record.type_id.end_with?('Action')
85
+ end
86
+
87
+ def transform_dedupe_on(record)
88
+ property_map_with_type = PROPERTY_MAP.merge({
89
+ type: activity?(record) ? :verb : :represents
90
+ })
91
+
92
+ record.dedupe_on.map do |set|
93
+ set.map do |d|
94
+ property_map_with_type[d] || d
95
+ end.join(',')
96
+ end
97
+ end
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'chronicle/utils/hash_utils'
4
+
5
+ module Chronicle
6
+ module ETL
7
+ # A transformer that flattens a record's hash and returns a new hash with only the first `limit` fields.
8
+ class FieldsLimitTransformer < Chronicle::ETL::Transformer
9
+ register_connector do |r|
10
+ r.identifier = :fields_limit
11
+ r.description = 'by taking first N fields'
12
+ end
13
+
14
+ setting :limit, type: :numeric, default: 10
15
+
16
+ def transform(record)
17
+ # flatten the hash and then take the first `limit` fields
18
+
19
+ Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h).first(@config.limit).to_h
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
6
+ class FilterFieldsTransformer < Chronicle::ETL::Transformer
7
+ register_connector do |r|
8
+ r.identifier = :filter_fields
9
+ r.description = 'by taking a subset of the fields'
10
+ end
11
+
12
+ setting :fields, type: :array, default: []
13
+
14
+ def transform(record)
15
+ hash = record.data.to_h.deep_transform_keys(&:to_sym)
16
+ filter_hash(hash, @config.fields.map)
17
+ end
18
+
19
+ private
20
+
21
+ def access_nested_value(data, path)
22
+ keys = path.split('.')
23
+ keys.reduce(data) do |acc, key|
24
+ if acc.is_a?(Array)
25
+ acc.map do |item|
26
+ item[key.to_sym]
27
+ rescue StandardError
28
+ nil
29
+ end
30
+ .compact
31
+ elsif key.include?('[')
32
+ key, index = key.split(/\[|\]/).reject(&:empty?)
33
+ acc = acc[key.to_sym] if acc
34
+ acc.is_a?(Array) ? acc[index.to_i] : nil
35
+ else
36
+ acc&.dig(key.to_sym)
37
+ end
38
+ end
39
+ end
40
+
41
+ def filter_hash(original_hash, fields)
42
+ fields.each_with_object({}) do |field, result|
43
+ value = access_nested_value(original_hash, field)
44
+ keys = field.split('.')
45
+ last_key = keys.pop.to_sym
46
+
47
+ current = result
48
+ keys.each do |key|
49
+ key = key.to_sym
50
+ key, = key.to_s.split(/\[|\]/) if key.to_s.include?('[')
51
+ current[key] ||= {}
52
+ current = current[key]
53
+ end
54
+
55
+ current[last_key] = value
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # Return only records that match all the conditions of the filters
6
+ # setting.
7
+ class FilterTransformer < Chronicle::ETL::Transformer
8
+ register_connector do |r|
9
+ r.identifier = :filter
10
+ r.description = 'by only accepting records that match conditions'
11
+ end
12
+
13
+ setting :filters, type: :hash
14
+
15
+ def transform(record)
16
+ record_hash = record.data.to_h
17
+
18
+ @config.filters.each do |key, value|
19
+ path = key.split('.').map do |k|
20
+ k.match?(/^\d+$/) ? k.to_i : k.to_sym
21
+ end
22
+
23
+ return nil unless record_hash.dig(*path) == value
24
+ end
25
+
26
+ record.data
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class FormatTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :format
8
+ r.description = 'records to a differnet hash/json format'
9
+ end
10
+
11
+ setting :format, default: nil
12
+
13
+ def transform(record)
14
+ serializer = find_serializer(@config.format)
15
+ serializer.serialize(record.data)
16
+ end
17
+
18
+ private
19
+
20
+ def find_serializer(format)
21
+ case format
22
+ when 'jsonld'
23
+ Chronicle::Serialization::JSONLDSerializer
24
+ when 'jsonapi'
25
+ Chronicle::Serialization::JSONAPISerializer
26
+ else
27
+ raise 'unknown format'
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MergeMetaTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :merge_meta
8
+ r.description = 'merge extraction meta fields into the record'
9
+ end
10
+
11
+ def transform(record)
12
+ record.data unless record.extraction&.meta
13
+
14
+ record.data[:_meta] = record.extraction.meta
15
+ record.data
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MultiplyTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :multiply
8
+ r.description = 'by taking a sample'
9
+ end
10
+
11
+ setting :n, default: 2, type: :numeric
12
+
13
+ # yield the record's data n times
14
+ def transform(record)
15
+ @config.n.to_i.times do
16
+ yield record.data
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end