chronicle-etl 0.5.4 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +15 -25
- data/.rubocop.yml +2 -44
- data/Gemfile +2 -2
- data/Guardfile +3 -3
- data/README.md +98 -73
- data/Rakefile +2 -2
- data/bin/console +4 -5
- data/chronicle-etl.gemspec +50 -45
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/authorizer.rb +3 -4
- data/lib/chronicle/etl/cli/authorizations.rb +10 -8
- data/lib/chronicle/etl/cli/connectors.rb +9 -9
- data/lib/chronicle/etl/cli/jobs.rb +130 -53
- data/lib/chronicle/etl/cli/main.rb +29 -29
- data/lib/chronicle/etl/cli/plugins.rb +29 -26
- data/lib/chronicle/etl/cli/secrets.rb +14 -12
- data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
- data/lib/chronicle/etl/config.rb +20 -7
- data/lib/chronicle/etl/configurable.rb +24 -9
- data/lib/chronicle/etl/exceptions.rb +3 -3
- data/lib/chronicle/etl/extraction.rb +12 -2
- data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
- data/lib/chronicle/etl/extractors/extractor.rb +15 -2
- data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
- data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
- data/lib/chronicle/etl/job.rb +35 -17
- data/lib/chronicle/etl/job_definition.rb +39 -27
- data/lib/chronicle/etl/job_log.rb +14 -16
- data/lib/chronicle/etl/job_logger.rb +4 -4
- data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
- data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
- data/lib/chronicle/etl/loaders/loader.rb +0 -17
- data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
- data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
- data/lib/chronicle/etl/logger.rb +3 -3
- data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
- data/lib/chronicle/etl/record.rb +15 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
- data/lib/chronicle/etl/registry/connectors.rb +117 -0
- data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
- data/lib/chronicle/etl/registry/plugins.rb +171 -0
- data/lib/chronicle/etl/registry/registry.rb +3 -52
- data/lib/chronicle/etl/registry/self_registering.rb +1 -1
- data/lib/chronicle/etl/runner.rb +158 -128
- data/lib/chronicle/etl/secrets.rb +5 -5
- data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
- data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
- data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
- data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
- data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
- data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
- data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
- data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
- data/lib/chronicle/etl/transformers/transformer.rb +63 -41
- data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +6 -8
- metadata +91 -45
- data/lib/chronicle/etl/models/activity.rb +0 -15
- data/lib/chronicle/etl/models/attachment.rb +0 -14
- data/lib/chronicle/etl/models/base.rb +0 -122
- data/lib/chronicle/etl/models/entity.rb +0 -29
- data/lib/chronicle/etl/models/raw.rb +0 -26
- data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
- data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
- data/lib/chronicle/etl/serializers/serializer.rb +0 -28
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
- data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
- data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,133 +1,163 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'colorize'
|
2
4
|
require 'chronic_duration'
|
3
|
-
require
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
5
|
+
require 'tty-spinner'
|
6
|
+
|
7
|
+
module Chronicle
|
8
|
+
module ETL
|
9
|
+
# Runs a single ETL job: validates the job definition, instantiates the
# extractor, transformers and loader, streams extracted records through the
# transformers into the loader, and logs progress plus a completion summary.
class Runner
  # @param job the job to run; it must respond to the methods used below
  #   (`job_definition`, `instantiate_extractor`, `instantiate_transformers`,
  #   `instantiate_loader`, `dry_run?`, `name`)
  def initialize(job)
    @job = job
    @job_logger = Chronicle::ETL::JobLogger.new(@job)
  end

  # Runs the job from start to finish. `finish_job` always runs, whether the
  # job succeeded, failed or was interrupted.
  #
  # @raise [Chronicle::ETL::RunnerError] when an ExtractionError is raised
  # @raise [Chronicle::ETL::RunInterruptedError] when the run is interrupted
  def run!
    begin_job
    validate_job
    instantiate_connectors
    prepare_job
    prepare_ui
    run_extraction
  rescue Chronicle::ETL::ExtractionError => e
    @job_logger&.error
    raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
  rescue Interrupt
    @job_logger&.error
    raise(Chronicle::ETL::RunInterruptedError, 'Job interrupted.')
  # rescue StandardError => e
  #   # Just throwing this in here until we have better exception handling in
  #   # loaders, etc
  #   @job_logger&.error
  #   raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
  ensure
    finish_job
  end

  private

  # Logs the job banner and creates (but does not yet start) the spinner
  # used while the job is being initialized.
  def begin_job
    Chronicle::ETL::Logger.info(tty_log_job_initialize)
    @initialization_spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
  end

  # Validates the job definition; `validate!` is expected to raise on an
  # invalid definition.
  def validate_job
    @initialization_spinner.update(title: 'Validating job')
    @job.job_definition.validate!
  end

  # Builds the extractor, the list of transformers and the loader for the job.
  def instantiate_connectors
    @initialization_spinner.update(title: 'Initializing connectors')
    @extractor = @job.instantiate_extractor
    @transformers = @job.instantiate_transformers
    @loader = @job.instantiate_loader
  end

  # Starts the job logger and the loader, then prepares the extractor while
  # the spinner animates.
  def prepare_job
    @initialization_spinner.update(title: 'Preparing job')
    @job_logger.start
    @loader.start

    @initialization_spinner.update(title: 'Preparing extraction')
    # The spinner keeps animating from here until `success` below; if
    # `prepare` raises, `finish_job` is responsible for stopping it.
    @initialization_spinner.auto_spin
    @extractor.prepare
    @initialization_spinner.success("(#{'successful'.green})")
    Chronicle::ETL::Logger.info("\n")
  end

  # Creates the progress bar, sized by the extractor's result count, and
  # attaches the logger to it.
  def prepare_ui
    total = @extractor.results_count
    @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
    Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
  end

  # Chains the extractor stream through every transformer and loads each
  # record that comes out of the end of the chain.
  def run_extraction
    # Pattern based on Kiba's StreamingRunner
    # https://github.com/thbar/kiba/blob/master/lib/kiba/streaming_runner.rb
    stream = extractor_stream
    recurser = ->(s, t) { transform_stream(s, t) }
    @transformers.reduce(stream, &recurser).each do |record|
      Chronicle::ETL::Logger.debug(tty_log_transformation(record))
      @job_logger.log_transformation(record)
      @progress_bar.increment
      load_record(record)
    end

    @progress_bar.finish

    # This is typically a slow method (writing to stdout, writing a big file, etc)
    # TODO: consider adding a spinner?
    @loader.finish
    @job_logger.finish
  end

  # Initial stream of extracted data, wrapped in a Record class
  def extractor_stream
    Enumerator.new do |y|
      @extractor.extract do |extraction|
        record = Chronicle::ETL::Record.new(data: extraction.data, extraction: extraction)
        y << record
      end
    end
  end

  # For a given stream of records and a given transformer,
  # returns a new stream of transformed records and finally
  # calls the finish method on the transformer
  def transform_stream(stream, transformer)
    Enumerator.new do |y|
      stream.each do |record|
        transformer.call(record) do |transformed_record|
          y << transformed_record
        end
      end

      # Gives the transformer a chance to emit records once the upstream
      # stream is exhausted.
      transformer.call_finish do |transformed_record|
        y << transformed_record
      end
    end
  end

  # Sends the record's data to the loader, unless the job is a dry run.
  def load_record(record)
    @loader.load(record.data) unless @job.dry_run?
  end

  # Cleanup that runs from the `ensure` clause of `run!`: stops any UI that
  # is still active, saves the job log and prints the completion summary.
  def finish_job
    # If initialization failed part-way (e.g. `@extractor.prepare` raised),
    # the spinner started by `auto_spin` was never stopped. Mark it as failed
    # so it does not keep animating while the summary is printed.
    @initialization_spinner.error("(#{'failed'.red})") if @initialization_spinner&.spinning?

    @job_logger.save
    @progress_bar&.finish
    Chronicle::ETL::Logger.detach_from_ui
    Chronicle::ETL::Logger.info(tty_log_completion)
  end

  # @return [String] banner logged when the job starts
  def tty_log_job_initialize
    output = 'Beginning job '
    output += "'#{@job.name}'".bold if @job.name
    output
  end

  # @return [String] debug line for a successfully transformed record
  def tty_log_transformation(record)
    output = ' ✓'.green
    output + " #{record}"
  end

  # @return [String] error line for a failed transformation
  # NOTE(review): nothing in this class calls this method any more.
  def tty_log_transformation_failure(exception, transformer)
    output = ' ✖'.red
    output + " Failed to transform #{transformer}. #{exception.message}"
  end

  # @return [String] multi-line completion summary built from the job logger
  def tty_log_completion
    status = @job_logger.success ? 'Success' : 'Failed'
    job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
    output = "\n#{job_completion} job"
    output += " '#{@job.name}'".bold if @job.name
    output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
    output += "\n  Status:\t".light_black + status
    output += "\n  Completed:\t".light_black + @job_logger.job_log.num_records_processed.to_s
    if @job_logger.job_log.highest_timestamp
      output += "\n  Latest:\t".light_black + @job_logger.job_log.highest_timestamp.iso8601.to_s
    end
    output
  end
end
|
74
|
-
|
75
|
-
@progress_bar.finish
|
76
|
-
|
77
|
-
# This is typically a slow method (writing to stdout, writing a big file, etc)
|
78
|
-
# TODO: consider adding a spinner?
|
79
|
-
@loader.finish
|
80
|
-
@job_logger.finish
|
81
|
-
end
|
82
|
-
|
83
|
-
def process_extraction(extraction)
|
84
|
-
# For each extraction from our extractor, we create a new tarnsformer
|
85
|
-
transformer = @job.instantiate_transformer(extraction)
|
86
|
-
|
87
|
-
# And then transform that record, logging it if we're in debug log level
|
88
|
-
record = transformer.transform
|
89
|
-
Chronicle::ETL::Logger.debug(tty_log_transformation(transformer))
|
90
|
-
@job_logger.log_transformation(transformer)
|
91
|
-
|
92
|
-
# Then send the results to the loader
|
93
|
-
@loader.load(record) unless @job.dry_run?
|
94
|
-
rescue Chronicle::ETL::TransformationError => e
|
95
|
-
# TODO: have an option to cancel job if we encounter an error
|
96
|
-
Chronicle::ETL::Logger.error(tty_log_transformation_failure(e, transformer))
|
97
|
-
end
|
98
|
-
|
99
|
-
def finish_job
|
100
|
-
@job_logger.save
|
101
|
-
@progress_bar&.finish
|
102
|
-
Chronicle::ETL::Logger.detach_from_ui
|
103
|
-
Chronicle::ETL::Logger.info(tty_log_completion)
|
104
|
-
end
|
105
|
-
|
106
|
-
def tty_log_job_initialize
|
107
|
-
output = "Beginning job "
|
108
|
-
output += "'#{@job.name}'".bold if @job.name
|
109
|
-
output
|
110
|
-
end
|
111
|
-
|
112
|
-
def tty_log_transformation(transformer)
|
113
|
-
output = " ✓".green
|
114
|
-
output += " #{transformer}"
|
115
|
-
end
|
116
|
-
|
117
|
-
def tty_log_transformation_failure(exception, transformer)
|
118
|
-
output = " ✖".red
|
119
|
-
output += " Failed to build #{transformer}. #{exception.message}"
|
120
|
-
end
|
121
|
-
|
122
|
-
def tty_log_completion
|
123
|
-
status = @job_logger.success ? 'Success' : 'Failed'
|
124
|
-
job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
|
125
|
-
output = "\n#{job_completion} job"
|
126
|
-
output += " '#{@job.name}'".bold if @job.name
|
127
|
-
output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
|
128
|
-
output += "\n Status:\t".light_black + status
|
129
|
-
output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
|
130
|
-
output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
|
131
|
-
output
|
132
162
|
end
|
133
163
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'active_support/core_ext/hash/keys'
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
@@ -8,7 +8,7 @@ module Chronicle
|
|
8
8
|
|
9
9
|
# Whether a given namespace exists
|
10
10
|
def exists?(namespace)
|
11
|
-
Chronicle::ETL::Config.exists?(
|
11
|
+
Chronicle::ETL::Config.exists?('secrets', namespace)
|
12
12
|
end
|
13
13
|
|
14
14
|
# Save a setting to a namespaced config file
|
@@ -47,7 +47,7 @@ module Chronicle
|
|
47
47
|
|
48
48
|
# Read secrets from a config file
|
49
49
|
def read(namespace)
|
50
|
-
definition = Chronicle::ETL::Config.load(
|
50
|
+
definition = Chronicle::ETL::Config.load('secrets', namespace)
|
51
51
|
definition[:secrets] || {}
|
52
52
|
end
|
53
53
|
|
@@ -56,8 +56,8 @@ module Chronicle
|
|
56
56
|
data = {
|
57
57
|
secrets: (secrets || {}).transform_keys(&:to_s),
|
58
58
|
chronicle_etl_version: Chronicle::ETL::VERSION
|
59
|
-
}
|
60
|
-
Chronicle::ETL::Config.write(
|
59
|
+
}
|
60
|
+
Chronicle::ETL::Config.write('secrets', namespace, data)
|
61
61
|
end
|
62
62
|
|
63
63
|
# Which config files are available in ~/.config/chronicle/etl/secrets
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Transformer that stashes incoming records and releases them in batches
# once the configured buffer size has been reached.
class BufferTransformer < Chronicle::ETL::Transformer
  register_connector do |registration|
    registration.identifier = :buffer
    registration.description = 'by buffering'
  end

  setting :size, default: 10, description: 'The size of the buffer'

  # Stashes the record; returns nil while the buffer is still filling up and
  # the data of all flushed records once it is full.
  def transform(record)
    stash_record(record)

    # FIXME: this doesn't seem to be working with the runner
    #
    # FIXME: flushing here will result in the wrong extraction being
    # associated with the batch of flushed records
    flush_stashed_records.map(&:data) unless @stashed_records.size < @config.size
  end

  # Releases whatever is still stashed when the stream ends.
  def finish
    flush_stashed_records
  end
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Transformer that converts records to the Chronicle schema by delegating to
# the converter registered for the record's extraction.
class ChronicleTransformer < Chronicle::ETL::Transformer
  register_connector do |registration|
    registration.identifier = :chronicle
    registration.description = 'records to Chronicle schema'
  end

  # Looks up the converter for the record's extraction and yields the data of
  # every record that converter produces.
  def transform(record)
    # TODO: handle missing converter
    converter = find_converter(record.extraction).new

    converter.call(record) { |converted| yield converted.data }
  end

  private

  # Returns the class of the converter registered for the extraction's
  # source/type/strategy targeting :chronicle, or nil when none is found.
  def find_converter(extraction)
    registration = Chronicle::ETL::Registry::Connectors.find_converter_for_source(
      source: extraction.source,
      type: extraction.type,
      strategy: extraction.strategy,
      target: :chronicle
    )
    registration&.klass
  end
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Transformer that rewrites a model (and any models nested in its properties)
# into serialization records using the chronobase naming scheme.
class ChronobaseTransformer < Chronicle::ETL::Transformer
  # Property names on the incoming model => property names in chronobase.
  PROPERTY_MAP = {
    source: :provider,
    source_id: :provider_id,
    url: :provider_url,
    end_time: :end_at,
    start_time: :start_at,

    name: :title,
    description: :body,
    text: :body,

    recipient: :consumers,
    agent: :actor,
    object: :involved,

    # music ones
    by_artist: :creators,
    in_album: :containers
  }.freeze

  # Action type ids => value stored in the :verb property.
  VERB_MAP = {
    ListenAction: 'listened',
    CommunicateAction: 'messaged'
  }.freeze

  # Entity type ids => value stored in the :represents property.
  ENTITY_MAP = {
    MusicRecording: 'song',
    MusicAlbum: 'album',
    MusicGroup: 'musicartist',
    Message: 'message',
    Person: 'person'
  }.freeze

  register_connector do |registration|
    registration.identifier = :chronobase
    registration.description = 'records to chronobase schema'
  end

  # Converts the record's data and everything nested inside it.
  def transform(record)
    deeply_convert_record(record.data)
  end

  private

  # Builds a chronobase serialization record for the given model, recursing
  # into property values that are themselves models (or arrays of models).
  def deeply_convert_record(record)
    kind = activity?(record) ? 'activity' : 'entity'

    # Drop nil properties and rename the remaining keys via PROPERTY_MAP.
    converted = record.properties.compact.map do |name, value|
      [PROPERTY_MAP[name.to_sym] || name, value]
    end.to_h

    type_key = record.type_id.to_sym
    converted[:verb] = VERB_MAP[type_key] if VERB_MAP.key?(type_key)
    converted[:represents] = ENTITY_MAP[type_key] if ENTITY_MAP.key?(type_key)

    converted.transform_values! do |value|
      if value.is_a?(Chronicle::Models::Base)
        deeply_convert_record(value)
      elsif value.is_a?(Array)
        value.map do |element|
          element.is_a?(Chronicle::Models::Base) ? deeply_convert_record(element) : element
        end
      else
        value
      end
    end

    Chronicle::Serialization::Record.new(
      id: record.id,
      type: kind,
      properties: converted.compact,
      meta: { dedupe_on: transform_dedupe_on(record) },
      schema: 'chronobase'
    )
  end

  # A model counts as an activity when its type id ends in "Action".
  def activity?(record)
    record.type_id.end_with?('Action')
  end

  # Renames the fields of every dedupe set (with :type mapped to :verb or
  # :represents) and joins each set into a comma-separated string.
  def transform_dedupe_on(record)
    type_target = activity?(record) ? :verb : :represents
    lookup = PROPERTY_MAP.merge(type: type_target)

    record.dedupe_on.map do |fields|
      fields.map { |field| lookup[field] || field }.join(',')
    end
  end
end
|
99
|
+
end
|
100
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'chronicle/utils/hash_utils'
|
4
|
+
|
5
|
+
module Chronicle
|
6
|
+
module ETL
|
7
|
+
# A transformer that flattens a record's data into a single hash and keeps only the first N fields.
|
8
|
+
class FieldsLimitTransformer < Chronicle::ETL::Transformer
  register_connector do |registration|
    registration.identifier = :fields_limit
    registration.description = 'by taking first N fields'
  end

  setting :limit, type: :numeric, default: 10

  # Flattens the record's data and returns a hash holding only the first
  # `limit` fields of the flattened result.
  def transform(record)
    flattened = Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h)
    flattened.first(@config.limit).to_h
  end
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# A transformer that filters the fields of a record and returns a new hash with only the specified fields.
|
6
|
+
class FilterFieldsTransformer < Chronicle::ETL::Transformer
  register_connector do |r|
    r.identifier = :filter_fields
    r.description = 'by taking a subset of the fields'
  end

  setting :fields, type: :array, default: []

  # Returns a new hash containing only the configured fields of the record.
  # Fields are dot-separated paths (e.g. "actor.name"); a segment may carry
  # an array index (e.g. "items[0]").
  def transform(record)
    hash = record.data.to_h.deep_transform_keys(&:to_sym)
    # Array() tolerates a nil setting; the previous block-less `.map` only
    # wrapped the list in an Enumerator.
    filter_hash(hash, Array(@config.fields))
  end

  private

  # Walks `data` along the dot-separated `path` and returns the value found
  # there, or nil when a segment is missing.
  def access_nested_value(data, path)
    keys = path.split('.')
    keys.reduce(data) do |acc, key|
      if acc.is_a?(Array)
        # Collect the key from every element, skipping elements that do not
        # support the lookup or do not have the key.
        acc.map do |item|
          item[key.to_sym]
        rescue StandardError
          nil
        end
           .compact
      elsif key.include?('[')
        # "name[3]" => look up :name, then index 3 of the resulting array.
        key, index = key.split(/\[|\]/).reject(&:empty?)
        acc = acc[key.to_sym] if acc
        acc.is_a?(Array) ? acc[index.to_i] : nil
      else
        acc&.dig(key.to_sym)
      end
    end
  end

  # Builds the filtered hash: for every field, the value is looked up in
  # `original_hash` and stored under the same nested path in the result.
  def filter_hash(original_hash, fields)
    fields.each_with_object({}) do |field, result|
      value = access_nested_value(original_hash, field)
      keys = field.split('.')
      last_key = keys.pop.to_sym

      current = result
      keys.each do |key|
        # Strip an index suffix ("items[0]" => "items") BEFORE symbolizing,
        # so indexed segments share the same Symbol key as plain segments
        # instead of creating a parallel String-keyed branch.
        key = key.split(/\[|\]/).first if key.include?('[')
        key = key.to_sym
        current[key] ||= {}
        current = current[key]
      end

      current[last_key] = value
    end
  end
end
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Return only records that match all the conditions of the filters
|
6
|
+
# setting.
|
7
|
+
class FilterTransformer < Chronicle::ETL::Transformer
  register_connector do |r|
    r.identifier = :filter
    r.description = 'by only accepting records that match conditions'
  end

  setting :filters, type: :hash

  # Returns the record's data when every filter matches, nil otherwise.
  #
  # Each filter key is a dot-separated path into the record's hash; purely
  # numeric segments are used as array indexes, all others as symbol keys.
  # The value found at that path must equal the filter's value.
  def transform(record)
    record_hash = record.data.to_h

    # `filters` has no default, so an unset setting is treated as "no
    # conditions" rather than failing on nil.
    (@config.filters || {}).each do |key, value|
      # to_s lets both String and Symbol keys be used as paths.
      path = key.to_s.split('.').map do |k|
        k.match?(/\A\d+\z/) ? k.to_i : k.to_sym
      end

      return nil unless record_hash.dig(*path) == value
    end

    record.data
  end
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Transformer that serializes a record's data with the serializer selected
# by the `format` setting.
class FormatTransformer < Chronicle::ETL::Transformer
  register_connector do |r|
    r.identifier = :format
    r.description = 'records to a different hash/json format'
  end

  setting :format, default: nil

  # @return the record's data as produced by the selected serializer
  # @raise [RuntimeError] when the configured format is not supported
  def transform(record)
    serializer = find_serializer(@config.format)
    serializer.serialize(record.data)
  end

  private

  # Maps a format name ('jsonld' or 'jsonapi', given as a String or Symbol)
  # to its serializer class.
  def find_serializer(format)
    case format.to_s
    when 'jsonld'
      Chronicle::Serialization::JSONLDSerializer
    when 'jsonapi'
      Chronicle::Serialization::JSONAPISerializer
    else
      raise "unknown format: #{format.inspect}"
    end
  end
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Transformer that copies the extraction's meta fields into the record's
# data under the :_meta key.
class MergeMetaTransformer < Chronicle::ETL::Transformer
  register_connector do |r|
    r.identifier = :merge_meta
    r.description = 'merge extraction meta fields into the record'
  end

  # Returns the record's data, with the extraction meta stored under :_meta
  # when it is available. The data object is modified in place.
  def transform(record)
    # Without the explicit `return` this guard was a no-op: a record with no
    # extraction raised NoMethodError, and one with no meta got `_meta: nil`.
    return record.data unless record.extraction&.meta

    record.data[:_meta] = record.extraction.meta
    record.data
  end
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
# Transformer that emits every incoming record `n` times.
class MultiplyTransformer < Chronicle::ETL::Transformer
  register_connector do |r|
    r.identifier = :multiply
    r.description = 'by repeating each record n times'
  end

  setting :n, default: 2, type: :numeric

  # Yields the record's data `n` times (not at all when n is zero or less).
  def transform(record)
    @config.n.to_i.times do
      yield record.data
    end

    # Return nil instead of the Integer that `times` evaluates to; other
    # transformers here return nil to signal "nothing to emit".
    # NOTE(review): confirm against Transformer#call how return values are
    # handled when nothing was yielded.
    nil
  end
end
|
20
|
+
end
|
21
|
+
end
|