chronicle-etl 0.5.4 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +15 -25
- data/.rubocop.yml +2 -44
- data/Gemfile +2 -2
- data/Guardfile +3 -3
- data/README.md +98 -73
- data/Rakefile +2 -2
- data/bin/console +4 -5
- data/chronicle-etl.gemspec +50 -45
- data/exe/chronicle-etl +1 -1
- data/lib/chronicle/etl/authorizer.rb +3 -4
- data/lib/chronicle/etl/cli/authorizations.rb +10 -8
- data/lib/chronicle/etl/cli/connectors.rb +9 -9
- data/lib/chronicle/etl/cli/jobs.rb +130 -53
- data/lib/chronicle/etl/cli/main.rb +29 -29
- data/lib/chronicle/etl/cli/plugins.rb +29 -26
- data/lib/chronicle/etl/cli/secrets.rb +14 -12
- data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
- data/lib/chronicle/etl/config.rb +20 -7
- data/lib/chronicle/etl/configurable.rb +24 -9
- data/lib/chronicle/etl/exceptions.rb +3 -3
- data/lib/chronicle/etl/extraction.rb +12 -2
- data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
- data/lib/chronicle/etl/extractors/extractor.rb +15 -2
- data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
- data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
- data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
- data/lib/chronicle/etl/job.rb +35 -17
- data/lib/chronicle/etl/job_definition.rb +39 -27
- data/lib/chronicle/etl/job_log.rb +14 -16
- data/lib/chronicle/etl/job_logger.rb +4 -4
- data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
- data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
- data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
- data/lib/chronicle/etl/loaders/loader.rb +0 -17
- data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
- data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
- data/lib/chronicle/etl/logger.rb +3 -3
- data/lib/chronicle/etl/oauth_authorizer.rb +8 -10
- data/lib/chronicle/etl/record.rb +15 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
- data/lib/chronicle/etl/registry/connectors.rb +117 -0
- data/lib/chronicle/etl/registry/plugin_registration.rb +19 -0
- data/lib/chronicle/etl/registry/plugins.rb +171 -0
- data/lib/chronicle/etl/registry/registry.rb +3 -52
- data/lib/chronicle/etl/registry/self_registering.rb +1 -1
- data/lib/chronicle/etl/runner.rb +158 -128
- data/lib/chronicle/etl/secrets.rb +5 -5
- data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
- data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
- data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
- data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
- data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
- data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
- data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
- data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
- data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
- data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
- data/lib/chronicle/etl/transformers/transformer.rb +63 -41
- data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +6 -8
- metadata +91 -45
- data/lib/chronicle/etl/models/activity.rb +0 -15
- data/lib/chronicle/etl/models/attachment.rb +0 -14
- data/lib/chronicle/etl/models/base.rb +0 -122
- data/lib/chronicle/etl/models/entity.rb +0 -29
- data/lib/chronicle/etl/models/raw.rb +0 -26
- data/lib/chronicle/etl/registry/plugin_registry.rb +0 -95
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
- data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
- data/lib/chronicle/etl/serializers/serializer.rb +0 -28
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
- data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
- data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,133 +1,163 @@
+# frozen_string_literal: true
+
 require 'colorize'
 require 'chronic_duration'
-require
[removed lines 4-72 of the 0.5.4 file are not shown in the source diff]
+require 'tty-spinner'
+
+module Chronicle
+  module ETL
+    class Runner
+      def initialize(job)
+        @job = job
+        @job_logger = Chronicle::ETL::JobLogger.new(@job)
+      end
+
+      def run!
+        begin_job
+        validate_job
+        instantiate_connectors
+        prepare_job
+        prepare_ui
+        run_extraction
+      rescue Chronicle::ETL::ExtractionError => e
+        @job_logger&.error
+        raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
+      rescue Interrupt
+        @job_logger&.error
+        raise(Chronicle::ETL::RunInterruptedError, 'Job interrupted.')
+      # rescue StandardError => e
+      #   # Just throwing this in here until we have better exception handling in
+      #   # loaders, etc
+      #   @job_logger&.error
+      #   raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
+      ensure
+        finish_job
+      end
+
+      private
+
+      def begin_job
+        Chronicle::ETL::Logger.info(tty_log_job_initialize)
+        @initialization_spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
+      end
+
+      def validate_job
+        @initialization_spinner.update(title: 'Validating job')
+        @job.job_definition.validate!
+      end
+
+      def instantiate_connectors
+        @initialization_spinner.update(title: 'Initializing connectors')
+        @extractor = @job.instantiate_extractor
+        @transformers = @job.instantiate_transformers
+        @loader = @job.instantiate_loader
+      end
+
+      def prepare_job
+        @initialization_spinner.update(title: 'Preparing job')
+        @job_logger.start
+        @loader.start
+
+        @initialization_spinner.update(title: 'Preparing extraction')
+        @initialization_spinner.auto_spin
+        @extractor.prepare
+        @initialization_spinner.success("(#{'successful'.green})")
+        Chronicle::ETL::Logger.info("\n")
+      end
+
+      def prepare_ui
+        total = @extractor.results_count
+        @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
+        Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
+      end
+
+      def run_extraction
+        # Pattern based on Kiba's StreamingRunner
+        # https://github.com/thbar/kiba/blob/master/lib/kiba/streaming_runner.rb
+        stream = extractor_stream
+        recurser = ->(s, t) { transform_stream(s, t) }
+        @transformers.reduce(stream, &recurser).each do |record|
+          Chronicle::ETL::Logger.debug(tty_log_transformation(record))
+          @job_logger.log_transformation(record)
+          @progress_bar.increment
+          load_record(record)
+        end
+
+        @progress_bar.finish
+
+        # This is typically a slow method (writing to stdout, writing a big file, etc)
+        # TODO: consider adding a spinner?
+        @loader.finish
+        @job_logger.finish
+      end
+
+      # Initial steam of extracted data, wrapped in a Record class
+      def extractor_stream
+        Enumerator.new do |y|
+          @extractor.extract do |extraction|
+            record = Chronicle::ETL::Record.new(data: extraction.data, extraction: extraction)
+            y << record
+          end
+        end
+      end
+
+      # For a given stream of records and a given transformer,
+      # returns a new stream of transformed records and finally
+      # calls the finish method on the transformer
+      def transform_stream(stream, transformer)
+        Enumerator.new do |y|
+          stream.each do |record|
+            transformer.call(record) do |transformed_record|
+              y << transformed_record
+            end
+          end
+
+          transformer.call_finish do |transformed_record|
+            y << transformed_record
+          end
+        end
+      end
+
+      def load_record(record)
+        @loader.load(record.data) unless @job.dry_run?
+      end
+
+      def finish_job
+        @job_logger.save
+        @progress_bar&.finish
+        Chronicle::ETL::Logger.detach_from_ui
+        Chronicle::ETL::Logger.info(tty_log_completion)
+      end
+
+      def tty_log_job_initialize
+        output = 'Beginning job '
+        output += "'#{@job.name}'".bold if @job.name
+        output
+      end
+
+      def tty_log_transformation(record)
+        output = ' ✓'.green
+        output + " #{record}"
+      end
+
+      def tty_log_transformation_failure(exception, transformer)
+        output = ' ✖'.red
+        output + " Failed to transform #{transformer}. #{exception.message}"
+      end
+
+      def tty_log_completion
+        status = @job_logger.success ? 'Success' : 'Failed'
+        job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
+        output = "\n#{job_completion} job"
+        output += " '#{@job.name}'".bold if @job.name
+        output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
+        output += "\n Status:\t".light_black + status
+        output += "\n Completed:\t".light_black + @job_logger.job_log.num_records_processed.to_s
+        if @job_logger.job_log.highest_timestamp
+          output += "\n Latest:\t".light_black + @job_logger.job_log.highest_timestamp.iso8601.to_s
+        end
+        output
+      end
     end
-
-    @progress_bar.finish
-
-    # This is typically a slow method (writing to stdout, writing a big file, etc)
-    # TODO: consider adding a spinner?
-    @loader.finish
-    @job_logger.finish
-  end
-
-  def process_extraction(extraction)
-    # For each extraction from our extractor, we create a new tarnsformer
-    transformer = @job.instantiate_transformer(extraction)
-
-    # And then transform that record, logging it if we're in debug log level
-    record = transformer.transform
-    Chronicle::ETL::Logger.debug(tty_log_transformation(transformer))
-    @job_logger.log_transformation(transformer)
-
-    # Then send the results to the loader
-    @loader.load(record) unless @job.dry_run?
-  rescue Chronicle::ETL::TransformationError => e
-    # TODO: have an option to cancel job if we encounter an error
-    Chronicle::ETL::Logger.error(tty_log_transformation_failure(e, transformer))
-  end
-
-  def finish_job
-    @job_logger.save
-    @progress_bar&.finish
-    Chronicle::ETL::Logger.detach_from_ui
-    Chronicle::ETL::Logger.info(tty_log_completion)
-  end
-
-  def tty_log_job_initialize
-    output = "Beginning job "
-    output += "'#{@job.name}'".bold if @job.name
-    output
-  end
-
-  def tty_log_transformation(transformer)
-    output = " ✓".green
-    output += " #{transformer}"
-  end
-
-  def tty_log_transformation_failure(exception, transformer)
-    output = " ✖".red
-    output += " Failed to build #{transformer}. #{exception.message}"
-  end
-
-  def tty_log_completion
-    status = @job_logger.success ? 'Success' : 'Failed'
-    job_completion = @job_logger.success ? 'Completed' : 'Partially completed'
-    output = "\n#{job_completion} job"
-    output += " '#{@job.name}'".bold if @job.name
-    output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
-    output += "\n Status:\t".light_black + status
-    output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
-    output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
-    output
   end
 end

data/lib/chronicle/etl/secrets.rb
CHANGED
@@ -1,4 +1,4 @@
-require
+require 'active_support/core_ext/hash/keys'
 
 module Chronicle
   module ETL
@@ -8,7 +8,7 @@ module Chronicle
 
       # Whether a given namespace exists
      def exists?(namespace)
-        Chronicle::ETL::Config.exists?(
+        Chronicle::ETL::Config.exists?('secrets', namespace)
      end
 
      # Save a setting to a namespaced config file
@@ -47,7 +47,7 @@ module Chronicle
 
      # Read secrets from a config file
      def read(namespace)
-        definition = Chronicle::ETL::Config.load(
+        definition = Chronicle::ETL::Config.load('secrets', namespace)
        definition[:secrets] || {}
      end
 
@@ -56,8 +56,8 @@ module Chronicle
        data = {
          secrets: (secrets || {}).transform_keys(&:to_s),
          chronicle_etl_version: Chronicle::ETL::VERSION
-        }
-        Chronicle::ETL::Config.write(
+        }
+        Chronicle::ETL::Config.write('secrets', namespace, data)
      end
 
      # Which config files are available in ~/.config/chronicle/etl/secrets

data/lib/chronicle/etl/transformers/buffer_transformer.rb
ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    class BufferTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :buffer
+        r.description = 'by buffering'
+      end
+
+      setting :size, default: 10, description: 'The size of the buffer'
+
+      def transform(record)
+        stash_record(record)
+
+        # FIXME: this doesn't seem to be working with the runner
+        return if @stashed_records.size < @config.size
+
+        # FIXME: this will result in the wrong extraction being associated with
+        # the batch of flushed records
+        flush_stashed_records.map(&:data)
+      end
+
+      def finish
+        flush_stashed_records
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/chronicle_transformer.rb
ADDED
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    class ChronicleTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :chronicle
+        r.description = 'records to Chronicle schema'
+      end
+
+      def transform(record)
+        converter_klass = find_converter(record.extraction)
+        # TODO: handle missing converter
+
+        converter_klass.new.call(record) do |transformed_record|
+          yield transformed_record.data
+        end
+      end
+
+      private
+
+      def find_converter(extraction)
+        Chronicle::ETL::Registry::Connectors.find_converter_for_source(
+          source: extraction.source,
+          type: extraction.type,
+          strategy: extraction.strategy,
+          target: :chronicle
+        )&.klass
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/chronobase_transformer.rb
ADDED
@@ -0,0 +1,100 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    class ChronobaseTransformer < Chronicle::ETL::Transformer
+      PROPERTY_MAP = {
+        source: :provider,
+        source_id: :provider_id,
+        url: :provider_url,
+        end_time: :end_at,
+        start_time: :start_at,
+
+        name: :title,
+        description: :body,
+        text: :body,
+
+        recipient: :consumers,
+        agent: :actor,
+        object: :involved,
+
+        # music ones
+        by_artist: :creators,
+        in_album: :containers
+      }.freeze
+
+      VERB_MAP = {
+        ListenAction: 'listened',
+        CommunicateAction: 'messaged'
+      }.freeze
+
+      ENTITY_MAP = {
+        MusicRecording: 'song',
+        MusicAlbum: 'album',
+        MusicGroup: 'musicartist',
+        Message: 'message',
+        Person: 'person'
+      }.freeze
+
+      register_connector do |r|
+        r.identifier = :chronobase
+        r.description = 'records to chronobase schema'
+      end
+
+      def transform(record)
+        deeply_convert_record(record.data)
+      end
+
+      private
+
+      def deeply_convert_record(record)
+        type = activity?(record) ? 'activity' : 'entity'
+
+        properties = record.properties.compact.each_with_object({}) do |(k, v), h|
+          key = PROPERTY_MAP[k.to_sym] || k
+          h[key] = v
+        end
+
+        properties[:verb] = VERB_MAP[record.type_id.to_sym] if VERB_MAP.key?(record.type_id.to_sym)
+        properties[:represents] = ENTITY_MAP[record.type_id.to_sym] if ENTITY_MAP.key?(record.type_id.to_sym)
+
+        properties.transform_values! do |v|
+          case v
+          when Chronicle::Models::Base
+            deeply_convert_record(v)
+          when Array
+            v.map { |e| e.is_a?(Chronicle::Models::Base) ? deeply_convert_record(e) : e }
+          else
+            v
+          end
+        end
+
+        Chronicle::Serialization::Record.new(
+          id: record.id,
+          type: type,
+          properties: properties.compact,
+          meta: {
+            dedupe_on: transform_dedupe_on(record)
+          },
+          schema: 'chronobase'
+        )
+      end
+
+      def activity?(record)
+        record.type_id.end_with?('Action')
+      end
+
+      def transform_dedupe_on(record)
+        property_map_with_type = PROPERTY_MAP.merge({
+          type: activity?(record) ? :verb : :represents
+        })
+
+        record.dedupe_on.map do |set|
+          set.map do |d|
+            property_map_with_type[d] || d
+          end.join(',')
+        end
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/fields_limit_transformer.rb
ADDED
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+require 'chronicle/utils/hash_utils'
+
+module Chronicle
+  module ETL
+    # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
+    class FieldsLimitTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :fields_limit
+        r.description = 'by taking first N fields'
+      end
+
+      setting :limit, type: :numeric, default: 10
+
+      def transform(record)
+        # flattern hash and then take the first limit fields
+
+        Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h).first(@config.limit).to_h
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/filter_fields_transformer.rb
ADDED
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
+    class FilterFieldsTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :filter_fields
+        r.description = 'by taking a subset of the fields'
+      end
+
+      setting :fields, type: :array, default: []
+
+      def transform(record)
+        hash = record.data.to_h.deep_transform_keys(&:to_sym)
+        filter_hash(hash, @config.fields.map)
+      end
+
+      private
+
+      def access_nested_value(data, path)
+        keys = path.split('.')
+        keys.reduce(data) do |acc, key|
+          if acc.is_a?(Array)
+            acc.map do |item|
+              item[key.to_sym]
+            rescue StandardError
+              nil
+            end
+              .compact
+          elsif key.include?('[')
+            key, index = key.split(/\[|\]/).reject(&:empty?)
+            acc = acc[key.to_sym] if acc
+            acc.is_a?(Array) ? acc[index.to_i] : nil
+          else
+            acc&.dig(key.to_sym)
+          end
+        end
+      end
+
+      def filter_hash(original_hash, fields)
+        fields.each_with_object({}) do |field, result|
+          value = access_nested_value(original_hash, field)
+          keys = field.split('.')
+          last_key = keys.pop.to_sym
+
+          current = result
+          keys.each do |key|
+            key = key.to_sym
+            key, = key.to_s.split(/\[|\]/) if key.to_s.include?('[')
+            current[key] ||= {}
+            current = current[key]
+          end
+
+          current[last_key] = value
+        end
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/filter_transformer.rb
ADDED
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    # Return only records that match all the conditions of the filters
+    # setting.
+    class FilterTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :filter
+        r.description = 'by only accepting records that match conditions'
+      end
+
+      setting :filters, type: :hash
+
+      def transform(record)
+        record_hash = record.data.to_h
+
+        @config.filters.each do |key, value|
+          path = key.split('.').map do |k|
+            k.match?(/^\d+$/) ? k.to_i : k.to_sym
+          end
+
+          return nil unless record_hash.dig(*path) == value
+        end
+
+        record.data
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/format_transformer.rb
ADDED
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    class FormatTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :format
+        r.description = 'records to a differnet hash/json format'
+      end
+
+      setting :format, default: nil
+
+      def transform(record)
+        serializer = find_serializer(@config.format)
+        serializer.serialize(record.data)
+      end
+
+      private
+
+      def find_serializer(format)
+        case format
+        when 'jsonld'
+          Chronicle::Serialization::JSONLDSerializer
+        when 'jsonapi'
+          Chronicle::Serialization::JSONAPISerializer
+        else
+          raise 'unknown format'
+        end
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/merge_meta_transformer.rb
ADDED
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    class MergeMetaTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :merge_meta
+        r.description = 'merge extraction meta fields into the record'
+      end
+
+      def transform(record)
+        record.data unless record.extraction&.meta
+
+        record.data[:_meta] = record.extraction.meta
+        record.data
+      end
+    end
+  end
+end

data/lib/chronicle/etl/transformers/multiply_transformer.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+module Chronicle
+  module ETL
+    class MultiplyTransformer < Chronicle::ETL::Transformer
+      register_connector do |r|
+        r.identifier = :multiply
+        r.description = 'by taking a sample'
+      end
+
+      setting :n, default: 2, type: :numeric
+
+      # return the result, sample_size percentage of the time. otherwise nil
+      def transform(record)
+        @config.n.to_i.times do
+          yield record.data
+        end
+      end
+    end
+  end
+end