chronicle-etl 0.5.5 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +75 -68
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +51 -49
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +8 -6
  13. data/lib/chronicle/etl/cli/connectors.rb +7 -7
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +14 -15
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +18 -8
  20. data/lib/chronicle/etl/configurable.rb +20 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +38 -26
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +2 -2
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -8
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +93 -36
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +1 -1
  45. data/lib/chronicle/etl/registry/plugins.rb +27 -19
  46. data/lib/chronicle/etl/runner.rb +158 -128
  47. data/lib/chronicle/etl/secrets.rb +4 -4
  48. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  49. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  50. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  51. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  52. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  53. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  54. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  55. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  56. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  57. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  58. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  60. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  61. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  62. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  63. data/lib/chronicle/etl/version.rb +1 -1
  64. data/lib/chronicle/etl.rb +6 -8
  65. metadata +49 -47
  66. data/lib/chronicle/etl/models/activity.rb +0 -15
  67. data/lib/chronicle/etl/models/attachment.rb +0 -14
  68. data/lib/chronicle/etl/models/base.rb +0 -122
  69. data/lib/chronicle/etl/models/entity.rb +0 -29
  70. data/lib/chronicle/etl/models/raw.rb +0 -26
  71. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  72. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  73. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  74. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  75. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  76. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class ChronobaseTransformer < Chronicle::ETL::Transformer
6
+ PROPERTY_MAP = {
7
+ source: :provider,
8
+ source_id: :provider_id,
9
+ url: :provider_url,
10
+ end_time: :end_at,
11
+ start_time: :start_at,
12
+
13
+ name: :title,
14
+ description: :body,
15
+ text: :body,
16
+
17
+ recipient: :consumers,
18
+ agent: :actor,
19
+ object: :involved,
20
+
21
+ # music ones
22
+ by_artist: :creators,
23
+ in_album: :containers
24
+ }.freeze
25
+
26
+ VERB_MAP = {
27
+ ListenAction: 'listened',
28
+ CommunicateAction: 'messaged'
29
+ }.freeze
30
+
31
+ ENTITY_MAP = {
32
+ MusicRecording: 'song',
33
+ MusicAlbum: 'album',
34
+ MusicGroup: 'musicartist',
35
+ Message: 'message',
36
+ Person: 'person'
37
+ }.freeze
38
+
39
+ register_connector do |r|
40
+ r.identifier = :chronobase
41
+ r.description = 'records to chronobase schema'
42
+ end
43
+
44
+ def transform(record)
45
+ deeply_convert_record(record.data)
46
+ end
47
+
48
+ private
49
+
50
# Converts a record (and any nested Chronicle::Models::Base values, directly
# or inside arrays) into a Chronicle::Serialization::Record using the
# chronobase schema.
#
# Property names are renamed through PROPERTY_MAP; nil-valued properties are
# dropped. A :verb or :represents property is added when the record's type_id
# appears in VERB_MAP or ENTITY_MAP.
def deeply_convert_record(record)
  kind = activity?(record) ? 'activity' : 'entity'

  converted = {}
  record.properties.compact.each do |name, value|
    converted[PROPERTY_MAP[name.to_sym] || name] = value
  end

  type_key = record.type_id.to_sym
  converted[:verb] = VERB_MAP[type_key] if VERB_MAP.key?(type_key)
  converted[:represents] = ENTITY_MAP[type_key] if ENTITY_MAP.key?(type_key)

  # Recurse into nested models; plain values pass through untouched.
  converted.transform_values! do |value|
    if value.is_a?(Chronicle::Models::Base)
      deeply_convert_record(value)
    elsif value.is_a?(Array)
      value.map { |item| item.is_a?(Chronicle::Models::Base) ? deeply_convert_record(item) : item }
    else
      value
    end
  end

  Chronicle::Serialization::Record.new(
    id: record.id,
    type: kind,
    properties: converted.compact,
    meta: { dedupe_on: transform_dedupe_on(record) },
    schema: 'chronobase'
  )
end
82
+
83
# True when the record's type_id ends with 'Action', i.e. the record is
# treated as an activity rather than an entity.
def activity?(record)
  type_name = record.type_id
  type_name.end_with?('Action')
end
86
+
87
# Rewrites the record's dedupe_on field sets using chronobase property names.
#
# Each inner set is mapped through PROPERTY_MAP, with :type translated to
# :verb for activities and :represents otherwise; unmapped names are kept.
# Every set is then joined into one comma-separated string.
def transform_dedupe_on(record)
  type_field = activity?(record) ? :verb : :represents
  lookup = PROPERTY_MAP.merge({ type: type_field })

  record.dedupe_on.map do |field_set|
    field_set.map { |field| lookup[field] || field }.join(',')
  end
end
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'chronicle/utils/hash_utils'
4
+
5
+ module Chronicle
6
+ module ETL
7
+ # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
8
+ class FieldsLimitTransformer < Chronicle::ETL::Transformer
9
+ register_connector do |r|
10
+ r.identifier = :fields_limit
11
+ r.description = 'by taking first N fields'
12
+ end
13
+
14
+ setting :limit, type: :numeric, default: 10
15
+
16
# Keeps only the first `limit` fields of the record.
#
# The record's data is converted to a hash and passed through
# Chronicle::Utils::HashUtils.flatten_hash (presumably collapsing nested keys
# into a single level -- confirm against that helper), then truncated to the
# first @config.limit entries.
def transform(record)
  flattened = Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h)
  kept_pairs = flattened.first(@config.limit)
  kept_pairs.to_h
end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
6
+ class FilterFieldsTransformer < Chronicle::ETL::Transformer
7
+ register_connector do |r|
8
+ r.identifier = :filter_fields
9
+ r.description = 'by taking a subset of the fields'
10
+ end
11
+
12
+ setting :fields, type: :array, default: []
13
+
14
# Returns a new hash containing only the fields listed in @config.fields.
#
# Keys of the record's data are symbolized first (relies on
# Hash#deep_transform_keys being available, e.g. from ActiveSupport -- not
# core Ruby). The field list is coerced with Array() so a nil or single
# value behaves like a list, instead of passing a bare Enumerator.
def transform(record)
  hash = record.data.to_h.deep_transform_keys(&:to_sym)
  filter_hash(hash, Array(@config.fields))
end
18
+
19
+ private
20
+
21
# Looks up a dot-separated path (e.g. 'a.b', 'items[0].name') in data.
#
# - When the current value is an Array, the segment is looked up on every
#   item and nil results are dropped.
# - A segment of the form 'name[i]' selects element i of the array stored
#   under :name (nil when that value is not an Array).
# - Otherwise the segment is looked up as a symbol key.
#
# Returns nil when the path cannot be followed, including when it walks
# through a scalar such as a String (which previously raised).
def access_nested_value(data, path)
  path.split('.').reduce(data) do |acc, key|
    if acc.is_a?(Array)
      # Deliberately best-effort: items that cannot be indexed by the key
      # contribute nothing.
      acc.map do |item|
        item[key.to_sym]
      rescue StandardError
        nil
      end.compact
    elsif key.include?('[')
      name, index = key.split(/\[|\]/).reject(&:empty?)
      list = acc.respond_to?(:dig) ? acc.dig(name.to_sym) : nil
      list.is_a?(Array) ? list[index.to_i] : nil
    else
      acc.respond_to?(:dig) ? acc.dig(key.to_sym) : nil
    end
  end
end
40
+
41
# Builds a new nested hash holding only the requested fields.
#
# For every dot-separated field, the value is fetched with
# access_nested_value and stored under the same nesting in the result.
# The final segment is always stored as a Symbol key.
# NOTE(review): a parent segment carrying an index (e.g. 'items[0]') is
# stored under its bare name as a String key, unlike other parents which are
# Symbols -- confirm whether that mix is intended.
def filter_hash(original_hash, fields)
  filtered = {}

  fields.each do |field|
    found = access_nested_value(original_hash, field)
    *parent_segments, leaf = field.split('.')
    leaf = leaf.to_sym

    target = parent_segments.reduce(filtered) do |node, segment|
      node_key = segment.include?('[') ? segment.split(/\[|\]/).first : segment.to_sym
      node[node_key] ||= {}
    end

    target[leaf] = found
  end

  filtered
end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # Return only records that match all the conditions of the filters
6
+ # setting.
7
+ class FilterTransformer < Chronicle::ETL::Transformer
8
+ register_connector do |r|
9
+ r.identifier = :filter
10
+ r.description = 'by only accepting records that match conditions'
11
+ end
12
+
13
+ setting :filters, type: :hash
14
+
15
# Returns the record's data when it satisfies every configured filter,
# otherwise nil.
#
# Each filter key is a dot-separated path; all-digit segments are used as
# array indexes and the rest as symbol keys. The value found at that path
# must equal the filter value.
#
# With no filters configured every record passes. Keys may be Strings or
# Symbols. A path that runs through a non-diggable value counts as a
# mismatch rather than raising.
def transform(record)
  record_hash = record.data.to_h

  (@config.filters || {}).each do |key, value|
    path = key.to_s.split('.').map do |k|
      # \A/\z anchor the whole segment; ^/$ would match per line.
      k.match?(/\A\d+\z/) ? k.to_i : k.to_sym
    end

    actual =
      begin
        record_hash.dig(*path)
      rescue TypeError
        nil
      end

    return nil unless actual == value
  end

  record.data
end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class FormatTransformer < Chronicle::ETL::Transformer
6
# Registers this transformer under the :format identifier with its
# human-readable description.
register_connector do |r|
  r.identifier = :format
  r.description = 'records to a different hash/json format'
end
10
+
11
+ setting :format, default: nil
12
+
13
+ def transform(record)
14
+ serializer = find_serializer(@config.format)
15
+ serializer.serialize(record.data)
16
+ end
17
+
18
+ private
19
+
20
# Maps a format name to its serializer class.
#
# Accepts 'jsonld' or 'jsonapi', given as a String or a Symbol.
# Raises RuntimeError for anything else, naming the rejected value.
def find_serializer(format)
  case format.to_s
  when 'jsonld'
    Chronicle::Serialization::JSONLDSerializer
  when 'jsonapi'
    Chronicle::Serialization::JSONAPISerializer
  else
    raise "unknown format: #{format.inspect}"
  end
end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MergeMetaTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :merge_meta
8
+ r.description = 'merge extraction meta fields into the record'
9
+ end
10
+
11
# Copies the extraction's meta onto the record's data under :_meta and
# returns the data.
#
# When the record has no extraction, or the extraction has no meta, the data
# is returned untouched. The original guard lacked `return`, so it fell
# through and either raised on a nil extraction or stored a nil :_meta.
def transform(record)
  meta = record.extraction&.meta
  return record.data unless meta

  record.data[:_meta] = meta
  record.data
end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MultiplyTransformer < Chronicle::ETL::Transformer
6
# Registers this transformer under the :multiply identifier. The description
# reflects what transform does (emit each record n times), not sampling.
register_connector do |r|
  r.identifier = :multiply
  r.description = 'by emitting each record multiple times'
end
10
+
11
+ setting :n, default: 2, type: :numeric
12
+
13
# Yields the record's data @config.n times, so each input record is emitted
# n times.
#
# Returns nil explicitly. Integer#times returns the count, and
# Transformer#call emits any non-nil return value as a record when nothing
# was yielded -- so n <= 0 used to emit a bogus record whose data was the
# number itself.
def transform(record)
  @config.n.to_i.times do
    yield record.data
  end

  nil
end
19
+ end
20
+ end
21
+ end
@@ -1,18 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  class NullTransformer < Chronicle::ETL::Transformer
4
6
  register_connector do |r|
5
- r.identifier = 'null'
7
+ r.identifier = :null
6
8
  r.description = 'in no way'
7
9
  end
8
10
 
9
- def transform
10
- Chronicle::ETL::Models::Raw.new(@extraction.data)
11
+ def transform(record)
12
+ yield record.data
11
13
  end
12
-
13
- def timestamp; end
14
-
15
- def id; end
16
14
  end
17
15
  end
18
16
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class SamplerTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :sampler
8
+ r.description = 'by taking a sample'
9
+ end
10
+
11
+ setting :percent, default: 10, type: :numeric
12
+
13
+ # return the result, `percent` percentage of the time. otherwise nil
14
# Passes the record's data through roughly @config.percent percent of the
# time (decided by rand(100)); returns nil for the rest.
def transform(record)
  keep = rand(100) < @config.percent
  keep ? record.data : nil
end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class SortTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :sort
8
+ r.description = 'sorts records by a given field'
9
+ end
10
+
11
+ setting :key, required: true, default: 'id'
12
+ setting :direction, required: false, default: 'desc'
13
+
14
+ def transform(record)
15
+ stash_record(record)
16
+ end
17
+
18
# Returns the stashed records sorted by the configured key, or nil when
# nothing was stashed.
#
# Records whose key value is nil sort after all others in ascending order;
# when direction is 'desc' the whole ordering is reversed.
def finish
  records = @stashed_records
  return if records.nil? || records.none?

  ordered = records.sort_by do |record|
    sort_value = record.data[@config.key]
    # Leading 0/1 keeps nil values from being compared with real ones.
    sort_value.nil? ? [1] : [0, sort_value]
  end

  @config.direction == 'desc' ? ordered.reverse : ordered
end
29
+ end
30
+ end
31
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  # Abstract class representing an Transformer for an ETL job
@@ -5,65 +7,85 @@ module Chronicle
5
7
  extend Chronicle::ETL::Registry::SelfRegistering
6
8
  include Chronicle::ETL::Configurable
7
9
 
10
+ attr_reader :stashed_records
11
+
8
12
  # Construct a new instance of this transformer. Options are passed in from a Runner
9
13
  # == Parameters:
10
14
  # options::
11
15
  # Options for configuring this Transformer
12
- def initialize(extraction, options = {})
13
- unless extraction.is_a?(Chronicle::ETL::Extraction)
14
- raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
15
- end
16
-
17
- @extraction = extraction
16
+ def initialize(options = {})
18
17
  apply_options(options)
19
18
  end
20
19
 
21
- # @abstract Subclass is expected to implement #transform
22
- # @!method transform
23
- # The main entrypoint for transforming a record. Called by a Runner on each extracted record
24
-
25
- # The domain or provider-specific id of the record this transformer is working on.
26
- # It is useful for:
27
- # - de-duping records that might exist in the loader's destination
28
- # - building a cursor so an extractor doesn't have to start from the beginning of a
29
- # a source
30
- def id
31
- raise NotImplementedError
20
+ # Called once for each extracted record. Can return 0 or more transformed records.
21
# Runs transform on one record and hands every resulting record to block.
#
# transform may either yield data (zero or more times) or return data
# directly. Yielded data takes precedence: when anything was yielded the
# return value is ignored. Otherwise the return value is flattened, nils are
# skipped, and each remaining item is emitted.
#
# Raises ArgumentError unless record is a Record.
def call(record, &block)
  raise ArgumentError, 'Input must be a Chronicle::ETL::Record' unless record.is_a?(Record)

  emit = ->(data) { block.call(update_data(record, data)) }
  did_yield = false

  returned = transform(record) do |data|
    emit.call(data)
    did_yield = true
  end

  return if did_yield

  [returned].flatten.compact.each(&emit)
end
33
42
 
34
- # The domain or provider-specific timestamp of the record this transformer is working on.
35
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
36
- # data source from the beginning.
37
- def timestamp
38
- raise NotImplementedError
43
# Calls finish and passes each record it returns to block. Does nothing
# when finish returns nil.
def call_finish(&block)
  leftovers = finish
  leftovers&.each { |record| block.call(record) }
end
40
51
 
41
- # An optional, human-readable identifier for a transformation, intended for debugging or logging.
42
- # By default, it is just the id.
43
- def friendly_identifier
44
- id
52
+ def transform(_record)
53
+ raise NotImplementedError, 'You must implement the transform method'
45
54
  end
46
55
 
47
- def to_s
48
- ts = begin
49
- unknown = "???"
50
- timestamp&.iso8601 || unknown
51
- rescue TransformationError, NotImplementedError
52
- unknown
53
- end
56
+ # Called once after runner has processed all records
57
+ def finish; end
54
58
 
55
- identifier = begin
56
- unknown = self.class.to_s
57
- friendly_identifier || self.class.to_s
58
- rescue TransformationError, NotImplementedError
59
- unknown
60
- end
59
+ protected
60
+
61
# Appends record to @stashed_records (creating the list on first use).
# Returns nil so the caller has nothing to emit for this record yet.
def stash_record(record)
  (@stashed_records ||= []).push(record)
  nil
end
66
+
67
# Empties the stash and returns the records that were in it.
#
# A copy is taken before clearing: `tap(&:clear)` returned the very array it
# had just emptied, so callers always received []. Returns [] when nothing
# has been stashed yet.
def flush_stashed_records
  return [] unless @stashed_records

  flushed = @stashed_records.dup
  @stashed_records.clear
  flushed
end
61
70
 
62
- "[#{ts}] #{identifier}"
71
# Returns a clone of record whose data is replaced by new_data; the
# original record's data assignment is left as it was.
def update_data(record, new_data)
  record.clone.tap { |copy| copy.data = new_data }
end
64
76
  end
65
77
  end
66
78
  end
67
79
 
68
80
  require_relative 'null_transformer'
69
- require_relative 'image_file_transformer'
81
+ require_relative 'sampler_transformer'
82
+ require_relative 'buffer_transformer'
83
+ require_relative 'multiply_transformer'
84
+ require_relative 'sort_transformer'
85
+ require_relative 'chronicle_transformer'
86
+ require_relative 'format_transformer'
87
+ require_relative 'filter_fields_transformer'
88
+ require_relative 'fields_limit_transformer'
89
+ require_relative 'merge_meta_transformer'
90
+ require_relative 'filter_transformer'
91
+ require_relative 'chronobase_transformer'
@@ -7,7 +7,7 @@ module Chronicle
7
7
  # Utility methods for dealing with binary files
8
8
  module BinaryAttachments
9
9
  def self.filename_to_base64(filename:, mimetype: nil)
10
- mimetype = mimetype || guess_mimetype(filename: filename)
10
+ mimetype ||= guess_mimetype(filename: filename)
11
11
 
12
12
  "data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}"
13
13
  end
@@ -4,7 +4,6 @@ require 'colorize'
4
4
  module Chronicle
5
5
  module ETL
6
6
  module Utils
7
-
8
7
  class ProgressBar
9
8
  FORMAT_WITH_TOTAL = [
10
9
  ':bar ',
@@ -37,7 +36,7 @@ module Chronicle
37
36
  '/s) '.light_black
38
37
  ].join.freeze
39
38
 
40
- def initialize(title: 'Loading', total:)
39
+ def initialize(total:, title: 'Loading')
41
40
  opts = {
42
41
  clear: true,
43
42
  complete: '▓'.light_blue,
@@ -64,7 +63,7 @@ module Chronicle
64
63
  end
65
64
 
66
65
  def log(message)
67
- message.split("\n").each do |line|
66
+ message.split("\n").each do |_line|
68
67
  @pbar.log message
69
68
  end
70
69
  end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.5.5"
3
+ VERSION = '0.6.1'.freeze
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,25 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'chronicle/schema'
4
+ require 'chronicle/models/base'
5
+
1
6
  require_relative 'etl/registry/registry'
2
7
  require_relative 'etl/authorizer'
3
8
  require_relative 'etl/config'
4
9
  require_relative 'etl/configurable'
5
10
  require_relative 'etl/exceptions'
6
11
  require_relative 'etl/extraction'
12
+ require_relative 'etl/record'
7
13
  require_relative 'etl/job_definition'
8
14
  require_relative 'etl/job_log'
9
15
  require_relative 'etl/job_logger'
10
16
  require_relative 'etl/job'
11
17
  require_relative 'etl/logger'
12
- require_relative 'etl/models/activity'
13
- require_relative 'etl/models/attachment'
14
- require_relative 'etl/models/base'
15
- require_relative 'etl/models/raw'
16
- require_relative 'etl/models/entity'
17
18
  require_relative 'etl/runner'
18
19
  require_relative 'etl/secrets'
19
- require_relative 'etl/serializers/serializer'
20
20
  require_relative 'etl/utils/binary_attachments'
21
- require_relative 'etl/utils/hash_utilities'
22
- require_relative 'etl/utils/text_recognition'
23
21
  require_relative 'etl/utils/progress_bar'
24
22
  require_relative 'etl/version'
25
23