chronicle-etl 0.5.5 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +15 -25
  3. data/.rubocop.yml +2 -44
  4. data/Gemfile +2 -2
  5. data/Guardfile +3 -3
  6. data/README.md +75 -68
  7. data/Rakefile +2 -2
  8. data/bin/console +4 -5
  9. data/chronicle-etl.gemspec +51 -49
  10. data/exe/chronicle-etl +1 -1
  11. data/lib/chronicle/etl/authorizer.rb +3 -4
  12. data/lib/chronicle/etl/cli/authorizations.rb +8 -6
  13. data/lib/chronicle/etl/cli/connectors.rb +7 -7
  14. data/lib/chronicle/etl/cli/jobs.rb +130 -53
  15. data/lib/chronicle/etl/cli/main.rb +29 -29
  16. data/lib/chronicle/etl/cli/plugins.rb +14 -15
  17. data/lib/chronicle/etl/cli/secrets.rb +14 -12
  18. data/lib/chronicle/etl/cli/subcommand_base.rb +5 -3
  19. data/lib/chronicle/etl/config.rb +18 -8
  20. data/lib/chronicle/etl/configurable.rb +20 -9
  21. data/lib/chronicle/etl/exceptions.rb +3 -3
  22. data/lib/chronicle/etl/extraction.rb +12 -2
  23. data/lib/chronicle/etl/extractors/csv_extractor.rb +9 -0
  24. data/lib/chronicle/etl/extractors/extractor.rb +15 -2
  25. data/lib/chronicle/etl/extractors/file_extractor.rb +5 -3
  26. data/lib/chronicle/etl/extractors/helpers/input_reader.rb +2 -2
  27. data/lib/chronicle/etl/extractors/json_extractor.rb +14 -4
  28. data/lib/chronicle/etl/extractors/stdin_extractor.rb +3 -0
  29. data/lib/chronicle/etl/job.rb +35 -17
  30. data/lib/chronicle/etl/job_definition.rb +38 -26
  31. data/lib/chronicle/etl/job_log.rb +14 -16
  32. data/lib/chronicle/etl/job_logger.rb +4 -4
  33. data/lib/chronicle/etl/loaders/csv_loader.rb +17 -4
  34. data/lib/chronicle/etl/loaders/helpers/stdout_helper.rb +4 -0
  35. data/lib/chronicle/etl/loaders/json_loader.rb +30 -10
  36. data/lib/chronicle/etl/loaders/loader.rb +0 -17
  37. data/lib/chronicle/etl/loaders/rest_loader.rb +7 -7
  38. data/lib/chronicle/etl/loaders/table_loader.rb +37 -12
  39. data/lib/chronicle/etl/logger.rb +2 -2
  40. data/lib/chronicle/etl/oauth_authorizer.rb +8 -8
  41. data/lib/chronicle/etl/record.rb +15 -0
  42. data/lib/chronicle/etl/registry/connector_registration.rb +15 -23
  43. data/lib/chronicle/etl/registry/connectors.rb +93 -36
  44. data/lib/chronicle/etl/registry/plugin_registration.rb +1 -1
  45. data/lib/chronicle/etl/registry/plugins.rb +27 -19
  46. data/lib/chronicle/etl/runner.rb +158 -128
  47. data/lib/chronicle/etl/secrets.rb +4 -4
  48. data/lib/chronicle/etl/transformers/buffer_transformer.rb +29 -0
  49. data/lib/chronicle/etl/transformers/chronicle_transformer.rb +32 -0
  50. data/lib/chronicle/etl/transformers/chronobase_transformer.rb +100 -0
  51. data/lib/chronicle/etl/transformers/fields_limit_transformer.rb +23 -0
  52. data/lib/chronicle/etl/transformers/filter_fields_transformer.rb +60 -0
  53. data/lib/chronicle/etl/transformers/filter_transformer.rb +30 -0
  54. data/lib/chronicle/etl/transformers/format_transformer.rb +32 -0
  55. data/lib/chronicle/etl/transformers/merge_meta_transformer.rb +19 -0
  56. data/lib/chronicle/etl/transformers/multiply_transformer.rb +21 -0
  57. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -7
  58. data/lib/chronicle/etl/transformers/sampler_transformer.rb +21 -0
  59. data/lib/chronicle/etl/transformers/sort_transformer.rb +31 -0
  60. data/lib/chronicle/etl/transformers/transformer.rb +63 -41
  61. data/lib/chronicle/etl/utils/binary_attachments.rb +1 -1
  62. data/lib/chronicle/etl/utils/progress_bar.rb +2 -3
  63. data/lib/chronicle/etl/version.rb +1 -1
  64. data/lib/chronicle/etl.rb +6 -8
  65. metadata +49 -47
  66. data/lib/chronicle/etl/models/activity.rb +0 -15
  67. data/lib/chronicle/etl/models/attachment.rb +0 -14
  68. data/lib/chronicle/etl/models/base.rb +0 -122
  69. data/lib/chronicle/etl/models/entity.rb +0 -29
  70. data/lib/chronicle/etl/models/raw.rb +0 -26
  71. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +0 -31
  72. data/lib/chronicle/etl/serializers/raw_serializer.rb +0 -10
  73. data/lib/chronicle/etl/serializers/serializer.rb +0 -28
  74. data/lib/chronicle/etl/transformers/image_file_transformer.rb +0 -247
  75. data/lib/chronicle/etl/utils/hash_utilities.rb +0 -19
  76. data/lib/chronicle/etl/utils/text_recognition.rb +0 -15
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class ChronobaseTransformer < Chronicle::ETL::Transformer
6
+ PROPERTY_MAP = {
7
+ source: :provider,
8
+ source_id: :provider_id,
9
+ url: :provider_url,
10
+ end_time: :end_at,
11
+ start_time: :start_at,
12
+
13
+ name: :title,
14
+ description: :body,
15
+ text: :body,
16
+
17
+ recipient: :consumers,
18
+ agent: :actor,
19
+ object: :involved,
20
+
21
+ # music ones
22
+ by_artist: :creators,
23
+ in_album: :containers
24
+ }.freeze
25
+
26
+ VERB_MAP = {
27
+ ListenAction: 'listened',
28
+ CommunicateAction: 'messaged'
29
+ }.freeze
30
+
31
+ ENTITY_MAP = {
32
+ MusicRecording: 'song',
33
+ MusicAlbum: 'album',
34
+ MusicGroup: 'musicartist',
35
+ Message: 'message',
36
+ Person: 'person'
37
+ }.freeze
38
+
39
+ register_connector do |r|
40
+ r.identifier = :chronobase
41
+ r.description = 'records to chronobase schema'
42
+ end
43
+
44
+ def transform(record)
45
+ deeply_convert_record(record.data)
46
+ end
47
+
48
+ private
49
+
50
+ def deeply_convert_record(record)
51
+ type = activity?(record) ? 'activity' : 'entity'
52
+
53
+ properties = record.properties.compact.each_with_object({}) do |(k, v), h|
54
+ key = PROPERTY_MAP[k.to_sym] || k
55
+ h[key] = v
56
+ end
57
+
58
+ properties[:verb] = VERB_MAP[record.type_id.to_sym] if VERB_MAP.key?(record.type_id.to_sym)
59
+ properties[:represents] = ENTITY_MAP[record.type_id.to_sym] if ENTITY_MAP.key?(record.type_id.to_sym)
60
+
61
+ properties.transform_values! do |v|
62
+ case v
63
+ when Chronicle::Models::Base
64
+ deeply_convert_record(v)
65
+ when Array
66
+ v.map { |e| e.is_a?(Chronicle::Models::Base) ? deeply_convert_record(e) : e }
67
+ else
68
+ v
69
+ end
70
+ end
71
+
72
+ Chronicle::Serialization::Record.new(
73
+ id: record.id,
74
+ type: type,
75
+ properties: properties.compact,
76
+ meta: {
77
+ dedupe_on: transform_dedupe_on(record)
78
+ },
79
+ schema: 'chronobase'
80
+ )
81
+ end
82
+
83
+ def activity?(record)
84
+ record.type_id.end_with?('Action')
85
+ end
86
+
87
# Translate a record's dedupe field sets into the chronobase property names.
#
# Each set in record.dedupe_on is mapped through PROPERTY_MAP (plus a `type`
# entry that becomes :verb for activities and :represents otherwise) and joined
# with commas. Field names are symbolized before the lookup so that String and
# Symbol names are translated alike, matching how deeply_convert_record
# normalises property keys.
#
# @param record [#type_id, #dedupe_on] record being converted
# @return [Array<String>] one comma-joined string per dedupe set
def transform_dedupe_on(record)
  type_key = activity?(record) ? :verb : :represents
  mapping = PROPERTY_MAP.merge(type: type_key)

  record.dedupe_on.map do |fields|
    # Unknown names fall through unchanged.
    fields.map { |field| mapping.fetch(field.to_sym, field) }.join(',')
  end
end
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'chronicle/utils/hash_utils'
4
+
5
+ module Chronicle
6
+ module ETL
7
+ # A transformer that flattens a record's hash and returns a new hash with only its first `limit` fields.
8
+ class FieldsLimitTransformer < Chronicle::ETL::Transformer
9
+ register_connector do |r|
10
+ r.identifier = :fields_limit
11
+ r.description = 'by taking first N fields'
12
+ end
13
+
14
+ setting :limit, type: :numeric, default: 10
15
+
16
+ def transform(record)
17
+ # flatten the hash and then take the first `limit` fields
18
+
19
+ Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h).first(@config.limit).to_h
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
6
+ class FilterFieldsTransformer < Chronicle::ETL::Transformer
7
+ register_connector do |r|
8
+ r.identifier = :filter_fields
9
+ r.description = 'by taking a subset of the fields'
10
+ end
11
+
12
+ setting :fields, type: :array, default: []
13
+
14
# Reduce a record to the subset of fields named in the `fields` setting.
#
# @param record [#data] record whose data responds to #to_h
# @return [Hash] result of filter_hash for the configured field paths
def transform(record)
  # Keys are symbolized (deep_transform_keys) because the path helpers look
  # values up by Symbol.
  symbolized = record.data.to_h.deep_transform_keys(&:to_sym)

  # Pass the field list itself; a block-less `map` only wraps it in an Enumerator.
  filter_hash(symbolized, @config.fields)
end
18
+
19
+ private
20
+
21
# Resolve a dotted path (e.g. 'a.b' or 'a.items[0]') against nested data.
#
# Walks the path one segment at a time:
# - when the current value is an Array, the segment is looked up on every
#   element and nil results are dropped;
# - a segment of the form `name[i]` reads `name` and then element `i` of the
#   resulting Array (nil if it is not an Array);
# - otherwise the segment is read with #dig.
# A path that walks through a value that cannot be dug into (a String, a
# number, nil) yields nil instead of raising.
#
# @param data [Hash] data with Symbol keys
# @param path [String] dot-separated path
# @return [Object, nil] the value found, or nil
def access_nested_value(data, path)
  path.split('.').reduce(data) do |current, segment|
    if current.is_a?(Array)
      current.map do |item|
        item[segment.to_sym]
      rescue StandardError
        # Elements that cannot be indexed by this key simply contribute nothing.
        nil
      end.compact
    elsif segment.include?('[')
      name, index = segment.split(/\[|\]/).reject(&:empty?)
      list = current.respond_to?(:dig) ? current.dig(name.to_sym) : nil
      list.is_a?(Array) ? list[index.to_i] : nil
    else
      current.respond_to?(:dig) ? current.dig(segment.to_sym) : nil
    end
  end
end
40
+
41
# Build a new nested hash containing only the requested field paths.
#
# For every dotted path in `fields` the value is fetched with
# access_nested_value and stored under the same nesting in the result. Parent
# segments written as `name[i]` are stored under `name`. All keys are Symbols,
# including those derived from indexed segments.
#
# @param original_hash [Hash] source data
# @param fields [Enumerable<String>] dot-separated field paths
# @return [Hash] hash holding just the selected paths
def filter_hash(original_hash, fields)
  fields.each_with_object({}) do |field, result|
    value = access_nested_value(original_hash, field)
    *parents, leaf = field.split('.')

    # Descend through (and create as needed) one hash per parent segment.
    container = parents.reduce(result) do |node, segment|
      name = segment.include?('[') ? segment.split(/\[|\]/).first.to_s : segment
      node[name.to_sym] ||= {}
    end

    container[leaf.to_sym] = value
  end
end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # Return only records that match all the conditions of the filters
6
+ # setting.
7
+ class FilterTransformer < Chronicle::ETL::Transformer
8
+ register_connector do |r|
9
+ r.identifier = :filter
10
+ r.description = 'by only accepting records that match conditions'
11
+ end
12
+
13
+ setting :filters, type: :hash
14
+
15
# Pass a record through only when it satisfies every configured filter.
#
# Each filter key is a dot-separated path; all-digit segments are used as
# Integer indexes, everything else as Symbol keys. The value found at that path
# must equal the filter's value.
#
# @param record [#data] record whose data responds to #to_h
# @return [Object, nil] record.data when all filters match (or none are
#   configured), nil otherwise
def transform(record)
  record_hash = record.data.to_h

  # The setting has no default, so treat an unset value as "no filters".
  (@config.filters || {}).each do |key, expected|
    # to_s lets Symbol keys work; \A/\z anchor the whole segment.
    path = key.to_s.split('.').map do |segment|
      segment.match?(/\A\d+\z/) ? segment.to_i : segment.to_sym
    end

    begin
      return nil unless record_hash.dig(*path) == expected
    rescue TypeError
      # The path walks through a value that cannot be dug into: no match.
      return nil
    end
  end

  record.data
end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class FormatTransformer < Chronicle::ETL::Transformer
6
# Register this transformer under the :format identifier.
register_connector do |r|
  r.identifier = :format
  r.description = 'records to a different hash/json format'
end
10
+
11
+ setting :format, default: nil
12
+
13
+ def transform(record)
14
+ serializer = find_serializer(@config.format)
15
+ serializer.serialize(record.data)
16
+ end
17
+
18
+ private
19
+
20
# Look up the serializer class for a format name.
#
# @param format [String, Symbol, nil] 'jsonld' or 'jsonapi' (Symbols accepted)
# @return [Class] Chronicle::Serialization::JSONLDSerializer or
#   Chronicle::Serialization::JSONAPISerializer
# @raise [RuntimeError] when the format is anything else (including nil)
def find_serializer(format)
  case format.to_s
  when 'jsonld'
    Chronicle::Serialization::JSONLDSerializer
  when 'jsonapi'
    Chronicle::Serialization::JSONAPISerializer
  else
    # Name the offending value so a misconfigured job is easy to diagnose.
    raise "unknown format: #{format.inspect}"
  end
end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MergeMetaTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :merge_meta
8
+ r.description = 'merge extraction meta fields into the record'
9
+ end
10
+
11
# Copy the extraction's meta onto the record's data under the :_meta key.
#
# @param record [#data, #extraction] record being transformed; data must
#   support []= when meta is present
# @return [Object] record.data, with :_meta set when the record has an
#   extraction carrying meta; otherwise record.data untouched
def transform(record)
  meta = record.extraction&.meta
  # Nothing to merge: hand the data back as-is instead of dereferencing a
  # missing extraction.
  return record.data unless meta

  record.data[:_meta] = meta
  record.data
end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class MultiplyTransformer < Chronicle::ETL::Transformer
6
# Register this transformer under the :multiply identifier.
register_connector do |r|
  r.identifier = :multiply
  r.description = 'by repeating each record n times'
end
10
+
11
+ setting :n, default: 2, type: :numeric
12
+
13
# Yield the record's data `n` times (the `n` setting, coerced with to_i).
#
# Returns nil explicitly: Integer#times returns its receiver, and
# Transformer#call treats a non-nil return value from a transform that never
# yielded as data to emit, so n = 0 would otherwise produce a record whose
# data is 0.
#
# @param record [#data] record to repeat
# @yieldparam data [Object] record.data, once per repetition
# @return [nil]
def transform(record)
  @config.n.to_i.times { yield record.data }
  nil
end
19
+ end
20
+ end
21
+ end
@@ -1,18 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  class NullTransformer < Chronicle::ETL::Transformer
4
6
  register_connector do |r|
5
- r.identifier = 'null'
7
+ r.identifier = :null
6
8
  r.description = 'in no way'
7
9
  end
8
10
 
9
- def transform
10
- Chronicle::ETL::Models::Raw.new(@extraction.data)
11
+ def transform(record)
12
+ yield record.data
11
13
  end
12
-
13
- def timestamp; end
14
-
15
- def id; end
16
14
  end
17
15
  end
18
16
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class SamplerTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :sampler
8
+ r.description = 'by taking a sample'
9
+ end
10
+
11
+ setting :percent, default: 10, type: :numeric
12
+
13
# Return the record's data roughly `percent` percent of the time, otherwise nil.
#
# @param record [#data] record being sampled
# @return [Object, nil] record.data when the record is sampled, nil when dropped
def transform(record)
  # Draw a float in [0, 100) so fractional percentages (e.g. 0.5) are honoured;
  # an integer draw can only resolve whole percentage points.
  return unless Random.rand(100.0) < @config.percent

  record.data
end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chronicle
4
+ module ETL
5
+ class SortTransformer < Chronicle::ETL::Transformer
6
+ register_connector do |r|
7
+ r.identifier = :sort
8
+ r.description = 'sorts records by a given field'
9
+ end
10
+
11
+ setting :key, required: true, default: 'id'
12
+ setting :direction, required: false, default: 'desc'
13
+
14
+ def transform(record)
15
+ stash_record(record)
16
+ end
17
+
18
+ def finish
19
+ return unless @stashed_records&.any?
20
+
21
+ sorted = @stashed_records.sort_by do |record|
22
+ value = record.data[@config.key]
23
+ value.nil? ? [1] : [0, value]
24
+ end
25
+
26
+ sorted.reverse! if @config.direction == 'desc'
27
+ sorted
28
+ end
29
+ end
30
+ end
31
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  # Abstract class representing a Transformer for an ETL job
@@ -5,65 +7,85 @@ module Chronicle
5
7
  extend Chronicle::ETL::Registry::SelfRegistering
6
8
  include Chronicle::ETL::Configurable
7
9
 
10
+ attr_reader :stashed_records
11
+
8
12
  # Construct a new instance of this transformer. Options are passed in from a Runner
9
13
  # == Parameters:
10
14
  # options::
11
15
  # Options for configuring this Transformer
12
- def initialize(extraction, options = {})
13
- unless extraction.is_a?(Chronicle::ETL::Extraction)
14
- raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
15
- end
16
-
17
- @extraction = extraction
16
+ def initialize(options = {})
18
17
  apply_options(options)
19
18
  end
20
19
 
21
- # @abstract Subclass is expected to implement #transform
22
- # @!method transform
23
- # The main entrypoint for transforming a record. Called by a Runner on each extracted record
24
-
25
- # The domain or provider-specific id of the record this transformer is working on.
26
- # It is useful for:
27
- # - de-duping records that might exist in the loader's destination
28
- # - building a cursor so an extractor doesn't have to start from the beginning of a
29
- # a source
30
- def id
31
- raise NotImplementedError
20
+ # Called once for each extracted record. Can return 0 or more transformed records.
21
+ def call(record, &block)
22
+ raise ArgumentError, 'Input must be a Chronicle::ETL::Record' unless record.is_a?(Record)
23
+
24
+ yielded = false
25
+
26
+ transformed_data = transform(record) do |data|
27
+ new_record = update_data(record, data)
28
+ block.call(new_record)
29
+
30
+ yielded = true
31
+ end
32
+
33
+ return if yielded
34
+
35
+ # Handle transformers that don't yield anything and return
36
+ # transformed data directly. Skip nil values.
37
+ [transformed_data].flatten.compact.each do |data|
38
+ new_record = update_data(record, data)
39
+ block.call(new_record)
40
+ end
32
41
  end
33
42
 
34
- # The domain or provider-specific timestamp of the record this transformer is working on.
35
- # Used for building a cursor so an extractor doesn't have to start from the beginning of a
36
- # data source from the beginning.
37
- def timestamp
38
- raise NotImplementedError
43
+ def call_finish(&block)
44
+ remaining_records = finish
45
+ return if remaining_records.nil?
46
+
47
+ remaining_records.each do |record|
48
+ block.call(record)
49
+ end
39
50
  end
40
51
 
41
- # An optional, human-readable identifier for a transformation, intended for debugging or logging.
42
- # By default, it is just the id.
43
- def friendly_identifier
44
- id
52
+ def transform(_record)
53
+ raise NotImplementedError, 'You must implement the transform method'
45
54
  end
46
55
 
47
- def to_s
48
- ts = begin
49
- unknown = "???"
50
- timestamp&.iso8601 || unknown
51
- rescue TransformationError, NotImplementedError
52
- unknown
53
- end
56
+ # Called once after runner has processed all records
57
+ def finish; end
54
58
 
55
- identifier = begin
56
- unknown = self.class.to_s
57
- friendly_identifier || self.class.to_s
58
- rescue TransformationError, NotImplementedError
59
- unknown
60
- end
59
+ protected
60
+
61
+ def stash_record(record)
62
+ @stashed_records ||= []
63
+ @stashed_records << record
64
+ nil
65
+ end
66
+
67
# Hand back every stashed record and reset the stash.
#
# The stash is swapped for a fresh array rather than cleared in place, so the
# returned array keeps its contents. Returns an empty array when nothing has
# been stashed yet.
#
# @return [Array] the records stashed so far
def flush_stashed_records
  flushed = @stashed_records || []
  @stashed_records = []
  flushed
end
61
70
 
62
- "[#{ts}] #{identifier}"
71
+ def update_data(record, new_data)
72
+ new_record = record.clone
73
+ new_record.data = new_data
74
+ new_record
63
75
  end
64
76
  end
65
77
  end
66
78
  end
67
79
 
68
80
  require_relative 'null_transformer'
69
- require_relative 'image_file_transformer'
81
+ require_relative 'sampler_transformer'
82
+ require_relative 'buffer_transformer'
83
+ require_relative 'multiply_transformer'
84
+ require_relative 'sort_transformer'
85
+ require_relative 'chronicle_transformer'
86
+ require_relative 'format_transformer'
87
+ require_relative 'filter_fields_transformer'
88
+ require_relative 'fields_limit_transformer'
89
+ require_relative 'merge_meta_transformer'
90
+ require_relative 'filter_transformer'
91
+ require_relative 'chronobase_transformer'
@@ -7,7 +7,7 @@ module Chronicle
7
7
  # Utility methods for dealing with binary files
8
8
  module BinaryAttachments
9
9
  def self.filename_to_base64(filename:, mimetype: nil)
10
- mimetype = mimetype || guess_mimetype(filename: filename)
10
+ mimetype ||= guess_mimetype(filename: filename)
11
11
 
12
12
  "data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}"
13
13
  end
@@ -4,7 +4,6 @@ require 'colorize'
4
4
  module Chronicle
5
5
  module ETL
6
6
  module Utils
7
-
8
7
  class ProgressBar
9
8
  FORMAT_WITH_TOTAL = [
10
9
  ':bar ',
@@ -37,7 +36,7 @@ module Chronicle
37
36
  '/s) '.light_black
38
37
  ].join.freeze
39
38
 
40
- def initialize(title: 'Loading', total:)
39
+ def initialize(total:, title: 'Loading')
41
40
  opts = {
42
41
  clear: true,
43
42
  complete: '▓'.light_blue,
@@ -64,7 +63,7 @@ module Chronicle
64
63
  end
65
64
 
66
65
  def log(message)
67
- message.split("\n").each do |line|
66
+ message.split("\n").each do |_line|
68
67
  @pbar.log message
69
68
  end
70
69
  end
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.5.5"
3
+ VERSION = '0.6.1'.freeze
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,25 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'chronicle/schema'
4
+ require 'chronicle/models/base'
5
+
1
6
  require_relative 'etl/registry/registry'
2
7
  require_relative 'etl/authorizer'
3
8
  require_relative 'etl/config'
4
9
  require_relative 'etl/configurable'
5
10
  require_relative 'etl/exceptions'
6
11
  require_relative 'etl/extraction'
12
+ require_relative 'etl/record'
7
13
  require_relative 'etl/job_definition'
8
14
  require_relative 'etl/job_log'
9
15
  require_relative 'etl/job_logger'
10
16
  require_relative 'etl/job'
11
17
  require_relative 'etl/logger'
12
- require_relative 'etl/models/activity'
13
- require_relative 'etl/models/attachment'
14
- require_relative 'etl/models/base'
15
- require_relative 'etl/models/raw'
16
- require_relative 'etl/models/entity'
17
18
  require_relative 'etl/runner'
18
19
  require_relative 'etl/secrets'
19
- require_relative 'etl/serializers/serializer'
20
20
  require_relative 'etl/utils/binary_attachments'
21
- require_relative 'etl/utils/hash_utilities'
22
- require_relative 'etl/utils/text_recognition'
23
21
  require_relative 'etl/utils/progress_bar'
24
22
  require_relative 'etl/version'
25
23