chronicle-etl 0.2.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.gitignore +3 -0
  4. data/.rubocop.yml +31 -1
  5. data/Guardfile +7 -0
  6. data/README.md +21 -14
  7. data/Rakefile +4 -2
  8. data/chronicle-etl.gemspec +18 -10
  9. data/exe/chronicle-etl +1 -1
  10. data/lib/chronicle/etl/cli/connectors.rb +53 -7
  11. data/lib/chronicle/etl/cli/jobs.rb +59 -24
  12. data/lib/chronicle/etl/cli/main.rb +18 -16
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  14. data/lib/chronicle/etl/cli.rb +7 -0
  15. data/lib/chronicle/etl/config.rb +1 -1
  16. data/lib/chronicle/etl/configurable.rb +150 -0
  17. data/lib/chronicle/etl/exceptions.rb +14 -1
  18. data/lib/chronicle/etl/extraction.rb +12 -0
  19. data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
  20. data/lib/chronicle/etl/extractors/extractor.rb +25 -13
  21. data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
  22. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  23. data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
  24. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  25. data/lib/chronicle/etl/job.rb +30 -29
  26. data/lib/chronicle/etl/job_definition.rb +45 -7
  27. data/lib/chronicle/etl/job_log.rb +10 -0
  28. data/lib/chronicle/etl/job_logger.rb +23 -20
  29. data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
  30. data/lib/chronicle/etl/loaders/loader.rb +5 -2
  31. data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
  32. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  33. data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
  34. data/lib/chronicle/etl/logger.rb +48 -0
  35. data/lib/chronicle/etl/models/attachment.rb +14 -0
  36. data/lib/chronicle/etl/models/base.rb +23 -7
  37. data/lib/chronicle/etl/models/entity.rb +9 -3
  38. data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
  39. data/lib/chronicle/etl/registry/registry.rb +52 -0
  40. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  41. data/lib/chronicle/etl/runner.rb +58 -7
  42. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  43. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  44. data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
  45. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  46. data/lib/chronicle/etl/transformers/transformer.rb +41 -10
  47. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  48. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  49. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  50. data/lib/chronicle/etl/version.rb +1 -1
  51. data/lib/chronicle/etl.rb +8 -2
  52. metadata +146 -34
  53. data/.ruby-version +0 -1
  54. data/Gemfile.lock +0 -91
  55. data/lib/chronicle/etl/catalog.rb +0 -108
  56. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -1,19 +1,20 @@
1
- require 'deep_merge'
1
+ require 'active_support/core_ext/hash/deep_merge'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
5
  class JobDefinition
6
6
  SKELETON_DEFINITION = {
7
+ incremental: false,
7
8
  extractor: {
8
- name: nil,
9
+ name: 'stdin',
9
10
  options: {}
10
11
  },
11
12
  transformer: {
12
- name: nil,
13
+ name: 'null',
13
14
  options: {}
14
15
  },
15
16
  loader: {
16
- name: nil,
17
+ name: 'stdout',
17
18
  options: {}
18
19
  }
19
20
  }.freeze
@@ -26,16 +27,53 @@ module Chronicle
26
27
 
27
28
  # Add config hash to this definition
28
29
  def add_config(config = {})
29
- @definition = config.deep_merge(@definition)
30
+ @definition = @definition.deep_merge(config)
30
31
  load_credentials
31
32
  validate
32
33
  end
33
34
 
35
+ # Is this job continuing from a previous run?
36
+ def incremental?
37
+ @definition[:incremental]
38
+ end
39
+
40
+ def dry_run?
41
+ @definition[:dry_run]
42
+ end
43
+
44
+ def extractor_klass
45
+ load_klass(:extractor, @definition[:extractor][:name])
46
+ end
47
+
48
+ def transformer_klass
49
+ load_klass(:transformer, @definition[:transformer][:name])
50
+ end
51
+
52
+ def loader_klass
53
+ load_klass(:loader, @definition[:loader][:name])
54
+ end
55
+
56
+ def extractor_options
57
+ @definition[:extractor][:options]
58
+ end
59
+
60
+ def transformer_options
61
+ @definition[:transformer][:options]
62
+ end
63
+
64
+ def loader_options
65
+ @definition[:loader][:options]
66
+ end
67
+
34
68
  private
35
69
 
70
+ def load_klass(phase, identifier)
71
+ Chronicle::ETL::Registry.find_by_phase_and_identifier(phase, identifier).klass
72
+ end
73
+
36
74
  def load_credentials
37
- Chronicle::ETL::Catalog::PHASES.each do |phase|
38
- credentials_name = @definition[phase][:options][:credentials]
75
+ Chronicle::ETL::Registry::PHASES.each do |phase|
76
+ credentials_name = @definition[phase].dig(:options, :credentials)
39
77
  if credentials_name
40
78
  credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
41
79
  @definition[phase][:options].deep_merge(credentials)
@@ -50,11 +50,21 @@ module Chronicle
50
50
  @success = true
51
51
  end
52
52
 
53
+ def error
54
+ @finished_at = Time.now
55
+ end
56
+
53
57
  def job= job
54
58
  @job = job
55
59
  @job_id = job.id
56
60
  end
57
61
 
62
+ def duration
63
+ return unless @finished_at
64
+
65
+ @finished_at - @started_at
66
+ end
67
+
58
68
  # Take a JobLog's instance variables and turn them into a hash representation
59
69
  def serialize
60
70
  {
@@ -1,32 +1,14 @@
1
1
  require 'sequel'
2
2
  require 'forwardable'
3
3
 
4
- require 'pry'
5
-
6
4
  module Chronicle
7
5
  module ETL
8
6
  # Saves JobLogs to db and loads previous ones
9
7
  class JobLogger
10
8
  extend Forwardable
11
9
 
12
- def_delegators :@job_log, :start, :finish, :log_transformation
13
-
14
- # Create a new JobLogger
15
- def initialize(job)
16
- @job_log = JobLog.new do |job_log|
17
- job_log.job = job
18
- end
19
- end
20
-
21
- # Save this JobLogger's JobLog to db
22
- def save
23
- return unless @job_log.save_log?
24
-
25
- JobLogger.with_db_connection do |db|
26
- dataset = db[:job_logs]
27
- dataset.insert(@job_log.serialize)
28
- end
29
- end
10
+ def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success
11
+ attr_accessor :job_log
30
12
 
31
13
  # For a given `job_id`, return the last successful log
32
14
  def self.load_latest(job_id)
@@ -73,6 +55,27 @@ module Chronicle
73
55
  Time :finished_at
74
56
  end
75
57
  end
58
+
59
+ # Create a new JobLogger
60
+ def initialize(job)
61
+ @job_log = JobLog.new do |job_log|
62
+ job_log.job = job
63
+ end
64
+ end
65
+
66
+ # Save this JobLogger's JobLog to db
67
+ def save
68
+ return unless @job_log.save_log?
69
+
70
+ JobLogger.with_db_connection do |db|
71
+ dataset = db[:job_logs]
72
+ dataset.insert(@job_log.serialize)
73
+ end
74
+ end
75
+
76
+ def summarize
77
+ @job_log.inspect
78
+ end
76
79
  end
77
80
  end
78
81
  end
@@ -2,7 +2,11 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvLoader < Chronicle::ETL::Loader
5
+ class CSVLoader < Chronicle::ETL::Loader
6
+ register_connector do |r|
7
+ r.description = 'CSV'
8
+ end
9
+
6
10
  def initialize(options={})
7
11
  super(options)
8
12
  @rows = []
@@ -2,14 +2,17 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
7
+
8
+ setting :output
6
9
 
7
10
  # Construct a new instance of this loader. Options are passed in from a Runner
8
11
  # == Parameters:
9
12
  # options::
10
13
  # Options for configuring this Loader
11
14
  def initialize(options = {})
12
- @options = options
15
+ apply_options(options)
13
16
  end
14
17
 
15
18
  # Called once before processing records
@@ -5,19 +5,23 @@ require 'json'
5
5
  module Chronicle
6
6
  module ETL
7
7
  class RestLoader < Chronicle::ETL::Loader
8
- def initialize( options={} )
9
- super(options)
8
+ register_connector do |r|
9
+ r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
+
12
16
  def load(record)
13
- payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
17
+ payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
14
18
  # have the outer data key that json-api expects
15
19
  payload = { data: payload } unless payload[:data]
16
20
 
17
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
18
22
 
19
23
  header = {
20
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
21
25
  "Content-Type": 'application/json'
22
26
  }
23
27
  use_ssl = uri.scheme == 'https'
@@ -1,8 +1,13 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class StdoutLoader < Chronicle::ETL::Loader
4
+ register_connector do |r|
5
+ r.description = 'stdout'
6
+ end
7
+
4
8
  def load(record)
5
- puts record.to_h
9
+ serializer = Chronicle::ETL::JSONAPISerializer.new(record)
10
+ puts serializer.serializable_hash.to_json
6
11
  end
7
12
  end
8
13
  end
@@ -1,21 +1,65 @@
1
1
  require 'tty/table'
2
+ require 'active_support/core_ext/string/filters'
3
+ require 'active_support/core_ext/hash/reverse_merge'
2
4
 
3
5
  module Chronicle
4
6
  module ETL
5
7
  class TableLoader < Chronicle::ETL::Loader
6
- def initialize(options)
7
- super(options)
8
+ register_connector do |r|
9
+ r.description = 'an ASCII table'
8
10
  end
9
11
 
12
+ setting :fields_limit, default: nil
13
+ setting :fields_exclude, default: ['lids', 'type']
14
+ setting :fields_include, default: []
15
+ setting :truncate_values_at, default: 40
16
+ setting :table_renderer, default: :basic
17
+
10
18
  def load(record)
11
- record_hash = record.to_h_flattened
12
- @table ||= TTY::Table.new(header: record_hash.keys)
13
- values = record_hash.values.map{|x| x.to_s[0..30]}
14
- @table << values
19
+ @records ||= []
20
+ @records << record.to_h_flattened
15
21
  end
16
22
 
17
23
  def finish
18
- puts @table.render(:ascii, padding: [0, 1]) if @table
24
+ return if @records.empty?
25
+
26
+ headers = build_headers(@records)
27
+ rows = build_rows(@records, headers)
28
+
29
+ @table = TTY::Table.new(header: headers, rows: rows)
30
+ puts @table.render(
31
+ @config.table_renderer.to_sym,
32
+ padding: [0, 2, 0, 0]
33
+ )
34
+ end
35
+
36
+ private
37
+
38
+ def build_headers(records)
39
+ headers =
40
+ if @config.fields_include.any?
41
+ Set[*@config.fields_include]
42
+ else
43
+ # use all the keys of the flattened record hash
44
+ Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
45
+ end
46
+
47
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
48
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
49
+
50
+ headers.to_a.map(&:to_sym)
51
+ end
52
+
53
+ def build_rows(records, headers)
54
+ records.map do |record|
55
+ values = record.values_at(*headers).map{|value| value.to_s }
56
+
57
+ if @config.truncate_values_at
58
+ values = values.map{ |value| value.truncate(@config.truncate_values_at) }
59
+ end
60
+
61
+ values
62
+ end
19
63
  end
20
64
  end
21
65
  end
@@ -0,0 +1,48 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Logger
4
+ extend self
5
+
6
+ DEBUG = 0
7
+ INFO = 1
8
+ WARN = 2
9
+ ERROR = 3
10
+ FATAL = 4
11
+
12
+ attr_accessor :log_level
13
+
14
+ @log_level = INFO
15
+ @destination = $stderr
16
+
17
+ def output message, level
18
+ return unless level >= @log_level
19
+
20
+ if @progress_bar
21
+ @progress_bar.log(message)
22
+ else
23
+ @destination.puts(message)
24
+ end
25
+ end
26
+
27
+ def error(message)
28
+ output(message, ERROR)
29
+ end
30
+
31
+ def info(message)
32
+ output(message, INFO)
33
+ end
34
+
35
+ def debug(message)
36
+ output(message, DEBUG)
37
+ end
38
+
39
+ def attach_to_progress_bar(progress_bar)
40
+ @progress_bar = progress_bar
41
+ end
42
+
43
+ def detach_from_progress_bar
44
+ @progress_bar = nil
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,14 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Attachment < Chronicle::ETL::Models::Base
7
+ TYPE = 'attachments'.freeze
8
+ ATTRIBUTES = [:url_original, :data].freeze
9
+
10
+ attr_accessor(*ATTRIBUTES)
11
+ end
12
+ end
13
+ end
14
+ end
@@ -6,7 +6,7 @@ module Chronicle
6
6
  # Represents a record that's been transformed by a Transformer and
7
7
  # ready to be loaded. Loosely based on ActiveModel.
8
8
  class Base
9
- ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
9
+ ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
10
10
  ASSOCIATIONS = [].freeze
11
11
 
12
12
  attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
@@ -14,6 +14,7 @@ module Chronicle
14
14
  def initialize(attributes = {})
15
15
  assign_attributes(attributes) if attributes
16
16
  @dedupe_on = []
17
+ @metadata = {}
17
18
  end
18
19
 
19
20
  # A unique identifier for this model is formed from a type
@@ -36,6 +37,8 @@ module Chronicle
36
37
  # For a given set of fields of this model, generate a
37
38
  # unique local id by hashing the field values
38
39
  def generate_lid fields
40
+ raise ArgumentError.new("Must provide an array of symbolized fields") unless fields.is_a?(Array)
41
+
39
42
  values = fields.sort.map do |field|
40
43
  instance_variable = "@#{field.to_s}"
41
44
  self.instance_variable_get(instance_variable)
@@ -75,9 +78,21 @@ module Chronicle
75
78
  end
76
79
 
77
80
  def associations_hash
78
- Hash[associations.map do |k, v|
79
- [k, v.to_h]
80
- end]
81
+ associations.map do |k, v|
82
+ if v.is_a?(Array)
83
+ [k, v.map(&:to_h)]
84
+ else
85
+ [k, v.to_h]
86
+ end
87
+ end.to_h
88
+ end
89
+
90
+ def meta_hash
91
+ {
92
+ meta: {
93
+ dedupe_on: @dedupe_on.map{|d| d.map(&:to_s).join(",")}
94
+ }
95
+ }
81
96
  end
82
97
 
83
98
  # FIXME: move this to a Utils module
@@ -86,11 +101,12 @@ module Chronicle
86
101
  end
87
102
 
88
103
  def to_h
89
- identifier_hash.merge(attributes).merge(associations_hash)
104
+ identifier_hash
105
+ .merge(attributes)
106
+ .merge(associations_hash)
107
+ .merge(meta_hash)
90
108
  end
91
109
 
92
- private
93
-
94
110
  def assign_attributes attributes
95
111
  attributes.each do |k, v|
96
112
  setter = :"#{k}="
@@ -5,10 +5,16 @@ module Chronicle
5
5
  module Models
6
6
  class Entity < Chronicle::ETL::Models::Base
7
7
  TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :represents, :slug].freeze
9
- ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema
8
+ ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
9
+ ASSOCIATIONS = [
10
+ :attachments,
11
+ :abouts,
12
+ :depicts,
13
+ :consumers,
14
+ :contains
15
+ ].freeze # TODO: add these to reflect Chronicle Schema
10
16
 
11
- attr_accessor(*ATTRIBUTES)
17
+ attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
12
18
  end
13
19
  end
14
20
  end
@@ -0,0 +1,62 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Registry
4
+ # Records details about a connector such as its provider and a description
5
+ class ConnectorRegistration
6
+ # FIXME: refactor custom accessor methods later in file
7
+ attr_accessor :identifier, :provider, :klass, :description
8
+
9
+ def initialize(klass)
10
+ @klass = klass
11
+ end
12
+
13
+ def phase
14
+ if klass.ancestors.include? Chronicle::ETL::Extractor
15
+ :extractor
16
+ elsif klass.ancestors.include? Chronicle::ETL::Transformer
17
+ :transformer
18
+ elsif klass.ancestors.include? Chronicle::ETL::Loader
19
+ :loader
20
+ end
21
+ end
22
+
23
+ def to_s
24
+ "#{phase}-#{identifier}"
25
+ end
26
+
27
+ def built_in?
28
+ @klass.to_s.include? 'Chronicle::ETL'
29
+ end
30
+
31
+ def klass_name
32
+ @klass.to_s
33
+ end
34
+
35
+ def identifier
36
+ @identifier || @klass.to_s.split('::').last.gsub!(/(Extractor$|Loader$|Transformer$)/, '').downcase
37
+ end
38
+
39
+ def description
40
+ @description || @klass.to_s.split('::').last
41
+ end
42
+
43
+ def provider
44
+ @provider || (built_in? ? 'chronicle' : '')
45
+ end
46
+
47
+ def descriptive_phrase
48
+ prefix = case phase
49
+ when :extractor
50
+ "Extracts from"
51
+ when :transformer
52
+ "Transforms"
53
+ when :loader
54
+ "Loads to"
55
+ end
56
+
57
+ "#{prefix} #{description}"
58
+ end
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A singleton module that acts as a registry of connector classes available for ETL jobs
6
+ module Registry
7
+ PHASES = [:extractor, :transformer, :loader]
8
+
9
+ class << self
10
+ attr_accessor :connectors
11
+
12
+ def load_all!
13
+ load_connectors_from_gems
14
+ end
15
+
16
+ def load_connectors_from_gems
17
+ Gem::Specification.filter{|s| s.name.match(/^chronicle/) }.each do |gem|
18
+ require_str = gem.name.gsub('chronicle-', 'chronicle/')
19
+ require require_str rescue LoadError
20
+ end
21
+ end
22
+
23
+ def install_connector name
24
+ gem_name = "chronicle-#{name}"
25
+ Gem.install(gem_name)
26
+ end
27
+
28
+ def register connector
29
+ @connectors ||= []
30
+ @connectors << connector
31
+ end
32
+
33
+ def find_by_phase_and_identifier(phase, identifier)
34
+ connector = find_within_loaded_connectors(phase, identifier)
35
+ unless connector
36
+ # Only load external connectors (slow) if not found in built-in connectors
37
+ load_all!
38
+ connector = find_within_loaded_connectors(phase, identifier)
39
+ end
40
+ connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
41
+ end
42
+
43
+ def find_within_loaded_connectors(phase, identifier)
44
+ @connectors.find { |c| c.phase == phase && c.identifier == identifier }
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
1
+ require 'forwardable'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Registry
6
+ # Gives a connector class the ability to let the Chronicle::ETL::Registry
7
+ # know about itself
8
+ module SelfRegistering
9
+ extend Forwardable
10
+
11
+ attr_accessor :connector_registration
12
+
13
+ def_delegators :@connector_registration, :description, :provider, :identifier
14
+
15
+ # Creates a ConnectorRegistration for this connector's details and registers it
16
+ # into the Registry
17
+ def register_connector
18
+ @connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
19
+ yield @connector_registration if block_given?
20
+ ::Chronicle::ETL::Registry.register(@connector_registration)
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end