chronicle-etl 0.2.4 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +35 -0
  3. data/.gitignore +3 -0
  4. data/.rubocop.yml +31 -1
  5. data/Guardfile +7 -0
  6. data/README.md +21 -14
  7. data/Rakefile +4 -2
  8. data/chronicle-etl.gemspec +18 -10
  9. data/exe/chronicle-etl +1 -1
  10. data/lib/chronicle/etl/cli/connectors.rb +53 -7
  11. data/lib/chronicle/etl/cli/jobs.rb +59 -24
  12. data/lib/chronicle/etl/cli/main.rb +18 -16
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
  14. data/lib/chronicle/etl/cli.rb +7 -0
  15. data/lib/chronicle/etl/config.rb +1 -1
  16. data/lib/chronicle/etl/configurable.rb +150 -0
  17. data/lib/chronicle/etl/exceptions.rb +14 -1
  18. data/lib/chronicle/etl/extraction.rb +12 -0
  19. data/lib/chronicle/etl/extractors/csv_extractor.rb +32 -31
  20. data/lib/chronicle/etl/extractors/extractor.rb +25 -13
  21. data/lib/chronicle/etl/extractors/file_extractor.rb +17 -32
  22. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  23. data/lib/chronicle/etl/extractors/json_extractor.rb +37 -0
  24. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  25. data/lib/chronicle/etl/job.rb +30 -29
  26. data/lib/chronicle/etl/job_definition.rb +45 -7
  27. data/lib/chronicle/etl/job_log.rb +10 -0
  28. data/lib/chronicle/etl/job_logger.rb +23 -20
  29. data/lib/chronicle/etl/loaders/csv_loader.rb +5 -1
  30. data/lib/chronicle/etl/loaders/loader.rb +5 -2
  31. data/lib/chronicle/etl/loaders/rest_loader.rb +9 -5
  32. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  33. data/lib/chronicle/etl/loaders/table_loader.rb +51 -7
  34. data/lib/chronicle/etl/logger.rb +48 -0
  35. data/lib/chronicle/etl/models/attachment.rb +14 -0
  36. data/lib/chronicle/etl/models/base.rb +23 -7
  37. data/lib/chronicle/etl/models/entity.rb +9 -3
  38. data/lib/chronicle/etl/registry/connector_registration.rb +62 -0
  39. data/lib/chronicle/etl/registry/registry.rb +52 -0
  40. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  41. data/lib/chronicle/etl/runner.rb +58 -7
  42. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  43. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  44. data/lib/chronicle/etl/transformers/image_file_transformer.rb +247 -0
  45. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  46. data/lib/chronicle/etl/transformers/transformer.rb +41 -10
  47. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  48. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  49. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  50. data/lib/chronicle/etl/version.rb +1 -1
  51. data/lib/chronicle/etl.rb +8 -2
  52. metadata +146 -34
  53. data/.ruby-version +0 -1
  54. data/Gemfile.lock +0 -91
  55. data/lib/chronicle/etl/catalog.rb +0 -108
  56. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
@@ -1,19 +1,20 @@
1
- require 'deep_merge'
1
+ require 'active_support/core_ext/hash/deep_merge'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
5
  class JobDefinition
6
6
  SKELETON_DEFINITION = {
7
+ incremental: false,
7
8
  extractor: {
8
- name: nil,
9
+ name: 'stdin',
9
10
  options: {}
10
11
  },
11
12
  transformer: {
12
- name: nil,
13
+ name: 'null',
13
14
  options: {}
14
15
  },
15
16
  loader: {
16
- name: nil,
17
+ name: 'stdout',
17
18
  options: {}
18
19
  }
19
20
  }.freeze
@@ -26,16 +27,53 @@ module Chronicle
26
27
 
27
28
  # Add config hash to this definition
28
29
  def add_config(config = {})
29
- @definition = config.deep_merge(@definition)
30
+ @definition = @definition.deep_merge(config)
30
31
  load_credentials
31
32
  validate
32
33
  end
33
34
 
35
+ # Is this job continuing from a previous run?
36
+ def incremental?
37
+ @definition[:incremental]
38
+ end
39
+
40
+ def dry_run?
41
+ @definition[:dry_run]
42
+ end
43
+
44
+ def extractor_klass
45
+ load_klass(:extractor, @definition[:extractor][:name])
46
+ end
47
+
48
+ def transformer_klass
49
+ load_klass(:transformer, @definition[:transformer][:name])
50
+ end
51
+
52
+ def loader_klass
53
+ load_klass(:loader, @definition[:loader][:name])
54
+ end
55
+
56
+ def extractor_options
57
+ @definition[:extractor][:options]
58
+ end
59
+
60
+ def transformer_options
61
+ @definition[:transformer][:options]
62
+ end
63
+
64
+ def loader_options
65
+ @definition[:loader][:options]
66
+ end
67
+
34
68
  private
35
69
 
70
+ def load_klass(phase, identifier)
71
+ Chronicle::ETL::Registry.find_by_phase_and_identifier(phase, identifier).klass
72
+ end
73
+
36
74
  def load_credentials
37
- Chronicle::ETL::Catalog::PHASES.each do |phase|
38
- credentials_name = @definition[phase][:options][:credentials]
75
+ Chronicle::ETL::Registry::PHASES.each do |phase|
76
+ credentials_name = @definition[phase].dig(:options, :credentials)
39
77
  if credentials_name
40
78
  credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
41
79
  @definition[phase][:options].deep_merge(credentials)
@@ -50,11 +50,21 @@ module Chronicle
50
50
  @success = true
51
51
  end
52
52
 
53
+ def error
54
+ @finished_at = Time.now
55
+ end
56
+
53
57
  def job= job
54
58
  @job = job
55
59
  @job_id = job.id
56
60
  end
57
61
 
62
+ def duration
63
+ return unless @finished_at
64
+
65
+ @finished_at - @started_at
66
+ end
67
+
58
68
  # Take a JobLog's instance variables and turn them into a hash representation
59
69
  def serialize
60
70
  {
@@ -1,32 +1,14 @@
1
1
  require 'sequel'
2
2
  require 'forwardable'
3
3
 
4
- require 'pry'
5
-
6
4
  module Chronicle
7
5
  module ETL
8
6
  # Saves JobLogs to db and loads previous ones
9
7
  class JobLogger
10
8
  extend Forwardable
11
9
 
12
- def_delegators :@job_log, :start, :finish, :log_transformation
13
-
14
- # Create a new JobLogger
15
- def initialize(job)
16
- @job_log = JobLog.new do |job_log|
17
- job_log.job = job
18
- end
19
- end
20
-
21
- # Save this JobLogger's JobLog to db
22
- def save
23
- return unless @job_log.save_log?
24
-
25
- JobLogger.with_db_connection do |db|
26
- dataset = db[:job_logs]
27
- dataset.insert(@job_log.serialize)
28
- end
29
- end
10
+ def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success
11
+ attr_accessor :job_log
30
12
 
31
13
  # For a given `job_id`, return the last successful log
32
14
  def self.load_latest(job_id)
@@ -73,6 +55,27 @@ module Chronicle
73
55
  Time :finished_at
74
56
  end
75
57
  end
58
+
59
+ # Create a new JobLogger
60
+ def initialize(job)
61
+ @job_log = JobLog.new do |job_log|
62
+ job_log.job = job
63
+ end
64
+ end
65
+
66
+ # Save this JobLogger's JobLog to db
67
+ def save
68
+ return unless @job_log.save_log?
69
+
70
+ JobLogger.with_db_connection do |db|
71
+ dataset = db[:job_logs]
72
+ dataset.insert(@job_log.serialize)
73
+ end
74
+ end
75
+
76
+ def summarize
77
+ @job_log.inspect
78
+ end
76
79
  end
77
80
  end
78
81
  end
@@ -2,7 +2,11 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvLoader < Chronicle::ETL::Loader
5
+ class CSVLoader < Chronicle::ETL::Loader
6
+ register_connector do |r|
7
+ r.description = 'CSV'
8
+ end
9
+
6
10
  def initialize(options={})
7
11
  super(options)
8
12
  @rows = []
@@ -2,14 +2,17 @@ module Chronicle
2
2
  module ETL
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
- extend Chronicle::ETL::Catalog
5
+ extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
7
+
8
+ setting :output
6
9
 
7
10
  # Construct a new instance of this loader. Options are passed in from a Runner
8
11
  # == Parameters:
9
12
  # options::
10
13
  # Options for configuring this Loader
11
14
  def initialize(options = {})
12
- @options = options
15
+ apply_options(options)
13
16
  end
14
17
 
15
18
  # Called once before processing records
@@ -5,19 +5,23 @@ require 'json'
5
5
  module Chronicle
6
6
  module ETL
7
7
  class RestLoader < Chronicle::ETL::Loader
8
- def initialize( options={} )
9
- super(options)
8
+ register_connector do |r|
9
+ r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
+
12
16
  def load(record)
13
- payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
17
+ payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
14
18
  # have the outer data key that json-api expects
15
19
  payload = { data: payload } unless payload[:data]
16
20
 
17
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
18
22
 
19
23
  header = {
20
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
21
25
  "Content-Type": 'application/json'
22
26
  }
23
27
  use_ssl = uri.scheme == 'https'
@@ -1,8 +1,13 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class StdoutLoader < Chronicle::ETL::Loader
4
+ register_connector do |r|
5
+ r.description = 'stdout'
6
+ end
7
+
4
8
  def load(record)
5
- puts record.to_h
9
+ serializer = Chronicle::ETL::JSONAPISerializer.new(record)
10
+ puts serializer.serializable_hash.to_json
6
11
  end
7
12
  end
8
13
  end
@@ -1,21 +1,65 @@
1
1
  require 'tty/table'
2
+ require 'active_support/core_ext/string/filters'
3
+ require 'active_support/core_ext/hash/reverse_merge'
2
4
 
3
5
  module Chronicle
4
6
  module ETL
5
7
  class TableLoader < Chronicle::ETL::Loader
6
- def initialize(options)
7
- super(options)
8
+ register_connector do |r|
9
+ r.description = 'an ASCII table'
8
10
  end
9
11
 
12
+ setting :fields_limit, default: nil
13
+ setting :fields_exclude, default: ['lids', 'type']
14
+ setting :fields_include, default: []
15
+ setting :truncate_values_at, default: 40
16
+ setting :table_renderer, default: :basic
17
+
10
18
  def load(record)
11
- record_hash = record.to_h_flattened
12
- @table ||= TTY::Table.new(header: record_hash.keys)
13
- values = record_hash.values.map{|x| x.to_s[0..30]}
14
- @table << values
19
+ @records ||= []
20
+ @records << record.to_h_flattened
15
21
  end
16
22
 
17
23
  def finish
18
- puts @table.render(:ascii, padding: [0, 1]) if @table
24
+ return if @records.empty?
25
+
26
+ headers = build_headers(@records)
27
+ rows = build_rows(@records, headers)
28
+
29
+ @table = TTY::Table.new(header: headers, rows: rows)
30
+ puts @table.render(
31
+ @config.table_renderer.to_sym,
32
+ padding: [0, 2, 0, 0]
33
+ )
34
+ end
35
+
36
+ private
37
+
38
+ def build_headers(records)
39
+ headers =
40
+ if @config.fields_include.any?
41
+ Set[*@config.fields_include]
42
+ else
43
+ # use all the keys of the flattened record hash
44
+ Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
45
+ end
46
+
47
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
48
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
49
+
50
+ headers.to_a.map(&:to_sym)
51
+ end
52
+
53
+ def build_rows(records, headers)
54
+ records.map do |record|
55
+ values = record.values_at(*headers).map{|value| value.to_s }
56
+
57
+ if @config.truncate_values_at
58
+ values = values.map{ |value| value.truncate(@config.truncate_values_at) }
59
+ end
60
+
61
+ values
62
+ end
19
63
  end
20
64
  end
21
65
  end
@@ -0,0 +1,48 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Logger
4
+ extend self
5
+
6
+ DEBUG = 0
7
+ INFO = 1
8
+ WARN = 2
9
+ ERROR = 3
10
+ FATAL = 4
11
+
12
+ attr_accessor :log_level
13
+
14
+ @log_level = INFO
15
+ @destination = $stderr
16
+
17
+ def output message, level
18
+ return unless level >= @log_level
19
+
20
+ if @progress_bar
21
+ @progress_bar.log(message)
22
+ else
23
+ @destination.puts(message)
24
+ end
25
+ end
26
+
27
+ def error(message)
28
+ output(message, ERROR)
29
+ end
30
+
31
+ def info(message)
32
+ output(message, INFO)
33
+ end
34
+
35
+ def debug(message)
36
+ output(message, DEBUG)
37
+ end
38
+
39
+ def attach_to_progress_bar(progress_bar)
40
+ @progress_bar = progress_bar
41
+ end
42
+
43
+ def detach_from_progress_bar
44
+ @progress_bar = nil
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,14 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Attachment < Chronicle::ETL::Models::Base
7
+ TYPE = 'attachments'.freeze
8
+ ATTRIBUTES = [:url_original, :data].freeze
9
+
10
+ attr_accessor(*ATTRIBUTES)
11
+ end
12
+ end
13
+ end
14
+ end
@@ -6,7 +6,7 @@ module Chronicle
6
6
  # Represents a record that's been transformed by a Transformer and
7
7
  # is ready to be loaded. Loosely based on ActiveModel.
8
8
  class Base
9
- ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
9
+ ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
10
10
  ASSOCIATIONS = [].freeze
11
11
 
12
12
  attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
@@ -14,6 +14,7 @@ module Chronicle
14
14
  def initialize(attributes = {})
15
15
  assign_attributes(attributes) if attributes
16
16
  @dedupe_on = []
17
+ @metadata = {}
17
18
  end
18
19
 
19
20
  # A unique identifier for this model is formed from a type
@@ -36,6 +37,8 @@ module Chronicle
36
37
  # For a given set of fields of this model, generate a
37
38
  # unique local id by hashing the field values
38
39
  def generate_lid fields
40
+ raise ArgumentError.new("Must provide an array of symbolized fields") unless fields.is_a?(Array)
41
+
39
42
  values = fields.sort.map do |field|
40
43
  instance_variable = "@#{field.to_s}"
41
44
  self.instance_variable_get(instance_variable)
@@ -75,9 +78,21 @@ module Chronicle
75
78
  end
76
79
 
77
80
  def associations_hash
78
- Hash[associations.map do |k, v|
79
- [k, v.to_h]
80
- end]
81
+ associations.map do |k, v|
82
+ if v.is_a?(Array)
83
+ [k, v.map(&:to_h)]
84
+ else
85
+ [k, v.to_h]
86
+ end
87
+ end.to_h
88
+ end
89
+
90
+ def meta_hash
91
+ {
92
+ meta: {
93
+ dedupe_on: @dedupe_on.map{|d| d.map(&:to_s).join(",")}
94
+ }
95
+ }
81
96
  end
82
97
 
83
98
  # FIXME: move this to a Utils module
@@ -86,11 +101,12 @@ module Chronicle
86
101
  end
87
102
 
88
103
  def to_h
89
- identifier_hash.merge(attributes).merge(associations_hash)
104
+ identifier_hash
105
+ .merge(attributes)
106
+ .merge(associations_hash)
107
+ .merge(meta_hash)
90
108
  end
91
109
 
92
- private
93
-
94
110
  def assign_attributes attributes
95
111
  attributes.each do |k, v|
96
112
  setter = :"#{k}="
@@ -5,10 +5,16 @@ module Chronicle
5
5
  module Models
6
6
  class Entity < Chronicle::ETL::Models::Base
7
7
  TYPE = 'entities'.freeze
8
- ATTRIBUTES = [:title, :body, :represents, :slug].freeze
9
- ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema
8
+ ATTRIBUTES = [:title, :body, :represents, :slug, :myself, :metadata].freeze
9
+ ASSOCIATIONS = [
10
+ :attachments,
11
+ :abouts,
12
+ :depicts,
13
+ :consumers,
14
+ :contains
15
+ ].freeze # TODO: add these to reflect Chronicle Schema
10
16
 
11
- attr_accessor(*ATTRIBUTES)
17
+ attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
12
18
  end
13
19
  end
14
20
  end
@@ -0,0 +1,62 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Registry
4
+ # Records details about a connector such as its provider and a description
5
+ class ConnectorRegistration
6
+ # FIXME: refactor custom accessor methods later in file
7
+ attr_accessor :identifier, :provider, :klass, :description
8
+
9
+ def initialize(klass)
10
+ @klass = klass
11
+ end
12
+
13
+ def phase
14
+ if klass.ancestors.include? Chronicle::ETL::Extractor
15
+ :extractor
16
+ elsif klass.ancestors.include? Chronicle::ETL::Transformer
17
+ :transformer
18
+ elsif klass.ancestors.include? Chronicle::ETL::Loader
19
+ :loader
20
+ end
21
+ end
22
+
23
+ def to_s
24
+ "#{phase}-#{identifier}"
25
+ end
26
+
27
+ def built_in?
28
+ @klass.to_s.include? 'Chronicle::ETL'
29
+ end
30
+
31
+ def klass_name
32
+ @klass.to_s
33
+ end
34
+
35
+ def identifier
36
+ @identifier || @klass.to_s.split('::').last.gsub!(/(Extractor$|Loader$|Transformer$)/, '').downcase
37
+ end
38
+
39
+ def description
40
+ @description || @klass.to_s.split('::').last
41
+ end
42
+
43
+ def provider
44
+ @provider || (built_in? ? 'chronicle' : '')
45
+ end
46
+
47
+ def descriptive_phrase
48
+ prefix = case phase
49
+ when :extractor
50
+ "Extracts from"
51
+ when :transformer
52
+ "Transforms"
53
+ when :loader
54
+ "Loads to"
55
+ end
56
+
57
+ "#{prefix} #{description}"
58
+ end
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ # A singleton module that acts as a registry of connector classes available for ETL jobs
6
+ module Registry
7
+ PHASES = [:extractor, :transformer, :loader]
8
+
9
+ class << self
10
+ attr_accessor :connectors
11
+
12
+ def load_all!
13
+ load_connectors_from_gems
14
+ end
15
+
16
+ def load_connectors_from_gems
17
+ Gem::Specification.filter{|s| s.name.match(/^chronicle/) }.each do |gem|
18
+ require_str = gem.name.gsub('chronicle-', 'chronicle/')
19
+ require require_str rescue LoadError
20
+ end
21
+ end
22
+
23
+ def install_connector name
24
+ gem_name = "chronicle-#{name}"
25
+ Gem.install(gem_name)
26
+ end
27
+
28
+ def register connector
29
+ @connectors ||= []
30
+ @connectors << connector
31
+ end
32
+
33
+ def find_by_phase_and_identifier(phase, identifier)
34
+ connector = find_within_loaded_connectors(phase, identifier)
35
+ unless connector
36
+ # Only load external connectors (slow) if not found in built-in connectors
37
+ load_all!
38
+ connector = find_within_loaded_connectors(phase, identifier)
39
+ end
40
+ connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
41
+ end
42
+
43
+ def find_within_loaded_connectors(phase, identifier)
44
+ @connectors.find { |c| c.phase == phase && c.identifier == identifier }
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
1
+ require 'forwardable'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Registry
6
+ # Gives a connector class the ability to let the Chronicle::ETL::Registry
7
+ # know about itself
8
+ module SelfRegistering
9
+ extend Forwardable
10
+
11
+ attr_accessor :connector_registration
12
+
13
+ def_delegators :@connector_registration, :description, :provider, :identifier
14
+
15
+ # Creates a ConnectorRegistration for this connector's details and registers it
16
+ # into the Registry
17
+ def register_connector
18
+ @connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
19
+ yield @connector_registration if block_given?
20
+ ::Chronicle::ETL::Registry.register(@connector_registration)
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end