chronicle-etl 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +13 -7
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -26
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +13 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +25 -4
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +66 -24
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +11 -3
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -13
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +16 -1
  48. metadata +139 -36
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -102
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -1,20 +1,71 @@
require 'tty/table'
require 'active_support/core_ext/string/filters'
require 'active_support/core_ext/hash/reverse_merge'

module Chronicle
  module ETL
    # Loader that buffers the flattened hash of every record it receives and
    # prints them all as one table on stdout when the job finishes.
    class TableLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.description = 'an ASCII table'
      end

      # fields_limit::       when set, only the first N headers are kept
      # fields_exclude::     headers ending with any of these strings are dropped
      # fields_include::     when non-empty, used as the header list instead of
      #                      the keys found in the records
      # truncate_values_at:: when set, passed to String#truncate for each cell
      # table_renderer::     renderer name handed to TTY::Table#render
      DEFAULT_OPTIONS = {
        fields_limit: nil,
        fields_exclude: ['lids', 'type'],
        fields_include: [],
        truncate_values_at: nil,
        table_renderer: :basic
      }.freeze

      # @param options [Hash] overrides for DEFAULT_OPTIONS
      def initialize(options = {})
        @options = options.reverse_merge(DEFAULT_OPTIONS)
        @records = []
      end

      # Buffers one record; nothing is printed until #finish.
      def load(record)
        @records << record.to_h_flattened
      end

      # Renders every buffered record. Prints nothing when no record was loaded.
      def finish
        return if @records.empty?

        headers = build_headers(@records)
        rows = build_rows(@records, headers)

        @table = TTY::Table.new(header: headers, rows: rows)
        puts @table.render(
          @options[:table_renderer].to_sym,
          padding: [0, 2, 0, 0]
        )
      end

      private

      # Works out the ordered, de-duplicated list of column headers (symbols).
      def build_headers(records)
        included = @options[:fields_include]
        # Fall back to every key seen in the flattened records.
        candidates = included.any? ? included : records.flat_map(&:keys)

        # Normalise to strings first so that symbol and string names dedupe
        # together and so that end_with? is always called on a String.
        headers = candidates.map(&:to_s).uniq

        excluded = @options[:fields_exclude]
        headers = headers.reject { |header| header.end_with?(*excluded) } if excluded.any?

        limit = @options[:fields_limit]
        headers = headers.first(limit) if limit

        headers.map(&:to_sym)
      end

      # Turns each record into an array of cell strings, one per header.
      def build_rows(records, headers)
        max_length = @options[:truncate_values_at]

        records.map do |record|
          record.values_at(*headers).map do |value|
            text = value.to_s
            max_length ? text.truncate(max_length) : text
          end
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
module Chronicle
  module ETL
    # Small levelled logger used through its module-level methods
    # (the module extends itself). Messages below +log_level+ are dropped.
    # Output goes to $stderr unless a progress bar has been attached, in
    # which case the message is handed to the bar's #log method instead.
    module Logger
      extend self

      DEBUG = 0
      INFO = 1
      WARN = 2
      ERROR = 3
      FATAL = 4

      attr_accessor :log_level

      @log_level = INFO
      @destination = $stderr

      # Emits +message+ if +level+ is at or above the current log level.
      #
      # @param message [String] text to emit
      # @param level [Integer] one of DEBUG, INFO, WARN, ERROR, FATAL
      def output(message, level)
        return unless level >= @log_level

        if @progress_bar
          @progress_bar.log(message)
        else
          @destination.puts(message)
        end
      end

      def fatal(message)
        output(message, FATAL)
      end

      def error(message)
        output(message, ERROR)
      end

      def warn(message)
        output(message, WARN)
      end

      def info(message)
        output(message, INFO)
      end

      def debug(message)
        output(message, DEBUG)
      end

      # Routes subsequent messages through +progress_bar+ (must respond to #log).
      def attach_to_progress_bar(progress_bar)
        @progress_bar = progress_bar
      end

      # Restores direct output to the destination stream.
      def detach_from_progress_bar
        @progress_bar = nil
      end
    end
  end
end
@@ -0,0 +1,15 @@
require 'chronicle/etl/models/base'

module Chronicle
  module ETL
    module Models
      # Model whose TYPE is 'activities'. Adds the verb/start_at/end_at
      # attributes and the involved/actor associations to those of Base.
      class Activity < Chronicle::ETL::Models::Base
        TYPE = 'activities'.freeze
        ATTRIBUTES = %i[verb start_at end_at].freeze
        ASSOCIATIONS = %i[involved actor].freeze

        # One reader/writer pair per attribute and association name.
        attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
      end
    end
  end
end
@@ -0,0 +1,14 @@
require 'chronicle/etl/models/base'

module Chronicle
  module ETL
    module Models
      # Model whose TYPE is 'attachments'. Adds the url_original and data
      # attributes to those of Base; declares no associations of its own.
      class Attachment < Chronicle::ETL::Models::Base
        TYPE = 'attachments'.freeze
        ATTRIBUTES = %i[url_original data].freeze

        # One reader/writer pair per attribute name.
        attr_accessor(*ATTRIBUTES)
      end
    end
  end
end
@@ -0,0 +1,119 @@
require 'digest'

module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and is
      # ready to be loaded. Loosely based on ActiveModel.
      #
      # Subclasses are expected to define TYPE, and may define their own
      # ATTRIBUTES and ASSOCIATIONS constants plus matching accessors.
      class Base
        ATTRIBUTES = %i[provider provider_id lat lng metadata].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] values assigned through the matching
        #   writer methods; keys without a writer are ignored
        def initialize(attributes = {})
          # Defaults go in first so that caller-supplied :dedupe_on and
          # :metadata values are kept rather than overwritten.
          @dedupe_on = []
          @metadata = {}
          assign_attributes(attributes) if attributes
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids. Nil entries are left out.
        def identifier_hash
          {
            type: self.class::TYPE,
            id: @id,
            lids: lids
          }.compact
        end

        # Array of local ids that uniquely identify this record, one per
        # entry of @dedupe_on that could be hashed.
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a unique local id
        # by hashing the field values (taken in sorted field-name order).
        #
        # @param fields [Array<Symbol>]
        # @return [String, nil] hex digest, or nil when any field value is nil
        # @raise [ArgumentError] when +fields+ is not an Array
        def generate_lid(fields)
          raise ArgumentError, 'Must provide an array of symbolized fields' unless fields.is_a?(Array)

          values = fields.sort.map { |field| instance_variable_get("@#{field}") }
          return if values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(','))
        end

        # Set of attribute names that this model has: Base's shared
        # attributes combined with the child class's.
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes that are not nil.
        def attributes
          attribute_list.each_with_object({}) do |attribute, result|
            value = instance_variable_get("@#{attribute}")
            result[attribute] = value unless value.nil?
          end
        end

        # All of this record's associations that have a truthy value.
        def associations
          names = ASSOCIATIONS + self.class::ASSOCIATIONS
          names.each_with_object({}) do |name, result|
            value = instance_variable_get("@#{name}")
            result[name] = value if value
          end
        end

        # Associations with every associated object converted via #to_h.
        def associations_hash
          associations.transform_values do |value|
            value.is_a?(Array) ? value.map(&:to_h) : value.to_h
          end
        end

        # The dedupe field groups, each rendered as a comma-joined string.
        def meta_hash
          {
            meta: {
              dedupe_on: @dedupe_on.map { |group| group.map(&:to_s).join(',') }
            }
          }
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # Identifier, attributes, associations and meta merged in that order
        # (later keys win on collision).
        def to_h
          identifier_hash
            .merge(attributes)
            .merge(associations_hash)
            .merge(meta_hash)
        end

        # Calls the writer for every key that has one; other keys are skipped.
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
require 'chronicle/etl/models/base'

module Chronicle
  module ETL
    module Models
      # Model whose TYPE is 'entities'. Declares its own attribute and
      # association names on top of those of Base.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze
        ATTRIBUTES = %i[title body represents slug myself metadata].freeze
        # TODO: add these to reflect Chronicle Schema
        ASSOCIATIONS = %i[attachments abouts depicts consumers contains].freeze

        # One reader/writer pair per attribute and association name.
        attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
      end
    end
  end
end
@@ -0,0 +1,23 @@
require 'chronicle/etl/models/base'

module Chronicle
  module ETL
    module Models
      # Model whose TYPE is 'generic'. Instead of a fixed attribute list it
      # keeps whatever hash it was constructed with in @properties.
      class Generic < Chronicle::ETL::Models::Base
        TYPE = 'generic'.freeze

        attr_accessor :properties

        # @param properties [Hash, nil] arbitrary key/value pairs; the same
        #   argument is also forwarded to Base#initialize
        def initialize(properties = {})
          # Keep a hash even when nil is passed so #attributes cannot fail.
          @properties = properties || {}
          super
        end

        # Generic models have arbitrary attributes stored in @properties;
        # keys are returned as symbols.
        def attributes
          @properties.transform_keys(&:to_sym)
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
module Chronicle
  module ETL
    module Registry
      # Records details about a connector such as its provider and a description
      class ConnectorRegistration
        attr_accessor :klass
        # Readers for these three are defined below with fallbacks.
        attr_writer :identifier, :provider, :description

        # @param klass [Class] the connector class being registered
        def initialize(klass)
          @klass = klass
        end

        # @return [Symbol, nil] :extractor, :transformer or :loader depending
        #   on which base class appears among klass's ancestors; nil otherwise
        def phase
          if klass <= Chronicle::ETL::Extractor
            :extractor
          elsif klass <= Chronicle::ETL::Transformer
            :transformer
          elsif klass <= Chronicle::ETL::Loader
            :loader
          end
        end

        def to_s
          "#{phase}-#{identifier}"
        end

        # True when the class name contains 'Chronicle::ETL'.
        def built_in?
          @klass.to_s.include? 'Chronicle::ETL'
        end

        def klass_name
          @klass.to_s
        end

        # Explicit identifier if one was set; otherwise the unqualified class
        # name with a trailing Extractor/Loader/Transformer removed, downcased.
        def identifier
          # Non-bang #sub always returns a String, so a class name without
          # one of the suffixes still yields an identifier.
          @identifier || unqualified_klass_name.sub(/(Extractor|Loader|Transformer)\z/, '').downcase
        end

        # Explicit description if one was set; otherwise the unqualified class name.
        def description
          @description || unqualified_klass_name
        end

        # Explicit provider if one was set; otherwise 'chronicle' for
        # built-in connectors and '' for everything else.
        def provider
          @provider || (built_in? ? 'chronicle' : '')
        end

        def descriptive_phrase
          prefix = case phase
                   when :extractor
                     "Extracts from"
                   when :transformer
                     "Transforms"
                   when :loader
                     "Loads to"
                   end

          "#{prefix} #{description}"
        end

        private

        # Last segment of the class name, e.g. 'CsvLoader' for 'A::B::CsvLoader'.
        def unqualified_klass_name
          @klass.to_s.split('::').last
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
require 'rubygems'

module Chronicle
  module ETL
    # A singleton module that acts as a registry of connector classes available for ETL jobs
    module Registry
      PHASES = %i[extractor transformer loader].freeze

      class << self
        attr_writer :connectors

        # Registered connectors; an empty array until something registers.
        def connectors
          @connectors ||= []
        end

        def load_all!
          load_connectors_from_gems
        end

        # Requires every installed gem whose name starts with 'chronicle',
        # mapping 'chronicle-foo' to the path 'chronicle/foo'.
        def load_connectors_from_gems
          Gem::Specification.select { |spec| spec.name.start_with?('chronicle') }.each do |spec|
            begin
              require spec.name.gsub('chronicle-', 'chronicle/')
            rescue LoadError
              # Best effort: a gem without a matching require path is skipped.
              # (LoadError is not a StandardError, so a `rescue` modifier
              # would not have caught it.)
              next
            end
          end
        end

        # Installs the gem named "chronicle-<name>" through RubyGems.
        def install_connector(name)
          gem_name = "chronicle-#{name}"
          Gem.install(gem_name)
        end

        # Adds +connector+ to the registry.
        def register(connector)
          connectors << connector
        end

        # Looks a connector up by phase and identifier, loading connectors
        # from installed gems only when the first lookup misses.
        #
        # @raise [ConnectorNotAvailableError] when no connector matches
        def find_by_phase_and_identifier(phase, identifier)
          connector = find_within_loaded_connectors(phase, identifier)
          unless connector
            # Only load external connectors (slow) if not found in built-in connectors
            load_all!
            connector = find_within_loaded_connectors(phase, identifier)
          end
          connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
        end

        # @return the first registered connector matching both values, or nil
        def find_within_loaded_connectors(phase, identifier)
          connectors.find { |c| c.phase == phase && c.identifier == identifier }
        end
      end
    end
  end
end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
require 'forwardable'

module Chronicle
  module ETL
    module Registry
      # Mixin that lets a connector class announce itself to
      # Chronicle::ETL::Registry.
      module SelfRegistering
        extend Forwardable

        attr_accessor :connector_registration

        # description, provider and identifier are answered by the stored
        # ConnectorRegistration.
        def_delegators :@connector_registration, :description, :provider, :identifier

        # Builds (once) a ConnectorRegistration describing this connector,
        # yields it to the optional block for customisation, then registers
        # it with the Registry.
        def register_connector
          @connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
          yield @connector_registration if block_given?
          ::Chronicle::ETL::Registry.register(@connector_registration)
        end
      end
    end
  end
end
@@ -1,46 +1,88 @@
require 'colorize'
require 'chronic_duration'

# Drives one ETL job: pulls extractions from the job's extractor, transforms
# each into a record, hands records to the loader, and reports progress.
class Chronicle::ETL::Runner
  # @param job the job to run; it supplies the extractor, transformer and
  #   loader instances as well as #name and #dry_run?
  def initialize(job)
    @job = job
    @job_logger = Chronicle::ETL::JobLogger.new(@job)
  end

  # Runs the job. A TransformationError is logged for the failing record and
  # the run continues; an Interrupt is logged and recorded on the job logger;
  # any other StandardError propagates to the caller. The job log is saved
  # and the completion summary logged in every case.
  def run!
    extractor = @job.instantiate_extractor
    loader = @job.instantiate_loader

    @job_logger.start
    loader.start

    extractor.prepare
    total = extractor.results_count
    @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
    Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)

    Chronicle::ETL::Logger.info(tty_log_job_start)
    extractor.extract do |extraction|
      unless extraction.is_a?(Chronicle::ETL::Extraction)
        raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
      end

      transformer = @job.instantiate_transformer(extraction)
      record = transformer.transform

      unless record.is_a?(Chronicle::ETL::Models::Base)
        raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
      end

      Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
      @job_logger.log_transformation(transformer)

      loader.load(record) unless @job.dry_run?
    rescue Chronicle::ETL::TransformationError => e
      Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
    ensure
      # Count the extraction whether or not it produced a record.
      @progress_bar.increment
    end

    @progress_bar.finish
    loader.finish
    @job_logger.finish
  rescue Interrupt
    Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
    @job_logger.error
  ensure
    @job_logger.save
    # The bar does not exist yet if the failure happened during setup;
    # calling #finish on nil would hide the real exception.
    @progress_bar&.finish
    Chronicle::ETL::Logger.detach_from_progress_bar
    Chronicle::ETL::Logger.info(tty_log_completion)
  end

  private

  # "Beginning job" line, with the job name appended in bold when present.
  def tty_log_job_start
    output = "Beginning job "
    output += "'#{@job.name}'".bold if @job.name
    output
  end

  # Green check mark followed by the transformer's string form.
  def tty_log_transformation(transformer)
    " ✓".green + " #{transformer}"
  end

  # Red cross followed by what failed to build and the exception message.
  def tty_log_transformation_failure(exception)
    " ✖".red + " Failed to build #{exception.transformation}. #{exception.message}"
  end

  # Multi-line completion summary built from the job logger's state.
  def tty_log_completion
    status = @job_logger.success ? 'Success' : 'Failed'
    output = "\nCompleted job "
    output += "'#{@job.name}'".bold if @job.name
    output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
    output += "\n Status:\t".light_black + status
    output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
    output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
    output
  end
end
@@ -0,0 +1,25 @@
module Chronicle
  module ETL
    # Serializer that nests a record's attributes under :attributes and its
    # associations under :relationships, alongside the identifier and meta
    # hashes supplied by the record.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # @return [Hash] identifier keys, then :attributes, :relationships and
      #   the record's meta hash, merged in that order
      def serializable_hash
        envelope = @record.identifier_hash
        envelope = envelope.merge({ attributes: @record.attributes })
        envelope = envelope.merge({ relationships: build_associations })
        envelope.merge(@record.meta_hash)
      end

      # Serializes every associated record (or array of records) with this
      # same serializer and wraps each result as { data: ... }.
      def build_associations
        @record.associations.transform_values do |value|
          serialized =
            case value
            when Array
              value.map { |record| JSONAPISerializer.new(record).serializable_hash }
            else
              JSONAPISerializer.new(value).serializable_hash
            end

          { data: serialized }
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
module Chronicle
  module ETL
    # Abstract class representing a Serializer for an ETL record
    class Serializer
      # Construct a new instance of this serializer.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Options for configuring this Serializer
      def initialize(record, options = {})
        @record = record
        @options = options
      end

      # Serialize a record as a hash. Subclasses must override this.
      def serializable_hash
        raise NotImplementedError
      end

      # Convenience wrapper: builds a serializer for +record+ and returns its
      # hash. +options+ is optional and forwarded to the constructor.
      def self.serialize(record, options = {})
        new(record, options).serializable_hash
      end
    end
  end
end
26
+
27
+ require_relative 'jsonapi_serializer'