chronicle-etl 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +11 -5
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -27
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +30 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +19 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +70 -42
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -12
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +17 -1
  48. metadata +138 -35
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -62
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,48 @@
1
module Chronicle
  module ETL
    # Minimal leveled logger.
    #
    # The module extends itself, so every method below is called directly on
    # the module (e.g. +Chronicle::ETL::Logger.info("...")+) and its state is
    # kept in module-level instance variables.
    module Logger
      extend self

      # Severity levels, from least to most severe.
      DEBUG = 0
      INFO = 1
      WARN = 2
      ERROR = 3
      FATAL = 4

      # Minimum severity a message must have to be written.
      attr_accessor :log_level

      @log_level = INFO
      @destination = $stderr

      # Writes +message+ if +level+ is at least the current log level.
      #
      # While a progress bar is attached the message is handed to the bar's
      # +log+ method; otherwise it is written to the destination stream with
      # +puts+.
      #
      # @param message [Object] what to write
      # @param level [Integer] one of the severity constants above
      def output(message, level)
        return unless level >= @log_level

        if @progress_bar
          @progress_bar.log(message)
        else
          @destination.puts(message)
        end
      end

      # Logs +message+ at FATAL severity.
      def fatal(message)
        output(message, FATAL)
      end

      # Logs +message+ at ERROR severity.
      def error(message)
        output(message, ERROR)
      end

      # Logs +message+ at WARN severity.
      def warn(message)
        output(message, WARN)
      end

      # Logs +message+ at INFO severity.
      def info(message)
        output(message, INFO)
      end

      # Logs +message+ at DEBUG severity.
      def debug(message)
        output(message, DEBUG)
      end

      # Routes subsequent messages through +progress_bar+ (which must respond
      # to +log+) instead of the destination stream.
      def attach_to_progress_bar(progress_bar)
        @progress_bar = progress_bar
      end

      # Restores writing directly to the destination stream.
      def detach_from_progress_bar
        @progress_bar = nil
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with TYPE 'activities'. On top of what Base provides it
      # declares the verb/start_at/end_at attributes and the involved/actor
      # associations, each with a reader and a writer.
      class Activity < Chronicle::ETL::Models::Base
        TYPE = 'activities'.freeze
        ATTRIBUTES = %i[verb start_at end_at].freeze
        ASSOCIATIONS = %i[involved actor].freeze

        attr_accessor(*ATTRIBUTES)
        attr_accessor(*ASSOCIATIONS)
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with TYPE 'attachments'. Adds url_original and data attributes
      # (with accessors) to those inherited from Base.
      class Attachment < Chronicle::ETL::Models::Base
        TYPE = 'attachments'.freeze
        ATTRIBUTES = %i[url_original data].freeze

        ATTRIBUTES.each { |name| attr_accessor name }
      end
    end
  end
end
@@ -0,0 +1,119 @@
1
+ require 'digest'
2
+
3
module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and
      # is ready to be loaded. Loosely based on ActiveModel.
      #
      # Concrete subclasses must define a TYPE constant (it is read through
      # +self.class::TYPE+) and may define their own ATTRIBUTES and
      # ASSOCIATIONS lists, which are combined with the ones declared here.
      class Base
        ATTRIBUTES = [:provider, :provider_id, :lat, :lng, :metadata].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] initial values; keys without a
        #   matching public setter are ignored
        def initialize(attributes = {})
          # Defaults go first so that :dedupe_on / :metadata supplied by the
          # caller are kept instead of being overwritten afterwards.
          @dedupe_on = []
          @metadata = {}
          assign_attributes(attributes) if attributes
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids.
        #
        # @return [Hash] :type, :id and :lids, with nil values dropped
        def identifier_hash
          {
            type: self.class::TYPE,
            id: @id,
            lids: lids
          }.compact
        end

        # Array of local ids that uniquely identify this record: one per
        # entry of +dedupe_on+, skipping entries that could not be hashed
        # and removing duplicates.
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a
        # unique local id by hashing the field values.
        #
        # @param fields [Array<Symbol>] field names to hash
        # @return [String, nil] SHA256 hex digest of the comma-joined values
        #   (taken in sorted field order), or nil if any value is nil
        # @raise [ArgumentError] if +fields+ is not an Array
        def generate_lid(fields)
          raise ArgumentError, "Must provide an array of symbolized fields" unless fields.is_a?(Array)

          values = fields.sort.map { |field| instance_variable_get("@#{field}") }
          return if values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(","))
        end

        # Set of attribute names that this model has is Base's shared
        # attributes combined with the child class's
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes that are not nil
        #
        # @return [Hash{Symbol => Object}]
        def attributes
          attribute_list.each_with_object({}) do |attribute, result|
            value = instance_variable_get("@#{attribute}")
            result[attribute] = value unless value.nil?
          end
        end

        # All of this record's associations that are set (truthy)
        #
        # @return [Hash{Symbol => Object}]
        def associations
          (ASSOCIATIONS + self.class::ASSOCIATIONS).each_with_object({}) do |name, result|
            association = instance_variable_get("@#{name}")
            result[name] = association if association
          end
        end

        # Associations with every associated record converted through +to_h+
        # (element-wise when the association is an Array).
        def associations_hash
          associations.map do |name, value|
            [name, value.is_a?(Array) ? value.map(&:to_h) : value.to_h]
          end.to_h
        end

        # @return [Hash] +{ meta: { dedupe_on: [...] } }+ where each dedupe
        #   field set is rendered as a comma-joined string
        def meta_hash
          {
            meta: {
              dedupe_on: @dedupe_on.map { |fields| fields.map(&:to_s).join(",") }
            }
          }
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # Identifier, attributes, associations and meta merged into one hash;
        # on key clashes the later group wins.
        def to_h
          identifier_hash
            .merge(attributes)
            .merge(associations_hash)
            .merge(meta_hash)
        end

        # Calls the public setter for every key of +attributes+ that has one;
        # other keys are silently skipped.
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with TYPE 'entities'. Declares its own attribute and
      # association names and exposes a reader/writer pair for each of them.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze
        ATTRIBUTES = %i[title body represents slug myself metadata].freeze
        # TODO: add these to reflect Chronicle Schema
        ASSOCIATIONS = %i[attachments abouts depicts consumers contains].freeze

        attr_accessor(*ATTRIBUTES)
        attr_accessor(*ASSOCIATIONS)
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with TYPE 'generic' whose attributes are whatever hash of
      # properties it was built with.
      class Generic < Chronicle::ETL::Models::Base
        TYPE = 'generic'.freeze

        attr_accessor :properties

        # @param properties [Hash, nil] arbitrary key/value pairs; nil is
        #   treated as an empty hash. The same argument is also forwarded to
        #   Base#initialize.
        def initialize(properties = {})
          @properties = properties || {}
          super
        end

        # Generic models have arbitrary attributes stored in @properties
        #
        # @return [Hash{Symbol => Object}] the properties with symbolized keys
        def attributes
          (@properties || {}).transform_keys(&:to_sym)
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module Chronicle
  module ETL
    module Registry
      # Records details about a connector such as its provider and a description
      class ConnectorRegistration
        attr_accessor :klass
        # Readers for these three are defined below because they fall back
        # to values derived from the class name.
        attr_writer :identifier, :provider, :description

        # @param klass [Class] the connector class being described
        def initialize(klass)
          @klass = klass
        end

        # @return [Symbol, nil] :extractor, :transformer or :loader depending
        #   on which connector base class +klass+ descends from; nil if none
        def phase
          if klass.ancestors.include?(Chronicle::ETL::Extractor)
            :extractor
          elsif klass.ancestors.include?(Chronicle::ETL::Transformer)
            :transformer
          elsif klass.ancestors.include?(Chronicle::ETL::Loader)
            :loader
          end
        end

        def to_s
          "#{phase}-#{identifier}"
        end

        # @return [Boolean] whether the class name contains 'Chronicle::ETL'
        def built_in?
          @klass.to_s.include? 'Chronicle::ETL'
        end

        def klass_name
          @klass.to_s
        end

        # Explicit identifier if one was set; otherwise the last segment of
        # the class name, lowercased, with a trailing Extractor / Loader /
        # Transformer removed.
        def identifier
          # Non-bang +sub+ always returns a String; +gsub!+ returns nil when
          # the name has no such suffix, which made +.downcase+ blow up.
          @identifier || unqualified_klass_name.sub(/(Extractor|Loader|Transformer)\z/, '').downcase
        end

        # Explicit description if one was set; otherwise the last segment of
        # the class name.
        def description
          @description || unqualified_klass_name
        end

        # Explicit provider if one was set; otherwise 'chronicle' for
        # built-in connectors and an empty string for everything else.
        def provider
          @provider || (built_in? ? 'chronicle' : '')
        end

        # @return [String] a phase-specific prefix followed by the description
        def descriptive_phrase
          prefix = case phase
                   when :extractor
                     "Extracts from"
                   when :transformer
                     "Transforms"
                   when :loader
                     "Loads to"
                   end

          "#{prefix} #{description}"
        end

        private

        # Class name without its enclosing namespaces.
        def unqualified_klass_name
          @klass.to_s.split('::').last
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+
3
module Chronicle
  module ETL
    # A module that acts as a registry of connector classes available for ETL jobs.
    # All behaviour lives in module-level (singleton) methods.
    module Registry
      PHASES = [:extractor, :transformer, :loader].freeze

      class << self
        # Registered connectors, or nil until the first registration.
        attr_accessor :connectors

        # Loads every connector source the registry knows about (currently
        # only installed gems).
        def load_all!
          load_connectors_from_gems
        end

        # Requires each installed gem whose name starts with "chronicle",
        # mapping e.g. "chronicle-foo" to the require path "chronicle/foo".
        def load_connectors_from_gems
          Gem::Specification.select { |spec| spec.name.match?(/^chronicle/) }.each do |spec|
            begin
              require spec.name.gsub('chronicle-', 'chronicle/')
            rescue LoadError
              # Best effort: a gem without that entry point is skipped.
              # (A `rescue` modifier cannot do this: it only catches
              # StandardError, and LoadError is a ScriptError.)
              next
            end
          end
        end

        # Installs the gem named "chronicle-<name>".
        def install_connector(name)
          gem_name = "chronicle-#{name}"
          Gem.install(gem_name)
        end

        # Adds +connector+ to the list of known connectors.
        def register(connector)
          @connectors ||= []
          @connectors << connector
        end

        # Looks up a connector by phase and identifier.
        #
        # @return the first registered connector matching both values
        # @raise [ConnectorNotAvailableError] if none matches, even after
        #   loading connectors from gems
        def find_by_phase_and_identifier(phase, identifier)
          connector = find_within_loaded_connectors(phase, identifier)
          unless connector
            # Only load external connectors (slow) if not found in built-in connectors
            load_all!
            connector = find_within_loaded_connectors(phase, identifier)
          end
          connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
        end

        # @return the first already-registered connector whose phase and
        #   identifier both match, or nil (also when nothing is registered)
        def find_within_loaded_connectors(phase, identifier)
          (@connectors || []).find { |c| c.phase == phase && c.identifier == identifier }
        end
      end
    end
  end
end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
1
+ require 'forwardable'
2
+
3
module Chronicle
  module ETL
    module Registry
      # Mixin that lets a connector class announce itself to
      # Chronicle::ETL::Registry.
      module SelfRegistering
        extend Forwardable

        attr_accessor :connector_registration

        # Each of these simply reads the value from the stored registration.
        def_delegator :@connector_registration, :description
        def_delegator :@connector_registration, :provider
        def_delegator :@connector_registration, :identifier

        # Builds (once) a ConnectorRegistration describing this connector,
        # yields it to the optional block so details can be filled in, and
        # hands it to the Registry.
        def register_connector
          unless @connector_registration
            @connector_registration = ::Chronicle::ETL::Registry::ConnectorRegistration.new(self)
          end

          yield(@connector_registration) if block_given?

          ::Chronicle::ETL::Registry.register(@connector_registration)
        end
      end
    end
  end
end
@@ -1,59 +1,87 @@
1
- class Chronicle::ETL::Runner
2
- BUILTIN = {
3
- extractor: ['stdin', 'json', 'csv', 'file'],
4
- transformer: ['null'],
5
- loader: ['stdout', 'csv', 'table']
6
- }.freeze
7
-
8
- def initialize(options)
9
- @options = options
1
+ require 'colorize'
2
+ require 'chronic_duration'
10
3
 
11
- instantiate_etl_classes
4
# Runs a single ETL job: pulls extractions from the job's extractor, builds a
# transformer for each one, and hands the resulting record to the job's loader,
# reporting progress and results along the way.
class Chronicle::ETL::Runner
  # @param job the job to run; it must be able to instantiate its extractor,
  #   transformer and loader and answer +name+ and +dry_run?+
  def initialize(job)
    @job = job
    @job_logger = Chronicle::ETL::JobLogger.new(@job)
  end

  # Executes the job.
  #
  # A TransformationError raised for one extraction is logged and the run
  # continues with the next one. An Interrupt is logged and marks the job log
  # as errored. Any other error propagates to the caller. In every case the
  # job log is saved and the completion summary is printed.
  def run!
    extractor = @job.instantiate_extractor
    loader = @job.instantiate_loader

    @job_logger.start
    loader.start

    total = extractor.results_count
    @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
    Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)

    Chronicle::ETL::Logger.info(tty_log_job_start)
    extractor.extract do |extraction|
      unless extraction.is_a?(Chronicle::ETL::Extraction)
        raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
      end

      transformer = @job.instantiate_transformer(extraction)
      record = transformer.transform

      unless record.is_a?(Chronicle::ETL::Models::Base)
        raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
      end

      Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
      @job_logger.log_transformation(transformer)

      loader.load(record) unless @job.dry_run?
    rescue Chronicle::ETL::TransformationError => e
      Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
    ensure
      # Count the extraction whether or not it was transformed successfully.
      @progress_bar.increment
    end

    @progress_bar.finish
    loader.finish
    @job_logger.finish
  rescue Interrupt
    Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
    @job_logger.error
  ensure
    @job_logger.save
    # The bar does not exist yet if the failure happened before it was
    # created (e.g. while instantiating the extractor); calling +finish+ on
    # nil here would replace the real error with a NoMethodError.
    @progress_bar&.finish
    Chronicle::ETL::Logger.detach_from_progress_bar
    Chronicle::ETL::Logger.info(tty_log_completion)
  end

  private

  # Message announcing the start of the job, including its name when it has one.
  def tty_log_job_start
    output = "Beginning job "
    output += "'#{@job.name}'".bold if @job.name
    output
  end

  # Message for one successful transformation.
  def tty_log_transformation(transformer)
    output = "  ✓".green
    output += " #{transformer}"
  end

  # Message for one failed transformation.
  def tty_log_transformation_failure(exception)
    output = "  ✖".red
    output += " Failed to build #{exception.transformation}. #{exception.message}"
  end

  # Summary printed after the run: job name, duration (when available),
  # status, number of processed records, and the highest timestamp (when
  # available), all read from the job logger.
  def tty_log_completion
    status = @job_logger.success ? 'Success' : 'Failed'
    output = "\nCompleted job "
    output += "'#{@job.name}'".bold if @job.name
    output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
    output += "\n  Status:\t".light_black + status
    output += "\n  Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
    output += "\n  Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
    output
  end
end
@@ -0,0 +1,25 @@
1
module Chronicle
  module ETL
    # Serializer that renders a record as a hash made of the record's
    # identifier, an :attributes entry, a :relationships entry and the
    # record's meta hash.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # @return [Hash] the serialized record
      def serializable_hash
        document = @record.identifier_hash
        document = document.merge(attributes: @record.attributes)
        document = document.merge(relationships: build_associations)
        document.merge(@record.meta_hash)
      end

      # Serializes every association of the record with this same serializer
      # class and wraps each result as +{ data: ... }+. Array associations
      # are serialized element by element.
      def build_associations
        serialize_one = ->(record) { JSONAPISerializer.new(record).serializable_hash }

        @record.associations.transform_values do |related|
          payload = related.is_a?(Array) ? related.map(&serialize_one) : serialize_one.call(related)
          { data: payload }
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Chronicle
  module ETL
    # Abstract class representing a Serializer for an ETL record
    class Serializer
      # Construct a new instance of this serializer.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Options for configuring this Serializer
      def initialize(record, options = {})
        @record = record
        @options = options
      end

      # Serialize a record as a hash. Subclasses must override this.
      def serializable_hash
        raise NotImplementedError
      end

      # Convenience entry point: builds a serializer for +record+ and returns
      # its hash.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Optional configuration passed on to the constructor. When it is
      #   empty the constructor is called with the record only, so
      #   subclasses whose +initialize+ takes a single argument keep working.
      def self.serialize(record, options = {})
        serializer = options.empty? ? new(record) : new(record, options)
        serializer.serializable_hash
      end
    end
  end
end
26
+
27
+ require_relative 'jsonapi_serializer'