chronicle-etl 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +11 -5
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -27
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +30 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +19 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +70 -42
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -12
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +17 -1
  48. metadata +138 -35
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -62
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,48 @@
1
module Chronicle
  module ETL
    # Module-level logger used while ETL jobs run.
    #
    # Messages below the current +log_level+ are dropped. While a progress bar
    # is attached, messages are handed to the bar's +log+ method instead of
    # being written to the destination stream ($stderr by default).
    module Logger
      extend self

      # Severity levels, ordered from least to most severe.
      DEBUG = 0
      INFO = 1
      WARN = 2
      ERROR = 3
      FATAL = 4

      attr_accessor :log_level

      @log_level = INFO
      @destination = $stderr

      # Emit +message+ if +level+ is at or above the configured log level.
      #
      # @param message [String] text to log
      # @param level [Integer] one of the severity constants above
      def output(message, level)
        return unless level >= @log_level

        if @progress_bar
          @progress_bar.log(message)
        else
          @destination.puts(message)
        end
      end

      # Log +message+ at FATAL severity.
      def fatal(message)
        output(message, FATAL)
      end

      # Log +message+ at ERROR severity.
      def error(message)
        output(message, ERROR)
      end

      # Log +message+ at WARN severity.
      def warn(message)
        output(message, WARN)
      end

      # Log +message+ at INFO severity.
      def info(message)
        output(message, INFO)
      end

      # Log +message+ at DEBUG severity.
      def debug(message)
        output(message, DEBUG)
      end

      # Route subsequent messages through +progress_bar+ (must respond to +log+).
      def attach_to_progress_bar(progress_bar)
        @progress_bar = progress_bar
      end

      # Stop routing messages through a progress bar; write to the destination again.
      def detach_from_progress_bar
        @progress_bar = nil
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Record of type 'activities'. Adds verb/start_at/end_at attributes and
      # the :involved and :actor associations on top of the shared Base ones.
      class Activity < Chronicle::ETL::Models::Base
        TYPE = 'activities'.freeze
        ATTRIBUTES = %i[verb start_at end_at].freeze
        ASSOCIATIONS = %i[involved actor].freeze

        attr_accessor(*(ATTRIBUTES + ASSOCIATIONS))
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Record of type 'attachments', carrying a :url_original and/or :data
      # attribute in addition to the shared Base attributes.
      class Attachment < Chronicle::ETL::Models::Base
        TYPE = 'attachments'.freeze
        ATTRIBUTES = %i[url_original data].freeze

        attr_accessor(*ATTRIBUTES)
      end
    end
  end
end
@@ -0,0 +1,119 @@
1
+ require 'digest'
2
+
3
module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and is
      # ready to be loaded. Loosely based on ActiveModel.
      #
      # Subclasses are expected to define a TYPE constant and may define their
      # own ATTRIBUTES / ASSOCIATIONS constants (plus matching accessors).
      class Base
        ATTRIBUTES = %i[provider provider_id lat lng metadata].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] initial values; keys without a matching
        #   public setter are ignored
        def initialize(attributes = {})
          # Defaults go first so that :dedupe_on / :metadata supplied by the
          # caller are kept instead of being overwritten afterwards.
          @dedupe_on = []
          @metadata = {}
          assign_attributes(attributes) if attributes
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids.
        #
        # @return [Hash] :type, :id and :lids, with nil values removed
        def identifier_hash
          {
            type: self.class::TYPE,
            id: @id,
            lids: lids
          }.compact
        end

        # Array of local ids that uniquely identify this record: one per entry
        # of +dedupe_on+, skipping entries that cannot produce an id.
        #
        # @return [Array<String>]
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a
        # unique local id by hashing the field values.
        #
        # @param fields [Array<Symbol>] names of the fields to hash
        # @return [String, nil] SHA256 hex digest, or nil when +fields+ is
        #   empty or any of the fields is unset
        # @raise [ArgumentError] if +fields+ is not an Array
        def generate_lid(fields)
          raise ArgumentError, "Must provide an array of symbolized fields" unless fields.is_a?(Array)

          values = fields.sort.map { |field| instance_variable_get("@#{field}") }

          # An empty field list would hash "" and give every record the same lid.
          return if values.empty? || values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(","))
        end

        # Set of attribute names that this model has is Base's shared
        # attributes combined with the child class's.
        #
        # @return [Array<Symbol>]
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes that currently have a non-nil value.
        #
        # @return [Hash{Symbol => Object}]
        def attributes
          attribute_list.each_with_object({}) do |attribute, result|
            value = instance_variable_get("@#{attribute}")
            result[attribute] = value unless value.nil?
          end
        end

        # All of this record's associations that are currently set (truthy).
        #
        # @return [Hash{Symbol => Object}]
        def associations
          (ASSOCIATIONS + self.class::ASSOCIATIONS).each_with_object({}) do |name, result|
            association = instance_variable_get("@#{name}")
            result[name] = association if association
          end
        end

        # Associations with each associated value (or each element of an
        # Array association) converted through +to_h+.
        #
        # @return [Hash]
        def associations_hash
          associations.map do |name, value|
            [name, value.is_a?(Array) ? value.map(&:to_h) : value.to_h]
          end.to_h
        end

        # Metadata about this record: each dedupe field set joined with commas.
        #
        # @return [Hash]
        def meta_hash
          {
            meta: {
              dedupe_on: @dedupe_on.map { |fields| fields.map(&:to_s).join(",") }
            }
          }
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # Identifier, attributes, associations and meta merged into one Hash.
        #
        # @return [Hash]
        def to_h
          identifier_hash
            .merge(attributes)
            .merge(associations_hash)
            .merge(meta_hash)
        end

        # Assign each key/value pair through the matching public setter,
        # silently skipping keys this model has no setter for.
        #
        # @param attributes [Hash]
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Record of type 'entities'. Declares its own attributes and the
      # associations listed below, each exposed through an accessor.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze
        ATTRIBUTES = %i[title body represents slug myself metadata].freeze
        # TODO: add these to reflect Chronicle Schema
        ASSOCIATIONS = %i[attachments abouts depicts consumers contains].freeze

        attr_accessor(*(ATTRIBUTES + ASSOCIATIONS))
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Record of type 'generic' whose attributes are whatever key/value pairs
      # were passed to the constructor.
      class Generic < Chronicle::ETL::Models::Base
        TYPE = 'generic'.freeze

        attr_accessor :properties

        # @param properties [Hash, nil] arbitrary key/value pairs; nil is
        #   treated as an empty Hash so that #attributes keeps working
        def initialize(properties = {})
          @properties = properties || {}
          # Bare super forwards +properties+, so keys that match a setter on
          # Base are also assigned there.
          super
        end

        # Generic models have arbitrary attributes stored in @properties.
        #
        # @return [Hash{Symbol => Object}] properties with symbolized keys
        def attributes
          @properties.transform_keys(&:to_sym)
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module Chronicle
  module ETL
    module Registry
      # Records details about a connector such as its provider and a description
      class ConnectorRegistration
        # Trailing phase name stripped from a class name to derive an identifier.
        PHASE_SUFFIX = /(Extractor|Loader|Transformer)\z/.freeze

        attr_accessor :klass
        # Readers for these are defined below because they fall back to
        # values derived from the class name.
        attr_writer :identifier, :provider, :description

        # @param klass [Class] the connector class being registered
        def initialize(klass)
          @klass = klass
        end

        # Which ETL phase the connector class belongs to, based on its ancestors.
        #
        # @return [Symbol, nil] :extractor, :transformer, :loader, or nil when
        #   the class descends from none of the three base classes
        def phase
          if klass.ancestors.include?(Chronicle::ETL::Extractor)
            :extractor
          elsif klass.ancestors.include?(Chronicle::ETL::Transformer)
            :transformer
          elsif klass.ancestors.include?(Chronicle::ETL::Loader)
            :loader
          end
        end

        # @return [String] "<phase>-<identifier>"
        def to_s
          "#{phase}-#{identifier}"
        end

        # @return [Boolean] whether the class name contains 'Chronicle::ETL'
        def built_in?
          klass_name.include?('Chronicle::ETL')
        end

        # @return [String] fully qualified name of the connector class
        def klass_name
          @klass.to_s
        end

        # Explicit identifier if one was set; otherwise the last segment of the
        # class name, minus a trailing Extractor/Loader/Transformer, downcased.
        #
        # Uses the non-bang +sub+: +gsub!+ returns nil when nothing matched,
        # which made this raise for class names lacking one of the suffixes.
        #
        # @return [String]
        def identifier
          @identifier || short_klass_name.sub(PHASE_SUFFIX, '').downcase
        end

        # @return [String] explicit description, or the unqualified class name
        def description
          @description || short_klass_name
        end

        # @return [String] explicit provider, 'chronicle' for built-in
        #   connectors, or an empty string otherwise
        def provider
          @provider || (built_in? ? 'chronicle' : '')
        end

        # Human-readable phrase such as "Extracts from <description>".
        #
        # @return [String]
        def descriptive_phrase
          prefix = case phase
                   when :extractor
                     "Extracts from"
                   when :transformer
                     "Transforms"
                   when :loader
                     "Loads to"
                   end

          "#{prefix} #{description}"
        end

        private

        # Class name without its enclosing namespaces.
        def short_klass_name
          klass_name.split('::').last
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+
3
module Chronicle
  module ETL
    # A singleton module that acts as a registry of connector classes available for ETL jobs
    module Registry
      PHASES = %i[extractor transformer loader].freeze

      class << self
        attr_writer :connectors

        # Registered connectors; an empty Array until something registers.
        #
        # @return [Array]
        def connectors
          @connectors ||= []
        end

        # Load every connector source this registry knows how to discover.
        def load_all!
          load_connectors_from_gems
        end

        # Require each installed gem whose name starts with "chronicle",
        # mapping "chronicle-foo" to the require path "chronicle/foo".
        # Gems that cannot be required are skipped.
        def load_connectors_from_gems
          Gem::Specification.select { |spec| spec.name.match?(/^chronicle/) }.each do |spec|
            # LoadError is a ScriptError, so a `rescue` modifier (which only
            # handles StandardError) would let it escape; rescue it explicitly.
            require spec.name.gsub('chronicle-', 'chronicle/')
          rescue LoadError
            next
          end
        end

        # Install the gem named "chronicle-<name>" via Gem.install.
        #
        # @param name [String] connector name without the "chronicle-" prefix
        def install_connector(name)
          gem_name = "chronicle-#{name}"
          Gem.install(gem_name)
        end

        # Add +connector+ to the list of known connectors.
        #
        # @param connector [Object] responds to +phase+ and +identifier+
        # @return [Array] the updated connector list
        def register(connector)
          connectors << connector
        end

        # Look a connector up by phase and identifier, loading connectors from
        # installed gems only if it is not already registered.
        #
        # @param phase [Symbol]
        # @param identifier [String]
        # @return [Object] the matching registered connector
        # @raise [ConnectorNotAvailableError] when no connector matches
        def find_by_phase_and_identifier(phase, identifier)
          connector = find_within_loaded_connectors(phase, identifier)
          unless connector
            # Only load external connectors (slow) if not found in built-in connectors
            load_all!
            connector = find_within_loaded_connectors(phase, identifier)
          end
          connector || raise(ConnectorNotAvailableError.new("Connector '#{identifier}' not found"))
        end

        # @return [Object, nil] first registered connector matching both
        #   +phase+ and +identifier+, or nil (also when nothing is registered)
        def find_within_loaded_connectors(phase, identifier)
          connectors.find { |c| c.phase == phase && c.identifier == identifier }
        end
      end
    end
  end
end
50
+
51
+ require_relative 'self_registering'
52
+ require_relative 'connector_registration'
@@ -0,0 +1,25 @@
1
+ require 'forwardable'
2
+
3
module Chronicle
  module ETL
    module Registry
      # Lets a connector class announce itself to Chronicle::ETL::Registry.
      # Also exposes description/provider/identifier by delegating to the
      # stored ConnectorRegistration.
      module SelfRegistering
        extend Forwardable

        attr_accessor :connector_registration

        def_delegators :@connector_registration, :description, :provider, :identifier

        # Builds (once) the ConnectorRegistration describing this connector,
        # yields it to the optional block for customisation, then registers
        # it with the Registry.
        def register_connector
          registration = (@connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self))
          yield(registration) if block_given?
          ::Chronicle::ETL::Registry.register(registration)
        end
      end
    end
  end
end
@@ -1,59 +1,87 @@
1
- class Chronicle::ETL::Runner
2
- BUILTIN = {
3
- extractor: ['stdin', 'json', 'csv', 'file'],
4
- transformer: ['null'],
5
- loader: ['stdout', 'csv', 'table']
6
- }.freeze
7
-
8
- def initialize(options)
9
- @options = options
1
+ require 'colorize'
2
+ require 'chronic_duration'
10
3
 
11
- instantiate_etl_classes
4
# Runs a single ETL job: extracts records, transforms each one, loads the
# result, and reports progress and a completion summary through
# Chronicle::ETL::Logger.
class Chronicle::ETL::Runner
  # @param job [Object] job providing instantiate_extractor,
  #   instantiate_transformer, instantiate_loader, dry_run? and name
  def initialize(job)
    @job = job
    @job_logger = Chronicle::ETL::JobLogger.new(@job)
  end

  # Execute the job.
  #
  # A TransformationError on one extraction is logged and the run continues.
  # An Interrupt is logged and recorded on the job logger. Any other error
  # propagates to the caller. In every case the job log is saved and the
  # completion summary is logged.
  #
  # @raise [Chronicle::ETL::RunnerTypeError] if the extractor yields something
  #   other than an Extraction, or the transformer returns something other
  #   than a Models::Base
  def run!
    extractor = @job.instantiate_extractor
    loader = @job.instantiate_loader

    @job_logger.start
    loader.start

    total = extractor.results_count
    @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
    Chronicle::ETL::Logger.attach_to_progress_bar(@progress_bar)

    Chronicle::ETL::Logger.info(tty_log_job_start)
    extractor.extract do |extraction|
      unless extraction.is_a?(Chronicle::ETL::Extraction)
        raise Chronicle::ETL::RunnerTypeError, "Extracted should be a Chronicle::ETL::Extraction"
      end

      transformer = @job.instantiate_transformer(extraction)
      record = transformer.transform

      unless record.is_a?(Chronicle::ETL::Models::Base)
        raise Chronicle::ETL::RunnerTypeError, "Transformed data should be a type of Chronicle::ETL::Models"
      end

      Chronicle::ETL::Logger.info(tty_log_transformation(transformer))
      @job_logger.log_transformation(transformer)

      loader.load(record) unless @job.dry_run?
    rescue Chronicle::ETL::TransformationError => e
      Chronicle::ETL::Logger.error(tty_log_transformation_failure(e))
    ensure
      # Advance the bar whether the record was loaded, skipped or failed.
      @progress_bar.increment
    end

    @progress_bar.finish
    loader.finish
    @job_logger.finish
  rescue Interrupt
    Chronicle::ETL::Logger.error("\n#{'Job interrupted'.red}")
    @job_logger.error
  ensure
    @job_logger.save
    # The bar does not exist yet if setup failed before it was created;
    # calling finish on nil here would mask the original exception.
    @progress_bar&.finish
    Chronicle::ETL::Logger.detach_from_progress_bar
    Chronicle::ETL::Logger.info(tty_log_completion)
  end

  private

  # Line announcing the start of the job, with the job name when it has one.
  def tty_log_job_start
    output = "Beginning job "
    output += "'#{@job.name}'".bold if @job.name
    output
  end

  # Success line for one transformed record.
  def tty_log_transformation(transformer)
    " ✓".green + " #{transformer}"
  end

  # Failure line for a record whose transformation raised.
  def tty_log_transformation_failure(exception)
    " ✖".red + " Failed to build #{exception.transformation}. #{exception.message}"
  end

  # Multi-line summary: status, duration, record count and latest timestamp.
  def tty_log_completion
    status = @job_logger.success ? 'Success' : 'Failed'
    output = "\nCompleted job "
    output += "'#{@job.name}'".bold if @job.name
    output += " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
    output += "\n Status:\t".light_black + status
    output += "\n Completed:\t".light_black + "#{@job_logger.job_log.num_records_processed}"
    output += "\n Latest:\t".light_black + "#{@job_logger.job_log.highest_timestamp.iso8601}" if @job_logger.job_log.highest_timestamp
    output
  end
end
@@ -0,0 +1,25 @@
1
module Chronicle
  module ETL
    # Serializes a record into a hash with the record's identifier keys,
    # an :attributes key, a :relationships key and the record's meta hash.
    class JSONAPISerializer < Chronicle::ETL::Serializer
      # @return [Hash] identifier, attributes, relationships and meta merged
      def serializable_hash
        identity = @record.identifier_hash
        body = identity.merge({ attributes: @record.attributes })
        body = body.merge({ relationships: build_associations })
        body.merge(@record.meta_hash)
      end

      # Serializes every association of the record (a single record or an
      # Array of records) and wraps each result under a :data key.
      #
      # @return [Hash]
      def build_associations
        @record.associations.transform_values do |related|
          serialized =
            case related
            when Array
              related.map { |item| JSONAPISerializer.new(item).serializable_hash }
            else
              JSONAPISerializer.new(related).serializable_hash
            end
          { data: serialized }
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Chronicle
  module ETL
    # Abstract class representing a Serializer for an ETL record
    class Serializer
      # Construct a new instance of this serializer.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Options for configuring this Serializer
      def initialize(record, options = {})
        @record = record
        @options = options
      end

      # Serialize a record as a hash. Subclasses must override this.
      #
      # @raise [NotImplementedError] always, in this abstract class
      def serializable_hash
        raise NotImplementedError
      end

      # Convenience wrapper: build a serializer for +record+ and return its
      # serializable hash.
      # == Parameters:
      # record::
      #   The record to serialize
      # options::
      #   Options forwarded to the serializer's constructor (defaults to {})
      def self.serialize(record, options = {})
        new(record, options).serializable_hash
      end
    end
  end
end
26
+
27
+ require_relative 'jsonapi_serializer'