chronicle-etl 0.1.4 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +31 -13
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +15 -2
  9. data/lib/chronicle/etl/catalog.rb +67 -17
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +19 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +2 -3
  17. data/lib/chronicle/etl/extractors/extractor.rb +21 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +71 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +85 -0
  23. data/lib/chronicle/etl/job_logger.rb +78 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -8
  25. data/lib/chronicle/etl/loaders/loader.rb +11 -2
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +33 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +5 -5
  28. data/lib/chronicle/etl/loaders/table_loader.rb +7 -6
  29. data/lib/chronicle/etl/models/activity.rb +15 -0
  30. data/lib/chronicle/etl/models/base.rb +103 -0
  31. data/lib/chronicle/etl/models/entity.rb +15 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/runner.rb +24 -46
  34. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -6
  35. data/lib/chronicle/etl/transformers/transformer.rb +23 -7
  36. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  37. data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
  38. data/lib/chronicle/etl/utils/progress_bar.rb +2 -2
  39. data/lib/chronicle/etl/version.rb +2 -2
  40. metadata +91 -5
  41. data/CHANGELOG.md +0 -23
  42. data/lib/chronicle/etl/cli.rb +0 -56
  43. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,78 @@
1
+ require 'sequel'
2
+ require 'forwardable'
3
+
4
+ require 'pry'
5
+
6
require 'fileutils' # FileUtils.mkdir_p is used by JobLogger.initialize_db

module Chronicle
  module ETL
    # Saves JobLogs to a SQLite database and loads previously saved ones.
    #
    # Instances wrap a single JobLog and delegate `start`, `finish` and
    # `log_transformation` to it. The class-level methods manage the
    # database file, its schema, and lookups.
    class JobLogger
      extend Forwardable

      def_delegators :@job_log, :start, :finish, :log_transformation

      # Create a new JobLogger that owns a fresh JobLog for `job`.
      #
      # @param job the job whose run is being logged (assigned to the JobLog)
      def initialize(job)
        @job_log = JobLog.new do |job_log|
          job_log.job = job
        end
      end

      # Save this JobLogger's JobLog to the db.
      # Does nothing when the JobLog reports (via `save_log?`) that it
      # should not be saved.
      def save
        return unless @job_log.save_log?

        JobLogger.with_db_connection do |db|
          db[:job_logs].insert(@job_log.serialize)
        end
      end

      # For a given `job_id`, return the last successful log.
      #
      # @param job_id identifier stored in the `job_id` column
      # @return a JobLog built from the most recently finished successful
      #   row for that job, or nil when there is none
      def self.load_latest(job_id)
        with_db_connection do |db|
          # Filter on job_id as well as success; otherwise the latest
          # successful run of *any* job would be returned.
          attrs = db[:job_logs]
                  .where(job_id: job_id, success: true)
                  .reverse_order(:finished_at)
                  .first
          JobLog.build_from_serialized(attrs) if attrs
        end
      end

      # Open a connection to the job log database and yield it, creating the
      # containing directory and the `job_logs` table first if missing.
      #
      # @yield [db] the Sequel database connection
      def self.with_db_connection
        initialize_db unless db_exists?
        Sequel.connect("sqlite://#{db_filename}") do |db|
          initialize_schema(db) unless schema_exists?(db)
          yield db
        end
      end

      # @return [Boolean] whether the database file is present on disk
      def self.db_exists?
        # File.exists? was deprecated and then removed in Ruby 3.2.
        File.exist?(db_filename)
      end

      # @return [Boolean] whether the `job_logs` table exists in `db`
      def self.schema_exists?(db)
        db.tables.include?(:job_logs)
      end

      # @return [String] string form of the first path Runcom reports for
      #   the job log database
      def self.db_filename
        Runcom::Data.new('chronicle/etl/job_log.db').all.first.to_s
      end

      # Ensure the directory that will hold the database file exists.
      def self.initialize_db
        FileUtils.mkdir_p(File.dirname(db_filename))
      end

      # Create the `job_logs` table.
      #
      # @param db the Sequel database connection
      def self.initialize_schema(db)
        db.create_table :job_logs do
          primary_key :id
          String :job_id, null: false
          String :last_id
          Time :highest_timestamp
          Integer :num_records_processed
          boolean :success, default: false
          Time :started_at
          Time :finished_at
        end
      end
    end
  end
end
@@ -1,19 +1,15 @@
1
1
  require 'csv'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class CsvLoader < Chronicle::Etl::Loader
4
+ module ETL
5
+ class CsvLoader < Chronicle::ETL::Loader
6
6
  def initialize(options={})
7
7
  super(options)
8
8
  @rows = []
9
9
  end
10
10
 
11
- def load(result)
12
- if (result.is_a? Hash)
13
- @rows << result.values
14
- else
15
- @rows << result
16
- end
11
+ def load(record)
12
+ @rows << record.to_h_flattened.values
17
13
  end
18
14
 
19
15
  def finish
@@ -1,23 +1,32 @@
1
1
  module Chronicle
2
- module Etl
2
+ module ETL
3
+ # Abstract class representing a Loader for an ETL job
3
4
  class Loader
4
- extend Chronicle::Etl::Catalog
5
+ extend Chronicle::ETL::Catalog
5
6
 
7
+ # Construct a new instance of this loader. Options are passed in from a Runner
8
+ # == Parameters:
9
+ # options::
10
+ # Options for configuring this Loader
6
11
  def initialize(options = {})
7
12
  @options = options
8
13
  end
9
14
 
15
+ # Called once before processing records
10
16
  def start; end
11
17
 
18
+ # Load a single record
12
19
  def load
13
20
  raise NotImplementedError
14
21
  end
15
22
 
23
+ # Called once there are no more records to process
16
24
  def finish; end
17
25
  end
18
26
  end
19
27
  end
20
28
 
21
29
  require_relative 'csv_loader'
30
+ require_relative 'rest_loader'
22
31
  require_relative 'stdout_loader'
23
32
  require_relative 'table_loader'
@@ -0,0 +1,33 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+ require 'json'
4
+
5
module Chronicle
  module ETL
    # Loader that POSTs each record, serialized as JSON:API, to the HTTP
    # endpoint described by the `:hostname`, `:endpoint` and `:access_token`
    # options.
    class RestLoader < Chronicle::ETL::Loader
      def initialize(options = {})
        super(options)
      end

      # Serialize `record` and POST it to the configured endpoint.
      #
      # @param record the transformed record to send
      # @return the value of `http.request`, i.e. the HTTP response. Its
      #   status is not inspected here.
      def load(record)
        payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
        # have the outer data key that json-api expects
        payload = { data: payload } unless payload[:data]

        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        # Use String keys throughout; the original mixed a String key with a
        # Symbol key (`"Content-Type":`).
        headers = {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }

        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          request = Net::HTTP::Post.new(uri.request_uri, headers)
          request.body = payload.to_json
          http.request(request)
        end
      end
    end
  end
end
@@ -1,9 +1,9 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdoutLoader < Chronicle::Etl::Loader
4
- def load(result)
5
- puts result.inspect
2
+ module ETL
3
+ class StdoutLoader < Chronicle::ETL::Loader
4
+ def load(record)
5
+ puts record.to_h
6
6
  end
7
7
  end
8
8
  end
9
- end
9
+ end
@@ -1,20 +1,21 @@
1
1
  require 'tty/table'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class TableLoader < Chronicle::Etl::Loader
4
+ module ETL
5
+ class TableLoader < Chronicle::ETL::Loader
6
6
  def initialize(options)
7
7
  super(options)
8
8
  end
9
9
 
10
- def load(result)
11
- @table ||= TTY::Table.new(header: result.keys)
12
- values = result.values.map{|x| x.to_s[0..30]}
10
+ def load(record)
11
+ record_hash = record.to_h_flattened
12
+ @table ||= TTY::Table.new(header: record_hash.keys)
13
+ values = record_hash.values.map{|x| x.to_s[0..30]}
13
14
  @table << values
14
15
  end
15
16
 
16
17
  def finish
17
- puts @table.render(:ascii, padding: [0, 1])
18
+ puts @table.render(:ascii, padding: [0, 1]) if @table
18
19
  end
19
20
  end
20
21
  end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with type 'activities'. Declares its own attribute and
      # association names and exposes a reader/writer pair for each.
      class Activity < Chronicle::ETL::Models::Base
        TYPE = 'activities'.freeze
        ATTRIBUTES = %i[verb start_at end_at].freeze
        ASSOCIATIONS = %i[involved actor].freeze

        attr_accessor(*ATTRIBUTES)
        attr_accessor(*ASSOCIATIONS)
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ require 'digest'
2
+
3
module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and is
      # ready to be loaded. Loosely based on ActiveModel.
      #
      # Subclasses are expected to define TYPE, ATTRIBUTES and ASSOCIATIONS
      # constants; they are read through `self.class::`.
      class Base
        ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] values to assign; keys without a
        #   matching public setter are ignored
        def initialize(attributes = {})
          # Set the default first so that a `dedupe_on:` supplied by the
          # caller is not overwritten afterwards.
          @dedupe_on = []
          assign_attributes(attributes) if attributes
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids.
        #
        # @return [Hash] with :type, :id and :lids, nil values removed
        def identifier_hash
          { type: self.class::TYPE, id: @id, lids: lids }.compact
        end

        # Array of local ids that uniquely identify this record, one per
        # field set in `dedupe_on`. Field sets with a nil value are skipped.
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a
        # unique local id by hashing the field values.
        #
        # @param fields [Array<Symbol>] field names; sorted before hashing
        # @return [String, nil] SHA256 hex digest of the comma-joined values,
        #   or nil if any value is nil
        def generate_lid(fields)
          values = fields.sort.map { |field| read_field(field) }
          return if values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(','))
        end

        # Set of attribute names that this model has is Base's shared
        # attributes combined with the child class's
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes that are not nil
        def attributes
          attribute_list.each_with_object({}) do |name, result|
            result[name] = read_field(name)
          end.compact
        end

        # All of this record's associations that have a truthy value
        def associations
          (ASSOCIATIONS + self.class::ASSOCIATIONS).each_with_object({}) do |name, result|
            value = read_field(name)
            result[name] = value if value
          end
        end

        # Associations with each associated object converted via `to_h`
        def associations_hash
          associations.transform_values(&:to_h)
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # Identifier, attributes and associations merged into a single Hash
        def to_h
          identifier_hash.merge(attributes).merge(associations_hash)
        end

        private

        # Assign each key/value through the matching public setter, if any
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end

        # Read the instance variable named after `field`
        def read_field(field)
          instance_variable_get("@#{field}")
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with type 'entities'. Declares its own attribute names and
      # exposes a reader/writer pair for each; it has no associations yet.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze
        ATTRIBUTES = %i[title body represents slug].freeze
        ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema

        ATTRIBUTES.each { |name| attr_accessor name }
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
module Chronicle
  module ETL
    module Models
      # Model with type 'generic' whose attributes are whatever was passed
      # in as `properties`, not a fixed attribute list.
      class Generic < Chronicle::ETL::Models::Base
        TYPE = 'generic'.freeze

        attr_accessor :properties

        # @param properties [Hash, nil] arbitrary key/value pairs; nil is
        #   treated as an empty Hash, matching Base's tolerance of nil
        def initialize(properties = {})
          @properties = properties || {}
          super
        end

        # Generic models have arbitrary attributes stored in @properties
        #
        # @return [Hash] the properties with keys converted to symbols
        def attributes
          @properties.transform_keys(&:to_sym)
        end
      end
    end
  end
end
@@ -1,59 +1,37 @@
1
- class Chronicle::Etl::Runner
2
- BUILTIN = {
3
- extractor: ['stdin', 'json', 'csv', 'file'],
4
- transformer: ['null'],
5
- loader: ['stdout', 'csv', 'table']
6
- }.freeze
1
+ require 'colorize'
7
2
 
8
- def initialize(options)
9
- @options = options
10
-
11
- instantiate_etl_classes
3
+ class Chronicle::ETL::Runner
4
+ def initialize(job)
5
+ @job = job
6
+ @job_logger = Chronicle::ETL::JobLogger.new(@job)
12
7
  end
13
8
 
14
9
  def run!
15
- total = @extractor.results_count
16
- progress_bar = Chronicle::Etl::Utils::ProgressBar.new(title: 'Running job', total: total)
17
- count = 0
10
+ extractor = @job.instantiate_extractor
11
+ loader = @job.instantiate_loader
12
+
13
+ @job_logger.start
14
+ loader.start
15
+
16
+ total = extractor.results_count
17
+ progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
18
18
 
19
- @loader.start
19
+ extractor.extract do |data, metadata|
20
+ transformer = @job.instantiate_transformer(data)
21
+ record = transformer.transform
20
22
 
21
- @extractor.extract do |data, metadata|
22
- transformed_data = @transformer.transform(data)
23
- @loader.load(transformed_data)
23
+ unless record.is_a?(Chronicle::ETL::Models::Base)
24
+ raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
25
+ end
24
26
 
27
+ @job_logger.log_transformation(transformer)
28
+ loader.load(record)
25
29
  progress_bar.increment
26
- count += 1
27
30
  end
28
31
 
29
32
  progress_bar.finish
30
- @loader.finish
31
- end
32
-
33
- private
34
-
35
- def instantiate_etl_classes
36
- @extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
37
- @transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
38
- @loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
39
- end
40
-
41
- def load_etl_class(phase, x)
42
- if BUILTIN[phase].include? x
43
- klass_name = "Chronicle::Etl::#{x.capitalize}#{phase.to_s.capitalize}"
44
- else
45
- # TODO: come up with syntax for specifying a particular extractor in a provider library
46
- provider, name = x.split(":")
47
- provider = x unless provider
48
- begin
49
- require "chronicle/#{provider}"
50
- rescue LoadError => e
51
- warn("Error loading #{phase} '#{provider}'")
52
- warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
53
- exit(false)
54
- end
55
- klass_name = "Chronicle::#{provider.capitalize}::#{name&.capitalize}#{phase.capitalize}"
56
- end
57
- Object.const_get(klass_name)
33
+ loader.finish
34
+ @job_logger.finish
35
+ @job_logger.save
58
36
  end
59
37
  end