chronicle-etl 0.1.4 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +31 -13
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +15 -2
  9. data/lib/chronicle/etl/catalog.rb +67 -17
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +19 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +2 -3
  17. data/lib/chronicle/etl/extractors/extractor.rb +21 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +71 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +85 -0
  23. data/lib/chronicle/etl/job_logger.rb +78 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -8
  25. data/lib/chronicle/etl/loaders/loader.rb +11 -2
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +33 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +5 -5
  28. data/lib/chronicle/etl/loaders/table_loader.rb +7 -6
  29. data/lib/chronicle/etl/models/activity.rb +15 -0
  30. data/lib/chronicle/etl/models/base.rb +103 -0
  31. data/lib/chronicle/etl/models/entity.rb +15 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/runner.rb +24 -46
  34. data/lib/chronicle/etl/transformers/null_transformer.rb +5 -6
  35. data/lib/chronicle/etl/transformers/transformer.rb +23 -7
  36. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  37. data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
  38. data/lib/chronicle/etl/utils/progress_bar.rb +2 -2
  39. data/lib/chronicle/etl/version.rb +2 -2
  40. metadata +91 -5
  41. data/CHANGELOG.md +0 -23
  42. data/lib/chronicle/etl/cli.rb +0 -56
  43. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,78 @@
1
+ require 'sequel'
2
+ require 'forwardable'
3
+
4
+ require 'pry'
5
+
require 'fileutils'

module Chronicle
  module ETL
    # Saves JobLogs to db and loads previous ones.
    #
    # Logs are stored in a SQLite database in a `job_logs` table. Both the
    # containing directory and the table are created on first use.
    class JobLogger
      extend Forwardable

      def_delegators :@job_log, :start, :finish, :log_transformation

      # Create a new JobLogger wrapping a fresh JobLog for `job`
      #
      # @param job the job whose run is being logged
      def initialize(job)
        @job_log = JobLog.new do |job_log|
          job_log.job = job
        end
      end

      # Save this JobLogger's JobLog to db.
      # Does nothing when the log reports that it should not be saved.
      def save
        return unless @job_log.save_log?

        JobLogger.with_db_connection do |db|
          db[:job_logs].insert(@job_log.serialize)
        end
      end

      # For a given `job_id`, return the last successful log
      #
      # @param job_id identifier stored in the `job_id` column
      # @return the JobLog rebuilt from the most recently finished successful
      #   row for that job, or nil when there is none
      def self.load_latest(job_id)
        with_db_connection do |db|
          # Restrict to this job's rows; without the job_id condition the
          # latest successful log of *any* job would be returned.
          attrs = db[:job_logs]
                  .where(job_id: job_id, success: true)
                  .reverse_order(:finished_at)
                  .first
          JobLog.build_from_serialized(attrs) if attrs
        end
      end

      # Open a connection to the log database and yield it to the block,
      # creating the database directory and the schema first if needed.
      #
      # @yield [db] the Sequel database connection
      # @return the value of the block
      def self.with_db_connection
        initialize_db unless db_exists?
        Sequel.connect("sqlite://#{db_filename}") do |db|
          initialize_schema(db) unless schema_exists?(db)
          yield db
        end
      end

      # @return [Boolean] whether the database file is present on disk
      def self.db_exists?
        File.exist?(db_filename)
      end

      # @param db the database connection to inspect
      # @return [Boolean] whether the `job_logs` table has been created
      def self.schema_exists?(db)
        db.tables.include?(:job_logs)
      end

      # @return [String] path of the database file, as resolved by Runcom
      def self.db_filename
        Runcom::Data.new("chronicle/etl/job_log.db").all[0].to_s
      end

      # Make sure the directory that holds the database file exists
      def self.initialize_db
        FileUtils.mkdir_p(File.dirname(db_filename))
      end

      # Create the `job_logs` table
      #
      # @param db the database connection to create the table on
      def self.initialize_schema(db)
        db.create_table :job_logs do
          primary_key :id
          String :job_id, null: false
          String :last_id
          Time :highest_timestamp
          Integer :num_records_processed
          boolean :success, default: false
          Time :started_at
          Time :finished_at
        end
      end
    end
  end
end
@@ -1,19 +1,15 @@
1
1
  require 'csv'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class CsvLoader < Chronicle::Etl::Loader
4
+ module ETL
5
+ class CsvLoader < Chronicle::ETL::Loader
6
6
  def initialize(options={})
7
7
  super(options)
8
8
  @rows = []
9
9
  end
10
10
 
11
- def load(result)
12
- if (result.is_a? Hash)
13
- @rows << result.values
14
- else
15
- @rows << result
16
- end
11
+ def load(record)
12
+ @rows << record.to_h_flattened.values
17
13
  end
18
14
 
19
15
  def finish
@@ -1,23 +1,32 @@
1
1
  module Chronicle
2
- module Etl
2
+ module ETL
3
+ # Abstract class representing a Loader for an ETL job
3
4
  class Loader
4
- extend Chronicle::Etl::Catalog
5
+ extend Chronicle::ETL::Catalog
5
6
 
7
+ # Construct a new instance of this loader. Options are passed in from a Runner
8
+ # == Parameters:
9
+ # options::
10
+ # Options for configuring this Loader
6
11
  def initialize(options = {})
7
12
  @options = options
8
13
  end
9
14
 
15
+ # Called once before processing records
10
16
  def start; end
11
17
 
18
+ # Load a single record
12
19
  def load
13
20
  raise NotImplementedError
14
21
  end
15
22
 
23
+ # Called once there are no more records to process
16
24
  def finish; end
17
25
  end
18
26
  end
19
27
  end
20
28
 
21
29
  require_relative 'csv_loader'
30
+ require_relative 'rest_loader'
22
31
  require_relative 'stdout_loader'
23
32
  require_relative 'table_loader'
@@ -0,0 +1,33 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+ require 'json'
4
+
module Chronicle
  module ETL
    # Loader that POSTs each record to a REST endpoint as a JSON payload.
    #
    # Reads `:hostname`, `:endpoint` and `:access_token` from the options
    # it was constructed with.
    class RestLoader < Chronicle::ETL::Loader
      def initialize(options = {})
        super(options)
      end

      # Serialize `record` and POST it to the configured endpoint.
      #
      # The HTTP response is returned as-is; its status is not inspected
      # here, so a rejected upload does not raise.
      #
      # @param record the transformed record to upload
      # @return the response object produced by the HTTP request
      def load(record)
        payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
        # have the outer data key that json-api expects
        payload = { data: payload } unless payload[:data]

        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        # String keys throughout: header names are plain strings, and a
        # symbol key would depend on Net::HTTP coercing it.
        headers = {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }

        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          request = Net::HTTP::Post.new(uri.request_uri, headers)
          request.body = payload.to_json
          http.request(request)
        end
      end
    end
  end
end
@@ -1,9 +1,9 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdoutLoader < Chronicle::Etl::Loader
4
- def load(result)
5
- puts result.inspect
2
+ module ETL
3
+ class StdoutLoader < Chronicle::ETL::Loader
4
+ def load(record)
5
+ puts record.to_h
6
6
  end
7
7
  end
8
8
  end
9
- end
9
+ end
@@ -1,20 +1,21 @@
1
1
  require 'tty/table'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class TableLoader < Chronicle::Etl::Loader
4
+ module ETL
5
+ class TableLoader < Chronicle::ETL::Loader
6
6
  def initialize(options)
7
7
  super(options)
8
8
  end
9
9
 
10
- def load(result)
11
- @table ||= TTY::Table.new(header: result.keys)
12
- values = result.values.map{|x| x.to_s[0..30]}
10
+ def load(record)
11
+ record_hash = record.to_h_flattened
12
+ @table ||= TTY::Table.new(header: record_hash.keys)
13
+ values = record_hash.values.map{|x| x.to_s[0..30]}
13
14
  @table << values
14
15
  end
15
16
 
16
17
  def finish
17
- puts @table.render(:ascii, padding: [0, 1])
18
+ puts @table.render(:ascii, padding: [0, 1]) if @table
18
19
  end
19
20
  end
20
21
  end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
module Chronicle
  module ETL
    module Models
      # Record model whose TYPE is 'activities'. On top of what Base
      # provides, it exposes accessors for its own attributes (verb,
      # start_at, end_at) and associations (involved, actor).
      class Activity < Chronicle::ETL::Models::Base
        TYPE = 'activities'.freeze
        ATTRIBUTES = %i[verb start_at end_at].freeze
        ASSOCIATIONS = %i[involved actor].freeze

        attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ require 'digest'
2
+
module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and is
      # ready to be loaded. Loosely based on ActiveModel.
      #
      # Subclasses are read through `self.class::TYPE`,
      # `self.class::ATTRIBUTES` and `self.class::ASSOCIATIONS`, so they
      # supply a TYPE constant and may extend the attribute/association lists.
      class Base
        ATTRIBUTES = %i[provider provider_id lat lng].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] values keyed by attribute name; keys
        #   without a matching public setter are ignored
        def initialize(attributes = {})
          assign_attributes(attributes) if attributes
          # Default only when the caller did not supply one; assigning []
          # unconditionally here would throw away a passed-in `dedupe_on`.
          @dedupe_on ||= []
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids.
        #
        # @return [Hash] :type, :id and :lids, with nil entries removed
        def identifier_hash
          {
            type: self.class::TYPE,
            id: @id,
            lids: lids
          }.compact
        end

        # Array of local ids that uniquely identify this record
        #
        # @return [Array<String>] one lid per entry of `dedupe_on`, skipping
        #   entries for which no lid could be generated, without duplicates
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a
        # unique local id by hashing the field values
        #
        # @param fields [Array<Symbol>] names of the fields to hash; they are
        #   sorted first so the order they are given in does not matter
        # @return [String, nil] SHA256 hex digest of the comma-joined values,
        #   or nil if any of the fields is unset
        def generate_lid(fields)
          values = fields.sort.map { |field| instance_variable_get("@#{field}") }

          return if values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(","))
        end

        # Set of attribute names that this model has: Base's shared
        # attributes combined with the child class's
        #
        # @return [Array<Symbol>]
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes
        #
        # @return [Hash] attribute name => value, omitting nil values
        def attributes
          attribute_list.each_with_object({}) do |name, result|
            value = instance_variable_get("@#{name}")
            result[name] = value unless value.nil?
          end
        end

        # All of this record's associations
        #
        # @return [Hash] association name => associated object, omitting
        #   associations that are unset
        def associations
          association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
          association_list.each_with_object({}) do |name, result|
            value = instance_variable_get("@#{name}")
            result[name] = value if value
          end
        end

        # @return [Hash] association name => `to_h` of the associated object
        def associations_hash
          associations.transform_values(&:to_h)
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # @return [Hash] identifier, attributes and associations merged, with
        #   later groups overriding earlier ones on key clashes
        def to_h
          identifier_hash.merge(attributes).merge(associations_hash)
        end

        private

        # Assign each value through its public setter, ignoring keys that
        # have none.
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
module Chronicle
  module ETL
    module Models
      # Record model whose TYPE is 'entities'. On top of what Base
      # provides, it exposes accessors for title, body, represents and slug.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze
        ATTRIBUTES = %i[title body represents slug].freeze
        ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema

        attr_accessor(*ATTRIBUTES)
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'chronicle/etl/models/base'
2
+
module Chronicle
  module ETL
    module Models
      # Record model whose TYPE is 'generic'. Instead of a fixed attribute
      # list it carries an arbitrary hash of properties.
      class Generic < Chronicle::ETL::Models::Base
        TYPE = 'generic'.freeze

        attr_accessor :properties

        # @param properties [Hash, nil] arbitrary key/value pairs; they are
        #   also handed to Base#initialize, which assigns any key that has a
        #   matching public setter
        def initialize(properties = {})
          @properties = properties
          super
        end

        # Generic models have arbitrary attributes stored in @properties
        #
        # @return [Hash] the properties with their keys converted to symbols;
        #   empty when no properties were given
        def attributes
          # Base#initialize accepts nil, so tolerate nil properties here too
          # instead of raising NoMethodError.
          (@properties || {}).transform_keys(&:to_sym)
        end
      end
    end
  end
end
@@ -1,59 +1,37 @@
1
- class Chronicle::Etl::Runner
2
- BUILTIN = {
3
- extractor: ['stdin', 'json', 'csv', 'file'],
4
- transformer: ['null'],
5
- loader: ['stdout', 'csv', 'table']
6
- }.freeze
1
+ require 'colorize'
7
2
 
8
- def initialize(options)
9
- @options = options
10
-
11
- instantiate_etl_classes
3
+ class Chronicle::ETL::Runner
4
+ def initialize(job)
5
+ @job = job
6
+ @job_logger = Chronicle::ETL::JobLogger.new(@job)
12
7
  end
13
8
 
14
9
  def run!
15
- total = @extractor.results_count
16
- progress_bar = Chronicle::Etl::Utils::ProgressBar.new(title: 'Running job', total: total)
17
- count = 0
10
+ extractor = @job.instantiate_extractor
11
+ loader = @job.instantiate_loader
12
+
13
+ @job_logger.start
14
+ loader.start
15
+
16
+ total = extractor.results_count
17
+ progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
18
18
 
19
- @loader.start
19
+ extractor.extract do |data, metadata|
20
+ transformer = @job.instantiate_transformer(data)
21
+ record = transformer.transform
20
22
 
21
- @extractor.extract do |data, metadata|
22
- transformed_data = @transformer.transform(data)
23
- @loader.load(transformed_data)
23
+ unless record.is_a?(Chronicle::ETL::Models::Base)
24
+ raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
25
+ end
24
26
 
27
+ @job_logger.log_transformation(transformer)
28
+ loader.load(record)
25
29
  progress_bar.increment
26
- count += 1
27
30
  end
28
31
 
29
32
  progress_bar.finish
30
- @loader.finish
31
- end
32
-
33
- private
34
-
35
- def instantiate_etl_classes
36
- @extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
37
- @transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
38
- @loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
39
- end
40
-
41
- def load_etl_class(phase, x)
42
- if BUILTIN[phase].include? x
43
- klass_name = "Chronicle::Etl::#{x.capitalize}#{phase.to_s.capitalize}"
44
- else
45
- # TODO: come up with syntax for specifying a particular extractor in a provider library
46
- provider, name = x.split(":")
47
- provider = x unless provider
48
- begin
49
- require "chronicle/#{provider}"
50
- rescue LoadError => e
51
- warn("Error loading #{phase} '#{provider}'")
52
- warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
53
- exit(false)
54
- end
55
- klass_name = "Chronicle::#{provider.capitalize}::#{name&.capitalize}#{phase.capitalize}"
56
- end
57
- Object.const_get(klass_name)
33
+ loader.finish
34
+ @job_logger.finish
35
+ @job_logger.save
58
36
  end
59
37
  end