chronicle-etl 0.1.4 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.yardopts +1 -0
- data/Gemfile.lock +15 -1
- data/README.md +31 -13
- data/chronicle-etl.gemspec +6 -1
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +15 -2
- data/lib/chronicle/etl/catalog.rb +67 -17
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +116 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +53 -0
- data/lib/chronicle/etl/exceptions.rb +19 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +2 -3
- data/lib/chronicle/etl/extractors/extractor.rb +21 -5
- data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
- data/lib/chronicle/etl/job.rb +71 -0
- data/lib/chronicle/etl/job_definition.rb +51 -0
- data/lib/chronicle/etl/job_log.rb +85 -0
- data/lib/chronicle/etl/job_logger.rb +78 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +4 -8
- data/lib/chronicle/etl/loaders/loader.rb +11 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +33 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +7 -6
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/base.rb +103 -0
- data/lib/chronicle/etl/models/entity.rb +15 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/runner.rb +24 -46
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -6
- data/lib/chronicle/etl/transformers/transformer.rb +23 -7
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -2
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +91 -5
- data/CHANGELOG.md +0 -23
- data/lib/chronicle/etl/cli.rb +0 -56
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
require 'pry'
|
5
|
+
|
6
|
+
module Chronicle
|
7
|
+
module ETL
|
8
|
+
# Saves JobLogs to db and loads previous ones.
#
# Each JobLogger wraps one JobLog; the class-level helpers manage the
# SQLite database that the serialized logs are written to.
class JobLogger
  extend Forwardable

  # Lifecycle calls are forwarded straight to the wrapped JobLog.
  def_delegators :@job_log, :start, :finish, :log_transformation

  # Create a new JobLogger with a fresh JobLog bound to +job+.
  #
  # @param job the job being run; assigned to the JobLog through +job=+
  def initialize(job)
    @job_log = JobLog.new do |job_log|
      job_log.job = job
    end
  end

  # Save this JobLogger's JobLog to the +job_logs+ table.
  # Nothing is written when the JobLog answers false to +save_log?+.
  def save
    return unless @job_log.save_log?

    JobLogger.with_db_connection do |db|
      db[:job_logs].insert(@job_log.serialize)
    end
  end

  # For a given +job_id+, return the last successful log.
  #
  # The query is restricted to rows of that job; without the +job_id+
  # condition the most recent successful run of *any* job would be returned.
  #
  # @param job_id identifier stored in the +job_id+ column
  # @return the result of JobLog.build_from_serialized for the newest
  #   matching row (ordered by +finished_at+), or nil when there is none
  def self.load_latest(job_id)
    with_db_connection do |db|
      attrs = db[:job_logs]
              .where(job_id: job_id, success: true)
              .reverse_order(:finished_at)
              .first
      JobLog.build_from_serialized(attrs) if attrs
    end
  end

  # Connect to the SQLite database at +db_filename+ and yield the
  # connection. The parent directory is created when the db file is
  # missing, and the +job_logs+ table is created when it is absent.
  def self.with_db_connection
    initialize_db unless db_exists?
    Sequel.connect("sqlite://#{db_filename}") do |db|
      initialize_schema(db) unless schema_exists?(db)
      yield db
    end
  end

  # Whether the database file is present on disk.
  def self.db_exists?
    # File.exists? no longer exists on Ruby 3.2+; File.exist? is the
    # supported spelling on every Ruby version.
    File.exist?(db_filename)
  end

  # Whether the +job_logs+ table is among the tables of +db+.
  def self.schema_exists?(db)
    db.tables.include?(:job_logs)
  end

  # Path of the database file, taken from the first entry that
  # Runcom::Data reports for "chronicle/etl/job_log.db".
  def self.db_filename
    data = Runcom::Data.new "chronicle/etl/job_log.db"
    data.all[0].to_s
  end

  # Create the directory that will contain the database file.
  def self.initialize_db
    FileUtils.mkdir_p(File.dirname(db_filename))
  end

  # Create the +job_logs+ table on +db+.
  def self.initialize_schema(db)
    db.create_table :job_logs do
      primary_key :id
      String :job_id, null: false
      String :last_id
      Time :highest_timestamp
      Integer :num_records_processed
      boolean :success, default: false
      Time :started_at
      Time :finished_at
    end
  end
end
|
77
|
+
end
|
78
|
+
end
|
@@ -1,19 +1,15 @@
|
|
1
1
|
require 'csv'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
5
|
-
class CsvLoader < Chronicle::
|
4
|
+
module ETL
|
5
|
+
class CsvLoader < Chronicle::ETL::Loader
|
6
6
|
# Build the loader with the given options and start with no buffered rows.
def initialize(options = {})
  super
  @rows = []
end
|
10
10
|
|
11
|
-
def load(
|
12
|
-
|
13
|
-
@rows << result.values
|
14
|
-
else
|
15
|
-
@rows << result
|
16
|
-
end
|
11
|
+
# Buffer one record as a row consisting of the values of its flattened hash.
def load(record)
  row = record.to_h_flattened.values
  @rows.push(row)
end
|
18
14
|
|
19
15
|
def finish
|
@@ -1,23 +1,32 @@
|
|
1
1
|
module Chronicle
|
2
|
-
module
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Loader for an ETL job
class Loader
  extend Chronicle::ETL::Catalog

  # Construct a new instance of this loader. Options are passed in from a Runner
  # == Parameters:
  # options::
  #   Options for configuring this Loader
  def initialize(options = {})
    @options = options
  end

  # Called once before processing records
  def start; end

  # Load a single record. Concrete loaders must override this.
  #
  # The parameter mirrors the +load(record)+ signature used by the concrete
  # loaders, so a subclass that forgets to override fails with
  # NotImplementedError rather than an ArgumentError about arity. It is
  # optional so that a zero-argument call behaves exactly as before.
  #
  # == Parameters:
  # _record::
  #   The record to load (ignored here)
  def load(_record = nil)
    raise NotImplementedError
  end

  # Called once there are no more records to process
  def finish; end
end
|
18
26
|
end
|
19
27
|
end
|
20
28
|
|
21
29
|
require_relative 'csv_loader'
|
30
|
+
require_relative 'rest_loader'
|
22
31
|
require_relative 'stdout_loader'
|
23
32
|
require_relative 'table_loader'
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Chronicle
|
6
|
+
module ETL
|
7
|
+
# Loader that POSTs each record as JSON to a REST endpoint.
#
# Reads :hostname, :endpoint and :access_token from @options. Construction
# is inherited from Loader; the previous override only forwarded its
# argument to super.
class RestLoader < Chronicle::ETL::Loader
  # Serialize +record+ and POST it to "#{hostname}#{endpoint}".
  #
  # @param record the transformed record to send
  # @return the value returned by +http.request+ for the POST
  def load(record)
    payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
    # have the outer data key that json-api expects
    payload = { data: payload } unless payload[:data]

    uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

    # Both header names are plain Strings. The original wrote
    # `"Content-Type": ...`, which creates a Symbol key next to the String
    # key for Authorization.
    header = {
      'Authorization' => "Bearer #{@options[:access_token]}",
      'Content-Type' => 'application/json'
    }

    Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
      request = Net::HTTP::Post.new(uri.request_uri, header)
      request.body = payload.to_json
      http.request(request)
    end
  end
end
|
32
|
+
end
|
33
|
+
end
|
@@ -1,20 +1,21 @@
|
|
1
1
|
require 'tty/table'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
5
|
-
class TableLoader < Chronicle::
|
4
|
+
module ETL
|
5
|
+
# Loader that collects records into a TTY::Table and prints the table
# when the job finishes.
class TableLoader < Chronicle::ETL::Loader
  # Options are optional here, matching the other loaders; previously this
  # was the only loader whose constructor required an argument.
  def initialize(options = {})
    super
  end

  # Add one record to the table.
  #
  # The table is created on the first call, using the keys of that record's
  # flattened hash as the header. Each value is converted to a String and
  # cut to its first 31 characters.
  def load(record)
    record_hash = record.to_h_flattened
    @table ||= TTY::Table.new(header: record_hash.keys)
    @table << record_hash.values.map { |value| value.to_s[0..30] }
  end

  # Print the table in ASCII style; prints nothing when no record was loaded.
  def finish
    puts @table.render(:ascii, padding: [0, 1]) if @table
  end
end
|
20
21
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# Model whose TYPE is 'activities'. Declares the activity-specific
# attribute and association names and exposes an accessor for each.
class Activity < Chronicle::ETL::Models::Base
  TYPE = 'activities'.freeze
  ATTRIBUTES = %i[verb start_at end_at].freeze
  ASSOCIATIONS = %i[involved actor].freeze

  attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'digest'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# Represents a record that's been transformed by a Transformer and
# ready to be loaded. Loosely based on ActiveModel.
#
# Subclasses supply their own TYPE, ATTRIBUTES and ASSOCIATIONS constants;
# the methods below read them through +self.class::+.
class Base
  ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
  ASSOCIATIONS = [].freeze

  attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

  # @param attributes [Hash, nil] values keyed by attribute name; keys
  #   without a matching public writer are ignored
  def initialize(attributes = {})
    assign_attributes(attributes) if attributes
    # Only fall back to an empty list when the caller supplied nothing.
    # Resetting unconditionally would throw away a dedupe_on: passed in
    # +attributes+.
    @dedupe_on ||= []
  end

  # A unique identifier for this model is formed from a type
  # and either an id or lids.
  def identifier_hash
    {
      type: self.class::TYPE,
      id: @id,
      lids: lids
    }.compact
  end

  # Array of local ids that uniquely identify this record: one per field
  # set in +dedupe_on+, skipping sets with a missing value and duplicates.
  def lids
    @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
  end

  # For a given set of fields of this model, generate a
  # unique local id by hashing the field values.
  #
  # @param fields [Array<Symbol>] attribute names; sorted before hashing so
  #   the order they are listed in does not matter
  # @return [String, nil] SHA256 hex digest of the comma-joined values, or
  #   nil when any of the fields is nil
  def generate_lid(fields)
    values = fields.sort.map { |field| instance_variable_get("@#{field}") }
    return if values.any?(&:nil?)

    Digest::SHA256.hexdigest(values.join(','))
  end

  # Set of attribute names that this model has is Base's shared
  # attributes combined with the child class's
  def attribute_list
    (ATTRIBUTES + self.class::ATTRIBUTES).uniq
  end

  # All of this record's attributes that are not nil
  def attributes
    attribute_list.each_with_object({}) do |attribute, result|
      value = instance_variable_get("@#{attribute}")
      result[attribute] = value unless value.nil?
    end
  end

  # All of this record's associations that are set (nil/false are left out)
  def associations
    (ASSOCIATIONS + self.class::ASSOCIATIONS).each_with_object({}) do |name, result|
      association = instance_variable_get("@#{name}")
      result[name] = association if association
    end
  end

  # Associations with each associated object converted through +to_h+
  def associations_hash
    associations.transform_values(&:to_h)
  end

  # FIXME: move this to a Utils module
  def to_h_flattened
    Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
  end

  # Identifier, attributes and associations merged into one Hash; on key
  # collisions later parts win.
  def to_h
    identifier_hash.merge(attributes).merge(associations_hash)
  end

  private

  # Call the public writer for every key that has one.
  def assign_attributes(attributes)
    attributes.each do |key, value|
      setter = :"#{key}="
      public_send(setter, value) if respond_to?(setter)
    end
  end
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# Model whose TYPE is 'entities'. Declares the entity-specific attribute
# names and exposes an accessor for each.
class Entity < Chronicle::ETL::Models::Base
  TYPE = 'entities'.freeze
  ATTRIBUTES = %i[title body represents slug].freeze
  ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema

  attr_accessor(*ATTRIBUTES)
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# Model for records without a fixed schema: whatever Hash is passed in is
# kept in +properties+ and reported as the record's attributes.
class Generic < Chronicle::ETL::Models::Base
  # Frozen like the TYPE constants of the sibling models.
  TYPE = 'generic'.freeze

  attr_accessor :properties

  # @param properties [Hash, nil] arbitrary key/value pairs; nil is treated
  #   as an empty Hash so that +attributes+ keeps working
  def initialize(properties = {})
    @properties = properties || {}
    # Bare super forwards +properties+ to Base#initialize unchanged.
    super
  end

  # Generic models have arbitrary attributes stored in @properties;
  # keys are returned as Symbols.
  def attributes
    @properties.transform_keys(&:to_sym)
  end
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,59 +1,37 @@
|
|
1
|
-
|
2
|
-
BUILTIN = {
|
3
|
-
extractor: ['stdin', 'json', 'csv', 'file'],
|
4
|
-
transformer: ['null'],
|
5
|
-
loader: ['stdout', 'csv', 'table']
|
6
|
-
}.freeze
|
1
|
+
require 'colorize'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
# Runs one job: every piece of extracted data is transformed into a record,
# handed to the loader, and noted in the job's JobLogger.
class Chronicle::ETL::Runner
  # @param job the job to run; it supplies the extractor, transformer and
  #   loader instances
  def initialize(job)
    @job = job
    @job_logger = Chronicle::ETL::JobLogger.new(job)
  end

  # Execute the job from start to finish and save its log.
  #
  # Raises Chronicle::ETL::InvalidTransformedRecordError when a transformer
  # returns something that is not a Chronicle::ETL::Models::Base.
  def run!
    extractor = @job.instantiate_extractor
    loader = @job.instantiate_loader

    @job_logger.start
    loader.start

    progress_bar = Chronicle::ETL::Utils::ProgressBar.new(
      title: 'Running job',
      total: extractor.results_count
    )

    extractor.extract do |data, _metadata|
      transformer = @job.instantiate_transformer(data)
      record = transformer.transform

      valid = record.is_a?(Chronicle::ETL::Models::Base)
      raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models" unless valid

      @job_logger.log_transformation(transformer)
      loader.load(record)
      progress_bar.increment
    end

    progress_bar.finish
    loader.finish
    @job_logger.finish
    @job_logger.save
  end
end
|