chronicle-etl 0.1.4 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.yardopts +1 -0
- data/Gemfile.lock +15 -1
- data/README.md +31 -13
- data/chronicle-etl.gemspec +6 -1
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +15 -2
- data/lib/chronicle/etl/catalog.rb +67 -17
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +116 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +53 -0
- data/lib/chronicle/etl/exceptions.rb +19 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +2 -3
- data/lib/chronicle/etl/extractors/extractor.rb +21 -5
- data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
- data/lib/chronicle/etl/job.rb +71 -0
- data/lib/chronicle/etl/job_definition.rb +51 -0
- data/lib/chronicle/etl/job_log.rb +85 -0
- data/lib/chronicle/etl/job_logger.rb +78 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +4 -8
- data/lib/chronicle/etl/loaders/loader.rb +11 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +33 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +5 -5
- data/lib/chronicle/etl/loaders/table_loader.rb +7 -6
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/base.rb +103 -0
- data/lib/chronicle/etl/models/entity.rb +15 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/runner.rb +24 -46
- data/lib/chronicle/etl/transformers/null_transformer.rb +5 -6
- data/lib/chronicle/etl/transformers/transformer.rb +23 -7
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +2 -2
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +91 -5
- data/CHANGELOG.md +0 -23
- data/lib/chronicle/etl/cli.rb +0 -56
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
require 'pry'
|
5
|
+
|
6
|
+
module Chronicle
  module ETL
    # Saves JobLogs to a local SQLite db and loads previous ones.
    #
    # The instance methods +start+, +finish+ and +log_transformation+ are
    # forwarded to the wrapped JobLog.
    class JobLogger
      extend Forwardable

      def_delegators :@job_log, :start, :finish, :log_transformation

      # Create a new JobLogger wrapping a fresh JobLog for the given job
      #
      # @param job the job whose run is being logged
      def initialize(job)
        @job_log = JobLog.new do |job_log|
          job_log.job = job
        end
      end

      # Save this JobLogger's JobLog to db.
      # Does nothing (returns nil) when the JobLog's +save_log?+ is falsy.
      def save
        return unless @job_log.save_log?

        JobLogger.with_db_connection do |db|
          db[:job_logs].insert(@job_log.serialize)
        end
      end

      # For a given `job_id`, return the last successful log
      #
      # @param job_id [String] identifier of the job whose logs are searched
      # @return the JobLog rebuilt from the most recently finished successful
      #   row for that job, or nil when there is none
      def self.load_latest(job_id)
        with_db_connection do |db|
          # Restrict to this job's rows; without the job_id condition the
          # latest successful log of *any* job would be returned.
          attrs = db[:job_logs]
                  .where(job_id: job_id, success: true)
                  .reverse_order(:finished_at)
                  .first
          JobLog.build_from_serialized(attrs) if attrs
        end
      end

      # Open a connection to the job log db and yield it, making sure the
      # containing directory and the job_logs table exist first.
      #
      # @yieldparam db the Sequel database connection
      def self.with_db_connection
        initialize_db unless db_exists?
        Sequel.connect("sqlite://#{db_filename}") do |db|
          initialize_schema(db) unless schema_exists?(db)
          yield db
        end
      end

      # @return [Boolean] whether the db file is present on disk
      def self.db_exists?
        # File.exists? was deprecated and then removed in Ruby 3.2
        File.exist?(db_filename)
      end

      # @param db the Sequel database connection
      # @return [Boolean] whether the job_logs table has been created
      def self.schema_exists?(db)
        db.tables.include?(:job_logs)
      end

      # @return [String] first path Runcom reports for the job log db file
      def self.db_filename
        Runcom::Data.new('chronicle/etl/job_log.db').all[0].to_s
      end

      # Create the directory that will hold the db file
      def self.initialize_db
        FileUtils.mkdir_p(File.dirname(db_filename))
      end

      # Create the job_logs table
      #
      # @param db the Sequel database connection
      def self.initialize_schema(db)
        db.create_table :job_logs do
          primary_key :id
          String :job_id, null: false
          String :last_id
          Time :highest_timestamp
          Integer :num_records_processed
          boolean :success, default: false
          Time :started_at
          Time :finished_at
        end
      end
    end
  end
end
|
@@ -1,19 +1,15 @@
|
|
1
1
|
require 'csv'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
5
|
-
class CsvLoader < Chronicle::
|
4
|
+
module ETL
|
5
|
+
class CsvLoader < Chronicle::ETL::Loader
|
6
6
|
def initialize(options={})
|
7
7
|
super(options)
|
8
8
|
@rows = []
|
9
9
|
end
|
10
10
|
|
11
|
-
def load(
|
12
|
-
|
13
|
-
@rows << result.values
|
14
|
-
else
|
15
|
-
@rows << result
|
16
|
-
end
|
11
|
+
def load(record)
|
12
|
+
@rows << record.to_h_flattened.values
|
17
13
|
end
|
18
14
|
|
19
15
|
def finish
|
@@ -1,23 +1,32 @@
|
|
1
1
|
module Chronicle
  module ETL
    # Abstract class representing a Loader for an ETL job
    class Loader
      extend Chronicle::ETL::Catalog

      # Construct a new instance of this loader. Options are passed in from a Runner
      # == Parameters:
      # options::
      #   Options for configuring this Loader
      def initialize(options = {})
        @options = options
      end

      # Called once before processing records
      def start; end

      # Load a single record. Subclasses must override this.
      #
      # The parameter is optional only to stay compatible with the previous
      # zero-argument signature; accepting it means a subclass that forgets to
      # override +load+ fails with NotImplementedError rather than an
      # ArgumentError when called as +load(record)+.
      # == Parameters:
      # _record::
      #   The transformed record to load (ignored here)
      def load(_record = nil)
        raise NotImplementedError
      end

      # Called once there are no more records to process
      def finish; end
    end
  end
end
|
20
28
|
|
21
29
|
require_relative 'csv_loader'
|
30
|
+
require_relative 'rest_loader'
|
22
31
|
require_relative 'stdout_loader'
|
23
32
|
require_relative 'table_loader'
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Chronicle
  module ETL
    # Loader that POSTs each record, serialized via Utils::JSONAPI, to a
    # REST endpoint.
    #
    # Reads the following keys from the options hash stored by Loader:
    # +:hostname+, +:endpoint+ and +:access_token+.
    class RestLoader < Chronicle::ETL::Loader
      # Serialize a single record and POST it to the configured endpoint
      #
      # @param record the transformed record to send
      # @return the response object returned by Net::HTTP#request; the status
      #   is not inspected here, so callers must check it themselves
      def load(record)
        payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
        # have the outer data key that json-api expects
        payload = { data: payload } unless payload[:data]

        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          request = Net::HTTP::Post.new(uri.request_uri, request_headers)
          request.body = payload.to_json
          http.request(request)
        end
      end

      private

      # Headers sent with every request. Both keys are plain strings (the
      # `"Content-Type":` literal form would have produced a Symbol key).
      def request_headers
        {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }
      end
    end
  end
end
|
@@ -1,20 +1,21 @@
|
|
1
1
|
require 'tty/table'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
5
|
-
class TableLoader < Chronicle::
|
4
|
+
module ETL
|
5
|
+
# Loader that collects records into a table and prints it once loading is done.
#
# No constructor is defined here: Loader#initialize(options = {}) is
# inherited, so the options argument is optional like for the other loaders.
class TableLoader < Chronicle::ETL::Loader
  # Append one record as a table row. The header is taken from the keys of
  # the first record loaded.
  #
  # @param record a record responding to +to_h_flattened+
  def load(record)
    record_hash = record.to_h_flattened
    @table ||= TTY::Table.new(header: record_hash.keys)
    # Cells are stringified and cut to the range 0..30 to keep the table narrow
    values = record_hash.values.map { |value| value.to_s[0..30] }
    @table << values
  end

  # Print the table as ASCII; prints nothing when no record was loaded
  def finish
    puts @table.render(:ascii, padding: [0, 1]) if @table
  end
end
|
20
21
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Models
      # Model for an activity record: exposes accessors for its own
      # attributes (verb, start_at, end_at) and associations (involved, actor)
      # on top of what Base provides.
      class Activity < Chronicle::ETL::Models::Base
        TYPE = 'activities'.freeze
        ATTRIBUTES = %i[verb start_at end_at].freeze
        ASSOCIATIONS = %i[involved actor].freeze

        attr_accessor(*(ATTRIBUTES + ASSOCIATIONS))
      end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'digest'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Models
      # Represents a record that's been transformed by a Transformer and
      # ready to be loaded. Loosely based on ActiveModel.
      #
      # Subclasses are expected to define TYPE, and may define their own
      # ATTRIBUTES and ASSOCIATIONS constants.
      class Base
        ATTRIBUTES = %i[provider provider_id lat lng].freeze
        ASSOCIATIONS = [].freeze

        attr_accessor(:id, :dedupe_on, *ATTRIBUTES)

        # @param attributes [Hash, nil] values assigned through the matching
        #   public setters; keys without a setter are ignored
        def initialize(attributes = {})
          assign_attributes(attributes) if attributes
          # Only default dedupe_on when it was not supplied; assigning []
          # unconditionally here would discard a caller-provided value.
          @dedupe_on ||= []
        end

        # A unique identifier for this model is formed from a type
        # and either an id or lids.
        #
        # @return [Hash] :type, :id (omitted when nil) and :lids
        def identifier_hash
          {
            type: self.class::TYPE,
            id: @id,
            lids: lids
          }.compact
        end

        # Array of local ids that uniquely identify this record
        #
        # @return [Array<String>] one lid per entry of dedupe_on whose fields
        #   are all set, without duplicates
        def lids
          @dedupe_on.map { |fields| generate_lid(fields) }.compact.uniq
        end

        # For a given set of fields of this model, generate a
        # unique local id by hashing the field values
        #
        # @param fields [Array<Symbol>] names of the fields to hash
        # @return [String, nil] SHA256 hex digest of the comma-joined values
        #   (taken in sorted field order), or nil if any value is nil
        def generate_lid(fields)
          values = fields.sort.map { |field| instance_variable_get("@#{field}") }
          return if values.any?(&:nil?)

          Digest::SHA256.hexdigest(values.join(','))
        end

        # Attribute names of this model: Base's shared attributes combined
        # with those declared by the concrete class
        #
        # @return [Array<Symbol>]
        def attribute_list
          (ATTRIBUTES + self.class::ATTRIBUTES).uniq
        end

        # All of this record's attributes
        #
        # @return [Hash] attribute name => value, nil values left out
        def attributes
          attribute_list.each_with_object({}) do |attribute, result|
            value = instance_variable_get("@#{attribute}")
            result[attribute] = value unless value.nil?
          end
        end

        # All of this record's associations
        #
        # @return [Hash] association name => associated object, unset ones left out
        def associations
          association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
          association_list.each_with_object({}) do |name, result|
            association = instance_variable_get("@#{name}")
            result[name] = association if association
          end
        end

        # @return [Hash] association name => the associated object's +to_h+
        def associations_hash
          associations.transform_values(&:to_h)
        end

        # FIXME: move this to a Utils module
        def to_h_flattened
          Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
        end

        # @return [Hash] identifier, attributes and associations merged, in that order
        def to_h
          identifier_hash.merge(attributes).merge(associations_hash)
        end

        private

        # Call the public setter for each key that has one
        def assign_attributes(attributes)
          attributes.each do |key, value|
            setter = :"#{key}="
            public_send(setter, value) if respond_to?(setter)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Models
      # Model for an entity record: adds title, body, represents and slug
      # accessors on top of what Base provides. Declares no associations yet.
      class Entity < Chronicle::ETL::Models::Base
        TYPE = 'entities'.freeze
        ATTRIBUTES = %i[title body represents slug].freeze
        # TODO: add these to reflect Chronicle Schema
        ASSOCIATIONS = [].freeze

        attr_accessor(*ATTRIBUTES)
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
  module ETL
    module Models
      # Model for records with no fixed schema: whatever hash it is built
      # from is kept in +properties+ and reported as its attributes.
      class Generic < Chronicle::ETL::Models::Base
        TYPE = 'generic'.freeze

        attr_accessor :properties

        # @param properties [Hash, nil] arbitrary key/value pairs; nil is
        #   treated as an empty hash so that #attributes keeps working
        def initialize(properties = {})
          @properties = properties || {}
          # Bare super forwards the original argument to Base#initialize
          super
        end

        # Generic models have arbitrary attributes stored in @properties
        #
        # @return [Hash] the properties with their keys converted to symbols
        def attributes
          @properties.transform_keys(&:to_sym)
        end
      end
    end
  end
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,59 +1,37 @@
|
|
1
|
-
|
2
|
-
BUILTIN = {
|
3
|
-
extractor: ['stdin', 'json', 'csv', 'file'],
|
4
|
-
transformer: ['null'],
|
5
|
-
loader: ['stdout', 'csv', 'table']
|
6
|
-
}.freeze
|
1
|
+
require 'colorize'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
class Chronicle::ETL::Runner
|
4
|
+
def initialize(job)
|
5
|
+
@job = job
|
6
|
+
@job_logger = Chronicle::ETL::JobLogger.new(@job)
|
12
7
|
end
|
13
8
|
|
14
9
|
def run!
|
15
|
-
|
16
|
-
|
17
|
-
|
10
|
+
extractor = @job.instantiate_extractor
|
11
|
+
loader = @job.instantiate_loader
|
12
|
+
|
13
|
+
@job_logger.start
|
14
|
+
loader.start
|
15
|
+
|
16
|
+
total = extractor.results_count
|
17
|
+
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
18
18
|
|
19
|
-
|
19
|
+
extractor.extract do |data, metadata|
|
20
|
+
transformer = @job.instantiate_transformer(data)
|
21
|
+
record = transformer.transform
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
23
|
+
unless record.is_a?(Chronicle::ETL::Models::Base)
|
24
|
+
raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
|
25
|
+
end
|
24
26
|
|
27
|
+
@job_logger.log_transformation(transformer)
|
28
|
+
loader.load(record)
|
25
29
|
progress_bar.increment
|
26
|
-
count += 1
|
27
30
|
end
|
28
31
|
|
29
32
|
progress_bar.finish
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def instantiate_etl_classes
|
36
|
-
@extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
|
37
|
-
@transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
|
38
|
-
@loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
|
39
|
-
end
|
40
|
-
|
41
|
-
def load_etl_class(phase, x)
|
42
|
-
if BUILTIN[phase].include? x
|
43
|
-
klass_name = "Chronicle::Etl::#{x.capitalize}#{phase.to_s.capitalize}"
|
44
|
-
else
|
45
|
-
# TODO: come up with syntax for specifying a particular extractor in a provider library
|
46
|
-
provider, name = x.split(":")
|
47
|
-
provider = x unless provider
|
48
|
-
begin
|
49
|
-
require "chronicle/#{provider}"
|
50
|
-
rescue LoadError => e
|
51
|
-
warn("Error loading #{phase} '#{provider}'")
|
52
|
-
warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
|
53
|
-
exit(false)
|
54
|
-
end
|
55
|
-
klass_name = "Chronicle::#{provider.capitalize}::#{name&.capitalize}#{phase.capitalize}"
|
56
|
-
end
|
57
|
-
Object.const_get(klass_name)
|
33
|
+
loader.finish
|
34
|
+
@job_logger.finish
|
35
|
+
@job_logger.save
|
58
36
|
end
|
59
37
|
end
|