chronicle-etl 0.1.3 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.yardopts +1 -0
- data/Gemfile.lock +15 -1
- data/README.md +62 -11
- data/chronicle-etl.gemspec +6 -1
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +9 -2
- data/lib/chronicle/etl/catalog.rb +68 -18
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +116 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +53 -0
- data/lib/chronicle/etl/exceptions.rb +17 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
- data/lib/chronicle/etl/extractors/extractor.rb +18 -5
- data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
- data/lib/chronicle/etl/job.rb +62 -0
- data/lib/chronicle/etl/job_definition.rb +51 -0
- data/lib/chronicle/etl/job_log.rb +79 -0
- data/lib/chronicle/etl/job_logger.rb +76 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/loader.rb +13 -6
- data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
- data/lib/chronicle/etl/runner.rb +19 -51
- data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
- data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
- data/lib/chronicle/etl/transformers/transformer.rb +21 -4
- data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +85 -4
- data/CHANGELOG.md +0 -18
- data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
require 'pry'
|
5
|
+
|
6
|
+
module Chronicle
  module ETL
    # Saves JobLogs to db and loads previous ones.
    #
    # The instance methods +start+, +finish+ and +log_transformation+ are
    # forwarded to the wrapped JobLog.
    class JobLogger
      extend Forwardable

      def_delegators :@job_log, :start, :finish, :log_transformation

      # Create a new JobLogger that wraps a fresh JobLog for +job+
      def initialize(job)
        @job_log = JobLog.new do |job_log|
          job_log.job = job
        end
      end

      # Save this JobLogger's JobLog to db (inserts one row into :job_logs)
      def save
        JobLogger.with_db_connection do |db|
          db[:job_logs].insert(@job_log.serialize)
        end
      end

      # For a given `job_id`, return the last successful log.
      #
      # Only rows whose job_id matches AND whose success flag is true are
      # considered; the one with the most recent finished_at wins.
      # Returns a JobLog built from the stored row, or nil when no
      # matching row exists.
      def self.load_latest(job_id)
        with_db_connection do |db|
          attrs = db[:job_logs]
                  .where(job_id: job_id, success: true)
                  .reverse_order(:finished_at)
                  .first
          JobLog.build_from_serialized(attrs) if attrs
        end
      end

      # Yield a connection to the sqlite database at db_filename.
      #
      # Creates the parent directory first when the database file is missing,
      # and creates the :job_logs table when the schema is not present yet.
      def self.with_db_connection
        initialize_db unless db_exists?
        Sequel.connect("sqlite://#{db_filename}") do |db|
          initialize_schema(db) unless schema_exists?(db)
          yield db
        end
      end

      # Whether the database file is already on disk.
      # Uses File.exist? (File.exists? was removed in Ruby 3.2).
      def self.db_exists?
        File.exist?(db_filename)
      end

      # Whether the :job_logs table is present in +db+
      def self.schema_exists?(db)
        db.tables.include?(:job_logs)
      end

      # Path of the job log database, as a String.
      # NOTE(review): presumably Runcom resolves this against the user's
      # data directories and the first entry is the preferred one -- confirm.
      def self.db_filename
        data = Runcom::Data.new "chronicle/etl/job_log.db"
        data.all[0].to_s
      end

      # Make sure the directory that will hold the database file exists.
      # NOTE(review): FileUtils is not required by this file; presumably a
      # dependency loads it -- confirm.
      def self.initialize_db
        FileUtils.mkdir_p(File.dirname(db_filename))
      end

      # Create the :job_logs table in +db+
      def self.initialize_schema(db)
        db.create_table :job_logs do
          primary_key :id
          String :job_id, null: false
          String :last_id
          Time :highest_timestamp
          Integer :num_records_processed
          boolean :success, default: false
          Time :started_at
          Time :finished_at
        end
      end
    end
  end
end
|
@@ -1,25 +1,32 @@
|
|
1
1
|
module Chronicle
|
2
|
-
module
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Loader for an ETL job
|
3
4
|
class Loader
|
4
|
-
extend Chronicle::
|
5
|
-
|
5
|
+
extend Chronicle::ETL::Catalog
|
6
|
+
|
7
|
+
# Construct a new instance of this loader. Options are passed in from a Runner
|
8
|
+
# == Parameters:
|
9
|
+
# options::
|
10
|
+
# Options for configuring this Loader
|
6
11
|
def initialize(options = {})
|
7
12
|
@options = options
|
8
13
|
end
|
9
14
|
|
15
|
+
# Called once before processing records
|
10
16
|
def start; end
|
11
17
|
|
12
|
-
|
13
|
-
|
18
|
+
# Load a single record
|
14
19
|
def load
|
15
20
|
raise NotImplementedError
|
16
21
|
end
|
17
22
|
|
23
|
+
# Called once there are no more records to process
|
18
24
|
def finish; end
|
19
25
|
end
|
20
26
|
end
|
21
27
|
end
|
22
28
|
|
23
29
|
require_relative 'csv_loader'
|
30
|
+
require_relative 'rest_loader'
|
24
31
|
require_relative 'stdout_loader'
|
25
|
-
require_relative 'table_loader'
|
32
|
+
require_relative 'table_loader'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Chronicle
  module ETL
    # Loader that POSTs each record, JSON-encoded, to a REST endpoint.
    #
    # Reads :hostname, :endpoint and :access_token from the options hash
    # that was passed to the constructor.
    class RestLoader < Chronicle::ETL::Loader
      def initialize(options={})
        super(options)
      end

      # Send a single record to the configured endpoint.
      #
      # The target URL is the concatenation of the :hostname and :endpoint
      # options; the :access_token option is sent as a Bearer token.
      # Records that do not already carry a :data key are wrapped as
      # { data: record }; records that do are sent unchanged.
      #
      # Returns the response object from Net::HTTP.
      def load(result)
        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")

        headers = {
          'Authorization' => "Bearer #{@options[:access_token]}",
          'Content-Type' => 'application/json'
        }

        http = Net::HTTP.new(uri.host, uri.port)
        # Without this, an https:// hostname would be spoken to in plain HTTP.
        http.use_ssl = uri.scheme == 'https'

        request = Net::HTTP::Post.new(uri.request_uri, headers)

        # Previously a record that already had :data produced a nil payload
        # (serialized as "null"); now it is posted as-is.
        payload = result[:data] ? result : { data: result }
        request.body = payload.to_json

        http.request(request)
      end
    end
  end
end
|
@@ -1,24 +1,20 @@
|
|
1
1
|
require 'tty/table'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
5
|
-
class TableLoader < Chronicle::
|
4
|
+
module ETL
|
5
|
+
class TableLoader < Chronicle::ETL::Loader
|
6
6
|
def initialize(options)
|
7
7
|
super(options)
|
8
8
|
end
|
9
9
|
|
10
|
-
# defer creating table until we get first result and can determine headers
|
11
|
-
def first_load(result)
|
12
|
-
headers = result.keys
|
13
|
-
@table = TTY::Table.new(header: headers)
|
14
|
-
end
|
15
|
-
|
16
10
|
def load(result)
|
17
|
-
@table
|
11
|
+
@table ||= TTY::Table.new(header: result.keys)
|
12
|
+
values = result.values.map{|x| x.to_s[0..30]}
|
13
|
+
@table << values
|
18
14
|
end
|
19
15
|
|
20
16
|
def finish
|
21
|
-
puts @table.render(:ascii)
|
17
|
+
puts @table.render(:ascii, padding: [0, 1])
|
22
18
|
end
|
23
19
|
end
|
24
20
|
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,64 +1,32 @@
|
|
1
|
-
|
2
|
-
BUILTIN = {
|
3
|
-
extractor: ['stdin', 'json', 'csv', 'file'],
|
4
|
-
transformer: ['null'],
|
5
|
-
loader: ['stdout', 'csv', 'table']
|
6
|
-
}.freeze
|
1
|
+
require 'colorize'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
class Chronicle::ETL::Runner
|
4
|
+
def initialize(job)
|
5
|
+
@job = job
|
6
|
+
@job_logger = Chronicle::ETL::JobLogger.new(@job)
|
12
7
|
end
|
13
8
|
|
14
9
|
def run!
|
15
|
-
|
16
|
-
|
10
|
+
extractor = @job.instantiate_extractor
|
11
|
+
loader = @job.instantiate_loader
|
17
12
|
|
18
|
-
@
|
13
|
+
@job_logger.start
|
14
|
+
loader.start
|
19
15
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@loader.first_load(transformed_data) if count == 0
|
24
|
-
@loader.load(transformed_data)
|
16
|
+
total = extractor.results_count
|
17
|
+
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
25
18
|
|
19
|
+
extractor.extract do |data, metadata|
|
20
|
+
transformer = @job.instantiate_transformer(data)
|
21
|
+
transformed_data = transformer.transform
|
22
|
+
@job_logger.log_transformation(transformer)
|
23
|
+
loader.load(transformed_data)
|
26
24
|
progress_bar.increment
|
27
|
-
count += 1
|
28
|
-
# rescue StandardError => e
|
29
|
-
# require 'pry'
|
30
|
-
# binding.pry
|
31
|
-
# progress_bar.log "Error processing; #{e.inspect}"
|
32
25
|
end
|
33
26
|
|
34
27
|
progress_bar.finish
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
private
|
39
|
-
|
40
|
-
def instantiate_etl_classes
|
41
|
-
@extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
|
42
|
-
@transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
|
43
|
-
@loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
|
44
|
-
end
|
45
|
-
|
46
|
-
def load_etl_class(phase, name)
|
47
|
-
if BUILTIN[phase].include? name
|
48
|
-
klass_name = "Chronicle::Etl::#{name.capitalize}#{phase.to_s.capitalize}"
|
49
|
-
else
|
50
|
-
# TODO: come up with syntax for specifying a particular extractor in a provider library
|
51
|
-
# provider, extractor = name.split(":")
|
52
|
-
provider = name
|
53
|
-
begin
|
54
|
-
require "chronicle/#{provider}"
|
55
|
-
rescue LoadError => e
|
56
|
-
warn("Error loading #{phase} '#{provider}'")
|
57
|
-
warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
|
58
|
-
exit(false)
|
59
|
-
end
|
60
|
-
klass_name = "Chronicle::#{name.capitalize}::ChronicleTransformer"
|
61
|
-
end
|
62
|
-
Object.const_get(klass_name)
|
28
|
+
loader.finish
|
29
|
+
@job_logger.finish
|
30
|
+
@job_logger.save
|
63
31
|
end
|
64
32
|
end
|
@@ -1,15 +1,32 @@
|
|
1
1
|
module Chronicle
|
2
|
-
module
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Transformer for an ETL job
|
3
4
|
class Transformer
|
4
|
-
extend Chronicle::
|
5
|
+
extend Chronicle::ETL::Catalog
|
5
6
|
|
6
|
-
|
7
|
+
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
+
# == Parameters:
|
9
|
+
# options::
|
10
|
+
# Options for configuring this Transformer
|
11
|
+
def initialize(options = {}, data)
|
7
12
|
@options = options
|
13
|
+
@data = data
|
8
14
|
end
|
9
15
|
|
10
|
-
|
16
|
+
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
17
|
+
def transform
|
11
18
|
raise NotImplementedError
|
12
19
|
end
|
20
|
+
|
21
|
+
# The domain or provider-specific id of the record this transformer is working on.
|
22
|
+
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
23
|
+
# data source.
|
24
|
+
def id; end
|
25
|
+
|
26
|
+
# The domain or provider-specific timestamp of the record this transformer is working on.
|
27
|
+
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
28
|
+
# data source.
|
29
|
+
def timestamp; end
|
13
30
|
end
|
14
31
|
end
|
15
32
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -66,6 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.17'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: sequel
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '5.35'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '5.35'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: deep_merge
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.2'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.2'
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
98
|
name: bundler
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +150,48 @@ dependencies:
|
|
122
150
|
- - "~>"
|
123
151
|
- !ruby/object:Gem::Version
|
124
152
|
version: '3.9'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: runcom
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '6.2'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '6.2'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: redcarpet
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '3.5'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '3.5'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: sqlite3
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "~>"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '1.4'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "~>"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '1.4'
|
125
195
|
description: Chronicle-ETL allows you to extract personal data from a variety of services,
|
126
196
|
transform it, and load it.
|
127
197
|
email:
|
@@ -133,9 +203,10 @@ extra_rdoc_files: []
|
|
133
203
|
files:
|
134
204
|
- ".gitignore"
|
135
205
|
- ".rspec"
|
206
|
+
- ".rubocop.yml"
|
136
207
|
- ".ruby-version"
|
137
208
|
- ".travis.yml"
|
138
|
-
-
|
209
|
+
- ".yardopts"
|
139
210
|
- CODE_OF_CONDUCT.md
|
140
211
|
- Gemfile
|
141
212
|
- Gemfile.lock
|
@@ -148,13 +219,23 @@ files:
|
|
148
219
|
- exe/chronicle-etl
|
149
220
|
- lib/chronicle/etl.rb
|
150
221
|
- lib/chronicle/etl/catalog.rb
|
151
|
-
- lib/chronicle/etl/cli.rb
|
222
|
+
- lib/chronicle/etl/cli/connectors.rb
|
223
|
+
- lib/chronicle/etl/cli/jobs.rb
|
224
|
+
- lib/chronicle/etl/cli/main.rb
|
225
|
+
- lib/chronicle/etl/cli/subcommand_base.rb
|
226
|
+
- lib/chronicle/etl/config.rb
|
227
|
+
- lib/chronicle/etl/exceptions.rb
|
152
228
|
- lib/chronicle/etl/extractors/csv_extractor.rb
|
153
229
|
- lib/chronicle/etl/extractors/extractor.rb
|
154
230
|
- lib/chronicle/etl/extractors/file_extractor.rb
|
155
231
|
- lib/chronicle/etl/extractors/stdin_extractor.rb
|
232
|
+
- lib/chronicle/etl/job.rb
|
233
|
+
- lib/chronicle/etl/job_definition.rb
|
234
|
+
- lib/chronicle/etl/job_log.rb
|
235
|
+
- lib/chronicle/etl/job_logger.rb
|
156
236
|
- lib/chronicle/etl/loaders/csv_loader.rb
|
157
237
|
- lib/chronicle/etl/loaders/loader.rb
|
238
|
+
- lib/chronicle/etl/loaders/rest_loader.rb
|
158
239
|
- lib/chronicle/etl/loaders/stdout_loader.rb
|
159
240
|
- lib/chronicle/etl/loaders/table_loader.rb
|
160
241
|
- lib/chronicle/etl/runner.rb
|