chronicle-etl 0.1.3 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.yardopts +1 -0
- data/Gemfile.lock +15 -1
- data/README.md +62 -11
- data/chronicle-etl.gemspec +6 -1
- data/exe/chronicle-etl +2 -2
- data/lib/chronicle/etl.rb +9 -2
- data/lib/chronicle/etl/catalog.rb +68 -18
- data/lib/chronicle/etl/cli/connectors.rb +32 -0
- data/lib/chronicle/etl/cli/jobs.rb +116 -0
- data/lib/chronicle/etl/cli/main.rb +83 -0
- data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
- data/lib/chronicle/etl/config.rb +53 -0
- data/lib/chronicle/etl/exceptions.rb +17 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
- data/lib/chronicle/etl/extractors/extractor.rb +18 -5
- data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
- data/lib/chronicle/etl/job.rb +62 -0
- data/lib/chronicle/etl/job_definition.rb +51 -0
- data/lib/chronicle/etl/job_log.rb +79 -0
- data/lib/chronicle/etl/job_logger.rb +76 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/loader.rb +13 -6
- data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
- data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
- data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
- data/lib/chronicle/etl/runner.rb +19 -51
- data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
- data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
- data/lib/chronicle/etl/transformers/transformer.rb +21 -4
- data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
- data/lib/chronicle/etl/version.rb +2 -2
- metadata +85 -4
- data/CHANGELOG.md +0 -18
- data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
require 'pry'
|
5
|
+
|
6
|
+
module Chronicle
|
7
|
+
module ETL
|
8
|
+
# Saves JobLogs to db and loads previous ones
|
9
|
+
class JobLogger
|
10
|
+
extend Forwardable
|
11
|
+
|
12
|
+
def_delegators :@job_log, :start, :finish, :log_transformation
|
13
|
+
|
14
|
+
# Create a new JobLogger
|
15
|
+
def initialize(job)
|
16
|
+
@job_log = JobLog.new do |job_log|
|
17
|
+
job_log.job = job
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Save this JobLogger's JobLog to db
|
22
|
+
def save
|
23
|
+
JobLogger.with_db_connection do |db|
|
24
|
+
dataset = db[:job_logs]
|
25
|
+
dataset.insert(@job_log.serialize)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
# For a given `job_id`, return the last successful log
|
30
|
+
def self.load_latest(job_id)
|
31
|
+
with_db_connection do |db|
|
32
|
+
attrs = db[:job_logs].reverse_order(:finished_at).where(success: true).first
|
33
|
+
JobLog.build_from_serialized(attrs) if attrs
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.with_db_connection
|
38
|
+
initialize_db unless db_exists?
|
39
|
+
Sequel.connect("sqlite://#{db_filename}") do |db|
|
40
|
+
initialize_schema(db) unless schema_exists?(db)
|
41
|
+
yield db
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.db_exists?
|
46
|
+
File.exists?(db_filename)
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.schema_exists?(db)
|
50
|
+
return db.tables.include? :job_logs
|
51
|
+
end
|
52
|
+
|
53
|
+
def self.db_filename
|
54
|
+
data = Runcom::Data.new "chronicle/etl/job_log.db"
|
55
|
+
filename = data.all[0].to_s
|
56
|
+
end
|
57
|
+
|
58
|
+
def self.initialize_db
|
59
|
+
FileUtils.mkdir_p(File.dirname(db_filename))
|
60
|
+
end
|
61
|
+
|
62
|
+
def self.initialize_schema db
|
63
|
+
db.create_table :job_logs do
|
64
|
+
primary_key :id
|
65
|
+
String :job_id, null: false
|
66
|
+
String :last_id
|
67
|
+
Time :highest_timestamp
|
68
|
+
Integer :num_records_processed
|
69
|
+
boolean :success, default: false
|
70
|
+
Time :started_at
|
71
|
+
Time :finished_at
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -1,25 +1,32 @@
|
|
1
1
|
module Chronicle
|
2
|
-
module
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Loader for an ETL job
|
3
4
|
class Loader
|
4
|
-
extend Chronicle::
|
5
|
-
|
5
|
+
extend Chronicle::ETL::Catalog
|
6
|
+
|
7
|
+
# Construct a new instance of this loader. Options are passed in from a Runner
|
8
|
+
# == Parameters:
|
9
|
+
# options::
|
10
|
+
# Options for configuring this Loader
|
6
11
|
def initialize(options = {})
|
7
12
|
@options = options
|
8
13
|
end
|
9
14
|
|
15
|
+
# Called once before processing records
|
10
16
|
def start; end
|
11
17
|
|
12
|
-
|
13
|
-
|
18
|
+
# Load a single record
|
14
19
|
def load
|
15
20
|
raise NotImplementedError
|
16
21
|
end
|
17
22
|
|
23
|
+
# Called once there are no more records to process
|
18
24
|
def finish; end
|
19
25
|
end
|
20
26
|
end
|
21
27
|
end
|
22
28
|
|
23
29
|
require_relative 'csv_loader'
|
30
|
+
require_relative 'rest_loader'
|
24
31
|
require_relative 'stdout_loader'
|
25
|
-
require_relative 'table_loader'
|
32
|
+
require_relative 'table_loader'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Chronicle
|
6
|
+
module ETL
|
7
|
+
class RestLoader < Chronicle::ETL::Loader
|
8
|
+
def initialize(options={})
|
9
|
+
super(options)
|
10
|
+
end
|
11
|
+
|
12
|
+
def load(result)
|
13
|
+
uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
|
14
|
+
|
15
|
+
header = {
|
16
|
+
"Authorization" => "Bearer #{@options[:access_token]}",
|
17
|
+
"Content-Type": 'application/json'
|
18
|
+
}
|
19
|
+
|
20
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
21
|
+
request = Net::HTTP::Post.new(uri.request_uri, header)
|
22
|
+
|
23
|
+
obj = {data: result} unless result[:data]
|
24
|
+
request.body = obj.to_json
|
25
|
+
|
26
|
+
response = http.request(request)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,24 +1,20 @@
|
|
1
1
|
require 'tty/table'
|
2
2
|
|
3
3
|
module Chronicle
|
4
|
-
module
|
5
|
-
class TableLoader < Chronicle::
|
4
|
+
module ETL
|
5
|
+
class TableLoader < Chronicle::ETL::Loader
|
6
6
|
def initialize(options)
|
7
7
|
super(options)
|
8
8
|
end
|
9
9
|
|
10
|
-
# defer creating table until we get first result and can determine headers
|
11
|
-
def first_load(result)
|
12
|
-
headers = result.keys
|
13
|
-
@table = TTY::Table.new(header: headers)
|
14
|
-
end
|
15
|
-
|
16
10
|
def load(result)
|
17
|
-
@table
|
11
|
+
@table ||= TTY::Table.new(header: result.keys)
|
12
|
+
values = result.values.map{|x| x.to_s[0..30]}
|
13
|
+
@table << values
|
18
14
|
end
|
19
15
|
|
20
16
|
def finish
|
21
|
-
puts @table.render(:ascii)
|
17
|
+
puts @table.render(:ascii, padding: [0, 1])
|
22
18
|
end
|
23
19
|
end
|
24
20
|
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -1,64 +1,32 @@
|
|
1
|
-
|
2
|
-
BUILTIN = {
|
3
|
-
extractor: ['stdin', 'json', 'csv', 'file'],
|
4
|
-
transformer: ['null'],
|
5
|
-
loader: ['stdout', 'csv', 'table']
|
6
|
-
}.freeze
|
1
|
+
require 'colorize'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
class Chronicle::ETL::Runner
|
4
|
+
def initialize(job)
|
5
|
+
@job = job
|
6
|
+
@job_logger = Chronicle::ETL::JobLogger.new(@job)
|
12
7
|
end
|
13
8
|
|
14
9
|
def run!
|
15
|
-
|
16
|
-
|
10
|
+
extractor = @job.instantiate_extractor
|
11
|
+
loader = @job.instantiate_loader
|
17
12
|
|
18
|
-
@
|
13
|
+
@job_logger.start
|
14
|
+
loader.start
|
19
15
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@loader.first_load(transformed_data) if count == 0
|
24
|
-
@loader.load(transformed_data)
|
16
|
+
total = extractor.results_count
|
17
|
+
progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
|
25
18
|
|
19
|
+
extractor.extract do |data, metadata|
|
20
|
+
transformer = @job.instantiate_transformer(data)
|
21
|
+
transformed_data = transformer.transform
|
22
|
+
@job_logger.log_transformation(transformer)
|
23
|
+
loader.load(transformed_data)
|
26
24
|
progress_bar.increment
|
27
|
-
count += 1
|
28
|
-
# rescue StandardError => e
|
29
|
-
# require 'pry'
|
30
|
-
# binding.pry
|
31
|
-
# progress_bar.log "Error processing; #{e.inspect}"
|
32
25
|
end
|
33
26
|
|
34
27
|
progress_bar.finish
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
private
|
39
|
-
|
40
|
-
def instantiate_etl_classes
|
41
|
-
@extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
|
42
|
-
@transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
|
43
|
-
@loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
|
44
|
-
end
|
45
|
-
|
46
|
-
def load_etl_class(phase, name)
|
47
|
-
if BUILTIN[phase].include? name
|
48
|
-
klass_name = "Chronicle::Etl::#{name.capitalize}#{phase.to_s.capitalize}"
|
49
|
-
else
|
50
|
-
# TODO: come up with syntax for specifying a particular extractor in a provider library
|
51
|
-
# provider, extractor = name.split(":")
|
52
|
-
provider = name
|
53
|
-
begin
|
54
|
-
require "chronicle/#{provider}"
|
55
|
-
rescue LoadError => e
|
56
|
-
warn("Error loading #{phase} '#{provider}'")
|
57
|
-
warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
|
58
|
-
exit(false)
|
59
|
-
end
|
60
|
-
klass_name = "Chronicle::#{name.capitalize}::ChronicleTransformer"
|
61
|
-
end
|
62
|
-
Object.const_get(klass_name)
|
28
|
+
loader.finish
|
29
|
+
@job_logger.finish
|
30
|
+
@job_logger.save
|
63
31
|
end
|
64
32
|
end
|
@@ -1,15 +1,32 @@
|
|
1
1
|
module Chronicle
|
2
|
-
module
|
2
|
+
module ETL
|
3
|
+
# Abstract class representing a Transformer for an ETL job
|
3
4
|
class Transformer
|
4
|
-
extend Chronicle::
|
5
|
+
extend Chronicle::ETL::Catalog
|
5
6
|
|
6
|
-
|
7
|
+
# Construct a new instance of this transformer. Options are passed in from a Runner
|
8
|
+
# == Parameters:
|
9
|
+
# options::
|
10
|
+
# Options for configuring this Transformer
|
11
|
+
def initialize(options = {}, data)
|
7
12
|
@options = options
|
13
|
+
@data = data
|
8
14
|
end
|
9
15
|
|
10
|
-
|
16
|
+
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
17
|
+
def transform
|
11
18
|
raise NotImplementedError
|
12
19
|
end
|
20
|
+
|
21
|
+
# The domain or provider-specific id of the record this transformer is working on.
|
22
|
+
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
23
|
+
# data source.
|
24
|
+
def id; end
|
25
|
+
|
26
|
+
# The domain or provider-specific timestamp of the record this transformer is working on.
|
27
|
+
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
28
|
+
# data source.
|
29
|
+
def timestamp; end
|
13
30
|
end
|
14
31
|
end
|
15
32
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -66,6 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.17'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: sequel
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '5.35'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '5.35'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: deep_merge
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.2'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.2'
|
69
97
|
- !ruby/object:Gem::Dependency
|
70
98
|
name: bundler
|
71
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +150,48 @@ dependencies:
|
|
122
150
|
- - "~>"
|
123
151
|
- !ruby/object:Gem::Version
|
124
152
|
version: '3.9'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: runcom
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '6.2'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '6.2'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: redcarpet
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - "~>"
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '3.5'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - "~>"
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '3.5'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: sqlite3
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "~>"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '1.4'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "~>"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '1.4'
|
125
195
|
description: Chronicle-ETL allows you to extract personal data from a variety of services,
|
126
196
|
transform it, and load it.
|
127
197
|
email:
|
@@ -133,9 +203,10 @@ extra_rdoc_files: []
|
|
133
203
|
files:
|
134
204
|
- ".gitignore"
|
135
205
|
- ".rspec"
|
206
|
+
- ".rubocop.yml"
|
136
207
|
- ".ruby-version"
|
137
208
|
- ".travis.yml"
|
138
|
-
-
|
209
|
+
- ".yardopts"
|
139
210
|
- CODE_OF_CONDUCT.md
|
140
211
|
- Gemfile
|
141
212
|
- Gemfile.lock
|
@@ -148,13 +219,23 @@ files:
|
|
148
219
|
- exe/chronicle-etl
|
149
220
|
- lib/chronicle/etl.rb
|
150
221
|
- lib/chronicle/etl/catalog.rb
|
151
|
-
- lib/chronicle/etl/cli.rb
|
222
|
+
- lib/chronicle/etl/cli/connectors.rb
|
223
|
+
- lib/chronicle/etl/cli/jobs.rb
|
224
|
+
- lib/chronicle/etl/cli/main.rb
|
225
|
+
- lib/chronicle/etl/cli/subcommand_base.rb
|
226
|
+
- lib/chronicle/etl/config.rb
|
227
|
+
- lib/chronicle/etl/exceptions.rb
|
152
228
|
- lib/chronicle/etl/extractors/csv_extractor.rb
|
153
229
|
- lib/chronicle/etl/extractors/extractor.rb
|
154
230
|
- lib/chronicle/etl/extractors/file_extractor.rb
|
155
231
|
- lib/chronicle/etl/extractors/stdin_extractor.rb
|
232
|
+
- lib/chronicle/etl/job.rb
|
233
|
+
- lib/chronicle/etl/job_definition.rb
|
234
|
+
- lib/chronicle/etl/job_log.rb
|
235
|
+
- lib/chronicle/etl/job_logger.rb
|
156
236
|
- lib/chronicle/etl/loaders/csv_loader.rb
|
157
237
|
- lib/chronicle/etl/loaders/loader.rb
|
238
|
+
- lib/chronicle/etl/loaders/rest_loader.rb
|
158
239
|
- lib/chronicle/etl/loaders/stdout_loader.rb
|
159
240
|
- lib/chronicle/etl/loaders/table_loader.rb
|
160
241
|
- lib/chronicle/etl/runner.rb
|