chronicle-etl 0.1.3 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.yardopts +1 -0
  4. data/Gemfile.lock +15 -1
  5. data/README.md +62 -11
  6. data/chronicle-etl.gemspec +6 -1
  7. data/exe/chronicle-etl +2 -2
  8. data/lib/chronicle/etl.rb +9 -2
  9. data/lib/chronicle/etl/catalog.rb +68 -18
  10. data/lib/chronicle/etl/cli/connectors.rb +32 -0
  11. data/lib/chronicle/etl/cli/jobs.rb +116 -0
  12. data/lib/chronicle/etl/cli/main.rb +83 -0
  13. data/lib/chronicle/etl/cli/subcommand_base.rb +37 -0
  14. data/lib/chronicle/etl/config.rb +53 -0
  15. data/lib/chronicle/etl/exceptions.rb +17 -0
  16. data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -1
  17. data/lib/chronicle/etl/extractors/extractor.rb +18 -5
  18. data/lib/chronicle/etl/extractors/file_extractor.rb +2 -2
  19. data/lib/chronicle/etl/extractors/stdin_extractor.rb +2 -2
  20. data/lib/chronicle/etl/job.rb +62 -0
  21. data/lib/chronicle/etl/job_definition.rb +51 -0
  22. data/lib/chronicle/etl/job_log.rb +79 -0
  23. data/lib/chronicle/etl/job_logger.rb +76 -0
  24. data/lib/chronicle/etl/loaders/csv_loader.rb +2 -2
  25. data/lib/chronicle/etl/loaders/loader.rb +13 -6
  26. data/lib/chronicle/etl/loaders/rest_loader.rb +30 -0
  27. data/lib/chronicle/etl/loaders/stdout_loader.rb +2 -2
  28. data/lib/chronicle/etl/loaders/table_loader.rb +6 -10
  29. data/lib/chronicle/etl/runner.rb +19 -51
  30. data/lib/chronicle/etl/transformers/json_transformer.rb +2 -2
  31. data/lib/chronicle/etl/transformers/null_transformer.rb +4 -4
  32. data/lib/chronicle/etl/transformers/transformer.rb +21 -4
  33. data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
  34. data/lib/chronicle/etl/version.rb +2 -2
  35. metadata +85 -4
  36. data/CHANGELOG.md +0 -18
  37. data/lib/chronicle/etl/cli.rb +0 -48
@@ -0,0 +1,76 @@
1
+ require 'sequel'
2
+ require 'forwardable'
3
+
4
+ require 'pry'
5
+
6
+ module Chronicle
7
+ module ETL
8
+ # Saves JobLogs to db and loads previous ones
9
+ class JobLogger
10
+ extend Forwardable
11
+
12
+ def_delegators :@job_log, :start, :finish, :log_transformation
13
+
14
+ # Create a new JobLogger
15
+ def initialize(job)
16
+ @job_log = JobLog.new do |job_log|
17
+ job_log.job = job
18
+ end
19
+ end
20
+
21
+ # Save this JobLogger's JobLog to db
22
+ def save
23
+ JobLogger.with_db_connection do |db|
24
+ dataset = db[:job_logs]
25
+ dataset.insert(@job_log.serialize)
26
+ end
27
+ end
28
+
29
+ # For a given `job_id`, return the last successful log
30
+ def self.load_latest(job_id)
31
+ with_db_connection do |db|
32
+ attrs = db[:job_logs].reverse_order(:finished_at).where(success: true).first
33
+ JobLog.build_from_serialized(attrs) if attrs
34
+ end
35
+ end
36
+
37
+ def self.with_db_connection
38
+ initialize_db unless db_exists?
39
+ Sequel.connect("sqlite://#{db_filename}") do |db|
40
+ initialize_schema(db) unless schema_exists?(db)
41
+ yield db
42
+ end
43
+ end
44
+
45
+ def self.db_exists?
46
+ File.exists?(db_filename)
47
+ end
48
+
49
+ def self.schema_exists?(db)
50
+ return db.tables.include? :job_logs
51
+ end
52
+
53
+ def self.db_filename
54
+ data = Runcom::Data.new "chronicle/etl/job_log.db"
55
+ filename = data.all[0].to_s
56
+ end
57
+
58
+ def self.initialize_db
59
+ FileUtils.mkdir_p(File.dirname(db_filename))
60
+ end
61
+
62
+ def self.initialize_schema db
63
+ db.create_table :job_logs do
64
+ primary_key :id
65
+ String :job_id, null: false
66
+ String :last_id
67
+ Time :highest_timestamp
68
+ Integer :num_records_processed
69
+ boolean :success, default: false
70
+ Time :started_at
71
+ Time :finished_at
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -1,8 +1,8 @@
1
1
  require 'csv'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class CsvLoader < Chronicle::Etl::Loader
4
+ module ETL
5
+ class CsvLoader < Chronicle::ETL::Loader
6
6
  def initialize(options={})
7
7
  super(options)
8
8
  @rows = []
@@ -1,25 +1,32 @@
1
1
  module Chronicle
2
- module Etl
2
+ module ETL
3
+ # Abstract class representing a Loader for an ETL job
3
4
  class Loader
4
- extend Chronicle::Etl::Catalog
5
-
5
+ extend Chronicle::ETL::Catalog
6
+
7
+ # Construct a new instance of this loader. Options are passed in from a Runner
8
+ # == Parameters:
9
+ # options::
10
+ # Options for configuring this Loader
6
11
  def initialize(options = {})
7
12
  @options = options
8
13
  end
9
14
 
15
+ # Called once before processing records
10
16
  def start; end
11
17
 
12
- def first_load result; end
13
-
18
+ # Load a single record
14
19
  def load
15
20
  raise NotImplementedError
16
21
  end
17
22
 
23
+ # Called once there are no more records to process
18
24
  def finish; end
19
25
  end
20
26
  end
21
27
  end
22
28
 
23
29
  require_relative 'csv_loader'
30
+ require_relative 'rest_loader'
24
31
  require_relative 'stdout_loader'
25
- require_relative 'table_loader'
32
+ require_relative 'table_loader'
@@ -0,0 +1,30 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+ require 'json'
4
+
5
+ module Chronicle
6
+ module ETL
7
+ class RestLoader < Chronicle::ETL::Loader
8
+ def initialize(options={})
9
+ super(options)
10
+ end
11
+
12
+ def load(result)
13
+ uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
14
+
15
+ header = {
16
+ "Authorization" => "Bearer #{@options[:access_token]}",
17
+ "Content-Type": 'application/json'
18
+ }
19
+
20
+ http = Net::HTTP.new(uri.host, uri.port)
21
+ request = Net::HTTP::Post.new(uri.request_uri, header)
22
+
23
+ obj = {data: result} unless result[:data]
24
+ request.body = obj.to_json
25
+
26
+ response = http.request(request)
27
+ end
28
+ end
29
+ end
30
+ end
@@ -1,6 +1,6 @@
1
1
  module Chronicle
2
- module Etl
3
- class StdoutLoader < Chronicle::Etl::Loader
2
+ module ETL
3
+ class StdoutLoader < Chronicle::ETL::Loader
4
4
  def load(result)
5
5
  puts result.inspect
6
6
  end
@@ -1,24 +1,20 @@
1
1
  require 'tty/table'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class TableLoader < Chronicle::Etl::Loader
4
+ module ETL
5
+ class TableLoader < Chronicle::ETL::Loader
6
6
  def initialize(options)
7
7
  super(options)
8
8
  end
9
9
 
10
- # defer creating table until we get first result and can determine headers
11
- def first_load(result)
12
- headers = result.keys
13
- @table = TTY::Table.new(header: headers)
14
- end
15
-
16
10
  def load(result)
17
- @table << result
11
+ @table ||= TTY::Table.new(header: result.keys)
12
+ values = result.values.map{|x| x.to_s[0..30]}
13
+ @table << values
18
14
  end
19
15
 
20
16
  def finish
21
- puts @table.render(:ascii)
17
+ puts @table.render(:ascii, padding: [0, 1])
22
18
  end
23
19
  end
24
20
  end
@@ -1,64 +1,32 @@
1
- class Chronicle::Etl::Runner
2
- BUILTIN = {
3
- extractor: ['stdin', 'json', 'csv', 'file'],
4
- transformer: ['null'],
5
- loader: ['stdout', 'csv', 'table']
6
- }.freeze
1
+ require 'colorize'
7
2
 
8
- def initialize(options)
9
- @options = options
10
-
11
- instantiate_etl_classes
3
+ class Chronicle::ETL::Runner
4
+ def initialize(job)
5
+ @job = job
6
+ @job_logger = Chronicle::ETL::JobLogger.new(@job)
12
7
  end
13
8
 
14
9
  def run!
15
- progress_bar = Chronicle::Etl::Utils::ProgressBar.new(title: "Running job", total: @extractor.results_count)
16
- count = 0
10
+ extractor = @job.instantiate_extractor
11
+ loader = @job.instantiate_loader
17
12
 
18
- @loader.start
13
+ @job_logger.start
14
+ loader.start
19
15
 
20
- @extractor.extract do |data, metadata|
21
- transformed_data = @transformer.transform(data)
22
-
23
- @loader.first_load(transformed_data) if count == 0
24
- @loader.load(transformed_data)
16
+ total = extractor.results_count
17
+ progress_bar = Chronicle::ETL::Utils::ProgressBar.new(title: 'Running job', total: total)
25
18
 
19
+ extractor.extract do |data, metadata|
20
+ transformer = @job.instantiate_transformer(data)
21
+ transformed_data = transformer.transform
22
+ @job_logger.log_transformation(transformer)
23
+ loader.load(transformed_data)
26
24
  progress_bar.increment
27
- count += 1
28
- # rescue StandardError => e
29
- # require 'pry'
30
- # binding.pry
31
- # progress_bar.log "Error processing; #{e.inspect}"
32
25
  end
33
26
 
34
27
  progress_bar.finish
35
- @loader.finish
36
- end
37
-
38
- private
39
-
40
- def instantiate_etl_classes
41
- @extractor = load_etl_class(:extractor, @options[:extractor][:name]).new(@options[:extractor][:options])
42
- @transformer = load_etl_class(:transformer, @options[:transformer][:name]).new(@options[:transformer][:options])
43
- @loader = load_etl_class(:loader, @options[:loader][:name]).new(@options[:loader][:options])
44
- end
45
-
46
- def load_etl_class(phase, name)
47
- if BUILTIN[phase].include? name
48
- klass_name = "Chronicle::Etl::#{name.capitalize}#{phase.to_s.capitalize}"
49
- else
50
- # TODO: come up with syntax for specifying a particular extractor in a provider library
51
- # provider, extractor = name.split(":")
52
- provider = name
53
- begin
54
- require "chronicle/#{provider}"
55
- rescue LoadError => e
56
- warn("Error loading #{phase} '#{provider}'")
57
- warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{provider}`")
58
- exit(false)
59
- end
60
- klass_name = "Chronicle::#{name.capitalize}::ChronicleTransformer"
61
- end
62
- Object.const_get(klass_name)
28
+ loader.finish
29
+ @job_logger.finish
30
+ @job_logger.save
63
31
  end
64
32
  end
@@ -1,8 +1,8 @@
1
1
  require 'json'
2
2
 
3
3
  module Chronicle
4
- module Etl
5
- class JsonTransformer < Chronicle::Etl::Transformer
4
+ module ETL
5
+ class JsonTransformer < Chronicle::ETL::Transformer
6
6
  def transform data
7
7
  return JSON.parse(data)
8
8
  end
@@ -1,8 +1,8 @@
1
1
  module Chronicle
2
- module Etl
3
- class NullTransformer < Chronicle::Etl::Transformer
4
- def transform data
5
- return data
2
+ module ETL
3
+ class NullTransformer < Chronicle::ETL::Transformer
4
+ def transform
5
+ return @data
6
6
  end
7
7
  end
8
8
 
@@ -1,15 +1,32 @@
1
1
  module Chronicle
2
- module Etl
2
+ module ETL
3
+ # Abstract class representing a Transformer for an ETL job
3
4
  class Transformer
4
- extend Chronicle::Etl::Catalog
5
+ extend Chronicle::ETL::Catalog
5
6
 
6
- def initialize(options = {})
7
+ # Construct a new instance of this transformer. Options are passed in from a Runner
8
+ # == Parameters:
9
+ # options::
10
+ # Options for configuring this Transformer
11
+ def initialize(options = {}, data)
7
12
  @options = options
13
+ @data = data
8
14
  end
9
15
 
10
- def transform data
16
+ # The main entrypoint for transforming a record. Called by a Runner on each extracted record
17
+ def transform
11
18
  raise NotImplementedError
12
19
  end
20
+
21
+ # The domain or provider-specific id of the record this transformer is working on.
22
+ # Used for building a cursor so an extractor doesn't have to start from the beginning of a
23
+ # data source.
24
+ def id; end
25
+
26
+ # The domain or provider-specific timestamp of the record this transformer is working on.
27
+ # Used for building a cursor so an extractor doesn't have to start from the beginning of a
28
+ # data source.
29
+ def timestamp; end
13
30
  end
14
31
  end
15
32
  end
@@ -2,7 +2,7 @@ require 'tty/progressbar'
2
2
  require 'colorize'
3
3
 
4
4
  module Chronicle
5
- module Etl
5
+ module ETL
6
6
  module Utils
7
7
 
8
8
  class ProgressBar
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
- module Etl
3
- VERSION = "0.1.3"
2
+ module ETL
3
+ VERSION = "0.2.3"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-etl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-13 00:00:00.000000000 Z
11
+ date: 2020-09-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -66,6 +66,34 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.17'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sequel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '5.35'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '5.35'
83
+ - !ruby/object:Gem::Dependency
84
+ name: deep_merge
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.2'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.2'
69
97
  - !ruby/object:Gem::Dependency
70
98
  name: bundler
71
99
  requirement: !ruby/object:Gem::Requirement
@@ -122,6 +150,48 @@ dependencies:
122
150
  - - "~>"
123
151
  - !ruby/object:Gem::Version
124
152
  version: '3.9'
153
+ - !ruby/object:Gem::Dependency
154
+ name: runcom
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '6.2'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '6.2'
167
+ - !ruby/object:Gem::Dependency
168
+ name: redcarpet
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '3.5'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '3.5'
181
+ - !ruby/object:Gem::Dependency
182
+ name: sqlite3
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - "~>"
186
+ - !ruby/object:Gem::Version
187
+ version: '1.4'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - "~>"
193
+ - !ruby/object:Gem::Version
194
+ version: '1.4'
125
195
  description: Chronicle-ETL allows you to extract personal data from a variety of services,
126
196
  transformer it, and load it.
127
197
  email:
@@ -133,9 +203,10 @@ extra_rdoc_files: []
133
203
  files:
134
204
  - ".gitignore"
135
205
  - ".rspec"
206
+ - ".rubocop.yml"
136
207
  - ".ruby-version"
137
208
  - ".travis.yml"
138
- - CHANGELOG.md
209
+ - ".yardopts"
139
210
  - CODE_OF_CONDUCT.md
140
211
  - Gemfile
141
212
  - Gemfile.lock
@@ -148,13 +219,23 @@ files:
148
219
  - exe/chronicle-etl
149
220
  - lib/chronicle/etl.rb
150
221
  - lib/chronicle/etl/catalog.rb
151
- - lib/chronicle/etl/cli.rb
222
+ - lib/chronicle/etl/cli/connectors.rb
223
+ - lib/chronicle/etl/cli/jobs.rb
224
+ - lib/chronicle/etl/cli/main.rb
225
+ - lib/chronicle/etl/cli/subcommand_base.rb
226
+ - lib/chronicle/etl/config.rb
227
+ - lib/chronicle/etl/exceptions.rb
152
228
  - lib/chronicle/etl/extractors/csv_extractor.rb
153
229
  - lib/chronicle/etl/extractors/extractor.rb
154
230
  - lib/chronicle/etl/extractors/file_extractor.rb
155
231
  - lib/chronicle/etl/extractors/stdin_extractor.rb
232
+ - lib/chronicle/etl/job.rb
233
+ - lib/chronicle/etl/job_definition.rb
234
+ - lib/chronicle/etl/job_log.rb
235
+ - lib/chronicle/etl/job_logger.rb
156
236
  - lib/chronicle/etl/loaders/csv_loader.rb
157
237
  - lib/chronicle/etl/loaders/loader.rb
238
+ - lib/chronicle/etl/loaders/rest_loader.rb
158
239
  - lib/chronicle/etl/loaders/stdout_loader.rb
159
240
  - lib/chronicle/etl/loaders/table_loader.rb
160
241
  - lib/chronicle/etl/runner.rb