drudgery 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +68 -8
- data/lib/drudgery/extractors/active_record_extractor.rb +12 -1
- data/lib/drudgery/extractors/csv_extractor.rb +25 -5
- data/lib/drudgery/extractors/sqlite3_extractor.rb +31 -1
- data/lib/drudgery/job.rb +45 -9
- data/lib/drudgery/job_logger.rb +21 -0
- data/lib/drudgery/job_progress.rb +11 -0
- data/lib/drudgery/loaders/active_record_import_loader.rb +3 -0
- data/lib/drudgery/loaders/active_record_loader.rb +3 -0
- data/lib/drudgery/loaders/csv_loader.rb +4 -2
- data/lib/drudgery/loaders/sqlite3_loader.rb +17 -0
- data/lib/drudgery/version.rb +1 -1
- data/lib/drudgery.rb +16 -0
- data/spec/drudgery/extractors/active_record_extractor_spec.rb +49 -25
- data/spec/drudgery/extractors/csv_extractor_spec.rb +40 -18
- data/spec/drudgery/extractors/sqlite3_extractor_spec.rb +77 -32
- data/spec/drudgery/job_logger_spec.rb +59 -0
- data/spec/drudgery/job_progress_spec.rb +19 -0
- data/spec/drudgery/job_spec.rb +163 -35
- data/spec/drudgery/loaders/active_record_import_loader_spec.rb +24 -13
- data/spec/drudgery/loaders/active_record_loader_spec.rb +26 -15
- data/spec/drudgery/loaders/csv_loader_spec.rb +15 -8
- data/spec/drudgery/loaders/sqlite3_loader_spec.rb +43 -11
- data/spec/drudgery/manager_spec.rb +4 -8
- data/spec/drudgery_spec.rb +30 -2
- data/spec/spec_helper.rb +3 -0
- metadata +47 -19
data/README.md
CHANGED
@@ -108,24 +108,66 @@ end
|
|
108
108
|
m.run
|
109
109
|
```
|
110
110
|
|
111
|
+
Logging
|
112
|
+
-------
|
113
|
+
|
114
|
+
Provide Drudgery with a logger and info will be logged about each job.
|
115
|
+
|
116
|
+
When log level is `INFO` expect to see basic output for each job (e.g.
|
117
|
+
when it starts and completes).
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
logger = Logger.new('log/etl.log')
|
121
|
+
logger.level = Logger::INFO # Logger defaults to log level DEBUG
|
122
|
+
|
123
|
+
Drudgery.logger = logger
|
124
|
+
```
|
125
|
+
|
126
|
+
When log level is `DEBUG` expect to see output for each record
|
127
|
+
extracted, transformed and loaded (VERY NOISY).
|
128
|
+
|
129
|
+
Progress
|
130
|
+
--------
|
131
|
+
|
132
|
+
Drudgery also provides progress output to STDERR courtesy of the
|
133
|
+
`progressbar` gem. Progress output is on by default, but can be
|
134
|
+
disabled with the following:
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
Drudgery.show_progress = false
|
138
|
+
```
|
139
|
+
|
111
140
|
Extractors
|
112
141
|
----------
|
113
142
|
|
114
143
|
The following extractors are provided: `:csv`, `:sqlite3`, `:active_record`
|
115
144
|
|
116
|
-
You can use your own extractors if you would like. They need
|
117
|
-
implement
|
145
|
+
You can use your own extractors if you would like. They need to
|
146
|
+
implement the following methods:
|
147
|
+
|
148
|
+
* `#name` - returns the extractor's name
|
149
|
+
* `#record_count` - returns count of records in source
|
150
|
+
* `#extract` - must yield each record and record index
|
118
151
|
|
119
152
|
```ruby
|
120
153
|
class ArrayExtractor
|
154
|
+
attr_reader :name
|
155
|
+
|
121
156
|
def initialize(source)
|
122
157
|
@source = source
|
158
|
+
@name = 'array'
|
123
159
|
end
|
124
160
|
|
125
161
|
def extract
|
162
|
+
index = 0
|
126
163
|
@source.each do |record|
|
127
|
-
yield record
|
128
|
-
|
164
|
+
yield [record, index]
|
165
|
+
index += 1
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def record_count
|
170
|
+
@source.size
|
129
171
|
end
|
130
172
|
end
|
131
173
|
|
@@ -146,15 +188,24 @@ namespace:
|
|
146
188
|
module Drudgery
|
147
189
|
module Extractors
|
148
190
|
class ArrayExtractor
|
191
|
+
attr_reader :name
|
192
|
+
|
149
193
|
def initialize(source)
|
150
194
|
@source = source
|
195
|
+
@name = 'array'
|
151
196
|
end
|
152
197
|
|
153
198
|
def extract
|
199
|
+
index = 0
|
154
200
|
@source.each do |record|
|
155
|
-
yield record
|
201
|
+
yield [record, index]
|
202
|
+
index += 1
|
156
203
|
end
|
157
204
|
end
|
205
|
+
|
206
|
+
def record_count
|
207
|
+
@source.size
|
208
|
+
end
|
158
209
|
end
|
159
210
|
end
|
160
211
|
end
|
@@ -219,14 +270,20 @@ The following loaders are provided:
|
|
219
270
|
* `:active_record`
|
220
271
|
* `:active_record_import`
|
221
272
|
|
222
|
-
You can use your own loaders if you would like. They need
|
223
|
-
|
224
|
-
|
273
|
+
You can use your own loaders if you would like. They need to implement
|
274
|
+
the following methods:
|
275
|
+
|
276
|
+
* `#name` - returns the loader's name
|
277
|
+
* `#load` - accepts an array of records and then writes them to the
|
278
|
+
destination
|
225
279
|
|
226
280
|
```ruby
|
227
281
|
class ArrayLoader
|
282
|
+
attr_reader :name
|
283
|
+
|
228
284
|
def initialize(destination)
|
229
285
|
@destination = destination
|
286
|
+
@name = 'array'
|
230
287
|
end
|
231
288
|
|
232
289
|
def load(records)
|
@@ -251,8 +308,11 @@ namespace:
|
|
251
308
|
module Drudgery
|
252
309
|
module Loaders
|
253
310
|
class ArrayLoader
|
311
|
+
attr_reader :name
|
312
|
+
|
254
313
|
def initialize(destination)
|
255
314
|
@destination = destination
|
315
|
+
@name = 'array'
|
256
316
|
end
|
257
317
|
|
258
318
|
def load(records)
|
@@ -1,15 +1,26 @@
|
|
1
1
|
module Drudgery
|
2
2
|
module Extractors
|
3
3
|
class ActiveRecordExtractor
|
4
|
+
attr_reader :name
|
5
|
+
|
4
6
|
def initialize(model)
|
5
7
|
@model = model
|
8
|
+
@name = "active_record:#{@model.name}"
|
6
9
|
end
|
7
10
|
|
8
11
|
def extract
|
12
|
+
index = 0
|
13
|
+
|
9
14
|
@model.find_each do |record|
|
10
|
-
yield record.attributes
|
15
|
+
yield [record.attributes, index]
|
16
|
+
|
17
|
+
index += 1
|
11
18
|
end
|
12
19
|
end
|
20
|
+
|
21
|
+
def record_count
|
22
|
+
@record_count ||= @model.count
|
23
|
+
end
|
13
24
|
end
|
14
25
|
end
|
15
26
|
end
|
@@ -1,18 +1,38 @@
|
|
1
|
-
require 'csv'
|
2
|
-
|
3
1
|
module Drudgery
|
4
2
|
module Extractors
|
5
3
|
class CSVExtractor
|
4
|
+
attr_reader :name
|
5
|
+
|
6
6
|
def initialize(filepath, options={})
|
7
7
|
@filepath = filepath
|
8
|
-
@options = { :headers => true }
|
9
|
-
|
8
|
+
@options = { :headers => true }.merge(options)
|
9
|
+
|
10
|
+
@name = "csv:#{File.basename(@filepath)}"
|
10
11
|
end
|
11
12
|
|
12
13
|
def extract
|
14
|
+
index = 0
|
15
|
+
|
13
16
|
CSV.foreach(@filepath, @options) do |row|
|
14
|
-
yield row.to_hash
|
17
|
+
yield [row.to_hash, index]
|
18
|
+
|
19
|
+
index += 1
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def record_count
|
24
|
+
@record_count ||= calculate_record_count
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
def calculate_record_count
|
29
|
+
record_count = 0
|
30
|
+
|
31
|
+
extract do |data, index|
|
32
|
+
record_count += 1
|
15
33
|
end
|
34
|
+
|
35
|
+
record_count
|
16
36
|
end
|
17
37
|
end
|
18
38
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Drudgery
|
2
2
|
module Extractors
|
3
3
|
class SQLite3Extractor
|
4
|
+
attr_reader :name
|
5
|
+
|
4
6
|
def initialize(db, table)
|
5
7
|
@db = db
|
6
8
|
@db.results_as_hash = true
|
@@ -8,6 +10,8 @@ module Drudgery
|
|
8
10
|
|
9
11
|
@table = table
|
10
12
|
@clauses = {}
|
13
|
+
|
14
|
+
@name = "sqlite3:#{main_db_name}.#{@table}"
|
11
15
|
end
|
12
16
|
|
13
17
|
def select(*expressions)
|
@@ -39,12 +43,20 @@ module Drudgery
|
|
39
43
|
end
|
40
44
|
|
41
45
|
def extract
|
46
|
+
index = 0
|
47
|
+
|
42
48
|
@db.execute(sql) do |row|
|
43
49
|
row.reject! { |key, value| key.kind_of?(Integer) }
|
44
|
-
yield row
|
50
|
+
yield [row, index]
|
51
|
+
|
52
|
+
index += 1
|
45
53
|
end
|
46
54
|
end
|
47
55
|
|
56
|
+
def record_count
|
57
|
+
@record_count ||= @db.get_first_value(count_sql)
|
58
|
+
end
|
59
|
+
|
48
60
|
private
|
49
61
|
def sql
|
50
62
|
clauses = [
|
@@ -63,6 +75,24 @@ module Drudgery
|
|
63
75
|
|
64
76
|
clauses.join(' ')
|
65
77
|
end
|
78
|
+
|
79
|
+
def count_sql
|
80
|
+
if @clauses.empty?
|
81
|
+
"SELECT COUNT(*) FROM #{@table}"
|
82
|
+
else
|
83
|
+
"SELECT COUNT(*) FROM (#{sql})"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def main_db_name
|
88
|
+
main = @db.database_list.detect { |list| list['name'] == 'main' }
|
89
|
+
|
90
|
+
if main['file'].empty?
|
91
|
+
'memory'
|
92
|
+
else
|
93
|
+
File.basename(main['file']).split('.').first
|
94
|
+
end
|
95
|
+
end
|
66
96
|
end
|
67
97
|
end
|
68
98
|
end
|
data/lib/drudgery/job.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Drudgery
|
2
2
|
class Job
|
3
|
+
attr_reader :id
|
4
|
+
|
3
5
|
def initialize(options={})
|
6
|
+
@id = Time.now.nsec
|
4
7
|
@extractor = options[:extractor]
|
5
8
|
@loader = options[:loader]
|
6
9
|
@transformer = options[:transformer]
|
@@ -9,6 +12,10 @@ module Drudgery
|
|
9
12
|
@records = []
|
10
13
|
end
|
11
14
|
|
15
|
+
def name
|
16
|
+
"#{@extractor.name} => #{@loader.name}"
|
17
|
+
end
|
18
|
+
|
12
19
|
def batch_size(size)
|
13
20
|
@batch_size = size
|
14
21
|
end
|
@@ -44,29 +51,50 @@ module Drudgery
|
|
44
51
|
end
|
45
52
|
|
46
53
|
def perform
|
47
|
-
|
48
|
-
|
54
|
+
logger.log_with_progress :info, name
|
55
|
+
|
56
|
+
elapsed = Benchmark.realtime do
|
57
|
+
extract_records do |record|
|
58
|
+
@records << record
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
if @records.size == @batch_size
|
61
|
+
load_records
|
62
|
+
end
|
63
|
+
|
64
|
+
progress.inc if Drudgery.show_progress
|
52
65
|
end
|
66
|
+
|
67
|
+
load_records
|
68
|
+
|
69
|
+
progress.finish if Drudgery.show_progress
|
53
70
|
end
|
54
71
|
|
55
|
-
|
72
|
+
logger.log_with_progress :info, "Completed in #{"%.2f" % elapsed}s\n\n"
|
56
73
|
end
|
57
74
|
|
58
75
|
private
|
59
76
|
def extract_records
|
60
|
-
@extractor.extract do |data|
|
77
|
+
@extractor.extract do |data, index|
|
78
|
+
logger.log :debug, "Extracting Record -- Index: #{index}"
|
79
|
+
logger.log :debug, data.inspect
|
80
|
+
|
61
81
|
record = transform_data(data)
|
62
|
-
|
82
|
+
logger.log :debug, "Transforming Record -- Index: #{index}"
|
83
|
+
logger.log :debug, data.inspect
|
63
84
|
|
64
|
-
|
85
|
+
if record.nil?
|
86
|
+
next
|
87
|
+
else
|
88
|
+
yield record
|
89
|
+
end
|
65
90
|
end
|
66
91
|
end
|
67
92
|
|
68
93
|
def load_records
|
69
|
-
|
94
|
+
logger.log :debug, "Loading Records -- Count: #{@records.size}"
|
95
|
+
logger.log :debug, @records.inspect
|
96
|
+
|
97
|
+
@loader.load(@records) unless @records.empty?
|
70
98
|
@records.clear
|
71
99
|
end
|
72
100
|
|
@@ -77,5 +105,13 @@ module Drudgery
|
|
77
105
|
data
|
78
106
|
end
|
79
107
|
end
|
108
|
+
|
109
|
+
def progress
|
110
|
+
@progress ||= Drudgery::JobProgress.new(id, @extractor.record_count)
|
111
|
+
end
|
112
|
+
|
113
|
+
def logger
|
114
|
+
@logger ||= Drudgery::JobLogger.new(id)
|
115
|
+
end
|
80
116
|
end
|
81
117
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Drudgery
|
2
|
+
class JobLogger
|
3
|
+
def initialize(job_id)
|
4
|
+
@prefix = "## JOB #{job_id}"
|
5
|
+
end
|
6
|
+
|
7
|
+
def log_with_progress(mode, message)
|
8
|
+
STDERR.puts format_message(message) if Drudgery.show_progress
|
9
|
+
log(mode, message)
|
10
|
+
end
|
11
|
+
|
12
|
+
def log(mode, message)
|
13
|
+
Drudgery.log mode, format_message(message)
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
def format_message(message)
|
18
|
+
"#{@prefix}: #{message}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -1,13 +1,15 @@
|
|
1
|
-
require 'csv'
|
2
|
-
|
3
1
|
module Drudgery
|
4
2
|
module Loaders
|
5
3
|
class CSVLoader
|
4
|
+
attr_reader :name
|
5
|
+
|
6
6
|
def initialize(filepath, options={})
|
7
7
|
@filepath = filepath
|
8
8
|
@options = options
|
9
9
|
|
10
10
|
@write_headers = true
|
11
|
+
|
12
|
+
@name = "csv:#{File.basename(@filepath)}"
|
11
13
|
end
|
12
14
|
|
13
15
|
def load(records)
|
@@ -1,9 +1,16 @@
|
|
1
1
|
module Drudgery
|
2
2
|
module Loaders
|
3
3
|
class SQLite3Loader
|
4
|
+
attr_reader :name
|
5
|
+
|
4
6
|
def initialize(db, table)
|
5
7
|
@db = db
|
8
|
+
@db.results_as_hash = true
|
9
|
+
@db.type_translation = true
|
10
|
+
|
6
11
|
@table = table
|
12
|
+
|
13
|
+
@name = "sqlite3:#{main_db_name}.#{@table}"
|
7
14
|
end
|
8
15
|
|
9
16
|
def load(records)
|
@@ -20,6 +27,16 @@ module Drudgery
|
|
20
27
|
def sql(columns)
|
21
28
|
"INSERT INTO #{@table} (#{columns.map { |column| column }.join(', ')}) VALUES (#{columns.map { |column| '?' }.join(', ')})"
|
22
29
|
end
|
30
|
+
|
31
|
+
def main_db_name
|
32
|
+
main = @db.database_list.detect { |list| list['name'] == 'main' }
|
33
|
+
|
34
|
+
if main['file'].empty?
|
35
|
+
'memory'
|
36
|
+
else
|
37
|
+
File.basename(main['file']).split('.').first
|
38
|
+
end
|
39
|
+
end
|
23
40
|
end
|
24
41
|
end
|
25
42
|
end
|
data/lib/drudgery/version.rb
CHANGED
data/lib/drudgery.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require 'csv'
|
3
|
+
require 'progressbar'
|
4
|
+
|
1
5
|
require 'drudgery/version'
|
6
|
+
require 'drudgery/job_progress'
|
7
|
+
require 'drudgery/job_logger'
|
2
8
|
require 'drudgery/manager'
|
3
9
|
require 'drudgery/job'
|
4
10
|
require 'drudgery/transformer'
|
@@ -13,6 +19,14 @@ require 'drudgery/loaders/csv_loader'
|
|
13
19
|
require 'drudgery/loaders/sqlite3_loader'
|
14
20
|
|
15
21
|
module Drudgery
|
22
|
+
class << self
|
23
|
+
attr_accessor :logger, :show_progress
|
24
|
+
|
25
|
+
def log(mode, message)
|
26
|
+
logger.send(mode, message) if logger
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
16
30
|
module Extractors
|
17
31
|
def self.instantiate(type, *args)
|
18
32
|
case type
|
@@ -43,3 +57,5 @@ module Drudgery
|
|
43
57
|
end
|
44
58
|
end
|
45
59
|
end
|
60
|
+
|
61
|
+
Drudgery.show_progress = true
|
@@ -4,68 +4,83 @@ require 'active_record'
|
|
4
4
|
describe Drudgery::Extractors::ActiveRecordExtractor do
|
5
5
|
class Record < ActiveRecord::Base; end
|
6
6
|
|
7
|
+
def mock_model
|
8
|
+
stub('model', :name => 'Record')
|
9
|
+
end
|
10
|
+
|
7
11
|
describe '#initialize' do
|
8
12
|
it 'sets model to provided argument' do
|
9
|
-
model =
|
13
|
+
model = mock_model
|
10
14
|
|
11
15
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
|
12
16
|
extractor.instance_variable_get('@model').must_equal model
|
13
17
|
end
|
18
|
+
|
19
|
+
it 'sets name to active_record:<model name>' do
|
20
|
+
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(mock_model)
|
21
|
+
extractor.name.must_equal 'active_record:Record'
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
describe '#extract' do
|
17
26
|
it 'finds records using model' do
|
18
|
-
model =
|
27
|
+
model = mock_model
|
19
28
|
model.expects(:find_each)
|
20
29
|
|
21
30
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
|
22
31
|
extractor.extract
|
23
32
|
end
|
24
33
|
|
25
|
-
it 'yields each record
|
26
|
-
record1 = mock
|
27
|
-
|
28
|
-
|
29
|
-
record2 = mock
|
30
|
-
record2.expects(:attributes).returns({ :b => 2 })
|
34
|
+
it 'yields each record hash and index' do
|
35
|
+
record1 = mock('record1', :attributes => { :a => 1 })
|
36
|
+
record2 = mock('record2', :attributes => { :b => 2 })
|
31
37
|
|
32
|
-
model =
|
38
|
+
model = mock_model
|
33
39
|
model.stubs(:find_each).multiple_yields([record1], [record2])
|
34
40
|
|
35
41
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
|
36
42
|
|
37
43
|
records = []
|
38
|
-
|
44
|
+
indexes = []
|
45
|
+
extractor.extract do |record, index|
|
39
46
|
records << record
|
47
|
+
indexes << index
|
40
48
|
end
|
41
49
|
|
42
50
|
records[0].must_equal({ :a => 1 })
|
43
51
|
records[1].must_equal({ :b => 2 })
|
52
|
+
|
53
|
+
indexes.must_equal [0, 1]
|
44
54
|
end
|
45
55
|
|
46
|
-
|
47
|
-
before(:each) do
|
48
|
-
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
|
49
|
-
ActiveRecord::Base.connection.create_table(:records) do |t|
|
50
|
-
t.integer :a
|
51
|
-
t.integer :b
|
52
|
-
end
|
56
|
+
end
|
53
57
|
|
54
|
-
|
55
|
-
|
56
|
-
|
58
|
+
describe 'without stubs' do
|
59
|
+
before(:each) do
|
60
|
+
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
|
61
|
+
ActiveRecord::Base.connection.create_table(:records) do |t|
|
62
|
+
t.integer :a
|
63
|
+
t.integer :b
|
57
64
|
end
|
58
65
|
|
59
|
-
|
60
|
-
|
61
|
-
|
66
|
+
Record.create!({ :a => 1, :b => 2 })
|
67
|
+
Record.create!({ :a => 3, :b => 4 })
|
68
|
+
Record.create!({ :a => 5, :b => 6 })
|
69
|
+
end
|
70
|
+
|
71
|
+
after(:each) do
|
72
|
+
ActiveRecord::Base.clear_active_connections!
|
73
|
+
end
|
62
74
|
|
63
|
-
|
75
|
+
describe '#extract' do
|
76
|
+
it 'yields each record hash and index' do
|
64
77
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
|
65
78
|
|
66
79
|
records = []
|
67
|
-
|
80
|
+
indexes = []
|
81
|
+
extractor.extract do |record, index|
|
68
82
|
records << record
|
83
|
+
indexes << index
|
69
84
|
end
|
70
85
|
|
71
86
|
records.must_equal([
|
@@ -73,6 +88,15 @@ describe Drudgery::Extractors::ActiveRecordExtractor do
|
|
73
88
|
{ 'id' => 2, 'a' => 3, 'b' => 4 },
|
74
89
|
{ 'id' => 3, 'a' => 5, 'b' => 6 }
|
75
90
|
])
|
91
|
+
|
92
|
+
indexes.must_equal [0, 1, 2]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe '#record_count' do
|
97
|
+
it 'returns model count' do
|
98
|
+
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
|
99
|
+
extractor.record_count.must_equal 3
|
76
100
|
end
|
77
101
|
end
|
78
102
|
end
|