drudgery 0.0.3 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +68 -8
- data/lib/drudgery/extractors/active_record_extractor.rb +12 -1
- data/lib/drudgery/extractors/csv_extractor.rb +25 -5
- data/lib/drudgery/extractors/sqlite3_extractor.rb +31 -1
- data/lib/drudgery/job.rb +45 -9
- data/lib/drudgery/job_logger.rb +21 -0
- data/lib/drudgery/job_progress.rb +11 -0
- data/lib/drudgery/loaders/active_record_import_loader.rb +3 -0
- data/lib/drudgery/loaders/active_record_loader.rb +3 -0
- data/lib/drudgery/loaders/csv_loader.rb +4 -2
- data/lib/drudgery/loaders/sqlite3_loader.rb +17 -0
- data/lib/drudgery/version.rb +1 -1
- data/lib/drudgery.rb +16 -0
- data/spec/drudgery/extractors/active_record_extractor_spec.rb +49 -25
- data/spec/drudgery/extractors/csv_extractor_spec.rb +40 -18
- data/spec/drudgery/extractors/sqlite3_extractor_spec.rb +77 -32
- data/spec/drudgery/job_logger_spec.rb +59 -0
- data/spec/drudgery/job_progress_spec.rb +19 -0
- data/spec/drudgery/job_spec.rb +163 -35
- data/spec/drudgery/loaders/active_record_import_loader_spec.rb +24 -13
- data/spec/drudgery/loaders/active_record_loader_spec.rb +26 -15
- data/spec/drudgery/loaders/csv_loader_spec.rb +15 -8
- data/spec/drudgery/loaders/sqlite3_loader_spec.rb +43 -11
- data/spec/drudgery/manager_spec.rb +4 -8
- data/spec/drudgery_spec.rb +30 -2
- data/spec/spec_helper.rb +3 -0
- metadata +47 -19
data/README.md
CHANGED
@@ -108,24 +108,66 @@ end
|
|
108
108
|
m.run
|
109
109
|
```
|
110
110
|
|
111
|
+
Logging
|
112
|
+
-------
|
113
|
+
|
114
|
+
Provide Drudgery with a logger and info will be logged about each job.
|
115
|
+
|
116
|
+
When log level is `INFO` expect to see basic output for each job (e.g.
|
117
|
+
when it starts and completes).
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
logger = Logger.new('log/etl.log')
|
121
|
+
logger.level = Logger::INFO # Logger defaults to log level DEBUG
|
122
|
+
|
123
|
+
Drudgery.logger = logger
|
124
|
+
```
|
125
|
+
|
126
|
+
When log level is `DEBUG` expect to see output for each record
|
127
|
+
extracted, transformed and loaded (VERY NOISY).
|
128
|
+
|
129
|
+
Progress
|
130
|
+
--------
|
131
|
+
|
132
|
+
Drudgery also provides progress output to STDERR courtesy of the
|
133
|
+
`progressbar` gem. Progress output is on by default, but can be
|
134
|
+
disabled with the following:
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
Drudgery.show_progress = false
|
138
|
+
```
|
139
|
+
|
111
140
|
Extractors
|
112
141
|
----------
|
113
142
|
|
114
143
|
The following extractors are provided: `:csv`, `:sqlite3`, `:active_record`
|
115
144
|
|
116
|
-
You can use your own extractors if you would like. They need
|
117
|
-
implement
|
145
|
+
You can use your own extractors if you would like. They need to
|
146
|
+
implement the following methods:
|
147
|
+
|
148
|
+
* `#name` - returns extractor's name
|
149
|
+
* `#record_count` - returns count of records in source
|
150
|
+
* `#extract` - must yield each record and record index
|
118
151
|
|
119
152
|
```ruby
|
120
153
|
class ArrayExtractor
|
154
|
+
attr_reader :name
|
155
|
+
|
121
156
|
def initialize(source)
|
122
157
|
@source = source
|
158
|
+
@name = 'array'
|
123
159
|
end
|
124
160
|
|
125
161
|
def extract
|
162
|
+
index = 0
|
126
163
|
@source.each do |record|
|
127
|
-
yield record
|
128
|
-
|
164
|
+
yield [record, index]
|
165
|
+
index += 1
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def record_count
|
170
|
+
@source.size
|
129
171
|
end
|
130
172
|
end
|
131
173
|
|
@@ -146,15 +188,24 @@ namespace:
|
|
146
188
|
module Drudgery
|
147
189
|
module Extractors
|
148
190
|
class ArrayExtractor
|
191
|
+
attr_reader :name
|
192
|
+
|
149
193
|
def initialize(source)
|
150
194
|
@source = source
|
195
|
+
@name = 'array'
|
151
196
|
end
|
152
197
|
|
153
198
|
def extract
|
199
|
+
index = 0
|
154
200
|
@source.each do |record|
|
155
|
-
yield record
|
201
|
+
yield [record, index]
|
202
|
+
index += 1
|
156
203
|
end
|
157
204
|
end
|
205
|
+
|
206
|
+
def record_count
|
207
|
+
@source.size
|
208
|
+
end
|
158
209
|
end
|
159
210
|
end
|
160
211
|
end
|
@@ -219,14 +270,20 @@ The following loaders are provided:
|
|
219
270
|
* `:active_record`
|
220
271
|
* `:active_record_import`
|
221
272
|
|
222
|
-
You can use your own loaders if you would like. They need
|
223
|
-
|
224
|
-
|
273
|
+
You can use your own loaders if you would like. They need to implement
|
274
|
+
the following methods:
|
275
|
+
|
276
|
+
* `#name` - returns the loader's name
|
277
|
+
* `#load` - accepts an array of records and then writes them to the
|
278
|
+
destination
|
225
279
|
|
226
280
|
```ruby
|
227
281
|
class ArrayLoader
|
282
|
+
attr_reader :name
|
283
|
+
|
228
284
|
def initialize(destination)
|
229
285
|
@destination = destination
|
286
|
+
@name = 'array'
|
230
287
|
end
|
231
288
|
|
232
289
|
def load(records)
|
@@ -251,8 +308,11 @@ namespace:
|
|
251
308
|
module Drudgery
|
252
309
|
module Loaders
|
253
310
|
class ArrayLoader
|
311
|
+
attr_reader :name
|
312
|
+
|
254
313
|
def initialize(destination)
|
255
314
|
@destination = destination
|
315
|
+
@name = 'array'
|
256
316
|
end
|
257
317
|
|
258
318
|
def load(records)
|
@@ -1,15 +1,26 @@
|
|
1
1
|
module Drudgery
|
2
2
|
module Extractors
|
3
3
|
class ActiveRecordExtractor
|
4
|
+
attr_reader :name
|
5
|
+
|
4
6
|
def initialize(model)
|
5
7
|
@model = model
|
8
|
+
@name = "active_record:#{@model.name}"
|
6
9
|
end
|
7
10
|
|
8
11
|
def extract
|
12
|
+
index = 0
|
13
|
+
|
9
14
|
@model.find_each do |record|
|
10
|
-
yield record.attributes
|
15
|
+
yield [record.attributes, index]
|
16
|
+
|
17
|
+
index += 1
|
11
18
|
end
|
12
19
|
end
|
20
|
+
|
21
|
+
def record_count
|
22
|
+
@record_count ||= @model.count
|
23
|
+
end
|
13
24
|
end
|
14
25
|
end
|
15
26
|
end
|
@@ -1,18 +1,38 @@
|
|
1
|
-
require 'csv'
|
2
|
-
|
3
1
|
module Drudgery
|
4
2
|
module Extractors
|
5
3
|
class CSVExtractor
|
4
|
+
attr_reader :name
|
5
|
+
|
6
6
|
def initialize(filepath, options={})
|
7
7
|
@filepath = filepath
|
8
|
-
@options = { :headers => true }
|
9
|
-
|
8
|
+
@options = { :headers => true }.merge(options)
|
9
|
+
|
10
|
+
@name = "csv:#{File.basename(@filepath)}"
|
10
11
|
end
|
11
12
|
|
12
13
|
def extract
|
14
|
+
index = 0
|
15
|
+
|
13
16
|
CSV.foreach(@filepath, @options) do |row|
|
14
|
-
yield row.to_hash
|
17
|
+
yield [row.to_hash, index]
|
18
|
+
|
19
|
+
index += 1
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def record_count
|
24
|
+
@record_count ||= calculate_record_count
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
def calculate_record_count
|
29
|
+
record_count = 0
|
30
|
+
|
31
|
+
extract do |data, index|
|
32
|
+
record_count += 1
|
15
33
|
end
|
34
|
+
|
35
|
+
record_count
|
16
36
|
end
|
17
37
|
end
|
18
38
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module Drudgery
|
2
2
|
module Extractors
|
3
3
|
class SQLite3Extractor
|
4
|
+
attr_reader :name
|
5
|
+
|
4
6
|
def initialize(db, table)
|
5
7
|
@db = db
|
6
8
|
@db.results_as_hash = true
|
@@ -8,6 +10,8 @@ module Drudgery
|
|
8
10
|
|
9
11
|
@table = table
|
10
12
|
@clauses = {}
|
13
|
+
|
14
|
+
@name = "sqlite3:#{main_db_name}.#{@table}"
|
11
15
|
end
|
12
16
|
|
13
17
|
def select(*expressions)
|
@@ -39,12 +43,20 @@ module Drudgery
|
|
39
43
|
end
|
40
44
|
|
41
45
|
def extract
|
46
|
+
index = 0
|
47
|
+
|
42
48
|
@db.execute(sql) do |row|
|
43
49
|
row.reject! { |key, value| key.kind_of?(Integer) }
|
44
|
-
yield row
|
50
|
+
yield [row, index]
|
51
|
+
|
52
|
+
index += 1
|
45
53
|
end
|
46
54
|
end
|
47
55
|
|
56
|
+
def record_count
|
57
|
+
@record_count ||= @db.get_first_value(count_sql)
|
58
|
+
end
|
59
|
+
|
48
60
|
private
|
49
61
|
def sql
|
50
62
|
clauses = [
|
@@ -63,6 +75,24 @@ module Drudgery
|
|
63
75
|
|
64
76
|
clauses.join(' ')
|
65
77
|
end
|
78
|
+
|
79
|
+
def count_sql
|
80
|
+
if @clauses.empty?
|
81
|
+
"SELECT COUNT(*) FROM #{@table}"
|
82
|
+
else
|
83
|
+
"SELECT COUNT(*) FROM (#{sql})"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def main_db_name
|
88
|
+
main = @db.database_list.detect { |list| list['name'] == 'main' }
|
89
|
+
|
90
|
+
if main['file'].empty?
|
91
|
+
'memory'
|
92
|
+
else
|
93
|
+
File.basename(main['file']).split('.').first
|
94
|
+
end
|
95
|
+
end
|
66
96
|
end
|
67
97
|
end
|
68
98
|
end
|
data/lib/drudgery/job.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Drudgery
|
2
2
|
class Job
|
3
|
+
attr_reader :id
|
4
|
+
|
3
5
|
def initialize(options={})
|
6
|
+
@id = Time.now.nsec
|
4
7
|
@extractor = options[:extractor]
|
5
8
|
@loader = options[:loader]
|
6
9
|
@transformer = options[:transformer]
|
@@ -9,6 +12,10 @@ module Drudgery
|
|
9
12
|
@records = []
|
10
13
|
end
|
11
14
|
|
15
|
+
def name
|
16
|
+
"#{@extractor.name} => #{@loader.name}"
|
17
|
+
end
|
18
|
+
|
12
19
|
def batch_size(size)
|
13
20
|
@batch_size = size
|
14
21
|
end
|
@@ -44,29 +51,50 @@ module Drudgery
|
|
44
51
|
end
|
45
52
|
|
46
53
|
def perform
|
47
|
-
|
48
|
-
|
54
|
+
logger.log_with_progress :info, name
|
55
|
+
|
56
|
+
elapsed = Benchmark.realtime do
|
57
|
+
extract_records do |record|
|
58
|
+
@records << record
|
49
59
|
|
50
|
-
|
51
|
-
|
60
|
+
if @records.size == @batch_size
|
61
|
+
load_records
|
62
|
+
end
|
63
|
+
|
64
|
+
progress.inc if Drudgery.show_progress
|
52
65
|
end
|
66
|
+
|
67
|
+
load_records
|
68
|
+
|
69
|
+
progress.finish if Drudgery.show_progress
|
53
70
|
end
|
54
71
|
|
55
|
-
|
72
|
+
logger.log_with_progress :info, "Completed in #{"%.2f" % elapsed}s\n\n"
|
56
73
|
end
|
57
74
|
|
58
75
|
private
|
59
76
|
def extract_records
|
60
|
-
@extractor.extract do |data|
|
77
|
+
@extractor.extract do |data, index|
|
78
|
+
logger.log :debug, "Extracting Record -- Index: #{index}"
|
79
|
+
logger.log :debug, data.inspect
|
80
|
+
|
61
81
|
record = transform_data(data)
|
62
|
-
|
82
|
+
logger.log :debug, "Transforming Record -- Index: #{index}"
|
83
|
+
logger.log :debug, data.inspect
|
63
84
|
|
64
|
-
|
85
|
+
if record.nil?
|
86
|
+
next
|
87
|
+
else
|
88
|
+
yield record
|
89
|
+
end
|
65
90
|
end
|
66
91
|
end
|
67
92
|
|
68
93
|
def load_records
|
69
|
-
|
94
|
+
logger.log :debug, "Loading Records -- Count: #{@records.size}"
|
95
|
+
logger.log :debug, @records.inspect
|
96
|
+
|
97
|
+
@loader.load(@records) unless @records.empty?
|
70
98
|
@records.clear
|
71
99
|
end
|
72
100
|
|
@@ -77,5 +105,13 @@ module Drudgery
|
|
77
105
|
data
|
78
106
|
end
|
79
107
|
end
|
108
|
+
|
109
|
+
def progress
|
110
|
+
@progress ||= Drudgery::JobProgress.new(id, @extractor.record_count)
|
111
|
+
end
|
112
|
+
|
113
|
+
def logger
|
114
|
+
@logger ||= Drudgery::JobLogger.new(id)
|
115
|
+
end
|
80
116
|
end
|
81
117
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Drudgery
|
2
|
+
class JobLogger
|
3
|
+
def initialize(job_id)
|
4
|
+
@prefix = "## JOB #{job_id}"
|
5
|
+
end
|
6
|
+
|
7
|
+
def log_with_progress(mode, message)
|
8
|
+
STDERR.puts format_message(message) if Drudgery.show_progress
|
9
|
+
log(mode, message)
|
10
|
+
end
|
11
|
+
|
12
|
+
def log(mode, message)
|
13
|
+
Drudgery.log mode, format_message(message)
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
def format_message(message)
|
18
|
+
"#{@prefix}: #{message}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -1,13 +1,15 @@
|
|
1
|
-
require 'csv'
|
2
|
-
|
3
1
|
module Drudgery
|
4
2
|
module Loaders
|
5
3
|
class CSVLoader
|
4
|
+
attr_reader :name
|
5
|
+
|
6
6
|
def initialize(filepath, options={})
|
7
7
|
@filepath = filepath
|
8
8
|
@options = options
|
9
9
|
|
10
10
|
@write_headers = true
|
11
|
+
|
12
|
+
@name = "csv:#{File.basename(@filepath)}"
|
11
13
|
end
|
12
14
|
|
13
15
|
def load(records)
|
@@ -1,9 +1,16 @@
|
|
1
1
|
module Drudgery
|
2
2
|
module Loaders
|
3
3
|
class SQLite3Loader
|
4
|
+
attr_reader :name
|
5
|
+
|
4
6
|
def initialize(db, table)
|
5
7
|
@db = db
|
8
|
+
@db.results_as_hash = true
|
9
|
+
@db.type_translation = true
|
10
|
+
|
6
11
|
@table = table
|
12
|
+
|
13
|
+
@name = "sqlite3:#{main_db_name}.#{@table}"
|
7
14
|
end
|
8
15
|
|
9
16
|
def load(records)
|
@@ -20,6 +27,16 @@ module Drudgery
|
|
20
27
|
def sql(columns)
|
21
28
|
"INSERT INTO #{@table} (#{columns.map { |column| column }.join(', ')}) VALUES (#{columns.map { |column| '?' }.join(', ')})"
|
22
29
|
end
|
30
|
+
|
31
|
+
def main_db_name
|
32
|
+
main = @db.database_list.detect { |list| list['name'] == 'main' }
|
33
|
+
|
34
|
+
if main['file'].empty?
|
35
|
+
'memory'
|
36
|
+
else
|
37
|
+
File.basename(main['file']).split('.').first
|
38
|
+
end
|
39
|
+
end
|
23
40
|
end
|
24
41
|
end
|
25
42
|
end
|
data/lib/drudgery/version.rb
CHANGED
data/lib/drudgery.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require 'csv'
|
3
|
+
require 'progressbar'
|
4
|
+
|
1
5
|
require 'drudgery/version'
|
6
|
+
require 'drudgery/job_progress'
|
7
|
+
require 'drudgery/job_logger'
|
2
8
|
require 'drudgery/manager'
|
3
9
|
require 'drudgery/job'
|
4
10
|
require 'drudgery/transformer'
|
@@ -13,6 +19,14 @@ require 'drudgery/loaders/csv_loader'
|
|
13
19
|
require 'drudgery/loaders/sqlite3_loader'
|
14
20
|
|
15
21
|
module Drudgery
|
22
|
+
class << self
|
23
|
+
attr_accessor :logger, :show_progress
|
24
|
+
|
25
|
+
def log(mode, message)
|
26
|
+
logger.send(mode, message) if logger
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
16
30
|
module Extractors
|
17
31
|
def self.instantiate(type, *args)
|
18
32
|
case type
|
@@ -43,3 +57,5 @@ module Drudgery
|
|
43
57
|
end
|
44
58
|
end
|
45
59
|
end
|
60
|
+
|
61
|
+
Drudgery.show_progress = true
|
@@ -4,68 +4,83 @@ require 'active_record'
|
|
4
4
|
describe Drudgery::Extractors::ActiveRecordExtractor do
|
5
5
|
class Record < ActiveRecord::Base; end
|
6
6
|
|
7
|
+
def mock_model
|
8
|
+
stub('model', :name => 'Record')
|
9
|
+
end
|
10
|
+
|
7
11
|
describe '#initialize' do
|
8
12
|
it 'sets model to provided argument' do
|
9
|
-
model =
|
13
|
+
model = mock_model
|
10
14
|
|
11
15
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
|
12
16
|
extractor.instance_variable_get('@model').must_equal model
|
13
17
|
end
|
18
|
+
|
19
|
+
it 'sets name to active_record:<model name>' do
|
20
|
+
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(mock_model)
|
21
|
+
extractor.name.must_equal 'active_record:Record'
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
describe '#extract' do
|
17
26
|
it 'finds records using model' do
|
18
|
-
model =
|
27
|
+
model = mock_model
|
19
28
|
model.expects(:find_each)
|
20
29
|
|
21
30
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
|
22
31
|
extractor.extract
|
23
32
|
end
|
24
33
|
|
25
|
-
it 'yields each record
|
26
|
-
record1 = mock
|
27
|
-
|
28
|
-
|
29
|
-
record2 = mock
|
30
|
-
record2.expects(:attributes).returns({ :b => 2 })
|
34
|
+
it 'yields each record hash and index' do
|
35
|
+
record1 = mock('record1', :attributes => { :a => 1 })
|
36
|
+
record2 = mock('record2', :attributes => { :b => 2 })
|
31
37
|
|
32
|
-
model =
|
38
|
+
model = mock_model
|
33
39
|
model.stubs(:find_each).multiple_yields([record1], [record2])
|
34
40
|
|
35
41
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
|
36
42
|
|
37
43
|
records = []
|
38
|
-
|
44
|
+
indexes = []
|
45
|
+
extractor.extract do |record, index|
|
39
46
|
records << record
|
47
|
+
indexes << index
|
40
48
|
end
|
41
49
|
|
42
50
|
records[0].must_equal({ :a => 1 })
|
43
51
|
records[1].must_equal({ :b => 2 })
|
52
|
+
|
53
|
+
indexes.must_equal [0, 1]
|
44
54
|
end
|
45
55
|
|
46
|
-
|
47
|
-
before(:each) do
|
48
|
-
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
|
49
|
-
ActiveRecord::Base.connection.create_table(:records) do |t|
|
50
|
-
t.integer :a
|
51
|
-
t.integer :b
|
52
|
-
end
|
56
|
+
end
|
53
57
|
|
54
|
-
|
55
|
-
|
56
|
-
|
58
|
+
describe 'without stubs' do
|
59
|
+
before(:each) do
|
60
|
+
ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
|
61
|
+
ActiveRecord::Base.connection.create_table(:records) do |t|
|
62
|
+
t.integer :a
|
63
|
+
t.integer :b
|
57
64
|
end
|
58
65
|
|
59
|
-
|
60
|
-
|
61
|
-
|
66
|
+
Record.create!({ :a => 1, :b => 2 })
|
67
|
+
Record.create!({ :a => 3, :b => 4 })
|
68
|
+
Record.create!({ :a => 5, :b => 6 })
|
69
|
+
end
|
70
|
+
|
71
|
+
after(:each) do
|
72
|
+
ActiveRecord::Base.clear_active_connections!
|
73
|
+
end
|
62
74
|
|
63
|
-
|
75
|
+
describe '#extract' do
|
76
|
+
it 'yields each record hash and index' do
|
64
77
|
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
|
65
78
|
|
66
79
|
records = []
|
67
|
-
|
80
|
+
indexes = []
|
81
|
+
extractor.extract do |record, index|
|
68
82
|
records << record
|
83
|
+
indexes << index
|
69
84
|
end
|
70
85
|
|
71
86
|
records.must_equal([
|
@@ -73,6 +88,15 @@ describe Drudgery::Extractors::ActiveRecordExtractor do
|
|
73
88
|
{ 'id' => 2, 'a' => 3, 'b' => 4 },
|
74
89
|
{ 'id' => 3, 'a' => 5, 'b' => 6 }
|
75
90
|
])
|
91
|
+
|
92
|
+
indexes.must_equal [0, 1, 2]
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe '#record_count' do
|
97
|
+
it 'returns model count' do
|
98
|
+
extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
|
99
|
+
extractor.record_count.must_equal 3
|
76
100
|
end
|
77
101
|
end
|
78
102
|
end
|