drudgery 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -79,7 +79,7 @@ db = SQLite3::Database.new('db.sqlite3')
79
79
  m = Drudgery::Manager.new
80
80
 
81
81
  m.prepare do |job|
82
- job.batch_size 5000
82
+ job.batch_size = 5000
83
83
 
84
84
  job.extract :sqlite3, db, 'addresses' do |extractor|
85
85
  extractor.select(
@@ -108,35 +108,6 @@ end
108
108
  m.run
109
109
  ```
110
110
 
111
- Logging
112
- -------
113
-
114
- Provide Drudgery with a logger and info will be logged about each job.
115
-
116
- When log level is `INFO` expect to see basic output for each job (e.g.
117
- when it starts and completes).
118
-
119
- ```ruby
120
- logger = Logger.new('log/etl.log')
121
- logger.level = Logger::INFO # Logger defaults to log level DEBUG
122
-
123
- Drudgery.logger = logger
124
- ```
125
-
126
- When log level is `DEBUG` expect to see output for each record
127
- extracted, transformed and loaded (VERY NOISY).
128
-
129
- Progress
130
- --------
131
-
132
- Drudgery also provides progress output to STDERR courtesy of the
133
- `progressbar` gem. Progress output is on by default, but can be
134
- disabled with the following:
135
-
136
- ```ruby
137
- Drudgery.show_progress = false
138
- ```
139
-
140
111
  Extractors
141
112
  ----------
142
113
 
@@ -255,7 +226,7 @@ m = Drudgery::Manager.new
255
226
 
256
227
  m.prepare do |job|
257
228
  m.extract :csv, 'source.csv'
258
- m.transform( CustomTransformer.new)
229
+ m.transform CustomTransformer.new
259
230
  m.load :csv, 'destination.csv'
260
231
  end
261
232
  ```
@@ -332,6 +303,84 @@ m.prepare do |job|
332
303
  end
333
304
  ```
334
305
 
306
+ Event Hooks
307
+ -----------
308
+
309
+ Drudgery provides hooks so that you can listen for events and execute
310
+ your own code (e.g. logging and progress).
311
+
312
+ The following events are provided:
313
+
314
+ * `:before_job` - Fired before the job starts.
315
+ * `:after_job` - Fired after the job completes.
316
+ * `:after_extract` - Fired after each record is extracted.
317
+ * `:after_transform` - Fired after each record is transformed.
318
+ * `:after_load` - Fired after each batch of records is loaded.
319
+
320
+ Logging
321
+ -------
322
+
323
+ Support for logging is not provided explicitly. Here is an example
324
+ using the hooks provided:
325
+
326
+ ```ruby
327
+ require 'logger'
328
+ logger = Logger.new('drudgery.log')
329
+
330
+ # before_job yields the job
331
+ Drudgery.subscribe :before_job do |job|
332
+ logger.info "## JOB #{job.id}: #{job.name}"
333
+ end
334
+
335
+ # after_extract yields the job, record, and record index
336
+ Drudgery.subscribe :after_extract do |job, record, index|
337
+ logger.debug "## JOB #{job.id}: Extracting Record -- Index: #{index}"
338
+ logger.debug "## JOB #{job.id}: #{record.inspect}"
339
+ end
340
+
341
+ # after_transform yields the job, record, and record index
342
+ Drudgery.subscribe :after_transform do |job, record, index|
343
+ logger.debug "## JOB #{job.id}: Transforming Record -- Index: #{index}"
344
+ logger.debug "## JOB #{job.id}: #{record.inspect}"
345
+ end
346
+
347
+ # after_load yields the job and records that were loaded
348
+ Drudgery.subscribe :after_load do |job, records|
349
+ logger.debug "## JOB #{job.id}: Loading Records -- Count: #{records.size}"
350
+ logger.debug "## JOB #{job.id}: #{records.inspect}"
351
+ end
352
+
353
+ # after_job yields the job
354
+ Drudgery.subscribe :after_job do |job|
355
+ logger.info "## JOB #{job.id}: Completed at #{job.completed_at}"
356
+ end
357
+ ```
358
+
359
+ Progress
360
+ --------
361
+
362
+ Support for progress indication is not provided explicitly. Here is an example
363
+ using the hooks provided:
364
+
365
+ ```ruby
366
+ require 'rubygems'
367
+ require 'progressbar'
368
+
369
+ progress = {}
370
+
371
+ Drudgery.subscribe :before_job do |job|
372
+ progress[job.id] ||= ProgressBar.new("## JOB #{job.id}", job.record_count)
373
+ end
374
+
375
+ Drudgery.subscribe :after_extract do |job, record, index|
376
+ progress[job.id].inc
377
+ end
378
+
379
+ Drudgery.subscribe :after_job do |job|
380
+ progress[job.id].finish
381
+ end
382
+ ```
383
+
335
384
  Contributing
336
385
  ------------
337
386
 
@@ -10,6 +10,14 @@ module Drudgery
10
10
  @name = "csv:#{File.basename(@filepath)}"
11
11
  end
12
12
 
13
+ def col_sep
14
+ @options[:col_sep]
15
+ end
16
+
17
+ def col_sep=(char)
18
+ @options[:col_sep] = char
19
+ end
20
+
13
21
  def extract
14
22
  index = 0
15
23
 
data/lib/drudgery/job.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  module Drudgery
2
2
  class Job
3
- attr_reader :id
3
+ attr_reader :id, :started_at, :completed_at
4
+ attr_accessor :extractor, :loader, :transformer, :batch_size
4
5
 
5
6
  def initialize(options={})
6
7
  @id = Time.now.nsec
@@ -16,8 +17,10 @@ module Drudgery
16
17
  "#{@extractor.name} => #{@loader.name}"
17
18
  end
18
19
 
19
- def batch_size(size)
20
- @batch_size = size
20
+ def record_count
21
+ if @extractor
22
+ @record_count ||= @extractor.record_count
23
+ end
21
24
  end
22
25
 
23
26
  def extract(*args)
@@ -33,7 +36,7 @@ module Drudgery
33
36
  end
34
37
 
35
38
  def transform(transformer=Drudgery::Transformer.new, &processor)
36
- transformer.register(processor)
39
+ transformer.register(processor) if processor
37
40
 
38
41
  @transformer = transformer
39
42
  end
@@ -51,36 +54,30 @@ module Drudgery
51
54
  end
52
55
 
53
56
  def perform
54
- logger.log_with_progress :info, name
55
-
56
- elapsed = Benchmark.realtime do
57
- extract_records do |record|
58
- @records << record
57
+ @started_at = Time.now
58
+ Drudgery.notify :before_job, self
59
59
 
60
- if @records.size == @batch_size
61
- load_records
62
- end
60
+ extract_records do |record|
61
+ @records << record
63
62
 
64
- progress.inc if Drudgery.show_progress
63
+ if @records.size == @batch_size
64
+ load_records
65
65
  end
66
-
67
- load_records
68
-
69
- progress.finish if Drudgery.show_progress
70
66
  end
71
67
 
72
- logger.log_with_progress :info, "Completed in #{"%.2f" % elapsed}s\n\n"
68
+ load_records
69
+
70
+ @completed_at = Time.now
71
+ Drudgery.notify :after_job, self
73
72
  end
74
73
 
75
74
  private
76
75
  def extract_records
77
76
  @extractor.extract do |data, index|
78
- logger.log :debug, "Extracting Record -- Index: #{index}"
79
- logger.log :debug, data.inspect
77
+ Drudgery.notify :after_extract, self, data, index
80
78
 
81
79
  record = transform_data(data)
82
- logger.log :debug, "Transforming Record -- Index: #{index}"
83
- logger.log :debug, data.inspect
80
+ Drudgery.notify :after_transform, self, record, index
84
81
 
85
82
  if record.nil?
86
83
  next
@@ -91,10 +88,9 @@ module Drudgery
91
88
  end
92
89
 
93
90
  def load_records
94
- logger.log :debug, "Loading Records -- Count: #{@records.size}"
95
- logger.log :debug, @records.inspect
96
-
97
91
  @loader.load(@records) unless @records.empty?
92
+ Drudgery.notify :after_load, self, @records
93
+
98
94
  @records.clear
99
95
  end
100
96
 
@@ -105,13 +101,5 @@ module Drudgery
105
101
  data
106
102
  end
107
103
  end
108
-
109
- def progress
110
- @progress ||= Drudgery::JobProgress.new(id, @extractor.record_count)
111
- end
112
-
113
- def logger
114
- @logger ||= Drudgery::JobLogger.new(id)
115
- end
116
104
  end
117
105
  end
@@ -12,6 +12,14 @@ module Drudgery
12
12
  @name = "csv:#{File.basename(@filepath)}"
13
13
  end
14
14
 
15
+ def col_sep
16
+ @options[:col_sep]
17
+ end
18
+
19
+ def col_sep=(char)
20
+ @options[:col_sep] = char
21
+ end
22
+
15
23
  def load(records)
16
24
  columns = records.first.keys.sort { |a,b| a.to_s <=> b.to_s }
17
25
 
@@ -1,3 +1,3 @@
1
1
  module Drudgery
2
- VERSION = '0.1.0'
2
+ VERSION = '0.2.0'
3
3
  end
data/lib/drudgery.rb CHANGED
@@ -1,10 +1,6 @@
1
- require 'benchmark'
2
1
  require 'csv'
3
- require 'progressbar'
4
2
 
5
3
  require 'drudgery/version'
6
- require 'drudgery/job_progress'
7
- require 'drudgery/job_logger'
8
4
  require 'drudgery/manager'
9
5
  require 'drudgery/job'
10
6
  require 'drudgery/transformer'
@@ -20,10 +16,22 @@ require 'drudgery/loaders/sqlite3_loader'
20
16
 
21
17
  module Drudgery
22
18
  class << self
23
- attr_accessor :logger, :show_progress
19
+ def listeners
20
+ @listeners ||= Hash.new { |hash, key| hash[key] = [] }
21
+ end
22
+
23
+ def subscribe(event, &block)
24
+ listeners[event] << block
25
+ end
24
26
 
25
- def log(mode, message)
26
- logger.send(mode, message) if logger
27
+ def unsubscribe(event)
28
+ listeners[event].clear
29
+ end
30
+
31
+ def notify(event, *args)
32
+ listeners[event].each do |listener|
33
+ listener.call(*args)
34
+ end
27
35
  end
28
36
  end
29
37
 
@@ -57,5 +65,3 @@ module Drudgery
57
65
  end
58
66
  end
59
67
  end
60
-
61
- Drudgery.show_progress = true
@@ -1,102 +1,59 @@
1
1
  require 'spec_helper'
2
- require 'active_record'
3
2
 
4
- describe Drudgery::Extractors::ActiveRecordExtractor do
5
- class Record < ActiveRecord::Base; end
3
+ class Record < ActiveRecord::Base; end
6
4
 
7
- def mock_model
8
- stub('model', :name => 'Record')
9
- end
10
-
11
- describe '#initialize' do
12
- it 'sets model to provided argument' do
13
- model = mock_model
14
-
15
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
16
- extractor.instance_variable_get('@model').must_equal model
17
- end
18
-
19
- it 'sets name to active_record:<model name>' do
20
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(mock_model)
21
- extractor.name.must_equal 'active_record:Record'
22
- end
23
- end
24
-
25
- describe '#extract' do
26
- it 'finds records using model' do
27
- model = mock_model
28
- model.expects(:find_each)
29
-
30
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
31
- extractor.extract
32
- end
33
-
34
- it 'yields each record hash and index' do
35
- record1 = mock('record1', :attributes => { :a => 1 })
36
- record2 = mock('record2', :attributes => { :b => 2 })
37
-
38
- model = mock_model
39
- model.stubs(:find_each).multiple_yields([record1], [record2])
5
+ module Drudgery
6
+ module Extractors
7
+ describe ActiveRecordExtractor do
8
+ before do
9
+ ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
10
+ ActiveRecord::Base.connection.create_table(:records) do |t|
11
+ t.integer :a
12
+ t.integer :b
13
+ end
40
14
 
41
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
15
+ Record.create!({ :a => 1, :b => 2 })
16
+ Record.create!({ :a => 3, :b => 4 })
17
+ Record.create!({ :a => 5, :b => 6 })
42
18
 
43
- records = []
44
- indexes = []
45
- extractor.extract do |record, index|
46
- records << record
47
- indexes << index
19
+ @extractor = ActiveRecordExtractor.new(Record)
48
20
  end
49
21
 
50
- records[0].must_equal({ :a => 1 })
51
- records[1].must_equal({ :b => 2 })
52
-
53
- indexes.must_equal [0, 1]
54
- end
22
+ after do
23
+ ActiveRecord::Base.clear_active_connections!
24
+ end
55
25
 
56
- end
57
26
 
58
- describe 'without stubs' do
59
- before(:each) do
60
- ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
61
- ActiveRecord::Base.connection.create_table(:records) do |t|
62
- t.integer :a
63
- t.integer :b
27
+ describe '#name' do
28
+ it 'returns active_record:<model name>' do
29
+ @extractor.name.must_equal 'active_record:Record'
30
+ end
64
31
  end
65
32
 
66
- Record.create!({ :a => 1, :b => 2 })
67
- Record.create!({ :a => 3, :b => 4 })
68
- Record.create!({ :a => 5, :b => 6 })
69
- end
33
+ describe '#extract' do
34
+ it 'yields each record hash and index' do
35
+ records, indexes = [], []
70
36
 
71
- after(:each) do
72
- ActiveRecord::Base.clear_active_connections!
73
- end
37
+ @extractor.extract do |record, index|
38
+ records << record
39
+ indexes << index
40
+ end
74
41
 
75
- describe '#extract' do
76
- it 'yields each record hash and index' do
77
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
42
+ records.must_equal([
43
+ { 'id' => 1, 'a' => 1, 'b' => 2 },
44
+ { 'id' => 2, 'a' => 3, 'b' => 4 },
45
+ { 'id' => 3, 'a' => 5, 'b' => 6 }
46
+ ])
78
47
 
79
- records = []
80
- indexes = []
81
- extractor.extract do |record, index|
82
- records << record
83
- indexes << index
48
+ indexes.must_equal [0, 1, 2]
84
49
  end
85
-
86
- records.must_equal([
87
- { 'id' => 1, 'a' => 1, 'b' => 2 },
88
- { 'id' => 2, 'a' => 3, 'b' => 4 },
89
- { 'id' => 3, 'a' => 5, 'b' => 6 }
90
- ])
91
-
92
- indexes.must_equal [0, 1, 2]
93
50
  end
94
- end
95
51
 
96
- describe '#record_count' do
97
- it 'returns model count' do
98
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
99
- extractor.record_count.must_equal 3
52
+ describe '#record_count' do
53
+ it 'returns model count' do
54
+ @extractor = ActiveRecordExtractor.new(Record)
55
+ @extractor.record_count.must_equal 3
56
+ end
100
57
  end
101
58
  end
102
59
  end
@@ -1,104 +1,72 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe Drudgery::Extractors::CSVExtractor do
4
- describe '#initialize' do
5
- it 'sets filepath to provided filepath' do
6
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
7
- extractor.instance_variable_get('@filepath').must_equal 'file.csv'
8
- end
9
-
10
- it 'initializes options hash' do
11
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
12
- extractor.instance_variable_get('@options').must_equal({ :headers => true })
13
- end
14
-
15
- it 'merges provided options with default options' do
16
- options = { :col_sep => '|', :headers => %w[id name email] }
17
-
18
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv', options)
19
- extractor.instance_variable_get('@options').must_equal({ :col_sep => '|', :headers => %w[id name email] })
20
- end
21
-
22
- it 'sets name to csv:<file base name>' do
23
- extractor = Drudgery::Extractors::CSVExtractor.new('tmp/file.csv')
24
- extractor.name.must_equal 'csv:file.csv'
25
- end
26
- end
27
-
28
- describe '#extract' do
29
- it 'parses records from file' do
30
- CSV.expects(:foreach).with('file.csv', :headers => true)
31
-
32
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
33
- extractor.extract
34
- end
35
-
36
- it 'yields each record hash and index' do
37
- record1 = mock('record1')
38
- record1.expects(:to_hash).returns({ :a => 1 })
39
-
40
- record2 = mock('record2')
41
- record2.expects(:to_hash).returns({ :b => 2 })
42
-
43
- CSV.stubs(:foreach).multiple_yields([record1], [record2])
44
-
45
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
46
-
47
- records = []
48
- indexes = []
49
- extractor.extract do |record, index|
50
- records << record
51
- indexes << index
3
+ module Drudgery
4
+ module Extractors
5
+ describe CSVExtractor do
6
+ before do
7
+ @file = 'tmp/test.csv'
8
+ File.delete(@file) if File.exists?(@file)
9
+
10
+ File.open(@file, 'w') do |f|
11
+ f.puts 'a,b'
12
+ f.puts '1,2'
13
+ f.puts '3,4'
14
+ f.puts '5,6'
15
+ end
52
16
  end
53
17
 
54
- records[0].must_equal({ :a => 1 })
55
- records[1].must_equal({ :b => 2 })
56
-
57
- indexes.must_equal [0, 1]
58
- end
59
- end
18
+ after do
19
+ File.delete(@file) if File.exists?(@file)
20
+ end
60
21
 
61
- describe 'without stubs' do
62
- before(:each) do
63
- File.delete('file.csv') if File.exists?('file.csv')
22
+ describe '#name' do
23
+ it 'returns csv:<file base name>' do
24
+ extractor = CSVExtractor.new('tmp/people.csv')
25
+ extractor.name.must_equal 'csv:people.csv'
26
+ end
27
+ end
64
28
 
65
- File.open('file.csv', 'w') do |f|
66
- f.puts 'a,b'
67
- f.puts '1,2'
68
- f.puts '3,4'
69
- f.puts '5,6'
29
+ describe '#col_sep' do
30
+ it 'returns col_sep option' do
31
+ extractor = CSVExtractor.new('tmp/people.csv', :col_sep => '|')
32
+ extractor.col_sep.must_equal '|'
33
+ end
70
34
  end
71
- end
72
35
 
73
- after(:each) do
74
- File.delete('file.csv') if File.exists?('file.csv')
75
- end
36
+ describe '#col_sep=' do
37
+ it 'sets col_sep to provided character' do
38
+ extractor = CSVExtractor.new('tmp/people.csv')
39
+ extractor.col_sep = '|'
40
+ extractor.col_sep.must_equal '|'
41
+ end
42
+ end
76
43
 
77
- describe '#extract' do
78
- it 'yields each record hash and index' do
79
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
44
+ describe '#extract' do
45
+ it 'yields each record hash and index' do
46
+ extractor = CSVExtractor.new(@file)
80
47
 
81
- records = []
82
- indexes = []
83
- extractor.extract do |record, index|
84
- records << record
85
- indexes << index
86
- end
48
+ records = []
49
+ indexes = []
50
+ extractor.extract do |record, index|
51
+ records << record
52
+ indexes << index
53
+ end
87
54
 
88
- records.must_equal([
89
- { 'a' => '1', 'b' => '2' },
90
- { 'a' => '3', 'b' => '4' },
91
- { 'a' => '5', 'b' => '6' }
92
- ])
55
+ records.must_equal([
56
+ { 'a' => '1', 'b' => '2' },
57
+ { 'a' => '3', 'b' => '4' },
58
+ { 'a' => '5', 'b' => '6' }
59
+ ])
93
60
 
94
- indexes.must_equal [0, 1, 2]
61
+ indexes.must_equal [0, 1, 2]
62
+ end
95
63
  end
96
- end
97
64
 
98
- describe '#record_count' do
99
- it 'returns count of CSV rows' do
100
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
101
- extractor.record_count.must_equal 3
65
+ describe '#record_count' do
66
+ it 'returns count of CSV rows' do
67
+ extractor = CSVExtractor.new(@file)
68
+ extractor.record_count.must_equal 3
69
+ end
102
70
  end
103
71
  end
104
72
  end