drudgery 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -79,7 +79,7 @@ db = SQLite3::Database.new('db.sqlite3')
79
79
  m = Drudgery::Manager.new
80
80
 
81
81
  m.prepare do |job|
82
- job.batch_size 5000
82
+ job.batch_size = 5000
83
83
 
84
84
  job.extract :sqlite3, db, 'addresses' do |extractor|
85
85
  extractor.select(
@@ -108,35 +108,6 @@ end
108
108
  m.run
109
109
  ```
110
110
 
111
- Logging
112
- -------
113
-
114
- Provide Drudgery with a logger and info will be logged about each job.
115
-
116
- When log level is `INFO` expect to see basic output for each job (e.g.
117
- when it starts and completes).
118
-
119
- ```ruby
120
- logger = Logger.new('log/etl.log')
121
- logger.level = Logger::INFO # Logger defaults to log level DEBUG
122
-
123
- Drudgery.logger = logger
124
- ```
125
-
126
- When log level is `DEBUG` expect to see output for each record
127
- extracted, transformed and loaded (VERY NOISY).
128
-
129
- Progress
130
- --------
131
-
132
- Drudgery also provides progress output to STDERR courtesty of the
133
- `progressbar` gem. Progress output is on by default, but can be
134
- disabled with the following:
135
-
136
- ```ruby
137
- Drudgery.show_progress = false
138
- ```
139
-
140
111
  Extractors
141
112
  ----------
142
113
 
@@ -255,7 +226,7 @@ m = Drudgery::Manager.new
255
226
 
256
227
  m.prepare do |job|
257
228
  m.extract :csv, 'source.csv'
258
- m.transform( CustomTransformer.new)
229
+ m.transform CustomTransformer.new
259
230
  m.load :csv, 'destination.csv'
260
231
  end
261
232
  ```
@@ -332,6 +303,84 @@ m.prepare do |job|
332
303
  end
333
304
  ```
334
305
 
306
+ Event Hooks
307
+ -----------
308
+
309
+ Drudgery provides hooks so that you can listen for events and execute
310
+ your own code (e.g. logging and progress).
311
+
312
+ The following events are provided:
313
+
314
+ * `:before_job` - Fired before the job starts.
315
+ * `:after_job` - Fired after the job completes.
316
+ * `:after_extract` - Fired after each record is extracted.
317
+ * `:after_transform` - Fired after each record is transformed.
318
+ * `:after_load` - Fired after each batch of records is loaded.
319
+
320
+ Logging
321
+ -------
322
+
323
+ Support for logging is not provided explicitly. Here is an example
324
+ using the hooks provided:
325
+
326
+ ```ruby
327
+ require 'logger'
328
+ logger = Logger.new('drudgery.log')
329
+
330
+ # before_job yields the job
331
+ Drudgery.subscribe :before_job do |job|
332
+ logger.info "## JOB #{job.id}: #{job.name}"
333
+ end
334
+
335
+ # after_extract yields the job, record, and record index
336
+ Drudgery.subscribe :after_extract do |job, record, index|
337
+ logger.debug "## JOB #{job.id}: Extracting Record -- Index: #{index}"
338
+ logger.debug "## JOB #{job.id}: #{record.inspect}"
339
+ end
340
+
341
+ # after_transform yields the job, record, and record index
342
+ Drudgery.subscribe :after_transform do |job, record, index|
343
+ logger.debug "## JOB #{job.id}: Transforming Record -- Index: #{index}"
344
+ logger.debug "## JOB #{job.id}: #{record.inspect}"
345
+ end
346
+
347
+ # after_load yields the job and records that were loaded
348
+ Drudgery.subscribe :after_load do |job, records|
349
+ logger.debug "## JOB #{job.id}: Loading Records -- Count: #{records.size}"
350
+ logger.debug "## JOB #{job.id}: #{records.inspect}"
351
+ end
352
+
353
+ # after_job yields the job
354
+ Drudgery.subscribe :after_job do |job|
355
+ logger.info "## JOB #{job.id}: Completed at #{job.completed_at}"
356
+ end
357
+ ```
358
+
359
+ Progress
360
+ --------
361
+
362
+ Support for progress indication is not provided explicitly. Here is an example
363
+ using the hooks provided:
364
+
365
+ ```ruby
366
+ require 'rubygems'
367
+ require 'progressbar'
368
+
369
+ progress = {}
370
+
371
+ Drudgery.subscribe :before_job do |job|
372
+ progress[job.id] ||= ProgressBar.new("## JOB #{job.id}", job.record_count)
373
+ end
374
+
375
+ Drudgery.subscribe :after_extract do |job, record, index|
376
+ progress[job.id].inc
377
+ end
378
+
379
+ Drudgery.subscribe :after_job do |job|
380
+ progress[job.id].finish
381
+ end
382
+ ```
383
+
335
384
  Contributing
336
385
  ------------
337
386
 
@@ -10,6 +10,14 @@ module Drudgery
10
10
  @name = "csv:#{File.basename(@filepath)}"
11
11
  end
12
12
 
13
+ def col_sep
14
+ @options[:col_sep]
15
+ end
16
+
17
+ def col_sep=(char)
18
+ @options[:col_sep] = char
19
+ end
20
+
13
21
  def extract
14
22
  index = 0
15
23
 
data/lib/drudgery/job.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  module Drudgery
2
2
  class Job
3
- attr_reader :id
3
+ attr_reader :id, :started_at, :completed_at
4
+ attr_accessor :extractor, :loader, :transformer, :batch_size
4
5
 
5
6
  def initialize(options={})
6
7
  @id = Time.now.nsec
@@ -16,8 +17,10 @@ module Drudgery
16
17
  "#{@extractor.name} => #{@loader.name}"
17
18
  end
18
19
 
19
- def batch_size(size)
20
- @batch_size = size
20
+ def record_count
21
+ if @extractor
22
+ @record_count ||= @extractor.record_count
23
+ end
21
24
  end
22
25
 
23
26
  def extract(*args)
@@ -33,7 +36,7 @@ module Drudgery
33
36
  end
34
37
 
35
38
  def transform(transformer=Drudgery::Transformer.new, &processor)
36
- transformer.register(processor)
39
+ transformer.register(processor) if processor
37
40
 
38
41
  @transformer = transformer
39
42
  end
@@ -51,36 +54,30 @@ module Drudgery
51
54
  end
52
55
 
53
56
  def perform
54
- logger.log_with_progress :info, name
55
-
56
- elapsed = Benchmark.realtime do
57
- extract_records do |record|
58
- @records << record
57
+ @started_at = Time.now
58
+ Drudgery.notify :before_job, self
59
59
 
60
- if @records.size == @batch_size
61
- load_records
62
- end
60
+ extract_records do |record|
61
+ @records << record
63
62
 
64
- progress.inc if Drudgery.show_progress
63
+ if @records.size == @batch_size
64
+ load_records
65
65
  end
66
-
67
- load_records
68
-
69
- progress.finish if Drudgery.show_progress
70
66
  end
71
67
 
72
- logger.log_with_progress :info, "Completed in #{"%.2f" % elapsed}s\n\n"
68
+ load_records
69
+
70
+ @completed_at = Time.now
71
+ Drudgery.notify :after_job, self
73
72
  end
74
73
 
75
74
  private
76
75
  def extract_records
77
76
  @extractor.extract do |data, index|
78
- logger.log :debug, "Extracting Record -- Index: #{index}"
79
- logger.log :debug, data.inspect
77
+ Drudgery.notify :after_extract, self, data, index
80
78
 
81
79
  record = transform_data(data)
82
- logger.log :debug, "Transforming Record -- Index: #{index}"
83
- logger.log :debug, data.inspect
80
+ Drudgery.notify :after_transform, self, record, index
84
81
 
85
82
  if record.nil?
86
83
  next
@@ -91,10 +88,9 @@ module Drudgery
91
88
  end
92
89
 
93
90
  def load_records
94
- logger.log :debug, "Loading Records -- Count: #{@records.size}"
95
- logger.log :debug, @records.inspect
96
-
97
91
  @loader.load(@records) unless @records.empty?
92
+ Drudgery.notify :after_load, self, @records
93
+
98
94
  @records.clear
99
95
  end
100
96
 
@@ -105,13 +101,5 @@ module Drudgery
105
101
  data
106
102
  end
107
103
  end
108
-
109
- def progress
110
- @progress ||= Drudgery::JobProgress.new(id, @extractor.record_count)
111
- end
112
-
113
- def logger
114
- @logger ||= Drudgery::JobLogger.new(id)
115
- end
116
104
  end
117
105
  end
@@ -12,6 +12,14 @@ module Drudgery
12
12
  @name = "csv:#{File.basename(@filepath)}"
13
13
  end
14
14
 
15
+ def col_sep
16
+ @options[:col_sep]
17
+ end
18
+
19
+ def col_sep=(char)
20
+ @options[:col_sep] = char
21
+ end
22
+
15
23
  def load(records)
16
24
  columns = records.first.keys.sort { |a,b| a.to_s <=> b.to_s }
17
25
 
@@ -1,3 +1,3 @@
1
1
  module Drudgery
2
- VERSION = '0.1.0'
2
+ VERSION = '0.2.0'
3
3
  end
data/lib/drudgery.rb CHANGED
@@ -1,10 +1,6 @@
1
- require 'benchmark'
2
1
  require 'csv'
3
- require 'progressbar'
4
2
 
5
3
  require 'drudgery/version'
6
- require 'drudgery/job_progress'
7
- require 'drudgery/job_logger'
8
4
  require 'drudgery/manager'
9
5
  require 'drudgery/job'
10
6
  require 'drudgery/transformer'
@@ -20,10 +16,22 @@ require 'drudgery/loaders/sqlite3_loader'
20
16
 
21
17
  module Drudgery
22
18
  class << self
23
- attr_accessor :logger, :show_progress
19
+ def listeners
20
+ @listeners ||= Hash.new { |hash, key| hash[key] = [] }
21
+ end
22
+
23
+ def subscribe(event, &block)
24
+ listeners[event] << block
25
+ end
24
26
 
25
- def log(mode, message)
26
- logger.send(mode, message) if logger
27
+ def unsubscribe(event)
28
+ listeners[event].clear
29
+ end
30
+
31
+ def notify(event, *args)
32
+ listeners[event].each do |listener|
33
+ listener.call(*args)
34
+ end
27
35
  end
28
36
  end
29
37
 
@@ -57,5 +65,3 @@ module Drudgery
57
65
  end
58
66
  end
59
67
  end
60
-
61
- Drudgery.show_progress = true
@@ -1,102 +1,59 @@
1
1
  require 'spec_helper'
2
- require 'active_record'
3
2
 
4
- describe Drudgery::Extractors::ActiveRecordExtractor do
5
- class Record < ActiveRecord::Base; end
3
+ class Record < ActiveRecord::Base; end
6
4
 
7
- def mock_model
8
- stub('model', :name => 'Record')
9
- end
10
-
11
- describe '#initialize' do
12
- it 'sets model to provided argument' do
13
- model = mock_model
14
-
15
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
16
- extractor.instance_variable_get('@model').must_equal model
17
- end
18
-
19
- it 'sets name to active_record:<model name>' do
20
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(mock_model)
21
- extractor.name.must_equal 'active_record:Record'
22
- end
23
- end
24
-
25
- describe '#extract' do
26
- it 'finds records using model' do
27
- model = mock_model
28
- model.expects(:find_each)
29
-
30
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
31
- extractor.extract
32
- end
33
-
34
- it 'yields each record hash and index' do
35
- record1 = mock('record1', :attributes => { :a => 1 })
36
- record2 = mock('record2', :attributes => { :b => 2 })
37
-
38
- model = mock_model
39
- model.stubs(:find_each).multiple_yields([record1], [record2])
5
+ module Drudgery
6
+ module Extractors
7
+ describe ActiveRecordExtractor do
8
+ before do
9
+ ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
10
+ ActiveRecord::Base.connection.create_table(:records) do |t|
11
+ t.integer :a
12
+ t.integer :b
13
+ end
40
14
 
41
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(model)
15
+ Record.create!({ :a => 1, :b => 2 })
16
+ Record.create!({ :a => 3, :b => 4 })
17
+ Record.create!({ :a => 5, :b => 6 })
42
18
 
43
- records = []
44
- indexes = []
45
- extractor.extract do |record, index|
46
- records << record
47
- indexes << index
19
+ @extractor = ActiveRecordExtractor.new(Record)
48
20
  end
49
21
 
50
- records[0].must_equal({ :a => 1 })
51
- records[1].must_equal({ :b => 2 })
52
-
53
- indexes.must_equal [0, 1]
54
- end
22
+ after do
23
+ ActiveRecord::Base.clear_active_connections!
24
+ end
55
25
 
56
- end
57
26
 
58
- describe 'without stubs' do
59
- before(:each) do
60
- ActiveRecord::Base.establish_connection(:adapter => 'sqlite3', :database => ':memory:')
61
- ActiveRecord::Base.connection.create_table(:records) do |t|
62
- t.integer :a
63
- t.integer :b
27
+ describe '#name' do
28
+ it 'returns active_record:<model name>' do
29
+ @extractor.name.must_equal 'active_record:Record'
30
+ end
64
31
  end
65
32
 
66
- Record.create!({ :a => 1, :b => 2 })
67
- Record.create!({ :a => 3, :b => 4 })
68
- Record.create!({ :a => 5, :b => 6 })
69
- end
33
+ describe '#extract' do
34
+ it 'yields each record hash and index' do
35
+ records, indexes = [], []
70
36
 
71
- after(:each) do
72
- ActiveRecord::Base.clear_active_connections!
73
- end
37
+ @extractor.extract do |record, index|
38
+ records << record
39
+ indexes << index
40
+ end
74
41
 
75
- describe '#extract' do
76
- it 'yields each record hash and index' do
77
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
42
+ records.must_equal([
43
+ { 'id' => 1, 'a' => 1, 'b' => 2 },
44
+ { 'id' => 2, 'a' => 3, 'b' => 4 },
45
+ { 'id' => 3, 'a' => 5, 'b' => 6 }
46
+ ])
78
47
 
79
- records = []
80
- indexes = []
81
- extractor.extract do |record, index|
82
- records << record
83
- indexes << index
48
+ indexes.must_equal [0, 1, 2]
84
49
  end
85
-
86
- records.must_equal([
87
- { 'id' => 1, 'a' => 1, 'b' => 2 },
88
- { 'id' => 2, 'a' => 3, 'b' => 4 },
89
- { 'id' => 3, 'a' => 5, 'b' => 6 }
90
- ])
91
-
92
- indexes.must_equal [0, 1, 2]
93
50
  end
94
- end
95
51
 
96
- describe '#record_count' do
97
- it 'returns model count' do
98
- extractor = Drudgery::Extractors::ActiveRecordExtractor.new(Record)
99
- extractor.record_count.must_equal 3
52
+ describe '#record_count' do
53
+ it 'returns model count' do
54
+ @extractor = ActiveRecordExtractor.new(Record)
55
+ @extractor.record_count.must_equal 3
56
+ end
100
57
  end
101
58
  end
102
59
  end
@@ -1,104 +1,72 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe Drudgery::Extractors::CSVExtractor do
4
- describe '#initialize' do
5
- it 'sets filepath to provided filepath' do
6
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
7
- extractor.instance_variable_get('@filepath').must_equal 'file.csv'
8
- end
9
-
10
- it 'initializes options hash' do
11
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
12
- extractor.instance_variable_get('@options').must_equal({ :headers => true })
13
- end
14
-
15
- it 'merges provided options with default options' do
16
- options = { :col_sep => '|', :headers => %w[id name email] }
17
-
18
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv', options)
19
- extractor.instance_variable_get('@options').must_equal({ :col_sep => '|', :headers => %w[id name email] })
20
- end
21
-
22
- it 'sets name to csv:<file base name>' do
23
- extractor = Drudgery::Extractors::CSVExtractor.new('tmp/file.csv')
24
- extractor.name.must_equal 'csv:file.csv'
25
- end
26
- end
27
-
28
- describe '#extract' do
29
- it 'parses records from file' do
30
- CSV.expects(:foreach).with('file.csv', :headers => true)
31
-
32
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
33
- extractor.extract
34
- end
35
-
36
- it 'yields each record hash and index' do
37
- record1 = mock('record1')
38
- record1.expects(:to_hash).returns({ :a => 1 })
39
-
40
- record2 = mock('record2')
41
- record2.expects(:to_hash).returns({ :b => 2 })
42
-
43
- CSV.stubs(:foreach).multiple_yields([record1], [record2])
44
-
45
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
46
-
47
- records = []
48
- indexes = []
49
- extractor.extract do |record, index|
50
- records << record
51
- indexes << index
3
+ module Drudgery
4
+ module Extractors
5
+ describe CSVExtractor do
6
+ before do
7
+ @file = 'tmp/test.csv'
8
+ File.delete(@file) if File.exists?(@file)
9
+
10
+ File.open(@file, 'w') do |f|
11
+ f.puts 'a,b'
12
+ f.puts '1,2'
13
+ f.puts '3,4'
14
+ f.puts '5,6'
15
+ end
52
16
  end
53
17
 
54
- records[0].must_equal({ :a => 1 })
55
- records[1].must_equal({ :b => 2 })
56
-
57
- indexes.must_equal [0, 1]
58
- end
59
- end
18
+ after do
19
+ File.delete(@file) if File.exists?(@file)
20
+ end
60
21
 
61
- describe 'without stubs' do
62
- before(:each) do
63
- File.delete('file.csv') if File.exists?('file.csv')
22
+ describe '#name' do
23
+ it 'returns csv:<file base name>' do
24
+ extractor = CSVExtractor.new('tmp/people.csv')
25
+ extractor.name.must_equal 'csv:people.csv'
26
+ end
27
+ end
64
28
 
65
- File.open('file.csv', 'w') do |f|
66
- f.puts 'a,b'
67
- f.puts '1,2'
68
- f.puts '3,4'
69
- f.puts '5,6'
29
+ describe '#col_sep' do
30
+ it 'returns col_sep option' do
31
+ extractor = CSVExtractor.new('tmp/people.csv', :col_sep => '|')
32
+ extractor.col_sep.must_equal '|'
33
+ end
70
34
  end
71
- end
72
35
 
73
- after(:each) do
74
- File.delete('file.csv') if File.exists?('file.csv')
75
- end
36
+ describe '#col_sep=' do
37
+ it 'sets col_sep to provided character' do
38
+ extractor = CSVExtractor.new('tmp/people.csv')
39
+ extractor.col_sep = '|'
40
+ extractor.col_sep.must_equal '|'
41
+ end
42
+ end
76
43
 
77
- describe '#extract' do
78
- it 'yields each record hash and index' do
79
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
44
+ describe '#extract' do
45
+ it 'yields each record hash and index' do
46
+ extractor = CSVExtractor.new(@file)
80
47
 
81
- records = []
82
- indexes = []
83
- extractor.extract do |record, index|
84
- records << record
85
- indexes << index
86
- end
48
+ records = []
49
+ indexes = []
50
+ extractor.extract do |record, index|
51
+ records << record
52
+ indexes << index
53
+ end
87
54
 
88
- records.must_equal([
89
- { 'a' => '1', 'b' => '2' },
90
- { 'a' => '3', 'b' => '4' },
91
- { 'a' => '5', 'b' => '6' }
92
- ])
55
+ records.must_equal([
56
+ { 'a' => '1', 'b' => '2' },
57
+ { 'a' => '3', 'b' => '4' },
58
+ { 'a' => '5', 'b' => '6' }
59
+ ])
93
60
 
94
- indexes.must_equal [0, 1, 2]
61
+ indexes.must_equal [0, 1, 2]
62
+ end
95
63
  end
96
- end
97
64
 
98
- describe '#record_count' do
99
- it 'returns count of CSV rows' do
100
- extractor = Drudgery::Extractors::CSVExtractor.new('file.csv')
101
- extractor.record_count.must_equal 3
65
+ describe '#record_count' do
66
+ it 'returns count of CSV rows' do
67
+ extractor = CSVExtractor.new(@file)
68
+ extractor.record_count.must_equal 3
69
+ end
102
70
  end
103
71
  end
104
72
  end