chicago-etl 0.2.3 → 0.2.4

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.2.4
data/chicago-etl.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "chicago-etl"
8
- s.version = "0.2.3"
8
+ s.version = "0.2.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roland Swingler"]
12
- s.date = "2014-01-06"
12
+ s.date = "2014-01-07"
13
13
  s.description = "ETL tools for Chicago"
14
14
  s.email = "roland.swingler@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -45,6 +45,18 @@ module Chicago
45
45
  perform_task(:extract, task_name, &block)
46
46
  end
47
47
 
48
+ # Marks this batch for re-extraction.
49
+ def reextract
50
+ @reextract = true
51
+ self
52
+ end
53
+
54
+ # Returns true when this batch should re-extract - i.e. load
55
+ # records without regard to creation/update times.
56
+ def reextracting?
57
+ !!@reextract
58
+ end
59
+
48
60
  # Deprecated.
49
61
  #
50
62
  # @deprecated Use perform_task instead
@@ -33,8 +33,11 @@ module Chicago
33
33
  end
34
34
  end
35
35
 
36
- @pre_execution_strategies << lambda {|stage, etl_batch, reextract|
37
- stage.sink(:error).truncate if reextract && stage.sink(:error)
36
+ @pre_execution_strategies << lambda {|stage, etl_batch|
37
+ if etl_batch.reextracting? && stage.sink(:error)
38
+ stage.sink(:error).truncate
39
+ end
40
+
38
41
  stage.sink(:default).
39
42
  set_constant_values(:_inserted_at => Time.now)
40
43
  }
@@ -34,9 +34,9 @@ module Chicago
34
34
  end
35
35
 
36
36
  # Executes this stage in the context of an ETL::Batch
37
- def execute(etl_batch, reextract=false)
38
- prepare_stage(etl_batch, reextract)
39
- transform_and_load filtered_source(etl_batch, reextract)
37
+ def execute(etl_batch)
38
+ prepare_stage(etl_batch)
39
+ transform_and_load filtered_source(etl_batch)
40
40
  end
41
41
 
42
42
  # Returns the named sink, if it exists
@@ -49,8 +49,8 @@ module Chicago
49
49
  end
50
50
 
51
51
  # @api private
52
- def filtered_source(etl_batch, reextract=false)
53
- filtered_dataset = reextract ? source :
52
+ def filtered_source(etl_batch)
53
+ filtered_dataset = etl_batch.reextracting? ? source :
54
54
  @filter_strategy.call(source, etl_batch)
55
55
 
56
56
  DatasetSource.new(filtered_dataset)
@@ -58,9 +58,9 @@ module Chicago
58
58
 
59
59
  private
60
60
 
61
- def prepare_stage(etl_batch, reextract)
61
+ def prepare_stage(etl_batch)
62
62
  @pre_execution_strategies.each do |strategy|
63
- strategy.call(self, etl_batch, reextract)
63
+ strategy.call(self, etl_batch)
64
64
  end
65
65
  end
66
66
 
@@ -26,13 +26,13 @@ module Chicago
26
26
  # Specifies that the sinks should be truncated before loading
27
27
  # data.
28
28
  def truncate_pre_load
29
- @pre_execution_strategies << lambda {|stage, etl_batch, reextract|
29
+ @pre_execution_strategies << lambda {|stage, etl_batch|
30
30
  stage.sinks.each {|sink| sink.truncate }
31
31
  }
32
32
  end
33
33
 
34
34
  # Specifies that the dataset should never be filtered to the ETL
35
- # batch - i.e. it should behave as if reextract was always true
35
+ # batch - i.e. it should behave as if the batch is reextracting
36
36
  def full_reload
37
37
  @filter_strategy = lambda {|dataset, etl_batch| dataset }
38
38
  end
data/lib/chicago/etl.rb CHANGED
@@ -59,12 +59,12 @@ module Chicago
59
59
  # Tasks execution status is stored in a database etl task
60
60
  # invocations table - this ensures tasks aren't run more than once
61
61
  # within a batch.
62
- def self.execute(stage, etl_batch, reextract, logger)
62
+ def self.execute(stage, etl_batch, logger)
63
63
  etl_batch.perform_task(:load, stage.name) do
64
64
  if stage.executable?
65
- logger.debug "Starting loading #{stage.name}"
66
- stage.execute(etl_batch, reextract)
67
- logger.info "Finished loading #{stage.name}"
65
+ logger.debug "Starting executing stage: #{stage.name}"
66
+ stage.execute etl_batch
67
+ logger.info "Finished executing stage: #{stage.name}"
68
68
  else
69
69
  logger.info "Skipping stage #{stage.name}"
70
70
  end
@@ -84,4 +84,9 @@ describe Chicago::ETL::Batch do
84
84
  batch = ETL::Batch.instance.start
85
85
  lambda { batch.perform_task(:transform, "Test") {} }.should_not raise_error
86
86
  end
87
+
88
+ it "can be marked as re-extracting" do
89
+ ETL::Batch.instance.reextract.should be_reextracting
90
+ ETL::Batch.instance.should_not be_reextracting
91
+ end
87
92
  end
@@ -25,7 +25,7 @@ describe "creating and running a dimension stage" do
25
25
  end
26
26
 
27
27
  pipeline.stages.each do |stage|
28
- stage.execute(double, true)
28
+ stage.execute(double(:reextracting? => true))
29
29
  end
30
30
  end
31
31
 
@@ -29,7 +29,7 @@ describe "defining and executing a stage" do
29
29
  end
30
30
 
31
31
  pipeline.stages.each do |stage|
32
- stage.execute(double, true)
32
+ stage.execute(double(:reextracting? => true))
33
33
  end
34
34
 
35
35
  stage = pipeline.stages.first
@@ -56,7 +56,7 @@ describe "defining and executing a stage" do
56
56
  end
57
57
 
58
58
  pipeline.stages.each do |stage|
59
- stage.execute(double, true)
59
+ stage.execute(double(:reextracting? => true))
60
60
  end
61
61
 
62
62
  stage = pipeline.stages.first
@@ -68,7 +68,7 @@ describe "defining and executing a stage" do
68
68
  end
69
69
 
70
70
  it "allows the source to be filtered via a filter strategy" do
71
- etl_batch_double = double
71
+ etl_batch_double = double(:reextracting? => false)
72
72
  fake_source = []
73
73
  fake_source.should_receive(:another_dataset_method).and_return([])
74
74
 
@@ -88,7 +88,7 @@ describe "defining and executing a stage" do
88
88
  end
89
89
 
90
90
  pipeline.stages.each do |stage|
91
- stage.execute(etl_batch_double, false)
91
+ stage.execute(etl_batch_double)
92
92
  end
93
93
  end
94
94
  end
@@ -15,13 +15,13 @@ describe "Chicago::ETL Execution method" do
15
15
  stage.should_not_receive(:execute)
16
16
  logger.should_receive(:info).with("Skipping stage test")
17
17
 
18
- Chicago::ETL.execute(stage, batch, false, logger)
18
+ Chicago::ETL.execute(stage, batch, logger)
19
19
  end
20
20
 
21
21
  it "executes the stage" do
22
22
  stage = double(:stage, :executable? => true, :name => "test")
23
- stage.should_receive(:execute).with(batch, false)
23
+ stage.should_receive(:execute).with(batch)
24
24
 
25
- Chicago::ETL.execute(stage, batch, false, logger)
25
+ Chicago::ETL.execute(stage, batch, logger)
26
26
  end
27
27
  end
@@ -64,7 +64,7 @@ describe "Mysql -> Mysql through transformation chain" do
64
64
  :other => sink_2
65
65
  })
66
66
 
67
- stage.execute(double(:etl_batch), true)
67
+ stage.execute(double(:etl_batch, :reextracting? => true))
68
68
 
69
69
  expected = [{:id => 1, :foo => nil, :bin => "1F"},
70
70
  {:id => 2, :foo => "Hello", :bin => "1F"}]
@@ -1,6 +1,8 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Chicago::ETL::Stage do
4
+ let(:etl_batch) { double(:etl_batch, :reextracting? => true) }
5
+
4
6
  it "requires a source" do
5
7
  expect {
6
8
  described_class.new(:test,
@@ -23,7 +25,7 @@ describe Chicago::ETL::Stage do
23
25
  :sinks => {:default => double(:sink)},
24
26
  :filter_strategy => lambda { fail })
25
27
 
26
- stage.filtered_source(double(:etl_batch), true)
28
+ stage.filtered_source(etl_batch)
27
29
  end
28
30
 
29
31
  it "opens sinks before writing and closes them afterwards" do
@@ -35,6 +37,6 @@ describe Chicago::ETL::Stage do
35
37
  :source => [],
36
38
  :sinks => {:default => sink})
37
39
 
38
- stage.execute(double(:etl_batch), true)
40
+ stage.execute(etl_batch)
39
41
  end
40
42
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chicago-etl
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 3
10
- version: 0.2.3
9
+ - 4
10
+ version: 0.2.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Roland Swingler
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2014-01-06 00:00:00 Z
18
+ date: 2014-01-07 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement