chicago-etl 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.2.4
data/chicago-etl.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "chicago-etl"
8
- s.version = "0.2.3"
8
+ s.version = "0.2.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roland Swingler"]
12
- s.date = "2014-01-06"
12
+ s.date = "2014-01-07"
13
13
  s.description = "ETL tools for Chicago"
14
14
  s.email = "roland.swingler@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -45,6 +45,18 @@ module Chicago
45
45
  perform_task(:extract, task_name, &block)
46
46
  end
47
47
 
48
+ # Marks this batch for re-extraction.
49
+ def reextract
50
+ @reextract = true
51
+ self
52
+ end
53
+
54
+ # Returns true when this batch should re-extract - i.e. load
55
+ # records without regard to creation/update times.
56
+ def reextracting?
57
+ !!@reextract
58
+ end
59
+
48
60
  # Deprecated.
49
61
  #
50
62
  # @deprecated Use perform_task instead
@@ -33,8 +33,11 @@ module Chicago
33
33
  end
34
34
  end
35
35
 
36
- @pre_execution_strategies << lambda {|stage, etl_batch, reextract|
37
- stage.sink(:error).truncate if reextract && stage.sink(:error)
36
+ @pre_execution_strategies << lambda {|stage, etl_batch|
37
+ if etl_batch.reextracting? && stage.sink(:error)
38
+ stage.sink(:error).truncate
39
+ end
40
+
38
41
  stage.sink(:default).
39
42
  set_constant_values(:_inserted_at => Time.now)
40
43
  }
@@ -34,9 +34,9 @@ module Chicago
34
34
  end
35
35
 
36
36
  # Executes this stage in the context of an ETL::Batch
37
- def execute(etl_batch, reextract=false)
38
- prepare_stage(etl_batch, reextract)
39
- transform_and_load filtered_source(etl_batch, reextract)
37
+ def execute(etl_batch)
38
+ prepare_stage(etl_batch)
39
+ transform_and_load filtered_source(etl_batch)
40
40
  end
41
41
 
42
42
  # Returns the named sink, if it exists
@@ -49,8 +49,8 @@ module Chicago
49
49
  end
50
50
 
51
51
  # @api private
52
- def filtered_source(etl_batch, reextract=false)
53
- filtered_dataset = reextract ? source :
52
+ def filtered_source(etl_batch)
53
+ filtered_dataset = etl_batch.reextracting? ? source :
54
54
  @filter_strategy.call(source, etl_batch)
55
55
 
56
56
  DatasetSource.new(filtered_dataset)
@@ -58,9 +58,9 @@ module Chicago
58
58
 
59
59
  private
60
60
 
61
- def prepare_stage(etl_batch, reextract)
61
+ def prepare_stage(etl_batch)
62
62
  @pre_execution_strategies.each do |strategy|
63
- strategy.call(self, etl_batch, reextract)
63
+ strategy.call(self, etl_batch)
64
64
  end
65
65
  end
66
66
 
@@ -26,13 +26,13 @@ module Chicago
26
26
  # Specifies that the sinks should be truncated before loading
27
27
  # data.
28
28
  def truncate_pre_load
29
- @pre_execution_strategies << lambda {|stage, etl_batch, reextract|
29
+ @pre_execution_strategies << lambda {|stage, etl_batch|
30
30
  stage.sinks.each {|sink| sink.truncate }
31
31
  }
32
32
  end
33
33
 
34
34
  # Specifies that the dataset should never be filtered to the ETL
35
- # batch - i.e. it should behave as if reextract was always true
35
+ # batch - i.e. it should behave as if the batch is reextracting
36
36
  def full_reload
37
37
  @filter_strategy = lambda {|dataset, etl_batch| dataset }
38
38
  end
data/lib/chicago/etl.rb CHANGED
@@ -59,12 +59,12 @@ module Chicago
59
59
  # Task execution status is stored in a database etl task
60
60
  # invocations table - this ensures tasks aren't run more than once
61
61
  # within a batch.
62
- def self.execute(stage, etl_batch, reextract, logger)
62
+ def self.execute(stage, etl_batch, logger)
63
63
  etl_batch.perform_task(:load, stage.name) do
64
64
  if stage.executable?
65
- logger.debug "Starting loading #{stage.name}"
66
- stage.execute(etl_batch, reextract)
67
- logger.info "Finished loading #{stage.name}"
65
+ logger.debug "Starting executing stage: #{stage.name}"
66
+ stage.execute etl_batch
67
+ logger.info "Finished executing stage: #{stage.name}"
68
68
  else
69
69
  logger.info "Skipping stage #{stage.name}"
70
70
  end
@@ -84,4 +84,9 @@ describe Chicago::ETL::Batch do
84
84
  batch = ETL::Batch.instance.start
85
85
  lambda { batch.perform_task(:transform, "Test") {} }.should_not raise_error
86
86
  end
87
+
88
+ it "can be marked as re-extracting" do
89
+ ETL::Batch.instance.reextract.should be_reextracting
90
+ ETL::Batch.instance.should_not be_reextracting
91
+ end
87
92
  end
@@ -25,7 +25,7 @@ describe "creating and running a dimension stage" do
25
25
  end
26
26
 
27
27
  pipeline.stages.each do |stage|
28
- stage.execute(double, true)
28
+ stage.execute(double(:reextracting? => true))
29
29
  end
30
30
  end
31
31
 
@@ -29,7 +29,7 @@ describe "defining and executing a stage" do
29
29
  end
30
30
 
31
31
  pipeline.stages.each do |stage|
32
- stage.execute(double, true)
32
+ stage.execute(double(:reextracting? => true))
33
33
  end
34
34
 
35
35
  stage = pipeline.stages.first
@@ -56,7 +56,7 @@ describe "defining and executing a stage" do
56
56
  end
57
57
 
58
58
  pipeline.stages.each do |stage|
59
- stage.execute(double, true)
59
+ stage.execute(double(:reextracting? => true))
60
60
  end
61
61
 
62
62
  stage = pipeline.stages.first
@@ -68,7 +68,7 @@ describe "defining and executing a stage" do
68
68
  end
69
69
 
70
70
  it "allows the source to be filtered via a filter strategy" do
71
- etl_batch_double = double
71
+ etl_batch_double = double(:reextracting? => false)
72
72
  fake_source = []
73
73
  fake_source.should_receive(:another_dataset_method).and_return([])
74
74
 
@@ -88,7 +88,7 @@ describe "defining and executing a stage" do
88
88
  end
89
89
 
90
90
  pipeline.stages.each do |stage|
91
- stage.execute(etl_batch_double, false)
91
+ stage.execute(etl_batch_double)
92
92
  end
93
93
  end
94
94
  end
@@ -15,13 +15,13 @@ describe "Chicago::ETL Execution method" do
15
15
  stage.should_not_receive(:execute)
16
16
  logger.should_receive(:info).with("Skipping stage test")
17
17
 
18
- Chicago::ETL.execute(stage, batch, false, logger)
18
+ Chicago::ETL.execute(stage, batch, logger)
19
19
  end
20
20
 
21
21
  it "executes the stage" do
22
22
  stage = double(:stage, :executable? => true, :name => "test")
23
- stage.should_receive(:execute).with(batch, false)
23
+ stage.should_receive(:execute).with(batch)
24
24
 
25
- Chicago::ETL.execute(stage, batch, false, logger)
25
+ Chicago::ETL.execute(stage, batch, logger)
26
26
  end
27
27
  end
@@ -64,7 +64,7 @@ describe "Mysql -> Mysql through transformation chain" do
64
64
  :other => sink_2
65
65
  })
66
66
 
67
- stage.execute(double(:etl_batch), true)
67
+ stage.execute(double(:etl_batch, :reextracting? => true))
68
68
 
69
69
  expected = [{:id => 1, :foo => nil, :bin => "1F"},
70
70
  {:id => 2, :foo => "Hello", :bin => "1F"}]
@@ -1,6 +1,8 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Chicago::ETL::Stage do
4
+ let(:etl_batch) { double(:etl_batch, :reextracting? => true) }
5
+
4
6
  it "requires a source" do
5
7
  expect {
6
8
  described_class.new(:test,
@@ -23,7 +25,7 @@ describe Chicago::ETL::Stage do
23
25
  :sinks => {:default => double(:sink)},
24
26
  :filter_strategy => lambda { fail })
25
27
 
26
- stage.filtered_source(double(:etl_batch), true)
28
+ stage.filtered_source(etl_batch)
27
29
  end
28
30
 
29
31
  it "opens sinks before writing and closes them afterwards" do
@@ -35,6 +37,6 @@ describe Chicago::ETL::Stage do
35
37
  :source => [],
36
38
  :sinks => {:default => sink})
37
39
 
38
- stage.execute(double(:etl_batch), true)
40
+ stage.execute(etl_batch)
39
41
  end
40
42
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chicago-etl
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 3
10
- version: 0.2.3
9
+ - 4
10
+ version: 0.2.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Roland Swingler
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2014-01-06 00:00:00 Z
18
+ date: 2014-01-07 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement