chicago-etl 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +2 -2
- data/lib/chicago/etl/batch.rb +12 -0
- data/lib/chicago/etl/schema_table_stage_builder.rb +5 -2
- data/lib/chicago/etl/stage.rb +7 -7
- data/lib/chicago/etl/stage_builder.rb +2 -2
- data/lib/chicago/etl.rb +4 -4
- data/spec/etl/batch_spec.rb +5 -0
- data/spec/etl/define_dimension_stage_spec.rb +1 -1
- data/spec/etl/define_stage_spec.rb +4 -4
- data/spec/etl/execution_wrapper_spec.rb +3 -3
- data/spec/etl/mysql_integration_spec.rb +1 -1
- data/spec/etl/stage_spec.rb +4 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.3
|
1
|
+
0.2.4
|
data/chicago-etl.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.2.3"
|
8
|
+
s.version = "0.2.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
12
|
-
s.date = "2014-01-
|
12
|
+
s.date = "2014-01-07"
|
13
13
|
s.description = "ETL tools for Chicago"
|
14
14
|
s.email = "roland.swingler@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/chicago/etl/batch.rb
CHANGED
@@ -45,6 +45,18 @@ module Chicago
|
|
45
45
|
perform_task(:extract, task_name, &block)
|
46
46
|
end
|
47
47
|
|
48
|
+
# Marks this batch for re-extraction.
|
49
|
+
def reextract
|
50
|
+
@reextract = true
|
51
|
+
self
|
52
|
+
end
|
53
|
+
|
54
|
+
# Returns true when this batch should re-extract - i.e. load
|
55
|
+
# records without regard to creation/update times.
|
56
|
+
def reextracting?
|
57
|
+
!!@reextract
|
58
|
+
end
|
59
|
+
|
48
60
|
# Deprecated.
|
49
61
|
#
|
50
62
|
# @deprecated Use perform_task instead
|
@@ -33,8 +33,11 @@ module Chicago
|
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
36
|
-
@pre_execution_strategies << lambda {|stage, etl_batch
|
37
|
-
|
36
|
+
@pre_execution_strategies << lambda {|stage, etl_batch|
|
37
|
+
if etl_batch.reextracting? && stage.sink(:error)
|
38
|
+
stage.sink(:error).truncate
|
39
|
+
end
|
40
|
+
|
38
41
|
stage.sink(:default).
|
39
42
|
set_constant_values(:_inserted_at => Time.now)
|
40
43
|
}
|
data/lib/chicago/etl/stage.rb
CHANGED
@@ -34,9 +34,9 @@ module Chicago
|
|
34
34
|
end
|
35
35
|
|
36
36
|
# Executes this stage in the context of an ETL::Batch
|
37
|
-
def execute(etl_batch
|
38
|
-
prepare_stage(etl_batch
|
39
|
-
transform_and_load filtered_source(etl_batch
|
37
|
+
def execute(etl_batch)
|
38
|
+
prepare_stage(etl_batch)
|
39
|
+
transform_and_load filtered_source(etl_batch)
|
40
40
|
end
|
41
41
|
|
42
42
|
# Returns the named sink, if it exists
|
@@ -49,8 +49,8 @@ module Chicago
|
|
49
49
|
end
|
50
50
|
|
51
51
|
# @api private
|
52
|
-
def filtered_source(etl_batch
|
53
|
-
filtered_dataset =
|
52
|
+
def filtered_source(etl_batch)
|
53
|
+
filtered_dataset = etl_batch.reextracting? ? source :
|
54
54
|
@filter_strategy.call(source, etl_batch)
|
55
55
|
|
56
56
|
DatasetSource.new(filtered_dataset)
|
@@ -58,9 +58,9 @@ module Chicago
|
|
58
58
|
|
59
59
|
private
|
60
60
|
|
61
|
-
def prepare_stage(etl_batch
|
61
|
+
def prepare_stage(etl_batch)
|
62
62
|
@pre_execution_strategies.each do |strategy|
|
63
|
-
strategy.call(self, etl_batch
|
63
|
+
strategy.call(self, etl_batch)
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
@@ -26,13 +26,13 @@ module Chicago
|
|
26
26
|
# Specifies that the sinks should be truncated before loading
|
27
27
|
# data.
|
28
28
|
def truncate_pre_load
|
29
|
-
@pre_execution_strategies << lambda {|stage, etl_batch
|
29
|
+
@pre_execution_strategies << lambda {|stage, etl_batch|
|
30
30
|
stage.sinks.each {|sink| sink.truncate }
|
31
31
|
}
|
32
32
|
end
|
33
33
|
|
34
34
|
# Specifies that the dataset should never be filtered to the ETL
|
35
|
-
# batch - i.e. it should behave as if
|
35
|
+
# batch - i.e. it should behave as if the batch is reextracting
|
36
36
|
def full_reload
|
37
37
|
@filter_strategy = lambda {|dataset, etl_batch| dataset }
|
38
38
|
end
|
data/lib/chicago/etl.rb
CHANGED
@@ -59,12 +59,12 @@ module Chicago
|
|
59
59
|
# Tasks execution status is stored in a database etl task
|
60
60
|
# invocations table - this ensures tasks aren't run more than once
|
61
61
|
# within a batch.
|
62
|
-
def self.execute(stage, etl_batch,
|
62
|
+
def self.execute(stage, etl_batch, logger)
|
63
63
|
etl_batch.perform_task(:load, stage.name) do
|
64
64
|
if stage.executable?
|
65
|
-
logger.debug "Starting
|
66
|
-
stage.execute
|
67
|
-
logger.info "Finished
|
65
|
+
logger.debug "Starting executing stage: #{stage.name}"
|
66
|
+
stage.execute etl_batch
|
67
|
+
logger.info "Finished executing stage: #{stage.name}"
|
68
68
|
else
|
69
69
|
logger.info "Skipping stage #{stage.name}"
|
70
70
|
end
|
data/spec/etl/batch_spec.rb
CHANGED
@@ -84,4 +84,9 @@ describe Chicago::ETL::Batch do
|
|
84
84
|
batch = ETL::Batch.instance.start
|
85
85
|
lambda { batch.perform_task(:transform, "Test") {} }.should_not raise_error
|
86
86
|
end
|
87
|
+
|
88
|
+
it "can be marked as re-extracting" do
|
89
|
+
ETL::Batch.instance.reextract.should be_reextracting
|
90
|
+
ETL::Batch.instance.should_not be_reextracting
|
91
|
+
end
|
87
92
|
end
|
@@ -29,7 +29,7 @@ describe "defining and executing a stage" do
|
|
29
29
|
end
|
30
30
|
|
31
31
|
pipeline.stages.each do |stage|
|
32
|
-
stage.execute(double
|
32
|
+
stage.execute(double(:reextracting? => true))
|
33
33
|
end
|
34
34
|
|
35
35
|
stage = pipeline.stages.first
|
@@ -56,7 +56,7 @@ describe "defining and executing a stage" do
|
|
56
56
|
end
|
57
57
|
|
58
58
|
pipeline.stages.each do |stage|
|
59
|
-
stage.execute(double
|
59
|
+
stage.execute(double(:reextracting? => true))
|
60
60
|
end
|
61
61
|
|
62
62
|
stage = pipeline.stages.first
|
@@ -68,7 +68,7 @@ describe "defining and executing a stage" do
|
|
68
68
|
end
|
69
69
|
|
70
70
|
it "allows the source to be filtered via a filter strategy" do
|
71
|
-
etl_batch_double = double
|
71
|
+
etl_batch_double = double(:reextracting? => false)
|
72
72
|
fake_source = []
|
73
73
|
fake_source.should_receive(:another_dataset_method).and_return([])
|
74
74
|
|
@@ -88,7 +88,7 @@ describe "defining and executing a stage" do
|
|
88
88
|
end
|
89
89
|
|
90
90
|
pipeline.stages.each do |stage|
|
91
|
-
stage.execute(etl_batch_double
|
91
|
+
stage.execute(etl_batch_double)
|
92
92
|
end
|
93
93
|
end
|
94
94
|
end
|
@@ -15,13 +15,13 @@ describe "Chicago::ETL Execution method" do
|
|
15
15
|
stage.should_not_receive(:execute)
|
16
16
|
logger.should_receive(:info).with("Skipping stage test")
|
17
17
|
|
18
|
-
Chicago::ETL.execute(stage, batch,
|
18
|
+
Chicago::ETL.execute(stage, batch, logger)
|
19
19
|
end
|
20
20
|
|
21
21
|
it "executes the stage" do
|
22
22
|
stage = double(:stage, :executable? => true, :name => "test")
|
23
|
-
stage.should_receive(:execute).with(batch
|
23
|
+
stage.should_receive(:execute).with(batch)
|
24
24
|
|
25
|
-
Chicago::ETL.execute(stage, batch,
|
25
|
+
Chicago::ETL.execute(stage, batch, logger)
|
26
26
|
end
|
27
27
|
end
|
@@ -64,7 +64,7 @@ describe "Mysql -> Mysql through transformation chain" do
|
|
64
64
|
:other => sink_2
|
65
65
|
})
|
66
66
|
|
67
|
-
stage.execute(double(:etl_batch
|
67
|
+
stage.execute(double(:etl_batch, :reextracting? => true))
|
68
68
|
|
69
69
|
expected = [{:id => 1, :foo => nil, :bin => "1F"},
|
70
70
|
{:id => 2, :foo => "Hello", :bin => "1F"}]
|
data/spec/etl/stage_spec.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Chicago::ETL::Stage do
|
4
|
+
let(:etl_batch) { double(:etl_batch, :reextracting? => true) }
|
5
|
+
|
4
6
|
it "requires a source" do
|
5
7
|
expect {
|
6
8
|
described_class.new(:test,
|
@@ -23,7 +25,7 @@ describe Chicago::ETL::Stage do
|
|
23
25
|
:sinks => {:default => double(:sink)},
|
24
26
|
:filter_strategy => lambda { fail })
|
25
27
|
|
26
|
-
stage.filtered_source(
|
28
|
+
stage.filtered_source(etl_batch)
|
27
29
|
end
|
28
30
|
|
29
31
|
it "opens sinks before writing and closes them afterwards" do
|
@@ -35,6 +37,6 @@ describe Chicago::ETL::Stage do
|
|
35
37
|
:source => [],
|
36
38
|
:sinks => {:default => sink})
|
37
39
|
|
38
|
-
stage.execute(
|
40
|
+
stage.execute(etl_batch)
|
39
41
|
end
|
40
42
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 3
|
10
|
-
version: 0.2.3
|
9
|
+
- 4
|
10
|
+
version: 0.2.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2014-01-
|
18
|
+
date: 2014-01-07 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|