chicago-etl 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +2 -2
- data/lib/chicago/etl/batch.rb +12 -0
- data/lib/chicago/etl/schema_table_stage_builder.rb +5 -2
- data/lib/chicago/etl/stage.rb +7 -7
- data/lib/chicago/etl/stage_builder.rb +2 -2
- data/lib/chicago/etl.rb +4 -4
- data/spec/etl/batch_spec.rb +5 -0
- data/spec/etl/define_dimension_stage_spec.rb +1 -1
- data/spec/etl/define_stage_spec.rb +4 -4
- data/spec/etl/execution_wrapper_spec.rb +3 -3
- data/spec/etl/mysql_integration_spec.rb +1 -1
- data/spec/etl/stage_spec.rb +4 -2
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.3
|
1
|
+
0.2.4
|
data/chicago-etl.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.2.3"
|
8
|
+
s.version = "0.2.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
12
|
-
s.date = "2014-01-
|
12
|
+
s.date = "2014-01-07"
|
13
13
|
s.description = "ETL tools for Chicago"
|
14
14
|
s.email = "roland.swingler@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/chicago/etl/batch.rb
CHANGED
@@ -45,6 +45,18 @@ module Chicago
|
|
45
45
|
perform_task(:extract, task_name, &block)
|
46
46
|
end
|
47
47
|
|
48
|
+
# Marks this batch for re-extraction.
|
49
|
+
def reextract
|
50
|
+
@reextract = true
|
51
|
+
self
|
52
|
+
end
|
53
|
+
|
54
|
+
# Returns true when this batch should re-extract - i.e. load
|
55
|
+
# records without regard to creation/update times.
|
56
|
+
def reextracting?
|
57
|
+
!!@reextract
|
58
|
+
end
|
59
|
+
|
48
60
|
# Deprecated.
|
49
61
|
#
|
50
62
|
# @deprecated Use perform_task instead
|
@@ -33,8 +33,11 @@ module Chicago
|
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
36
|
-
@pre_execution_strategies << lambda {|stage, etl_batch
|
37
|
-
|
36
|
+
@pre_execution_strategies << lambda {|stage, etl_batch|
|
37
|
+
if etl_batch.reextracting? && stage.sink(:error)
|
38
|
+
stage.sink(:error).truncate
|
39
|
+
end
|
40
|
+
|
38
41
|
stage.sink(:default).
|
39
42
|
set_constant_values(:_inserted_at => Time.now)
|
40
43
|
}
|
data/lib/chicago/etl/stage.rb
CHANGED
@@ -34,9 +34,9 @@ module Chicago
|
|
34
34
|
end
|
35
35
|
|
36
36
|
# Executes this stage in the context of an ETL::Batch
|
37
|
-
def execute(etl_batch
|
38
|
-
prepare_stage(etl_batch
|
39
|
-
transform_and_load filtered_source(etl_batch
|
37
|
+
def execute(etl_batch)
|
38
|
+
prepare_stage(etl_batch)
|
39
|
+
transform_and_load filtered_source(etl_batch)
|
40
40
|
end
|
41
41
|
|
42
42
|
# Returns the named sink, if it exists
|
@@ -49,8 +49,8 @@ module Chicago
|
|
49
49
|
end
|
50
50
|
|
51
51
|
# @api private
|
52
|
-
def filtered_source(etl_batch
|
53
|
-
filtered_dataset =
|
52
|
+
def filtered_source(etl_batch)
|
53
|
+
filtered_dataset = etl_batch.reextracting? ? source :
|
54
54
|
@filter_strategy.call(source, etl_batch)
|
55
55
|
|
56
56
|
DatasetSource.new(filtered_dataset)
|
@@ -58,9 +58,9 @@ module Chicago
|
|
58
58
|
|
59
59
|
private
|
60
60
|
|
61
|
-
def prepare_stage(etl_batch
|
61
|
+
def prepare_stage(etl_batch)
|
62
62
|
@pre_execution_strategies.each do |strategy|
|
63
|
-
strategy.call(self, etl_batch
|
63
|
+
strategy.call(self, etl_batch)
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
@@ -26,13 +26,13 @@ module Chicago
|
|
26
26
|
# Specifies that the sinks should be truncated before loading
|
27
27
|
# data.
|
28
28
|
def truncate_pre_load
|
29
|
-
@pre_execution_strategies << lambda {|stage, etl_batch
|
29
|
+
@pre_execution_strategies << lambda {|stage, etl_batch|
|
30
30
|
stage.sinks.each {|sink| sink.truncate }
|
31
31
|
}
|
32
32
|
end
|
33
33
|
|
34
34
|
# Specifies that the dataset should never be filtered to the ETL
|
35
|
-
# batch - i.e. it should behave as if
|
35
|
+
# batch - i.e. it should behave as if the batch is reextracting
|
36
36
|
def full_reload
|
37
37
|
@filter_strategy = lambda {|dataset, etl_batch| dataset }
|
38
38
|
end
|
data/lib/chicago/etl.rb
CHANGED
@@ -59,12 +59,12 @@ module Chicago
|
|
59
59
|
# Tasks execution status is stored in a database etl task
|
60
60
|
# invocations table - this ensures tasks aren't run more than once
|
61
61
|
# within a batch.
|
62
|
-
def self.execute(stage, etl_batch,
|
62
|
+
def self.execute(stage, etl_batch, logger)
|
63
63
|
etl_batch.perform_task(:load, stage.name) do
|
64
64
|
if stage.executable?
|
65
|
-
logger.debug "Starting
|
66
|
-
stage.execute
|
67
|
-
logger.info "Finished
|
65
|
+
logger.debug "Starting executing stage: #{stage.name}"
|
66
|
+
stage.execute etl_batch
|
67
|
+
logger.info "Finished executing stage: #{stage.name}"
|
68
68
|
else
|
69
69
|
logger.info "Skipping stage #{stage.name}"
|
70
70
|
end
|
data/spec/etl/batch_spec.rb
CHANGED
@@ -84,4 +84,9 @@ describe Chicago::ETL::Batch do
|
|
84
84
|
batch = ETL::Batch.instance.start
|
85
85
|
lambda { batch.perform_task(:transform, "Test") {} }.should_not raise_error
|
86
86
|
end
|
87
|
+
|
88
|
+
it "can be marked as re-extracting" do
|
89
|
+
ETL::Batch.instance.reextract.should be_reextracting
|
90
|
+
ETL::Batch.instance.should_not be_reextracting
|
91
|
+
end
|
87
92
|
end
|
@@ -29,7 +29,7 @@ describe "defining and executing a stage" do
|
|
29
29
|
end
|
30
30
|
|
31
31
|
pipeline.stages.each do |stage|
|
32
|
-
stage.execute(double
|
32
|
+
stage.execute(double(:reextracting? => true))
|
33
33
|
end
|
34
34
|
|
35
35
|
stage = pipeline.stages.first
|
@@ -56,7 +56,7 @@ describe "defining and executing a stage" do
|
|
56
56
|
end
|
57
57
|
|
58
58
|
pipeline.stages.each do |stage|
|
59
|
-
stage.execute(double
|
59
|
+
stage.execute(double(:reextracting? => true))
|
60
60
|
end
|
61
61
|
|
62
62
|
stage = pipeline.stages.first
|
@@ -68,7 +68,7 @@ describe "defining and executing a stage" do
|
|
68
68
|
end
|
69
69
|
|
70
70
|
it "allows the source to be filtered via a filter strategy" do
|
71
|
-
etl_batch_double = double
|
71
|
+
etl_batch_double = double(:reextracting? => false)
|
72
72
|
fake_source = []
|
73
73
|
fake_source.should_receive(:another_dataset_method).and_return([])
|
74
74
|
|
@@ -88,7 +88,7 @@ describe "defining and executing a stage" do
|
|
88
88
|
end
|
89
89
|
|
90
90
|
pipeline.stages.each do |stage|
|
91
|
-
stage.execute(etl_batch_double
|
91
|
+
stage.execute(etl_batch_double)
|
92
92
|
end
|
93
93
|
end
|
94
94
|
end
|
@@ -15,13 +15,13 @@ describe "Chicago::ETL Execution method" do
|
|
15
15
|
stage.should_not_receive(:execute)
|
16
16
|
logger.should_receive(:info).with("Skipping stage test")
|
17
17
|
|
18
|
-
Chicago::ETL.execute(stage, batch,
|
18
|
+
Chicago::ETL.execute(stage, batch, logger)
|
19
19
|
end
|
20
20
|
|
21
21
|
it "executes the stage" do
|
22
22
|
stage = double(:stage, :executable? => true, :name => "test")
|
23
|
-
stage.should_receive(:execute).with(batch
|
23
|
+
stage.should_receive(:execute).with(batch)
|
24
24
|
|
25
|
-
Chicago::ETL.execute(stage, batch,
|
25
|
+
Chicago::ETL.execute(stage, batch, logger)
|
26
26
|
end
|
27
27
|
end
|
@@ -64,7 +64,7 @@ describe "Mysql -> Mysql through transformation chain" do
|
|
64
64
|
:other => sink_2
|
65
65
|
})
|
66
66
|
|
67
|
-
stage.execute(double(:etl_batch
|
67
|
+
stage.execute(double(:etl_batch, :reextracting? => true))
|
68
68
|
|
69
69
|
expected = [{:id => 1, :foo => nil, :bin => "1F"},
|
70
70
|
{:id => 2, :foo => "Hello", :bin => "1F"}]
|
data/spec/etl/stage_spec.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Chicago::ETL::Stage do
|
4
|
+
let(:etl_batch) { double(:etl_batch, :reextracting? => true) }
|
5
|
+
|
4
6
|
it "requires a source" do
|
5
7
|
expect {
|
6
8
|
described_class.new(:test,
|
@@ -23,7 +25,7 @@ describe Chicago::ETL::Stage do
|
|
23
25
|
:sinks => {:default => double(:sink)},
|
24
26
|
:filter_strategy => lambda { fail })
|
25
27
|
|
26
|
-
stage.filtered_source(
|
28
|
+
stage.filtered_source(etl_batch)
|
27
29
|
end
|
28
30
|
|
29
31
|
it "opens sinks before writing and closes them afterwards" do
|
@@ -35,6 +37,6 @@ describe Chicago::ETL::Stage do
|
|
35
37
|
:source => [],
|
36
38
|
:sinks => {:default => sink})
|
37
39
|
|
38
|
-
stage.execute(
|
40
|
+
stage.execute(etl_batch)
|
39
41
|
end
|
40
42
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 3
|
10
|
-
version: 0.2.3
|
9
|
+
- 4
|
10
|
+
version: 0.2.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2014-01-
|
18
|
+
date: 2014-01-07 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|