chicago-etl 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +2 -2
- data/lib/chicago/etl/dataset_batch_stage.rb +1 -13
- data/lib/chicago/etl/pipeline.rb +10 -6
- data/lib/chicago/etl/stage.rb +7 -0
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.2.1
|
data/chicago-etl.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.2.0"
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
12
|
-
s.date = "2013-11-
|
12
|
+
s.date = "2013-11-19"
|
13
13
|
s.description = "ETL tools for Chicago"
|
14
14
|
s.email = "roland.swingler@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/chicago/etl/dataset_batch_stage.rb
CHANGED
@@ -1,25 +1,13 @@
|
|
1
1
|
module Chicago
|
2
2
|
module ETL
|
3
|
-
# Links a PipelineStage to a Dataset.
|
4
|
-
#
|
5
3
|
# Allows deferring constructing a DatasetSource until extract
|
6
4
|
# time, so that it can be filtered to an ETL batch appropriately.
|
7
5
|
class DatasetBatchStage < Stage
|
8
|
-
attr_reader :name
|
9
|
-
|
10
|
-
def initialize(name, options={})
|
11
|
-
super
|
12
|
-
@filter_strategy = options[:filter_strategy] ||
|
13
|
-
lambda { |dataset, etl_batch| @source.filter_to_etl_batch(etl_batch)}
|
14
|
-
end
|
15
|
-
|
16
6
|
# Executes this ETL stage.
|
17
7
|
#
|
18
8
|
# Configures the dataset and flows rows into the pipeline.
|
19
9
|
def execute(etl_batch, reextract=false)
|
20
|
-
if
|
21
|
-
sinks.each {|sink| sink.truncate }
|
22
|
-
elsif reextract && sink(:error)
|
10
|
+
if reextract && sink(:error) && !truncate_pre_load?
|
23
11
|
sink(:error).truncate
|
24
12
|
end
|
25
13
|
|
data/lib/chicago/etl/pipeline.rb
CHANGED
@@ -65,13 +65,17 @@ module Chicago
|
|
65
65
|
pipeline do
|
66
66
|
end
|
67
67
|
end
|
68
|
-
DatasetBatchStage.new(name,
|
69
|
-
:source => @dataset,
|
70
|
-
:transformations => @sinks_and_transformations[:transformations],
|
71
|
-
:sinks => @sinks_and_transformations[:sinks],
|
72
|
-
:filter_strategy => @filter_strategy,
|
73
|
-
:truncate_pre_load => @truncate_pre_load)
|
74
68
|
|
69
|
+
@filter_strategy ||= lambda {|dataset, etl_batch|
|
70
|
+
dataset.filter_to_etl_batch(etl_batch)
|
71
|
+
}
|
72
|
+
|
73
|
+
DatasetBatchStage.new(name,
|
74
|
+
:source => @dataset,
|
75
|
+
:transformations => @sinks_and_transformations[:transformations],
|
76
|
+
:sinks => @sinks_and_transformations[:sinks],
|
77
|
+
:filter_strategy => @filter_strategy,
|
78
|
+
:truncate_pre_load => @truncate_pre_load)
|
75
79
|
end
|
76
80
|
|
77
81
|
protected
|
data/lib/chicago/etl/stage.rb
CHANGED
@@ -18,11 +18,18 @@ module Chicago
|
|
18
18
|
@transformations = options[:transformations] || []
|
19
19
|
@filter_strategy = options[:filter_strategy] ||
|
20
20
|
lambda {|source, _| source }
|
21
|
+
@truncate_pre_load = !!options[:truncate_pre_load]
|
21
22
|
|
22
23
|
validate_arguments
|
23
24
|
end
|
25
|
+
|
26
|
+
# Returns true if the sinks should be truncated pre-load.
|
27
|
+
def truncate_pre_load?
|
28
|
+
@truncate_pre_load
|
29
|
+
end
|
24
30
|
|
25
31
|
def execute(etl_batch, reextract=false)
|
32
|
+
sinks.each {|sink| sink.truncate } if truncate_pre_load?
|
26
33
|
transform_and_load filtered_source(etl_batch, reextract)
|
27
34
|
end
|
28
35
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 0
|
10
|
-
version: 0.2.0
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2013-11-
|
18
|
+
date: 2013-11-19 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|