chicago-etl 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/chicago-etl.gemspec +7 -2
- data/lib/chicago/etl/batch.rb +0 -21
- data/lib/chicago/etl/pipeline.rb +17 -1
- data/lib/chicago/etl/stage.rb +76 -0
- data/lib/chicago/etl/stage_builder.rb +69 -0
- data/lib/chicago/etl/transformations/deduplicate_rows.rb +40 -0
- data/lib/chicago/etl.rb +4 -0
- data/spec/etl/define_stage_spec.rb +114 -0
- data/spec/etl/transformations/deduplicate_rows_spec.rb +23 -0
- metadata +198 -184
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/chicago-etl.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.1.
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-11-07"
|
13
13
|
s.description = "ETL tools for Chicago"
|
14
14
|
s.email = "roland.swingler@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -43,10 +43,13 @@ Gem::Specification.new do |s|
|
|
43
43
|
"lib/chicago/etl/screens/out_of_bounds.rb",
|
44
44
|
"lib/chicago/etl/sequel/dependant_tables.rb",
|
45
45
|
"lib/chicago/etl/sequel/filter_to_etl_batch.rb",
|
46
|
+
"lib/chicago/etl/stage.rb",
|
47
|
+
"lib/chicago/etl/stage_builder.rb",
|
46
48
|
"lib/chicago/etl/table_builder.rb",
|
47
49
|
"lib/chicago/etl/task_invocation.rb",
|
48
50
|
"lib/chicago/etl/tasks.rb",
|
49
51
|
"lib/chicago/etl/transformations.rb",
|
52
|
+
"lib/chicago/etl/transformations/deduplicate_rows.rb",
|
50
53
|
"lib/chicago/etl/transformations/uk_post_code.rb",
|
51
54
|
"lib/chicago/etl/transformations/uk_post_code_field.rb",
|
52
55
|
"lib/chicago/flow/array_sink.rb",
|
@@ -68,6 +71,7 @@ Gem::Specification.new do |s|
|
|
68
71
|
"spec/etl/core_extensions_spec.rb",
|
69
72
|
"spec/etl/counter_spec.rb",
|
70
73
|
"spec/etl/dataset_batch_stage_spec.rb",
|
74
|
+
"spec/etl/define_stage_spec.rb",
|
71
75
|
"spec/etl/etl_batch_id_dataset_filter.rb",
|
72
76
|
"spec/etl/key_builder_spec.rb",
|
73
77
|
"spec/etl/load_dataset_builder_spec.rb",
|
@@ -80,6 +84,7 @@ Gem::Specification.new do |s|
|
|
80
84
|
"spec/etl/sequel/filter_to_etl_batch_spec.rb",
|
81
85
|
"spec/etl/table_builder_spec.rb",
|
82
86
|
"spec/etl/task_spec.rb",
|
87
|
+
"spec/etl/transformations/deduplicate_rows_spec.rb",
|
83
88
|
"spec/etl/transformations/uk_post_code_field_spec.rb",
|
84
89
|
"spec/etl/transformations/uk_post_code_spec.rb",
|
85
90
|
"spec/etl/transformations_spec.rb",
|
data/lib/chicago/etl/batch.rb
CHANGED
@@ -31,27 +31,6 @@ module Chicago
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
-
# Deprecated.
|
35
|
-
#
|
36
|
-
# @deprecated Use perform_task instead
|
37
|
-
def load(task_name, &block)
|
38
|
-
perform_task(:load, task_name, &block)
|
39
|
-
end
|
40
|
-
|
41
|
-
# Deprecated.
|
42
|
-
#
|
43
|
-
# @deprecated Use perform_task instead
|
44
|
-
def transform(task_name, &block)
|
45
|
-
perform_task(:extract, task_name, &block)
|
46
|
-
end
|
47
|
-
|
48
|
-
# Deprecated.
|
49
|
-
#
|
50
|
-
# @deprecated Use perform_task instead
|
51
|
-
def extract(task_name, &block)
|
52
|
-
perform_task(:extract, task_name, &block)
|
53
|
-
end
|
54
|
-
|
55
34
|
# Performs a named task if it hasn't already run successfully in
|
56
35
|
# this batch.
|
57
36
|
def perform_task(stage, task_name, &block)
|
data/lib/chicago/etl/pipeline.rb
CHANGED
@@ -8,11 +8,24 @@ module Chicago
|
|
8
8
|
# Returns all defined fact load tasks
|
9
9
|
attr_reader :load_facts
|
10
10
|
|
11
|
+
# Returns all the defined generic stages.
|
12
|
+
attr_reader :stages
|
13
|
+
|
11
14
|
# Creates a pipeline for a Schema.
|
12
15
|
def initialize(db, schema)
|
13
16
|
@schema, @db = schema, db
|
14
17
|
@load_dimensions = Chicago::Schema::NamedElementCollection.new
|
15
18
|
@load_facts = Chicago::Schema::NamedElementCollection.new
|
19
|
+
@stages = Chicago::Schema::NamedElementCollection.new
|
20
|
+
end
|
21
|
+
|
22
|
+
# Defines a generic stage in the pipeline.
|
23
|
+
def define_stage(name, &block)
|
24
|
+
@stages << build_schemaless_stage(name, &block)
|
25
|
+
end
|
26
|
+
|
27
|
+
def build_schemaless_stage(name, &block)
|
28
|
+
StageBuilder.new(@db).build(name, &block)
|
16
29
|
end
|
17
30
|
|
18
31
|
# Defines a dimension load stage
|
@@ -73,6 +86,7 @@ module Chicago
|
|
73
86
|
|
74
87
|
# Define elements of the pipeline. See LoadPipelineStageBuilder
|
75
88
|
# for details.
|
89
|
+
# TODO: rename pipeline => transforms below this method
|
76
90
|
def pipeline(&block)
|
77
91
|
@pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
|
78
92
|
build(&block)
|
@@ -81,9 +95,11 @@ module Chicago
|
|
81
95
|
# Defines the dataset, see DatasetBuilder .
|
82
96
|
#
|
83
97
|
# The block must return a Sequel::Dataset.
|
84
|
-
|
98
|
+
# TODO: rename dataset => source below this method, make generic
|
99
|
+
def source(&block)
|
85
100
|
@dataset = DatasetBuilder.new(@db).build(&block)
|
86
101
|
end
|
102
|
+
alias :dataset :source
|
87
103
|
|
88
104
|
# Define a custom filter strategy for filtering to an ETL batch.
|
89
105
|
def filter_strategy(&block)
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
class Stage
|
4
|
+
attr_reader :name
|
5
|
+
|
6
|
+
def initialize(name, options={})
|
7
|
+
@name = name
|
8
|
+
@source = options.fetch(:source)
|
9
|
+
raise ArgumentError, "Stage #{name} requires a source" unless @source
|
10
|
+
|
11
|
+
@sinks = options.fetch(:sinks)
|
12
|
+
raise ArgumentError, "Stage #{name} requires at least one sink" if @sinks.empty?
|
13
|
+
|
14
|
+
@transformations = options.fetch(:transformations)
|
15
|
+
@transformation_chain = Chicago::Flow::TransformationChain.
|
16
|
+
new(*@transformations)
|
17
|
+
|
18
|
+
@filter_strategy = options[:filter_strategy] ||
|
19
|
+
lambda {|source, _| source }
|
20
|
+
end
|
21
|
+
|
22
|
+
def execute(etl_batch, reextract)
|
23
|
+
modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
|
24
|
+
transform_and_load_from(modified_source)
|
25
|
+
end
|
26
|
+
|
27
|
+
def transform_and_load_from(source)
|
28
|
+
end
|
29
|
+
|
30
|
+
def reextract_and_filter_source(source, etl_batch, reextract=false)
|
31
|
+
if reextract
|
32
|
+
filtered_dataset = source
|
33
|
+
else
|
34
|
+
filtered_dataset = @filter_strategy.call(source, etl_batch)
|
35
|
+
end
|
36
|
+
Chicago::Flow::DatasetSource.new(filtered_dataset)
|
37
|
+
end
|
38
|
+
|
39
|
+
attr_reader :transformation_chain
|
40
|
+
|
41
|
+
# Returns the named sink, if it exists
|
42
|
+
def sink(name)
|
43
|
+
@sinks[name.to_sym]
|
44
|
+
end
|
45
|
+
|
46
|
+
def sinks
|
47
|
+
@sinks.values
|
48
|
+
end
|
49
|
+
|
50
|
+
def register_sink(name, sink)
|
51
|
+
@sinks[name.to_sym] = sink
|
52
|
+
self
|
53
|
+
end
|
54
|
+
|
55
|
+
def transform_and_load_from(source)
|
56
|
+
sinks.each(&:open)
|
57
|
+
pipe_rows_to_sinks_from(source)
|
58
|
+
sinks.each(&:close)
|
59
|
+
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
def pipe_rows_to_sinks_from(source)
|
64
|
+
source.each do |row|
|
65
|
+
transformation_chain.process(row).each {|row| process_row(row) }
|
66
|
+
end
|
67
|
+
transformation_chain.flush.each {|row| process_row(row) }
|
68
|
+
end
|
69
|
+
|
70
|
+
def process_row(row)
|
71
|
+
stream = row.delete(:_stream) || :default
|
72
|
+
@sinks[stream] << row
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
class StageBuilder
|
4
|
+
attr_reader :sink_factory
|
5
|
+
|
6
|
+
def initialize(db)
|
7
|
+
@db = db
|
8
|
+
end
|
9
|
+
|
10
|
+
def build(name, &block)
|
11
|
+
@sinks = {}
|
12
|
+
@transformations = []
|
13
|
+
|
14
|
+
instance_eval &block
|
15
|
+
|
16
|
+
Stage.new(name,
|
17
|
+
:source => @dataset,
|
18
|
+
:sinks => @sinks,
|
19
|
+
:transformations => @transformations,
|
20
|
+
:filter_strategy => @filter_strategy)
|
21
|
+
end
|
22
|
+
|
23
|
+
def source(&block)
|
24
|
+
@dataset = DatasetBuilder.new(@db).build(&block)
|
25
|
+
end
|
26
|
+
|
27
|
+
def transformations(klass=TransformationBuilder, &block)
|
28
|
+
@transformations = klass.new.build(&block)
|
29
|
+
end
|
30
|
+
|
31
|
+
def sinks(options={}, &block)
|
32
|
+
@sinks = SinkBuilder.new.build(&block)
|
33
|
+
end
|
34
|
+
|
35
|
+
# TODO: think of potentially better ways of dealing with this
|
36
|
+
# problem.
|
37
|
+
def filter_strategy(&block)
|
38
|
+
@filter_strategy = block
|
39
|
+
end
|
40
|
+
|
41
|
+
class TransformationBuilder
|
42
|
+
def build(&block)
|
43
|
+
@transformations = []
|
44
|
+
instance_eval(&block)
|
45
|
+
@transformations
|
46
|
+
end
|
47
|
+
|
48
|
+
def add(transformation)
|
49
|
+
@transformations << transformation
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
class SinkBuilder
|
54
|
+
def build(&block)
|
55
|
+
@sinks = {}
|
56
|
+
instance_eval(&block)
|
57
|
+
@sinks
|
58
|
+
end
|
59
|
+
|
60
|
+
protected
|
61
|
+
|
62
|
+
def add(sink, options={})
|
63
|
+
stream = options[:stream] || :default
|
64
|
+
@sinks[stream] = sink
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
class DeduplicateRows < Chicago::Flow::Transformation
|
4
|
+
def process_row(row)
|
5
|
+
if @working_row.nil?
|
6
|
+
@working_row = row
|
7
|
+
return
|
8
|
+
elsif same_row?(row)
|
9
|
+
@working_row = merge_rows(row)
|
10
|
+
return
|
11
|
+
else
|
12
|
+
assign_new_row_and_return_old_row(row)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def flush
|
17
|
+
@working_row.nil? ? [] : [@working_row]
|
18
|
+
end
|
19
|
+
|
20
|
+
protected
|
21
|
+
|
22
|
+
attr_reader :working_row
|
23
|
+
|
24
|
+
# This should be implemented by clients
|
25
|
+
def merge_rows(row)
|
26
|
+
end
|
27
|
+
|
28
|
+
# This should be implemented by clients
|
29
|
+
def same_row?(row)
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def assign_new_row_and_return_old_row(row)
|
35
|
+
row, @working_row = @working_row, row
|
36
|
+
row
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/chicago/etl.rb
CHANGED
@@ -29,6 +29,9 @@ require 'chicago/etl/dataset_batch_stage'
|
|
29
29
|
require 'chicago/etl/load_pipeline_stage_builder'
|
30
30
|
require 'chicago/etl/pipeline'
|
31
31
|
|
32
|
+
require 'chicago/etl/stage'
|
33
|
+
require 'chicago/etl/stage_builder'
|
34
|
+
|
32
35
|
# Sequel Extensions
|
33
36
|
require 'chicago/etl/sequel/filter_to_etl_batch'
|
34
37
|
require 'chicago/etl/sequel/dependant_tables'
|
@@ -40,6 +43,7 @@ require 'chicago/etl/screens/invalid_element'
|
|
40
43
|
require 'chicago/etl/screens/out_of_bounds'
|
41
44
|
|
42
45
|
# Transformations
|
46
|
+
require 'chicago/etl/transformations/deduplicate_rows'
|
43
47
|
require 'chicago/etl/transformations/uk_post_code'
|
44
48
|
require 'chicago/etl/transformations/uk_post_code_field'
|
45
49
|
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
class TestTransformation < Chicago::Flow::Transformation
|
4
|
+
def output_streams
|
5
|
+
[:another_stream]
|
6
|
+
end
|
7
|
+
|
8
|
+
def process_row(row)
|
9
|
+
[row, assign_stream({:some_field => "has an error value"}, :another_stream)]
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "defining and executing a stage" do
|
14
|
+
let(:rows) { [{:some_field => "value"}] }
|
15
|
+
let(:db) { double(:test_dataset_method => rows) }
|
16
|
+
let(:schema) { double }
|
17
|
+
let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema)}
|
18
|
+
|
19
|
+
it "allows no tranformations" do
|
20
|
+
pipeline.define_stage(:test_stage) do
|
21
|
+
source do
|
22
|
+
db.test_dataset_method
|
23
|
+
end
|
24
|
+
|
25
|
+
sinks do
|
26
|
+
add Chicago::Flow::ArraySink.new(:test)
|
27
|
+
add Chicago::Flow::ArraySink.new(:test), :stream => :another_stream
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
pipeline.stages.each do |stage|
|
32
|
+
stage.execute(double, true)
|
33
|
+
end
|
34
|
+
|
35
|
+
stage = pipeline.stages.first
|
36
|
+
stage.sink(:default).data.length.should == 1
|
37
|
+
stage.sink(:default).data.first.should == {:some_field => "value"}
|
38
|
+
|
39
|
+
stage.sink(:another_stream).data.length.should == 0
|
40
|
+
end
|
41
|
+
|
42
|
+
it "requires sinks" do
|
43
|
+
expect {
|
44
|
+
pipeline.define_stage(:test_stage) do
|
45
|
+
source do
|
46
|
+
db.test_dataset_method
|
47
|
+
end
|
48
|
+
end
|
49
|
+
}.to raise_error(ArgumentError)
|
50
|
+
end
|
51
|
+
|
52
|
+
it "requires sources" do
|
53
|
+
expect {
|
54
|
+
pipeline.define_stage(:test_stage) do
|
55
|
+
sinks do
|
56
|
+
add Chicago::Flow::ArraySink.new(:test)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
}.to raise_error(ArgumentError)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "glues the source, transformations, and sink correctly" do
|
63
|
+
pipeline.define_stage(:test_stage) do
|
64
|
+
source do
|
65
|
+
db.test_dataset_method
|
66
|
+
end
|
67
|
+
|
68
|
+
transformations do
|
69
|
+
add TestTransformation.new
|
70
|
+
end
|
71
|
+
|
72
|
+
sinks do
|
73
|
+
add Chicago::Flow::ArraySink.new(:test)
|
74
|
+
add Chicago::Flow::ArraySink.new(:test), :stream => :another_stream
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
pipeline.stages.each do |stage|
|
79
|
+
stage.execute(double, true)
|
80
|
+
end
|
81
|
+
|
82
|
+
stage = pipeline.stages.first
|
83
|
+
stage.sink(:default).data.length.should == 1
|
84
|
+
stage.sink(:default).data.first.should == {:some_field => "value"}
|
85
|
+
|
86
|
+
stage.sink(:another_stream).data.length.should == 1
|
87
|
+
stage.sink(:another_stream).data.first.should == {:some_field => "has an error value"}
|
88
|
+
end
|
89
|
+
|
90
|
+
it "allows the source to be filtered via a filter strategy" do
|
91
|
+
etl_batch_double = double
|
92
|
+
fake_source = []
|
93
|
+
|
94
|
+
fake_source.should_receive(:another_dataset_method).and_return([])
|
95
|
+
pipeline.define_stage(:test_stage) do
|
96
|
+
source do
|
97
|
+
fake_source
|
98
|
+
end
|
99
|
+
|
100
|
+
sinks do
|
101
|
+
add Chicago::Flow::ArraySink.new(:test)
|
102
|
+
end
|
103
|
+
|
104
|
+
filter_strategy do |source, etl_batch|
|
105
|
+
etl_batch.should == etl_batch_double
|
106
|
+
source.another_dataset_method
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
pipeline.stages.each do |stage|
|
111
|
+
stage.execute(etl_batch_double, false)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::DeduplicateRows do
|
4
|
+
it "deduplicates rows" do
|
5
|
+
class TestTransform < described_class
|
6
|
+
def merge_rows(row)
|
7
|
+
working_row.merge(row)
|
8
|
+
end
|
9
|
+
|
10
|
+
def same_row?(row)
|
11
|
+
working_row[:id] == row[:id]
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
transform = TestTransform.new
|
16
|
+
|
17
|
+
transform.process({:id => 1, :foo => :bar}).should be_blank
|
18
|
+
transform.process({:id => 1, :bar => :baz}).should be_blank
|
19
|
+
transform.process({:id => 2, :foo => :quux}).should == {:id => 1, :foo => :bar, :bar => :baz}
|
20
|
+
|
21
|
+
transform.flush.should == [{:id => 2, :foo => :quux}]
|
22
|
+
end
|
23
|
+
end
|
metadata
CHANGED
@@ -1,233 +1,248 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
|
-
- 0
|
10
|
-
version: 0.1.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Roland Swingler
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
12
|
+
date: 2013-11-07 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: chicagowarehouse
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
22
17
|
none: false
|
23
|
-
requirements:
|
18
|
+
requirements:
|
24
19
|
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
|
27
|
-
segments:
|
28
|
-
- 0
|
29
|
-
- 4
|
30
|
-
version: "0.4"
|
31
|
-
requirement: *id001
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0.4'
|
32
22
|
type: :runtime
|
33
23
|
prerelease: false
|
34
|
-
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
version_requirements: &id002 !ruby/object:Gem::Requirement
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
25
|
none: false
|
38
|
-
requirements:
|
39
|
-
- -
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
|
42
|
-
|
43
|
-
- 0
|
44
|
-
version: "0"
|
45
|
-
requirement: *id002
|
46
|
-
type: :runtime
|
47
|
-
prerelease: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0.4'
|
30
|
+
- !ruby/object:Gem::Dependency
|
48
31
|
name: fastercsv
|
49
|
-
|
50
|
-
version_requirements: &id003 !ruby/object:Gem::Requirement
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
51
33
|
none: false
|
52
|
-
requirements:
|
53
|
-
- -
|
54
|
-
- !ruby/object:Gem::Version
|
55
|
-
|
56
|
-
segments:
|
57
|
-
- 0
|
58
|
-
version: "0"
|
59
|
-
requirement: *id003
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
60
38
|
type: :runtime
|
61
39
|
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
62
47
|
name: sequel
|
63
|
-
|
64
|
-
version_requirements: &id004 !ruby/object:Gem::Requirement
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
65
49
|
none: false
|
66
|
-
requirements:
|
67
|
-
- -
|
68
|
-
- !ruby/object:Gem::Version
|
69
|
-
|
70
|
-
segments:
|
71
|
-
- 0
|
72
|
-
- 0
|
73
|
-
- 2
|
74
|
-
version: 0.0.2
|
75
|
-
requirement: *id004
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
76
54
|
type: :runtime
|
77
55
|
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
78
63
|
name: sequel_load_data_infile
|
79
|
-
|
80
|
-
version_requirements: &id005 !ruby/object:Gem::Requirement
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
81
65
|
none: false
|
82
|
-
requirements:
|
83
|
-
- -
|
84
|
-
- !ruby/object:Gem::Version
|
85
|
-
|
86
|
-
segments:
|
87
|
-
- 0
|
88
|
-
version: "0"
|
89
|
-
requirement: *id005
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.0.2
|
90
70
|
type: :runtime
|
91
71
|
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 0.0.2
|
78
|
+
- !ruby/object:Gem::Dependency
|
92
79
|
name: sequel_fast_columns
|
93
|
-
|
94
|
-
version_requirements: &id006 !ruby/object:Gem::Requirement
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
95
81
|
none: false
|
96
|
-
requirements:
|
97
|
-
- -
|
98
|
-
- !ruby/object:Gem::Version
|
99
|
-
|
100
|
-
|
101
|
-
- 2
|
102
|
-
version: "2"
|
103
|
-
requirement: *id006
|
104
|
-
type: :development
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
105
87
|
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
106
95
|
name: rspec
|
107
|
-
|
108
|
-
version_requirements: &id007 !ruby/object:Gem::Requirement
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
109
97
|
none: false
|
110
|
-
requirements:
|
111
|
-
- -
|
112
|
-
- !ruby/object:Gem::Version
|
113
|
-
|
114
|
-
segments:
|
115
|
-
- 0
|
116
|
-
version: "0"
|
117
|
-
requirement: *id007
|
98
|
+
requirements:
|
99
|
+
- - ~>
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '2'
|
118
102
|
type: :development
|
119
103
|
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ~>
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '2'
|
110
|
+
- !ruby/object:Gem::Dependency
|
120
111
|
name: timecop
|
121
|
-
|
122
|
-
version_requirements: &id008 !ruby/object:Gem::Requirement
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
123
113
|
none: false
|
124
|
-
requirements:
|
125
|
-
- -
|
126
|
-
- !ruby/object:Gem::Version
|
127
|
-
|
128
|
-
segments:
|
129
|
-
- 0
|
130
|
-
version: "0"
|
131
|
-
requirement: *id008
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
132
118
|
type: :development
|
133
119
|
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
134
127
|
name: yard
|
135
|
-
|
136
|
-
version_requirements: &id009 !ruby/object:Gem::Requirement
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
137
129
|
none: false
|
138
|
-
requirements:
|
139
|
-
- -
|
140
|
-
- !ruby/object:Gem::Version
|
141
|
-
|
142
|
-
segments:
|
143
|
-
- 0
|
144
|
-
version: "0"
|
145
|
-
requirement: *id009
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
146
134
|
type: :development
|
147
135
|
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
148
143
|
name: flog
|
149
|
-
|
150
|
-
version_requirements: &id010 !ruby/object:Gem::Requirement
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
151
145
|
none: false
|
152
|
-
requirements:
|
153
|
-
- -
|
154
|
-
- !ruby/object:Gem::Version
|
155
|
-
|
156
|
-
segments:
|
157
|
-
- 0
|
158
|
-
version: "0"
|
159
|
-
requirement: *id010
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
160
150
|
type: :development
|
161
151
|
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
158
|
+
- !ruby/object:Gem::Dependency
|
162
159
|
name: simplecov
|
163
|
-
|
164
|
-
version_requirements: &id011 !ruby/object:Gem::Requirement
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
165
161
|
none: false
|
166
|
-
requirements:
|
167
|
-
- -
|
168
|
-
- !ruby/object:Gem::Version
|
169
|
-
|
170
|
-
segments:
|
171
|
-
- 0
|
172
|
-
version: "0"
|
173
|
-
requirement: *id011
|
162
|
+
requirements:
|
163
|
+
- - ! '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
174
166
|
type: :development
|
175
167
|
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
- !ruby/object:Gem::Dependency
|
176
175
|
name: ZenTest
|
177
|
-
|
178
|
-
version_requirements: &id012 !ruby/object:Gem::Requirement
|
176
|
+
requirement: !ruby/object:Gem::Requirement
|
179
177
|
none: false
|
180
|
-
requirements:
|
181
|
-
- -
|
182
|
-
- !ruby/object:Gem::Version
|
183
|
-
|
184
|
-
segments:
|
185
|
-
- 2
|
186
|
-
- 8
|
187
|
-
- 1
|
188
|
-
version: 2.8.1
|
189
|
-
requirement: *id012
|
178
|
+
requirements:
|
179
|
+
- - ! '>='
|
180
|
+
- !ruby/object:Gem::Version
|
181
|
+
version: '0'
|
190
182
|
type: :development
|
191
183
|
prerelease: false
|
184
|
+
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ! '>='
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '0'
|
190
|
+
- !ruby/object:Gem::Dependency
|
192
191
|
name: mysql
|
193
|
-
|
194
|
-
version_requirements: &id013 !ruby/object:Gem::Requirement
|
192
|
+
requirement: !ruby/object:Gem::Requirement
|
195
193
|
none: false
|
196
|
-
requirements:
|
197
|
-
- -
|
198
|
-
- !ruby/object:Gem::Version
|
199
|
-
|
200
|
-
segments:
|
201
|
-
- 1
|
202
|
-
version: "1"
|
203
|
-
requirement: *id013
|
194
|
+
requirements:
|
195
|
+
- - '='
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
version: 2.8.1
|
204
198
|
type: :development
|
205
199
|
prerelease: false
|
200
|
+
version_requirements: !ruby/object:Gem::Requirement
|
201
|
+
none: false
|
202
|
+
requirements:
|
203
|
+
- - '='
|
204
|
+
- !ruby/object:Gem::Version
|
205
|
+
version: 2.8.1
|
206
|
+
- !ruby/object:Gem::Dependency
|
206
207
|
name: bundler
|
207
|
-
|
208
|
-
version_requirements: &id014 !ruby/object:Gem::Requirement
|
208
|
+
requirement: !ruby/object:Gem::Requirement
|
209
209
|
none: false
|
210
|
-
requirements:
|
211
|
-
- -
|
212
|
-
- !ruby/object:Gem::Version
|
213
|
-
|
214
|
-
segments:
|
215
|
-
- 0
|
216
|
-
version: "0"
|
217
|
-
requirement: *id014
|
210
|
+
requirements:
|
211
|
+
- - ~>
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: '1'
|
218
214
|
type: :development
|
219
215
|
prerelease: false
|
216
|
+
version_requirements: !ruby/object:Gem::Requirement
|
217
|
+
none: false
|
218
|
+
requirements:
|
219
|
+
- - ~>
|
220
|
+
- !ruby/object:Gem::Version
|
221
|
+
version: '1'
|
222
|
+
- !ruby/object:Gem::Dependency
|
220
223
|
name: jeweler
|
224
|
+
requirement: !ruby/object:Gem::Requirement
|
225
|
+
none: false
|
226
|
+
requirements:
|
227
|
+
- - ! '>='
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '0'
|
230
|
+
type: :development
|
231
|
+
prerelease: false
|
232
|
+
version_requirements: !ruby/object:Gem::Requirement
|
233
|
+
none: false
|
234
|
+
requirements:
|
235
|
+
- - ! '>='
|
236
|
+
- !ruby/object:Gem::Version
|
237
|
+
version: '0'
|
221
238
|
description: ETL tools for Chicago
|
222
239
|
email: roland.swingler@gmail.com
|
223
240
|
executables: []
|
224
|
-
|
225
241
|
extensions: []
|
226
|
-
|
227
|
-
extra_rdoc_files:
|
242
|
+
extra_rdoc_files:
|
228
243
|
- LICENSE.txt
|
229
244
|
- README.rdoc
|
230
|
-
files:
|
245
|
+
files:
|
231
246
|
- .document
|
232
247
|
- .rspec
|
233
248
|
- Gemfile
|
@@ -254,10 +269,13 @@ files:
|
|
254
269
|
- lib/chicago/etl/screens/out_of_bounds.rb
|
255
270
|
- lib/chicago/etl/sequel/dependant_tables.rb
|
256
271
|
- lib/chicago/etl/sequel/filter_to_etl_batch.rb
|
272
|
+
- lib/chicago/etl/stage.rb
|
273
|
+
- lib/chicago/etl/stage_builder.rb
|
257
274
|
- lib/chicago/etl/table_builder.rb
|
258
275
|
- lib/chicago/etl/task_invocation.rb
|
259
276
|
- lib/chicago/etl/tasks.rb
|
260
277
|
- lib/chicago/etl/transformations.rb
|
278
|
+
- lib/chicago/etl/transformations/deduplicate_rows.rb
|
261
279
|
- lib/chicago/etl/transformations/uk_post_code.rb
|
262
280
|
- lib/chicago/etl/transformations/uk_post_code_field.rb
|
263
281
|
- lib/chicago/flow/array_sink.rb
|
@@ -279,6 +297,7 @@ files:
|
|
279
297
|
- spec/etl/core_extensions_spec.rb
|
280
298
|
- spec/etl/counter_spec.rb
|
281
299
|
- spec/etl/dataset_batch_stage_spec.rb
|
300
|
+
- spec/etl/define_stage_spec.rb
|
282
301
|
- spec/etl/etl_batch_id_dataset_filter.rb
|
283
302
|
- spec/etl/key_builder_spec.rb
|
284
303
|
- spec/etl/load_dataset_builder_spec.rb
|
@@ -291,6 +310,7 @@ files:
|
|
291
310
|
- spec/etl/sequel/filter_to_etl_batch_spec.rb
|
292
311
|
- spec/etl/table_builder_spec.rb
|
293
312
|
- spec/etl/task_spec.rb
|
313
|
+
- spec/etl/transformations/deduplicate_rows_spec.rb
|
294
314
|
- spec/etl/transformations/uk_post_code_field_spec.rb
|
295
315
|
- spec/etl/transformations/uk_post_code_spec.rb
|
296
316
|
- spec/etl/transformations_spec.rb
|
@@ -306,37 +326,31 @@ files:
|
|
306
326
|
- spec/flow/transformation_spec.rb
|
307
327
|
- spec/spec_helper.rb
|
308
328
|
homepage: http://github.com/notonthehighstreet/chicago-etl
|
309
|
-
licenses:
|
329
|
+
licenses:
|
310
330
|
- MIT
|
311
331
|
post_install_message:
|
312
332
|
rdoc_options: []
|
313
|
-
|
314
|
-
require_paths:
|
333
|
+
require_paths:
|
315
334
|
- lib
|
316
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
335
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
317
336
|
none: false
|
318
|
-
requirements:
|
319
|
-
- -
|
320
|
-
- !ruby/object:Gem::Version
|
321
|
-
|
322
|
-
segments:
|
337
|
+
requirements:
|
338
|
+
- - ! '>='
|
339
|
+
- !ruby/object:Gem::Version
|
340
|
+
version: '0'
|
341
|
+
segments:
|
323
342
|
- 0
|
324
|
-
|
325
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
343
|
+
hash: -2054734000096616506
|
344
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
326
345
|
none: false
|
327
|
-
requirements:
|
328
|
-
- -
|
329
|
-
- !ruby/object:Gem::Version
|
330
|
-
|
331
|
-
segments:
|
332
|
-
- 0
|
333
|
-
version: "0"
|
346
|
+
requirements:
|
347
|
+
- - ! '>='
|
348
|
+
- !ruby/object:Gem::Version
|
349
|
+
version: '0'
|
334
350
|
requirements: []
|
335
|
-
|
336
351
|
rubyforge_project:
|
337
352
|
rubygems_version: 1.8.25
|
338
353
|
signing_key:
|
339
354
|
specification_version: 3
|
340
355
|
summary: Chicago ETL
|
341
356
|
test_files: []
|
342
|
-
|