chicago-etl 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.1
data/chicago-etl.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "chicago-etl"
8
- s.version = "0.1.0"
8
+ s.version = "0.1.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roland Swingler"]
12
- s.date = "2013-09-05"
12
+ s.date = "2013-11-07"
13
13
  s.description = "ETL tools for Chicago"
14
14
  s.email = "roland.swingler@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -43,10 +43,13 @@ Gem::Specification.new do |s|
43
43
  "lib/chicago/etl/screens/out_of_bounds.rb",
44
44
  "lib/chicago/etl/sequel/dependant_tables.rb",
45
45
  "lib/chicago/etl/sequel/filter_to_etl_batch.rb",
46
+ "lib/chicago/etl/stage.rb",
47
+ "lib/chicago/etl/stage_builder.rb",
46
48
  "lib/chicago/etl/table_builder.rb",
47
49
  "lib/chicago/etl/task_invocation.rb",
48
50
  "lib/chicago/etl/tasks.rb",
49
51
  "lib/chicago/etl/transformations.rb",
52
+ "lib/chicago/etl/transformations/deduplicate_rows.rb",
50
53
  "lib/chicago/etl/transformations/uk_post_code.rb",
51
54
  "lib/chicago/etl/transformations/uk_post_code_field.rb",
52
55
  "lib/chicago/flow/array_sink.rb",
@@ -68,6 +71,7 @@ Gem::Specification.new do |s|
68
71
  "spec/etl/core_extensions_spec.rb",
69
72
  "spec/etl/counter_spec.rb",
70
73
  "spec/etl/dataset_batch_stage_spec.rb",
74
+ "spec/etl/define_stage_spec.rb",
71
75
  "spec/etl/etl_batch_id_dataset_filter.rb",
72
76
  "spec/etl/key_builder_spec.rb",
73
77
  "spec/etl/load_dataset_builder_spec.rb",
@@ -80,6 +84,7 @@ Gem::Specification.new do |s|
80
84
  "spec/etl/sequel/filter_to_etl_batch_spec.rb",
81
85
  "spec/etl/table_builder_spec.rb",
82
86
  "spec/etl/task_spec.rb",
87
+ "spec/etl/transformations/deduplicate_rows_spec.rb",
83
88
  "spec/etl/transformations/uk_post_code_field_spec.rb",
84
89
  "spec/etl/transformations/uk_post_code_spec.rb",
85
90
  "spec/etl/transformations_spec.rb",
@@ -31,27 +31,6 @@ module Chicago
31
31
  end
32
32
  end
33
33
 
34
- # Deprecated.
35
- #
36
- # @deprecated Use perform_task instead
37
- def load(task_name, &block)
38
- perform_task(:load, task_name, &block)
39
- end
40
-
41
- # Deprecated.
42
- #
43
- # @deprecated Use perform_task instead
44
- def transform(task_name, &block)
45
- perform_task(:extract, task_name, &block)
46
- end
47
-
48
- # Deprecated.
49
- #
50
- # @deprecated Use perform_task instead
51
- def extract(task_name, &block)
52
- perform_task(:extract, task_name, &block)
53
- end
54
-
55
34
  # Performs a named task if it hasn't already run successfully in
56
35
  # this batch.
57
36
  def perform_task(stage, task_name, &block)
@@ -8,11 +8,24 @@ module Chicago
8
8
  # Returns all defined fact load tasks
9
9
  attr_reader :load_facts
10
10
 
11
+ # Returns all the defined generic stages.
12
+ attr_reader :stages
13
+
11
14
  # Creates a pipeline for a Schema.
12
15
  def initialize(db, schema)
13
16
  @schema, @db = schema, db
14
17
  @load_dimensions = Chicago::Schema::NamedElementCollection.new
15
18
  @load_facts = Chicago::Schema::NamedElementCollection.new
19
+ @stages = Chicago::Schema::NamedElementCollection.new
20
+ end
21
+
22
+ # Defines a generic stage in the pipeline.
23
+ def define_stage(name, &block)
24
+ @stages << build_schemaless_stage(name, &block)
25
+ end
26
+
27
+ def build_schemaless_stage(name, &block)
28
+ StageBuilder.new(@db).build(name, &block)
16
29
  end
17
30
 
18
31
  # Defines a dimension load stage
@@ -73,6 +86,7 @@ module Chicago
73
86
 
74
87
  # Define elements of the pipeline. See LoadPipelineStageBuilder
75
88
  # for details.
89
+ # TODO: rename pipeline => transforms below this method
76
90
  def pipeline(&block)
77
91
  @pipeline_stage = LoadPipelineStageBuilder.new(@db, @schema_table).
78
92
  build(&block)
@@ -81,9 +95,11 @@ module Chicago
81
95
  # Defines the dataset, see DatasetBuilder .
82
96
  #
83
97
  # The block must return a Sequel::Dataset.
84
- def dataset(&block)
98
+ # TODO: rename dataset => source below this method, make generic
99
+ def source(&block)
85
100
  @dataset = DatasetBuilder.new(@db).build(&block)
86
101
  end
102
+ alias :dataset :source
87
103
 
88
104
  # Define a custom filter strategy for filtering to an ETL batch.
89
105
  def filter_strategy(&block)
@@ -0,0 +1,76 @@
1
module Chicago
  module ETL
    # A named unit of ETL work.
    #
    # Rows are read from a source, passed through a
    # Chicago::Flow::TransformationChain and appended to sinks. Each
    # sink is registered under a stream name; a row is routed to the
    # sink named by its :_stream key, or to :default when it has none.
    class Stage
      # Returns the name this stage was created with.
      attr_reader :name

      # Returns the Chicago::Flow::TransformationChain built from the
      # :transformations option.
      attr_reader :transformation_chain

      # Creates a stage.
      #
      # Options:
      #
      # :source::          required; handed to the filter strategy and
      #                    then wrapped in a Chicago::Flow::DatasetSource.
      # :sinks::           required, non-empty Hash of stream name => sink.
      # :transformations:: Array splatted into TransformationChain.new;
      #                    defaults to no transformations.
      # :filter_strategy:: callable taking (source, etl_batch) and
      #                    returning the source to read from; defaults
      #                    to returning the source unchanged.
      #
      # Raises ArgumentError when the source or the sinks are missing,
      # whether the key is absent or its value is nil/empty.
      def initialize(name, options={})
        @name = name

        # Plain lookups rather than fetch, so an absent key produces
        # the same ArgumentError as an explicit nil.
        @source = options[:source]
        raise ArgumentError, "Stage #{name} requires a source" unless @source

        @sinks = options[:sinks]
        if @sinks.nil? || @sinks.empty?
          raise ArgumentError, "Stage #{name} requires at least one sink"
        end

        @transformations = options.fetch(:transformations, [])
        @transformation_chain = Chicago::Flow::TransformationChain.
          new(*@transformations)

        @filter_strategy = options[:filter_strategy] ||
          lambda {|source, _| source }
      end

      # Runs the stage: filters the source (unless reextracting), then
      # transforms the rows and loads them into the sinks.
      def execute(etl_batch, reextract=false)
        modified_source = reextract_and_filter_source(@source, etl_batch, reextract)
        transform_and_load_from(modified_source)
      end

      # Returns a Chicago::Flow::DatasetSource wrapping either the
      # unfiltered source (when reextract is truthy) or the result of
      # calling the filter strategy with the source and the ETL batch.
      def reextract_and_filter_source(source, etl_batch, reextract=false)
        filtered_dataset =
          reextract ? source : @filter_strategy.call(source, etl_batch)
        Chicago::Flow::DatasetSource.new(filtered_dataset)
      end

      # Returns the named sink, if it exists
      def sink(name)
        @sinks[name.to_sym]
      end

      # Returns all registered sinks, without their stream names.
      def sinks
        @sinks.values
      end

      # Registers (or replaces) the sink for a stream name. Returns
      # self so calls can be chained.
      def register_sink(name, sink)
        @sinks[name.to_sym] = sink
        self
      end

      # Opens every sink, pipes all rows from source through the
      # transformation chain into the sinks, then closes every sink.
      #
      # Sinks that were opened are closed even when reading or
      # transforming raises; the error is then re-raised unchanged.
      def transform_and_load_from(source)
        sinks.each(&:open)
        begin
          pipe_rows_to_sinks_from(source)
        ensure
          sinks.each(&:close)
        end
      end

      private

      # Sends each source row through the chain, then drains whatever
      # the chain still holds via flush.
      def pipe_rows_to_sinks_from(source)
        source.each do |row|
          transformation_chain.process(row).each {|output| process_row(output) }
        end
        transformation_chain.flush.each {|output| process_row(output) }
      end

      # Removes the :_stream routing key from the row and appends the
      # row to the sink registered for that stream (:default if none).
      #
      # A stream with no registered sink looks up nil, so the append
      # fails with NoMethodError.
      def process_row(row)
        stream = row.delete(:_stream) || :default
        @sinks[stream] << row
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
module Chicago
  module ETL
    # DSL used to assemble a Stage.
    #
    # The block given to #build is instance_eval'd against the
    # builder, so #source, #transformations, #sinks and
    # #filter_strategy are called without a receiver inside it.
    class StageBuilder
      # NOTE(review): nothing in this class assigns @sink_factory, so
      # this reader returns nil here; kept so existing callers of the
      # reader do not break.
      attr_reader :sink_factory

      # db is handed to DatasetBuilder when a source is defined.
      def initialize(db)
        @db = db
      end

      # Evaluates the block against this builder and returns a new
      # Stage called name, built from whatever the block declared.
      #
      # All per-stage state is reset first, so a builder reused for a
      # second stage does not inherit the previous stage's source or
      # filter strategy.
      def build(name, &block)
        @dataset = nil
        @filter_strategy = nil
        @sinks = {}
        @transformations = []

        instance_eval(&block) if block

        Stage.new(name,
                  :source => @dataset,
                  :sinks => @sinks,
                  :transformations => @transformations,
                  :filter_strategy => @filter_strategy)
      end

      # Defines the stage's source as whatever
      # DatasetBuilder#build returns for the block.
      def source(&block)
        @dataset = DatasetBuilder.new(@db).build(&block)
      end

      # Defines the stage's transformations. The block is evaluated by
      # a new instance of klass, whose #build result is stored.
      def transformations(klass=TransformationBuilder, &block)
        @transformations = klass.new.build(&block)
      end

      # Defines the stage's sinks; see SinkBuilder. The options
      # argument is accepted but currently ignored.
      def sinks(options={}, &block)
        @sinks = SinkBuilder.new.build(&block)
      end

      # TODO: think of potentially better ways of dealing with this
      # problem.
      #
      # Stores the block as the stage's filter strategy.
      def filter_strategy(&block)
        @filter_strategy = block
      end

      # Collects transformations, in the order they are added, into an
      # Array.
      class TransformationBuilder
        # Evaluates the block against this builder and returns the
        # transformations it added.
        def build(&block)
          @transformations = []
          instance_eval(&block)
          @transformations
        end

        # Appends a transformation.
        def add(transformation)
          @transformations << transformation
        end
      end

      # Collects sinks into a Hash keyed by stream name.
      class SinkBuilder
        # Evaluates the block against this builder and returns the
        # Hash of stream name => sink it populated.
        def build(&block)
          @sinks = {}
          instance_eval(&block)
          @sinks
        end

        protected

        # Registers a sink under options[:stream], or :default when no
        # stream is given. A later sink for the same stream replaces
        # the earlier one.
        #
        # Protected, so it is only callable without a receiver, as it
        # is inside the block passed to #build.
        def add(sink, options={})
          stream = options[:stream] || :default
          @sinks[stream] = sink
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
module Chicago
  module ETL
    # Base transformation that merges runs of consecutive rows which
    # the subclass considers to be the same row.
    #
    # One row is buffered at a time. Each incoming row is compared
    # only with that buffered row, so duplicates are merged only when
    # they arrive next to each other.
    #
    # Subclasses implement #same_row? and #merge_rows. With the
    # default implementations here every row counts as different, so
    # rows pass through one step late.
    class DeduplicateRows < Chicago::Flow::Transformation
      # Buffers or merges the row. Returns nil while rows are being
      # accumulated, and the previously buffered row once a different
      # row arrives.
      def process_row(row)
        if @working_row.nil?
          @working_row = row
          nil
        elsif same_row?(row)
          @working_row = merge_rows(row)
          nil
        else
          assign_new_row_and_return_old_row(row)
        end
      end

      # Returns the buffered row in an Array (empty when nothing is
      # buffered) and clears the buffer, so the same row is not
      # emitted again by a later flush or by a re-run of this
      # instance.
      def flush
        return [] if @working_row.nil?

        flushed_row, @working_row = @working_row, nil
        [flushed_row]
      end

      protected

      # The row currently buffered, for use by subclasses.
      attr_reader :working_row

      # This should be implemented by clients
      #
      # Called when same_row?(row) is truthy; the return value
      # replaces the buffered row.
      def merge_rows(row)
      end

      # This should be implemented by clients
      #
      # Should return a truthy value when row duplicates the buffered
      # row.
      def same_row?(row)
      end

      private

      # Buffers the new row and returns the one it replaced.
      def assign_new_row_and_return_old_row(row)
        previous_row = @working_row
        @working_row = row
        previous_row
      end
    end
  end
end
data/lib/chicago/etl.rb CHANGED
@@ -29,6 +29,9 @@ require 'chicago/etl/dataset_batch_stage'
29
29
  require 'chicago/etl/load_pipeline_stage_builder'
30
30
  require 'chicago/etl/pipeline'
31
31
 
32
+ require 'chicago/etl/stage'
33
+ require 'chicago/etl/stage_builder'
34
+
32
35
  # Sequel Extensions
33
36
  require 'chicago/etl/sequel/filter_to_etl_batch'
34
37
  require 'chicago/etl/sequel/dependant_tables'
@@ -40,6 +43,7 @@ require 'chicago/etl/screens/invalid_element'
40
43
  require 'chicago/etl/screens/out_of_bounds'
41
44
 
42
45
  # Transformations
46
+ require 'chicago/etl/transformations/deduplicate_rows'
43
47
  require 'chicago/etl/transformations/uk_post_code'
44
48
  require 'chicago/etl/transformations/uk_post_code_field'
45
49
 
@@ -0,0 +1,114 @@
1
+ require "spec_helper"
2
+
3
# Spec helper transformation: for every input row it emits the row
# itself plus a second, fixed row directed at :another_stream.
class TestTransformation < Chicago::Flow::Transformation
  # Declares the extra stream this transformation writes to.
  def output_streams
    [:another_stream]
  end

  # Returns the untouched row followed by the extra row.
  # NOTE(review): assign_stream is inherited; presumably it tags the
  # hash so the stage routes it to the named stream - confirm against
  # Chicago::Flow::Transformation.
  def process_row(row)
    error_row = assign_stream({:some_field => "has an error value"}, :another_stream)
    [row, error_row]
  end
end
12
+
13
# End-to-end examples for Pipeline#define_stage: each one declares a
# stage through the DSL, executes the pipeline's stages and inspects
# what reached the sinks.
describe "defining and executing a stage" do
  let(:rows) { [{:some_field => "value"}] }
  let(:db) { double(:test_dataset_method => rows) }
  let(:schema) { double }
  let(:pipeline) { Chicago::ETL::Pipeline.new(db, schema) }

  # Executes every stage defined on the pipeline with the given ETL
  # batch and reextract flag.
  def execute_stages(etl_batch, reextract)
    pipeline.stages.each do |stage|
      stage.execute(etl_batch, reextract)
    end
  end

  it "allows no transformations" do
    pipeline.define_stage(:test_stage) do
      source do
        db.test_dataset_method
      end

      sinks do
        add Chicago::Flow::ArraySink.new(:test)
        add Chicago::Flow::ArraySink.new(:test), :stream => :another_stream
      end
    end

    execute_stages(double, true)

    stage = pipeline.stages.first
    stage.sink(:default).data.length.should == 1
    stage.sink(:default).data.first.should == {:some_field => "value"}

    stage.sink(:another_stream).data.length.should == 0
  end

  it "requires sinks" do
    expect {
      pipeline.define_stage(:test_stage) do
        source do
          db.test_dataset_method
        end
      end
    }.to raise_error(ArgumentError)
  end

  it "requires sources" do
    expect {
      pipeline.define_stage(:test_stage) do
        sinks do
          add Chicago::Flow::ArraySink.new(:test)
        end
      end
    }.to raise_error(ArgumentError)
  end

  it "glues the source, transformations, and sink correctly" do
    pipeline.define_stage(:test_stage) do
      source do
        db.test_dataset_method
      end

      transformations do
        add TestTransformation.new
      end

      sinks do
        add Chicago::Flow::ArraySink.new(:test)
        add Chicago::Flow::ArraySink.new(:test), :stream => :another_stream
      end
    end

    execute_stages(double, true)

    stage = pipeline.stages.first
    stage.sink(:default).data.length.should == 1
    stage.sink(:default).data.first.should == {:some_field => "value"}

    stage.sink(:another_stream).data.length.should == 1
    stage.sink(:another_stream).data.first.should == {:some_field => "has an error value"}
  end

  it "allows the source to be filtered via a filter strategy" do
    # Locals rather than lets: the DSL blocks below are evaluated
    # against the builders, where only closed-over locals are visible.
    etl_batch_double = double
    fake_source = []

    fake_source.should_receive(:another_dataset_method).and_return([])
    pipeline.define_stage(:test_stage) do
      source do
        fake_source
      end

      sinks do
        add Chicago::Flow::ArraySink.new(:test)
      end

      filter_strategy do |source, etl_batch|
        etl_batch.should == etl_batch_double
        source.another_dataset_method
      end
    end

    # reextract is false so the filter strategy is actually applied.
    execute_stages(etl_batch_double, false)
  end
end
@@ -0,0 +1,23 @@
1
+ require 'spec_helper'
2
+
3
describe Chicago::ETL::DeduplicateRows do
  it "deduplicates rows" do
    # Anonymous subclass, so the example does not leave a global
    # constant behind for other spec files to collide with.
    transform_class = Class.new(described_class) do
      # Later rows win on conflicting keys.
      def merge_rows(row)
        working_row.merge(row)
      end

      # Rows are duplicates when they share an :id.
      def same_row?(row)
        working_row[:id] == row[:id]
      end
    end

    transform = transform_class.new

    # The first two rows share an id, so nothing is emitted yet.
    transform.process({:id => 1, :foo => :bar}).should be_blank
    transform.process({:id => 1, :bar => :baz}).should be_blank
    # A new id releases the merged row for the previous id.
    transform.process({:id => 2, :foo => :quux}).should == {:id => 1, :foo => :bar, :bar => :baz}

    transform.flush.should == [{:id => 2, :foo => :quux}]
  end
end
metadata CHANGED
@@ -1,233 +1,248 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: chicago-etl
3
- version: !ruby/object:Gem::Version
4
- hash: 27
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 0
10
- version: 0.1.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Roland Swingler
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2013-09-05 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- version_requirements: &id001 !ruby/object:Gem::Requirement
12
+ date: 2013-11-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: chicagowarehouse
16
+ requirement: !ruby/object:Gem::Requirement
22
17
  none: false
23
- requirements:
18
+ requirements:
24
19
  - - ~>
25
- - !ruby/object:Gem::Version
26
- hash: 3
27
- segments:
28
- - 0
29
- - 4
30
- version: "0.4"
31
- requirement: *id001
20
+ - !ruby/object:Gem::Version
21
+ version: '0.4'
32
22
  type: :runtime
33
23
  prerelease: false
34
- name: chicagowarehouse
35
- - !ruby/object:Gem::Dependency
36
- version_requirements: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
37
25
  none: false
38
- requirements:
39
- - - ">="
40
- - !ruby/object:Gem::Version
41
- hash: 3
42
- segments:
43
- - 0
44
- version: "0"
45
- requirement: *id002
46
- type: :runtime
47
- prerelease: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '0.4'
30
+ - !ruby/object:Gem::Dependency
48
31
  name: fastercsv
49
- - !ruby/object:Gem::Dependency
50
- version_requirements: &id003 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
51
33
  none: false
52
- requirements:
53
- - - ">="
54
- - !ruby/object:Gem::Version
55
- hash: 3
56
- segments:
57
- - 0
58
- version: "0"
59
- requirement: *id003
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
60
38
  type: :runtime
61
39
  prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
62
47
  name: sequel
63
- - !ruby/object:Gem::Dependency
64
- version_requirements: &id004 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
65
49
  none: false
66
- requirements:
67
- - - ">="
68
- - !ruby/object:Gem::Version
69
- hash: 27
70
- segments:
71
- - 0
72
- - 0
73
- - 2
74
- version: 0.0.2
75
- requirement: *id004
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
76
54
  type: :runtime
77
55
  prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
78
63
  name: sequel_load_data_infile
79
- - !ruby/object:Gem::Dependency
80
- version_requirements: &id005 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
81
65
  none: false
82
- requirements:
83
- - - ">="
84
- - !ruby/object:Gem::Version
85
- hash: 3
86
- segments:
87
- - 0
88
- version: "0"
89
- requirement: *id005
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 0.0.2
90
70
  type: :runtime
91
71
  prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 0.0.2
78
+ - !ruby/object:Gem::Dependency
92
79
  name: sequel_fast_columns
93
- - !ruby/object:Gem::Dependency
94
- version_requirements: &id006 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
95
81
  none: false
96
- requirements:
97
- - - ~>
98
- - !ruby/object:Gem::Version
99
- hash: 7
100
- segments:
101
- - 2
102
- version: "2"
103
- requirement: *id006
104
- type: :development
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
105
87
  prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
106
95
  name: rspec
107
- - !ruby/object:Gem::Dependency
108
- version_requirements: &id007 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
109
97
  none: false
110
- requirements:
111
- - - ">="
112
- - !ruby/object:Gem::Version
113
- hash: 3
114
- segments:
115
- - 0
116
- version: "0"
117
- requirement: *id007
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: '2'
118
102
  type: :development
119
103
  prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: '2'
110
+ - !ruby/object:Gem::Dependency
120
111
  name: timecop
121
- - !ruby/object:Gem::Dependency
122
- version_requirements: &id008 !ruby/object:Gem::Requirement
112
+ requirement: !ruby/object:Gem::Requirement
123
113
  none: false
124
- requirements:
125
- - - ">="
126
- - !ruby/object:Gem::Version
127
- hash: 3
128
- segments:
129
- - 0
130
- version: "0"
131
- requirement: *id008
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
132
118
  type: :development
133
119
  prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
134
127
  name: yard
135
- - !ruby/object:Gem::Dependency
136
- version_requirements: &id009 !ruby/object:Gem::Requirement
128
+ requirement: !ruby/object:Gem::Requirement
137
129
  none: false
138
- requirements:
139
- - - ">="
140
- - !ruby/object:Gem::Version
141
- hash: 3
142
- segments:
143
- - 0
144
- version: "0"
145
- requirement: *id009
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
146
134
  type: :development
147
135
  prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
148
143
  name: flog
149
- - !ruby/object:Gem::Dependency
150
- version_requirements: &id010 !ruby/object:Gem::Requirement
144
+ requirement: !ruby/object:Gem::Requirement
151
145
  none: false
152
- requirements:
153
- - - ">="
154
- - !ruby/object:Gem::Version
155
- hash: 3
156
- segments:
157
- - 0
158
- version: "0"
159
- requirement: *id010
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
160
150
  type: :development
161
151
  prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ - !ruby/object:Gem::Dependency
162
159
  name: simplecov
163
- - !ruby/object:Gem::Dependency
164
- version_requirements: &id011 !ruby/object:Gem::Requirement
160
+ requirement: !ruby/object:Gem::Requirement
165
161
  none: false
166
- requirements:
167
- - - ">="
168
- - !ruby/object:Gem::Version
169
- hash: 3
170
- segments:
171
- - 0
172
- version: "0"
173
- requirement: *id011
162
+ requirements:
163
+ - - ! '>='
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
174
166
  type: :development
175
167
  prerelease: false
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ - !ruby/object:Gem::Dependency
176
175
  name: ZenTest
177
- - !ruby/object:Gem::Dependency
178
- version_requirements: &id012 !ruby/object:Gem::Requirement
176
+ requirement: !ruby/object:Gem::Requirement
179
177
  none: false
180
- requirements:
181
- - - "="
182
- - !ruby/object:Gem::Version
183
- hash: 45
184
- segments:
185
- - 2
186
- - 8
187
- - 1
188
- version: 2.8.1
189
- requirement: *id012
178
+ requirements:
179
+ - - ! '>='
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
190
182
  type: :development
191
183
  prerelease: false
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ! '>='
188
+ - !ruby/object:Gem::Version
189
+ version: '0'
190
+ - !ruby/object:Gem::Dependency
192
191
  name: mysql
193
- - !ruby/object:Gem::Dependency
194
- version_requirements: &id013 !ruby/object:Gem::Requirement
192
+ requirement: !ruby/object:Gem::Requirement
195
193
  none: false
196
- requirements:
197
- - - ~>
198
- - !ruby/object:Gem::Version
199
- hash: 1
200
- segments:
201
- - 1
202
- version: "1"
203
- requirement: *id013
194
+ requirements:
195
+ - - '='
196
+ - !ruby/object:Gem::Version
197
+ version: 2.8.1
204
198
  type: :development
205
199
  prerelease: false
200
+ version_requirements: !ruby/object:Gem::Requirement
201
+ none: false
202
+ requirements:
203
+ - - '='
204
+ - !ruby/object:Gem::Version
205
+ version: 2.8.1
206
+ - !ruby/object:Gem::Dependency
206
207
  name: bundler
207
- - !ruby/object:Gem::Dependency
208
- version_requirements: &id014 !ruby/object:Gem::Requirement
208
+ requirement: !ruby/object:Gem::Requirement
209
209
  none: false
210
- requirements:
211
- - - ">="
212
- - !ruby/object:Gem::Version
213
- hash: 3
214
- segments:
215
- - 0
216
- version: "0"
217
- requirement: *id014
210
+ requirements:
211
+ - - ~>
212
+ - !ruby/object:Gem::Version
213
+ version: '1'
218
214
  type: :development
219
215
  prerelease: false
216
+ version_requirements: !ruby/object:Gem::Requirement
217
+ none: false
218
+ requirements:
219
+ - - ~>
220
+ - !ruby/object:Gem::Version
221
+ version: '1'
222
+ - !ruby/object:Gem::Dependency
220
223
  name: jeweler
224
+ requirement: !ruby/object:Gem::Requirement
225
+ none: false
226
+ requirements:
227
+ - - ! '>='
228
+ - !ruby/object:Gem::Version
229
+ version: '0'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ none: false
234
+ requirements:
235
+ - - ! '>='
236
+ - !ruby/object:Gem::Version
237
+ version: '0'
221
238
  description: ETL tools for Chicago
222
239
  email: roland.swingler@gmail.com
223
240
  executables: []
224
-
225
241
  extensions: []
226
-
227
- extra_rdoc_files:
242
+ extra_rdoc_files:
228
243
  - LICENSE.txt
229
244
  - README.rdoc
230
- files:
245
+ files:
231
246
  - .document
232
247
  - .rspec
233
248
  - Gemfile
@@ -254,10 +269,13 @@ files:
254
269
  - lib/chicago/etl/screens/out_of_bounds.rb
255
270
  - lib/chicago/etl/sequel/dependant_tables.rb
256
271
  - lib/chicago/etl/sequel/filter_to_etl_batch.rb
272
+ - lib/chicago/etl/stage.rb
273
+ - lib/chicago/etl/stage_builder.rb
257
274
  - lib/chicago/etl/table_builder.rb
258
275
  - lib/chicago/etl/task_invocation.rb
259
276
  - lib/chicago/etl/tasks.rb
260
277
  - lib/chicago/etl/transformations.rb
278
+ - lib/chicago/etl/transformations/deduplicate_rows.rb
261
279
  - lib/chicago/etl/transformations/uk_post_code.rb
262
280
  - lib/chicago/etl/transformations/uk_post_code_field.rb
263
281
  - lib/chicago/flow/array_sink.rb
@@ -279,6 +297,7 @@ files:
279
297
  - spec/etl/core_extensions_spec.rb
280
298
  - spec/etl/counter_spec.rb
281
299
  - spec/etl/dataset_batch_stage_spec.rb
300
+ - spec/etl/define_stage_spec.rb
282
301
  - spec/etl/etl_batch_id_dataset_filter.rb
283
302
  - spec/etl/key_builder_spec.rb
284
303
  - spec/etl/load_dataset_builder_spec.rb
@@ -291,6 +310,7 @@ files:
291
310
  - spec/etl/sequel/filter_to_etl_batch_spec.rb
292
311
  - spec/etl/table_builder_spec.rb
293
312
  - spec/etl/task_spec.rb
313
+ - spec/etl/transformations/deduplicate_rows_spec.rb
294
314
  - spec/etl/transformations/uk_post_code_field_spec.rb
295
315
  - spec/etl/transformations/uk_post_code_spec.rb
296
316
  - spec/etl/transformations_spec.rb
@@ -306,37 +326,31 @@ files:
306
326
  - spec/flow/transformation_spec.rb
307
327
  - spec/spec_helper.rb
308
328
  homepage: http://github.com/notonthehighstreet/chicago-etl
309
- licenses:
329
+ licenses:
310
330
  - MIT
311
331
  post_install_message:
312
332
  rdoc_options: []
313
-
314
- require_paths:
333
+ require_paths:
315
334
  - lib
316
- required_ruby_version: !ruby/object:Gem::Requirement
335
+ required_ruby_version: !ruby/object:Gem::Requirement
317
336
  none: false
318
- requirements:
319
- - - ">="
320
- - !ruby/object:Gem::Version
321
- hash: 3
322
- segments:
337
+ requirements:
338
+ - - ! '>='
339
+ - !ruby/object:Gem::Version
340
+ version: '0'
341
+ segments:
323
342
  - 0
324
- version: "0"
325
- required_rubygems_version: !ruby/object:Gem::Requirement
343
+ hash: -2054734000096616506
344
+ required_rubygems_version: !ruby/object:Gem::Requirement
326
345
  none: false
327
- requirements:
328
- - - ">="
329
- - !ruby/object:Gem::Version
330
- hash: 3
331
- segments:
332
- - 0
333
- version: "0"
346
+ requirements:
347
+ - - ! '>='
348
+ - !ruby/object:Gem::Version
349
+ version: '0'
334
350
  requirements: []
335
-
336
351
  rubyforge_project:
337
352
  rubygems_version: 1.8.25
338
353
  signing_key:
339
354
  specification_version: 3
340
355
  summary: Chicago ETL
341
356
  test_files: []
342
-