chicago-etl 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::DatasetBatchStage do
|
4
|
+
let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
|
5
|
+
let(:dataset) { mock(:dataset).as_null_object }
|
6
|
+
let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
|
7
|
+
let(:etl_batch) { stub(:etl_batch) }
|
8
|
+
|
9
|
+
it "has a name" do
|
10
|
+
stage.name.should == :foo
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should set the inserted at time on the default sink" do
|
14
|
+
sink = Chicago::Flow::ArraySink.new(:foo)
|
15
|
+
pipeline_stage.stub(:sink).with(:default).and_return(sink)
|
16
|
+
stage.pipeline_stage.should == pipeline_stage
|
17
|
+
|
18
|
+
sink.constant_values[:_inserted_at].should_not be_nil
|
19
|
+
end
|
20
|
+
|
21
|
+
it "filters the dataset to the batch" do
|
22
|
+
# `dataset` is a null-object mock, so a misspelled expectation method is
# silently swallowed; spell `should_receive` correctly so it is enforced.
dataset.should_receive(:filter_to_etl_batch).with(etl_batch)
|
23
|
+
stage.source(etl_batch)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "does not filter the dataset if re-extracting" do
|
27
|
+
# `dataset` is a null-object mock, so a misspelled expectation method is
# silently swallowed; spell `should_not_receive` correctly so it is enforced.
dataset.should_not_receive(:filter_to_etl_batch)
|
28
|
+
stage.source(etl_batch, true)
|
29
|
+
end
|
30
|
+
|
31
|
+
it "can filter via a custom strategy" do
|
32
|
+
# `dataset` is a null-object mock, so a misspelled expectation method is
# silently swallowed; spell `should_not_receive` correctly so it is enforced.
dataset.should_not_receive(:filter_to_etl_batch)
|
33
|
+
|
34
|
+
filter_strategy = lambda {|ds, batch| ds }
|
35
|
+
described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
|
36
|
+
source(etl_batch)
|
37
|
+
end
|
38
|
+
|
39
|
+
it "executes the pipeline stage using a DatasetSource" do
|
40
|
+
pipeline_stage.should_receive(:execute).
|
41
|
+
with(kind_of(Chicago::Flow::DatasetSource))
|
42
|
+
stage.execute(etl_batch, true)
|
43
|
+
end
|
44
|
+
|
45
|
+
it "truncates any sinks if truncate_pre_load has been set" do
|
46
|
+
stage = described_class.new(:foo, dataset, pipeline_stage,
|
47
|
+
:truncate_pre_load => true)
|
48
|
+
|
49
|
+
sink = Chicago::Flow::ArraySink.new(:output)
|
50
|
+
sink << {:foo => "foo"}
|
51
|
+
pipeline_stage.stub(:sinks).and_return([sink])
|
52
|
+
stage.execute(etl_batch)
|
53
|
+
sink.data.should == []
|
54
|
+
end
|
55
|
+
end
|
@@ -39,8 +39,9 @@ describe Chicago::ETL::KeyBuilder do
|
|
39
39
|
before :each do
|
40
40
|
@db = stub(:staging_database).as_null_object
|
41
41
|
@db.stub(:[]).and_return(stub(:max => nil, :select_hash => {}))
|
42
|
-
@
|
43
|
-
Chicago::ETL::
|
42
|
+
@sink = stub(:sink).as_null_object
|
43
|
+
Chicago::ETL::SchemaTableSinkFactory.stub(:new).
|
44
|
+
and_return(stub(:factory, :key_sink => @sink))
|
44
45
|
end
|
45
46
|
|
46
47
|
describe "for identifiable dimensions" do
|
@@ -50,20 +51,20 @@ describe Chicago::ETL::KeyBuilder do
|
|
50
51
|
|
51
52
|
it "returns an incrementing key, given a row" do
|
52
53
|
builder = described_class.for_table(@dimension, @db)
|
53
|
-
builder.key(:original_id => 2).should == 1
|
54
|
-
builder.key(:original_id => 3).should == 2
|
54
|
+
builder.key(:original_id => 2).first.should == 1
|
55
|
+
builder.key(:original_id => 3).first.should == 2
|
55
56
|
end
|
56
57
|
|
57
58
|
it "returns the same key for the same record" do
|
58
59
|
builder = described_class.for_table(@dimension, @db)
|
59
|
-
builder.key(:original_id => 2).should == 1
|
60
|
-
builder.key(:original_id => 2).should == 1
|
60
|
+
builder.key(:original_id => 2).first.should == 1
|
61
|
+
builder.key(:original_id => 2).first.should == 1
|
61
62
|
end
|
62
63
|
|
63
64
|
it "takes into account the current maximum key in the database" do
|
64
65
|
@db.stub(:[]).with(:keys_dimension_user).and_return(stub(:max => 2, :select_hash => {}))
|
65
66
|
builder = described_class.for_table(@dimension, @db)
|
66
|
-
builder.key(:original_id => 1).should == 3
|
67
|
+
builder.key(:original_id => 1).first.should == 3
|
67
68
|
end
|
68
69
|
|
69
70
|
it "returns previously created keys" do
|
@@ -71,55 +72,14 @@ describe Chicago::ETL::KeyBuilder do
|
|
71
72
|
@db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
|
72
73
|
|
73
74
|
builder = described_class.for_table(@dimension, @db)
|
74
|
-
builder.key(:original_id => 30).should == 2
|
75
|
-
builder.key(:original_id => 40).should == 1
|
75
|
+
builder.key(:original_id => 30).first.should == 2
|
76
|
+
builder.key(:original_id => 40).first.should == 1
|
76
77
|
end
|
77
78
|
|
78
79
|
it "raises an error when original_id isn't present in the row" do
|
79
80
|
builder = described_class.for_table(@dimension, @db)
|
80
81
|
expect { builder.key(:foo => :bar) }.to raise_error(Chicago::ETL::KeyError)
|
81
82
|
end
|
82
|
-
|
83
|
-
it "flushes new keys to a key table" do
|
84
|
-
pending
|
85
|
-
dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
|
86
|
-
dataset.stub(:insert_replace => dataset)
|
87
|
-
@db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
|
88
|
-
|
89
|
-
dataset.should_receive(:multi_insert).
|
90
|
-
with([{:original_id => 30, :dimension_id => 2}])
|
91
|
-
|
92
|
-
builder = described_class.for_table(@dimension, @db)
|
93
|
-
builder.key(:original_id => 30)
|
94
|
-
builder.key(:original_id => 40)
|
95
|
-
builder.flush
|
96
|
-
end
|
97
|
-
|
98
|
-
it "flushes new keys only once" do
|
99
|
-
pending
|
100
|
-
dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
|
101
|
-
dataset.stub(:insert_replace => dataset)
|
102
|
-
@db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
|
103
|
-
|
104
|
-
dataset.should_receive(:multi_insert).
|
105
|
-
with([{:original_id => 30, :dimension_id => 2}])
|
106
|
-
dataset.should_receive(:multi_insert).with([])
|
107
|
-
|
108
|
-
builder = described_class.for_table(@dimension, @db)
|
109
|
-
builder.key(:original_id => 30)
|
110
|
-
builder.key(:original_id => 40)
|
111
|
-
builder.flush
|
112
|
-
builder.flush
|
113
|
-
end
|
114
|
-
|
115
|
-
it "replaces old mappings with new values" do
|
116
|
-
pending
|
117
|
-
dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1}, :multi_insert => nil)
|
118
|
-
@db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
|
119
|
-
|
120
|
-
dataset.should_receive(:insert_replace).and_return(dataset)
|
121
|
-
described_class.for_table(@dimension, @db).flush
|
122
|
-
end
|
123
83
|
end
|
124
84
|
|
125
85
|
describe "for non-identifiable dimensions with an existing hash" do
|
@@ -129,25 +89,18 @@ describe Chicago::ETL::KeyBuilder do
|
|
129
89
|
end
|
130
90
|
|
131
91
|
it "returns an incrementing key, given a row" do
|
132
|
-
@builder.key(:hash => "aaa").should == 1
|
133
|
-
@builder.key(:hash => "aab").should == 2
|
92
|
+
@builder.key(:hash => "aaa").first.should == 1
|
93
|
+
@builder.key(:hash => "aab").first.should == 2
|
134
94
|
end
|
135
95
|
|
136
96
|
it "returns the same incrementing key" do
|
137
|
-
@builder.key(:hash => "aaa").should == 1
|
138
|
-
@builder.key(:hash => "aaa").should == 1
|
97
|
+
@builder.key(:hash => "aaa").first.should == 1
|
98
|
+
@builder.key(:hash => "aaa").first.should == 1
|
139
99
|
end
|
140
100
|
|
141
101
|
it "returns the same incrementing key, ignoring case" do
|
142
|
-
@builder.key(:hash => "aaa").should == 1
|
143
|
-
@builder.key(:hash => "AAA").should == 1
|
144
|
-
end
|
145
|
-
|
146
|
-
it "inserts the hash as a binary literal" do
|
147
|
-
# Yuck. Don't like the implementation test, but mock
|
148
|
-
# expectations fail here for some reason, maybe because of the
|
149
|
-
# Sequel::LiteralString?
|
150
|
-
@builder.key_for_insert(@builder.original_key(:hash => "aaa")).should == "0xAAA".lit
|
102
|
+
@builder.key(:hash => "aaa").first.should == 1
|
103
|
+
@builder.key(:hash => "AAA").first.should == 1
|
151
104
|
end
|
152
105
|
end
|
153
106
|
|
@@ -158,32 +111,25 @@ describe Chicago::ETL::KeyBuilder do
|
|
158
111
|
|
159
112
|
it "returns an incrementing key, given a row" do
|
160
113
|
@builder.key(:line1 => "some street", :post_code => "TW3 X45").
|
161
|
-
should == 1
|
114
|
+
first.should == 1
|
162
115
|
@builder.key(:line1 => "some road", :post_code => "TW3 X45").
|
163
|
-
should == 2
|
116
|
+
first.should == 2
|
164
117
|
end
|
165
118
|
|
166
119
|
it "returns the same incrementing key, ignoring case" do
|
167
120
|
@builder.key(:line1 => "some street", :post_code => "TW3 X45").
|
168
|
-
should == 1
|
121
|
+
first.should == 1
|
169
122
|
@builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
|
170
|
-
should == 1
|
123
|
+
first.should == 1
|
171
124
|
end
|
172
125
|
|
173
126
|
it "can override default hash preparation" do
|
174
127
|
@builder.hash_preparation = lambda {|c| c }
|
175
128
|
|
176
129
|
@builder.key(:line1 => "some street", :post_code => "TW3 X45").
|
177
|
-
should == 1
|
130
|
+
first.should == 1
|
178
131
|
@builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
|
179
|
-
should == 2
|
180
|
-
end
|
181
|
-
|
182
|
-
it "inserts the hash as a binary literal" do
|
183
|
-
# Yuck. Don't like the implementation test, but mock
|
184
|
-
# expectations fail here for some reason, maybe because of the
|
185
|
-
# Sequel::LiteralString?
|
186
|
-
@builder.key_for_insert(@builder.original_key(:line1 => "some street", :post_code => "TW3 X45")).should == "0x817860F2417EB83D81FEA9D82E6B213A".lit
|
132
|
+
first.should == 2
|
187
133
|
end
|
188
134
|
|
189
135
|
it "selects the Hex version of the binary column for the cache" do
|
@@ -210,18 +156,14 @@ describe Chicago::ETL::KeyBuilder do
|
|
210
156
|
end
|
211
157
|
|
212
158
|
it "increments the id, regardless of row equality" do
|
213
|
-
@builder.key({}).should == 1
|
214
|
-
@builder.key({}).should == 2
|
159
|
+
@builder.key({}).first.should == 1
|
160
|
+
@builder.key({}).first.should == 2
|
215
161
|
end
|
216
162
|
|
217
163
|
it "increments from the last id stored id in the fact table" do
|
218
164
|
@db.stub(:[]).with(:facts_addresses).and_return(stub(:max => 100, :select_hash => {}))
|
219
165
|
@builder = described_class.for_table(@schema.fact(:addresses), @db)
|
220
|
-
@builder.key({}).should == 101
|
221
|
-
end
|
222
|
-
|
223
|
-
it "supports the flush interface as a no-op" do
|
224
|
-
lambda { @builder.flush }.should_not raise_error
|
166
|
+
@builder.key({}).first.should == 101
|
225
167
|
end
|
226
168
|
end
|
227
169
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::LoadPipelineStageBuilder do
|
4
|
+
let(:dimension) { stub(:dimension).as_null_object }
|
5
|
+
let(:db) { stub(:db).as_null_object }
|
6
|
+
let(:sink_factory) { stub(:sink_factory).as_null_object }
|
7
|
+
|
8
|
+
before(:each) {
|
9
|
+
Chicago::ETL::SchemaTableSinkFactory.stub(:new).and_return(sink_factory)
|
10
|
+
}
|
11
|
+
|
12
|
+
it "should exclude columns from the sink" do
|
13
|
+
sink_factory.should_receive(:sink).
|
14
|
+
with(:ignore => false, :exclude => [:foo]).
|
15
|
+
and_return(stub(:sink).as_null_object)
|
16
|
+
|
17
|
+
described_class.new(db, dimension).build do
|
18
|
+
load_separately :foo
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
it "can specify rows are not going to be replaced" do
|
23
|
+
sink_factory.should_receive(:sink).
|
24
|
+
with(:ignore => true, :exclude => []).
|
25
|
+
and_return(stub(:sink).as_null_object)
|
26
|
+
|
27
|
+
described_class.new(db, dimension).build do
|
28
|
+
ignore_present_rows
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
it "can add key mappings" do
|
33
|
+
stage = described_class.new(db, dimension).build do
|
34
|
+
key_mapping :bar, :original_id
|
35
|
+
end
|
36
|
+
|
37
|
+
stage.sink(:bar).should_not be_nil
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::SchemaTableSinkFactory do
|
4
|
+
let(:db) { stub(:db) }
|
5
|
+
|
6
|
+
let(:dimension) {
|
7
|
+
Chicago::Schema::Builders::DimensionBuilder.new(stub(:schema)).build(:foo) do
|
8
|
+
columns do
|
9
|
+
string :bar
|
10
|
+
integer :baz
|
11
|
+
end
|
12
|
+
end
|
13
|
+
}
|
14
|
+
|
15
|
+
let(:sink_class) { Chicago::Flow::MysqlFileSink }
|
16
|
+
|
17
|
+
it "builds a MysqlFileSink" do
|
18
|
+
sink_class.should_receive(:new).
|
19
|
+
with(db, :dimension_foo, [:id, :bar, :baz], {})
|
20
|
+
|
21
|
+
described_class.new(db, dimension).sink
|
22
|
+
end
|
23
|
+
|
24
|
+
it "allows rows to be ignored instead of replaced" do
|
25
|
+
sink_class.should_receive(:new).
|
26
|
+
with(db, :dimension_foo, [:id, :bar, :baz], {:ignore => true})
|
27
|
+
|
28
|
+
described_class.new(db, dimension).sink(:ignore => true)
|
29
|
+
end
|
30
|
+
|
31
|
+
it "allows an explicit filepath to be specified" do
|
32
|
+
sink_class.should_receive(:new).
|
33
|
+
with(db, :dimension_foo, [:id, :bar, :baz], {:filepath => "foo"})
|
34
|
+
|
35
|
+
described_class.new(db, dimension).sink(:filepath => "foo")
|
36
|
+
end
|
37
|
+
|
38
|
+
it "can exclude columns from a dimension" do
|
39
|
+
sink_class.should_receive(:new).
|
40
|
+
with(db, :dimension_foo, [:id, :bar], {})
|
41
|
+
|
42
|
+
described_class.new(db, dimension).sink(:exclude => :baz)
|
43
|
+
end
|
44
|
+
|
45
|
+
it "builds the key table sink" do
|
46
|
+
sink = stub(:sink).as_null_object
|
47
|
+
sink_class.should_receive(:new).
|
48
|
+
with(db, :keys_dimension_foo, [:original_id, :dimension_id], {}).
|
49
|
+
and_return(sink)
|
50
|
+
|
51
|
+
described_class.new(db, dimension).key_sink()
|
52
|
+
end
|
53
|
+
|
54
|
+
it "builds other explicit key table sinks" do
|
55
|
+
sink = stub(:sink).as_null_object
|
56
|
+
sink_class.should_receive(:new).
|
57
|
+
with(db, :keys_foo, [:original_id, :dimension_id], {}).
|
58
|
+
and_return(sink)
|
59
|
+
|
60
|
+
described_class.new(db, dimension).key_sink(:table => :keys_foo)
|
61
|
+
end
|
62
|
+
|
63
|
+
it "builds an error sink" do
|
64
|
+
sink_class.should_receive(:new).
|
65
|
+
with(db, :etl_error_log, [:column, :row_id, :error, :severity, :error_detail], {}).and_return(stub.as_null_object)
|
66
|
+
|
67
|
+
described_class.new(db, dimension).error_sink
|
68
|
+
end
|
69
|
+
end
|
@@ -5,23 +5,22 @@ describe Chicago::ETL::Screens::InvalidElement do
|
|
5
5
|
Chicago::Schema::Column.new(:enum, :string, :elements => ["Foo", "Unknown"], :default => "Unknown", :optional => true)
|
6
6
|
}
|
7
7
|
|
8
|
+
let(:transformation) {
|
9
|
+
described_class.new(:column => enum_col)
|
10
|
+
}
|
11
|
+
|
8
12
|
it "has a severity of 3" do
|
9
|
-
|
13
|
+
transformation.severity.should == 3
|
10
14
|
end
|
11
15
|
|
12
16
|
it "reports invalid element for enum columns" do
|
13
|
-
row
|
14
|
-
|
15
|
-
row.should ==
|
16
|
-
|
17
|
-
errors.first[:error].should == "Invalid Element"
|
17
|
+
row = transformation.process_row({:enum => "Bar"})
|
18
|
+
|
19
|
+
row[:enum].should == 'Unknown'
|
20
|
+
row[:_errors].first[:error].should == "Invalid Element"
|
18
21
|
end
|
19
22
|
|
20
23
|
it "does not report a valid element" do
|
21
|
-
|
22
|
-
call({:enum => "foo"})
|
23
|
-
row.should == {:enum => 'foo'}
|
24
|
-
|
25
|
-
errors.should be_empty
|
24
|
+
transformation.process_row({:enum => "foo"}).should == {:enum => 'foo'}
|
26
25
|
end
|
27
26
|
end
|
@@ -14,45 +14,45 @@ describe Chicago::ETL::Screens::MissingValue do
|
|
14
14
|
}
|
15
15
|
|
16
16
|
it "reports nil in an expected column as a missing value, with severity 2" do
|
17
|
-
row
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
17
|
+
row = described_class.new(:column => string_col).process_row({})
|
18
|
+
|
19
|
+
error = row[:_errors].first
|
20
|
+
error[:column].should == "str"
|
21
|
+
error[:error].should == "Missing Value"
|
22
|
+
error[:severity].should == 2
|
23
23
|
end
|
24
24
|
|
25
25
|
it "reports an empty string value in an expected column as a missing value" do
|
26
|
-
row
|
27
|
-
|
26
|
+
row = described_class.
|
27
|
+
new(:column => string_col).
|
28
|
+
process_row({:str => " "})
|
28
29
|
|
29
|
-
|
30
|
+
row[:_errors].should_not be_nil
|
30
31
|
end
|
31
32
|
|
32
33
|
it "does not report 0 as a missing value" do
|
33
|
-
row
|
34
|
-
|
35
|
-
|
36
|
-
errors.should be_empty
|
34
|
+
row = described_class.new(:column => int_col).
|
35
|
+
process_row({:int => 0})
|
36
|
+
row[:_errors].should be_nil
|
37
37
|
end
|
38
38
|
|
39
39
|
it "reports missing values with severity 1 if the column is descriptive" do
|
40
|
-
row
|
41
|
-
|
40
|
+
row = described_class.new(:column => descriptive_col).process_row({})
|
41
|
+
row[:_errors].last[:severity].should == 1
|
42
42
|
end
|
43
43
|
|
44
44
|
it "does not report boolean values as missing" do
|
45
|
-
row
|
46
|
-
|
45
|
+
row = described_class.new(:column => bool_col).process_row({})
|
46
|
+
row[:_errors].should be_nil
|
47
47
|
end
|
48
48
|
|
49
49
|
it "does not report optional columns as missing values" do
|
50
|
-
row
|
51
|
-
|
50
|
+
row = described_class.new(:column => optional_col).process_row({})
|
51
|
+
row[:_errors].should be_nil
|
52
52
|
end
|
53
53
|
|
54
54
|
it "fills in a default value for missing values" do
|
55
|
-
row
|
56
|
-
row.should ==
|
55
|
+
row = described_class.new(:column => optional_col).process_row({})
|
56
|
+
row[:str].should == ''
|
57
57
|
end
|
58
58
|
end
|
@@ -9,56 +9,48 @@ describe Chicago::ETL::Screens::OutOfBounds do
|
|
9
9
|
Chicago::Schema::Column.new(:str, :string, :min => 2, :max => 5)
|
10
10
|
}
|
11
11
|
|
12
|
+
let(:int_transformation) {
|
13
|
+
described_class.new(:column => int_col)
|
14
|
+
}
|
15
|
+
|
16
|
+
let(:str_transformation) {
|
17
|
+
described_class.new(:column => str_col)
|
18
|
+
}
|
19
|
+
|
12
20
|
it "applies to numeric columns when the value is lower than the minimum" do
|
13
|
-
row
|
14
|
-
|
15
|
-
|
16
|
-
errors.first[:error].should == "Out Of Bounds"
|
21
|
+
row = int_transformation.process_row(:int => -1)
|
22
|
+
row[:_errors].first[:error].should == "Out Of Bounds"
|
17
23
|
end
|
18
24
|
|
19
25
|
it "applies to numeric columns when the value is above the minimum" do
|
20
|
-
row
|
21
|
-
|
22
|
-
|
23
|
-
errors.first[:error].should == "Out Of Bounds"
|
26
|
+
row = int_transformation.process_row(:int => 101)
|
27
|
+
row[:_errors].first[:error].should == "Out Of Bounds"
|
24
28
|
end
|
25
29
|
|
26
30
|
it "applies to string columns when the number of chars is below minimum" do
|
27
|
-
row
|
28
|
-
|
29
|
-
|
30
|
-
errors.first[:error].should == "Out Of Bounds"
|
31
|
+
row = str_transformation.process_row(:str => "a")
|
32
|
+
row[:_errors].first[:error].should == "Out Of Bounds"
|
31
33
|
end
|
32
34
|
|
33
35
|
it "applies to string columns when the number of chars is above maximum" do
|
34
|
-
row
|
35
|
-
|
36
|
-
|
37
|
-
errors.first[:error].should == "Out Of Bounds"
|
36
|
+
row = str_transformation.process_row(:str => "abcdef")
|
37
|
+
row[:_errors].first[:error].should == "Out Of Bounds"
|
38
38
|
end
|
39
39
|
|
40
40
|
it "does not apply to string values in range" do
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
errors.should be_empty
|
41
|
+
str_transformation.process_row(:str => "abcde").
|
42
|
+
should_not have_key(:_errors)
|
45
43
|
end
|
46
44
|
|
47
45
|
it "does not apply to numeric values in range" do
|
48
|
-
|
49
|
-
call(:int => 0)
|
50
|
-
|
51
|
-
errors.should be_empty
|
46
|
+
int_transformation.process_row(:int => 0).should_not have_key(:_errors)
|
52
47
|
end
|
53
48
|
|
54
49
|
it "has severity 2" do
|
55
|
-
|
50
|
+
int_transformation.severity.should == 2
|
56
51
|
end
|
57
52
|
|
58
53
|
it "does not replace values with default" do
|
59
|
-
|
60
|
-
call(:str => "a")
|
61
|
-
|
62
|
-
row.should == {:str => "a"}
|
54
|
+
str_transformation.process_row(:str => "a")[:str].should == "a"
|
63
55
|
end
|
64
56
|
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::ETL::Transformations::DemultiplexErrors do
|
4
|
+
it "declares it adds things to the error stream" do
|
5
|
+
subject.output_streams.should include(:error)
|
6
|
+
end
|
7
|
+
|
8
|
+
it "does nothing to a row without an :_errors key" do
|
9
|
+
subject.process({}).should == [{}]
|
10
|
+
end
|
11
|
+
|
12
|
+
it "removes the :_error key from the row" do
|
13
|
+
subject.process(:_errors => [{:error => 1}]).first.should == {}
|
14
|
+
end
|
15
|
+
|
16
|
+
it "adds the errors onto the error stream" do
|
17
|
+
subject.process(:_errors => [{:error => 1}]).last.should == {
|
18
|
+
:error => 1,
|
19
|
+
Chicago::Flow::STREAM => :error
|
20
|
+
}
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
describe Chicago::ETL::Transformations::WrittenRowFilter do
|
25
|
+
it "only lets the first row through" do
|
26
|
+
filter = described_class.new(:key => :id)
|
27
|
+
filter.process(:id => 1).should == {:id => 1}
|
28
|
+
filter.process(:id => 2).should == {:id => 2}
|
29
|
+
filter.process(:id => 1).should be_nil
|
30
|
+
end
|
31
|
+
|
32
|
+
it "requires a key option" do
|
33
|
+
described_class.required_options.should include(:key)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe Chicago::ETL::Transformations::AddKey do
|
38
|
+
let(:key_builder) { stub(:key_builder, :key => 42) }
|
39
|
+
let(:transform) { described_class.new(:key_builder => key_builder) }
|
40
|
+
|
41
|
+
it "requires a key builder" do
|
42
|
+
described_class.required_options.should include(:key_builder)
|
43
|
+
end
|
44
|
+
|
45
|
+
it "adds the key to the row" do
|
46
|
+
transform.process({}).should == {:id => 42}
|
47
|
+
end
|
48
|
+
|
49
|
+
it "adds the key to any rows in an embedded :_errors key" do
|
50
|
+
transform.process({:_errors => [{}]}).
|
51
|
+
should == {:id => 42, :_errors => [{:row_id => 42}]}
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should declare that it adds the :id field" do
|
55
|
+
transform.added_fields.should == [:id]
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should declare that it writes to the dimension_key stream" do
|
59
|
+
transform.output_streams.should include(:dimension_key)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should return a new row on the dimension_key stream" do
|
63
|
+
key_builder.stub(:key => [42, {:original_id => 42}])
|
64
|
+
transform.process({}).last.should == {:_stream => :dimension_key, :original_id => 42}
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe Chicago::ETL::Transformations::DimensionKeyMapping do
|
69
|
+
let(:transform) {
|
70
|
+
described_class.new(:original_key => :original_id,
|
71
|
+
:key_table => :keys_foo)
|
72
|
+
}
|
73
|
+
|
74
|
+
it "should require an original_key and a key table" do
|
75
|
+
described_class.required_options.should == [:original_key, :key_table]
|
76
|
+
end
|
77
|
+
|
78
|
+
it "removes the key from the stream" do
|
79
|
+
transform.process({:original_id => 1}).first.should == {}
|
80
|
+
transform.removed_fields.should == [:original_id]
|
81
|
+
end
|
82
|
+
|
83
|
+
it "links the original key with the id on the stream" do
|
84
|
+
transform.process({:original_id => 1, :id => 2}).last.
|
85
|
+
should == {:_stream => :keys_foo, :original_id => 1, :dimension_id => 2}
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe Chicago::ETL::Transformations::HashColumns do
|
90
|
+
it "requires a columns option" do
|
91
|
+
described_class.required_options.should include(:columns)
|
92
|
+
end
|
93
|
+
|
94
|
+
it "adds a hash field to the row" do
|
95
|
+
Digest::MD5.stub(:hexdigest).with("ab").and_return("a")
|
96
|
+
|
97
|
+
transform = described_class.new(:columns => [:a, :b])
|
98
|
+
transform.added_fields.should == [:hash]
|
99
|
+
transform.process(:a => 'a', :b => 'b')[:hash].should == "A"
|
100
|
+
end
|
101
|
+
|
102
|
+
it "can add the hash to an arbitrary output field" do
|
103
|
+
Digest::MD5.stub(:hexdigest).with("ab").and_return("a")
|
104
|
+
transform = described_class.new(:columns => [:a, :b],
|
105
|
+
:output_field => :foo)
|
106
|
+
transform.added_fields.should == [:foo]
|
107
|
+
transform.process(:a => 'a', :b => 'b')[:foo].should == "A"
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Chicago::Flow::ArraySink do
|
4
|
+
let(:sink) { described_class.new(:foo) }
|
5
|
+
|
6
|
+
it "has a name" do
|
7
|
+
sink.name.should == :foo
|
8
|
+
end
|
9
|
+
|
10
|
+
it "stores rows in #data" do
|
11
|
+
sink << {:a => 1}
|
12
|
+
sink.data.should == [{:a => 1}]
|
13
|
+
end
|
14
|
+
|
15
|
+
it "merges constant values into the sink row" do
|
16
|
+
sink.set_constant_values(:number => 1).should == sink
|
17
|
+
sink << {:a => 1}
|
18
|
+
sink.data.should == [{:a => 1, :number => 1}]
|
19
|
+
end
|
20
|
+
|
21
|
+
it "can be truncated" do
|
22
|
+
sink << {:a => 1}
|
23
|
+
sink.truncate
|
24
|
+
sink.data.should be_empty
|
25
|
+
end
|
26
|
+
end
|