chicago-etl 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
@@ -0,0 +1,55 @@
1
+ require 'spec_helper'
2
+
3
+ describe Chicago::ETL::DatasetBatchStage do
4
+ let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
5
+ let(:dataset) { mock(:dataset).as_null_object }
6
+ let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
7
+ let(:etl_batch) { stub(:etl_batch) }
8
+
9
+ it "has a name" do
10
+ stage.name.should == :foo
11
+ end
12
+
13
+ it "should set the inserted at time on the default sink" do
14
+ sink = Chicago::Flow::ArraySink.new(:foo)
15
+ pipeline_stage.stub(:sink).with(:default).and_return(sink)
16
+ stage.pipeline_stage.should == pipeline_stage
17
+
18
+ sink.constant_values[:_inserted_at].should_not be_nil
19
+ end
20
+
21
+ it "filters the dataset to the batch" do
22
+ dataset.should_recieve(:filter_to_etl_batch).with(etl_batch)
23
+ stage.source(etl_batch)
24
+ end
25
+
26
+ it "does not filter the dataset if re-extracting" do
27
+ dataset.should_not_recieve(:filter_to_etl_batch)
28
+ stage.source(etl_batch, true)
29
+ end
30
+
31
+ it "can filter via a custom strategy" do
32
+ dataset.should_not_recieve(:filter_to_etl_batch)
33
+
34
+ filter_strategy = lambda {|ds, batch| ds }
35
+ described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
36
+ source(etl_batch)
37
+ end
38
+
39
+ it "executes the pipeline stage using a DatasetSource" do
40
+ pipeline_stage.should_receive(:execute).
41
+ with(kind_of(Chicago::Flow::DatasetSource))
42
+ stage.execute(etl_batch, true)
43
+ end
44
+
45
+ it "truncates any sinks if truncate_pre_load has been set" do
46
+ stage = described_class.new(:foo, dataset, pipeline_stage,
47
+ :truncate_pre_load => true)
48
+
49
+ sink = Chicago::Flow::ArraySink.new(:output)
50
+ sink << {:foo => "foo"}
51
+ pipeline_stage.stub(:sinks).and_return([sink])
52
+ stage.execute(etl_batch)
53
+ sink.data.should == []
54
+ end
55
+ end
@@ -39,8 +39,9 @@ describe Chicago::ETL::KeyBuilder do
39
39
  before :each do
40
40
  @db = stub(:staging_database).as_null_object
41
41
  @db.stub(:[]).and_return(stub(:max => nil, :select_hash => {}))
42
- @writer = stub(:writer).as_null_object
43
- Chicago::ETL::BufferingInsertWriter.stub(:new).and_return(@writer)
42
+ @sink = stub(:sink).as_null_object
43
+ Chicago::ETL::SchemaTableSinkFactory.stub(:new).
44
+ and_return(stub(:factory, :key_sink => @sink))
44
45
  end
45
46
 
46
47
  describe "for identifiable dimensions" do
@@ -50,20 +51,20 @@ describe Chicago::ETL::KeyBuilder do
50
51
 
51
52
  it "returns an incrementing key, given a row" do
52
53
  builder = described_class.for_table(@dimension, @db)
53
- builder.key(:original_id => 2).should == 1
54
- builder.key(:original_id => 3).should == 2
54
+ builder.key(:original_id => 2).first.should == 1
55
+ builder.key(:original_id => 3).first.should == 2
55
56
  end
56
57
 
57
58
  it "returns the same key for the same record" do
58
59
  builder = described_class.for_table(@dimension, @db)
59
- builder.key(:original_id => 2).should == 1
60
- builder.key(:original_id => 2).should == 1
60
+ builder.key(:original_id => 2).first.should == 1
61
+ builder.key(:original_id => 2).first.should == 1
61
62
  end
62
63
 
63
64
  it "takes into account the current maximum key in the database" do
64
65
  @db.stub(:[]).with(:keys_dimension_user).and_return(stub(:max => 2, :select_hash => {}))
65
66
  builder = described_class.for_table(@dimension, @db)
66
- builder.key(:original_id => 1).should == 3
67
+ builder.key(:original_id => 1).first.should == 3
67
68
  end
68
69
 
69
70
  it "returns previously created keys" do
@@ -71,55 +72,14 @@ describe Chicago::ETL::KeyBuilder do
71
72
  @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
72
73
 
73
74
  builder = described_class.for_table(@dimension, @db)
74
- builder.key(:original_id => 30).should == 2
75
- builder.key(:original_id => 40).should == 1
75
+ builder.key(:original_id => 30).first.should == 2
76
+ builder.key(:original_id => 40).first.should == 1
76
77
  end
77
78
 
78
79
  it "raises an error when original_id isn't present in the row" do
79
80
  builder = described_class.for_table(@dimension, @db)
80
81
  expect { builder.key(:foo => :bar) }.to raise_error(Chicago::ETL::KeyError)
81
82
  end
82
-
83
- it "flushes new keys to a key table" do
84
- pending
85
- dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
86
- dataset.stub(:insert_replace => dataset)
87
- @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
88
-
89
- dataset.should_receive(:multi_insert).
90
- with([{:original_id => 30, :dimension_id => 2}])
91
-
92
- builder = described_class.for_table(@dimension, @db)
93
- builder.key(:original_id => 30)
94
- builder.key(:original_id => 40)
95
- builder.flush
96
- end
97
-
98
- it "flushes new keys only once" do
99
- pending
100
- dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
101
- dataset.stub(:insert_replace => dataset)
102
- @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
103
-
104
- dataset.should_receive(:multi_insert).
105
- with([{:original_id => 30, :dimension_id => 2}])
106
- dataset.should_receive(:multi_insert).with([])
107
-
108
- builder = described_class.for_table(@dimension, @db)
109
- builder.key(:original_id => 30)
110
- builder.key(:original_id => 40)
111
- builder.flush
112
- builder.flush
113
- end
114
-
115
- it "replaces old mappings with new values" do
116
- pending
117
- dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1}, :multi_insert => nil)
118
- @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
119
-
120
- dataset.should_receive(:insert_replace).and_return(dataset)
121
- described_class.for_table(@dimension, @db).flush
122
- end
123
83
  end
124
84
 
125
85
  describe "for non-identifiable dimensions with an existing hash" do
@@ -129,25 +89,18 @@ describe Chicago::ETL::KeyBuilder do
129
89
  end
130
90
 
131
91
  it "returns an incrementing key, given a row" do
132
- @builder.key(:hash => "aaa").should == 1
133
- @builder.key(:hash => "aab").should == 2
92
+ @builder.key(:hash => "aaa").first.should == 1
93
+ @builder.key(:hash => "aab").first.should == 2
134
94
  end
135
95
 
136
96
  it "returns the same incrementing key" do
137
- @builder.key(:hash => "aaa").should == 1
138
- @builder.key(:hash => "aaa").should == 1
97
+ @builder.key(:hash => "aaa").first.should == 1
98
+ @builder.key(:hash => "aaa").first.should == 1
139
99
  end
140
100
 
141
101
  it "returns the same incrementing key, ignoring case" do
142
- @builder.key(:hash => "aaa").should == 1
143
- @builder.key(:hash => "AAA").should == 1
144
- end
145
-
146
- it "inserts the hash as a binary literal" do
147
- # Yuck. Don't like the implementation test, but mock
148
- # expectations fail here for some reason, maybe because of the
149
- # Sequel::LiteralString?
150
- @builder.key_for_insert(@builder.original_key(:hash => "aaa")).should == "0xAAA".lit
102
+ @builder.key(:hash => "aaa").first.should == 1
103
+ @builder.key(:hash => "AAA").first.should == 1
151
104
  end
152
105
  end
153
106
 
@@ -158,32 +111,25 @@ describe Chicago::ETL::KeyBuilder do
158
111
 
159
112
  it "returns an incrementing key, given a row" do
160
113
  @builder.key(:line1 => "some street", :post_code => "TW3 X45").
161
- should == 1
114
+ first.should == 1
162
115
  @builder.key(:line1 => "some road", :post_code => "TW3 X45").
163
- should == 2
116
+ first.should == 2
164
117
  end
165
118
 
166
119
  it "returns the same incrementing key, ignoring case" do
167
120
  @builder.key(:line1 => "some street", :post_code => "TW3 X45").
168
- should == 1
121
+ first.should == 1
169
122
  @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
170
- should == 1
123
+ first.should == 1
171
124
  end
172
125
 
173
126
  it "can override default hash preparation" do
174
127
  @builder.hash_preparation = lambda {|c| c }
175
128
 
176
129
  @builder.key(:line1 => "some street", :post_code => "TW3 X45").
177
- should == 1
130
+ first.should == 1
178
131
  @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
179
- should == 2
180
- end
181
-
182
- it "inserts the hash as a binary literal" do
183
- # Yuck. Don't like the implementation test, but mock
184
- # expectations fail here for some reason, maybe because of the
185
- # Sequel::LiteralString?
186
- @builder.key_for_insert(@builder.original_key(:line1 => "some street", :post_code => "TW3 X45")).should == "0x817860F2417EB83D81FEA9D82E6B213A".lit
132
+ first.should == 2
187
133
  end
188
134
 
189
135
  it "selects the Hex version of the binary column for the cache" do
@@ -210,18 +156,14 @@ describe Chicago::ETL::KeyBuilder do
210
156
  end
211
157
 
212
158
  it "increments the id, regardless of row equality" do
213
- @builder.key({}).should == 1
214
- @builder.key({}).should == 2
159
+ @builder.key({}).first.should == 1
160
+ @builder.key({}).first.should == 2
215
161
  end
216
162
 
217
163
  it "increments from the last id stored id in the fact table" do
218
164
  @db.stub(:[]).with(:facts_addresses).and_return(stub(:max => 100, :select_hash => {}))
219
165
  @builder = described_class.for_table(@schema.fact(:addresses), @db)
220
- @builder.key({}).should == 101
221
- end
222
-
223
- it "supports the flush interface as a no-op" do
224
- lambda { @builder.flush }.should_not raise_error
166
+ @builder.key({}).first.should == 101
225
167
  end
226
168
  end
227
169
  end
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+
3
+ describe Chicago::ETL::LoadPipelineStageBuilder do
4
+ let(:dimension) { stub(:dimension).as_null_object }
5
+ let(:db) { stub(:db).as_null_object }
6
+ let(:sink_factory) { stub(:sink_factory).as_null_object }
7
+
8
+ before(:each) {
9
+ Chicago::ETL::SchemaTableSinkFactory.stub(:new).and_return(sink_factory)
10
+ }
11
+
12
+ it "should exclude columns from the sink" do
13
+ sink_factory.should_receive(:sink).
14
+ with(:ignore => false, :exclude => [:foo]).
15
+ and_return(stub(:sink).as_null_object)
16
+
17
+ described_class.new(db, dimension).build do
18
+ load_separately :foo
19
+ end
20
+ end
21
+
22
+ it "can specify rows are not going to be replaced" do
23
+ sink_factory.should_receive(:sink).
24
+ with(:ignore => true, :exclude => []).
25
+ and_return(stub(:sink).as_null_object)
26
+
27
+ described_class.new(db, dimension).build do
28
+ ignore_present_rows
29
+ end
30
+ end
31
+
32
+ it "can add key mappings" do
33
+ stage = described_class.new(db, dimension).build do
34
+ key_mapping :bar, :original_id
35
+ end
36
+
37
+ stage.sink(:bar).should_not be_nil
38
+ end
39
+ end
@@ -0,0 +1,69 @@
1
+ require 'spec_helper'
2
+
3
+ describe Chicago::ETL::SchemaTableSinkFactory do
4
+ let(:db) { stub(:db) }
5
+
6
+ let(:dimension) {
7
+ Chicago::Schema::Builders::DimensionBuilder.new(stub(:schema)).build(:foo) do
8
+ columns do
9
+ string :bar
10
+ integer :baz
11
+ end
12
+ end
13
+ }
14
+
15
+ let(:sink_class) { Chicago::Flow::MysqlFileSink }
16
+
17
+ it "builds a MysqlFileSink" do
18
+ sink_class.should_receive(:new).
19
+ with(db, :dimension_foo, [:id, :bar, :baz], {})
20
+
21
+ described_class.new(db, dimension).sink
22
+ end
23
+
24
+ it "allows rows to be ignored instead of replaced" do
25
+ sink_class.should_receive(:new).
26
+ with(db, :dimension_foo, [:id, :bar, :baz], {:ignore => true})
27
+
28
+ described_class.new(db, dimension).sink(:ignore => true)
29
+ end
30
+
31
+ it "allows an explicit filepath to be specified" do
32
+ sink_class.should_receive(:new).
33
+ with(db, :dimension_foo, [:id, :bar, :baz], {:filepath => "foo"})
34
+
35
+ described_class.new(db, dimension).sink(:filepath => "foo")
36
+ end
37
+
38
+ it "can exclude columns from a dimension" do
39
+ sink_class.should_receive(:new).
40
+ with(db, :dimension_foo, [:id, :bar], {})
41
+
42
+ described_class.new(db, dimension).sink(:exclude => :baz)
43
+ end
44
+
45
+ it "builds the key table sink" do
46
+ sink = stub(:sink).as_null_object
47
+ sink_class.should_receive(:new).
48
+ with(db, :keys_dimension_foo, [:original_id, :dimension_id], {}).
49
+ and_return(sink)
50
+
51
+ described_class.new(db, dimension).key_sink()
52
+ end
53
+
54
+ it "builds other explicit key table sinks" do
55
+ sink = stub(:sink).as_null_object
56
+ sink_class.should_receive(:new).
57
+ with(db, :keys_foo, [:original_id, :dimension_id], {}).
58
+ and_return(sink)
59
+
60
+ described_class.new(db, dimension).key_sink(:table => :keys_foo)
61
+ end
62
+
63
+ it "builds an error sink" do
64
+ sink_class.should_receive(:new).
65
+ with(db, :etl_error_log, [:column, :row_id, :error, :severity, :error_detail], {}).and_return(stub.as_null_object)
66
+
67
+ described_class.new(db, dimension).error_sink
68
+ end
69
+ end
@@ -5,23 +5,22 @@ describe Chicago::ETL::Screens::InvalidElement do
5
5
  Chicago::Schema::Column.new(:enum, :string, :elements => ["Foo", "Unknown"], :default => "Unknown", :optional => true)
6
6
  }
7
7
 
8
+ let(:transformation) {
9
+ described_class.new(:column => enum_col)
10
+ }
11
+
8
12
  it "has a severity of 3" do
9
- described_class.new(:dimension_foo, enum_col).severity.should == 3
13
+ transformation.severity.should == 3
10
14
  end
11
15
 
12
16
  it "reports invalid element for enum columns" do
13
- row, errors = described_class.new(:dimension_foo, enum_col).
14
- call({:enum => "Bar"})
15
- row.should == {:enum => 'Unknown'}
16
-
17
- errors.first[:error].should == "Invalid Element"
17
+ row = transformation.process_row({:enum => "Bar"})
18
+
19
+ row[:enum].should == 'Unknown'
20
+ row[:_errors].first[:error].should == "Invalid Element"
18
21
  end
19
22
 
20
23
  it "does not report a valid element" do
21
- row, errors = described_class.new(:dimension_foo, enum_col).
22
- call({:enum => "foo"})
23
- row.should == {:enum => 'foo'}
24
-
25
- errors.should be_empty
24
+ transformation.process_row({:enum => "foo"}).should == {:enum => 'foo'}
26
25
  end
27
26
  end
@@ -14,45 +14,45 @@ describe Chicago::ETL::Screens::MissingValue do
14
14
  }
15
15
 
16
16
  it "reports nil in an expected column as a missing value, with severity 2" do
17
- row, errors = described_class.new(:dimension_foo, string_col).call({})
18
-
19
- errors.first[:table].should == "dimension_foo"
20
- errors.first[:column].should == "str"
21
- errors.first[:error].should == "Missing Value"
22
- errors.first[:severity].should == 2
17
+ row = described_class.new(:column => string_col).process_row({})
18
+
19
+ error = row[:_errors].first
20
+ error[:column].should == "str"
21
+ error[:error].should == "Missing Value"
22
+ error[:severity].should == 2
23
23
  end
24
24
 
25
25
  it "reports an empty string value in an expected column as a missing value" do
26
- row, errors = described_class.new(:dimension_foo, string_col).
27
- call({:str => " "})
26
+ row = described_class.
27
+ new(:column => string_col).
28
+ process_row({:str => " "})
28
29
 
29
- errors.first[:error].should == "Missing Value"
30
+ row[:_errors].should_not be_nil
30
31
  end
31
32
 
32
33
  it "does not report 0 as a missing value" do
33
- row, errors = described_class.new(:dimension_foo, int_col).
34
- call({:int => 0})
35
-
36
- errors.should be_empty
34
+ row = described_class.new(:column => int_col).
35
+ process_row({:int => 0})
36
+ row[:_errors].should be_nil
37
37
  end
38
38
 
39
39
  it "reports missing values with severity 1 if the column is descriptive" do
40
- row, errors = described_class.new(:dimension_foo, descriptive_col).call({})
41
- errors.first[:severity].should == 1
40
+ row = described_class.new(:column => descriptive_col).process_row({})
41
+ row[:_errors].last[:severity].should == 1
42
42
  end
43
43
 
44
44
  it "does not report boolean values as missing" do
45
- row, errors = described_class.new(:dimension_foo, bool_col).call({})
46
- errors.should be_empty
45
+ row = described_class.new(:column => bool_col).process_row({})
46
+ row[:_errors].should be_nil
47
47
  end
48
48
 
49
49
  it "does not report optional columns as missing values" do
50
- row, errors = described_class.new(:dimension_foo, optional_col).call({})
51
- errors.should be_empty
50
+ row = described_class.new(:column => optional_col).process_row({})
51
+ row[:_errors].should be_nil
52
52
  end
53
53
 
54
54
  it "fills in a default value for missing values" do
55
- row, errors = described_class.new(:dimension_foo, optional_col).call({})
56
- row.should == {:str => ''}
55
+ row = described_class.new(:column => optional_col).process_row({})
56
+ row[:str].should == ''
57
57
  end
58
58
  end
@@ -9,56 +9,48 @@ describe Chicago::ETL::Screens::OutOfBounds do
9
9
  Chicago::Schema::Column.new(:str, :string, :min => 2, :max => 5)
10
10
  }
11
11
 
12
+ let(:int_transformation) {
13
+ described_class.new(:column => int_col)
14
+ }
15
+
16
+ let(:str_transformation) {
17
+ described_class.new(:column => str_col)
18
+ }
19
+
12
20
  it "applies to numeric columns when the value is lower than the minimum" do
13
- row, errors = described_class.new(:dimension_foo, int_col).
14
- call(:int => -1)
15
-
16
- errors.first[:error].should == "Out Of Bounds"
21
+ row = int_transformation.process_row(:int => -1)
22
+ row[:_errors].first[:error].should == "Out Of Bounds"
17
23
  end
18
24
 
19
25
  it "applies to numeric columns when the value is above the minimum" do
20
- row, errors = described_class.new(:dimension_foo, int_col).
21
- call(:int => 101)
22
-
23
- errors.first[:error].should == "Out Of Bounds"
26
+ row = int_transformation.process_row(:int => 101)
27
+ row[:_errors].first[:error].should == "Out Of Bounds"
24
28
  end
25
29
 
26
30
  it "applies to string columns when the number of chars is below minimum" do
27
- row, errors = described_class.new(:dimension_foo, str_col).
28
- call(:str => "a")
29
-
30
- errors.first[:error].should == "Out Of Bounds"
31
+ row = str_transformation.process_row(:str => "a")
32
+ row[:_errors].first[:error].should == "Out Of Bounds"
31
33
  end
32
34
 
33
35
  it "applies to string columns when the number of chars is above maximum" do
34
- row, errors = described_class.new(:dimension_foo, str_col).
35
- call(:str => "abcdef")
36
-
37
- errors.first[:error].should == "Out Of Bounds"
36
+ row = str_transformation.process_row(:str => "abcdef")
37
+ row[:_errors].first[:error].should == "Out Of Bounds"
38
38
  end
39
39
 
40
40
  it "does not apply to string values in range" do
41
- row, errors = described_class.new(:dimension_foo, str_col).
42
- call(:str => "abcde")
43
-
44
- errors.should be_empty
41
+ str_transformation.process_row(:str => "abcde").
42
+ should_not have_key(:_errors)
45
43
  end
46
44
 
47
45
  it "does not apply to numeric values in range" do
48
- row, errors = described_class.new(:dimension_foo, int_col).
49
- call(:int => 0)
50
-
51
- errors.should be_empty
46
+ int_transformation.process_row(:int => 0).should_not have_key(:_errors)
52
47
  end
53
48
 
54
49
  it "has severity 2" do
55
- described_class.new(:dimension_foo, int_col).severity.should == 2
50
+ int_transformation.severity.should == 2
56
51
  end
57
52
 
58
53
  it "does not replace values with default" do
59
- row, errors = described_class.new(:dimension_foo, str_col).
60
- call(:str => "a")
61
-
62
- row.should == {:str => "a"}
54
+ str_transformation.process_row(:str => "a")[:str].should == "a"
63
55
  end
64
56
  end
@@ -0,0 +1,109 @@
1
+ require 'spec_helper'
2
+
3
+ describe Chicago::ETL::Transformations::DemultiplexErrors do
4
+ it "declares it adds things to the error stream" do
5
+ subject.output_streams.should include(:error)
6
+ end
7
+
8
+ it "does nothing to a row without an :_errors key" do
9
+ subject.process({}).should == [{}]
10
+ end
11
+
12
+ it "removes the :_errors key from the row" do
13
+ subject.process(:_errors => [{:error => 1}]).first.should == {}
14
+ end
15
+
16
+ it "adds the errors onto the error stream" do
17
+ subject.process(:_errors => [{:error => 1}]).last.should == {
18
+ :error => 1,
19
+ Chicago::Flow::STREAM => :error
20
+ }
21
+ end
22
+ end
23
+
24
+ describe Chicago::ETL::Transformations::WrittenRowFilter do
25
+ it "only lets the first row through" do
26
+ filter = described_class.new(:key => :id)
27
+ filter.process(:id => 1).should == {:id => 1}
28
+ filter.process(:id => 2).should == {:id => 2}
29
+ filter.process(:id => 1).should be_nil
30
+ end
31
+
32
+ it "requires a key option" do
33
+ described_class.required_options.should include(:key)
34
+ end
35
+ end
36
+
37
+ describe Chicago::ETL::Transformations::AddKey do
38
+ let(:key_builder) { stub(:key_builder, :key => 42) }
39
+ let(:transform) { described_class.new(:key_builder => key_builder) }
40
+
41
+ it "requires a key builder" do
42
+ described_class.required_options.should include(:key_builder)
43
+ end
44
+
45
+ it "adds the key to the row" do
46
+ transform.process({}).should == {:id => 42}
47
+ end
48
+
49
+ it "adds the key to any rows in an embedded :_errors key" do
50
+ transform.process({:_errors => [{}]}).
51
+ should == {:id => 42, :_errors => [{:row_id => 42}]}
52
+ end
53
+
54
+ it "should declare that it adds the :id field" do
55
+ transform.added_fields.should == [:id]
56
+ end
57
+
58
+ it "should declare that it writes to the dimension_key stream" do
59
+ transform.output_streams.should include(:dimension_key)
60
+ end
61
+
62
+ it "should return a new row on the dimension_key stream" do
63
+ key_builder.stub(:key => [42, {:original_id => 42}])
64
+ transform.process({}).last.should == {:_stream => :dimension_key, :original_id => 42}
65
+ end
66
+ end
67
+
68
+ describe Chicago::ETL::Transformations::DimensionKeyMapping do
69
+ let(:transform) {
70
+ described_class.new(:original_key => :original_id,
71
+ :key_table => :keys_foo)
72
+ }
73
+
74
+ it "should require an original_key and a key table" do
75
+ described_class.required_options.should == [:original_key, :key_table]
76
+ end
77
+
78
+ it "removes the key from the stream" do
79
+ transform.process({:original_id => 1}).first.should == {}
80
+ transform.removed_fields.should == [:original_id]
81
+ end
82
+
83
+ it "links the original key with the id on the stream" do
84
+ transform.process({:original_id => 1, :id => 2}).last.
85
+ should == {:_stream => :keys_foo, :original_id => 1, :dimension_id => 2}
86
+ end
87
+ end
88
+
89
+ describe Chicago::ETL::Transformations::HashColumns do
90
+ it "requires a columns option" do
91
+ described_class.required_options.should include(:columns)
92
+ end
93
+
94
+ it "adds a hash field to the row" do
95
+ Digest::MD5.stub(:hexdigest).with("ab").and_return("a")
96
+
97
+ transform = described_class.new(:columns => [:a, :b])
98
+ transform.added_fields.should == [:hash]
99
+ transform.process(:a => 'a', :b => 'b')[:hash].should == "A"
100
+ end
101
+
102
+ it "can add the hash to an arbitrary output field" do
103
+ Digest::MD5.stub(:hexdigest).with("ab").and_return("a")
104
+ transform = described_class.new(:columns => [:a, :b],
105
+ :output_field => :foo)
106
+ transform.added_fields.should == [:foo]
107
+ transform.process(:a => 'a', :b => 'b')[:foo].should == "A"
108
+ end
109
+ end
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+
3
+ describe Chicago::Flow::ArraySink do
4
+ let(:sink) { described_class.new(:foo) }
5
+
6
+ it "has a name" do
7
+ sink.name.should == :foo
8
+ end
9
+
10
+ it "stores rows in #data" do
11
+ sink << {:a => 1}
12
+ sink.data.should == [{:a => 1}]
13
+ end
14
+
15
+ it "merges constant values into the sink row" do
16
+ sink.set_constant_values(:number => 1).should == sink
17
+ sink << {:a => 1}
18
+ sink.data.should == [{:a => 1, :number => 1}]
19
+ end
20
+
21
+ it "can be truncated" do
22
+ sink << {:a => 1}
23
+ sink.truncate
24
+ sink.data.should be_empty
25
+ end
26
+ end