chicago-etl 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/spec/etl/dataset_batch_stage_spec.rb
@@ -0,0 +1,55 @@
+ require 'spec_helper'
+
+ describe Chicago::ETL::DatasetBatchStage do
+   let(:pipeline_stage) { mock(:pipeline_stage).as_null_object }
+   let(:dataset) { mock(:dataset).as_null_object }
+   let(:stage) { described_class.new(:foo, dataset, pipeline_stage) }
+   let(:etl_batch) { stub(:etl_batch) }
+
+   it "has a name" do
+     stage.name.should == :foo
+   end
+
+   it "should set the inserted at time on the default sink" do
+     sink = Chicago::Flow::ArraySink.new(:foo)
+     pipeline_stage.stub(:sink).with(:default).and_return(sink)
+     stage.pipeline_stage.should == pipeline_stage
+
+     sink.constant_values[:_inserted_at].should_not be_nil
+   end
+
+   it "filters the dataset to the batch" do
+     dataset.should_receive(:filter_to_etl_batch).with(etl_batch)
+     stage.source(etl_batch)
+   end
+
+   it "does not filter the dataset if re-extracting" do
+     dataset.should_not_receive(:filter_to_etl_batch)
+     stage.source(etl_batch, true)
+   end
+
+   it "can filter via a custom strategy" do
+     dataset.should_not_receive(:filter_to_etl_batch)
+
+     filter_strategy = lambda {|ds, batch| ds }
+     described_class.new(:foo, dataset, pipeline_stage, :filter_strategy => filter_strategy).
+       source(etl_batch)
+   end
+
+   it "executes the pipeline stage using a DatasetSource" do
+     pipeline_stage.should_receive(:execute).
+       with(kind_of(Chicago::Flow::DatasetSource))
+     stage.execute(etl_batch, true)
+   end
+
+   it "truncates any sinks if truncate_pre_load has been set" do
+     stage = described_class.new(:foo, dataset, pipeline_stage,
+                                 :truncate_pre_load => true)
+
+     sink = Chicago::Flow::ArraySink.new(:output)
+     sink << {:foo => "foo"}
+     pipeline_stage.stub(:sinks).and_return([sink])
+     stage.execute(etl_batch)
+     sink.data.should == []
+   end
+ end
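Read together, these expectations pin down the new DatasetBatchStage contract: it names a stage, wraps a Chicago::Flow pipeline stage around a Sequel dataset, filters that dataset to the current ETL batch unless re-extracting or a custom :filter_strategy is supplied, stamps :_inserted_at on the default sink, and can truncate its sinks before loading. A usage sketch inferred only from these expectations (the dataset, pipeline_stage and etl_batch objects are assumed to come from the surrounding ETL code):

# Sketch: the option names are those exercised in the spec above.
stage = Chicago::ETL::DatasetBatchStage.new(:load_users, dataset, pipeline_stage,
                                            :truncate_pre_load => true,
                                            :filter_strategy => lambda {|ds, batch| ds })

stage.source(etl_batch)        # dataset filtered to the batch (or via the custom strategy)
stage.execute(etl_batch)       # runs the pipeline stage over a Chicago::Flow::DatasetSource
stage.execute(etl_batch, true) # re-extract: skips the batch filter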
data/spec/etl/key_builder_spec.rb
@@ -39,8 +39,9 @@ describe Chicago::ETL::KeyBuilder do
    before :each do
      @db = stub(:staging_database).as_null_object
      @db.stub(:[]).and_return(stub(:max => nil, :select_hash => {}))
-     @writer = stub(:writer).as_null_object
-     Chicago::ETL::BufferingInsertWriter.stub(:new).and_return(@writer)
+     @sink = stub(:sink).as_null_object
+     Chicago::ETL::SchemaTableSinkFactory.stub(:new).
+       and_return(stub(:factory, :key_sink => @sink))
    end

    describe "for identifiable dimensions" do
@@ -50,20 +51,20 @@ describe Chicago::ETL::KeyBuilder do

      it "returns an incrementing key, given a row" do
        builder = described_class.for_table(@dimension, @db)
-       builder.key(:original_id => 2).should == 1
-       builder.key(:original_id => 3).should == 2
+       builder.key(:original_id => 2).first.should == 1
+       builder.key(:original_id => 3).first.should == 2
      end

      it "returns the same key for the same record" do
        builder = described_class.for_table(@dimension, @db)
-       builder.key(:original_id => 2).should == 1
-       builder.key(:original_id => 2).should == 1
+       builder.key(:original_id => 2).first.should == 1
+       builder.key(:original_id => 2).first.should == 1
      end

      it "takes into account the current maximum key in the database" do
        @db.stub(:[]).with(:keys_dimension_user).and_return(stub(:max => 2, :select_hash => {}))
        builder = described_class.for_table(@dimension, @db)
-       builder.key(:original_id => 1).should == 3
+       builder.key(:original_id => 1).first.should == 3
      end

      it "returns previously created keys" do
@@ -71,55 +72,14 @@ describe Chicago::ETL::KeyBuilder do
        @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)

        builder = described_class.for_table(@dimension, @db)
-       builder.key(:original_id => 30).should == 2
-       builder.key(:original_id => 40).should == 1
+       builder.key(:original_id => 30).first.should == 2
+       builder.key(:original_id => 40).first.should == 1
      end

      it "raises an error when original_id isn't present in the row" do
        builder = described_class.for_table(@dimension, @db)
        expect { builder.key(:foo => :bar) }.to raise_error(Chicago::ETL::KeyError)
      end
-
-     it "flushes new keys to a key table" do
-       pending
-       dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
-       dataset.stub(:insert_replace => dataset)
-       @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
-
-       dataset.should_receive(:multi_insert).
-         with([{:original_id => 30, :dimension_id => 2}])
-
-       builder = described_class.for_table(@dimension, @db)
-       builder.key(:original_id => 30)
-       builder.key(:original_id => 40)
-       builder.flush
-     end
-
-     it "flushes new keys only once" do
-       pending
-       dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1})
-       dataset.stub(:insert_replace => dataset)
-       @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
-
-       dataset.should_receive(:multi_insert).
-         with([{:original_id => 30, :dimension_id => 2}])
-       dataset.should_receive(:multi_insert).with([])
-
-       builder = described_class.for_table(@dimension, @db)
-       builder.key(:original_id => 30)
-       builder.key(:original_id => 40)
-       builder.flush
-       builder.flush
-     end
-
-     it "replaces old mappings with new values" do
-       pending
-       dataset = stub(:dataset, :max => 1, :select_hash => {40 => 1}, :multi_insert => nil)
-       @db.stub(:[]).with(:keys_dimension_user).and_return(dataset)
-
-       dataset.should_receive(:insert_replace).and_return(dataset)
-       described_class.for_table(@dimension, @db).flush
-     end
    end

    describe "for non-identifiable dimensions with an existing hash" do
@@ -129,25 +89,18 @@ describe Chicago::ETL::KeyBuilder do
      end

      it "returns an incrementing key, given a row" do
-       @builder.key(:hash => "aaa").should == 1
-       @builder.key(:hash => "aab").should == 2
+       @builder.key(:hash => "aaa").first.should == 1
+       @builder.key(:hash => "aab").first.should == 2
      end

      it "returns the same incrementing key" do
-       @builder.key(:hash => "aaa").should == 1
-       @builder.key(:hash => "aaa").should == 1
+       @builder.key(:hash => "aaa").first.should == 1
+       @builder.key(:hash => "aaa").first.should == 1
      end

      it "returns the same incrementing key, ignoring case" do
-       @builder.key(:hash => "aaa").should == 1
-       @builder.key(:hash => "AAA").should == 1
-     end
-
-     it "inserts the hash as a binary literal" do
-       # Yuck. Don't like the implementation test, but mock
-       # expectations fail here for some reason, maybe because of the
-       # Sequel::LiteralString?
-       @builder.key_for_insert(@builder.original_key(:hash => "aaa")).should == "0xAAA".lit
+       @builder.key(:hash => "aaa").first.should == 1
+       @builder.key(:hash => "AAA").first.should == 1
      end
    end

@@ -158,32 +111,25 @@

      it "returns an incrementing key, given a row" do
        @builder.key(:line1 => "some street", :post_code => "TW3 X45").
-         should == 1
+         first.should == 1
        @builder.key(:line1 => "some road", :post_code => "TW3 X45").
-         should == 2
+         first.should == 2
      end

      it "returns the same incrementing key, ignoring case" do
        @builder.key(:line1 => "some street", :post_code => "TW3 X45").
-         should == 1
+         first.should == 1
        @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
-         should == 1
+         first.should == 1
      end

      it "can override default hash preparation" do
        @builder.hash_preparation = lambda {|c| c }

        @builder.key(:line1 => "some street", :post_code => "TW3 X45").
-         should == 1
+         first.should == 1
        @builder.key(:line1 => "some STREET", :post_code => "TW3 X45").
-         should == 2
-     end
-
-     it "inserts the hash as a binary literal" do
-       # Yuck. Don't like the implementation test, but mock
-       # expectations fail here for some reason, maybe because of the
-       # Sequel::LiteralString?
-       @builder.key_for_insert(@builder.original_key(:line1 => "some street", :post_code => "TW3 X45")).should == "0x817860F2417EB83D81FEA9D82E6B213A".lit
+         first.should == 2
      end

      it "selects the Hex version of the binary column for the cache" do
@@ -210,18 +156,14 @@
      end

      it "increments the id, regardless of row equality" do
-       @builder.key({}).should == 1
-       @builder.key({}).should == 2
+       @builder.key({}).first.should == 1
+       @builder.key({}).first.should == 2
      end

      it "increments from the last id stored id in the fact table" do
        @db.stub(:[]).with(:facts_addresses).and_return(stub(:max => 100, :select_hash => {}))
        @builder = described_class.for_table(@schema.fact(:addresses), @db)
-       @builder.key({}).should == 101
-     end
-
-     it "supports the flush interface as a no-op" do
-       lambda { @builder.flush }.should_not raise_error
+       @builder.key({}).first.should == 101
      end
    end
  end
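The common thread in these KeyBuilder changes is the return value of #key: it is no longer a bare integer but a pair, whose first element is the surrogate key (hence the .first in every assertion) and whose second element is a row destined for the key table (the AddKey spec further down stubs it as [42, {:original_id => 42}]). The explicit #flush/BufferingInsertWriter path is gone; key rows now travel through sinks built by SchemaTableSinkFactory. A hedged before/after sketch (dimension and staging_db are assumed objects):

builder = Chicago::ETL::KeyBuilder.for_table(dimension, staging_db)

# 0.0.13: key = builder.key(:original_id => 2)  # => 1, written out later via builder.flush
# 0.1.0:
key, key_row = builder.key(:original_id => 2)
key      # => 1
key_row  # row for the keys_* table (exact shape not shown in these hunks)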
data/spec/etl/pipeline_stage_builder_spec.rb
@@ -0,0 +1,39 @@
+ require 'spec_helper'
+
+ describe Chicago::ETL::LoadPipelineStageBuilder do
+   let(:dimension) { stub(:dimension).as_null_object }
+   let(:db) { stub(:db).as_null_object }
+   let(:sink_factory) { stub(:sink_factory).as_null_object }
+
+   before(:each) {
+     Chicago::ETL::SchemaTableSinkFactory.stub(:new).and_return(sink_factory)
+   }
+
+   it "should exclude columns from the sink" do
+     sink_factory.should_receive(:sink).
+       with(:ignore => false, :exclude => [:foo]).
+       and_return(stub(:sink).as_null_object)
+
+     described_class.new(db, dimension).build do
+       load_separately :foo
+     end
+   end
+
+   it "can specify rows are not going to be replaced" do
+     sink_factory.should_receive(:sink).
+       with(:ignore => true, :exclude => []).
+       and_return(stub(:sink).as_null_object)
+
+     described_class.new(db, dimension).build do
+       ignore_present_rows
+     end
+   end
+
+   it "can add key mappings" do
+     stage = described_class.new(db, dimension).build do
+       key_mapping :bar, :original_id
+     end
+
+     stage.sink(:bar).should_not be_nil
+   end
+ end
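LoadPipelineStageBuilder gives dimension loads a small DSL; the three directives exercised above cover excluding columns from the main sink, switching the sink to ignore (rather than replace) existing rows, and registering key mappings that become extra sinks on the built stage. A sketch of how they might be combined (db, dimension and the column names are placeholders):

stage = Chicago::ETL::LoadPipelineStageBuilder.new(db, dimension).build do
  load_separately :bio             # excluded from the main sink's columns
  ignore_present_rows              # sink built with :ignore => true instead of replacing rows
  key_mapping :user, :original_id  # adds a key sink, reachable as stage.sink(:user)
end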
data/spec/etl/schema_table_sink_factory_spec.rb
@@ -0,0 +1,69 @@
+ require 'spec_helper'
+
+ describe Chicago::ETL::SchemaTableSinkFactory do
+   let(:db) { stub(:db) }
+
+   let(:dimension) {
+     Chicago::Schema::Builders::DimensionBuilder.new(stub(:schema)).build(:foo) do
+       columns do
+         string :bar
+         integer :baz
+       end
+     end
+   }
+
+   let(:sink_class) { Chicago::Flow::MysqlFileSink }
+
+   it "builds a MysqlFileSink" do
+     sink_class.should_receive(:new).
+       with(db, :dimension_foo, [:id, :bar, :baz], {})
+
+     described_class.new(db, dimension).sink
+   end
+
+   it "allows rows to be ignored instead of replaced" do
+     sink_class.should_receive(:new).
+       with(db, :dimension_foo, [:id, :bar, :baz], {:ignore => true})
+
+     described_class.new(db, dimension).sink(:ignore => true)
+   end
+
+   it "allows an explicit filepath to be specified" do
+     sink_class.should_receive(:new).
+       with(db, :dimension_foo, [:id, :bar, :baz], {:filepath => "foo"})
+
+     described_class.new(db, dimension).sink(:filepath => "foo")
+   end
+
+   it "can exclude columns from a dimension" do
+     sink_class.should_receive(:new).
+       with(db, :dimension_foo, [:id, :bar], {})
+
+     described_class.new(db, dimension).sink(:exclude => :baz)
+   end
+
+   it "builds the key table sink" do
+     sink = stub(:sink).as_null_object
+     sink_class.should_receive(:new).
+       with(db, :keys_dimension_foo, [:original_id, :dimension_id], {}).
+       and_return(sink)
+
+     described_class.new(db, dimension).key_sink()
+   end
+
+   it "builds other explicit key table sinks" do
+     sink = stub(:sink).as_null_object
+     sink_class.should_receive(:new).
+       with(db, :keys_foo, [:original_id, :dimension_id], {}).
+       and_return(sink)
+
+     described_class.new(db, dimension).key_sink(:table => :keys_foo)
+   end
+
+   it "builds an error sink" do
+     sink_class.should_receive(:new).
+       with(db, :etl_error_log, [:column, :row_id, :error, :severity, :error_detail], {}).and_return(stub.as_null_object)
+
+     described_class.new(db, dimension).error_sink
+   end
+ end
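SchemaTableSinkFactory centralises construction of the MysqlFileSink instances a load needs: the main dimension or fact sink, key-table sinks and the shared etl_error_log sink. A usage sketch built from the calls stubbed above (db and dimension are assumed; the filepath is illustrative):

factory = Chicago::ETL::SchemaTableSinkFactory.new(db, dimension)

factory.sink                                    # dimension_foo, all columns
factory.sink(:exclude => :baz, :ignore => true) # drop a column; don't replace existing rows
factory.sink(:filepath => "/tmp/dimension_foo") # write the load file to an explicit path
factory.key_sink                                # keys_dimension_foo (original_id, dimension_id)
factory.key_sink(:table => :keys_foo)           # an explicitly named key table
factory.error_sink                              # etl_error_log (column, row_id, error, severity, error_detail)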
data/spec/etl/screens/invalid_element_spec.rb
@@ -5,23 +5,22 @@ describe Chicago::ETL::Screens::InvalidElement do
      Chicago::Schema::Column.new(:enum, :string, :elements => ["Foo", "Unknown"], :default => "Unknown", :optional => true)
    }

+   let(:transformation) {
+     described_class.new(:column => enum_col)
+   }
+
    it "has a severity of 3" do
-     described_class.new(:dimension_foo, enum_col).severity.should == 3
+     transformation.severity.should == 3
    end

    it "reports invalid element for enum columns" do
-     row, errors = described_class.new(:dimension_foo, enum_col).
-       call({:enum => "Bar"})
-     row.should == {:enum => 'Unknown'}
-
-     errors.first[:error].should == "Invalid Element"
+     row = transformation.process_row({:enum => "Bar"})
+
+     row[:enum].should == 'Unknown'
+     row[:_errors].first[:error].should == "Invalid Element"
    end

    it "does not report a valid element" do
-     row, errors = described_class.new(:dimension_foo, enum_col).
-       call({:enum => "foo"})
-     row.should == {:enum => 'foo'}
-
-     errors.should be_empty
+     transformation.process_row({:enum => "foo"}).should == {:enum => 'foo'}
    end
  end
data/spec/etl/screens/missing_value_spec.rb
@@ -14,45 +14,45 @@ describe Chicago::ETL::Screens::MissingValue do
    }

    it "reports nil in an expected column as a missing value, with severity 2" do
-     row, errors = described_class.new(:dimension_foo, string_col).call({})
-
-     errors.first[:table].should == "dimension_foo"
-     errors.first[:column].should == "str"
-     errors.first[:error].should == "Missing Value"
-     errors.first[:severity].should == 2
+     row = described_class.new(:column => string_col).process_row({})
+
+     error = row[:_errors].first
+     error[:column].should == "str"
+     error[:error].should == "Missing Value"
+     error[:severity].should == 2
    end

    it "reports an empty string value in an expected column as a missing value" do
-     row, errors = described_class.new(:dimension_foo, string_col).
-       call({:str => " "})
+     row = described_class.
+       new(:column => string_col).
+       process_row({:str => " "})

-     errors.first[:error].should == "Missing Value"
+     row[:_errors].should_not be_nil
    end

    it "does not report 0 as a missing value" do
-     row, errors = described_class.new(:dimension_foo, int_col).
-       call({:int => 0})
-
-     errors.should be_empty
+     row = described_class.new(:column => int_col).
+       process_row({:int => 0})
+     row[:_errors].should be_nil
    end

    it "reports missing values with severity 1 if the column is descriptive" do
-     row, errors = described_class.new(:dimension_foo, descriptive_col).call({})
-     errors.first[:severity].should == 1
+     row = described_class.new(:column => descriptive_col).process_row({})
+     row[:_errors].last[:severity].should == 1
    end

    it "does not report boolean values as missing" do
-     row, errors = described_class.new(:dimension_foo, bool_col).call({})
-     errors.should be_empty
+     row = described_class.new(:column => bool_col).process_row({})
+     row[:_errors].should be_nil
    end

    it "does not report optional columns as missing values" do
-     row, errors = described_class.new(:dimension_foo, optional_col).call({})
-     errors.should be_empty
+     row = described_class.new(:column => optional_col).process_row({})
+     row[:_errors].should be_nil
    end

    it "fills in a default value for missing values" do
-     row, errors = described_class.new(:dimension_foo, optional_col).call({})
-     row.should == {:str => ''}
+     row = described_class.new(:column => optional_col).process_row({})
+     row[:str].should == ''
    end
  end
data/spec/etl/screens/out_of_bounds_spec.rb
@@ -9,56 +9,48 @@ describe Chicago::ETL::Screens::OutOfBounds do
      Chicago::Schema::Column.new(:str, :string, :min => 2, :max => 5)
    }

+   let(:int_transformation) {
+     described_class.new(:column => int_col)
+   }
+
+   let(:str_transformation) {
+     described_class.new(:column => str_col)
+   }
+
    it "applies to numeric columns when the value is lower than the minimum" do
-     row, errors = described_class.new(:dimension_foo, int_col).
-       call(:int => -1)
-
-     errors.first[:error].should == "Out Of Bounds"
+     row = int_transformation.process_row(:int => -1)
+     row[:_errors].first[:error].should == "Out Of Bounds"
    end

    it "applies to numeric columns when the value is above the minimum" do
-     row, errors = described_class.new(:dimension_foo, int_col).
-       call(:int => 101)
-
-     errors.first[:error].should == "Out Of Bounds"
+     row = int_transformation.process_row(:int => 101)
+     row[:_errors].first[:error].should == "Out Of Bounds"
    end

    it "applies to string columns when the number of chars is below minimum" do
-     row, errors = described_class.new(:dimension_foo, str_col).
-       call(:str => "a")
-
-     errors.first[:error].should == "Out Of Bounds"
+     row = str_transformation.process_row(:str => "a")
+     row[:_errors].first[:error].should == "Out Of Bounds"
    end

    it "applies to string columns when the number of chars is above maximum" do
-     row, errors = described_class.new(:dimension_foo, str_col).
-       call(:str => "abcdef")
-
-     errors.first[:error].should == "Out Of Bounds"
+     row = str_transformation.process_row(:str => "abcdef")
+     row[:_errors].first[:error].should == "Out Of Bounds"
    end

    it "does not apply to string values in range" do
-     row, errors = described_class.new(:dimension_foo, str_col).
-       call(:str => "abcde")
-
-     errors.should be_empty
+     str_transformation.process_row(:str => "abcde").
+       should_not have_key(:_errors)
    end

    it "does not apply to numeric values in range" do
-     row, errors = described_class.new(:dimension_foo, int_col).
-       call(:int => 0)
-
-     errors.should be_empty
+     int_transformation.process_row(:int => 0).should_not have_key(:_errors)
    end

    it "has severity 2" do
-     described_class.new(:dimension_foo, int_col).severity.should == 2
+     int_transformation.severity.should == 2
    end

    it "does not replace values with default" do
-     row, errors = described_class.new(:dimension_foo, str_col).
-       call(:str => "a")
-
-     row.should == {:str => "a"}
+     str_transformation.process_row(:str => "a")[:str].should == "a"
    end
  end
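All three screen specs change in the same way, which documents the new screen interface: a screen is now constructed with a :column option rather than a table name and column, and #process_row replaces #call, returning the row itself with any problems embedded under :_errors instead of a separate errors array. A hedged before/after sketch using the columns defined in these specs:

screen = Chicago::ETL::Screens::MissingValue.new(:column => string_col)

# 0.0.13: row, errors = Chicago::ETL::Screens::MissingValue.new(:dimension_foo, string_col).call({})
# 0.1.0: errors ride along with the row
row = screen.process_row({})
row[:_errors].first[:error]     # => "Missing Value"
row[:_errors].first[:severity]  # => 2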
data/spec/etl/transformations_spec.rb
@@ -0,0 +1,109 @@
+ require 'spec_helper'
+
+ describe Chicago::ETL::Transformations::DemultiplexErrors do
+   it "declares it adds things to the error stream" do
+     subject.output_streams.should include(:error)
+   end
+
+   it "does nothing to a row without an :_errors key" do
+     subject.process({}).should == [{}]
+   end
+
+   it "removes the :_errors key from the row" do
+     subject.process(:_errors => [{:error => 1}]).first.should == {}
+   end
+
+   it "adds the errors onto the error stream" do
+     subject.process(:_errors => [{:error => 1}]).last.should == {
+       :error => 1,
+       Chicago::Flow::STREAM => :error
+     }
+   end
+ end
+
+ describe Chicago::ETL::Transformations::WrittenRowFilter do
+   it "only lets the first row through" do
+     filter = described_class.new(:key => :id)
+     filter.process(:id => 1).should == {:id => 1}
+     filter.process(:id => 2).should == {:id => 2}
+     filter.process(:id => 1).should be_nil
+   end
+
+   it "requires a key option" do
+     described_class.required_options.should include(:key)
+   end
+ end
+
+ describe Chicago::ETL::Transformations::AddKey do
+   let(:key_builder) { stub(:key_builder, :key => 42) }
+   let(:transform) { described_class.new(:key_builder => key_builder) }
+
+   it "requires a key builder" do
+     described_class.required_options.should include(:key_builder)
+   end
+
+   it "adds the key to the row" do
+     transform.process({}).should == {:id => 42}
+   end
+
+   it "adds the key to any rows in an embedded :_errors key" do
+     transform.process({:_errors => [{}]}).
+       should == {:id => 42, :_errors => [{:row_id => 42}]}
+   end
+
+   it "should declare that it adds the :id field" do
+     transform.added_fields.should == [:id]
+   end
+
+   it "should declare that it writes to the dimension_key stream" do
+     transform.output_streams.should include(:dimension_key)
+   end
+
+   it "should return a new row on the dimension_key stream" do
+     key_builder.stub(:key => [42, {:original_id => 42}])
+     transform.process({}).last.should == {:_stream => :dimension_key, :original_id => 42}
+   end
+ end
+
+ describe Chicago::ETL::Transformations::DimensionKeyMapping do
+   let(:transform) {
+     described_class.new(:original_key => :original_id,
+                         :key_table => :keys_foo)
+   }
+
+   it "should require an original_key and a key table" do
+     described_class.required_options.should == [:original_key, :key_table]
+   end
+
+   it "removes the key from the stream" do
+     transform.process({:original_id => 1}).first.should == {}
+     transform.removed_fields.should == [:original_id]
+   end
+
+   it "links the original key with the id on the stream" do
+     transform.process({:original_id => 1, :id => 2}).last.
+       should == {:_stream => :keys_foo, :original_id => 1, :dimension_id => 2}
+   end
+ end
+
+ describe Chicago::ETL::Transformations::HashColumns do
+   it "requires a columns option" do
+     described_class.required_options.should include(:columns)
+   end
+
+   it "adds a hash field to the row" do
+     Digest::MD5.stub(:hexdigest).with("ab").and_return("a")
+
+     transform = described_class.new(:columns => [:a, :b])
+     transform.added_fields.should == [:hash]
+     transform.process(:a => 'a', :b => 'b')[:hash].should == "A"
+   end
+
+   it "can add the hash to an arbitrary output field" do
+     Digest::MD5.stub(:hexdigest).with("ab").and_return("a")
+     transform = described_class.new(:columns => [:a, :b],
+                                     :output_field => :foo)
+     transform.added_fields.should == [:foo]
+     transform.process(:a => 'a', :b => 'b')[:foo].should == "A"
+   end
+ end
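These transformations are the pieces the load pipeline is assembled from: AddKey attaches the surrogate key and emits a row on the :dimension_key stream, DimensionKeyMapping routes original-key/dimension-key pairs to a named key-table stream, WrittenRowFilter drops rows whose key has already been written, HashColumns adds an upcased MD5 over selected columns, and DemultiplexErrors moves embedded :_errors onto the :error stream. Two of them in isolation, as a sketch grounded in the expectations above:

# WrittenRowFilter: only the first row for a given key passes through.
filter = Chicago::ETL::Transformations::WrittenRowFilter.new(:key => :id)
filter.process(:id => 1)  # => {:id => 1}
filter.process(:id => 1)  # => nil (already written)

# HashColumns: hashes the concatenated column values; :output_field defaults to :hash.
hasher = Chicago::ETL::Transformations::HashColumns.new(:columns => [:line1, :post_code])
hasher.process(:line1 => "some street", :post_code => "TW3 X45")[:hash]
# => upcased MD5 hex digest of "some streetTW3 X45", per the spec above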
data/spec/flow/array_sink_spec.rb
@@ -0,0 +1,26 @@
+ require 'spec_helper'
+
+ describe Chicago::Flow::ArraySink do
+   let(:sink) { described_class.new(:foo) }
+
+   it "has a name" do
+     sink.name.should == :foo
+   end
+
+   it "stores rows in #data" do
+     sink << {:a => 1}
+     sink.data.should == [{:a => 1}]
+   end
+
+   it "merges constant values into the sink row" do
+     sink.set_constant_values(:number => 1).should == sink
+     sink << {:a => 1}
+     sink.data.should == [{:a => 1, :number => 1}]
+   end
+
+   it "can be truncated" do
+     sink << {:a => 1}
+     sink.truncate
+     sink.data.should be_empty
+   end
+ end
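ArraySink is the in-memory sink the other specs lean on in place of a database sink: it keeps rows in #data, merges in any constant values, and supports truncate. A brief sketch (field names are illustrative):

sink = Chicago::Flow::ArraySink.new(:users)
sink.set_constant_values(:_inserted_at => Time.now)
sink << {:name => "Alice"}
sink.data      # => [{:name => "Alice", :_inserted_at => <the time set above>}]
sink.truncate
sink.data      # => []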