chicago-etl 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55):
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +16 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +21 -0
  6. data/Rakefile +42 -0
  7. data/VERSION +1 -0
  8. data/chicago-etl.gemspec +117 -0
  9. data/lib/chicago/etl/batch.rb +110 -0
  10. data/lib/chicago/etl/buffering_insert_writer.rb +36 -0
  11. data/lib/chicago/etl/counter.rb +36 -0
  12. data/lib/chicago/etl/key_builder.rb +198 -0
  13. data/lib/chicago/etl/load_dataset_builder.rb +75 -0
  14. data/lib/chicago/etl/mysql_dumpfile.rb +32 -0
  15. data/lib/chicago/etl/mysql_load_file_value_transformer.rb +24 -0
  16. data/lib/chicago/etl/screens/column_screen.rb +59 -0
  17. data/lib/chicago/etl/screens/composite_screen.rb +17 -0
  18. data/lib/chicago/etl/screens/invalid_element.rb +27 -0
  19. data/lib/chicago/etl/screens/missing_value.rb +22 -0
  20. data/lib/chicago/etl/screens/out_of_bounds.rb +33 -0
  21. data/lib/chicago/etl/sequel/dependant_tables.rb +48 -0
  22. data/lib/chicago/etl/sequel/filter_to_etl_batch.rb +53 -0
  23. data/lib/chicago/etl/sequel/load_data_infile.rb +19 -0
  24. data/lib/chicago/etl/sink.rb +61 -0
  25. data/lib/chicago/etl/table_builder.rb +45 -0
  26. data/lib/chicago/etl/task_invocation.rb +32 -0
  27. data/lib/chicago/etl/tasks.rb +34 -0
  28. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +16 -0
  29. data/lib/chicago/etl/transformations/uk_post_code.rb +40 -0
  30. data/lib/chicago/etl/transformations/uk_post_code_field.rb +59 -0
  31. data/lib/chicago/etl.rb +35 -0
  32. data/lib/chicago-etl.rb +0 -0
  33. data/spec/db_connections.yml.dist +4 -0
  34. data/spec/etl/batch_spec.rb +86 -0
  35. data/spec/etl/counter_spec.rb +44 -0
  36. data/spec/etl/etl_batch_id_dataset_filter.rb +29 -0
  37. data/spec/etl/key_builder_spec.rb +190 -0
  38. data/spec/etl/load_dataset_builder_spec.rb +86 -0
  39. data/spec/etl/mysql_dumpfile_spec.rb +42 -0
  40. data/spec/etl/mysql_load_file_value_transformer_spec.rb +27 -0
  41. data/spec/etl/screens/composite_screen_spec.rb +25 -0
  42. data/spec/etl/screens/invalid_element_spec.rb +27 -0
  43. data/spec/etl/screens/missing_value_spec.rb +58 -0
  44. data/spec/etl/screens/out_of_bounds_spec.rb +64 -0
  45. data/spec/etl/sequel/dependant_tables_spec.rb +41 -0
  46. data/spec/etl/sequel/filter_to_etl_batch_spec.rb +54 -0
  47. data/spec/etl/sequel/load_data_infile_spec.rb +37 -0
  48. data/spec/etl/sink_spec.rb +7 -0
  49. data/spec/etl/table_builder_spec.rb +22 -0
  50. data/spec/etl/task_spec.rb +87 -0
  51. data/spec/etl/transformations/add_insert_timestamp_spec.rb +9 -0
  52. data/spec/etl/transformations/uk_post_code_field_spec.rb +95 -0
  53. data/spec/etl/transformations/uk_post_code_spec.rb +102 -0
  54. data/spec/spec_helper.rb +20 -0
  55. metadata +245 -0
@@ -0,0 +1,86 @@
require 'spec_helper'

# Specs for Chicago::ETL::LoadDatasetBuilder.
#
# The builder is configured with a base table, optional denormalized
# (joined) tables and optional column providers; `build(db, columns)` is
# then inspected through the resulting dataset's `opts` or generated SQL.
describe Chicago::ETL::LoadDatasetBuilder do
  let(:db) { stub(:database).as_null_object }

  before :each do
    # Route table lookups on the stub database to real TEST_DB datasets,
    # then pin the column lists those datasets report.
    db.stub(:[]).with(:original_users).
      and_return(TEST_DB[:original_users])
    db.stub(:[]).with(:original_preferences).
      and_return(TEST_DB[:original_preferences])
    db[:original_users].stub(:columns).
      and_return([:id, :name, :email])
    db[:original_preferences].stub(:columns).
      and_return([:id, :spam])
  end

  it "selects from the specified table" do
    subject.table(:original_users)
    subject.build(db, [:name]).opts[:from].should == [:original_users]
  end

  it "selects the columns from the table" do
    subject.configure { table(:original_users) }

    subject.build(db, [:id, :name]).opts[:select].should == [:id.qualify(:original_users), :name.qualify(:original_users)]
  end

  it "can handle column renaming" do
    subject.configure do
      table :original_users
      provide :original_id, :id
    end

    subject.build(db, [:original_id, :name]).opts[:select].
      should == [:id.qualify(:original_users).as(:original_id), :name.qualify(:original_users)]
  end

  it "can provide constructed columns" do
    subject.configure do
      table :original_users
      provide :original_id, :foo.qualify(:bar)
    end

    subject.build(db, [:original_id, :name]).opts[:select].
      should == [:foo.qualify(:bar).as(:original_id), :name.qualify(:original_users)]
  end

  it "left outer joins a denormalized table" do
    subject.configure do
      table :original_users
      denormalize :original_preferences, :id => :id
    end

    subject.build(db, [:id, :name]).sql.should =~ /LEFT OUTER JOIN `original_preferences` ON \(`original_preferences`.`id` = `original_users`.`id`\)/
  end

  it "takes columns from the appropriate tables where possible" do
    subject.configure do
      table :original_users
      denormalize :original_preferences, :id => :id
    end

    subject.build(db, [:id, :name, :spam]).opts[:select].
      should == [:id.qualify(:original_users),
                 :name.qualify(:original_users),
                 :spam.qualify(:original_preferences)]
  end

  it "takes renamed columns from denormalized tables" do
    subject.configure do
      table :original_users
      denormalize :original_preferences, :id => :id
      provide :email_allowed, :spam
    end

    subject.build(db, [:id, :name, :email_allowed]).opts[:select].
      should include(:spam.qualify(:original_preferences).as(:email_allowed))
  end

  # No expectation has been written for this behaviour yet. Declaring the
  # example without a block makes RSpec report it as pending instead of
  # letting an assertion-free body pass vacuously. The setup the example
  # previously performed was:
  #
  #   table :original_users
  #   denormalize :original_preferences, :id => :id
  it "automatically renames ids of denormalized tables"
end
@@ -0,0 +1,42 @@
require 'spec_helper'

# Specs for Chicago::ETL::MysqlDumpfile: rows (hashes) pushed with `<<`
# are forwarded to the wrapped CSV object as arrays of column values.
describe Chicago::ETL::MysqlDumpfile do
  # Mock standing in for the CSV writer the dumpfile wraps.
  let(:csv) { mock(:csv) }

  it "outputs specified column values in order" do
    writer = described_class.new(csv, [:foo, :bar])
    csv.should_receive(:<<).with(["1", "2"])

    row = {:foo => "1", :bar => "2", :baz => "not output"}
    writer << row
  end

  it "transforms values with a MysqlLoadFileValueTransformer" do
    # The transformer class must be stubbed before the dumpfile is built.
    transformer = mock(:transformer)
    Chicago::ETL::MysqlLoadFileValueTransformer.stub(:new).and_return(transformer)

    transformer.should_receive(:transform).with("bar").and_return("baz")
    csv.should_receive(:<<).with(["baz"])

    writer = described_class.new(csv, [:foo])
    writer << {:foo => "bar"}
  end

  it "will write a row only once with the same key" do
    writer = described_class.new(csv, [:foo], :id)
    csv.should_receive(:<<).with(["bar"])

    # Both rows share :id => 1; only the first is expected to be written.
    ["bar", "baz"].each do |value|
      writer << {:id => 1, :foo => value}
    end
  end

  it "will write a row multiple times if no key is specified" do
    writer = described_class.new(csv, [:foo])
    csv.should_receive(:<<).with(["bar"])
    csv.should_receive(:<<).with(["baz"])

    ["bar", "baz"].each do |value|
      writer << {:id => 1, :foo => value}
    end
  end
end
@@ -0,0 +1,27 @@
require 'spec_helper'

# Specs for Chicago::ETL::MysqlLoadFileValueTransformer#transform, which
# maps Ruby values to the string forms expected below.
describe Chicago::ETL::MysqlLoadFileValueTransformer do
  it "transforms nil into \\N" do
    result = subject.transform(nil)
    result.should == "\\N"
  end

  it "transforms true into '1'" do
    result = subject.transform(true)
    result.should == "1"
  end

  it "transforms false into '0'" do
    result = subject.transform(false)
    result.should == "0"
  end

  it "transforms times into mysql time format" do
    time = Time.local(2011, 1, 2, 10, 30, 50)
    subject.transform(time).should == "2011-01-02 10:30:50"
  end

  it "transforms datetimes into mysql time format" do
    datetime = DateTime.new(2011, 1, 2, 10, 30, 50)
    subject.transform(datetime).should == "2011-01-02 10:30:50"
  end

  it "transforms dates into mysql date format" do
    date = Date.new(2011, 1, 2)
    subject.transform(date).should == "2011-01-02"
  end
end
@@ -0,0 +1,25 @@
require 'spec_helper'

# Specs for Chicago::ETL::Screens::CompositeScreen, which is built from
# child screens and invoked with a row and an errors collection.
describe Chicago::ETL::Screens::CompositeScreen do
  # A counting child screen: each invocation appends the running call
  # count to the errors array. `let` memoizes it, so both references in
  # an example are the same lambda sharing one counter.
  let(:screen) do
    invocations = 0

    lambda do |row, errors|
      invocations += 1
      errors << invocations
      [row, errors]
    end
  end

  it "calls all child screens" do
    composite = described_class.new([screen, screen])
    row, errors = composite.call({:a => 1}, [])

    row.should == {:a => 1}
    errors.should == [1,2]
  end

  it "supports variable arguments in the constructor" do
    composite = described_class.new(screen, screen)
    row, errors = composite.call({:a => 1}, [])

    row.should == {:a => 1}
    errors.should == [1,2]
  end
end
@@ -0,0 +1,27 @@
require 'spec_helper'

# Specs for Chicago::ETL::Screens::InvalidElement, exercised against an
# optional string column restricted to a fixed list of elements.
describe Chicago::ETL::Screens::InvalidElement do
  let(:enum_col) do
    Chicago::Schema::Column.new(:enum, :string, :elements => ["Foo", "Unknown"], :default => "Unknown", :optional => true)
  end

  # Screen under test, bound to the enum column of :dimension_foo.
  let(:element_screen) { described_class.new(:dimension_foo, enum_col) }

  it "has a severity of 3" do
    element_screen.severity.should == 3
  end

  it "reports invalid element for enum columns" do
    row, errors = element_screen.call({:enum => "Bar"})

    # The rejected value is replaced by the column default.
    row.should == {:enum => 'Unknown'}
    errors.first[:error].should == "Invalid Element"
  end

  # NOTE(review): the value here is lowercase "foo" while the column
  # declares "Foo" — presumably element matching is case-insensitive;
  # confirm against the screen implementation.
  it "does not report a valid element" do
    row, errors = element_screen.call({:enum => "foo"})

    row.should == {:enum => 'foo'}
    errors.should be_empty
  end
end
@@ -0,0 +1,58 @@
require 'spec_helper'

# Specs for Chicago::ETL::Screens::MissingValue across column types and
# the :descriptive / :optional column flags.
describe Chicago::ETL::Screens::MissingValue do
  let(:string_col) { Chicago::Schema::Column.new(:str, :string) }
  let(:int_col)    { Chicago::Schema::Column.new(:int, :integer) }
  let(:bool_col)   { Chicago::Schema::Column.new(:bool, :boolean) }

  let(:descriptive_col) do
    Chicago::Schema::Column.new(:str, :string, :descriptive => true)
  end

  let(:optional_col) do
    Chicago::Schema::Column.new(:str, :string, :optional => true)
  end

  # Builds the screen under test for the given column of :dimension_foo.
  def missing_value_screen(column)
    described_class.new(:dimension_foo, column)
  end

  it "reports nil in an expected column as a missing value, with severity 2" do
    _, errors = missing_value_screen(string_col).call({})
    error = errors.first

    error[:table].should == "dimension_foo"
    error[:column].should == "str"
    error[:error].should == "Missing Value"
    error[:severity].should == 2
  end

  it "reports an empty string value in an expected column as a missing value" do
    # The value used is a single space, not a zero-length string.
    _, errors = missing_value_screen(string_col).call({:str => " "})

    errors.first[:error].should == "Missing Value"
  end

  it "does not report 0 as a missing value" do
    _, errors = missing_value_screen(int_col).call({:int => 0})

    errors.should be_empty
  end

  it "reports missing values with severity 1 if the column is descriptive" do
    _, errors = missing_value_screen(descriptive_col).call({})

    errors.first[:severity].should == 1
  end

  it "does not report boolean values as missing" do
    _, errors = missing_value_screen(bool_col).call({})

    errors.should be_empty
  end

  it "does not report optional columns as missing values" do
    _, errors = missing_value_screen(optional_col).call({})

    errors.should be_empty
  end

  it "fills in a default value for missing values" do
    row, _ = missing_value_screen(optional_col).call({})

    row.should == {:str => ''}
  end
end
@@ -0,0 +1,64 @@
require 'spec_helper'

# Specs for Chicago::ETL::Screens::OutOfBounds.
#
# The screen is built for a table name and a column and invoked with a
# row hash; it returns the row together with a list of errors.
describe Chicago::ETL::Screens::OutOfBounds do
  # Integer column with :min => 0 and :max => 100.
  let(:int_col) {
    Chicago::Schema::Column.new(:int, :integer, :min => 0, :max => 100)
  }

  # String column with :min => 2 and :max => 5; the examples below
  # exercise these bounds against the number of characters.
  let(:str_col) {
    Chicago::Schema::Column.new(:str, :string, :min => 2, :max => 5)
  }

  it "applies to numeric columns when the value is lower than the minimum" do
    row, errors = described_class.new(:dimension_foo, int_col).
      call(:int => -1)

    errors.first[:error].should == "Out Of Bounds"
  end

  # Description corrected: 101 exceeds the column's maximum (100); it was
  # previously labelled as being "above the minimum".
  it "applies to numeric columns when the value is above the maximum" do
    row, errors = described_class.new(:dimension_foo, int_col).
      call(:int => 101)

    errors.first[:error].should == "Out Of Bounds"
  end

  it "applies to string columns when the number of chars is below minimum" do
    row, errors = described_class.new(:dimension_foo, str_col).
      call(:str => "a")

    errors.first[:error].should == "Out Of Bounds"
  end

  it "applies to string columns when the number of chars is above maximum" do
    row, errors = described_class.new(:dimension_foo, str_col).
      call(:str => "abcdef")

    errors.first[:error].should == "Out Of Bounds"
  end

  it "does not apply to string values in range" do
    # Five characters sits exactly on the :max bound.
    row, errors = described_class.new(:dimension_foo, str_col).
      call(:str => "abcde")

    errors.should be_empty
  end

  it "does not apply to numeric values in range" do
    # Zero sits exactly on the :min bound.
    row, errors = described_class.new(:dimension_foo, int_col).
      call(:int => 0)

    errors.should be_empty
  end

  it "has severity 2" do
    described_class.new(:dimension_foo, int_col).severity.should == 2
  end

  it "does not replace values with default" do
    row, errors = described_class.new(:dimension_foo, str_col).
      call(:str => "a")

    row.should == {:str => "a"}
  end
end
@@ -0,0 +1,41 @@
require 'spec_helper'

# Specs for the `dependant_tables` dataset extension, checked against
# datasets built from TEST_DB.
describe Chicago::ETL::SequelExtensions::DependantTables do
  it "returns the table in the from clause" do
    dataset = TEST_DB[:foo]

    dataset.dependant_tables.should == [:foo]
  end

  it "returns tables from join clauses" do
    dataset = TEST_DB[:foo].join(:bar).join(:baz)

    dataset.dependant_tables.should == [:foo, :bar, :baz]
  end

  it "returns unique real tables from join clauses when aliased" do
    dataset = TEST_DB[:foo].join(:bar).join(:bar.as(:baz))

    dataset.dependant_tables.should == [:foo, :bar]
  end

  it "returns real tables from 'from' clauses when aliased" do
    dataset = TEST_DB[:foo.as(:bar)].join(:bar).join(:bar.as(:baz))

    dataset.dependant_tables.should == [:foo, :bar]
  end

  it "returns tables from nested datasets in the from clause" do
    nested = TEST_DB[:foo].as(:bar)

    TEST_DB[nested].dependant_tables.should == [:foo]
  end

  it "returns tables from nested datasets in the join clause" do
    nested = TEST_DB[:bar].as(:baz)

    TEST_DB[:foo].join(nested).dependant_tables.should == [:foo, :bar]
  end

  it "handles unioned datasets" do
    dataset = TEST_DB[:foo].union(TEST_DB[:bar]).union(TEST_DB[:baz])

    dataset.dependant_tables.should == [:foo, :bar, :baz]
  end

  it "handles unioned datasets where from_self is false" do
    dataset = TEST_DB[:foo].union(TEST_DB[:bar], :from_self => false)

    dataset.dependant_tables.should == [:foo, :bar]
  end
end
@@ -0,0 +1,54 @@
require 'spec_helper'

# Specs for the `filter_to_etl_batch` dataset extension. Each example
# fixes what TEST_DB.schema reports for the tables involved and then
# inspects the filtered dataset or its SQL.
describe Chicago::ETL::SequelExtensions::FilterToEtlBatch do
  # Stand-in batch object exposing only an id.
  let(:batch) { stub(:batch, :id => 42) }

  it "should do nothing to a table without an etl_batch_id column" do
    TEST_DB.should_receive(:schema).with(:foo).and_return([])

    filtered = TEST_DB[:foo].filter_to_etl_batch(batch)
    filtered.should == TEST_DB[:foo]
  end

  it "filters a table with an ETL batch id column" do
    TEST_DB.should_receive(:schema).with(:foo).and_return([[:etl_batch_id, {}]])

    sql = TEST_DB[:foo].filter_to_etl_batch(batch).sql
    sql.should include("\(`foo`.`etl_batch_id` = 42\)")
  end

  it "filters an aliased table with an ETL batch id column" do
    TEST_DB.should_receive(:schema).with(:foo).and_return([[:etl_batch_id, {}]])

    sql = TEST_DB[:foo.as(:bar)].filter_to_etl_batch(batch).sql
    sql.should include("\(`bar`.`etl_batch_id` = 42\)")
  end

  it "doesn't attempt to look for etl columns in nested queries" do
    nested = TEST_DB[:foo].as(:bar)

    sql = TEST_DB[nested].filter_to_etl_batch(batch).sql
    sql.should_not include("`bar`.`etl_batch_id` = 42")
  end

  it "filters based on joins" do
    TEST_DB.should_receive(:schema).with(:baz).and_return([[:etl_batch_id, {}]])
    TEST_DB.should_receive(:schema).with(:bar).and_return([])
    TEST_DB.should_receive(:schema).with(:foo).and_return([])

    joined = TEST_DB[:foo].join_table(:left_outer, :bar, :id => :id).join(:baz)
    sql = joined.filter_to_etl_batch(batch).sql
    sql.should include("\(`baz`.`etl_batch_id` = 42\)")
  end

  it "filters based on joined aliases" do
    TEST_DB.should_receive(:schema).with(:bar).and_return([[:etl_batch_id, {}]])
    TEST_DB.should_receive(:schema).with(:foo).and_return([])

    sql = TEST_DB[:foo].join(:bar.as(:baz)).filter_to_etl_batch(batch).sql
    sql.should include("\(`baz`.`etl_batch_id` = 42\)")
  end

  it "applies filters to each unioned dataset" do
    TEST_DB.should_receive(:schema).with(:bar).and_return([[:etl_batch_id, {}]])
    TEST_DB.should_receive(:schema).with(:foo).and_return([[:etl_batch_id, {}]])

    unioned = TEST_DB[:foo].union(TEST_DB[:bar], :from_self => false)
    sql = unioned.filter_to_etl_batch(batch).sql

    sql.should include("\(`foo`.`etl_batch_id` = 42\)")
    sql.should include("\(`bar`.`etl_batch_id` = 42\)")
  end
end
@@ -0,0 +1,37 @@
require 'spec_helper'

# Specs for the `load_csv_infile_sql` dataset extension: each example
# checks one fragment of the generated LOAD DATA INFILE statement.
describe Chicago::ETL::SequelExtensions::LoadDataInfile do
  before(:each) do
    # Statement generated for the default (replace) case.
    @load_sql = TEST_DB[:foo].load_csv_infile_sql("bar.csv", [:bar, :baz])
  end

  it "loads the data in the file" do
    @load_sql.should include("LOAD DATA INFILE 'bar.csv'")
  end

  it "replaces rows currently in the table" do
    @load_sql.should include("REPLACE INTO TABLE `foo`")
  end

  it "should be in the UTF 8 character set" do
    @load_sql.should include("CHARACTER SET 'utf8'")
  end

  it "should escape with the \" character" do
    @load_sql.should include("ESCAPED BY '\"'")
  end

  it "supports standard csv, with optional quoting" do
    @load_sql.should include("FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
  end

  it "loads into the columns specified" do
    @load_sql.should include("(`bar`,`baz`);")
  end

  it "can ignore instead of replacing rows" do
    # Built from an insert_ignore dataset rather than the default one.
    ignoring_dataset = TEST_DB[:foo].insert_ignore
    ignore_sql = ignoring_dataset.load_csv_infile_sql("bar.csv", [:bar, :baz])

    ignore_sql.should include("IGNORE INTO TABLE `foo`")
  end
end
+ end