chicago-etl 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71)
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
@@ -1,60 +0,0 @@ data/spec/etl/sequel/load_data_infile_expression_spec.rb
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::SequelExtensions::LoadDataInfileExpression do
4
- it "loads the data in the file into the table" do
5
- described_class.new("bar.csv", :foo, ['bar', 'quux']).
6
- to_sql(TEST_DB).should include("LOAD DATA INFILE 'bar.csv' INTO TABLE `foo`")
7
- end
8
-
9
- it "loads the data with replacment" do
10
- described_class.new("bar.csv", :foo, ['bar', 'quux'],
11
- :update => :replace).
12
- to_sql(TEST_DB).should include("REPLACE INTO TABLE")
13
- end
14
-
15
- it "loads the data ignoring rows" do
16
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :update => :ignore).
17
- to_sql(TEST_DB).should include("IGNORE INTO TABLE")
18
- end
19
-
20
- it "should be in UTF-8 character set by default" do
21
- described_class.new("bar.csv", :foo, ['bar', 'quux']).
22
- to_sql(TEST_DB).should include("CHARACTER SET 'utf8'")
23
- end
24
-
25
- it "may be in other character sets" do
26
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :character_set => "ascii").
27
- to_sql(TEST_DB).should include("CHARACTER SET 'ascii'")
28
- end
29
-
30
- it "should load columns" do
31
- described_class.new("bar.csv", :foo, ['bar', 'quux']).
32
- to_sql(TEST_DB).should include("(`bar`,`quux`)")
33
- end
34
-
35
- it "should load into variables if column begins with @" do
36
- described_class.new("bar.csv", :foo, ['@bar', 'quux']).
37
- to_sql(TEST_DB).should include("(@bar,`quux`)")
38
- end
39
-
40
- it "can ignore lines" do
41
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :ignore => 2).
42
- to_sql(TEST_DB).should include("IGNORE 2 LINES")
43
- end
44
-
45
- it "can be in csv format" do
46
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :format => :csv).
47
- to_sql(TEST_DB).should include("FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\"'")
48
- end
49
-
50
- it "can set column values" do
51
- sql = described_class.new("bar.csv", :foo, ['@bar', 'quux'],
52
- :set => {:bar => :unhex.sql_function("@bar".lit),
53
- :etl_batch_id => 3}).
54
- to_sql(TEST_DB)
55
-
56
- sql.should include("SET")
57
- sql.should include("`etl_batch_id` = 3")
58
- sql.should include("`bar` = unhex(@bar)")
59
- end
60
- end
@@ -1,37 +0,0 @@ data/spec/etl/sequel/load_data_infile_spec.rb
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::SequelExtensions::LoadDataInfile do
4
- before :each do
5
- @sql = TEST_DB[:foo].load_csv_infile_sql("bar.csv", [:bar, :baz])
6
- end
7
-
8
- it "loads the data in the file" do
9
- @sql.should include("LOAD DATA INFILE 'bar.csv'")
10
- end
11
-
12
- it "replaces rows currently in the table" do
13
- @sql.should include("REPLACE INTO TABLE `foo`")
14
- end
15
-
16
- it "should be in the UTF 8 character set" do
17
- @sql.should include("CHARACTER SET 'utf8'")
18
- end
19
-
20
- it "should escape with the \" character" do
21
- @sql.should include("ESCAPED BY '\"'")
22
- end
23
-
24
- it "supports standard csv, with optional quoting" do
25
- @sql.should include("FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
26
- end
27
-
28
- it "loads into the columns specified" do
29
- @sql.should include("(`bar`,`baz`)")
30
- end
31
-
32
- it "can ignore instead of replacing rows" do
33
- @sql = TEST_DB[:foo].insert_ignore.
34
- load_csv_infile_sql("bar.csv", [:bar, :baz])
35
- @sql.should include("IGNORE INTO TABLE `foo`")
36
- end
37
- end
@@ -1,7 +0,0 @@ data/spec/etl/sink_spec.rb
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::Sink do
4
- it "writes rows once if given a key" do
5
-
6
- end
7
- end
@@ -1,9 +0,0 @@ data/spec/etl/transformations/add_insert_timestamp_spec.rb
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::Transformations::AddInsertTimestamp do
4
- it "adds a timestamp in UTC in the _inserted_at field" do
5
- time = subject.call({}).first[:_inserted_at]
6
- time.should be_kind_of(Time)
7
- time.zone.should == "UTC"
8
- end
9
- end