chicago-etl 0.0.13 → 0.1.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
@@ -1,60 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::SequelExtensions::LoadDataInfileExpression do
4
- it "loads the data in the file into the table" do
5
- described_class.new("bar.csv", :foo, ['bar', 'quux']).
6
- to_sql(TEST_DB).should include("LOAD DATA INFILE 'bar.csv' INTO TABLE `foo`")
7
- end
8
-
9
- it "loads the data with replacment" do
10
- described_class.new("bar.csv", :foo, ['bar', 'quux'],
11
- :update => :replace).
12
- to_sql(TEST_DB).should include("REPLACE INTO TABLE")
13
- end
14
-
15
- it "loads the data ignoring rows" do
16
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :update => :ignore).
17
- to_sql(TEST_DB).should include("IGNORE INTO TABLE")
18
- end
19
-
20
- it "should be in UTF-8 character set by default" do
21
- described_class.new("bar.csv", :foo, ['bar', 'quux']).
22
- to_sql(TEST_DB).should include("CHARACTER SET 'utf8'")
23
- end
24
-
25
- it "may be in other character sets" do
26
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :character_set => "ascii").
27
- to_sql(TEST_DB).should include("CHARACTER SET 'ascii'")
28
- end
29
-
30
- it "should load columns" do
31
- described_class.new("bar.csv", :foo, ['bar', 'quux']).
32
- to_sql(TEST_DB).should include("(`bar`,`quux`)")
33
- end
34
-
35
- it "should load into variables if column begins with @" do
36
- described_class.new("bar.csv", :foo, ['@bar', 'quux']).
37
- to_sql(TEST_DB).should include("(@bar,`quux`)")
38
- end
39
-
40
- it "can ignore lines" do
41
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :ignore => 2).
42
- to_sql(TEST_DB).should include("IGNORE 2 LINES")
43
- end
44
-
45
- it "can be in csv format" do
46
- described_class.new("bar.csv", :foo, ['bar', 'quux'], :format => :csv).
47
- to_sql(TEST_DB).should include("FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\"'")
48
- end
49
-
50
- it "can set column values" do
51
- sql = described_class.new("bar.csv", :foo, ['@bar', 'quux'],
52
- :set => {:bar => :unhex.sql_function("@bar".lit),
53
- :etl_batch_id => 3}).
54
- to_sql(TEST_DB)
55
-
56
- sql.should include("SET")
57
- sql.should include("`etl_batch_id` = 3")
58
- sql.should include("`bar` = unhex(@bar)")
59
- end
60
- end
@@ -1,37 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::SequelExtensions::LoadDataInfile do
4
- before :each do
5
- @sql = TEST_DB[:foo].load_csv_infile_sql("bar.csv", [:bar, :baz])
6
- end
7
-
8
- it "loads the data in the file" do
9
- @sql.should include("LOAD DATA INFILE 'bar.csv'")
10
- end
11
-
12
- it "replaces rows currently in the table" do
13
- @sql.should include("REPLACE INTO TABLE `foo`")
14
- end
15
-
16
- it "should be in the UTF 8 character set" do
17
- @sql.should include("CHARACTER SET 'utf8'")
18
- end
19
-
20
- it "should escape with the \" character" do
21
- @sql.should include("ESCAPED BY '\"'")
22
- end
23
-
24
- it "supports standard csv, with optional quoting" do
25
- @sql.should include("FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'")
26
- end
27
-
28
- it "loads into the columns specified" do
29
- @sql.should include("(`bar`,`baz`)")
30
- end
31
-
32
- it "can ignore instead of replacing rows" do
33
- @sql = TEST_DB[:foo].insert_ignore.
34
- load_csv_infile_sql("bar.csv", [:bar, :baz])
35
- @sql.should include("IGNORE INTO TABLE `foo`")
36
- end
37
- end
@@ -1,7 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::Sink do
4
- it "writes rows once if given a key" do
5
-
6
- end
7
- end
@@ -1,9 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Chicago::ETL::Transformations::AddInsertTimestamp do
4
- it "adds a timestamp in UTC in the _inserted_at field" do
5
- time = subject.call({}).first[:_inserted_at]
6
- time.should be_kind_of(Time)
7
- time.zone.should == "UTC"
8
- end
9
- end