chicago-etl 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. data/Gemfile +8 -3
  2. data/README.rdoc +4 -1
  3. data/VERSION +1 -1
  4. data/chicago-etl.gemspec +59 -22
  5. data/chicago-flow.gemspec +92 -0
  6. data/lib/chicago/etl/batch.rb +9 -2
  7. data/lib/chicago/etl/core_extensions.rb +12 -0
  8. data/lib/chicago/etl/counter.rb +8 -1
  9. data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
  10. data/lib/chicago/etl/key_builder.rb +17 -39
  11. data/lib/chicago/etl/load_dataset_builder.rb +3 -1
  12. data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
  13. data/lib/chicago/etl/pipeline.rb +151 -0
  14. data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
  15. data/lib/chicago/etl/screens/column_screen.rb +26 -25
  16. data/lib/chicago/etl/screens/invalid_element.rb +5 -5
  17. data/lib/chicago/etl/screens/missing_value.rb +4 -2
  18. data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
  19. data/lib/chicago/etl/table_builder.rb +4 -2
  20. data/lib/chicago/etl/task_invocation.rb +0 -1
  21. data/lib/chicago/etl/transformations.rb +128 -0
  22. data/lib/chicago/etl.rb +39 -8
  23. data/lib/chicago/flow/array_sink.rb +35 -0
  24. data/lib/chicago/flow/array_source.rb +15 -0
  25. data/lib/chicago/flow/dataset_source.rb +23 -0
  26. data/lib/chicago/flow/errors.rb +14 -0
  27. data/lib/chicago/flow/filter.rb +15 -0
  28. data/lib/chicago/flow/mysql.rb +4 -0
  29. data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
  30. data/lib/chicago/flow/mysql_file_sink.rb +68 -0
  31. data/lib/chicago/flow/null_sink.rb +8 -0
  32. data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
  33. data/lib/chicago/flow/pipeline_stage.rb +68 -0
  34. data/lib/chicago/flow/sink.rb +53 -0
  35. data/lib/chicago/flow/transformation.rb +169 -0
  36. data/lib/chicago/flow/transformation_chain.rb +40 -0
  37. data/spec/etl/batch_spec.rb +2 -1
  38. data/spec/etl/core_extensions_spec.rb +13 -0
  39. data/spec/etl/dataset_batch_stage_spec.rb +55 -0
  40. data/spec/etl/key_builder_spec.rb +25 -83
  41. data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
  42. data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
  43. data/spec/etl/screens/invalid_element_spec.rb +10 -11
  44. data/spec/etl/screens/missing_value_spec.rb +21 -21
  45. data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
  46. data/spec/etl/transformations_spec.rb +109 -0
  47. data/spec/flow/array_sink_spec.rb +26 -0
  48. data/spec/flow/array_source_spec.rb +20 -0
  49. data/spec/flow/dataset_source_spec.rb +15 -0
  50. data/spec/flow/filter_spec.rb +13 -0
  51. data/spec/flow/mysql_file_serializer_spec.rb +27 -0
  52. data/spec/flow/mysql_file_sink_spec.rb +94 -0
  53. data/spec/flow/mysql_integration_spec.rb +72 -0
  54. data/spec/flow/pipeline_stage_spec.rb +89 -0
  55. data/spec/flow/transformation_chain_spec.rb +76 -0
  56. data/spec/flow/transformation_spec.rb +91 -0
  57. data/spec/spec_helper.rb +5 -0
  58. metadata +135 -39
  59. data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
  60. data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
  61. data/lib/chicago/etl/screens/composite_screen.rb +0 -17
  62. data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
  63. data/lib/chicago/etl/sink.rb +0 -61
  64. data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
  65. data/spec/etl/mysql_dumpfile_spec.rb +0 -42
  66. data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
  67. data/spec/etl/screens/composite_screen_spec.rb +0 -25
  68. data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
  69. data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
  70. data/spec/etl/sink_spec.rb +0 -7
  71. data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/Gemfile CHANGED
@@ -1,6 +1,10 @@
1
1
  source "http://rubygems.org"
2
2
 
3
3
  gem "chicagowarehouse", "~> 0.4"
4
+ gem "fastercsv", :platform => :ruby_18
5
+ gem "sequel"
6
+ gem "sequel_load_data_infile", ">= 0.0.2", :require => "sequel/load_data_infile"
7
+ gem "sequel_fast_columns", :require => "sequel/fast_columns"
4
8
 
5
9
  # Add dependencies to develop your gem here.
6
10
  # Include everything needed to run rake, tests, features, etc.
@@ -9,8 +13,9 @@ group :development do
9
13
  gem "timecop"
10
14
  gem "yard"
11
15
  gem "flog"
12
- gem "jeweler"
13
- gem "rcov", :platforms => :mri_18
14
- gem "simplecov", :platforms => :mri_19
16
+ gem "simplecov", :platforms => :mri_19, :require => false
15
17
  gem "ZenTest"
18
+ gem "mysql", "2.8.1"
19
+ gem "bundler", "~> 1"
20
+ gem "jeweler"
16
21
  end
data/README.rdoc CHANGED
@@ -14,8 +14,11 @@ An ETL pipeline for use with Chicago Warehouse.
14
14
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
15
15
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
16
16
 
17
+ == Authors
18
+
19
+ Roland Swingler (@knaveofdiamonds)
20
+
17
21
  == Copyright
18
22
 
19
23
  Copyright (c) 2012 notonthehighstreet.com. See LICENSE.txt for
20
24
  further details.
21
-
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.13
1
+ 0.1.0
data/chicago-etl.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "chicago-etl"
8
- s.version = "0.0.13"
8
+ s.version = "0.1.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Roland Swingler"]
12
- s.date = "2013-04-16"
12
+ s.date = "2013-09-05"
13
13
  s.description = "ETL tools for Chicago"
14
14
  s.email = "roland.swingler@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -25,52 +25,74 @@ Gem::Specification.new do |s|
25
25
  "Rakefile",
26
26
  "VERSION",
27
27
  "chicago-etl.gemspec",
28
+ "chicago-flow.gemspec",
28
29
  "lib/chicago-etl.rb",
29
30
  "lib/chicago/etl.rb",
30
31
  "lib/chicago/etl/batch.rb",
31
- "lib/chicago/etl/buffering_insert_writer.rb",
32
+ "lib/chicago/etl/core_extensions.rb",
32
33
  "lib/chicago/etl/counter.rb",
34
+ "lib/chicago/etl/dataset_batch_stage.rb",
33
35
  "lib/chicago/etl/key_builder.rb",
34
36
  "lib/chicago/etl/load_dataset_builder.rb",
35
- "lib/chicago/etl/mysql_dumpfile.rb",
36
- "lib/chicago/etl/mysql_load_file_value_transformer.rb",
37
+ "lib/chicago/etl/load_pipeline_stage_builder.rb",
38
+ "lib/chicago/etl/pipeline.rb",
39
+ "lib/chicago/etl/schema_table_sink_factory.rb",
37
40
  "lib/chicago/etl/screens/column_screen.rb",
38
- "lib/chicago/etl/screens/composite_screen.rb",
39
41
  "lib/chicago/etl/screens/invalid_element.rb",
40
42
  "lib/chicago/etl/screens/missing_value.rb",
41
43
  "lib/chicago/etl/screens/out_of_bounds.rb",
42
44
  "lib/chicago/etl/sequel/dependant_tables.rb",
43
45
  "lib/chicago/etl/sequel/filter_to_etl_batch.rb",
44
- "lib/chicago/etl/sequel/load_data_infile.rb",
45
- "lib/chicago/etl/sink.rb",
46
46
  "lib/chicago/etl/table_builder.rb",
47
47
  "lib/chicago/etl/task_invocation.rb",
48
48
  "lib/chicago/etl/tasks.rb",
49
- "lib/chicago/etl/transformations/add_insert_timestamp.rb",
49
+ "lib/chicago/etl/transformations.rb",
50
50
  "lib/chicago/etl/transformations/uk_post_code.rb",
51
51
  "lib/chicago/etl/transformations/uk_post_code_field.rb",
52
+ "lib/chicago/flow/array_sink.rb",
53
+ "lib/chicago/flow/array_source.rb",
54
+ "lib/chicago/flow/dataset_source.rb",
55
+ "lib/chicago/flow/errors.rb",
56
+ "lib/chicago/flow/filter.rb",
57
+ "lib/chicago/flow/mysql.rb",
58
+ "lib/chicago/flow/mysql_file_serializer.rb",
59
+ "lib/chicago/flow/mysql_file_sink.rb",
60
+ "lib/chicago/flow/null_sink.rb",
61
+ "lib/chicago/flow/pipeline_endpoint.rb",
62
+ "lib/chicago/flow/pipeline_stage.rb",
63
+ "lib/chicago/flow/sink.rb",
64
+ "lib/chicago/flow/transformation.rb",
65
+ "lib/chicago/flow/transformation_chain.rb",
52
66
  "spec/db_connections.yml.dist",
53
67
  "spec/etl/batch_spec.rb",
68
+ "spec/etl/core_extensions_spec.rb",
54
69
  "spec/etl/counter_spec.rb",
70
+ "spec/etl/dataset_batch_stage_spec.rb",
55
71
  "spec/etl/etl_batch_id_dataset_filter.rb",
56
72
  "spec/etl/key_builder_spec.rb",
57
73
  "spec/etl/load_dataset_builder_spec.rb",
58
- "spec/etl/mysql_dumpfile_spec.rb",
59
- "spec/etl/mysql_load_file_value_transformer_spec.rb",
60
- "spec/etl/screens/composite_screen_spec.rb",
74
+ "spec/etl/pipeline_stage_builder_spec.rb",
75
+ "spec/etl/schema_table_sink_factory_spec.rb",
61
76
  "spec/etl/screens/invalid_element_spec.rb",
62
77
  "spec/etl/screens/missing_value_spec.rb",
63
78
  "spec/etl/screens/out_of_bounds_spec.rb",
64
79
  "spec/etl/sequel/dependant_tables_spec.rb",
65
80
  "spec/etl/sequel/filter_to_etl_batch_spec.rb",
66
- "spec/etl/sequel/load_data_infile_expression_spec.rb",
67
- "spec/etl/sequel/load_data_infile_spec.rb",
68
- "spec/etl/sink_spec.rb",
69
81
  "spec/etl/table_builder_spec.rb",
70
82
  "spec/etl/task_spec.rb",
71
- "spec/etl/transformations/add_insert_timestamp_spec.rb",
72
83
  "spec/etl/transformations/uk_post_code_field_spec.rb",
73
84
  "spec/etl/transformations/uk_post_code_spec.rb",
85
+ "spec/etl/transformations_spec.rb",
86
+ "spec/flow/array_sink_spec.rb",
87
+ "spec/flow/array_source_spec.rb",
88
+ "spec/flow/dataset_source_spec.rb",
89
+ "spec/flow/filter_spec.rb",
90
+ "spec/flow/mysql_file_serializer_spec.rb",
91
+ "spec/flow/mysql_file_sink_spec.rb",
92
+ "spec/flow/mysql_integration_spec.rb",
93
+ "spec/flow/pipeline_stage_spec.rb",
94
+ "spec/flow/transformation_chain_spec.rb",
95
+ "spec/flow/transformation_spec.rb",
74
96
  "spec/spec_helper.rb"
75
97
  ]
76
98
  s.homepage = "http://github.com/notonthehighstreet/chicago-etl"
@@ -84,35 +106,50 @@ Gem::Specification.new do |s|
84
106
 
85
107
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
86
108
  s.add_runtime_dependency(%q<chicagowarehouse>, ["~> 0.4"])
109
+ s.add_runtime_dependency(%q<fastercsv>, [">= 0"])
110
+ s.add_runtime_dependency(%q<sequel>, [">= 0"])
111
+ s.add_runtime_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
112
+ s.add_runtime_dependency(%q<sequel_fast_columns>, [">= 0"])
87
113
  s.add_development_dependency(%q<rspec>, ["~> 2"])
88
114
  s.add_development_dependency(%q<timecop>, [">= 0"])
89
115
  s.add_development_dependency(%q<yard>, [">= 0"])
90
116
  s.add_development_dependency(%q<flog>, [">= 0"])
91
- s.add_development_dependency(%q<jeweler>, [">= 0"])
92
- s.add_development_dependency(%q<rcov>, [">= 0"])
93
117
  s.add_development_dependency(%q<simplecov>, [">= 0"])
94
118
  s.add_development_dependency(%q<ZenTest>, [">= 0"])
119
+ s.add_development_dependency(%q<mysql>, ["= 2.8.1"])
120
+ s.add_development_dependency(%q<bundler>, ["~> 1"])
121
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
95
122
  else
96
123
  s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
124
+ s.add_dependency(%q<fastercsv>, [">= 0"])
125
+ s.add_dependency(%q<sequel>, [">= 0"])
126
+ s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
127
+ s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
97
128
  s.add_dependency(%q<rspec>, ["~> 2"])
98
129
  s.add_dependency(%q<timecop>, [">= 0"])
99
130
  s.add_dependency(%q<yard>, [">= 0"])
100
131
  s.add_dependency(%q<flog>, [">= 0"])
101
- s.add_dependency(%q<jeweler>, [">= 0"])
102
- s.add_dependency(%q<rcov>, [">= 0"])
103
132
  s.add_dependency(%q<simplecov>, [">= 0"])
104
133
  s.add_dependency(%q<ZenTest>, [">= 0"])
134
+ s.add_dependency(%q<mysql>, ["= 2.8.1"])
135
+ s.add_dependency(%q<bundler>, ["~> 1"])
136
+ s.add_dependency(%q<jeweler>, [">= 0"])
105
137
  end
106
138
  else
107
139
  s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
140
+ s.add_dependency(%q<fastercsv>, [">= 0"])
141
+ s.add_dependency(%q<sequel>, [">= 0"])
142
+ s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
143
+ s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
108
144
  s.add_dependency(%q<rspec>, ["~> 2"])
109
145
  s.add_dependency(%q<timecop>, [">= 0"])
110
146
  s.add_dependency(%q<yard>, [">= 0"])
111
147
  s.add_dependency(%q<flog>, [">= 0"])
112
- s.add_dependency(%q<jeweler>, [">= 0"])
113
- s.add_dependency(%q<rcov>, [">= 0"])
114
148
  s.add_dependency(%q<simplecov>, [">= 0"])
115
149
  s.add_dependency(%q<ZenTest>, [">= 0"])
150
+ s.add_dependency(%q<mysql>, ["= 2.8.1"])
151
+ s.add_dependency(%q<bundler>, ["~> 1"])
152
+ s.add_dependency(%q<jeweler>, [">= 0"])
116
153
  end
117
154
  end
118
155
 
data/chicago-flow.gemspec ADDED
@@ -0,0 +1,92 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "chicago-flow"
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Roland Swingler"]
12
+ s.date = "2013-06-05"
13
+ s.description = "Dataflow-style processing for hash-like rows"
14
+ s.email = "roland.swingler@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "lib/chicago/flow.rb",
28
+ "lib/chicago/flow/array_sink.rb",
29
+ "lib/chicago/flow/array_source.rb",
30
+ "lib/chicago/flow/dataset_source.rb",
31
+ "lib/chicago/flow/filter.rb",
32
+ "lib/chicago/flow/mysql.rb",
33
+ "lib/chicago/flow/mysql_file_serializer.rb",
34
+ "lib/chicago/flow/mysql_file_sink.rb",
35
+ "lib/chicago/flow/pipeline_endpoint.rb",
36
+ "lib/chicago/flow/pipeline_stage.rb",
37
+ "lib/chicago/flow/sink.rb",
38
+ "lib/chicago/flow/transformation.rb",
39
+ "lib/chicago/flow/transformation_chain.rb",
40
+ "spec/array_sink_spec.rb",
41
+ "spec/array_source_spec.rb",
42
+ "spec/database.yml.dist",
43
+ "spec/dataset_source_spec.rb",
44
+ "spec/filter_spec.rb",
45
+ "spec/mysql_file_serializer_spec.rb",
46
+ "spec/mysql_file_sink_spec.rb",
47
+ "spec/mysql_integration_spec.rb",
48
+ "spec/pipeline_stage_spec.rb",
49
+ "spec/spec_helper.rb",
50
+ "spec/transformation_chain_spec.rb",
51
+ "spec/transformation_spec.rb"
52
+ ]
53
+ s.homepage = "http://github.com/notonthehighstreet/chicago-flow"
54
+ s.licenses = ["MIT"]
55
+ s.require_paths = ["lib"]
56
+ s.rubygems_version = "1.8.24"
57
+ s.summary = "Dataflow-style processing for hash-like rows"
58
+
59
+ if s.respond_to? :specification_version then
60
+ s.specification_version = 3
61
+
62
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
63
+ s.add_runtime_dependency(%q<fastercsv>, [">= 0"])
64
+ s.add_runtime_dependency(%q<sequel>, [">= 0"])
65
+ s.add_runtime_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
66
+ s.add_runtime_dependency(%q<sequel_fast_columns>, [">= 0"])
67
+ s.add_development_dependency(%q<mysql>, ["= 2.8.1"])
68
+ s.add_development_dependency(%q<rspec>, ["~> 2"])
69
+ s.add_development_dependency(%q<bundler>, ["~> 1"])
70
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
71
+ else
72
+ s.add_dependency(%q<fastercsv>, [">= 0"])
73
+ s.add_dependency(%q<sequel>, [">= 0"])
74
+ s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
75
+ s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
76
+ s.add_dependency(%q<mysql>, ["= 2.8.1"])
77
+ s.add_dependency(%q<rspec>, ["~> 2"])
78
+ s.add_dependency(%q<bundler>, ["~> 1"])
79
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
80
+ end
81
+ else
82
+ s.add_dependency(%q<fastercsv>, [">= 0"])
83
+ s.add_dependency(%q<sequel>, [">= 0"])
84
+ s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
85
+ s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
86
+ s.add_dependency(%q<mysql>, ["= 2.8.1"])
87
+ s.add_dependency(%q<rspec>, ["~> 2"])
88
+ s.add_dependency(%q<bundler>, ["~> 1"])
89
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
90
+ end
91
+ end
92
+
data/lib/chicago/etl/batch.rb CHANGED
@@ -32,21 +32,27 @@ module Chicago
32
32
  end
33
33
 
34
34
  # Deprecated.
35
+ #
36
+ # @deprecated Use perform_task instead
35
37
  def load(task_name, &block)
36
38
  perform_task(:load, task_name, &block)
37
39
  end
38
40
 
39
41
  # Deprecated.
42
+ #
43
+ # @deprecated Use perform_task instead
40
44
  def transform(task_name, &block)
41
45
  perform_task(:extract, task_name, &block)
42
46
  end
43
47
 
44
48
  # Deprecated.
49
+ #
50
+ # @deprecated Use perform_task instead
45
51
  def extract(task_name, &block)
46
52
  perform_task(:extract, task_name, &block)
47
53
  end
48
54
 
49
- # Perform a named task if it hasn't already run successfully in
55
+ # Performs a named task if it hasn't already run successfully in
50
56
  # this batch.
51
57
  def perform_task(stage, task_name, &block)
52
58
  task = find_or_create_task_invocation(stage, task_name)
@@ -95,7 +101,8 @@ module Chicago
95
101
  @log ||= Logger.new(File.join(dir, "log"))
96
102
  end
97
103
 
98
- def after_create # :nodoc:
104
+ # @api private
105
+ def after_create
99
106
  FileUtils.mkdir_p(dir, :mode => 0777)
100
107
  end
101
108
 
data/lib/chicago/etl/core_extensions.rb ADDED
@@ -0,0 +1,12 @@
1
+ class Hash
2
+ def put(key, value)
3
+ store(key, value)
4
+ self
5
+ end
6
+
7
+ def modify_existing(key)
8
+ value = self[key]
9
+ self[key] = yield value unless value.nil?
10
+ self
11
+ end
12
+ end
data/lib/chicago/etl/counter.rb CHANGED
@@ -12,7 +12,14 @@ module Chicago
12
12
  # Returns the current number this counter is on.
13
13
  attr_reader :current
14
14
 
15
- # Creates a new counter, optionally with a starting count.
15
+ # Creates a new counter.
16
+ #
17
+ # May optionally be created with a starting count, either as a
18
+ # number or as a block which generates a number.
19
+ #
20
+ # Counter.new(41).next # returns 42
21
+ # Counter.new { 2 + 2 }.next # returns 5
22
+ #
16
23
  def initialize(current_number=0, &block)
17
24
  @mutex = Mutex.new
18
25
  if block
data/lib/chicago/etl/dataset_batch_stage.rb ADDED
@@ -0,0 +1,52 @@
1
+ module Chicago
2
+ module ETL
3
+ # Links a PipelineStage to a Dataset.
4
+ #
5
+ # Allows deferring constructing a DatasetSource until extract
6
+ # time, so that it can be filtered to an ETL batch appropriately.
7
+ class DatasetBatchStage
8
+ attr_reader :name
9
+
10
+ def initialize(name, dataset, pipeline_stage, options={})
11
+ @name = name
12
+ @dataset = dataset
13
+ @pipeline_stage = pipeline_stage
14
+ @filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
15
+ dataset.filter_to_etl_batch(etl_batch)
16
+ }
17
+ @truncate_pre_load = !!options[:truncate_pre_load]
18
+ end
19
+
20
+ # Executes this ETL stage.
21
+ #
22
+ # Configures the dataset and flows rows into the pipeline.
23
+ def execute(etl_batch, reextract=false)
24
+ if @truncate_pre_load
25
+ pipeline_stage.sinks.each {|sink| sink.truncate }
26
+ elsif reextract && pipeline_stage.sink(:error)
27
+ pipeline_stage.sink(:error).truncate
28
+ end
29
+
30
+ pipeline_stage.execute(source(etl_batch, reextract))
31
+ end
32
+
33
+ # Returns the pipeline for this stage.
34
+ def pipeline_stage
35
+ @pipeline_stage.sink(:default).
36
+ set_constant_values(:_inserted_at => Time.now)
37
+ @pipeline_stage
38
+ end
39
+
40
+ # Returns a DatasetSource for the provided dataset filtered to
41
+ # the ETL batch as appropriate.
42
+ def source(etl_batch, reextract=false)
43
+ if reextract
44
+ filtered_dataset = @dataset
45
+ else
46
+ filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
47
+ end
48
+ Chicago::Flow::DatasetSource.new(filtered_dataset)
49
+ end
50
+ end
51
+ end
52
+ end
data/lib/chicago/etl/key_builder.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  require 'digest/md5'
2
- require 'chicago/etl/buffering_insert_writer'
3
2
 
4
3
  module Chicago
5
4
  module ETL
@@ -12,6 +11,8 @@ module Chicago
12
11
  #
13
12
  # @api public
14
13
  class KeyBuilder
14
+ # Creates the appropriate KeyBuilder for a star schema table.
15
+ #
15
16
  # @api private
16
17
  class Factory
17
18
  attr_reader :table, :staging_db
@@ -20,19 +21,17 @@ module Chicago
20
21
  @table = table
21
22
  @staging_db = staging_db
22
23
  end
23
-
24
+
24
25
  def make
25
26
  if dimension?
26
27
  key_table = staging_db[table.key_table_name]
27
- key_sink = BufferingInsertWriter.new(key_table,
28
- [:original_id, :dimension_id])
29
28
 
30
29
  if table.identifiable?
31
- IdentifiableDimensionKeyBuilder.new(key_table, key_sink)
30
+ IdentifiableDimensionKeyBuilder.new(key_table)
32
31
  elsif existing_hash_column?(table)
33
- ExistingHashColumnKeyBuilder.new(key_table, key_sink)
32
+ ExistingHashColumnKeyBuilder.new(key_table)
34
33
  else
35
- HashingKeyBuilder.new(key_table, key_sink, columns_to_hash)
34
+ HashingKeyBuilder.new(key_table, columns_to_hash)
36
35
  end
37
36
  elsif fact?
38
37
  FactKeyBuilder.new(staging_db[table.table_name])
@@ -68,9 +67,8 @@ module Chicago
68
67
  Factory.new(table, staging_db).make
69
68
  end
70
69
 
71
- def initialize(key_table, key_sink)
70
+ def initialize(key_table)
72
71
  @key_table = key_table
73
- @new_keys = key_sink
74
72
  @counter = Counter.new { key_table.max(:dimension_id) }
75
73
  end
76
74
 
@@ -84,14 +82,15 @@ module Chicago
84
82
  new_key = @key_mapping[row_id]
85
83
 
86
84
  if new_key
87
- new_key
85
+ [new_key, nil]
88
86
  else
89
87
  new_key = @counter.next
90
- @new_keys << {
91
- :original_id => key_for_insert(row_id),
92
- :dimension_id => new_key
93
- }
94
88
  @key_mapping[row_id] = new_key
89
+
90
+ [new_key, {
91
+ :original_id => row_id,
92
+ :dimension_id => new_key
93
+ }]
95
94
  end
96
95
  end
97
96
 
@@ -100,11 +99,6 @@ module Chicago
100
99
  # Overridden by subclasses.
101
100
  def original_key(row)
102
101
  end
103
-
104
- # Flushes any newly created keys to the key table.
105
- def flush
106
- @new_keys.flush
107
- end
108
102
 
109
103
  protected
110
104
 
@@ -132,10 +126,6 @@ module Chicago
132
126
  row[:original_id]
133
127
  end
134
128
 
135
- def key_for_insert(original_id)
136
- original_id
137
- end
138
-
139
129
  def original_key_select_fragment
140
130
  :original_id
141
131
  end
@@ -150,10 +140,6 @@ module Chicago
150
140
  row[:hash].upcase
151
141
  end
152
142
 
153
- def key_for_insert(original_id)
154
- ("0x" + original_id).lit
155
- end
156
-
157
143
  def original_key_select_fragment
158
144
  :hex.sql_function(:original_id).as(:original_id)
159
145
  end
@@ -170,8 +156,8 @@ module Chicago
170
156
  attr_reader :columns
171
157
  attr_accessor :hash_preparation
172
158
 
173
- def initialize(key_table, key_sink, columns)
174
- super(key_table, key_sink)
159
+ def initialize(key_table, columns)
160
+ super(key_table)
175
161
  @columns = columns
176
162
  @hash_preparation = lambda {|column| column.to_s.upcase }
177
163
  end
@@ -181,10 +167,6 @@ module Chicago
181
167
  Digest::MD5.hexdigest(str).upcase
182
168
  end
183
169
 
184
- def key_for_insert(original_id)
185
- ("0x" + original_id).lit
186
- end
187
-
188
170
  def original_key_select_fragment
189
171
  :hex.sql_function(:original_id).as(:original_id)
190
172
  end
@@ -203,7 +185,7 @@ module Chicago
203
185
  #
204
186
  # In addition, the same row passed twice will get a different id.
205
187
  class FactKeyBuilder
206
- def initialize(db_table, key_sink=nil)
188
+ def initialize(db_table)
207
189
  @db_table = db_table
208
190
  @counter = Counter.new { @db_table.max(:id) }
209
191
  end
@@ -211,11 +193,7 @@ module Chicago
211
193
  # Returns an id given a row - the row actually has no bearing on
212
194
  # the id returned.
213
195
  def key(row)
214
- @counter.next
215
- end
216
-
217
- # No-op, provided for interface compatability.
218
- def flush
196
+ [@counter.next, nil]
219
197
  end
220
198
  end
221
199
  end
data/lib/chicago/etl/load_dataset_builder.rb CHANGED
@@ -2,6 +2,9 @@ require 'set'
2
2
 
3
3
  module Chicago
4
4
  module ETL
5
+ # Currently unused - work in progress.
6
+ #
7
+ # @api private
5
8
  class LoadDatasetBuilder
6
9
  def initialize(&block)
7
10
  @constructed_columns = {}
@@ -69,7 +72,6 @@ module Chicago
69
72
  raise "Column #{name} was either ambiguous or non-existant"
70
73
  end
71
74
  end
72
-
73
75
  end
74
76
  end
75
77
  end