chicago-etl 0.0.13 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
data/Gemfile
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
3
|
gem "chicagowarehouse", "~> 0.4"
|
4
|
+
gem "fastercsv", :platform => :ruby_18
|
5
|
+
gem "sequel"
|
6
|
+
gem "sequel_load_data_infile", ">= 0.0.2", :require => "sequel/load_data_infile"
|
7
|
+
gem "sequel_fast_columns", :require => "sequel/fast_columns"
|
4
8
|
|
5
9
|
# Add dependencies to develop your gem here.
|
6
10
|
# Include everything needed to run rake, tests, features, etc.
|
@@ -9,8 +13,9 @@ group :development do
|
|
9
13
|
gem "timecop"
|
10
14
|
gem "yard"
|
11
15
|
gem "flog"
|
12
|
-
gem "jeweler"
|
13
|
-
gem "rcov", :platforms => :mri_18
|
14
|
-
gem "simplecov", :platforms => :mri_19
|
16
|
+
gem "simplecov", :platforms => :mri_19, :require => false
|
15
17
|
gem "ZenTest"
|
18
|
+
gem "mysql", "2.8.1"
|
19
|
+
gem "bundler", "~> 1"
|
20
|
+
gem "jeweler"
|
16
21
|
end
|
data/README.rdoc
CHANGED
@@ -14,8 +14,11 @@ An ETL pipeline for use with Chicago Warehouse.
|
|
14
14
|
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
15
15
|
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
16
16
|
|
17
|
+
== Authors
|
18
|
+
|
19
|
+
Roland Swingler (@knaveofdiamonds)
|
20
|
+
|
17
21
|
== Copyright
|
18
22
|
|
19
23
|
Copyright (c) 2012 notonthehighstreet.com. See LICENSE.txt for
|
20
24
|
further details.
|
21
|
-
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.13
|
1
|
+
0.1.0
|
data/chicago-etl.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "chicago-etl"
|
8
|
-
s.version = "0.0.13"
|
8
|
+
s.version = "0.1.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Roland Swingler"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-09-05"
|
13
13
|
s.description = "ETL tools for Chicago"
|
14
14
|
s.email = "roland.swingler@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -25,52 +25,74 @@ Gem::Specification.new do |s|
|
|
25
25
|
"Rakefile",
|
26
26
|
"VERSION",
|
27
27
|
"chicago-etl.gemspec",
|
28
|
+
"chicago-flow.gemspec",
|
28
29
|
"lib/chicago-etl.rb",
|
29
30
|
"lib/chicago/etl.rb",
|
30
31
|
"lib/chicago/etl/batch.rb",
|
31
|
-
"lib/chicago/etl/
|
32
|
+
"lib/chicago/etl/core_extensions.rb",
|
32
33
|
"lib/chicago/etl/counter.rb",
|
34
|
+
"lib/chicago/etl/dataset_batch_stage.rb",
|
33
35
|
"lib/chicago/etl/key_builder.rb",
|
34
36
|
"lib/chicago/etl/load_dataset_builder.rb",
|
35
|
-
"lib/chicago/etl/
|
36
|
-
"lib/chicago/etl/
|
37
|
+
"lib/chicago/etl/load_pipeline_stage_builder.rb",
|
38
|
+
"lib/chicago/etl/pipeline.rb",
|
39
|
+
"lib/chicago/etl/schema_table_sink_factory.rb",
|
37
40
|
"lib/chicago/etl/screens/column_screen.rb",
|
38
|
-
"lib/chicago/etl/screens/composite_screen.rb",
|
39
41
|
"lib/chicago/etl/screens/invalid_element.rb",
|
40
42
|
"lib/chicago/etl/screens/missing_value.rb",
|
41
43
|
"lib/chicago/etl/screens/out_of_bounds.rb",
|
42
44
|
"lib/chicago/etl/sequel/dependant_tables.rb",
|
43
45
|
"lib/chicago/etl/sequel/filter_to_etl_batch.rb",
|
44
|
-
"lib/chicago/etl/sequel/load_data_infile.rb",
|
45
|
-
"lib/chicago/etl/sink.rb",
|
46
46
|
"lib/chicago/etl/table_builder.rb",
|
47
47
|
"lib/chicago/etl/task_invocation.rb",
|
48
48
|
"lib/chicago/etl/tasks.rb",
|
49
|
-
"lib/chicago/etl/transformations
|
49
|
+
"lib/chicago/etl/transformations.rb",
|
50
50
|
"lib/chicago/etl/transformations/uk_post_code.rb",
|
51
51
|
"lib/chicago/etl/transformations/uk_post_code_field.rb",
|
52
|
+
"lib/chicago/flow/array_sink.rb",
|
53
|
+
"lib/chicago/flow/array_source.rb",
|
54
|
+
"lib/chicago/flow/dataset_source.rb",
|
55
|
+
"lib/chicago/flow/errors.rb",
|
56
|
+
"lib/chicago/flow/filter.rb",
|
57
|
+
"lib/chicago/flow/mysql.rb",
|
58
|
+
"lib/chicago/flow/mysql_file_serializer.rb",
|
59
|
+
"lib/chicago/flow/mysql_file_sink.rb",
|
60
|
+
"lib/chicago/flow/null_sink.rb",
|
61
|
+
"lib/chicago/flow/pipeline_endpoint.rb",
|
62
|
+
"lib/chicago/flow/pipeline_stage.rb",
|
63
|
+
"lib/chicago/flow/sink.rb",
|
64
|
+
"lib/chicago/flow/transformation.rb",
|
65
|
+
"lib/chicago/flow/transformation_chain.rb",
|
52
66
|
"spec/db_connections.yml.dist",
|
53
67
|
"spec/etl/batch_spec.rb",
|
68
|
+
"spec/etl/core_extensions_spec.rb",
|
54
69
|
"spec/etl/counter_spec.rb",
|
70
|
+
"spec/etl/dataset_batch_stage_spec.rb",
|
55
71
|
"spec/etl/etl_batch_id_dataset_filter.rb",
|
56
72
|
"spec/etl/key_builder_spec.rb",
|
57
73
|
"spec/etl/load_dataset_builder_spec.rb",
|
58
|
-
"spec/etl/
|
59
|
-
"spec/etl/
|
60
|
-
"spec/etl/screens/composite_screen_spec.rb",
|
74
|
+
"spec/etl/pipeline_stage_builder_spec.rb",
|
75
|
+
"spec/etl/schema_table_sink_factory_spec.rb",
|
61
76
|
"spec/etl/screens/invalid_element_spec.rb",
|
62
77
|
"spec/etl/screens/missing_value_spec.rb",
|
63
78
|
"spec/etl/screens/out_of_bounds_spec.rb",
|
64
79
|
"spec/etl/sequel/dependant_tables_spec.rb",
|
65
80
|
"spec/etl/sequel/filter_to_etl_batch_spec.rb",
|
66
|
-
"spec/etl/sequel/load_data_infile_expression_spec.rb",
|
67
|
-
"spec/etl/sequel/load_data_infile_spec.rb",
|
68
|
-
"spec/etl/sink_spec.rb",
|
69
81
|
"spec/etl/table_builder_spec.rb",
|
70
82
|
"spec/etl/task_spec.rb",
|
71
|
-
"spec/etl/transformations/add_insert_timestamp_spec.rb",
|
72
83
|
"spec/etl/transformations/uk_post_code_field_spec.rb",
|
73
84
|
"spec/etl/transformations/uk_post_code_spec.rb",
|
85
|
+
"spec/etl/transformations_spec.rb",
|
86
|
+
"spec/flow/array_sink_spec.rb",
|
87
|
+
"spec/flow/array_source_spec.rb",
|
88
|
+
"spec/flow/dataset_source_spec.rb",
|
89
|
+
"spec/flow/filter_spec.rb",
|
90
|
+
"spec/flow/mysql_file_serializer_spec.rb",
|
91
|
+
"spec/flow/mysql_file_sink_spec.rb",
|
92
|
+
"spec/flow/mysql_integration_spec.rb",
|
93
|
+
"spec/flow/pipeline_stage_spec.rb",
|
94
|
+
"spec/flow/transformation_chain_spec.rb",
|
95
|
+
"spec/flow/transformation_spec.rb",
|
74
96
|
"spec/spec_helper.rb"
|
75
97
|
]
|
76
98
|
s.homepage = "http://github.com/notonthehighstreet/chicago-etl"
|
@@ -84,35 +106,50 @@ Gem::Specification.new do |s|
|
|
84
106
|
|
85
107
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
86
108
|
s.add_runtime_dependency(%q<chicagowarehouse>, ["~> 0.4"])
|
109
|
+
s.add_runtime_dependency(%q<fastercsv>, [">= 0"])
|
110
|
+
s.add_runtime_dependency(%q<sequel>, [">= 0"])
|
111
|
+
s.add_runtime_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
|
112
|
+
s.add_runtime_dependency(%q<sequel_fast_columns>, [">= 0"])
|
87
113
|
s.add_development_dependency(%q<rspec>, ["~> 2"])
|
88
114
|
s.add_development_dependency(%q<timecop>, [">= 0"])
|
89
115
|
s.add_development_dependency(%q<yard>, [">= 0"])
|
90
116
|
s.add_development_dependency(%q<flog>, [">= 0"])
|
91
|
-
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
92
|
-
s.add_development_dependency(%q<rcov>, [">= 0"])
|
93
117
|
s.add_development_dependency(%q<simplecov>, [">= 0"])
|
94
118
|
s.add_development_dependency(%q<ZenTest>, [">= 0"])
|
119
|
+
s.add_development_dependency(%q<mysql>, ["= 2.8.1"])
|
120
|
+
s.add_development_dependency(%q<bundler>, ["~> 1"])
|
121
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
95
122
|
else
|
96
123
|
s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
|
124
|
+
s.add_dependency(%q<fastercsv>, [">= 0"])
|
125
|
+
s.add_dependency(%q<sequel>, [">= 0"])
|
126
|
+
s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
|
127
|
+
s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
|
97
128
|
s.add_dependency(%q<rspec>, ["~> 2"])
|
98
129
|
s.add_dependency(%q<timecop>, [">= 0"])
|
99
130
|
s.add_dependency(%q<yard>, [">= 0"])
|
100
131
|
s.add_dependency(%q<flog>, [">= 0"])
|
101
|
-
s.add_dependency(%q<jeweler>, [">= 0"])
|
102
|
-
s.add_dependency(%q<rcov>, [">= 0"])
|
103
132
|
s.add_dependency(%q<simplecov>, [">= 0"])
|
104
133
|
s.add_dependency(%q<ZenTest>, [">= 0"])
|
134
|
+
s.add_dependency(%q<mysql>, ["= 2.8.1"])
|
135
|
+
s.add_dependency(%q<bundler>, ["~> 1"])
|
136
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
105
137
|
end
|
106
138
|
else
|
107
139
|
s.add_dependency(%q<chicagowarehouse>, ["~> 0.4"])
|
140
|
+
s.add_dependency(%q<fastercsv>, [">= 0"])
|
141
|
+
s.add_dependency(%q<sequel>, [">= 0"])
|
142
|
+
s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
|
143
|
+
s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
|
108
144
|
s.add_dependency(%q<rspec>, ["~> 2"])
|
109
145
|
s.add_dependency(%q<timecop>, [">= 0"])
|
110
146
|
s.add_dependency(%q<yard>, [">= 0"])
|
111
147
|
s.add_dependency(%q<flog>, [">= 0"])
|
112
|
-
s.add_dependency(%q<jeweler>, [">= 0"])
|
113
|
-
s.add_dependency(%q<rcov>, [">= 0"])
|
114
148
|
s.add_dependency(%q<simplecov>, [">= 0"])
|
115
149
|
s.add_dependency(%q<ZenTest>, [">= 0"])
|
150
|
+
s.add_dependency(%q<mysql>, ["= 2.8.1"])
|
151
|
+
s.add_dependency(%q<bundler>, ["~> 1"])
|
152
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
116
153
|
end
|
117
154
|
end
|
118
155
|
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "chicago-flow"
|
8
|
+
s.version = "0.0.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Roland Swingler"]
|
12
|
+
s.date = "2013-06-05"
|
13
|
+
s.description = "Dataflow-style processing for hash-like rows"
|
14
|
+
s.email = "roland.swingler@gmail.com"
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE.txt",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".rspec",
|
22
|
+
"Gemfile",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"lib/chicago/flow.rb",
|
28
|
+
"lib/chicago/flow/array_sink.rb",
|
29
|
+
"lib/chicago/flow/array_source.rb",
|
30
|
+
"lib/chicago/flow/dataset_source.rb",
|
31
|
+
"lib/chicago/flow/filter.rb",
|
32
|
+
"lib/chicago/flow/mysql.rb",
|
33
|
+
"lib/chicago/flow/mysql_file_serializer.rb",
|
34
|
+
"lib/chicago/flow/mysql_file_sink.rb",
|
35
|
+
"lib/chicago/flow/pipeline_endpoint.rb",
|
36
|
+
"lib/chicago/flow/pipeline_stage.rb",
|
37
|
+
"lib/chicago/flow/sink.rb",
|
38
|
+
"lib/chicago/flow/transformation.rb",
|
39
|
+
"lib/chicago/flow/transformation_chain.rb",
|
40
|
+
"spec/array_sink_spec.rb",
|
41
|
+
"spec/array_source_spec.rb",
|
42
|
+
"spec/database.yml.dist",
|
43
|
+
"spec/dataset_source_spec.rb",
|
44
|
+
"spec/filter_spec.rb",
|
45
|
+
"spec/mysql_file_serializer_spec.rb",
|
46
|
+
"spec/mysql_file_sink_spec.rb",
|
47
|
+
"spec/mysql_integration_spec.rb",
|
48
|
+
"spec/pipeline_stage_spec.rb",
|
49
|
+
"spec/spec_helper.rb",
|
50
|
+
"spec/transformation_chain_spec.rb",
|
51
|
+
"spec/transformation_spec.rb"
|
52
|
+
]
|
53
|
+
s.homepage = "http://github.com/notonthehighstreet/chicago-flow"
|
54
|
+
s.licenses = ["MIT"]
|
55
|
+
s.require_paths = ["lib"]
|
56
|
+
s.rubygems_version = "1.8.24"
|
57
|
+
s.summary = "Dataflow-style processing for hash-like rows"
|
58
|
+
|
59
|
+
if s.respond_to? :specification_version then
|
60
|
+
s.specification_version = 3
|
61
|
+
|
62
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
63
|
+
s.add_runtime_dependency(%q<fastercsv>, [">= 0"])
|
64
|
+
s.add_runtime_dependency(%q<sequel>, [">= 0"])
|
65
|
+
s.add_runtime_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
|
66
|
+
s.add_runtime_dependency(%q<sequel_fast_columns>, [">= 0"])
|
67
|
+
s.add_development_dependency(%q<mysql>, ["= 2.8.1"])
|
68
|
+
s.add_development_dependency(%q<rspec>, ["~> 2"])
|
69
|
+
s.add_development_dependency(%q<bundler>, ["~> 1"])
|
70
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
71
|
+
else
|
72
|
+
s.add_dependency(%q<fastercsv>, [">= 0"])
|
73
|
+
s.add_dependency(%q<sequel>, [">= 0"])
|
74
|
+
s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
|
75
|
+
s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
|
76
|
+
s.add_dependency(%q<mysql>, ["= 2.8.1"])
|
77
|
+
s.add_dependency(%q<rspec>, ["~> 2"])
|
78
|
+
s.add_dependency(%q<bundler>, ["~> 1"])
|
79
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
80
|
+
end
|
81
|
+
else
|
82
|
+
s.add_dependency(%q<fastercsv>, [">= 0"])
|
83
|
+
s.add_dependency(%q<sequel>, [">= 0"])
|
84
|
+
s.add_dependency(%q<sequel_load_data_infile>, [">= 0.0.2"])
|
85
|
+
s.add_dependency(%q<sequel_fast_columns>, [">= 0"])
|
86
|
+
s.add_dependency(%q<mysql>, ["= 2.8.1"])
|
87
|
+
s.add_dependency(%q<rspec>, ["~> 2"])
|
88
|
+
s.add_dependency(%q<bundler>, ["~> 1"])
|
89
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
data/lib/chicago/etl/batch.rb
CHANGED
@@ -32,21 +32,27 @@ module Chicago
|
|
32
32
|
end
|
33
33
|
|
34
34
|
# Deprecated.
|
35
|
+
#
|
36
|
+
# @deprecated Use perform_task instead
|
35
37
|
def load(task_name, &block)
|
36
38
|
perform_task(:load, task_name, &block)
|
37
39
|
end
|
38
40
|
|
39
41
|
# Deprecated.
|
42
|
+
#
|
43
|
+
# @deprecated Use perform_task instead
|
40
44
|
def transform(task_name, &block)
|
41
45
|
perform_task(:extract, task_name, &block)
|
42
46
|
end
|
43
47
|
|
44
48
|
# Deprecated.
|
49
|
+
#
|
50
|
+
# @deprecated Use perform_task instead
|
45
51
|
def extract(task_name, &block)
|
46
52
|
perform_task(:extract, task_name, &block)
|
47
53
|
end
|
48
54
|
|
49
|
-
#
|
55
|
+
# Performs a named task if it hasn't already run successfully in
|
50
56
|
# this batch.
|
51
57
|
def perform_task(stage, task_name, &block)
|
52
58
|
task = find_or_create_task_invocation(stage, task_name)
|
@@ -95,7 +101,8 @@ module Chicago
|
|
95
101
|
@log ||= Logger.new(File.join(dir, "log"))
|
96
102
|
end
|
97
103
|
|
98
|
-
|
104
|
+
# @api private
|
105
|
+
def after_create
|
99
106
|
FileUtils.mkdir_p(dir, :mode => 0777)
|
100
107
|
end
|
101
108
|
|
data/lib/chicago/etl/counter.rb
CHANGED
@@ -12,7 +12,14 @@ module Chicago
|
|
12
12
|
# Returns the current number this counter is on.
|
13
13
|
attr_reader :current
|
14
14
|
|
15
|
-
# Creates a new counter
|
15
|
+
# Creates a new counter.
|
16
|
+
#
|
17
|
+
# May optionally be created with a starting count, either as a
|
18
|
+
# number or as a block which generates a number.
|
19
|
+
#
|
20
|
+
# Counter.new(41).next # returns 42
|
21
|
+
# Counter.new { 2 + 2 }.next # returns 5
|
22
|
+
#
|
16
23
|
def initialize(current_number=0, &block)
|
17
24
|
@mutex = Mutex.new
|
18
25
|
if block
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Chicago
|
2
|
+
module ETL
|
3
|
+
# Links a PipelineStage to a Dataset.
|
4
|
+
#
|
5
|
+
# Allows deferring constructing a DatasetSource until extract
|
6
|
+
# time, so that it can be filtered to an ETL batch appropriately.
|
7
|
+
class DatasetBatchStage
|
8
|
+
attr_reader :name
|
9
|
+
|
10
|
+
def initialize(name, dataset, pipeline_stage, options={})
|
11
|
+
@name = name
|
12
|
+
@dataset = dataset
|
13
|
+
@pipeline_stage = pipeline_stage
|
14
|
+
@filter_strategy = options[:filter_strategy] || lambda {|dataset, etl_batch|
|
15
|
+
dataset.filter_to_etl_batch(etl_batch)
|
16
|
+
}
|
17
|
+
@truncate_pre_load = !!options[:truncate_pre_load]
|
18
|
+
end
|
19
|
+
|
20
|
+
# Executes this ETL stage.
|
21
|
+
#
|
22
|
+
# Configures the dataset and flows rows into the pipeline.
|
23
|
+
def execute(etl_batch, reextract=false)
|
24
|
+
if @truncate_pre_load
|
25
|
+
pipeline_stage.sinks.each {|sink| sink.truncate }
|
26
|
+
elsif reextract && pipeline_stage.sink(:error)
|
27
|
+
pipeline_stage.sink(:error).truncate
|
28
|
+
end
|
29
|
+
|
30
|
+
pipeline_stage.execute(source(etl_batch, reextract))
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns the pipeline for this stage.
|
34
|
+
def pipeline_stage
|
35
|
+
@pipeline_stage.sink(:default).
|
36
|
+
set_constant_values(:_inserted_at => Time.now)
|
37
|
+
@pipeline_stage
|
38
|
+
end
|
39
|
+
|
40
|
+
# Returns a DatasetSource for the provided dataset filtered to
|
41
|
+
# the ETL batch as appropriate.
|
42
|
+
def source(etl_batch, reextract=false)
|
43
|
+
if reextract
|
44
|
+
filtered_dataset = @dataset
|
45
|
+
else
|
46
|
+
filtered_dataset = @filter_strategy.call(@dataset, etl_batch)
|
47
|
+
end
|
48
|
+
Chicago::Flow::DatasetSource.new(filtered_dataset)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'digest/md5'
|
2
|
-
require 'chicago/etl/buffering_insert_writer'
|
3
2
|
|
4
3
|
module Chicago
|
5
4
|
module ETL
|
@@ -12,6 +11,8 @@ module Chicago
|
|
12
11
|
#
|
13
12
|
# @api public
|
14
13
|
class KeyBuilder
|
14
|
+
# Creates the appropriate KeyBuilder for a star schema table.
|
15
|
+
#
|
15
16
|
# @api private
|
16
17
|
class Factory
|
17
18
|
attr_reader :table, :staging_db
|
@@ -20,19 +21,17 @@ module Chicago
|
|
20
21
|
@table = table
|
21
22
|
@staging_db = staging_db
|
22
23
|
end
|
23
|
-
|
24
|
+
|
24
25
|
def make
|
25
26
|
if dimension?
|
26
27
|
key_table = staging_db[table.key_table_name]
|
27
|
-
key_sink = BufferingInsertWriter.new(key_table,
|
28
|
-
[:original_id, :dimension_id])
|
29
28
|
|
30
29
|
if table.identifiable?
|
31
|
-
IdentifiableDimensionKeyBuilder.new(key_table, key_sink)
|
30
|
+
IdentifiableDimensionKeyBuilder.new(key_table)
|
32
31
|
elsif existing_hash_column?(table)
|
33
|
-
ExistingHashColumnKeyBuilder.new(key_table, key_sink)
|
32
|
+
ExistingHashColumnKeyBuilder.new(key_table)
|
34
33
|
else
|
35
|
-
HashingKeyBuilder.new(key_table, key_sink, columns_to_hash)
|
34
|
+
HashingKeyBuilder.new(key_table, columns_to_hash)
|
36
35
|
end
|
37
36
|
elsif fact?
|
38
37
|
FactKeyBuilder.new(staging_db[table.table_name])
|
@@ -68,9 +67,8 @@ module Chicago
|
|
68
67
|
Factory.new(table, staging_db).make
|
69
68
|
end
|
70
69
|
|
71
|
-
def initialize(key_table, key_sink)
|
70
|
+
def initialize(key_table)
|
72
71
|
@key_table = key_table
|
73
|
-
@new_keys = key_sink
|
74
72
|
@counter = Counter.new { key_table.max(:dimension_id) }
|
75
73
|
end
|
76
74
|
|
@@ -84,14 +82,15 @@ module Chicago
|
|
84
82
|
new_key = @key_mapping[row_id]
|
85
83
|
|
86
84
|
if new_key
|
87
|
-
new_key
|
85
|
+
[new_key, nil]
|
88
86
|
else
|
89
87
|
new_key = @counter.next
|
90
|
-
@new_keys << {
|
91
|
-
:original_id => key_for_insert(row_id),
|
92
|
-
:dimension_id => new_key
|
93
|
-
}
|
94
88
|
@key_mapping[row_id] = new_key
|
89
|
+
|
90
|
+
[new_key, {
|
91
|
+
:original_id => row_id,
|
92
|
+
:dimension_id => new_key
|
93
|
+
}]
|
95
94
|
end
|
96
95
|
end
|
97
96
|
|
@@ -100,11 +99,6 @@ module Chicago
|
|
100
99
|
# Overridden by subclasses.
|
101
100
|
def original_key(row)
|
102
101
|
end
|
103
|
-
|
104
|
-
# Flushes any newly created keys to the key table.
|
105
|
-
def flush
|
106
|
-
@new_keys.flush
|
107
|
-
end
|
108
102
|
|
109
103
|
protected
|
110
104
|
|
@@ -132,10 +126,6 @@ module Chicago
|
|
132
126
|
row[:original_id]
|
133
127
|
end
|
134
128
|
|
135
|
-
def key_for_insert(original_id)
|
136
|
-
original_id
|
137
|
-
end
|
138
|
-
|
139
129
|
def original_key_select_fragment
|
140
130
|
:original_id
|
141
131
|
end
|
@@ -150,10 +140,6 @@ module Chicago
|
|
150
140
|
row[:hash].upcase
|
151
141
|
end
|
152
142
|
|
153
|
-
def key_for_insert(original_id)
|
154
|
-
("0x" + original_id).lit
|
155
|
-
end
|
156
|
-
|
157
143
|
def original_key_select_fragment
|
158
144
|
:hex.sql_function(:original_id).as(:original_id)
|
159
145
|
end
|
@@ -170,8 +156,8 @@ module Chicago
|
|
170
156
|
attr_reader :columns
|
171
157
|
attr_accessor :hash_preparation
|
172
158
|
|
173
|
-
def initialize(key_table, key_sink, columns)
|
174
|
-
super(key_table, key_sink)
|
159
|
+
def initialize(key_table, columns)
|
160
|
+
super(key_table)
|
175
161
|
@columns = columns
|
176
162
|
@hash_preparation = lambda {|column| column.to_s.upcase }
|
177
163
|
end
|
@@ -181,10 +167,6 @@ module Chicago
|
|
181
167
|
Digest::MD5.hexdigest(str).upcase
|
182
168
|
end
|
183
169
|
|
184
|
-
def key_for_insert(original_id)
|
185
|
-
("0x" + original_id).lit
|
186
|
-
end
|
187
|
-
|
188
170
|
def original_key_select_fragment
|
189
171
|
:hex.sql_function(:original_id).as(:original_id)
|
190
172
|
end
|
@@ -203,7 +185,7 @@ module Chicago
|
|
203
185
|
#
|
204
186
|
# In addition, the same row passed twice will get a different id.
|
205
187
|
class FactKeyBuilder
|
206
|
-
def initialize(db_table
|
188
|
+
def initialize(db_table)
|
207
189
|
@db_table = db_table
|
208
190
|
@counter = Counter.new { @db_table.max(:id) }
|
209
191
|
end
|
@@ -211,11 +193,7 @@ module Chicago
|
|
211
193
|
# Returns an id given a row - the row actually has no bearing on
|
212
194
|
# the id returned.
|
213
195
|
def key(row)
|
214
|
-
@counter.next
|
215
|
-
end
|
216
|
-
|
217
|
-
# No-op, provided for interface compatability.
|
218
|
-
def flush
|
196
|
+
[@counter.next, nil]
|
219
197
|
end
|
220
198
|
end
|
221
199
|
end
|
@@ -2,6 +2,9 @@ require 'set'
|
|
2
2
|
|
3
3
|
module Chicago
|
4
4
|
module ETL
|
5
|
+
# Currently unused - work in progress.
|
6
|
+
#
|
7
|
+
# @api private
|
5
8
|
class LoadDatasetBuilder
|
6
9
|
def initialize(&block)
|
7
10
|
@constructed_columns = {}
|
@@ -69,7 +72,6 @@ module Chicago
|
|
69
72
|
raise "Column #{name} was either ambiguous or non-existant"
|
70
73
|
end
|
71
74
|
end
|
72
|
-
|
73
75
|
end
|
74
76
|
end
|
75
77
|
end
|