chicago-etl 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 0.0.13
|
10
|
+
version: 0.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2013-
|
18
|
+
date: 2013-09-05 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -36,16 +36,16 @@ dependencies:
|
|
36
36
|
version_requirements: &id002 !ruby/object:Gem::Requirement
|
37
37
|
none: false
|
38
38
|
requirements:
|
39
|
-
- -
|
39
|
+
- - ">="
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
hash:
|
41
|
+
hash: 3
|
42
42
|
segments:
|
43
|
-
-
|
44
|
-
version: "
|
43
|
+
- 0
|
44
|
+
version: "0"
|
45
45
|
requirement: *id002
|
46
|
-
type: :
|
46
|
+
type: :runtime
|
47
47
|
prerelease: false
|
48
|
-
name:
|
48
|
+
name: fastercsv
|
49
49
|
- !ruby/object:Gem::Dependency
|
50
50
|
version_requirements: &id003 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
@@ -57,23 +57,25 @@ dependencies:
|
|
57
57
|
- 0
|
58
58
|
version: "0"
|
59
59
|
requirement: *id003
|
60
|
-
type: :
|
60
|
+
type: :runtime
|
61
61
|
prerelease: false
|
62
|
-
name:
|
62
|
+
name: sequel
|
63
63
|
- !ruby/object:Gem::Dependency
|
64
64
|
version_requirements: &id004 !ruby/object:Gem::Requirement
|
65
65
|
none: false
|
66
66
|
requirements:
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
hash:
|
69
|
+
hash: 27
|
70
70
|
segments:
|
71
71
|
- 0
|
72
|
-
|
72
|
+
- 0
|
73
|
+
- 2
|
74
|
+
version: 0.0.2
|
73
75
|
requirement: *id004
|
74
|
-
type: :
|
76
|
+
type: :runtime
|
75
77
|
prerelease: false
|
76
|
-
name:
|
78
|
+
name: sequel_load_data_infile
|
77
79
|
- !ruby/object:Gem::Dependency
|
78
80
|
version_requirements: &id005 !ruby/object:Gem::Requirement
|
79
81
|
none: false
|
@@ -85,23 +87,23 @@ dependencies:
|
|
85
87
|
- 0
|
86
88
|
version: "0"
|
87
89
|
requirement: *id005
|
88
|
-
type: :
|
90
|
+
type: :runtime
|
89
91
|
prerelease: false
|
90
|
-
name:
|
92
|
+
name: sequel_fast_columns
|
91
93
|
- !ruby/object:Gem::Dependency
|
92
94
|
version_requirements: &id006 !ruby/object:Gem::Requirement
|
93
95
|
none: false
|
94
96
|
requirements:
|
95
|
-
- -
|
97
|
+
- - ~>
|
96
98
|
- !ruby/object:Gem::Version
|
97
|
-
hash:
|
99
|
+
hash: 7
|
98
100
|
segments:
|
99
|
-
-
|
100
|
-
version: "
|
101
|
+
- 2
|
102
|
+
version: "2"
|
101
103
|
requirement: *id006
|
102
104
|
type: :development
|
103
105
|
prerelease: false
|
104
|
-
name:
|
106
|
+
name: rspec
|
105
107
|
- !ruby/object:Gem::Dependency
|
106
108
|
version_requirements: &id007 !ruby/object:Gem::Requirement
|
107
109
|
none: false
|
@@ -115,7 +117,7 @@ dependencies:
|
|
115
117
|
requirement: *id007
|
116
118
|
type: :development
|
117
119
|
prerelease: false
|
118
|
-
name:
|
120
|
+
name: timecop
|
119
121
|
- !ruby/object:Gem::Dependency
|
120
122
|
version_requirements: &id008 !ruby/object:Gem::Requirement
|
121
123
|
none: false
|
@@ -129,7 +131,7 @@ dependencies:
|
|
129
131
|
requirement: *id008
|
130
132
|
type: :development
|
131
133
|
prerelease: false
|
132
|
-
name:
|
134
|
+
name: yard
|
133
135
|
- !ruby/object:Gem::Dependency
|
134
136
|
version_requirements: &id009 !ruby/object:Gem::Requirement
|
135
137
|
none: false
|
@@ -143,7 +145,79 @@ dependencies:
|
|
143
145
|
requirement: *id009
|
144
146
|
type: :development
|
145
147
|
prerelease: false
|
148
|
+
name: flog
|
149
|
+
- !ruby/object:Gem::Dependency
|
150
|
+
version_requirements: &id010 !ruby/object:Gem::Requirement
|
151
|
+
none: false
|
152
|
+
requirements:
|
153
|
+
- - ">="
|
154
|
+
- !ruby/object:Gem::Version
|
155
|
+
hash: 3
|
156
|
+
segments:
|
157
|
+
- 0
|
158
|
+
version: "0"
|
159
|
+
requirement: *id010
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
name: simplecov
|
163
|
+
- !ruby/object:Gem::Dependency
|
164
|
+
version_requirements: &id011 !ruby/object:Gem::Requirement
|
165
|
+
none: false
|
166
|
+
requirements:
|
167
|
+
- - ">="
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
hash: 3
|
170
|
+
segments:
|
171
|
+
- 0
|
172
|
+
version: "0"
|
173
|
+
requirement: *id011
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
146
176
|
name: ZenTest
|
177
|
+
- !ruby/object:Gem::Dependency
|
178
|
+
version_requirements: &id012 !ruby/object:Gem::Requirement
|
179
|
+
none: false
|
180
|
+
requirements:
|
181
|
+
- - "="
|
182
|
+
- !ruby/object:Gem::Version
|
183
|
+
hash: 45
|
184
|
+
segments:
|
185
|
+
- 2
|
186
|
+
- 8
|
187
|
+
- 1
|
188
|
+
version: 2.8.1
|
189
|
+
requirement: *id012
|
190
|
+
type: :development
|
191
|
+
prerelease: false
|
192
|
+
name: mysql
|
193
|
+
- !ruby/object:Gem::Dependency
|
194
|
+
version_requirements: &id013 !ruby/object:Gem::Requirement
|
195
|
+
none: false
|
196
|
+
requirements:
|
197
|
+
- - ~>
|
198
|
+
- !ruby/object:Gem::Version
|
199
|
+
hash: 1
|
200
|
+
segments:
|
201
|
+
- 1
|
202
|
+
version: "1"
|
203
|
+
requirement: *id013
|
204
|
+
type: :development
|
205
|
+
prerelease: false
|
206
|
+
name: bundler
|
207
|
+
- !ruby/object:Gem::Dependency
|
208
|
+
version_requirements: &id014 !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
210
|
+
requirements:
|
211
|
+
- - ">="
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
hash: 3
|
214
|
+
segments:
|
215
|
+
- 0
|
216
|
+
version: "0"
|
217
|
+
requirement: *id014
|
218
|
+
type: :development
|
219
|
+
prerelease: false
|
220
|
+
name: jeweler
|
147
221
|
description: ETL tools for Chicago
|
148
222
|
email: roland.swingler@gmail.com
|
149
223
|
executables: []
|
@@ -162,52 +236,74 @@ files:
|
|
162
236
|
- Rakefile
|
163
237
|
- VERSION
|
164
238
|
- chicago-etl.gemspec
|
239
|
+
- chicago-flow.gemspec
|
165
240
|
- lib/chicago-etl.rb
|
166
241
|
- lib/chicago/etl.rb
|
167
242
|
- lib/chicago/etl/batch.rb
|
168
|
-
- lib/chicago/etl/
|
243
|
+
- lib/chicago/etl/core_extensions.rb
|
169
244
|
- lib/chicago/etl/counter.rb
|
245
|
+
- lib/chicago/etl/dataset_batch_stage.rb
|
170
246
|
- lib/chicago/etl/key_builder.rb
|
171
247
|
- lib/chicago/etl/load_dataset_builder.rb
|
172
|
-
- lib/chicago/etl/
|
173
|
-
- lib/chicago/etl/
|
248
|
+
- lib/chicago/etl/load_pipeline_stage_builder.rb
|
249
|
+
- lib/chicago/etl/pipeline.rb
|
250
|
+
- lib/chicago/etl/schema_table_sink_factory.rb
|
174
251
|
- lib/chicago/etl/screens/column_screen.rb
|
175
|
-
- lib/chicago/etl/screens/composite_screen.rb
|
176
252
|
- lib/chicago/etl/screens/invalid_element.rb
|
177
253
|
- lib/chicago/etl/screens/missing_value.rb
|
178
254
|
- lib/chicago/etl/screens/out_of_bounds.rb
|
179
255
|
- lib/chicago/etl/sequel/dependant_tables.rb
|
180
256
|
- lib/chicago/etl/sequel/filter_to_etl_batch.rb
|
181
|
-
- lib/chicago/etl/sequel/load_data_infile.rb
|
182
|
-
- lib/chicago/etl/sink.rb
|
183
257
|
- lib/chicago/etl/table_builder.rb
|
184
258
|
- lib/chicago/etl/task_invocation.rb
|
185
259
|
- lib/chicago/etl/tasks.rb
|
186
|
-
- lib/chicago/etl/transformations
|
260
|
+
- lib/chicago/etl/transformations.rb
|
187
261
|
- lib/chicago/etl/transformations/uk_post_code.rb
|
188
262
|
- lib/chicago/etl/transformations/uk_post_code_field.rb
|
263
|
+
- lib/chicago/flow/array_sink.rb
|
264
|
+
- lib/chicago/flow/array_source.rb
|
265
|
+
- lib/chicago/flow/dataset_source.rb
|
266
|
+
- lib/chicago/flow/errors.rb
|
267
|
+
- lib/chicago/flow/filter.rb
|
268
|
+
- lib/chicago/flow/mysql.rb
|
269
|
+
- lib/chicago/flow/mysql_file_serializer.rb
|
270
|
+
- lib/chicago/flow/mysql_file_sink.rb
|
271
|
+
- lib/chicago/flow/null_sink.rb
|
272
|
+
- lib/chicago/flow/pipeline_endpoint.rb
|
273
|
+
- lib/chicago/flow/pipeline_stage.rb
|
274
|
+
- lib/chicago/flow/sink.rb
|
275
|
+
- lib/chicago/flow/transformation.rb
|
276
|
+
- lib/chicago/flow/transformation_chain.rb
|
189
277
|
- spec/db_connections.yml.dist
|
190
278
|
- spec/etl/batch_spec.rb
|
279
|
+
- spec/etl/core_extensions_spec.rb
|
191
280
|
- spec/etl/counter_spec.rb
|
281
|
+
- spec/etl/dataset_batch_stage_spec.rb
|
192
282
|
- spec/etl/etl_batch_id_dataset_filter.rb
|
193
283
|
- spec/etl/key_builder_spec.rb
|
194
284
|
- spec/etl/load_dataset_builder_spec.rb
|
195
|
-
- spec/etl/
|
196
|
-
- spec/etl/
|
197
|
-
- spec/etl/screens/composite_screen_spec.rb
|
285
|
+
- spec/etl/pipeline_stage_builder_spec.rb
|
286
|
+
- spec/etl/schema_table_sink_factory_spec.rb
|
198
287
|
- spec/etl/screens/invalid_element_spec.rb
|
199
288
|
- spec/etl/screens/missing_value_spec.rb
|
200
289
|
- spec/etl/screens/out_of_bounds_spec.rb
|
201
290
|
- spec/etl/sequel/dependant_tables_spec.rb
|
202
291
|
- spec/etl/sequel/filter_to_etl_batch_spec.rb
|
203
|
-
- spec/etl/sequel/load_data_infile_expression_spec.rb
|
204
|
-
- spec/etl/sequel/load_data_infile_spec.rb
|
205
|
-
- spec/etl/sink_spec.rb
|
206
292
|
- spec/etl/table_builder_spec.rb
|
207
293
|
- spec/etl/task_spec.rb
|
208
|
-
- spec/etl/transformations/add_insert_timestamp_spec.rb
|
209
294
|
- spec/etl/transformations/uk_post_code_field_spec.rb
|
210
295
|
- spec/etl/transformations/uk_post_code_spec.rb
|
296
|
+
- spec/etl/transformations_spec.rb
|
297
|
+
- spec/flow/array_sink_spec.rb
|
298
|
+
- spec/flow/array_source_spec.rb
|
299
|
+
- spec/flow/dataset_source_spec.rb
|
300
|
+
- spec/flow/filter_spec.rb
|
301
|
+
- spec/flow/mysql_file_serializer_spec.rb
|
302
|
+
- spec/flow/mysql_file_sink_spec.rb
|
303
|
+
- spec/flow/mysql_integration_spec.rb
|
304
|
+
- spec/flow/pipeline_stage_spec.rb
|
305
|
+
- spec/flow/transformation_chain_spec.rb
|
306
|
+
- spec/flow/transformation_spec.rb
|
211
307
|
- spec/spec_helper.rb
|
212
308
|
homepage: http://github.com/notonthehighstreet/chicago-etl
|
213
309
|
licenses:
|
@@ -1,36 +0,0 @@
|
|
1
|
-
require 'chicago/etl/sink'
|
2
|
-
|
3
|
-
module Chicago
|
4
|
-
module ETL
|
5
|
-
# Wrapper around a dataset to allowed buffered inserts.
|
6
|
-
#
|
7
|
-
# @api public
|
8
|
-
class BufferingInsertWriter < Sink
|
9
|
-
# The number of rows written before inserting to the DB.
|
10
|
-
BUFFER_SIZE = 10_000
|
11
|
-
|
12
|
-
def initialize(dataset, column_names, key=nil)
|
13
|
-
super([], column_names, key)
|
14
|
-
@dataset = dataset
|
15
|
-
end
|
16
|
-
|
17
|
-
def flush
|
18
|
-
@dataset.insert_replace.import(column_names, output)
|
19
|
-
output.clear
|
20
|
-
end
|
21
|
-
|
22
|
-
protected
|
23
|
-
|
24
|
-
def write(row)
|
25
|
-
output << @column_names.map {|name| row[name] }
|
26
|
-
flush if reached_buffer_limit?
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def reached_buffer_limit?
|
32
|
-
output.size >= BUFFER_SIZE
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
require 'chicago/etl/sink'
|
2
|
-
|
3
|
-
module Chicago
|
4
|
-
module ETL
|
5
|
-
# Wrapper around FasterCSV's output object, to convert values to a
|
6
|
-
# format required by MySQL's LOAD DATA INFILE command.
|
7
|
-
#
|
8
|
-
# @api public
|
9
|
-
class MysqlDumpfile < Sink
|
10
|
-
# Creates a new writer.
|
11
|
-
#
|
12
|
-
# @param csv a FasterCSV output object
|
13
|
-
# @param [Symbol] column_names columns to be output
|
14
|
-
# @param key an optional key to ensure rows are written only once.
|
15
|
-
def initialize(csv, column_names, key=nil)
|
16
|
-
super(csv, column_names, key)
|
17
|
-
@transformer = MysqlLoadFileValueTransformer.new
|
18
|
-
end
|
19
|
-
|
20
|
-
protected
|
21
|
-
|
22
|
-
# Writes a row to the output.
|
23
|
-
#
|
24
|
-
# @param Hash row Only keys in column_names will be output.
|
25
|
-
def write(row)
|
26
|
-
output << @column_names.map {|name|
|
27
|
-
@transformer.transform(row[name])
|
28
|
-
}
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module ETL
|
3
|
-
module Screens
|
4
|
-
class CompositeScreen
|
5
|
-
def initialize(*screens)
|
6
|
-
@screens = screens.flatten
|
7
|
-
end
|
8
|
-
|
9
|
-
def call(row, errors=[])
|
10
|
-
@screens.inject([row,errors]) do |(row, errors), screen|
|
11
|
-
screen.call(row, errors)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
@@ -1,141 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module ETL
|
3
|
-
module SequelExtensions
|
4
|
-
# @api private
|
5
|
-
class LoadDataInfileExpression
|
6
|
-
attr_reader :path, :table, :columns, :ignore, :character_set
|
7
|
-
|
8
|
-
def initialize(path, table, columns, opts={})
|
9
|
-
@path = path
|
10
|
-
@table = table
|
11
|
-
@columns = columns
|
12
|
-
@ignore = opts[:ignore]
|
13
|
-
@update = opts[:update]
|
14
|
-
@set = opts[:set] || {}
|
15
|
-
@character_set = opts[:character_set] || "utf8"
|
16
|
-
if opts[:format] == :csv
|
17
|
-
@field_terminator = ","
|
18
|
-
@enclosed_by = '"'
|
19
|
-
@escaped_by = '"'
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def replace?
|
24
|
-
@update == :replace
|
25
|
-
end
|
26
|
-
|
27
|
-
def ignore?
|
28
|
-
@update == :ignore
|
29
|
-
end
|
30
|
-
|
31
|
-
def to_sql(db)
|
32
|
-
@db = db
|
33
|
-
[load_fragment,
|
34
|
-
replace_fragment,
|
35
|
-
table_fragment,
|
36
|
-
character_set_fragment,
|
37
|
-
field_terminator_fragment,
|
38
|
-
field_enclosure_fragment,
|
39
|
-
escape_fragment,
|
40
|
-
ignore_fragment,
|
41
|
-
column_fragment,
|
42
|
-
set_fragment].compact.join(" ")
|
43
|
-
end
|
44
|
-
|
45
|
-
private
|
46
|
-
|
47
|
-
def load_fragment
|
48
|
-
"LOAD DATA INFILE '#{path}'"
|
49
|
-
end
|
50
|
-
|
51
|
-
def replace_fragment
|
52
|
-
@update.to_s.upcase if replace? || ignore?
|
53
|
-
end
|
54
|
-
|
55
|
-
def table_fragment
|
56
|
-
"INTO TABLE `#{table}`"
|
57
|
-
end
|
58
|
-
|
59
|
-
def character_set_fragment
|
60
|
-
"CHARACTER SET '#{character_set}'"
|
61
|
-
end
|
62
|
-
|
63
|
-
def field_terminator_fragment
|
64
|
-
"FIELDS TERMINATED BY '#{@field_terminator}'" if @field_terminator
|
65
|
-
end
|
66
|
-
|
67
|
-
def field_enclosure_fragment
|
68
|
-
"OPTIONALLY ENCLOSED BY '#{@enclosed_by}'" if @enclosed_by
|
69
|
-
end
|
70
|
-
|
71
|
-
def escape_fragment
|
72
|
-
"ESCAPED BY '#{@escaped_by}'" if @escaped_by
|
73
|
-
end
|
74
|
-
|
75
|
-
def ignore_fragment
|
76
|
-
"IGNORE #{ignore} LINES" if ignore
|
77
|
-
end
|
78
|
-
|
79
|
-
def column_fragment
|
80
|
-
"(" + columns.map {|c| format_column(c) }.join(",") + ")"
|
81
|
-
end
|
82
|
-
|
83
|
-
def set_fragment
|
84
|
-
unless @set.empty?
|
85
|
-
"SET " + @set.map do |k, v|
|
86
|
-
"#{@db.literal(k)} = #{@db.literal(v)}"
|
87
|
-
end.join(", ")
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def format_column(column)
|
92
|
-
column.to_s[0..0] == "@" ? column : "`#{column}`"
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
module LoadDataInfile
|
97
|
-
# Load data in file specified at path.
|
98
|
-
#
|
99
|
-
# Columns is a list of columns to load - column names starting
|
100
|
-
# with an @ symbol will be treated as variables.
|
101
|
-
#
|
102
|
-
# By default, this will generate a REPLACE INTO TABLE
|
103
|
-
# statement.
|
104
|
-
#
|
105
|
-
# Options:
|
106
|
-
# :ignore - the number of lines to ignore in the source file
|
107
|
-
# :update - nil, :ignore or :replace
|
108
|
-
# :set - a hash specifying autopopulation of columns
|
109
|
-
# :character_set - the character set of the file, UTF8 default
|
110
|
-
# :format - either nil or :csv
|
111
|
-
def load_infile(path, columns, options={})
|
112
|
-
execute_dui(load_infile_sql(path, columns, options))
|
113
|
-
end
|
114
|
-
|
115
|
-
def load_infile_sql(path, columns, options={})
|
116
|
-
replacement = opts[:insert_ignore] ? :ignore : :replace
|
117
|
-
options = {:update => replacement}.merge(options)
|
118
|
-
LoadDataInfileExpression.new(path,
|
119
|
-
opts[:from].first,
|
120
|
-
columns,
|
121
|
-
options).
|
122
|
-
to_sql(db)
|
123
|
-
end
|
124
|
-
|
125
|
-
# Loads the CSV data columns in path into this dataset's
|
126
|
-
# table.
|
127
|
-
#
|
128
|
-
# See load_infile for more options.
|
129
|
-
def load_csv_infile(path, columns, options={})
|
130
|
-
execute_dui(load_csv_infile_sql(path, columns, options))
|
131
|
-
end
|
132
|
-
|
133
|
-
def load_csv_infile_sql(path, columns, options={})
|
134
|
-
load_infile_sql(path, columns, options.merge(:format => :csv))
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::LoadDataInfile
|
data/lib/chicago/etl/sink.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
|
-
module Chicago
|
4
|
-
module ETL
|
5
|
-
# An end point to write rows.
|
6
|
-
#
|
7
|
-
# @abstract
|
8
|
-
# @api public
|
9
|
-
class Sink
|
10
|
-
# Returns the column names expected to be written to this sink.
|
11
|
-
# @api public
|
12
|
-
attr_reader :column_names
|
13
|
-
|
14
|
-
# @abstract
|
15
|
-
def initialize(output, column_names, unique_row_key=nil)
|
16
|
-
@output = output
|
17
|
-
@column_names = column_names
|
18
|
-
@written_rows = Set.new
|
19
|
-
@unique_row_key = unique_row_key
|
20
|
-
end
|
21
|
-
|
22
|
-
# Writes a row to the output.
|
23
|
-
#
|
24
|
-
# Row will not be written to the output if it has already been
|
25
|
-
# written, as identified by the unique row key.
|
26
|
-
#
|
27
|
-
# Should not be overridden by subclasses - overwrite write instead.
|
28
|
-
def <<(row)
|
29
|
-
unless written?(row)
|
30
|
-
write row
|
31
|
-
@written_rows << row[@unique_row_key]
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
# Flushes any remaining writes to the output.
|
36
|
-
#
|
37
|
-
# By default does nothing, subclasses should override where
|
38
|
-
# necessary.
|
39
|
-
def flush
|
40
|
-
end
|
41
|
-
|
42
|
-
# Returns true if this row has previously been written to the
|
43
|
-
# output.
|
44
|
-
#
|
45
|
-
# Always returns false if no key to determine row uniqueness has
|
46
|
-
# been provided.
|
47
|
-
def written?(row)
|
48
|
-
return false if @unique_row_key.nil?
|
49
|
-
@written_rows.include?(row[@unique_row_key])
|
50
|
-
end
|
51
|
-
|
52
|
-
protected
|
53
|
-
|
54
|
-
attr_reader :output
|
55
|
-
|
56
|
-
# @abstract
|
57
|
-
def write(row)
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module ETL
|
3
|
-
module Transformations
|
4
|
-
class AddInsertTimestamp
|
5
|
-
def initialize(timestamp=Time.now)
|
6
|
-
@insert_timestamp = timestamp.utc
|
7
|
-
end
|
8
|
-
|
9
|
-
def call(row, errors=[])
|
10
|
-
row[:_inserted_at] = @insert_timestamp
|
11
|
-
[row, errors]
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::MysqlDumpfile do
|
4
|
-
before :each do
|
5
|
-
@csv = mock(:csv)
|
6
|
-
end
|
7
|
-
|
8
|
-
it "outputs specified column values in order" do
|
9
|
-
dumpfile = described_class.new(@csv, [:foo, :bar])
|
10
|
-
@csv.should_receive(:<<).with(["1", "2"])
|
11
|
-
|
12
|
-
dumpfile << {:foo => "1", :bar => "2", :baz => "not output"}
|
13
|
-
end
|
14
|
-
|
15
|
-
it "transforms values with a MysqlLoadFileValueTransformer" do
|
16
|
-
transformer = mock(:transformer)
|
17
|
-
Chicago::ETL::MysqlLoadFileValueTransformer.stub(:new).and_return(transformer)
|
18
|
-
|
19
|
-
transformer.should_receive(:transform).with("bar").and_return("baz")
|
20
|
-
@csv.should_receive(:<<).with(["baz"])
|
21
|
-
|
22
|
-
dumpfile = described_class.new(@csv, [:foo])
|
23
|
-
dumpfile << {:foo => "bar"}
|
24
|
-
end
|
25
|
-
|
26
|
-
it "will write a row only once with the same key" do
|
27
|
-
dumpfile = described_class.new(@csv, [:foo], :id)
|
28
|
-
@csv.should_receive(:<<).with(["bar"])
|
29
|
-
|
30
|
-
dumpfile << {:id => 1, :foo => "bar"}
|
31
|
-
dumpfile << {:id => 1, :foo => "baz"}
|
32
|
-
end
|
33
|
-
|
34
|
-
it "will write a row multiple times if no key is specified" do
|
35
|
-
dumpfile = described_class.new(@csv, [:foo])
|
36
|
-
@csv.should_receive(:<<).with(["bar"])
|
37
|
-
@csv.should_receive(:<<).with(["baz"])
|
38
|
-
|
39
|
-
dumpfile << {:id => 1, :foo => "bar"}
|
40
|
-
dumpfile << {:id => 1, :foo => "baz"}
|
41
|
-
end
|
42
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::MysqlLoadFileValueTransformer do
|
4
|
-
it "transforms nil into \\N" do
|
5
|
-
subject.transform(nil).should == "\\N"
|
6
|
-
end
|
7
|
-
|
8
|
-
it "transforms true into '1'" do
|
9
|
-
subject.transform(true).should == "1"
|
10
|
-
end
|
11
|
-
|
12
|
-
it "transforms false into '0'" do
|
13
|
-
subject.transform(false).should == "0"
|
14
|
-
end
|
15
|
-
|
16
|
-
it "transforms times into mysql time format" do
|
17
|
-
subject.transform(Time.local(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
|
18
|
-
end
|
19
|
-
|
20
|
-
it "transforms datetimes into mysql time format" do
|
21
|
-
subject.transform(DateTime.new(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
|
22
|
-
end
|
23
|
-
|
24
|
-
it "transforms dates into mysql date format" do
|
25
|
-
subject.transform(Date.new(2011,01,02)).should == "2011-01-02"
|
26
|
-
end
|
27
|
-
end
|
@@ -1,25 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::Screens::CompositeScreen do
|
4
|
-
let(:screen) do
|
5
|
-
i = 0
|
6
|
-
|
7
|
-
lambda {|row, errors|
|
8
|
-
i += 1
|
9
|
-
errors << i
|
10
|
-
[row, errors]
|
11
|
-
}
|
12
|
-
end
|
13
|
-
|
14
|
-
it "calls all child screens" do
|
15
|
-
row, errors = described_class.new([screen, screen]).call({:a => 1}, [])
|
16
|
-
row.should == {:a => 1}
|
17
|
-
errors.should == [1,2]
|
18
|
-
end
|
19
|
-
|
20
|
-
it "supports variable arguments in the constructor" do
|
21
|
-
row, errors = described_class.new(screen, screen).call({:a => 1}, [])
|
22
|
-
row.should == {:a => 1}
|
23
|
-
errors.should == [1,2]
|
24
|
-
end
|
25
|
-
end
|