chicago-etl 0.0.13 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -3
- data/README.rdoc +4 -1
- data/VERSION +1 -1
- data/chicago-etl.gemspec +59 -22
- data/chicago-flow.gemspec +92 -0
- data/lib/chicago/etl/batch.rb +9 -2
- data/lib/chicago/etl/core_extensions.rb +12 -0
- data/lib/chicago/etl/counter.rb +8 -1
- data/lib/chicago/etl/dataset_batch_stage.rb +52 -0
- data/lib/chicago/etl/key_builder.rb +17 -39
- data/lib/chicago/etl/load_dataset_builder.rb +3 -1
- data/lib/chicago/etl/load_pipeline_stage_builder.rb +142 -0
- data/lib/chicago/etl/pipeline.rb +151 -0
- data/lib/chicago/etl/schema_table_sink_factory.rb +74 -0
- data/lib/chicago/etl/screens/column_screen.rb +26 -25
- data/lib/chicago/etl/screens/invalid_element.rb +5 -5
- data/lib/chicago/etl/screens/missing_value.rb +4 -2
- data/lib/chicago/etl/screens/out_of_bounds.rb +2 -0
- data/lib/chicago/etl/table_builder.rb +4 -2
- data/lib/chicago/etl/task_invocation.rb +0 -1
- data/lib/chicago/etl/transformations.rb +128 -0
- data/lib/chicago/etl.rb +39 -8
- data/lib/chicago/flow/array_sink.rb +35 -0
- data/lib/chicago/flow/array_source.rb +15 -0
- data/lib/chicago/flow/dataset_source.rb +23 -0
- data/lib/chicago/flow/errors.rb +14 -0
- data/lib/chicago/flow/filter.rb +15 -0
- data/lib/chicago/flow/mysql.rb +4 -0
- data/lib/chicago/{etl/mysql_load_file_value_transformer.rb → flow/mysql_file_serializer.rb} +7 -4
- data/lib/chicago/flow/mysql_file_sink.rb +68 -0
- data/lib/chicago/flow/null_sink.rb +8 -0
- data/lib/chicago/flow/pipeline_endpoint.rb +15 -0
- data/lib/chicago/flow/pipeline_stage.rb +68 -0
- data/lib/chicago/flow/sink.rb +53 -0
- data/lib/chicago/flow/transformation.rb +169 -0
- data/lib/chicago/flow/transformation_chain.rb +40 -0
- data/spec/etl/batch_spec.rb +2 -1
- data/spec/etl/core_extensions_spec.rb +13 -0
- data/spec/etl/dataset_batch_stage_spec.rb +55 -0
- data/spec/etl/key_builder_spec.rb +25 -83
- data/spec/etl/pipeline_stage_builder_spec.rb +39 -0
- data/spec/etl/schema_table_sink_factory_spec.rb +69 -0
- data/spec/etl/screens/invalid_element_spec.rb +10 -11
- data/spec/etl/screens/missing_value_spec.rb +21 -21
- data/spec/etl/screens/out_of_bounds_spec.rb +21 -29
- data/spec/etl/transformations_spec.rb +109 -0
- data/spec/flow/array_sink_spec.rb +26 -0
- data/spec/flow/array_source_spec.rb +20 -0
- data/spec/flow/dataset_source_spec.rb +15 -0
- data/spec/flow/filter_spec.rb +13 -0
- data/spec/flow/mysql_file_serializer_spec.rb +27 -0
- data/spec/flow/mysql_file_sink_spec.rb +94 -0
- data/spec/flow/mysql_integration_spec.rb +72 -0
- data/spec/flow/pipeline_stage_spec.rb +89 -0
- data/spec/flow/transformation_chain_spec.rb +76 -0
- data/spec/flow/transformation_spec.rb +91 -0
- data/spec/spec_helper.rb +5 -0
- metadata +135 -39
- data/lib/chicago/etl/buffering_insert_writer.rb +0 -36
- data/lib/chicago/etl/mysql_dumpfile.rb +0 -32
- data/lib/chicago/etl/screens/composite_screen.rb +0 -17
- data/lib/chicago/etl/sequel/load_data_infile.rb +0 -141
- data/lib/chicago/etl/sink.rb +0 -61
- data/lib/chicago/etl/transformations/add_insert_timestamp.rb +0 -16
- data/spec/etl/mysql_dumpfile_spec.rb +0 -42
- data/spec/etl/mysql_load_file_value_transformer_spec.rb +0 -27
- data/spec/etl/screens/composite_screen_spec.rb +0 -25
- data/spec/etl/sequel/load_data_infile_expression_spec.rb +0 -60
- data/spec/etl/sequel/load_data_infile_spec.rb +0 -37
- data/spec/etl/sink_spec.rb +0 -7
- data/spec/etl/transformations/add_insert_timestamp_spec.rb +0 -9
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chicago-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 0.0.13
|
10
|
+
version: 0.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Roland Swingler
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2013-
|
18
|
+
date: 2013-09-05 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -36,16 +36,16 @@ dependencies:
|
|
36
36
|
version_requirements: &id002 !ruby/object:Gem::Requirement
|
37
37
|
none: false
|
38
38
|
requirements:
|
39
|
-
- -
|
39
|
+
- - ">="
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
hash:
|
41
|
+
hash: 3
|
42
42
|
segments:
|
43
|
-
-
|
44
|
-
version: "
|
43
|
+
- 0
|
44
|
+
version: "0"
|
45
45
|
requirement: *id002
|
46
|
-
type: :
|
46
|
+
type: :runtime
|
47
47
|
prerelease: false
|
48
|
-
name:
|
48
|
+
name: fastercsv
|
49
49
|
- !ruby/object:Gem::Dependency
|
50
50
|
version_requirements: &id003 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
@@ -57,23 +57,25 @@ dependencies:
|
|
57
57
|
- 0
|
58
58
|
version: "0"
|
59
59
|
requirement: *id003
|
60
|
-
type: :
|
60
|
+
type: :runtime
|
61
61
|
prerelease: false
|
62
|
-
name:
|
62
|
+
name: sequel
|
63
63
|
- !ruby/object:Gem::Dependency
|
64
64
|
version_requirements: &id004 !ruby/object:Gem::Requirement
|
65
65
|
none: false
|
66
66
|
requirements:
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
hash:
|
69
|
+
hash: 27
|
70
70
|
segments:
|
71
71
|
- 0
|
72
|
-
|
72
|
+
- 0
|
73
|
+
- 2
|
74
|
+
version: 0.0.2
|
73
75
|
requirement: *id004
|
74
|
-
type: :
|
76
|
+
type: :runtime
|
75
77
|
prerelease: false
|
76
|
-
name:
|
78
|
+
name: sequel_load_data_infile
|
77
79
|
- !ruby/object:Gem::Dependency
|
78
80
|
version_requirements: &id005 !ruby/object:Gem::Requirement
|
79
81
|
none: false
|
@@ -85,23 +87,23 @@ dependencies:
|
|
85
87
|
- 0
|
86
88
|
version: "0"
|
87
89
|
requirement: *id005
|
88
|
-
type: :
|
90
|
+
type: :runtime
|
89
91
|
prerelease: false
|
90
|
-
name:
|
92
|
+
name: sequel_fast_columns
|
91
93
|
- !ruby/object:Gem::Dependency
|
92
94
|
version_requirements: &id006 !ruby/object:Gem::Requirement
|
93
95
|
none: false
|
94
96
|
requirements:
|
95
|
-
- -
|
97
|
+
- - ~>
|
96
98
|
- !ruby/object:Gem::Version
|
97
|
-
hash:
|
99
|
+
hash: 7
|
98
100
|
segments:
|
99
|
-
-
|
100
|
-
version: "
|
101
|
+
- 2
|
102
|
+
version: "2"
|
101
103
|
requirement: *id006
|
102
104
|
type: :development
|
103
105
|
prerelease: false
|
104
|
-
name:
|
106
|
+
name: rspec
|
105
107
|
- !ruby/object:Gem::Dependency
|
106
108
|
version_requirements: &id007 !ruby/object:Gem::Requirement
|
107
109
|
none: false
|
@@ -115,7 +117,7 @@ dependencies:
|
|
115
117
|
requirement: *id007
|
116
118
|
type: :development
|
117
119
|
prerelease: false
|
118
|
-
name:
|
120
|
+
name: timecop
|
119
121
|
- !ruby/object:Gem::Dependency
|
120
122
|
version_requirements: &id008 !ruby/object:Gem::Requirement
|
121
123
|
none: false
|
@@ -129,7 +131,7 @@ dependencies:
|
|
129
131
|
requirement: *id008
|
130
132
|
type: :development
|
131
133
|
prerelease: false
|
132
|
-
name:
|
134
|
+
name: yard
|
133
135
|
- !ruby/object:Gem::Dependency
|
134
136
|
version_requirements: &id009 !ruby/object:Gem::Requirement
|
135
137
|
none: false
|
@@ -143,7 +145,79 @@ dependencies:
|
|
143
145
|
requirement: *id009
|
144
146
|
type: :development
|
145
147
|
prerelease: false
|
148
|
+
name: flog
|
149
|
+
- !ruby/object:Gem::Dependency
|
150
|
+
version_requirements: &id010 !ruby/object:Gem::Requirement
|
151
|
+
none: false
|
152
|
+
requirements:
|
153
|
+
- - ">="
|
154
|
+
- !ruby/object:Gem::Version
|
155
|
+
hash: 3
|
156
|
+
segments:
|
157
|
+
- 0
|
158
|
+
version: "0"
|
159
|
+
requirement: *id010
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
name: simplecov
|
163
|
+
- !ruby/object:Gem::Dependency
|
164
|
+
version_requirements: &id011 !ruby/object:Gem::Requirement
|
165
|
+
none: false
|
166
|
+
requirements:
|
167
|
+
- - ">="
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
hash: 3
|
170
|
+
segments:
|
171
|
+
- 0
|
172
|
+
version: "0"
|
173
|
+
requirement: *id011
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
146
176
|
name: ZenTest
|
177
|
+
- !ruby/object:Gem::Dependency
|
178
|
+
version_requirements: &id012 !ruby/object:Gem::Requirement
|
179
|
+
none: false
|
180
|
+
requirements:
|
181
|
+
- - "="
|
182
|
+
- !ruby/object:Gem::Version
|
183
|
+
hash: 45
|
184
|
+
segments:
|
185
|
+
- 2
|
186
|
+
- 8
|
187
|
+
- 1
|
188
|
+
version: 2.8.1
|
189
|
+
requirement: *id012
|
190
|
+
type: :development
|
191
|
+
prerelease: false
|
192
|
+
name: mysql
|
193
|
+
- !ruby/object:Gem::Dependency
|
194
|
+
version_requirements: &id013 !ruby/object:Gem::Requirement
|
195
|
+
none: false
|
196
|
+
requirements:
|
197
|
+
- - ~>
|
198
|
+
- !ruby/object:Gem::Version
|
199
|
+
hash: 1
|
200
|
+
segments:
|
201
|
+
- 1
|
202
|
+
version: "1"
|
203
|
+
requirement: *id013
|
204
|
+
type: :development
|
205
|
+
prerelease: false
|
206
|
+
name: bundler
|
207
|
+
- !ruby/object:Gem::Dependency
|
208
|
+
version_requirements: &id014 !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
210
|
+
requirements:
|
211
|
+
- - ">="
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
hash: 3
|
214
|
+
segments:
|
215
|
+
- 0
|
216
|
+
version: "0"
|
217
|
+
requirement: *id014
|
218
|
+
type: :development
|
219
|
+
prerelease: false
|
220
|
+
name: jeweler
|
147
221
|
description: ETL tools for Chicago
|
148
222
|
email: roland.swingler@gmail.com
|
149
223
|
executables: []
|
@@ -162,52 +236,74 @@ files:
|
|
162
236
|
- Rakefile
|
163
237
|
- VERSION
|
164
238
|
- chicago-etl.gemspec
|
239
|
+
- chicago-flow.gemspec
|
165
240
|
- lib/chicago-etl.rb
|
166
241
|
- lib/chicago/etl.rb
|
167
242
|
- lib/chicago/etl/batch.rb
|
168
|
-
- lib/chicago/etl/
|
243
|
+
- lib/chicago/etl/core_extensions.rb
|
169
244
|
- lib/chicago/etl/counter.rb
|
245
|
+
- lib/chicago/etl/dataset_batch_stage.rb
|
170
246
|
- lib/chicago/etl/key_builder.rb
|
171
247
|
- lib/chicago/etl/load_dataset_builder.rb
|
172
|
-
- lib/chicago/etl/
|
173
|
-
- lib/chicago/etl/
|
248
|
+
- lib/chicago/etl/load_pipeline_stage_builder.rb
|
249
|
+
- lib/chicago/etl/pipeline.rb
|
250
|
+
- lib/chicago/etl/schema_table_sink_factory.rb
|
174
251
|
- lib/chicago/etl/screens/column_screen.rb
|
175
|
-
- lib/chicago/etl/screens/composite_screen.rb
|
176
252
|
- lib/chicago/etl/screens/invalid_element.rb
|
177
253
|
- lib/chicago/etl/screens/missing_value.rb
|
178
254
|
- lib/chicago/etl/screens/out_of_bounds.rb
|
179
255
|
- lib/chicago/etl/sequel/dependant_tables.rb
|
180
256
|
- lib/chicago/etl/sequel/filter_to_etl_batch.rb
|
181
|
-
- lib/chicago/etl/sequel/load_data_infile.rb
|
182
|
-
- lib/chicago/etl/sink.rb
|
183
257
|
- lib/chicago/etl/table_builder.rb
|
184
258
|
- lib/chicago/etl/task_invocation.rb
|
185
259
|
- lib/chicago/etl/tasks.rb
|
186
|
-
- lib/chicago/etl/transformations
|
260
|
+
- lib/chicago/etl/transformations.rb
|
187
261
|
- lib/chicago/etl/transformations/uk_post_code.rb
|
188
262
|
- lib/chicago/etl/transformations/uk_post_code_field.rb
|
263
|
+
- lib/chicago/flow/array_sink.rb
|
264
|
+
- lib/chicago/flow/array_source.rb
|
265
|
+
- lib/chicago/flow/dataset_source.rb
|
266
|
+
- lib/chicago/flow/errors.rb
|
267
|
+
- lib/chicago/flow/filter.rb
|
268
|
+
- lib/chicago/flow/mysql.rb
|
269
|
+
- lib/chicago/flow/mysql_file_serializer.rb
|
270
|
+
- lib/chicago/flow/mysql_file_sink.rb
|
271
|
+
- lib/chicago/flow/null_sink.rb
|
272
|
+
- lib/chicago/flow/pipeline_endpoint.rb
|
273
|
+
- lib/chicago/flow/pipeline_stage.rb
|
274
|
+
- lib/chicago/flow/sink.rb
|
275
|
+
- lib/chicago/flow/transformation.rb
|
276
|
+
- lib/chicago/flow/transformation_chain.rb
|
189
277
|
- spec/db_connections.yml.dist
|
190
278
|
- spec/etl/batch_spec.rb
|
279
|
+
- spec/etl/core_extensions_spec.rb
|
191
280
|
- spec/etl/counter_spec.rb
|
281
|
+
- spec/etl/dataset_batch_stage_spec.rb
|
192
282
|
- spec/etl/etl_batch_id_dataset_filter.rb
|
193
283
|
- spec/etl/key_builder_spec.rb
|
194
284
|
- spec/etl/load_dataset_builder_spec.rb
|
195
|
-
- spec/etl/
|
196
|
-
- spec/etl/
|
197
|
-
- spec/etl/screens/composite_screen_spec.rb
|
285
|
+
- spec/etl/pipeline_stage_builder_spec.rb
|
286
|
+
- spec/etl/schema_table_sink_factory_spec.rb
|
198
287
|
- spec/etl/screens/invalid_element_spec.rb
|
199
288
|
- spec/etl/screens/missing_value_spec.rb
|
200
289
|
- spec/etl/screens/out_of_bounds_spec.rb
|
201
290
|
- spec/etl/sequel/dependant_tables_spec.rb
|
202
291
|
- spec/etl/sequel/filter_to_etl_batch_spec.rb
|
203
|
-
- spec/etl/sequel/load_data_infile_expression_spec.rb
|
204
|
-
- spec/etl/sequel/load_data_infile_spec.rb
|
205
|
-
- spec/etl/sink_spec.rb
|
206
292
|
- spec/etl/table_builder_spec.rb
|
207
293
|
- spec/etl/task_spec.rb
|
208
|
-
- spec/etl/transformations/add_insert_timestamp_spec.rb
|
209
294
|
- spec/etl/transformations/uk_post_code_field_spec.rb
|
210
295
|
- spec/etl/transformations/uk_post_code_spec.rb
|
296
|
+
- spec/etl/transformations_spec.rb
|
297
|
+
- spec/flow/array_sink_spec.rb
|
298
|
+
- spec/flow/array_source_spec.rb
|
299
|
+
- spec/flow/dataset_source_spec.rb
|
300
|
+
- spec/flow/filter_spec.rb
|
301
|
+
- spec/flow/mysql_file_serializer_spec.rb
|
302
|
+
- spec/flow/mysql_file_sink_spec.rb
|
303
|
+
- spec/flow/mysql_integration_spec.rb
|
304
|
+
- spec/flow/pipeline_stage_spec.rb
|
305
|
+
- spec/flow/transformation_chain_spec.rb
|
306
|
+
- spec/flow/transformation_spec.rb
|
211
307
|
- spec/spec_helper.rb
|
212
308
|
homepage: http://github.com/notonthehighstreet/chicago-etl
|
213
309
|
licenses:
|
@@ -1,36 +0,0 @@
|
|
1
|
-
require 'chicago/etl/sink'
|
2
|
-
|
3
|
-
module Chicago
|
4
|
-
module ETL
|
5
|
-
# Wrapper around a dataset to allowed buffered inserts.
|
6
|
-
#
|
7
|
-
# @api public
|
8
|
-
class BufferingInsertWriter < Sink
|
9
|
-
# The number of rows written before inserting to the DB.
|
10
|
-
BUFFER_SIZE = 10_000
|
11
|
-
|
12
|
-
def initialize(dataset, column_names, key=nil)
|
13
|
-
super([], column_names, key)
|
14
|
-
@dataset = dataset
|
15
|
-
end
|
16
|
-
|
17
|
-
def flush
|
18
|
-
@dataset.insert_replace.import(column_names, output)
|
19
|
-
output.clear
|
20
|
-
end
|
21
|
-
|
22
|
-
protected
|
23
|
-
|
24
|
-
def write(row)
|
25
|
-
output << @column_names.map {|name| row[name] }
|
26
|
-
flush if reached_buffer_limit?
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def reached_buffer_limit?
|
32
|
-
output.size >= BUFFER_SIZE
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
require 'chicago/etl/sink'
|
2
|
-
|
3
|
-
module Chicago
|
4
|
-
module ETL
|
5
|
-
# Wrapper around FasterCSV's output object, to convert values to a
|
6
|
-
# format required by MySQL's LOAD DATA INFILE command.
|
7
|
-
#
|
8
|
-
# @api public
|
9
|
-
class MysqlDumpfile < Sink
|
10
|
-
# Creates a new writer.
|
11
|
-
#
|
12
|
-
# @param csv a FasterCSV output object
|
13
|
-
# @param [Symbol] column_names columns to be output
|
14
|
-
# @param key an optional key to ensure rows are written only once.
|
15
|
-
def initialize(csv, column_names, key=nil)
|
16
|
-
super(csv, column_names, key)
|
17
|
-
@transformer = MysqlLoadFileValueTransformer.new
|
18
|
-
end
|
19
|
-
|
20
|
-
protected
|
21
|
-
|
22
|
-
# Writes a row to the output.
|
23
|
-
#
|
24
|
-
# @param Hash row Only keys in column_names will be output.
|
25
|
-
def write(row)
|
26
|
-
output << @column_names.map {|name|
|
27
|
-
@transformer.transform(row[name])
|
28
|
-
}
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module ETL
|
3
|
-
module Screens
|
4
|
-
class CompositeScreen
|
5
|
-
def initialize(*screens)
|
6
|
-
@screens = screens.flatten
|
7
|
-
end
|
8
|
-
|
9
|
-
def call(row, errors=[])
|
10
|
-
@screens.inject([row,errors]) do |(row, errors), screen|
|
11
|
-
screen.call(row, errors)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
@@ -1,141 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module ETL
|
3
|
-
module SequelExtensions
|
4
|
-
# @api private
|
5
|
-
class LoadDataInfileExpression
|
6
|
-
attr_reader :path, :table, :columns, :ignore, :character_set
|
7
|
-
|
8
|
-
def initialize(path, table, columns, opts={})
|
9
|
-
@path = path
|
10
|
-
@table = table
|
11
|
-
@columns = columns
|
12
|
-
@ignore = opts[:ignore]
|
13
|
-
@update = opts[:update]
|
14
|
-
@set = opts[:set] || {}
|
15
|
-
@character_set = opts[:character_set] || "utf8"
|
16
|
-
if opts[:format] == :csv
|
17
|
-
@field_terminator = ","
|
18
|
-
@enclosed_by = '"'
|
19
|
-
@escaped_by = '"'
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def replace?
|
24
|
-
@update == :replace
|
25
|
-
end
|
26
|
-
|
27
|
-
def ignore?
|
28
|
-
@update == :ignore
|
29
|
-
end
|
30
|
-
|
31
|
-
def to_sql(db)
|
32
|
-
@db = db
|
33
|
-
[load_fragment,
|
34
|
-
replace_fragment,
|
35
|
-
table_fragment,
|
36
|
-
character_set_fragment,
|
37
|
-
field_terminator_fragment,
|
38
|
-
field_enclosure_fragment,
|
39
|
-
escape_fragment,
|
40
|
-
ignore_fragment,
|
41
|
-
column_fragment,
|
42
|
-
set_fragment].compact.join(" ")
|
43
|
-
end
|
44
|
-
|
45
|
-
private
|
46
|
-
|
47
|
-
def load_fragment
|
48
|
-
"LOAD DATA INFILE '#{path}'"
|
49
|
-
end
|
50
|
-
|
51
|
-
def replace_fragment
|
52
|
-
@update.to_s.upcase if replace? || ignore?
|
53
|
-
end
|
54
|
-
|
55
|
-
def table_fragment
|
56
|
-
"INTO TABLE `#{table}`"
|
57
|
-
end
|
58
|
-
|
59
|
-
def character_set_fragment
|
60
|
-
"CHARACTER SET '#{character_set}'"
|
61
|
-
end
|
62
|
-
|
63
|
-
def field_terminator_fragment
|
64
|
-
"FIELDS TERMINATED BY '#{@field_terminator}'" if @field_terminator
|
65
|
-
end
|
66
|
-
|
67
|
-
def field_enclosure_fragment
|
68
|
-
"OPTIONALLY ENCLOSED BY '#{@enclosed_by}'" if @enclosed_by
|
69
|
-
end
|
70
|
-
|
71
|
-
def escape_fragment
|
72
|
-
"ESCAPED BY '#{@escaped_by}'" if @escaped_by
|
73
|
-
end
|
74
|
-
|
75
|
-
def ignore_fragment
|
76
|
-
"IGNORE #{ignore} LINES" if ignore
|
77
|
-
end
|
78
|
-
|
79
|
-
def column_fragment
|
80
|
-
"(" + columns.map {|c| format_column(c) }.join(",") + ")"
|
81
|
-
end
|
82
|
-
|
83
|
-
def set_fragment
|
84
|
-
unless @set.empty?
|
85
|
-
"SET " + @set.map do |k, v|
|
86
|
-
"#{@db.literal(k)} = #{@db.literal(v)}"
|
87
|
-
end.join(", ")
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def format_column(column)
|
92
|
-
column.to_s[0..0] == "@" ? column : "`#{column}`"
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
module LoadDataInfile
|
97
|
-
# Load data in file specified at path.
|
98
|
-
#
|
99
|
-
# Columns is a list of columns to load - column names starting
|
100
|
-
# with an @ symbol will be treated as variables.
|
101
|
-
#
|
102
|
-
# By default, this will generate a REPLACE INTO TABLE
|
103
|
-
# statement.
|
104
|
-
#
|
105
|
-
# Options:
|
106
|
-
# :ignore - the number of lines to ignore in the source file
|
107
|
-
# :update - nil, :ignore or :replace
|
108
|
-
# :set - a hash specifying autopopulation of columns
|
109
|
-
# :character_set - the character set of the file, UTF8 default
|
110
|
-
# :format - either nil or :csv
|
111
|
-
def load_infile(path, columns, options={})
|
112
|
-
execute_dui(load_infile_sql(path, columns, options))
|
113
|
-
end
|
114
|
-
|
115
|
-
def load_infile_sql(path, columns, options={})
|
116
|
-
replacement = opts[:insert_ignore] ? :ignore : :replace
|
117
|
-
options = {:update => replacement}.merge(options)
|
118
|
-
LoadDataInfileExpression.new(path,
|
119
|
-
opts[:from].first,
|
120
|
-
columns,
|
121
|
-
options).
|
122
|
-
to_sql(db)
|
123
|
-
end
|
124
|
-
|
125
|
-
# Loads the CSV data columns in path into this dataset's
|
126
|
-
# table.
|
127
|
-
#
|
128
|
-
# See load_infile for more options.
|
129
|
-
def load_csv_infile(path, columns, options={})
|
130
|
-
execute_dui(load_csv_infile_sql(path, columns, options))
|
131
|
-
end
|
132
|
-
|
133
|
-
def load_csv_infile_sql(path, columns, options={})
|
134
|
-
load_infile_sql(path, columns, options.merge(:format => :csv))
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
Sequel::Dataset.send :include, Chicago::ETL::SequelExtensions::LoadDataInfile
|
data/lib/chicago/etl/sink.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
|
-
module Chicago
|
4
|
-
module ETL
|
5
|
-
# An end point to write rows.
|
6
|
-
#
|
7
|
-
# @abstract
|
8
|
-
# @api public
|
9
|
-
class Sink
|
10
|
-
# Returns the column names expected to be written to this sink.
|
11
|
-
# @api public
|
12
|
-
attr_reader :column_names
|
13
|
-
|
14
|
-
# @abstract
|
15
|
-
def initialize(output, column_names, unique_row_key=nil)
|
16
|
-
@output = output
|
17
|
-
@column_names = column_names
|
18
|
-
@written_rows = Set.new
|
19
|
-
@unique_row_key = unique_row_key
|
20
|
-
end
|
21
|
-
|
22
|
-
# Writes a row to the output.
|
23
|
-
#
|
24
|
-
# Row will not be written to the output if it has already been
|
25
|
-
# written, as identified by the unique row key.
|
26
|
-
#
|
27
|
-
# Should not be overridden by subclasses - overwrite write instead.
|
28
|
-
def <<(row)
|
29
|
-
unless written?(row)
|
30
|
-
write row
|
31
|
-
@written_rows << row[@unique_row_key]
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
# Flushes any remaining writes to the output.
|
36
|
-
#
|
37
|
-
# By default does nothing, subclasses should override where
|
38
|
-
# necessary.
|
39
|
-
def flush
|
40
|
-
end
|
41
|
-
|
42
|
-
# Returns true if this row has previously been written to the
|
43
|
-
# output.
|
44
|
-
#
|
45
|
-
# Always returns false if no key to determine row uniqueness has
|
46
|
-
# been provided.
|
47
|
-
def written?(row)
|
48
|
-
return false if @unique_row_key.nil?
|
49
|
-
@written_rows.include?(row[@unique_row_key])
|
50
|
-
end
|
51
|
-
|
52
|
-
protected
|
53
|
-
|
54
|
-
attr_reader :output
|
55
|
-
|
56
|
-
# @abstract
|
57
|
-
def write(row)
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
module Chicago
|
2
|
-
module ETL
|
3
|
-
module Transformations
|
4
|
-
class AddInsertTimestamp
|
5
|
-
def initialize(timestamp=Time.now)
|
6
|
-
@insert_timestamp = timestamp.utc
|
7
|
-
end
|
8
|
-
|
9
|
-
def call(row, errors=[])
|
10
|
-
row[:_inserted_at] = @insert_timestamp
|
11
|
-
[row, errors]
|
12
|
-
end
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::MysqlDumpfile do
|
4
|
-
before :each do
|
5
|
-
@csv = mock(:csv)
|
6
|
-
end
|
7
|
-
|
8
|
-
it "outputs specified column values in order" do
|
9
|
-
dumpfile = described_class.new(@csv, [:foo, :bar])
|
10
|
-
@csv.should_receive(:<<).with(["1", "2"])
|
11
|
-
|
12
|
-
dumpfile << {:foo => "1", :bar => "2", :baz => "not output"}
|
13
|
-
end
|
14
|
-
|
15
|
-
it "transforms values with a MysqlLoadFileValueTransformer" do
|
16
|
-
transformer = mock(:transformer)
|
17
|
-
Chicago::ETL::MysqlLoadFileValueTransformer.stub(:new).and_return(transformer)
|
18
|
-
|
19
|
-
transformer.should_receive(:transform).with("bar").and_return("baz")
|
20
|
-
@csv.should_receive(:<<).with(["baz"])
|
21
|
-
|
22
|
-
dumpfile = described_class.new(@csv, [:foo])
|
23
|
-
dumpfile << {:foo => "bar"}
|
24
|
-
end
|
25
|
-
|
26
|
-
it "will write a row only once with the same key" do
|
27
|
-
dumpfile = described_class.new(@csv, [:foo], :id)
|
28
|
-
@csv.should_receive(:<<).with(["bar"])
|
29
|
-
|
30
|
-
dumpfile << {:id => 1, :foo => "bar"}
|
31
|
-
dumpfile << {:id => 1, :foo => "baz"}
|
32
|
-
end
|
33
|
-
|
34
|
-
it "will write a row multiple times if no key is specified" do
|
35
|
-
dumpfile = described_class.new(@csv, [:foo])
|
36
|
-
@csv.should_receive(:<<).with(["bar"])
|
37
|
-
@csv.should_receive(:<<).with(["baz"])
|
38
|
-
|
39
|
-
dumpfile << {:id => 1, :foo => "bar"}
|
40
|
-
dumpfile << {:id => 1, :foo => "baz"}
|
41
|
-
end
|
42
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::MysqlLoadFileValueTransformer do
|
4
|
-
it "transforms nil into \\N" do
|
5
|
-
subject.transform(nil).should == "\\N"
|
6
|
-
end
|
7
|
-
|
8
|
-
it "transforms true into '1'" do
|
9
|
-
subject.transform(true).should == "1"
|
10
|
-
end
|
11
|
-
|
12
|
-
it "transforms false into '0'" do
|
13
|
-
subject.transform(false).should == "0"
|
14
|
-
end
|
15
|
-
|
16
|
-
it "transforms times into mysql time format" do
|
17
|
-
subject.transform(Time.local(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
|
18
|
-
end
|
19
|
-
|
20
|
-
it "transforms datetimes into mysql time format" do
|
21
|
-
subject.transform(DateTime.new(2011,01,02,10,30,50)).should == "2011-01-02 10:30:50"
|
22
|
-
end
|
23
|
-
|
24
|
-
it "transforms dates into mysql date format" do
|
25
|
-
subject.transform(Date.new(2011,01,02)).should == "2011-01-02"
|
26
|
-
end
|
27
|
-
end
|
@@ -1,25 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Chicago::ETL::Screens::CompositeScreen do
|
4
|
-
let(:screen) do
|
5
|
-
i = 0
|
6
|
-
|
7
|
-
lambda {|row, errors|
|
8
|
-
i += 1
|
9
|
-
errors << i
|
10
|
-
[row, errors]
|
11
|
-
}
|
12
|
-
end
|
13
|
-
|
14
|
-
it "calls all child screens" do
|
15
|
-
row, errors = described_class.new([screen, screen]).call({:a => 1}, [])
|
16
|
-
row.should == {:a => 1}
|
17
|
-
errors.should == [1,2]
|
18
|
-
end
|
19
|
-
|
20
|
-
it "supports variable arguments in the constructor" do
|
21
|
-
row, errors = described_class.new(screen, screen).call({:a => 1}, [])
|
22
|
-
row.should == {:a => 1}
|
23
|
-
errors.should == [1,2]
|
24
|
-
end
|
25
|
-
end
|