bricolage 5.13.1 → 5.14.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/jobclass/streaming_load.rb +161 -100
- data/lib/bricolage/version.rb +1 -1
- data/test/home/Gemfile.lock +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5bb93d5d2f2b8100f394fefef4df4a0312e612a4
|
4
|
+
data.tar.gz: a961533ace5a56216757ab3c99666780922511ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d5e13b4683b83f850ebca170c0ba4f00917874f386d95d16b5764d63625be388de3940a3f8be6b07a19bb000b9da3c024aa05c4795d48067efc81b672ada610
|
7
|
+
data.tar.gz: 68d542616e8a6826c0e9bddc33afd9744b449c7ee218610d65750186915ec51913f8144c8155bb083feb80b17ee086eafdc5c7dec07f16db012912d8546c5fba
|
data/jobclass/streaming_load.rb
CHANGED
@@ -42,9 +42,11 @@ class StreamingLoadJobClass < RubyJobClass
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def run
|
45
|
-
|
46
|
-
|
47
|
-
|
45
|
+
if @dequeue_only
|
46
|
+
@loader.dequeue
|
47
|
+
else
|
48
|
+
@loader.load
|
49
|
+
end
|
48
50
|
nil
|
49
51
|
end
|
50
52
|
|
@@ -108,8 +110,12 @@ class StreamingLoadJobClass < RubyJobClass
|
|
108
110
|
|
109
111
|
attr_reader :sql
|
110
112
|
|
111
|
-
def
|
112
|
-
|
113
|
+
def work_table
|
114
|
+
@work_table || "#{@table}_wk"
|
115
|
+
end
|
116
|
+
|
117
|
+
def log_table
|
118
|
+
@log_table || "#{@table}_l"
|
113
119
|
end
|
114
120
|
|
115
121
|
def log_basic_info
|
@@ -118,123 +124,131 @@ class StreamingLoadJobClass < RubyJobClass
|
|
118
124
|
@logger.info "queue: #{@src.queue_url}"
|
119
125
|
end
|
120
126
|
|
121
|
-
def
|
127
|
+
def dequeue
|
122
128
|
log_basic_info
|
123
|
-
@logger.info
|
129
|
+
@logger.info "dequeue start"
|
124
130
|
objects = @src.queued_objects
|
125
131
|
if objects.empty?
|
126
132
|
@logger.info 'no target data files; exit'
|
127
133
|
return
|
128
134
|
end
|
129
|
-
|
135
|
+
create_load_log_file(objects) {|log_url|
|
130
136
|
@ds.open {|conn|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
137
|
+
execute_update conn, copy_load_log_stmt(log_url, @src.credential_string)
|
138
|
+
foreach_loaded_object(objects) do |obj|
|
139
|
+
obj.dequeue(@noop)
|
140
|
+
end
|
135
141
|
}
|
136
|
-
dequeue_all objects
|
137
142
|
}
|
138
143
|
end
|
139
144
|
|
140
|
-
def
|
145
|
+
def load
|
141
146
|
log_basic_info
|
142
|
-
@logger.info 'load
|
147
|
+
@logger.info 'load with manifest'
|
143
148
|
objects = @src.queued_objects
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
149
|
+
if objects.empty?
|
150
|
+
@logger.info 'no target data files; exit'
|
151
|
+
return
|
152
|
+
end
|
153
|
+
create_load_log_file(objects) {|log_url|
|
154
|
+
@ds.open {|conn|
|
155
|
+
create_tmp_log_table(conn, log_url) {|tmp_log_table|
|
156
|
+
loaded, not_loaded = partition_loaded_objects(conn, objects, tmp_log_table)
|
157
|
+
unless @load_only
|
158
|
+
loaded.each do |obj|
|
159
|
+
obj.dequeue(force: true, noop: @noop)
|
160
|
+
end
|
161
|
+
end
|
162
|
+
unless not_loaded.empty?
|
163
|
+
create_manifest_file(not_loaded) {|manifest_url|
|
164
|
+
init_work_table conn
|
165
|
+
execute_update conn, manifest_copy_stmt(work_table, manifest_url)
|
166
|
+
@logger.info "load succeeded: #{manifest_url}" unless @noop
|
167
|
+
commit conn, work_table, tmp_log_table unless @load_only
|
168
|
+
}
|
169
|
+
unless @load_only
|
170
|
+
not_loaded.each do |obj|
|
171
|
+
obj.dequeue(force: true, noop: @noop)
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
175
|
+
}
|
176
|
+
}
|
152
177
|
}
|
153
|
-
dequeue_all objects
|
154
178
|
end
|
155
179
|
|
156
|
-
def commit(conn,
|
157
|
-
@end_time = Time.now
|
158
|
-
return if @load_only
|
180
|
+
def commit(conn, work_table, tmp_log_table)
|
181
|
+
@end_time = Time.now # commit_load_log writes this, generate before that
|
159
182
|
transaction(conn) {
|
160
|
-
commit_work_table conn
|
161
|
-
|
162
|
-
}
|
163
|
-
end
|
164
|
-
|
165
|
-
def dequeue_loaded_files
|
166
|
-
@logger.info "dequeue start"
|
167
|
-
objects = @src.queued_objects
|
168
|
-
@ds.open {|conn|
|
169
|
-
objects.each do |obj|
|
170
|
-
if loaded_object?(conn, obj)
|
171
|
-
obj.dequeue(@noop)
|
172
|
-
end
|
173
|
-
end
|
183
|
+
commit_work_table conn, work_table
|
184
|
+
commit_load_log conn, tmp_log_table
|
174
185
|
}
|
175
186
|
end
|
176
187
|
|
177
188
|
private
|
178
189
|
|
179
190
|
def init_work_table(conn)
|
180
|
-
|
181
|
-
execute_update conn, "truncate #{@work_table};"
|
191
|
+
execute_update conn, "truncate #{work_table};"
|
182
192
|
end
|
183
193
|
|
184
|
-
def commit_work_table(conn)
|
185
|
-
|
186
|
-
insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{@work_table};"
|
194
|
+
def commit_work_table(conn, work_table)
|
195
|
+
insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{work_table};"
|
187
196
|
execute_update conn, insert_stmt
|
188
197
|
# keep work table records for tracing
|
189
198
|
end
|
190
199
|
|
191
|
-
def copy_file_statement(obj)
|
192
|
-
%Q(
|
193
|
-
copy #{load_target_table} from '#{obj.url}'
|
194
|
-
credentials '#{obj.credential_string}'
|
195
|
-
#{@load_options}
|
196
|
-
;).gsub(/\s+/, ' ').strip
|
197
|
-
end
|
198
|
-
|
199
200
|
def create_manifest_file(objects)
|
200
201
|
manifest_name = "manifest-#{@job_process_id}.json"
|
201
202
|
@logger.info "creating manifest: #{manifest_name}"
|
202
|
-
|
203
|
-
|
203
|
+
json = make_manifest_json(objects)
|
204
|
+
@logger.info "manifest:\n" + json
|
205
|
+
url = @src.put_control_file(manifest_name, json, noop: @noop)
|
204
206
|
yield url
|
205
207
|
@src.remove_control_file(File.basename(url), noop: @noop)
|
206
208
|
end
|
207
209
|
|
208
210
|
def make_manifest_json(objects)
|
209
211
|
ents = objects.map {|obj|
|
210
|
-
{ "url" => obj.url, "mandatory" =>
|
212
|
+
{ "url" => obj.url, "mandatory" => false }
|
211
213
|
}
|
212
214
|
JSON.pretty_generate({ "entries" => ents })
|
213
215
|
end
|
214
216
|
|
215
|
-
def
|
217
|
+
def manifest_copy_stmt(target_table, manifest_url)
|
216
218
|
%Q(
|
217
|
-
copy #{
|
219
|
+
copy #{target_table}
|
218
220
|
from '#{manifest_url}'
|
219
|
-
credentials '#{credential_string}'
|
221
|
+
credentials '#{@src.credential_string}'
|
220
222
|
manifest
|
223
|
+
statupdate false
|
221
224
|
#{@load_options}
|
222
225
|
;).gsub(/\s+/, ' ').strip
|
223
226
|
end
|
224
227
|
|
225
|
-
def
|
226
|
-
|
228
|
+
def create_load_log_file(objects)
|
229
|
+
log_name = "load_log-#{@job_process_id}.csv"
|
230
|
+
@logger.info "creating tmp load log: #{log_name}"
|
231
|
+
csv = make_load_log_csv(objects)
|
232
|
+
@logger.info "load_log:\n" + csv
|
233
|
+
url = @src.put_control_file(log_name, csv, noop: @noop)
|
234
|
+
yield url
|
235
|
+
@src.remove_control_file(File.basename(url), noop: @noop)
|
227
236
|
end
|
228
237
|
|
229
|
-
def
|
230
|
-
|
231
|
-
|
232
|
-
|
238
|
+
def make_load_log_csv(objects)
|
239
|
+
buf = StringIO.new
|
240
|
+
objects.each do |obj|
|
241
|
+
log = make_load_log(obj)
|
242
|
+
cols = [
|
243
|
+
log.job_process_id,
|
244
|
+
format_timestamp(log.start_time),
|
245
|
+
'', # end time does not exist yet
|
246
|
+
log.target_table,
|
247
|
+
log.data_file
|
248
|
+
]
|
249
|
+
buf.puts cols.map {|c| %Q("#{c}") }.join(',')
|
233
250
|
end
|
234
|
-
|
235
|
-
|
236
|
-
def make_load_logs(objects)
|
237
|
-
objects.map {|obj| make_load_log(obj) }
|
251
|
+
buf.string
|
238
252
|
end
|
239
253
|
|
240
254
|
def make_load_log(obj)
|
@@ -243,34 +257,78 @@ class StreamingLoadJobClass < RubyJobClass
|
|
243
257
|
|
244
258
|
LoadLogRecord = Struct.new(:job_process_id, :start_time, :end_time, :target_table, :data_file)
|
245
259
|
|
246
|
-
def
|
247
|
-
|
248
|
-
execute_update
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
260
|
+
def create_tmp_log_table(conn, log_url)
|
261
|
+
target_table = log_table_wk
|
262
|
+
execute_update conn, "create table #{target_table} (like #{@log_table});"
|
263
|
+
execute_update conn, load_log_copy_stmt(target_table, log_url, @src.credential_string)
|
264
|
+
begin
|
265
|
+
yield target_table
|
266
|
+
ensure
|
267
|
+
begin
|
268
|
+
execute_update conn, "drop table #{target_table}"
|
269
|
+
rescue PostgreSQLException => ex
|
270
|
+
@logger.error ex.message + " (ignored)"
|
271
|
+
end
|
272
|
+
end
|
273
|
+
end
|
274
|
+
|
275
|
+
def log_table_wk
|
276
|
+
"#{@log_table}_tmp#{Process.pid}"
|
277
|
+
end
|
278
|
+
|
279
|
+
def load_log_copy_stmt(target_table, log_url, credential_string)
|
280
|
+
%Q(
|
281
|
+
copy #{target_table}
|
282
|
+
from '#{log_url}'
|
283
|
+
credentials '#{credential_string}'
|
284
|
+
delimiter ','
|
285
|
+
removequotes
|
286
|
+
;).gsub(/\s+/, ' ').strip
|
287
|
+
end
|
288
|
+
|
289
|
+
def partition_loaded_objects(conn, objects, tmp_log_table)
|
290
|
+
recs = conn.execute(<<-EndSQL)
|
291
|
+
select
|
292
|
+
data_file
|
293
|
+
, case when l.job_process_id is not null then 'true' else 'false' end as is_loaded
|
294
|
+
from
|
295
|
+
#{@log_table} l right outer join #{tmp_log_table} t using (data_file)
|
263
296
|
;
|
264
297
|
EndSQL
|
298
|
+
index = {}
|
299
|
+
objects.each do |obj|
|
300
|
+
index[obj.url] = obj
|
301
|
+
end
|
302
|
+
recs.each do |rec|
|
303
|
+
obj = index[rec['data_file']]
|
304
|
+
obj.loaded = (rec['is_loaded'] == 'true')
|
305
|
+
end
|
306
|
+
objects.partition(&:loaded)
|
265
307
|
end
|
266
308
|
|
267
|
-
def
|
268
|
-
|
269
|
-
|
309
|
+
def commit_load_log(conn, tmp_table_name)
|
310
|
+
conn.execute(<<-EndSQL)
|
311
|
+
insert into #{@log_table}
|
312
|
+
select
|
313
|
+
job_process_id
|
314
|
+
, start_time
|
315
|
+
, #{sql_timestamp @end_time}
|
316
|
+
, target_table
|
317
|
+
, data_file
|
318
|
+
from
|
319
|
+
#{tmp_table_name}
|
320
|
+
where
|
321
|
+
data_file not in (select data_file from #{@log_table})
|
322
|
+
;
|
323
|
+
EndSQL
|
270
324
|
end
|
271
325
|
|
272
326
|
def sql_timestamp(time)
|
273
|
-
%Q(timestamp '#{time
|
327
|
+
%Q(timestamp '#{format_timestamp(time)}')
|
328
|
+
end
|
329
|
+
|
330
|
+
def format_timestamp(time)
|
331
|
+
time.strftime('%Y-%m-%d %H:%M:%S')
|
274
332
|
end
|
275
333
|
|
276
334
|
def sql_string(str)
|
@@ -299,13 +357,6 @@ class StreamingLoadJobClass < RubyJobClass
|
|
299
357
|
def mask_secrets(log)
|
300
358
|
log.gsub(/\bcredentials\s+'.*?'/mi, "credentials '****'")
|
301
359
|
end
|
302
|
-
|
303
|
-
def dequeue_all(objects)
|
304
|
-
return if @load_only
|
305
|
-
objects.each do |obj|
|
306
|
-
obj.dequeue(@noop)
|
307
|
-
end
|
308
|
-
end
|
309
360
|
end
|
310
361
|
|
311
362
|
class S3Queue
|
@@ -424,8 +475,11 @@ class StreamingLoadJobClass < RubyJobClass
|
|
424
475
|
@s3queue = s3queue
|
425
476
|
@object = object
|
426
477
|
@logger = logger
|
478
|
+
@loaded = nil
|
427
479
|
end
|
428
480
|
|
481
|
+
attr_accessor :loaded
|
482
|
+
|
429
483
|
def credential_string
|
430
484
|
@s3queue.credential_string
|
431
485
|
end
|
@@ -442,11 +496,18 @@ class StreamingLoadJobClass < RubyJobClass
|
|
442
496
|
@s3queue.object_url_direct(path)
|
443
497
|
end
|
444
498
|
|
445
|
-
def dequeue(noop
|
499
|
+
def dequeue(force: false, noop: false)
|
446
500
|
@logger.info "s3 move: #{path} -> #{persistent_path}"
|
447
501
|
return if noop
|
448
502
|
@object.move_to persistent_object, dequeue_options
|
449
|
-
@logger.info "
|
503
|
+
@logger.info "done"
|
504
|
+
rescue Aws::S3::Errors::NoSuchKey => ex
|
505
|
+
@logger.error "S3 error: #{ex.message}"
|
506
|
+
if force
|
507
|
+
@logger.info "move error ignored (may be caused by eventual consistency)"
|
508
|
+
else
|
509
|
+
raise
|
510
|
+
end
|
450
511
|
end
|
451
512
|
|
452
513
|
def persistent_object
|
data/lib/bricolage/version.rb
CHANGED
data/test/home/Gemfile.lock
CHANGED