bricolage 5.13.1 → 5.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/jobclass/streaming_load.rb +161 -100
- data/lib/bricolage/version.rb +1 -1
- data/test/home/Gemfile.lock +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5bb93d5d2f2b8100f394fefef4df4a0312e612a4
+  data.tar.gz: a961533ace5a56216757ab3c99666780922511ac
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7d5e13b4683b83f850ebca170c0ba4f00917874f386d95d16b5764d63625be388de3940a3f8be6b07a19bb000b9da3c024aa05c4795d48067efc81b672ada610
+  data.tar.gz: 68d542616e8a6826c0e9bddc33afd9744b449c7ee218610d65750186915ec51913f8144c8155bb083feb80b17ee086eafdc5c7dec07f16db012912d8546c5fba
data/jobclass/streaming_load.rb
CHANGED
@@ -42,9 +42,11 @@ class StreamingLoadJobClass < RubyJobClass
   end
 
   def run
-
-
-
+    if @dequeue_only
+      @loader.dequeue
+    else
+      @loader.load
+    end
     nil
   end
 
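Note: the new run is a plain two-way dispatch on the dequeue-only flag. A minimal standalone sketch of that control flow; StubLoader and the --dequeue-only flag wiring are illustration stand-ins, not the gem's actual option handling.

    # Sketch: dispatch between dequeue-only mode and the full manifest load.
    class StubLoader
      def dequeue
        puts "dequeue queued objects only"
      end

      def load
        puts "load with manifest, then dequeue"
      end
    end

    loader = StubLoader.new
    dequeue_only = ARGV.include?('--dequeue-only') # hypothetical flag wiring
    if dequeue_only
      loader.dequeue
    else
      loader.load
    end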
@@ -108,8 +110,12 @@ class StreamingLoadJobClass < RubyJobClass
 
     attr_reader :sql
 
-    def
-
+    def work_table
+      @work_table || "#{@table}_wk"
+    end
+
+    def log_table
+      @log_table || "#{@table}_l"
     end
 
     def log_basic_info
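Note: the two helpers added here give the work and log tables conventional default names derived from the target table, overridable by the explicit parameters. A self-contained sketch of that naming rule (table names are examples):

    def derived_table(explicit, table, suffix)
      explicit || "#{table}_#{suffix}"
    end

    puts derived_table(nil, 'events', 'wk')            # => events_wk (work table default)
    puts derived_table(nil, 'events', 'l')             # => events_l  (log table default)
    puts derived_table('my_events_wk', 'events', 'wk') # explicit setting wins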
@@ -118,123 +124,131 @@ class StreamingLoadJobClass < RubyJobClass
       @logger.info "queue: #{@src.queue_url}"
     end
 
-    def
+    def dequeue
       log_basic_info
-      @logger.info
+      @logger.info "dequeue start"
       objects = @src.queued_objects
       if objects.empty?
         @logger.info 'no target data files; exit'
         return
       end
-
+      create_load_log_file(objects) {|log_url|
       @ds.open {|conn|
-
-
-
-
+        execute_update conn, copy_load_log_stmt(log_url, @src.credential_string)
+        foreach_loaded_object(objects) do |obj|
+          obj.dequeue(@noop)
+        end
       }
-      dequeue_all objects
       }
     end
 
-    def
+    def load
       log_basic_info
-      @logger.info 'load
+      @logger.info 'load with manifest'
       objects = @src.queued_objects
-
-
-
-
-
-
-
-
+      if objects.empty?
+        @logger.info 'no target data files; exit'
+        return
+      end
+      create_load_log_file(objects) {|log_url|
+        @ds.open {|conn|
+          create_tmp_log_table(conn, log_url) {|tmp_log_table|
+            loaded, not_loaded = partition_loaded_objects(conn, objects, tmp_log_table)
+            unless @load_only
+              loaded.each do |obj|
+                obj.dequeue(force: true, noop: @noop)
+              end
+            end
+            unless not_loaded.empty?
+              create_manifest_file(not_loaded) {|manifest_url|
+                init_work_table conn
+                execute_update conn, manifest_copy_stmt(work_table, manifest_url)
+                @logger.info "load succeeded: #{manifest_url}" unless @noop
+                commit conn, work_table, tmp_log_table unless @load_only
+              }
+              unless @load_only
+                not_loaded.each do |obj|
+                  obj.dequeue(force: true, noop: @noop)
+                end
+              end
+            end
+          }
+        }
       }
-      dequeue_all objects
     end
 
-    def commit(conn,
-      @end_time = Time.now
-      return if @load_only
+    def commit(conn, work_table, tmp_log_table)
+      @end_time = Time.now # commit_load_log writes this, generate before that
       transaction(conn) {
-        commit_work_table conn
-
-      }
-    end
-
-    def dequeue_loaded_files
-      @logger.info "dequeue start"
-      objects = @src.queued_objects
-      @ds.open {|conn|
-        objects.each do |obj|
-          if loaded_object?(conn, obj)
-            obj.dequeue(@noop)
-          end
-        end
+        commit_work_table conn, work_table
+        commit_load_log conn, tmp_log_table
       }
     end
 
     private
 
     def init_work_table(conn)
-
-      execute_update conn, "truncate #{@work_table};"
+      execute_update conn, "truncate #{work_table};"
     end
 
-    def commit_work_table(conn)
-
-      insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{@work_table};"
+    def commit_work_table(conn, work_table)
+      insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{work_table};"
       execute_update conn, insert_stmt
       # keep work table records for tracing
     end
 
-    def copy_file_statement(obj)
-      %Q(
-        copy #{load_target_table} from '#{obj.url}'
-        credentials '#{obj.credential_string}'
-        #{@load_options}
-      ;).gsub(/\s+/, ' ').strip
-    end
-
     def create_manifest_file(objects)
       manifest_name = "manifest-#{@job_process_id}.json"
       @logger.info "creating manifest: #{manifest_name}"
-
-
+      json = make_manifest_json(objects)
+      @logger.info "manifest:\n" + json
+      url = @src.put_control_file(manifest_name, json, noop: @noop)
       yield url
       @src.remove_control_file(File.basename(url), noop: @noop)
     end
 
     def make_manifest_json(objects)
       ents = objects.map {|obj|
-        { "url" => obj.url, "mandatory" =>
+        { "url" => obj.url, "mandatory" => false }
       }
       JSON.pretty_generate({ "entries" => ents })
     end
 
-    def
+    def manifest_copy_stmt(target_table, manifest_url)
       %Q(
-        copy #{
+        copy #{target_table}
         from '#{manifest_url}'
-        credentials '#{credential_string}'
+        credentials '#{@src.credential_string}'
         manifest
+        statupdate false
         #{@load_options}
       ;).gsub(/\s+/, ' ').strip
     end
 
-    def
-
+    def create_load_log_file(objects)
+      log_name = "load_log-#{@job_process_id}.csv"
+      @logger.info "creating tmp load log: #{log_name}"
+      csv = make_load_log_csv(objects)
+      @logger.info "load_log:\n" + csv
+      url = @src.put_control_file(log_name, csv, noop: @noop)
+      yield url
+      @src.remove_control_file(File.basename(url), noop: @noop)
     end
 
-    def
-
-
-
+    def make_load_log_csv(objects)
+      buf = StringIO.new
+      objects.each do |obj|
+        log = make_load_log(obj)
+        cols = [
+          log.job_process_id,
+          format_timestamp(log.start_time),
+          '', # end time does not exist yet
+          log.target_table,
+          log.data_file
+        ]
+        buf.puts cols.map {|c| %Q("#{c}") }.join(',')
       end
-
-
-    def make_load_logs(objects)
-      objects.map {|obj| make_load_log(obj) }
+      buf.string
     end
 
     def make_load_log(obj)
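Note: this hunk introduces the two control files the loader now uploads to S3: a COPY manifest whose entries are all non-mandatory (so a file that vanished between listing and COPY no longer fails the load), and a temporary load-log CSV whose end-time column is left empty. A hedged sketch of both formats; the bucket, object URLs, job process id, and table name are made up:

    require 'json'
    require 'stringio'

    # Manifest, in the shape make_manifest_json builds:
    entries = [
      { "url" => "s3://example-bucket/queue/a.json.gz", "mandatory" => false },
      { "url" => "s3://example-bucket/queue/b.json.gz", "mandatory" => false }
    ]
    puts JSON.pretty_generate({ "entries" => entries })

    # Load-log CSV, one quoted row per object, in the shape make_load_log_csv
    # builds; the third column (end time) is empty because the load has not
    # finished yet.
    buf = StringIO.new
    entries.each do |ent|
      cols = ['12345.1', '2015-01-01 00:00:00', '', 'events', ent["url"]]
      buf.puts cols.map {|c| %Q("#{c}") }.join(',')
    end
    puts buf.string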
@@ -243,34 +257,78 @@ class StreamingLoadJobClass < RubyJobClass
 
     LoadLogRecord = Struct.new(:job_process_id, :start_time, :end_time, :target_table, :data_file)
 
-    def
-
-      execute_update
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def create_tmp_log_table(conn, log_url)
+      target_table = log_table_wk
+      execute_update conn, "create table #{target_table} (like #{@log_table});"
+      execute_update conn, load_log_copy_stmt(target_table, log_url, @src.credential_string)
+      begin
+        yield target_table
+      ensure
+        begin
+          execute_update conn, "drop table #{target_table}"
+        rescue PostgreSQLException => ex
+          @logger.error ex.message + " (ignored)"
+        end
+      end
+    end
+
+    def log_table_wk
+      "#{@log_table}_tmp#{Process.pid}"
+    end
+
+    def load_log_copy_stmt(target_table, log_url, credential_string)
+      %Q(
+        copy #{target_table}
+        from '#{log_url}'
+        credentials '#{credential_string}'
+        delimiter ','
+        removequotes
+      ;).gsub(/\s+/, ' ').strip
+    end
+
+    def partition_loaded_objects(conn, objects, tmp_log_table)
+      recs = conn.execute(<<-EndSQL)
+        select
+            data_file
+            , case when l.job_process_id is not null then 'true' else 'false' end as is_loaded
+        from
+            #{@log_table} l right outer join #{tmp_log_table} t using (data_file)
         ;
       EndSQL
+      index = {}
+      objects.each do |obj|
+        index[obj.url] = obj
+      end
+      recs.each do |rec|
+        obj = index[rec['data_file']]
+        obj.loaded = (rec['is_loaded'] == 'true')
+      end
+      objects.partition(&:loaded)
     end
 
-    def
-
-
+    def commit_load_log(conn, tmp_table_name)
+      conn.execute(<<-EndSQL)
+        insert into #{@log_table}
+        select
+            job_process_id
+            , start_time
+            , #{sql_timestamp @end_time}
+            , target_table
+            , data_file
+        from
+            #{tmp_table_name}
+        where
+            data_file not in (select data_file from #{@log_table})
+        ;
+      EndSQL
     end
 
     def sql_timestamp(time)
-      %Q(timestamp '#{time
+      %Q(timestamp '#{format_timestamp(time)}')
+    end
+
+    def format_timestamp(time)
+      time.strftime('%Y-%m-%d %H:%M:%S')
     end
 
     def sql_string(str)
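Note: partition_loaded_objects asks the database which queued data files already appear in the persistent load log; the right outer join keeps every row of the temporary log and flags the matches. A pure-Ruby rendering of that partitioning, with hypothetical file URLs:

    already_loaded = ['s3://example-bucket/queue/a.json.gz']
    queued_now     = ['s3://example-bucket/queue/a.json.gz',
                      's3://example-bucket/queue/b.json.gz']

    loaded, not_loaded = queued_now.partition {|url| already_loaded.include?(url) }
    p loaded      # dequeued without loading again
    p not_loaded  # loaded via the manifest COPY, then dequeued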
@@ -299,13 +357,6 @@ class StreamingLoadJobClass < RubyJobClass
     def mask_secrets(log)
       log.gsub(/\bcredentials\s+'.*?'/mi, "credentials '****'")
     end
-
-    def dequeue_all(objects)
-      return if @load_only
-      objects.each do |obj|
-        obj.dequeue(@noop)
-      end
-    end
   end
 
   class S3Queue
@@ -424,8 +475,11 @@ class StreamingLoadJobClass < RubyJobClass
       @s3queue = s3queue
       @object = object
       @logger = logger
+      @loaded = nil
     end
 
+    attr_accessor :loaded
+
    def credential_string
       @s3queue.credential_string
     end
@@ -442,11 +496,18 @@ class StreamingLoadJobClass < RubyJobClass
       @s3queue.object_url_direct(path)
     end
 
-    def dequeue(noop
+    def dequeue(force: false, noop: false)
       @logger.info "s3 move: #{path} -> #{persistent_path}"
       return if noop
       @object.move_to persistent_object, dequeue_options
-      @logger.info "
+      @logger.info "done"
+    rescue Aws::S3::Errors::NoSuchKey => ex
+      @logger.error "S3 error: #{ex.message}"
+      if force
+        @logger.info "move error ignored (may be caused by eventual consistency)"
+      else
+        raise
+      end
     end
 
     def persistent_object
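Note: dequeue now takes force: and swallows Aws::S3::Errors::NoSuchKey when forced, since under S3's eventual consistency the key may already have been moved. A self-contained sketch of that policy; AwsNoSuchKey and move_object are stubs standing in for the real S3 call:

    class AwsNoSuchKey < StandardError; end

    def move_object
      # Simulate the source key having vanished already.
      raise AwsNoSuchKey, 'key already moved'
    end

    def dequeue(force: false, noop: false)
      return if noop
      move_object
      puts 'done'
    rescue AwsNoSuchKey => ex
      raise unless force
      puts "move error ignored (may be caused by eventual consistency): #{ex.message}"
    end

    dequeue(force: true) # logs the error and continues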
data/lib/bricolage/version.rb
CHANGED
data/test/home/Gemfile.lock
CHANGED