bricolage 5.13.1 → 5.14.0

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d2ad1fd2590af8b932a03d63de07faa2e3a7b58a
-  data.tar.gz: 4bb804de2144390c4aab51c0ce812616139d881f
+  metadata.gz: 5bb93d5d2f2b8100f394fefef4df4a0312e612a4
+  data.tar.gz: a961533ace5a56216757ab3c99666780922511ac
 SHA512:
-  metadata.gz: b3e4089221b1bffcc7768bbe81cc4c9ea29e6e742de660239176bdd4e678b02fbeaaf224699b2a83a59f183c54242198ffde2070adbeec75cdae4abdd7b25fbe
-  data.tar.gz: c773b13460f9fa0774d2c8f31fec351071abf4e6ad87ce3df303b60cf878a86952c6e5ff2ef9fe41927b9673304f336e4b3b5688601df533efa37e234cf891a2
+  metadata.gz: 7d5e13b4683b83f850ebca170c0ba4f00917874f386d95d16b5764d63625be388de3940a3f8be6b07a19bb000b9da3c024aa05c4795d48067efc81b672ada610
+  data.tar.gz: 68d542616e8a6826c0e9bddc33afd9744b449c7ee218610d65750186915ec51913f8144c8155bb083feb80b17ee086eafdc5c7dec07f16db012912d8546c5fba
@@ -42,9 +42,11 @@ class StreamingLoadJobClass < RubyJobClass
   end
 
   def run
-    @loader.dequeue_loaded_files unless @load_only
-    return nil if @dequeue_only
-    @loader.load
+    if @dequeue_only
+      @loader.dequeue
+    else
+      @loader.load
+    end
     nil
   end
 
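Note: run previously dequeued already-loaded files before every load and used @dequeue_only as an early return; dequeue is now an explicit loader mode, and the load path handles dequeuing itself. A minimal sketch of the new dispatch, assuming only a loader that responds to #dequeue and #load (the wrapper class here is hypothetical, not part of the gem):

    # Hypothetical wrapper mirroring the new run method above.
    class StreamingLoadJob
      def initialize(loader, dequeue_only: false)
        @loader = loader
        @dequeue_only = dequeue_only
      end

      def run
        if @dequeue_only
          @loader.dequeue   # dequeue-only mode: move processed files out of the queue
        else
          @loader.load      # normal mode: load, then dequeue inside #load
        end
        nil                 # returns nil, as in the diff
      end
    end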
@@ -108,8 +110,12 @@ class StreamingLoadJobClass < RubyJobClass
 
     attr_reader :sql
 
-    def load
-      load_in_parallel
+    def work_table
+      @work_table || "#{@table}_wk"
+    end
+
+    def log_table
+      @log_table || "#{@table}_l"
     end
 
     def log_basic_info
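The loader now derives default work and log table names from the target table instead of requiring them to be configured. A quick illustration of the naming rule, with a hypothetical target table name:

    # Hypothetical target table; the suffixes match the methods above.
    table      = 'strload_test'
    work_table = "#{table}_wk"   # => "strload_test_wk"
    log_table  = "#{table}_l"    # => "strload_test_l"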
@@ -118,123 +124,131 @@ class StreamingLoadJobClass < RubyJobClass
       @logger.info "queue: #{@src.queue_url}"
     end
 
-    def load_in_parallel
+    def dequeue
       log_basic_info
-      @logger.info 'load with manifest'
+      @logger.info "dequeue start"
       objects = @src.queued_objects
       if objects.empty?
         @logger.info 'no target data files; exit'
         return
       end
-      create_manifest_file(objects) {|manifest_url|
+      create_load_log_file(objects) {|log_url|
         @ds.open {|conn|
-          init_work_table conn
-          execute_update conn, copy_manifest_statement(manifest_url, @src.credential_string)
-          @logger.info "load succeeded: #{manifest_url}" unless @noop
-          commit conn, objects
+          execute_update conn, copy_load_log_stmt(log_url, @src.credential_string)
+          foreach_loaded_object(objects) do |obj|
+            obj.dequeue(@noop)
+          end
         }
-        dequeue_all objects
       }
     end
 
-    def load_in_sequential
+    def load
       log_basic_info
-      @logger.info 'load each objects sequentially'
+      @logger.info 'load with manifest'
       objects = @src.queued_objects
-      @ds.open {|conn|
-        init_work_table(conn)
-        objects.each do |obj|
-          @logger.info "load: #{obj.url}"
-          execute_update conn, copy_file_statement(obj)
-          @logger.info "load succeeded: #{obj.url}" unless @noop
-        end
-        commit conn, objects
+      if objects.empty?
+        @logger.info 'no target data files; exit'
+        return
+      end
+      create_load_log_file(objects) {|log_url|
+        @ds.open {|conn|
+          create_tmp_log_table(conn, log_url) {|tmp_log_table|
+            loaded, not_loaded = partition_loaded_objects(conn, objects, tmp_log_table)
+            unless @load_only
+              loaded.each do |obj|
+                obj.dequeue(force: true, noop: @noop)
+              end
+            end
+            unless not_loaded.empty?
+              create_manifest_file(not_loaded) {|manifest_url|
+                init_work_table conn
+                execute_update conn, manifest_copy_stmt(work_table, manifest_url)
+                @logger.info "load succeeded: #{manifest_url}" unless @noop
+                commit conn, work_table, tmp_log_table unless @load_only
+              }
+              unless @load_only
+                not_loaded.each do |obj|
+                  obj.dequeue(force: true, noop: @noop)
+                end
+              end
+            end
+          }
+        }
       }
-      dequeue_all objects
     end
 
-    def commit(conn, objects)
-      @end_time = Time.now
-      return if @load_only
+    def commit(conn, work_table, tmp_log_table)
+      @end_time = Time.now # commit_load_log writes this, generate before that
       transaction(conn) {
-        commit_work_table conn
-        write_load_logs conn, objects
-      }
-    end
-
-    def dequeue_loaded_files
-      @logger.info "dequeue start"
-      objects = @src.queued_objects
-      @ds.open {|conn|
-        objects.each do |obj|
-          if loaded_object?(conn, obj)
-            obj.dequeue(@noop)
-          end
-        end
+        commit_work_table conn, work_table
+        commit_load_log conn, tmp_log_table
       }
     end
 
     private
 
     def init_work_table(conn)
-      return unless @work_table
-      execute_update conn, "truncate #{@work_table};"
+      execute_update conn, "truncate #{work_table};"
     end
 
-    def commit_work_table(conn)
-      return unless @work_table
-      insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{@work_table};"
+    def commit_work_table(conn, work_table)
+      insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{work_table};"
       execute_update conn, insert_stmt
       # keep work table records for tracing
     end
 
-    def copy_file_statement(obj)
-      %Q(
-        copy #{load_target_table} from '#{obj.url}'
-        credentials '#{obj.credential_string}'
-        #{@load_options}
-      ;).gsub(/\s+/, ' ').strip
-    end
-
     def create_manifest_file(objects)
       manifest_name = "manifest-#{@job_process_id}.json"
       @logger.info "creating manifest: #{manifest_name}"
-      @logger.info "manifest:\n" + make_manifest_json(objects)
-      url = @src.put_control_file(manifest_name, make_manifest_json(objects), noop: @noop)
+      json = make_manifest_json(objects)
+      @logger.info "manifest:\n" + json
+      url = @src.put_control_file(manifest_name, json, noop: @noop)
       yield url
       @src.remove_control_file(File.basename(url), noop: @noop)
     end
 
     def make_manifest_json(objects)
       ents = objects.map {|obj|
-        { "url" => obj.url, "mandatory" => true }
+        { "url" => obj.url, "mandatory" => false }
       }
       JSON.pretty_generate({ "entries" => ents })
     end
 
-    def copy_manifest_statement(manifest_url, credential_string)
+    def manifest_copy_stmt(target_table, manifest_url)
       %Q(
-        copy #{load_target_table}
+        copy #{target_table}
         from '#{manifest_url}'
-        credentials '#{credential_string}'
+        credentials '#{@src.credential_string}'
         manifest
+        statupdate false
         #{@load_options}
       ;).gsub(/\s+/, ' ').strip
     end
 
-    def load_target_table
-      @work_table || @table
+    def create_load_log_file(objects)
+      log_name = "load_log-#{@job_process_id}.csv"
+      @logger.info "creating tmp load log: #{log_name}"
+      csv = make_load_log_csv(objects)
+      @logger.info "load_log:\n" + csv
+      url = @src.put_control_file(log_name, csv, noop: @noop)
+      yield url
+      @src.remove_control_file(File.basename(url), noop: @noop)
    end
 
-    def write_load_logs(conn, objects)
-      return unless @log_table
-      make_load_logs(objects).each do |record|
-        write_load_log conn, record
+    def make_load_log_csv(objects)
+      buf = StringIO.new
+      objects.each do |obj|
+        log = make_load_log(obj)
+        cols = [
+          log.job_process_id,
+          format_timestamp(log.start_time),
+          '', # end time does not exist yet
+          log.target_table,
+          log.data_file
+        ]
+        buf.puts cols.map {|c| %Q("#{c}") }.join(',')
       end
-    end
-
-    def make_load_logs(objects)
-      objects.map {|obj| make_load_log(obj) }
+      buf.string
     end
 
     def make_load_log(obj)
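The rewritten load path works in three steps: upload a CSV "load log" listing every queued file, split the files into already-loaded and not-yet-loaded sets against the permanent log table, and COPY only the latter via a manifest (with statupdate false to skip automatic statistics computation). make_manifest_json also switches mandatory to false, so Redshift skips a listed file that has vanished from S3 instead of aborting the whole COPY. A self-contained sketch of the manifest this produces, using hypothetical object URLs:

    require 'json'

    # Hypothetical queued-object URLs; the entry shape matches make_manifest_json above.
    urls = [
      's3://example-bucket/queue/data-0001.json.gz',
      's3://example-bucket/queue/data-0002.json.gz'
    ]
    ents = urls.map {|url| { "url" => url, "mandatory" => false } }
    puts JSON.pretty_generate({ "entries" => ents })
    # "mandatory": false tells Redshift to skip a missing file
    # rather than fail the entire manifest COPY.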
@@ -243,34 +257,78 @@ class StreamingLoadJobClass < RubyJobClass
 
     LoadLogRecord = Struct.new(:job_process_id, :start_time, :end_time, :target_table, :data_file)
 
-    def write_load_log(conn, record)
-      return unless @log_table
-      execute_update(conn, <<-EndSQL.gsub(/^\s+/, '').strip)
-        insert into #{@log_table}
-        ( job_process_id
-        , start_time
-        , end_time
-        , target_table
-        , data_file
-        )
-        values
-        ( #{sql_string record.job_process_id}
-        , #{sql_timestamp record.start_time}
-        , #{sql_timestamp record.end_time}
-        , #{sql_string record.target_table}
-        , #{sql_string record.data_file}
-        )
+    def create_tmp_log_table(conn, log_url)
+      target_table = log_table_wk
+      execute_update conn, "create table #{target_table} (like #{@log_table});"
+      execute_update conn, load_log_copy_stmt(target_table, log_url, @src.credential_string)
+      begin
+        yield target_table
+      ensure
+        begin
+          execute_update conn, "drop table #{target_table}"
+        rescue PostgreSQLException => ex
+          @logger.error ex.message + " (ignored)"
+        end
+      end
+    end
+
+    def log_table_wk
+      "#{@log_table}_tmp#{Process.pid}"
+    end
+
+    def load_log_copy_stmt(target_table, log_url, credential_string)
+      %Q(
+        copy #{target_table}
+        from '#{log_url}'
+        credentials '#{credential_string}'
+        delimiter ','
+        removequotes
+      ;).gsub(/\s+/, ' ').strip
+    end
+
+    def partition_loaded_objects(conn, objects, tmp_log_table)
+      recs = conn.execute(<<-EndSQL)
+        select
+            data_file
+            , case when l.job_process_id is not null then 'true' else 'false' end as is_loaded
+        from
+            #{@log_table} l right outer join #{tmp_log_table} t using (data_file)
         ;
       EndSQL
+      index = {}
+      objects.each do |obj|
+        index[obj.url] = obj
+      end
+      recs.each do |rec|
+        obj = index[rec['data_file']]
+        obj.loaded = (rec['is_loaded'] == 'true')
+      end
+      objects.partition(&:loaded)
     end
 
-    def loaded_object?(conn, obj)
-      rs = conn.execute("select count(*) as c from #{@log_table} where data_file = #{sql_string obj.url}")
-      rs.first['c'].to_i > 0
+    def commit_load_log(conn, tmp_table_name)
+      conn.execute(<<-EndSQL)
+        insert into #{@log_table}
+        select
+            job_process_id
+            , start_time
+            , #{sql_timestamp @end_time}
+            , target_table
+            , data_file
+        from
+            #{tmp_table_name}
+        where
+            data_file not in (select data_file from #{@log_table})
+        ;
+      EndSQL
     end
 
     def sql_timestamp(time)
-      %Q(timestamp '#{time.strftime('%Y-%m-%d %H:%M:%S')}')
+      %Q(timestamp '#{format_timestamp(time)}')
+    end
+
+    def format_timestamp(time)
+      time.strftime('%Y-%m-%d %H:%M:%S')
     end
 
     def sql_string(str)
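partition_loaded_objects pushes the loaded/not-loaded decision into SQL: the per-run temporary log table (one row per queued file) is right-outer-joined to the permanent log table, so a non-null job_process_id on the permanent side marks a file as already loaded. commit_load_log then inserts only data_file values not already present, keeping the log table duplicate-free across retried runs. A plain-Ruby sketch of the same partitioning step, using hypothetical result rows shaped like those conn.execute returns above:

    # Hypothetical rows: column name => string value, as in the query above.
    recs = [
      { 'data_file' => 's3://example-bucket/queue/data-0001.json.gz', 'is_loaded' => 'true'  },
      { 'data_file' => 's3://example-bucket/queue/data-0002.json.gz', 'is_loaded' => 'false' }
    ]

    QueuedObject = Struct.new(:url, :loaded)   # stand-in for the S3 object wrapper
    objects = recs.map {|rec| QueuedObject.new(rec['data_file']) }

    index = objects.each_with_object({}) {|obj, h| h[obj.url] = obj }
    recs.each {|rec| index[rec['data_file']].loaded = (rec['is_loaded'] == 'true') }

    loaded, not_loaded = objects.partition(&:loaded)
    # loaded     => files already recorded in the log table (safe to dequeue)
    # not_loaded => files that still need a COPY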
@@ -299,13 +357,6 @@ class StreamingLoadJobClass < RubyJobClass
     def mask_secrets(log)
       log.gsub(/\bcredentials\s+'.*?'/mi, "credentials '****'")
     end
-
-    def dequeue_all(objects)
-      return if @load_only
-      objects.each do |obj|
-        obj.dequeue(@noop)
-      end
-    end
   end
 
   class S3Queue
@@ -424,8 +475,11 @@ class StreamingLoadJobClass < RubyJobClass
       @s3queue = s3queue
       @object = object
       @logger = logger
+      @loaded = nil
     end
 
+    attr_accessor :loaded
+
     def credential_string
       @s3queue.credential_string
     end
@@ -442,11 +496,18 @@ class StreamingLoadJobClass < RubyJobClass
       @s3queue.object_url_direct(path)
     end
 
-    def dequeue(noop = false)
+    def dequeue(force: false, noop: false)
       @logger.info "s3 move: #{path} -> #{persistent_path}"
       return if noop
       @object.move_to persistent_object, dequeue_options
-      @logger.info "file saved"
+      @logger.info "done"
+    rescue Aws::S3::Errors::NoSuchKey => ex
+      @logger.error "S3 error: #{ex.message}"
+      if force
+        @logger.info "move error ignored (may be caused by eventual consistency)"
+      else
+        raise
+      end
     end
 
     def persistent_object
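dequeue now takes keyword arguments, and force: true downgrades a missing-key error on the S3 move to a logged warning. This matters on the load path above: a file may already have been moved by an earlier, partially failed run, or still be invisible due to S3's eventual consistency, and cleanup should not kill the job. A minimal sketch of the same rescue pattern against the aws-sdk v2 API pinned below (method and object names here are hypothetical stand-ins):

    require 'aws-sdk'   # aws-sdk v2

    # Hypothetical stand-in for the object wrapper's dequeue above.
    def move_processed(object, target, force: false, noop: false)
      return if noop
      object.move_to(target)   # Aws::S3::Object#move_to: copy then delete
    rescue Aws::S3::Errors::NoSuchKey => ex
      raise unless force
      warn "move error ignored (may be caused by eventual consistency): #{ex.message}"
    end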
@@ -1,4 +1,4 @@
 module Bricolage
   APPLICATION_NAME = 'Bricolage'
-  VERSION = '5.13.1'
+  VERSION = '5.14.0'
 end
@@ -1,7 +1,7 @@
 PATH
   remote: ../..
   specs:
-    bricolage (5.13.1)
+    bricolage (5.14.0)
       aws-sdk (~> 2)
       mysql2
       pg
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage
 version: !ruby/object:Gem::Version
-  version: 5.13.1
+  version: 5.14.0
 platform: ruby
 authors:
 - Minero Aoki