bricolage 5.13.1 → 5.14.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d2ad1fd2590af8b932a03d63de07faa2e3a7b58a
4
- data.tar.gz: 4bb804de2144390c4aab51c0ce812616139d881f
3
+ metadata.gz: 5bb93d5d2f2b8100f394fefef4df4a0312e612a4
4
+ data.tar.gz: a961533ace5a56216757ab3c99666780922511ac
5
5
  SHA512:
6
- metadata.gz: b3e4089221b1bffcc7768bbe81cc4c9ea29e6e742de660239176bdd4e678b02fbeaaf224699b2a83a59f183c54242198ffde2070adbeec75cdae4abdd7b25fbe
7
- data.tar.gz: c773b13460f9fa0774d2c8f31fec351071abf4e6ad87ce3df303b60cf878a86952c6e5ff2ef9fe41927b9673304f336e4b3b5688601df533efa37e234cf891a2
6
+ metadata.gz: 7d5e13b4683b83f850ebca170c0ba4f00917874f386d95d16b5764d63625be388de3940a3f8be6b07a19bb000b9da3c024aa05c4795d48067efc81b672ada610
7
+ data.tar.gz: 68d542616e8a6826c0e9bddc33afd9744b449c7ee218610d65750186915ec51913f8144c8155bb083feb80b17ee086eafdc5c7dec07f16db012912d8546c5fba
@@ -42,9 +42,11 @@ class StreamingLoadJobClass < RubyJobClass
42
42
  end
43
43
 
44
44
  def run
45
- @loader.dequeue_loaded_files unless @load_only
46
- return nil if @dequeue_only
47
- @loader.load
45
+ if @dequeue_only
46
+ @loader.dequeue
47
+ else
48
+ @loader.load
49
+ end
48
50
  nil
49
51
  end
50
52
 
@@ -108,8 +110,12 @@ class StreamingLoadJobClass < RubyJobClass
108
110
 
109
111
  attr_reader :sql
110
112
 
111
- def load
112
- load_in_parallel
113
+ def work_table
114
+ @work_table || "#{@table}_wk"
115
+ end
116
+
117
+ def log_table
118
+ @log_table || "#{@table}_l"
113
119
  end
114
120
 
115
121
  def log_basic_info
@@ -118,123 +124,131 @@ class StreamingLoadJobClass < RubyJobClass
118
124
  @logger.info "queue: #{@src.queue_url}"
119
125
  end
120
126
 
121
- def load_in_parallel
127
+ def dequeue
122
128
  log_basic_info
123
- @logger.info 'load with manifest'
129
+ @logger.info "dequeue start"
124
130
  objects = @src.queued_objects
125
131
  if objects.empty?
126
132
  @logger.info 'no target data files; exit'
127
133
  return
128
134
  end
129
- create_manifest_file(objects) {|manifest_url|
135
+ create_load_log_file(objects) {|log_url|
130
136
  @ds.open {|conn|
131
- init_work_table conn
132
- execute_update conn, copy_manifest_statement(manifest_url, @src.credential_string)
133
- @logger.info "load succeeded: #{manifest_url}" unless @noop
134
- commit conn, objects
137
+ execute_update conn, copy_load_log_stmt(log_url, @src.credential_string)
138
+ foreach_loaded_object(objects) do |obj|
139
+ obj.dequeue(@noop)
140
+ end
135
141
  }
136
- dequeue_all objects
137
142
  }
138
143
  end
139
144
 
140
- def load_in_sequential
145
+ def load
141
146
  log_basic_info
142
- @logger.info 'load each objects sequentially'
147
+ @logger.info 'load with manifest'
143
148
  objects = @src.queued_objects
144
- @ds.open {|conn|
145
- init_work_table(conn)
146
- objects.each do |obj|
147
- @logger.info "load: #{obj.url}"
148
- execute_update conn, copy_file_statement(obj)
149
- @logger.info "load succeeded: #{obj.url}" unless @noop
150
- end
151
- commit conn, objects
149
+ if objects.empty?
150
+ @logger.info 'no target data files; exit'
151
+ return
152
+ end
153
+ create_load_log_file(objects) {|log_url|
154
+ @ds.open {|conn|
155
+ create_tmp_log_table(conn, log_url) {|tmp_log_table|
156
+ loaded, not_loaded = partition_loaded_objects(conn, objects, tmp_log_table)
157
+ unless @load_only
158
+ loaded.each do |obj|
159
+ obj.dequeue(force: true, noop: @noop)
160
+ end
161
+ end
162
+ unless not_loaded.empty?
163
+ create_manifest_file(not_loaded) {|manifest_url|
164
+ init_work_table conn
165
+ execute_update conn, manifest_copy_stmt(work_table, manifest_url)
166
+ @logger.info "load succeeded: #{manifest_url}" unless @noop
167
+ commit conn, work_table, tmp_log_table unless @load_only
168
+ }
169
+ unless @load_only
170
+ not_loaded.each do |obj|
171
+ obj.dequeue(force: true, noop: @noop)
172
+ end
173
+ end
174
+ end
175
+ }
176
+ }
152
177
  }
153
- dequeue_all objects
154
178
  end
155
179
 
156
- def commit(conn, objects)
157
- @end_time = Time.now
158
- return if @load_only
180
+ def commit(conn, work_table, tmp_log_table)
181
+ @end_time = Time.now # commit_load_log writes this, generate before that
159
182
  transaction(conn) {
160
- commit_work_table conn
161
- write_load_logs conn, objects
162
- }
163
- end
164
-
165
- def dequeue_loaded_files
166
- @logger.info "dequeue start"
167
- objects = @src.queued_objects
168
- @ds.open {|conn|
169
- objects.each do |obj|
170
- if loaded_object?(conn, obj)
171
- obj.dequeue(@noop)
172
- end
173
- end
183
+ commit_work_table conn, work_table
184
+ commit_load_log conn, tmp_log_table
174
185
  }
175
186
  end
176
187
 
177
188
  private
178
189
 
179
190
  def init_work_table(conn)
180
- return unless @work_table
181
- execute_update conn, "truncate #{@work_table};"
191
+ execute_update conn, "truncate #{work_table};"
182
192
  end
183
193
 
184
- def commit_work_table(conn)
185
- return unless @work_table
186
- insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{@work_table};"
194
+ def commit_work_table(conn, work_table)
195
+ insert_stmt = @sql ? @sql.source : "insert into #{@table} select * from #{work_table};"
187
196
  execute_update conn, insert_stmt
188
197
  # keep work table records for tracing
189
198
  end
190
199
 
191
- def copy_file_statement(obj)
192
- %Q(
193
- copy #{load_target_table} from '#{obj.url}'
194
- credentials '#{obj.credential_string}'
195
- #{@load_options}
196
- ;).gsub(/\s+/, ' ').strip
197
- end
198
-
199
200
  def create_manifest_file(objects)
200
201
  manifest_name = "manifest-#{@job_process_id}.json"
201
202
  @logger.info "creating manifest: #{manifest_name}"
202
- @logger.info "manifest:\n" + make_manifest_json(objects)
203
- url = @src.put_control_file(manifest_name, make_manifest_json(objects), noop: @noop)
203
+ json = make_manifest_json(objects)
204
+ @logger.info "manifest:\n" + json
205
+ url = @src.put_control_file(manifest_name, json, noop: @noop)
204
206
  yield url
205
207
  @src.remove_control_file(File.basename(url), noop: @noop)
206
208
  end
207
209
 
208
210
  def make_manifest_json(objects)
209
211
  ents = objects.map {|obj|
210
- { "url" => obj.url, "mandatory" => true }
212
+ { "url" => obj.url, "mandatory" => false }
211
213
  }
212
214
  JSON.pretty_generate({ "entries" => ents })
213
215
  end
214
216
 
215
- def copy_manifest_statement(manifest_url, credential_string)
217
+ def manifest_copy_stmt(target_table, manifest_url)
216
218
  %Q(
217
- copy #{load_target_table}
219
+ copy #{target_table}
218
220
  from '#{manifest_url}'
219
- credentials '#{credential_string}'
221
+ credentials '#{@src.credential_string}'
220
222
  manifest
223
+ statupdate false
221
224
  #{@load_options}
222
225
  ;).gsub(/\s+/, ' ').strip
223
226
  end
224
227
 
225
- def load_target_table
226
- @work_table || @table
228
+ def create_load_log_file(objects)
229
+ log_name = "load_log-#{@job_process_id}.csv"
230
+ @logger.info "creating tmp load log: #{log_name}"
231
+ csv = make_load_log_csv(objects)
232
+ @logger.info "load_log:\n" + csv
233
+ url = @src.put_control_file(log_name, csv, noop: @noop)
234
+ yield url
235
+ @src.remove_control_file(File.basename(url), noop: @noop)
227
236
  end
228
237
 
229
- def write_load_logs(conn, objects)
230
- return unless @log_table
231
- make_load_logs(objects).each do |record|
232
- write_load_log conn, record
238
+ def make_load_log_csv(objects)
239
+ buf = StringIO.new
240
+ objects.each do |obj|
241
+ log = make_load_log(obj)
242
+ cols = [
243
+ log.job_process_id,
244
+ format_timestamp(log.start_time),
245
+ '', # end time does not exist yet
246
+ log.target_table,
247
+ log.data_file
248
+ ]
249
+ buf.puts cols.map {|c| %Q("#{c}") }.join(',')
233
250
  end
234
- end
235
-
236
- def make_load_logs(objects)
237
- objects.map {|obj| make_load_log(obj) }
251
+ buf.string
238
252
  end
239
253
 
240
254
  def make_load_log(obj)
@@ -243,34 +257,78 @@ class StreamingLoadJobClass < RubyJobClass
243
257
 
244
258
  LoadLogRecord = Struct.new(:job_process_id, :start_time, :end_time, :target_table, :data_file)
245
259
 
246
- def write_load_log(conn, record)
247
- return unless @log_table
248
- execute_update(conn, <<-EndSQL.gsub(/^\s+/, '').strip)
249
- insert into #{@log_table}
250
- ( job_process_id
251
- , start_time
252
- , end_time
253
- , target_table
254
- , data_file
255
- )
256
- values
257
- ( #{sql_string record.job_process_id}
258
- , #{sql_timestamp record.start_time}
259
- , #{sql_timestamp record.end_time}
260
- , #{sql_string record.target_table}
261
- , #{sql_string record.data_file}
262
- )
260
+ def create_tmp_log_table(conn, log_url)
261
+ target_table = log_table_wk
262
+ execute_update conn, "create table #{target_table} (like #{@log_table});"
263
+ execute_update conn, load_log_copy_stmt(target_table, log_url, @src.credential_string)
264
+ begin
265
+ yield target_table
266
+ ensure
267
+ begin
268
+ execute_update conn, "drop table #{target_table}"
269
+ rescue PostgreSQLException => ex
270
+ @logger.error ex.message + " (ignored)"
271
+ end
272
+ end
273
+ end
274
+
275
+ def log_table_wk
276
+ "#{@log_table}_tmp#{Process.pid}"
277
+ end
278
+
279
+ def load_log_copy_stmt(target_table, log_url, credential_string)
280
+ %Q(
281
+ copy #{target_table}
282
+ from '#{log_url}'
283
+ credentials '#{credential_string}'
284
+ delimiter ','
285
+ removequotes
286
+ ;).gsub(/\s+/, ' ').strip
287
+ end
288
+
289
+ def partition_loaded_objects(conn, objects, tmp_log_table)
290
+ recs = conn.execute(<<-EndSQL)
291
+ select
292
+ data_file
293
+ , case when l.job_process_id is not null then 'true' else 'false' end as is_loaded
294
+ from
295
+ #{@log_table} l right outer join #{tmp_log_table} t using (data_file)
263
296
  ;
264
297
  EndSQL
298
+ index = {}
299
+ objects.each do |obj|
300
+ index[obj.url] = obj
301
+ end
302
+ recs.each do |rec|
303
+ obj = index[rec['data_file']]
304
+ obj.loaded = (rec['is_loaded'] == 'true')
305
+ end
306
+ objects.partition(&:loaded)
265
307
  end
266
308
 
267
- def loaded_object?(conn, obj)
268
- rs = conn.execute("select count(*) as c from #{@log_table} where data_file = #{sql_string obj.url}")
269
- rs.first['c'].to_i > 0
309
+ def commit_load_log(conn, tmp_table_name)
310
+ conn.execute(<<-EndSQL)
311
+ insert into #{@log_table}
312
+ select
313
+ job_process_id
314
+ , start_time
315
+ , #{sql_timestamp @end_time}
316
+ , target_table
317
+ , data_file
318
+ from
319
+ #{tmp_table_name}
320
+ where
321
+ data_file not in (select data_file from #{@log_table})
322
+ ;
323
+ EndSQL
270
324
  end
271
325
 
272
326
  def sql_timestamp(time)
273
- %Q(timestamp '#{time.strftime('%Y-%m-%d %H:%M:%S')}')
327
+ %Q(timestamp '#{format_timestamp(time)}')
328
+ end
329
+
330
+ def format_timestamp(time)
331
+ time.strftime('%Y-%m-%d %H:%M:%S')
274
332
  end
275
333
 
276
334
  def sql_string(str)
@@ -299,13 +357,6 @@ class StreamingLoadJobClass < RubyJobClass
299
357
  def mask_secrets(log)
300
358
  log.gsub(/\bcredentials\s+'.*?'/mi, "credentials '****'")
301
359
  end
302
-
303
- def dequeue_all(objects)
304
- return if @load_only
305
- objects.each do |obj|
306
- obj.dequeue(@noop)
307
- end
308
- end
309
360
  end
310
361
 
311
362
  class S3Queue
@@ -424,8 +475,11 @@ class StreamingLoadJobClass < RubyJobClass
424
475
  @s3queue = s3queue
425
476
  @object = object
426
477
  @logger = logger
478
+ @loaded = nil
427
479
  end
428
480
 
481
+ attr_accessor :loaded
482
+
429
483
  def credential_string
430
484
  @s3queue.credential_string
431
485
  end
@@ -442,11 +496,18 @@ class StreamingLoadJobClass < RubyJobClass
442
496
  @s3queue.object_url_direct(path)
443
497
  end
444
498
 
445
- def dequeue(noop = false)
499
+ def dequeue(force: false, noop: false)
446
500
  @logger.info "s3 move: #{path} -> #{persistent_path}"
447
501
  return if noop
448
502
  @object.move_to persistent_object, dequeue_options
449
- @logger.info "file saved"
503
+ @logger.info "done"
504
+ rescue Aws::S3::Errors::NoSuchKey => ex
505
+ @logger.error "S3 error: #{ex.message}"
506
+ if force
507
+ @logger.info "move error ignored (may be caused by eventual consistency)"
508
+ else
509
+ raise
510
+ end
450
511
  end
451
512
 
452
513
  def persistent_object
@@ -1,4 +1,4 @@
1
1
  module Bricolage
2
2
  APPLICATION_NAME = 'Bricolage'
3
- VERSION = '5.13.1'
3
+ VERSION = '5.14.0'
4
4
  end
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: ../..
3
3
  specs:
4
- bricolage (5.13.1)
4
+ bricolage (5.14.0)
5
5
  aws-sdk (~> 2)
6
6
  mysql2
7
7
  pg
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bricolage
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.13.1
4
+ version: 5.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Minero Aoki