tapsoob 0.6.2-java → 0.7.0-java

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -0,0 +1,446 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/operation/base'
+ require 'tapsoob/progress_event'
+
+ module Tapsoob
+   module Operation
+     class Push < Base
+       def file_prefix
+         "push"
+       end
+
+       # Disable table-level parallelization for push operations to respect
+       # foreign key dependencies. Tables must be loaded in dependency order
+       # (as specified in table_order.txt manifest) to avoid FK violations.
+       # Intra-table parallelization is still enabled and safe.
+       def parallel?
+         false
+       end
+
+       def to_hash
+         super.merge(:local_tables_info => local_tables_info)
+       end
+
+       def run
+         catch_errors do
+           unless resuming?
+             push_schema if schema?
+             push_indexes if indexes_first? && schema?
+           end
+           setup_signal_trap
+           push_partial_data if data? && resuming?
+           push_data if data?
+           push_indexes if !indexes_first? && schema?
+           push_reset_sequences
+         end
+       end
+
+       def push_indexes
+         idxs = {}
+         table_idxs = Dir.glob(File.join(dump_path, "indexes", "*.json")).map { |path| File.basename(path, '.json') }
+         table_idxs.each do |table_idx|
+           # Read NDJSON format - each line is a separate index
+           index_file = File.join(dump_path, "indexes", "#{table_idx}.json")
+           idxs[table_idx] = File.readlines(index_file).map { |line| JSON.parse(line.strip) }
+         end
+
+         return unless idxs.size > 0
+
+         log.info "Sending indexes"
+
+         # Calculate max title width for consistent alignment
+         filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
+         Tapsoob::ProgressEvent.indexes_start(filtered_idxs.size)
+         max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
+
+         filtered_idxs.each do |table, indexes|
+           progress = opts[:progress] ? Tapsoob::Progress::Bar.new("#{table} indexes", indexes.size, STDOUT, max_title_width) : nil
+           indexes.each do |idx|
+             Tapsoob::Utils.load_indexes(database_url, idx)
+             progress.inc(1) if progress
+           end
+           progress.finish if progress
+         end
+         Tapsoob::ProgressEvent.indexes_complete(filtered_idxs.size)
+       end
+
+       def push_schema
+         log.info "Sending schema"
+         Tapsoob::ProgressEvent.schema_start(tables.size)
+
+         progress = opts[:progress] ? Tapsoob::Progress::Bar.new('Schema', tables.size) : nil
+         tables.each do |table, count|
+           log.debug "Loading '#{table}' schema\n"
+           # Reuse existing db connection for better performance
+           Tapsoob::Utils.load_schema(dump_path, db, table)
+           progress.inc(1) if progress
+         end
+         progress.finish if progress
+         Tapsoob::ProgressEvent.schema_complete(tables.size)
+       end
+
+       def push_reset_sequences
+         log.info "Resetting sequences"
+         Tapsoob::ProgressEvent.sequences_start
+
+         Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
+
+         Tapsoob::ProgressEvent.sequences_complete
+       end
+
+       def push_partial_data
+         return if stream_state == {}
+
+         table_name = stream_state[:table_name]
+         record_count = tables[table_name.to_s]
+         log.info "Resuming #{table_name}, #{format_number(record_count)} records"
+         stream = Tapsoob::DataStream::Base.factory(db, stream_state)
+         chunksize = stream_state[:chunksize] || default_chunksize
+         estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
+         progress = (opts[:progress] ? Tapsoob::Progress::Bar.new(table_name.to_s, estimated_chunks) : nil)
+         push_data_from_file(stream, progress)
+       end
+
+       def push_data
+         log.info "Sending data"
+
+         log.info "#{tables.size} tables, #{format_number(record_count)} records"
+         Tapsoob::ProgressEvent.data_start(tables.size, record_count)
+
+         if parallel?
+           push_data_parallel
+         else
+           push_data_serial
+         end
+
+         Tapsoob::ProgressEvent.data_complete(tables.size, record_count)
+       end
+
+       def push_data_serial
+         max_visible_bars = 8
+         multi_progress = opts[:progress] ? Tapsoob::Progress::MultiBar.new(max_visible_bars) : nil
+
+         tables.each do |table_name, count|
+           # Skip if data file doesn't exist or has no data
+           data_file = File.join(dump_path, "data", "#{table_name}.json")
+           next unless File.exist?(data_file) && count > 0
+
+           # Check if this table should use intra-table parallelization
+           table_workers = table_parallel_workers(table_name, count)
+
+           if table_workers > 1
+             info_msg = "Table #{table_name}: using #{table_workers} workers for #{format_number(count)} records"
+             multi_progress.set_info(info_msg) if multi_progress
+             Tapsoob::ProgressEvent.table_start(table_name, count, workers: table_workers)
+             push_data_from_file_parallel(table_name, count, table_workers, multi_progress)
+           else
+             # Show info message for single-worker table
+             info_msg = "Loading #{table_name}: #{format_number(count)} records"
+             multi_progress.set_info(info_msg) if multi_progress
+
+             Tapsoob::ProgressEvent.table_start(table_name, count)
+             db[table_name.to_sym].truncate if @opts[:purge]
+             stream = Tapsoob::DataStream::Base.factory(db, {
+               :table_name => table_name,
+               :chunksize => default_chunksize
+             }, {
+               :"skip-duplicates" => opts[:"skip-duplicates"] || false,
+               :"discard-identity" => opts[:"discard-identity"] || false,
+               :purge => opts[:purge] || false,
+               :debug => opts[:debug]
+             })
+             estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+             progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+             push_data_from_file(stream, progress, count)
+           end
+         end
+
+         multi_progress.stop if multi_progress
+       end
+
+       def push_data_parallel
+         log.info "Using #{parallel_workers} parallel workers for table-level parallelization"
+
+         # Reserve space for both table-level and intra-table workers
+         max_visible_bars = 8
+         multi_progress = opts[:progress] ? Tapsoob::Progress::MultiBar.new(max_visible_bars) : nil
+         table_queue = Queue.new
+
+         tables.each do |table_name, count|
+           data_file = File.join(dump_path, "data", "#{table_name}.json")
+           next unless File.exist?(data_file) && count > 0
+           table_queue << [table_name, count]
+         end
+
+         workers = (1..parallel_workers).map do
+           Thread.new do
+             loop do
+               break if table_queue.empty?
+
+               table_name, count = table_queue.pop(true) rescue break
+
+               # Check if this table should use intra-table parallelization
+               table_workers = table_parallel_workers(table_name, count)
+
+               if table_workers > 1
+                 # Large table - use intra-table parallelization
+                 info_msg = "Table #{table_name}: using #{table_workers} workers for #{format_number(count)} records"
+                 if multi_progress
+                   multi_progress.set_info(info_msg)
+                 else
+                   log.info info_msg
+                 end
+
+                 Tapsoob::ProgressEvent.table_start(table_name, count, workers: table_workers)
+                 # Run intra-table parallelization, passing parent progress bar
+                 push_data_from_file_parallel(table_name, count, table_workers, multi_progress)
+               else
+                 # Small table - use single-threaded processing
+                 Tapsoob::ProgressEvent.table_start(table_name, count)
+                 db[table_name.to_sym].truncate if @opts[:purge]
+                 stream = Tapsoob::DataStream::Base.factory(db, {
+                   :table_name => table_name,
+                   :chunksize => default_chunksize
+                 }, {
+                   :"skip-duplicates" => opts[:"skip-duplicates"] || false,
+                   :"discard-identity" => opts[:"discard-identity"] || false,
+                   :purge => opts[:purge] || false,
+                   :debug => opts[:debug]
+                 })
+
+                 estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+                 progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+
+                 push_data_from_file(stream, progress, count)
+               end
+             end
+           end
+         end
+
+         workers.each(&:join)
+         multi_progress.stop if multi_progress
+       end
+
+       def push_data_from_file(stream, progress, total_records = nil)
+         records_processed = 0
+
+         loop do
+           if exiting?
+             store_session
+             exit 0
+           end
+
+           row_size = 0
+           chunksize = stream.state[:chunksize]
+
+           begin
+             chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
+               stream.state[:chunksize] = c.to_i
+               encoded_data, row_size, elapsed_time = nil
+               d1 = c.time_delta do
+                 encoded_data, row_size, elapsed_time = stream.fetch({ :type => "file", :source => dump_path })
+               end
+
+               data = nil
+               d2 = c.time_delta do
+                 data = {
+                   :state => stream.to_hash,
+                   :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
+                   :encoded_data => encoded_data
+                 }
+               end
+
+               stream.fetch_data_to_database(data)
+
+               # Track records for progress events
+               if row_size.positive?
+                 records_processed += row_size
+                 Tapsoob::ProgressEvent.table_progress(stream.table_name, records_processed, total_records) if total_records
+               end
+               self.stream_state = stream.to_hash
+
+               c.idle_secs = (d1 + d2)
+
+               elapsed_time
+             end
+           rescue Tapsoob::CorruptedData => e
+             # retry the same data, it got corrupted somehow.
+             next
+           rescue Tapsoob::DuplicatePrimaryKeyError => e
+             # verify the stream and retry it
+             stream.verify_stream
+             stream = JSON.generate({ :state => stream.to_hash })
+             next
+           end
+           stream.state[:chunksize] = chunksize
+
+           # Update progress bar by 1 chunk
+           progress.inc(1) if progress
+
+           break if stream.complete?
+         end
+
+         progress.finish if progress
+         add_completed_table(stream.table_name)
+         self.stream_state = {}
+
+         # Emit final table complete event
+         Tapsoob::ProgressEvent.table_complete(stream.table_name, records_processed)
+       end
+
+       def local_tables_info
+         opts[:local_tables_info] ||= fetch_local_tables_info
+       end
+
+       def tables
+         h = {}
+         local_tables_info.each do |table_name, count|
+           next if completed_tables.include?(table_name.to_s)
+           h[table_name.to_s] = count
+         end
+         h
+       end
+
+       def record_count
+         @record_count ||= local_tables_info.values.inject(0) { |a,c| a += c }
+       end
+
+       def fetch_local_tables_info
+         tables_with_counts = {}
+
+         # Try to load table order from metadata file (preserves dependency order)
+         ordered_tables = load_table_order
+
+         if ordered_tables
+           # Use the saved dependency order
+           table_names = ordered_tables
+         else
+           # Fallback: read from schema files (alphabetical order - may have issues with dependencies)
+           table_names = Dir.glob(File.join(dump_path, "schemas", "*"))
+             .map { |path| File.basename(path, ".rb") }
+             .reject { |name| name == "table_order" } # Exclude metadata file
+             .sort
+         end
+
+         # Count rows for each table
+         table_names.each do |table|
+           if File.exist?(File.join(dump_path, "data", "#{table}.json"))
+             # Read NDJSON format - each line is a separate JSON chunk
+             total_rows = 0
+             File.readlines(File.join(dump_path, "data", "#{table}.json")).each do |line|
+               chunk = JSON.parse(line.strip)
+               total_rows += chunk["data"].size if chunk["data"]
+             end
+             tables_with_counts[table] = total_rows
+           else
+             tables_with_counts[table] = 0
+           end
+         end
+
+         apply_table_filter(tables_with_counts)
+       end
+
+       # Calculate line ranges for file partitioning
+       def calculate_file_line_ranges(table_name, num_workers)
+         file_path = File.join(dump_path, "data", "#{table_name}.json")
+         return [] unless File.exist?(file_path)
+
+         total_lines = File.foreach(file_path).count
+         return [[0, total_lines - 1]] if total_lines == 0 || num_workers <= 1
+
+         lines_per_worker = (total_lines.to_f / num_workers).ceil
+
+         ranges = []
+         (0...num_workers).each do |i|
+           start_line = i * lines_per_worker
+           end_line = [((i + 1) * lines_per_worker) - 1, total_lines - 1].min
+           ranges << [start_line, end_line] if start_line < total_lines
+         end
+
+         ranges
+       end
+
+       # Parallel push for a single large table using file partitioning
+       def push_data_from_file_parallel(table_name, count, num_workers, parent_progress = nil)
+         # Calculate line ranges for each worker
+         ranges = calculate_file_line_ranges(table_name, num_workers)
+         return if ranges.empty?
+
+         # Truncate table if purge is enabled
+         db[table_name.to_sym].truncate if @opts[:purge]
+
+         # Create single shared progress bar for the table
+         estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+         shared_progress = parent_progress ? parent_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+
+         # Mutex for coordinating database writes and progress tracking
+         write_mutex = Mutex.new
+         records_processed = 0
+
+         begin
+           workers = (0...num_workers).map do |worker_id|
+             Thread.new do
+               start_line, end_line = ranges[worker_id]
+
+               # Create worker-specific stream with line range
+               stream = Tapsoob::DataStream::FilePartition.new(db, {
+                 :table_name => table_name,
+                 :chunksize => default_chunksize,
+                 :line_range => [start_line, end_line]
+               }, {
+                 :"skip-duplicates" => opts[:"skip-duplicates"] || false,
+                 :"discard-identity" => opts[:"discard-identity"] || false,
+                 :purge => opts[:purge] || false,
+                 :debug => opts[:debug]
+               })
+
+               # Process chunks from file
+               loop do
+                 break if stream.complete?
+
+                 begin
+                   encoded_data, row_size, elapsed_time = stream.fetch(:type => "file", :source => dump_path)
+
+                   if row_size.positive?
+                     data = {
+                       :state => stream.to_hash,
+                       :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
+                       :encoded_data => encoded_data
+                     }
+
+                     # Thread-safe database write and progress tracking
+                     write_mutex.synchronize do
+                       stream.fetch_data_to_database(data)
+
+                       # Track records for progress events
+                       records_processed += row_size
+                       Tapsoob::ProgressEvent.table_progress(table_name, records_processed, count)
+                     end
+
+                     shared_progress.inc(1) if shared_progress
+                   end
+
+                 rescue Tapsoob::CorruptedData => e
+                   log.info "Worker #{worker_id}: Corrupted Data Received #{e.message}, retrying..."
+                   next
+                 rescue StandardError => e
+                   log.error "Worker #{worker_id} error: #{e.message}"
+                   log.error e.backtrace.join("\n")
+                   raise
+                 end
+
+                 break if stream.complete?
+               end
+             end
+           end
+
+           workers.each(&:join)
+           shared_progress.finish if shared_progress
+         ensure
+           # Always emit table_complete event, even if there was an error
+           Tapsoob::ProgressEvent.table_complete(table_name, records_processed)
+         end
+       end
+     end
+   end
+ end
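
For orientation, here is a minimal usage sketch of the new Push operation. It assumes the Base constructor keeps the (database_url, dump_path, opts) signature used by earlier tapsoob operation classes and that the require path follows the module nesting above; neither is shown in this diff. The connection URL and dump directory are placeholders, and the option keys are the ones this file actually reads.

    require 'tapsoob/operation/push'   # load path assumed from the module nesting; not part of this diff

    # Hypothetical driver code; the constructor signature is assumed.
    push = Tapsoob::Operation::Push.new(
      "postgres://user:pass@localhost/app_db",   # target database_url (placeholder)
      "/tmp/app_dump",                           # dump_path from an earlier pull (placeholder)
      :progress           => true,               # draw progress bars on STDOUT
      :purge              => false,              # truncate each table before loading
      :"skip-duplicates"  => false,
      :"discard-identity" => false,
      :debug              => false
    )

    # run pushes the schema, then data in table_order.txt dependency order
    # (table-level parallelization is disabled by parallel? above, while large
    # tables can still use intra-table workers), then indexes (before the data
    # when indexes_first? is set), and finally resets sequences.
    push.run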