tapsoob 0.6.2-java → 0.7.0-java

This diff shows the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -0,0 +1,240 @@
+ # -*- encoding : utf-8 -*-
+ require 'sequel'
+ require 'thread'
+ require 'etc'
+
+ require 'tapsoob/data_stream'
+ require 'tapsoob/log'
+ require 'tapsoob/progress'
+ require 'tapsoob/progress_event'
+ require 'tapsoob/schema'
+
+ module Tapsoob
+   module Operation
+     class Base
+       attr_reader :database_url, :dump_path, :opts
+
+       def initialize(database_url, dump_path = nil, opts = {})
+         @database_url = database_url
+         @dump_path = dump_path
+         @opts = opts
+         @exiting = false
+
+         # Enable JSON progress events only when:
+         # 1. CLI progress bars are disabled (--progress=false), AND
+         # 2. not piping (a dump_path is provided).
+         # This prevents STDERR noise when piping and when using visual progress bars.
+         Tapsoob::ProgressEvent.enabled = !opts[:progress] && !dump_path.nil?
+       end
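+       # For example (hypothetical option values):
+       #   Base.new(url, "/tmp/dump", progress: false) # => progress events enabled
+       #   Base.new(url, "/tmp/dump", progress: true)  # => disabled (visual bars active)
+       #   Base.new(url, nil,         progress: false) # => disabled (piping to STDOUT)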
+
+       def file_prefix
+         "op"
+       end
+
+       def data?
+         opts[:data]
+       end
+
+       def schema?
+         opts[:schema]
+       end
+
+       def indexes_first?
+         !!opts[:indexes_first]
+       end
+
+       def table_filter
+         opts[:tables] || []
+       end
+
+       def exclude_tables
+         opts[:exclude_tables] || []
+       end
+
+       def apply_table_filter(tables)
+         return tables if table_filter.empty? && exclude_tables.empty?
+
+         # Keep a table when it is not excluded and either no include filter is
+         # set or the include filter names it.
+         if tables.kind_of?(Hash)
+           ntables = {}
+           tables.each do |t, d|
+             if !exclude_tables.include?(t.to_s) && (table_filter.empty? || table_filter.include?(t.to_s))
+               ntables[t] = d
+             end
+           end
+           ntables
+         else
+           tables.reject { |t| exclude_tables.include?(t.to_s) }.select { |t| table_filter.empty? || table_filter.include?(t.to_s) }
+         end
+       end
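+       # Example (hypothetical option values):
+       #   opts[:tables]         = ["users"]
+       #   opts[:exclude_tables] = ["logs"]
+       #   apply_table_filter({ "users" => 10, "logs" => 5, "posts" => 2 })
+       #   # => { "users" => 10 } -- "posts" fails the include filter, "logs" is excluded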
+
+       def log
+         Tapsoob.log.level = Logger::DEBUG if opts[:debug]
+         Tapsoob.log
+       end
+
+       def store_session
+         file = "#{file_prefix}_#{Time.now.strftime("%Y%m%d%H%M")}.dat"
+         log.info "\nSaving session to #{file}..."
+         File.open(file, 'w') do |f|
+           f.write(JSON.generate(to_hash))
+         end
+       end
+
+       def to_hash
+         {
+           :klass => self.class.to_s,
+           :database_url => database_url,
+           :stream_state => stream_state,
+           :completed_tables => completed_tables,
+           :table_filter => table_filter,
+         }
+       end
+
+       def exiting?
+         !!@exiting
+       end
+
+       def setup_signal_trap
+         trap("INT") {
+           puts "\nCompleting current action..."
+           @exiting = true
+         }
+
+         trap("TERM") {
+           puts "\nCompleting current action..."
+           @exiting = true
+         }
+       end
+
+       def resuming?
+         opts[:resume] == true
+       end
+
+       def default_chunksize
+         opts[:default_chunksize]
+       end
+
+       def completed_tables
+         opts[:completed_tables] ||= []
+       end
+
+       def stream_state
+         opts[:stream_state] ||= {}
+       end
+
+       def stream_state=(val)
+         opts[:stream_state] = val
+       end
+
+       def db
+         # Memoize the full setup so the extension, loggers and session
+         # parameters are applied only once, not on every call.
+         @db ||= begin
+           db = Sequel.connect(database_url, max_connections: parallel_workers * 2)
+           db.extension :schema_dumper
+           db.loggers << Tapsoob.log if opts[:debug]
+
+           # Set session parameters (Oracle only)
+           if db.uri =~ /oracle/i
+             db << "ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD HH24:MI:SS'"
+             db << "ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD HH24:MI:SS:FF6'"
+           end
+
+           db
+         end
+       end
+
+       def parallel?
+         parallel_workers > 1
+       end
+
+       def parallel_workers
+         @parallel_workers ||= [opts[:parallel].to_i, 1].max
+       end
+
+       # Auto-detect the number of workers for intra-table parallelization
+       def table_parallel_workers(table_name, row_count)
+         # Disable intra-table parallelization when piping to STDOUT
+         # (no dump_path means we're outputting JSON directly, which can't be safely parallelized)
+         return 1 if dump_path.nil?
+
+         # Temporarily enabled for Push operations as well (for debugging);
+         # uncomment to disable:
+         # return 1 if self.is_a?(Tapsoob::Operation::Push)
+
+         # Minimum threshold for parallelization (100K rows by default)
+         threshold = 100_000
+         return 1 if row_count < threshold
+
+         # Detect available CPU cores
+         available_cpus = Etc.nprocessors rescue 4
+
+         # Use up to 50% of the CPUs for a single table, capped at 8 workers,
+         # with a floor of 2
+         max_workers = [[available_cpus / 2, 8].min, 2].max
+
+         # Scale based on table size
+         if row_count >= 5_000_000
+           max_workers
+         elsif row_count >= 1_000_000
+           [max_workers / 2, 2].max
+         elsif row_count >= 500_000
+           [max_workers / 4, 2].max
+         else
+           2 # Minimum of 2 workers for tables over the threshold
+         end
+       end
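+       # Worked example, assuming a dump_path is set and Etc.nprocessors == 8
+       # (so max_workers == 4):
+       #   table_parallel_workers(:t,    50_000) # => 1 (below the 100K threshold)
+       #   table_parallel_workers(:t,   600_000) # => 2 ([4 / 4, 2].max)
+       #   table_parallel_workers(:t, 2_000_000) # => 2 ([4 / 2, 2].max)
+       #   table_parallel_workers(:t, 6_000_000) # => 4 (max_workers)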
+
+       # Check if a table can use efficient PK-based partitioning
+       def can_use_pk_partitioning?(table_name)
+         Tapsoob::Utils.single_integer_primary_key(db, table_name.to_sym)
+       end
+
+       def completed_tables_mutex
+         @completed_tables_mutex ||= Mutex.new
+       end
+
+       def add_completed_table(table_name)
+         completed_tables_mutex.synchronize do
+           completed_tables << table_name.to_s
+         end
+       end
+
+       def format_number(num)
+         num.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
+       end
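+       # For example: format_number(1234567) # => "1,234,567"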
+
+       def save_table_order(table_names)
+         return unless dump_path
+
+         metadata_file = File.join(dump_path, "table_order.txt")
+         File.open(metadata_file, 'w') do |file|
+           table_names.each { |table| file.puts(table) }
+         end
+       end
+
+       def load_table_order
+         return nil unless dump_path
+
+         metadata_file = File.join(dump_path, "table_order.txt")
+         return nil unless File.exist?(metadata_file)
+
+         File.readlines(metadata_file).map(&:strip).reject(&:empty?)
+       end
+
+       # Error-handling hook around the whole operation; it currently just
+       # re-raises, so callers see the original exception unchanged.
+       def catch_errors(&blk)
+         blk.call
+       rescue Exception => e
+         raise e
+       end
+
+       def self.factory(type, database_url, dump_path, opts)
+         type = :resume if opts[:resume]
+         klass = case type
+           when :pull then Tapsoob::Operation::Pull
+           when :push then Tapsoob::Operation::Push
+           # On resume, the concrete operation class is restored from the saved session
+           when :resume then eval(opts[:klass])
+           else raise "Unknown Operation Type -> #{type}"
+         end
+
+         klass.new(database_url, dump_path, opts)
+       end
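+       # Typical use (hypothetical arguments):
+       #   op = Tapsoob::Operation::Base.factory(:pull, "postgres://localhost/mydb", "/tmp/dump", progress: true)
+       #   op.run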
+     end
+   end
+ end
@@ -0,0 +1,419 @@
+ # -*- encoding : utf-8 -*-
+ require 'fileutils'
+ require 'tapsoob/operation/base'
+ require 'tapsoob/progress_event'
+
+ module Tapsoob
+   module Operation
+     class Pull < Base
+       def file_prefix
+         "pull"
+       end
+
+       def to_hash
+         super.merge(:remote_tables_info => remote_tables_info)
+       end
+
+       def run
+         catch_errors do
+           unless resuming?
+             initialize_dump_directory if dump_path
+             pull_schema if schema?
+             pull_indexes if indexes_first? && schema?
+           end
+           setup_signal_trap
+           pull_partial_data if data? && resuming?
+           pull_data if data?
+           pull_indexes if !indexes_first? && schema?
+           pull_reset_sequences
+         end
+       end
+
+       def initialize_dump_directory
+         %w[data schemas indexes].each do |subdir|
+           dir_path = File.join(dump_path, subdir)
+           FileUtils.rm_rf(dir_path)
+           FileUtils.mkdir_p(dir_path)
+         end
+
+         FileUtils.rm_f(File.join(dump_path, "table_order.txt"))
+       end
+
+       def pull_schema
+         log.info "Receiving schema"
+         Tapsoob::ProgressEvent.schema_start(tables.size)
+
+         progress = opts[:progress] ? Tapsoob::Progress::Bar.new('Schema', tables.size) : nil
+         tables.each do |table_name, count|
+           # Reuse the existing db connection for better performance
+           schema_data = Tapsoob::Schema.dump_table(db, table_name, @opts.slice(:indexes, :same_db))
+           log.debug "Table: #{table_name}\n#{schema_data}\n"
+           output = Tapsoob::Utils.export_schema(dump_path, table_name, schema_data)
+           puts output if dump_path.nil? && output
+           progress.inc(1) if progress
+         end
+         progress.finish if progress
+         Tapsoob::ProgressEvent.schema_complete(tables.size)
+
+         # Save table order for dependency-aware schema loading during push
+         save_table_order(tables.keys) if dump_path
+       end
+
+       def pull_data
+         log.info "Receiving data"
+
+         log.info "#{tables.size} tables, #{format_number(record_count)} records"
+         Tapsoob::ProgressEvent.data_start(tables.size, record_count)
+
+         if parallel?
+           pull_data_parallel
+         else
+           pull_data_serial
+         end
+
+         Tapsoob::ProgressEvent.data_complete(tables.size, record_count)
+       end
+
+       def pull_data_serial
+         tables.each do |table_name, count|
+           # Auto-detect if we should use intra-table parallelization
+           table_workers = table_parallel_workers(table_name, count)
+
+           if table_workers > 1
+             log.info "Table #{table_name}: using #{table_workers} workers for #{format_number(count)} records"
+             Tapsoob::ProgressEvent.table_start(table_name, count, workers: table_workers)
+             pull_data_from_table_parallel(table_name, count, table_workers)
+           else
+             Tapsoob::ProgressEvent.table_start(table_name, count)
+             stream = Tapsoob::DataStream::Base.factory(db, {
+               :chunksize => default_chunksize,
+               :table_name => table_name
+             }, { :debug => opts[:debug] })
+             estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+             progress = (opts[:progress] ? Tapsoob::Progress::Bar.new(table_name.to_s, estimated_chunks) : nil)
+             pull_data_from_table(stream, progress, count)
+           end
+         end
+       end
+
+       def pull_data_parallel
+         log.info "Using #{parallel_workers} parallel workers for table-level parallelization"
+
+         # Reserve space for both table-level and intra-table workers.
+         # With 4 table workers and potentially 8 intra-table workers per table,
+         # we could have many concurrent progress bars. Show up to 8 at once.
+         max_visible_bars = 8
+         multi_progress = opts[:progress] ? Tapsoob::Progress::MultiBar.new(max_visible_bars) : nil
+         table_queue = Queue.new
+         tables.each { |table_name, count| table_queue << [table_name, count] }
+
+         workers = (1..parallel_workers).map do
+           Thread.new do
+             loop do
+               break if table_queue.empty?
+
+               # Non-blocking pop; another worker may have drained the queue in between
+               table_name, count = table_queue.pop(true) rescue break
+
+               # Check if this table should use intra-table parallelization
+               table_workers = table_parallel_workers(table_name, count)
+
+               if table_workers > 1
+                 # Large table - use intra-table parallelization
+                 info_msg = "Table #{table_name}: using #{table_workers} workers for #{format_number(count)} records"
+                 if multi_progress
+                   multi_progress.set_info(info_msg)
+                 else
+                   log.info info_msg
+                 end
+
+                 Tapsoob::ProgressEvent.table_start(table_name, count, workers: table_workers)
+                 # Run intra-table parallelization, passing the parent progress bar
+                 pull_data_from_table_parallel(table_name, count, table_workers, multi_progress)
+               else
+                 # Small table - use single-threaded processing
+                 Tapsoob::ProgressEvent.table_start(table_name, count)
+                 stream = Tapsoob::DataStream::Base.factory(db, {
+                   :chunksize => default_chunksize,
+                   :table_name => table_name
+                 }, { :debug => opts[:debug] })
+
+                 estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+                 progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+
+                 pull_data_from_table(stream, progress, count)
+               end
+             end
+           end
+         end
+
+         workers.each(&:join)
+         multi_progress.stop if multi_progress
+       end
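+       # Note: Base#db sizes the Sequel connection pool at parallel_workers * 2,
+       # so when large tables spin up intra-table workers on top of these
+       # table-level workers, threads may queue briefly for a free connection.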
+
+       def pull_partial_data
+         return if stream_state == {}
+
+         table_name = stream_state[:table_name]
+         record_count = tables[table_name.to_s]
+         log.info "Resuming #{table_name}, #{format_number(record_count)} records"
+
+         stream = Tapsoob::DataStream::Base.factory(db, stream_state)
+         chunksize = stream_state[:chunksize] || default_chunksize
+         estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
+         progress = (opts[:progress] ? Tapsoob::Progress::Bar.new(table_name.to_s, estimated_chunks) : nil)
+         pull_data_from_table(stream, progress)
+       end
+
+       def pull_data_from_table(stream, progress, total_records = nil)
+         records_processed = 0
+
+         loop do
+           if exiting?
+             store_session
+             exit 0
+           end
+
+           row_size = 0
+           chunksize = stream.state[:chunksize]
+
+           begin
+             chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
+               stream.state[:chunksize] = c.to_i
+               encoded_data, row_size, elapsed_time = nil
+               d1 = c.time_delta do
+                 encoded_data, row_size, elapsed_time = stream.fetch
+               end
+
+               data = nil
+               d2 = c.time_delta do
+                 data = {
+                   :state => stream.to_hash,
+                   :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
+                   :encoded_data => encoded_data
+                 }
+               end
+
+               stream.fetch_data_from_database(data) do |rows|
+                 next if rows == {}
+
+                 # Update the progress bar by one chunk
+                 progress.inc(1) if progress
+
+                 # Track records processed for progress events
+                 if rows[:data]
+                   records_processed += rows[:data].size
+                   Tapsoob::ProgressEvent.table_progress(stream.table_name, records_processed, total_records) if total_records
+                 end
+
+                 if dump_path.nil?
+                   puts JSON.generate(rows)
+                 else
+                   Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
+                 end
+               end
+               log.debug "row size: #{row_size}"
+               stream.error = false
+               self.stream_state = stream.to_hash
+
+               c.idle_secs = (d1 + d2)
+
+               elapsed_time
+             end
+           rescue Tapsoob::CorruptedData => e
+             log.info "Corrupted Data Received #{e.message}, retrying..."
+             stream.error = true
+             next
+           end
+
+           break if stream.complete?
+         end
+
+         progress.finish if progress
+         add_completed_table(stream.table_name)
+         self.stream_state = {}
+
+         # Emit the final table-complete event
+         Tapsoob::ProgressEvent.table_complete(stream.table_name, records_processed)
+       end
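+       # The block passed to Tapsoob::Utils.calculate_chunksize reports its
+       # timings back (d1 + d2 via c.idle_secs, plus the returned elapsed_time),
+       # which lets the helper tune the chunk size between iterations.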
+
+       def pull_data_from_table_parallel(table_name, row_count, num_workers, parent_progress = nil)
+         # Mutex for coordinating file writes and progress tracking
+         write_mutex = Mutex.new
+         records_processed = 0
+
+         begin
+           # Determine the partitioning strategy
+           use_pk_partitioning = can_use_pk_partitioning?(table_name)
+
+           if use_pk_partitioning
+             # PK-based partitioning for efficient range queries
+             ranges = Tapsoob::DataStream::Keyed.calculate_pk_ranges(db, table_name, num_workers)
+             log.debug "Table #{table_name}: using PK-based partitioning with #{ranges.size} ranges"
+           else
+             # Interleaved chunking for tables without a single integer PK
+             log.debug "Table #{table_name}: using interleaved chunking with #{num_workers} workers"
+           end
+
+           # Progress tracking - create ONE shared progress bar for the entire table
+           estimated_chunks = [(row_count.to_f / default_chunksize).ceil, 1].max
+           shared_progress = parent_progress ? parent_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+
+           workers = (0...num_workers).map do |worker_id|
+             Thread.new do
+               # Create a worker-specific stream
+               if use_pk_partitioning
+                 min_pk, max_pk = ranges[worker_id]
+                 stream = Tapsoob::DataStream::KeyedPartition.new(db, {
+                   :table_name => table_name,
+                   :chunksize => default_chunksize,
+                   :partition_range => [min_pk, max_pk]
+                 }, { :debug => opts[:debug] })
+               else
+                 stream = Tapsoob::DataStream::Interleaved.new(db, {
+                   :table_name => table_name,
+                   :chunksize => default_chunksize,
+                   :worker_id => worker_id,
+                   :num_workers => num_workers
+                 }, { :debug => opts[:debug] })
+               end
+
+               # Process data chunks
+               loop do
+                 break if exiting? || stream.complete?
+
+                 begin
+                   encoded_data, row_size, elapsed_time = stream.fetch
+
+                   # Skip processing empty results
+                   if row_size.positive?
+                     data = {
+                       :state => stream.to_hash,
+                       :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
+                       :encoded_data => encoded_data
+                     }
+
+                     stream.fetch_data_from_database(data) do |rows|
+                       next if rows == {}
+
+                       # Thread-safe file write and progress tracking
+                       write_mutex.synchronize do
+                         if dump_path.nil?
+                           puts JSON.generate(rows)
+                         else
+                           Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
+                         end
+
+                         # Track records for progress events
+                         if rows[:data]
+                           records_processed += rows[:data].size
+                           Tapsoob::ProgressEvent.table_progress(table_name, records_processed, row_count)
+                         end
+                       end
+
+                       shared_progress.inc(1) if shared_progress
+                     end
+                   end
+
+                   # Check completion AFTER processing data to avoid losing the last chunk
+                   break if stream.complete?
+                 rescue Tapsoob::CorruptedData => e
+                   log.info "Worker #{worker_id}: Corrupted Data Received #{e.message}, retrying..."
+                   next
+                 rescue StandardError => e
+                   log.error "Worker #{worker_id} error: #{e.message}"
+                   log.error e.backtrace.join("\n")
+                   raise
+                 end
+               end
+             end
+           end
+
+           workers.each(&:join)
+           shared_progress.finish if shared_progress
+
+           add_completed_table(table_name)
+         ensure
+           # Always emit the table_complete event, even if there was an error
+           Tapsoob::ProgressEvent.table_complete(table_name, records_processed)
+         end
+       end
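+       # Sketch of the two partitioning strategies (hypothetical numbers):
+       #   PK-based: 4 workers over ids 1..1_000_000 receive ranges such as
+       #     [1, 250_000], [250_001, 500_000], [500_001, 750_000], [750_001, 1_000_000]
+       #   Interleaved: each worker claims every num_workers-th chunk (by
+       #     worker_id), so together they cover the table without overlap.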
+
+       def tables
+         h = {}
+         tables_info.each do |table_name, count|
+           next if completed_tables.include?(table_name.to_s)
+           h[table_name.to_s] = count
+         end
+         h
+       end
+
+       def record_count
+         # Seed inject so an empty table set yields 0 rather than nil
+         tables_info.values.inject(0, :+)
+       end
+
+       def tables_info
+         opts[:tables_info] ||= fetch_tables_info
+       end
+
+       def fetch_tables_info
+         tables = db.send(:sort_dumped_tables, db.tables, {})
+
+         data = {}
+         apply_table_filter(tables).each do |table_name|
+           data[table_name] = db[table_name].count
+         end
+         data
+       end
+
+       def self.factory(db, state)
+         if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+           Sequel::MySQL.convert_invalid_date_time = :nil
+         end
+
+         if state.has_key?(:klass)
+           return eval(state[:klass]).new(db, state)
+         end
+
+         if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+           Tapsoob::DataStream::Keyed.new(db, state)
+         else
+           Tapsoob::DataStream::Base.new(db, state)
+         end
+       end
+
+       def pull_indexes
+         log.info "Receiving indexes"
+
+         raw_idxs = Tapsoob::Schema.indexes_individual(database_url)
+         idxs = (raw_idxs && raw_idxs.length >= 2 ? JSON.parse(raw_idxs) : {})
+
+         # Calculate the max title width for consistent alignment
+         filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
+         Tapsoob::ProgressEvent.indexes_start(filtered_idxs.size)
+         max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
+
+         filtered_idxs.each do |table, indexes|
+           progress = opts[:progress] ? Tapsoob::Progress::Bar.new("#{table} indexes", indexes.size, STDOUT, max_title_width) : nil
+           indexes.each do |idx|
+             output = Tapsoob::Utils.export_indexes(dump_path, table, idx)
+             puts output if dump_path.nil? && output
+             progress.inc(1) if progress
+           end
+           progress.finish if progress
+         end
+         Tapsoob::ProgressEvent.indexes_complete(filtered_idxs.size)
+       end
+
+       def pull_reset_sequences
+         log.info "Resetting sequences"
+         Tapsoob::ProgressEvent.sequences_start
+
+         output = Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
+         puts output if dump_path.nil? && output
+
+         Tapsoob::ProgressEvent.sequences_complete
+       end
+     end
+   end
+ end