tapsoob 0.6.2-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,669 +1,10 @@
 # -*- encoding : utf-8 -*-
-require 'sequel'
-require 'thread'
-
-require 'tapsoob/data_stream'
-require 'tapsoob/log'
-require 'tapsoob/progress'
-require 'tapsoob/schema'
 
 module Tapsoob
-  class Operation
-    attr_reader :database_url, :dump_path, :opts
-
-    def initialize(database_url, dump_path = nil, opts={})
-      @database_url = database_url
-      @dump_path = dump_path
-      @opts = opts
-      @exiting = false
-    end
-
-    def file_prefix
-      "op"
-    end
-
-    def data?
-      opts[:data]
-    end
-
-    def schema?
-      opts[:schema]
-    end
-
-    def indexes_first?
-      !!opts[:indexes_first]
-    end
-
-    def table_filter
-      opts[:tables] || []
-    end
-
-    def exclude_tables
-      opts[:exclude_tables] || []
-    end
-
-    def apply_table_filter(tables)
-      return tables if table_filter.empty? && exclude_tables.empty?
-
-      if tables.kind_of?(Hash)
-        ntables = {}
-        tables.each do |t, d|
-          if !exclude_tables.include?(t.to_s) && (!table_filter.empty? && table_filter.include?(t.to_s))
-            ntables[t] = d
-          end
-        end
-        ntables
-      else
-        tables.reject { |t| exclude_tables.include?(t.to_s) }.select { |t| table_filter.include?(t.to_s) }
-      end
-    end
-
-    def log
-      Tapsoob.log.level = Logger::DEBUG if opts[:debug]
-      Tapsoob.log
-    end
-
-    def store_session
-      file = "#{file_prefix}_#{Time.now.strftime("%Y%m%d%H%M")}.dat"
-      log.info "\nSaving session to #{file}..."
-      File.open(file, 'w') do |f|
-        f.write(JSON.generate(to_hash))
-      end
-    end
-
-    def to_hash
-      {
-        :klass => self.class.to_s,
-        :database_url => database_url,
-        :stream_state => stream_state,
-        :completed_tables => completed_tables,
-        :table_filter => table_filter,
-      }
-    end
-
-    def exiting?
-      !!@exiting
-    end
-
-    def setup_signal_trap
-      trap("INT") {
-        puts "\nCompleting current action..."
-        @exiting = true
-      }
-
-      trap("TERM") {
-        puts "\nCompleting current action..."
-        @exiting = true
-      }
-    end
-
-    def resuming?
-      opts[:resume] == true
-    end
-
-    def default_chunksize
-      opts[:default_chunksize]
-    end
-
-    def completed_tables
-      opts[:completed_tables] ||= []
-    end
-
-    def stream_state
-      opts[:stream_state] ||= {}
-    end
-
-    def stream_state=(val)
-      opts[:stream_state] = val
-    end
-
-    def db
-      @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
-      @db.extension :schema_dumper
-      @db.loggers << Tapsoob.log if opts[:debug]
-
-      # Set parameters
-      if @db.uri =~ /oracle/i
-        @db << "ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD HH24:MI:SS'"
-        @db << "ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD HH24:MI:SS:FF6'"
-      end
-
-      @db
-    end
-
-    def parallel?
-      parallel_workers > 1
-    end
-
-    def parallel_workers
-      @parallel_workers ||= [opts[:parallel].to_i, 1].max
-    end
-
-    def completed_tables_mutex
-      @completed_tables_mutex ||= Mutex.new
-    end
-
-    def add_completed_table(table_name)
-      completed_tables_mutex.synchronize do
-        completed_tables << table_name.to_s
-      end
-    end
-
-    def format_number(num)
-      num.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
-    end
-
-    def catch_errors(&blk)
-      begin
-        blk.call
-      rescue Exception => e
-        raise e
-      end
-    end
-
-    def self.factory(type, database_url, dump_path, opts)
-      type = :resume if opts[:resume]
-      klass = case type
-        when :pull then Tapsoob::Pull
-        when :push then Tapsoob::Push
-        when :resume then eval(opts[:klass])
-        else raise "Unknown Operation Type -> #{type}"
-      end
-
-      klass.new(database_url, dump_path, opts)
-    end
-  end
-
-  class Pull < Operation
-    def file_prefix
-      "pull"
-    end
-
-    def to_hash
-      super.merge(:remote_tables_info => remote_tables_info)
-    end
-
-    def run
-      catch_errors do
-        unless resuming?
-          pull_schema if schema?
-          pull_indexes if indexes_first? && schema?
-        end
-        setup_signal_trap
-        pull_partial_data if data? && resuming?
-        pull_data if data?
-        pull_indexes if !indexes_first? && schema?
-        pull_reset_sequences
-      end
-    end
-
-    def pull_schema
-      log.info "Receiving schema"
-
-      progress = ProgressBar.new('Schema', tables.size)
-      tables.each do |table_name, count|
-        # Reuse existing db connection for better performance
-        schema_data = Tapsoob::Schema.dump_table(db, table_name, @opts.slice(:indexes, :same_db))
-        log.debug "Table: #{table_name}\n#{schema_data}\n"
-        output = Tapsoob::Utils.export_schema(dump_path, table_name, schema_data)
-        puts output if dump_path.nil? && output
-        progress.inc(1)
-      end
-      progress.finish
-    end
-
-    def pull_data
-      log.info "Receiving data"
-
-      log.info "#{tables.size} tables, #{format_number(record_count)} records"
-
-      if parallel?
-        pull_data_parallel
-      else
-        pull_data_serial
-      end
-    end
-
-    def pull_data_serial
-      tables.each do |table_name, count|
-        stream = Tapsoob::DataStream.factory(db, {
-          :chunksize => default_chunksize,
-          :table_name => table_name
-        }, { :debug => opts[:debug] })
-        estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-        progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-        pull_data_from_table(stream, progress)
-      end
-    end
-
-    def pull_data_parallel
-      log.info "Using #{parallel_workers} parallel workers"
-
-      multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
-      table_queue = Queue.new
-      tables.each { |table_name, count| table_queue << [table_name, count] }
-
-      workers = (1..parallel_workers).map do
-        Thread.new do
-          loop do
-            break if table_queue.empty?
-
-            table_name, count = table_queue.pop(true) rescue break
-
-            # Each thread gets its own connection from the pool
-            stream = Tapsoob::DataStream.factory(db, {
-              :chunksize => default_chunksize,
-              :table_name => table_name
-            }, { :debug => opts[:debug] })
-
-            estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-            progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
-
-            pull_data_from_table(stream, progress)
-          end
-        end
-      end
-
-      workers.each(&:join)
-      multi_progress.stop if multi_progress
-    end
-
-    def pull_partial_data
-      return if stream_state == {}
-
-      table_name = stream_state[:table_name]
-      record_count = tables[table_name.to_s]
-      log.info "Resuming #{table_name}, #{format_number(record_count)} records"
-
-      stream = Tapsoob::DataStream.factory(db, stream_state)
-      chunksize = stream_state[:chunksize] || default_chunksize
-      estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
-      progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-      pull_data_from_table(stream, progress)
-    end
-
-    def pull_data_from_table(stream, progress)
-      loop do
-        if exiting?
-          store_session
-          exit 0
-        end
-
-        row_size = 0
-        chunksize = stream.state[:chunksize]
-
-        begin
-          chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
-            stream.state[:chunksize] = c.to_i
-            encoded_data, row_size, elapsed_time = nil
-            d1 = c.time_delta do
-              encoded_data, row_size, elapsed_time = stream.fetch
-            end
-
-            data = nil
-            d2 = c.time_delta do
-              data = {
-                :state => stream.to_hash,
-                :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
-                :encoded_data => encoded_data
-              }
-            end
-
-            stream.fetch_data_from_database(data) do |rows|
-              next if rows == {}
-
-              # Update progress bar by 1 chunk
-              progress.inc(1) if progress
-
-              if dump_path.nil?
-                puts JSON.generate(rows)
-              else
-                Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
-              end
-            end
-            log.debug "row size: #{row_size}"
-            stream.error = false
-            self.stream_state = stream.to_hash
-
-            c.idle_secs = (d1 + d2)
-
-            elapsed_time
-          end
-        rescue Tapsoob::CorruptedData => e
-          log.info "Corrupted Data Received #{e.message}, retrying..."
-          stream.error = true
-          next
-        end
-
-        break if stream.complete?
-      end
-
-      progress.finish if progress
-      add_completed_table(stream.table_name)
-      self.stream_state = {}
-    end
-
-    def tables
-      h = {}
-      tables_info.each do |table_name, count|
-        next if completed_tables.include?(table_name.to_s)
-        h[table_name.to_s] = count
-      end
-      h
-    end
-
-    def record_count
-      tables_info.values.inject(:+)
-    end
-
-    def tables_info
-      opts[:tables_info] ||= fetch_tables_info
-    end
-
-    def fetch_tables_info
-      tables = db.send(:sort_dumped_tables, db.tables, {})
-
-      data = {}
-      apply_table_filter(tables).each do |table_name|
-        data[table_name] = db[table_name].count
-      end
-      data
-    end
-
-    def self.factory(db, state)
-      if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
-        Sequel::MySQL.convert_invalid_date_time = :nil
-      end
-
-      if state.has_key?(:klass)
-        return eval(state[:klass]).new(db, state)
-      end
-
-      if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
-        DataStreamKeyed.new(db, state)
-      else
-        DataStream.new(db, state)
-      end
-    end
-
-    def pull_indexes
-      log.info "Receiving indexes"
-
-      raw_idxs = Tapsoob::Schema.indexes_individual(database_url)
-      idxs = (raw_idxs && raw_idxs.length >= 2 ? JSON.parse(raw_idxs) : {})
-
-      # Calculate max title width for consistent alignment
-      filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
-      max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
-
-      filtered_idxs.each do |table, indexes|
-        progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
-        indexes.each do |idx|
-          output = Tapsoob::Utils.export_indexes(dump_path, table, idx)
-          puts output if dump_path.nil? && output
-          progress.inc(1)
-        end
-        progress.finish
-      end
-    end
-
-    def pull_reset_sequences
-      log.info "Resetting sequences"
-
-      output = Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
-      puts output if dump_path.nil? && output
-    end
-  end
-
-  class Push < Operation
-    def file_prefix
-      "push"
-    end
-
-    def to_hash
-      super.merge(:local_tables_info => local_tables_info)
-    end
-
-    def run
-      catch_errors do
-        unless resuming?
-          push_schema if schema?
-          push_indexes if indexes_first? && schema?
-        end
-        setup_signal_trap
-        push_partial_data if data? && resuming?
-        push_data if data?
-        push_indexes if !indexes_first? && schema?
-        push_reset_sequences
-      end
-    end
-
-    def push_indexes
-      idxs = {}
-      table_idxs = Dir.glob(File.join(dump_path, "indexes", "*.json")).map { |path| File.basename(path, '.json') }
-      table_idxs.each do |table_idx|
-        # Read NDJSON format - each line is a separate index
-        index_file = File.join(dump_path, "indexes", "#{table_idx}.json")
-        idxs[table_idx] = File.readlines(index_file).map { |line| JSON.parse(line.strip) }
-      end
-
-      return unless idxs.size > 0
-
-      log.info "Sending indexes"
-
-      # Calculate max title width for consistent alignment
-      filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
-      max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
-
-      filtered_idxs.each do |table, indexes|
-        progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
-        indexes.each do |idx|
-          Tapsoob::Utils.load_indexes(database_url, idx)
-          progress.inc(1)
-        end
-        progress.finish
-      end
-    end
-
-    def push_schema
-      log.info "Sending schema"
-
-      progress = ProgressBar.new('Schema', tables.size)
-      tables.each do |table, count|
-        log.debug "Loading '#{table}' schema\n"
-        # Reuse existing db connection for better performance
-        Tapsoob::Utils.load_schema(dump_path, db, table)
-        progress.inc(1)
-      end
-      progress.finish
-    end
-
-    def push_reset_sequences
-      log.info "Resetting sequences"
-
-      Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
-    end
-
-    def push_partial_data
-      return if stream_state == {}
-
-      table_name = stream_state[:table_name]
-      record_count = tables[table_name.to_s]
-      log.info "Resuming #{table_name}, #{format_number(record_count)} records"
-      stream = Tapsoob::DataStream.factory(db, stream_state)
-      chunksize = stream_state[:chunksize] || default_chunksize
-      estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
-      progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-      push_data_from_file(stream, progress)
-    end
-
-    def push_data
-      log.info "Sending data"
-
-      log.info "#{tables.size} tables, #{format_number(record_count)} records"
-
-      if parallel?
-        push_data_parallel
-      else
-        push_data_serial
-      end
-    end
-
-    def push_data_serial
-      tables.each do |table_name, count|
-        # Skip if data file doesn't exist or has no data
-        data_file = File.join(dump_path, "data", "#{table_name}.json")
-        next unless File.exist?(data_file) && count > 0
-        db[table_name.to_sym].truncate if @opts[:purge]
-        stream = Tapsoob::DataStream.factory(db, {
-          :table_name => table_name,
-          :chunksize => default_chunksize
-        }, {
-          :"skip-duplicates" => opts[:"skip-duplicates"] || false,
-          :"discard-identity" => opts[:"discard-identity"] || false,
-          :purge => opts[:purge] || false,
-          :debug => opts[:debug]
-        })
-        estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-        progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-        push_data_from_file(stream, progress)
-      end
-    end
-
-    def push_data_parallel
-      log.info "Using #{parallel_workers} parallel workers"
-
-      multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
-      table_queue = Queue.new
-
-      tables.each do |table_name, count|
-        data_file = File.join(dump_path, "data", "#{table_name}.json")
-        next unless File.exist?(data_file) && count > 0
-        table_queue << [table_name, count]
-      end
-
-      workers = (1..parallel_workers).map do
-        Thread.new do
-          loop do
-            break if table_queue.empty?
-
-            table_name, count = table_queue.pop(true) rescue break
-
-            # Each thread gets its own connection from the pool
-            db[table_name.to_sym].truncate if @opts[:purge]
-            stream = Tapsoob::DataStream.factory(db, {
-              :table_name => table_name,
-              :chunksize => default_chunksize
-            }, {
-              :"skip-duplicates" => opts[:"skip-duplicates"] || false,
-              :"discard-identity" => opts[:"discard-identity"] || false,
-              :purge => opts[:purge] || false,
-              :debug => opts[:debug]
-            })
-
-            estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-            progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
-
-            push_data_from_file(stream, progress)
-          end
-        end
-      end
-
-      workers.each(&:join)
-      multi_progress.stop if multi_progress
-    end
-
-    def push_data_from_file(stream, progress)
-      loop do
-        if exiting?
-          store_session
-          exit 0
-        end
-
-        row_size = 0
-        chunksize = stream.state[:chunksize]
-
-        begin
-          chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
-            stream.state[:chunksize] = c.to_i
-            encoded_data, row_size, elapsed_time = nil
-            d1 = c.time_delta do
-              encoded_data, row_size, elapsed_time = stream.fetch({ :type => "file", :source => dump_path })
-            end
-
-            data = nil
-            d2 = c.time_delta do
-              data = {
-                :state => stream.to_hash,
-                :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
-                :encoded_data => encoded_data
-              }
-            end
-
-            stream.fetch_data_to_database(data)
-            log.debug "row size: #{row_size}"
-            self.stream_state = stream.to_hash
-
-            c.idle_secs = (d1 + d2)
-
-            elapsed_time
-          end
-        rescue Tapsoob::CorruptedData => e
-          # retry the same data, it got corrupted somehow.
-          next
-        rescue Tapsoob::DuplicatePrimaryKeyError => e
-          # verify the stream and retry it
-          stream.verify_stream
-          stream = JSON.generate({ :state => stream.to_hash })
-          next
-        end
-        stream.state[:chunksize] = chunksize
-
-        # Update progress bar by 1 chunk
-        progress.inc(1) if progress
-
-        break if stream.complete?
-      end
-
-      progress.finish if progress
-      add_completed_table(stream.table_name)
-      self.stream_state = {}
-    end
-
-    def local_tables_info
-      opts[:local_tables_info] ||= fetch_local_tables_info
-    end
-
-    def tables
-      h = {}
-      local_tables_info.each do |table_name, count|
-        next if completed_tables.include?(table_name.to_s)
-        h[table_name.to_s] = count
-      end
-      h
-    end
-
-    def record_count
-      @record_count ||= local_tables_info.values.inject(0) { |a,c| a += c }
-    end
-
-    def fetch_local_tables_info
-      tables_with_counts = {}
-      tbls = Dir.glob(File.join(dump_path, "schemas", "*")).map { |path| File.basename(path, ".rb") }
-      tbls.each do |table|
-        if File.exist?(File.join(dump_path, "data", "#{table}.json"))
-          # Read NDJSON format - each line is a separate JSON chunk
-          total_rows = 0
-          File.readlines(File.join(dump_path, "data", "#{table}.json")).each do |line|
-            chunk = JSON.parse(line.strip)
-            total_rows += chunk["data"].size if chunk["data"]
-          end
-          tables_with_counts[table] = total_rows
-        else
-          tables_with_counts[table] = 0
-        end
-      end
-      apply_table_filter(tables_with_counts)
-    end
+  module Operation
+    # Require all Operation classes
+    require 'tapsoob/operation/base'
+    require 'tapsoob/operation/pull'
+    require 'tapsoob/operation/push'
   end
 end
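The hunk above replaces the monolithic Operation, Pull, and Push classes with a Tapsoob::Operation namespace split across tapsoob/operation/base, tapsoob/operation/pull, and tapsoob/operation/push. A minimal usage sketch, assuming the split files define Tapsoob::Operation::Pull with the same constructor signature as the removed 0.6.2 code (class names inferred from the require paths; this diff only shows the requires):

    # Hypothetical usage after the 0.7.0 split; the class name and
    # constructor signature are inferred, not confirmed by this diff.
    require 'tapsoob/operation'

    op = Tapsoob::Operation::Pull.new('postgres://localhost/mydb', './dump',
                                      { :data => true, :schema => true })
    op.run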
@@ -239,7 +239,3 @@ module Tapsoob
     end
   end
 end
-
-# Backward compatibility aliases
-ProgressBar = Tapsoob::Progress::Bar
-ReversedProgressBar = Tapsoob::Progress::ReversedBar
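The second hunk removes the top-level ProgressBar and ReversedProgressBar aliases, so code that reached the progress bars through those shims must now use the namespaced constants. A minimal migration sketch, assuming Tapsoob::Progress::Bar keeps the interface the removed operation code relied on (new, inc, finish):

    # Before 0.7.0 the gem aliased Tapsoob::Progress::Bar as ProgressBar:
    #   progress = ProgressBar.new('Schema', tables.size)
    # From 0.7.0 on, reference the namespaced class directly.
    progress = Tapsoob::Progress::Bar.new('Schema', tables.size)
    progress.inc(1)
    progress.finish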