tapsoob 0.6.1 → 0.7.0

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
@@ -1,670 +1,10 @@
  # -*- encoding : utf-8 -*-
- require 'sequel'
- require 'thread'
-
- require 'tapsoob/data_stream'
- require 'tapsoob/log'
- require 'tapsoob/progress_bar'
- require 'tapsoob/multi_progress_bar'
- require 'tapsoob/schema'

  module Tapsoob
-   class Operation
-     attr_reader :database_url, :dump_path, :opts
-
-     def initialize(database_url, dump_path = nil, opts={})
-       @database_url = database_url
-       @dump_path = dump_path
-       @opts = opts
-       @exiting = false
-     end
-
-     def file_prefix
-       "op"
-     end
-
-     def data?
-       opts[:data]
-     end
-
-     def schema?
-       opts[:schema]
-     end
-
-     def indexes_first?
-       !!opts[:indexes_first]
-     end
-
-     def table_filter
-       opts[:tables] || []
-     end
-
-     def exclude_tables
-       opts[:exclude_tables] || []
-     end
-
-     def apply_table_filter(tables)
-       return tables if table_filter.empty? && exclude_tables.empty?
-
-       if tables.kind_of?(Hash)
-         ntables = {}
-         tables.each do |t, d|
-           if !exclude_tables.include?(t.to_s) && (!table_filter.empty? && table_filter.include?(t.to_s))
-             ntables[t] = d
-           end
-         end
-         ntables
-       else
-         tables.reject { |t| exclude_tables.include?(t.to_s) }.select { |t| table_filter.include?(t.to_s) }
-       end
-     end
-
-     def log
-       Tapsoob.log.level = Logger::DEBUG if opts[:debug]
-       Tapsoob.log
-     end
-
-     def store_session
-       file = "#{file_prefix}_#{Time.now.strftime("%Y%m%d%H%M")}.dat"
-       log.info "\nSaving session to #{file}..."
-       File.open(file, 'w') do |f|
-         f.write(JSON.generate(to_hash))
-       end
-     end
-
-     def to_hash
-       {
-         :klass => self.class.to_s,
-         :database_url => database_url,
-         :stream_state => stream_state,
-         :completed_tables => completed_tables,
-         :table_filter => table_filter,
-       }
-     end
-
-     def exiting?
-       !!@exiting
-     end
-
-     def setup_signal_trap
-       trap("INT") {
-         puts "\nCompleting current action..."
-         @exiting = true
-       }
-
-       trap("TERM") {
-         puts "\nCompleting current action..."
-         @exiting = true
-       }
-     end
-
-     def resuming?
-       opts[:resume] == true
-     end
-
-     def default_chunksize
-       opts[:default_chunksize]
-     end
-
-     def completed_tables
-       opts[:completed_tables] ||= []
-     end
-
-     def stream_state
-       opts[:stream_state] ||= {}
-     end
-
-     def stream_state=(val)
-       opts[:stream_state] = val
-     end
-
-     def db
-       @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
-       @db.extension :schema_dumper
-       @db.loggers << Tapsoob.log if opts[:debug]
-
-       # Set parameters
-       if @db.uri =~ /oracle/i
-         @db << "ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD HH24:MI:SS'"
-         @db << "ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD HH24:MI:SS:FF6'"
-       end
-
-       @db
-     end
-
-     def parallel?
-       parallel_workers > 1
-     end
-
-     def parallel_workers
-       @parallel_workers ||= [opts[:parallel].to_i, 1].max
-     end
-
-     def completed_tables_mutex
-       @completed_tables_mutex ||= Mutex.new
-     end
-
-     def add_completed_table(table_name)
-       completed_tables_mutex.synchronize do
-         completed_tables << table_name.to_s
-       end
-     end
-
-     def format_number(num)
-       num.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
-     end
-
-     def catch_errors(&blk)
-       begin
-         blk.call
-       rescue Exception => e
-         raise e
-       end
-     end
-
-     def self.factory(type, database_url, dump_path, opts)
-       type = :resume if opts[:resume]
-       klass = case type
-       when :pull then Tapsoob::Pull
-       when :push then Tapsoob::Push
-       when :resume then eval(opts[:klass])
-       else raise "Unknown Operation Type -> #{type}"
-       end
-
-       klass.new(database_url, dump_path, opts)
-     end
-   end
-
-   class Pull < Operation
-     def file_prefix
-       "pull"
-     end
-
-     def to_hash
-       super.merge(:remote_tables_info => remote_tables_info)
-     end
-
-     def run
-       catch_errors do
-         unless resuming?
-           pull_schema if schema?
-           pull_indexes if indexes_first? && schema?
-         end
-         setup_signal_trap
-         pull_partial_data if data? && resuming?
-         pull_data if data?
-         pull_indexes if !indexes_first? && schema?
-         pull_reset_sequences
-       end
-     end
-
-     def pull_schema
-       log.info "Receiving schema"
-
-       progress = ProgressBar.new('Schema', tables.size)
-       tables.each do |table_name, count|
-         # Reuse existing db connection for better performance
-         schema_data = Tapsoob::Schema.dump_table(db, table_name, @opts.slice(:indexes, :same_db))
-         log.debug "Table: #{table_name}\n#{schema_data}\n"
-         output = Tapsoob::Utils.export_schema(dump_path, table_name, schema_data)
-         puts output if dump_path.nil? && output
-         progress.inc(1)
-       end
-       progress.finish
-     end
-
-     def pull_data
-       log.info "Receiving data"
-
-       log.info "#{tables.size} tables, #{format_number(record_count)} records"
-
-       if parallel?
-         pull_data_parallel
-       else
-         pull_data_serial
-       end
-     end
-
-     def pull_data_serial
-       tables.each do |table_name, count|
-         stream = Tapsoob::DataStream.factory(db, {
-           :chunksize => default_chunksize,
-           :table_name => table_name
-         }, { :debug => opts[:debug] })
-         estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-         progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-         pull_data_from_table(stream, progress)
-       end
-     end
-
-     def pull_data_parallel
-       log.info "Using #{parallel_workers} parallel workers"
-
-       multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
-       table_queue = Queue.new
-       tables.each { |table_name, count| table_queue << [table_name, count] }
-
-       workers = (1..parallel_workers).map do
-         Thread.new do
-           loop do
-             break if table_queue.empty?
-
-             table_name, count = table_queue.pop(true) rescue break
-
-             # Each thread gets its own connection from the pool
-             stream = Tapsoob::DataStream.factory(db, {
-               :chunksize => default_chunksize,
-               :table_name => table_name
-             }, { :debug => opts[:debug] })
-
-             estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-             progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
-
-             pull_data_from_table(stream, progress)
-           end
-         end
-       end
-
-       workers.each(&:join)
-       multi_progress.stop if multi_progress
-     end
-
-     def pull_partial_data
-       return if stream_state == {}
-
-       table_name = stream_state[:table_name]
-       record_count = tables[table_name.to_s]
-       log.info "Resuming #{table_name}, #{format_number(record_count)} records"
-
-       stream = Tapsoob::DataStream.factory(db, stream_state)
-       chunksize = stream_state[:chunksize] || default_chunksize
-       estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
-       progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-       pull_data_from_table(stream, progress)
-     end
-
-     def pull_data_from_table(stream, progress)
-       loop do
-         if exiting?
-           store_session
-           exit 0
-         end
-
-         row_size = 0
-         chunksize = stream.state[:chunksize]
-
-         begin
-           chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
-             stream.state[:chunksize] = c.to_i
-             encoded_data, row_size, elapsed_time = nil
-             d1 = c.time_delta do
-               encoded_data, row_size, elapsed_time = stream.fetch
-             end
-
-             data = nil
-             d2 = c.time_delta do
-               data = {
-                 :state => stream.to_hash,
-                 :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
-                 :encoded_data => encoded_data
-               }
-             end
-
-             stream.fetch_data_from_database(data) do |rows|
-               next if rows == {}
-
-               # Update progress bar by 1 chunk
-               progress.inc(1) if progress
-
-               if dump_path.nil?
-                 puts JSON.generate(rows)
-               else
-                 Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
-               end
-             end
-             log.debug "row size: #{row_size}"
-             stream.error = false
-             self.stream_state = stream.to_hash
-
-             c.idle_secs = (d1 + d2)
-
-             elapsed_time
-           end
-         rescue Tapsoob::CorruptedData => e
-           log.info "Corrupted Data Received #{e.message}, retrying..."
-           stream.error = true
-           next
-         end
-
-         break if stream.complete?
-       end
-
-       progress.finish if progress
-       add_completed_table(stream.table_name)
-       self.stream_state = {}
-     end
-
-     def tables
-       h = {}
-       tables_info.each do |table_name, count|
-         next if completed_tables.include?(table_name.to_s)
-         h[table_name.to_s] = count
-       end
-       h
-     end
-
-     def record_count
-       tables_info.values.inject(:+)
-     end
-
-     def tables_info
-       opts[:tables_info] ||= fetch_tables_info
-     end
-
-     def fetch_tables_info
-       tables = db.send(:sort_dumped_tables, db.tables, {})
-
-       data = {}
-       apply_table_filter(tables).each do |table_name|
-         data[table_name] = db[table_name].count
-       end
-       data
-     end
-
-     def self.factory(db, state)
-       if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
-         Sequel::MySQL.convert_invalid_date_time = :nil
-       end
-
-       if state.has_key?(:klass)
-         return eval(state[:klass]).new(db, state)
-       end
-
-       if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
-         DataStreamKeyed.new(db, state)
-       else
-         DataStream.new(db, state)
-       end
-     end
-
-     def pull_indexes
-       log.info "Receiving indexes"
-
-       raw_idxs = Tapsoob::Schema.indexes_individual(database_url)
-       idxs = (raw_idxs && raw_idxs.length >= 2 ? JSON.parse(raw_idxs) : {})
-
-       # Calculate max title width for consistent alignment
-       filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
-       max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
-
-       filtered_idxs.each do |table, indexes|
-         progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
-         indexes.each do |idx|
-           output = Tapsoob::Utils.export_indexes(dump_path, table, idx)
-           puts output if dump_path.nil? && output
-           progress.inc(1)
-         end
-         progress.finish
-       end
-     end
-
-     def pull_reset_sequences
-       log.info "Resetting sequences"
-
-       output = Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
-       puts output if dump_path.nil? && output
-     end
-   end
-
-   class Push < Operation
-     def file_prefix
-       "push"
-     end
-
-     def to_hash
-       super.merge(:local_tables_info => local_tables_info)
-     end
-
-     def run
-       catch_errors do
-         unless resuming?
-           push_schema if schema?
-           push_indexes if indexes_first? && schema?
-         end
-         setup_signal_trap
-         push_partial_data if data? && resuming?
-         push_data if data?
-         push_indexes if !indexes_first? && schema?
-         push_reset_sequences
-       end
-     end
-
-     def push_indexes
-       idxs = {}
-       table_idxs = Dir.glob(File.join(dump_path, "indexes", "*.json")).map { |path| File.basename(path, '.json') }
-       table_idxs.each do |table_idx|
-         # Read NDJSON format - each line is a separate index
-         index_file = File.join(dump_path, "indexes", "#{table_idx}.json")
-         idxs[table_idx] = File.readlines(index_file).map { |line| JSON.parse(line.strip) }
-       end
-
-       return unless idxs.size > 0
-
-       log.info "Sending indexes"
-
-       # Calculate max title width for consistent alignment
-       filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
-       max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
-
-       filtered_idxs.each do |table, indexes|
-         progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
-         indexes.each do |idx|
-           Tapsoob::Utils.load_indexes(database_url, idx)
-           progress.inc(1)
-         end
-         progress.finish
-       end
-     end
-
-     def push_schema
-       log.info "Sending schema"
-
-       progress = ProgressBar.new('Schema', tables.size)
-       tables.each do |table, count|
-         log.debug "Loading '#{table}' schema\n"
-         # Reuse existing db connection for better performance
-         Tapsoob::Utils.load_schema(dump_path, db, table)
-         progress.inc(1)
-       end
-       progress.finish
-     end
-
-     def push_reset_sequences
-       log.info "Resetting sequences"
-
-       Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
-     end
-
-     def push_partial_data
-       return if stream_state == {}
-
-       table_name = stream_state[:table_name]
-       record_count = tables[table_name.to_s]
-       log.info "Resuming #{table_name}, #{format_number(record_count)} records"
-       stream = Tapsoob::DataStream.factory(db, stream_state)
-       chunksize = stream_state[:chunksize] || default_chunksize
-       estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
-       progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-       push_data_from_file(stream, progress)
-     end
-
-     def push_data
-       log.info "Sending data"
-
-       log.info "#{tables.size} tables, #{format_number(record_count)} records"
-
-       if parallel?
-         push_data_parallel
-       else
-         push_data_serial
-       end
-     end
-
-     def push_data_serial
-       tables.each do |table_name, count|
-         # Skip if data file doesn't exist or has no data
-         data_file = File.join(dump_path, "data", "#{table_name}.json")
-         next unless File.exist?(data_file) && count > 0
-         db[table_name.to_sym].truncate if @opts[:purge]
-         stream = Tapsoob::DataStream.factory(db, {
-           :table_name => table_name,
-           :chunksize => default_chunksize
-         }, {
-           :"skip-duplicates" => opts[:"skip-duplicates"] || false,
-           :"discard-identity" => opts[:"discard-identity"] || false,
-           :purge => opts[:purge] || false,
-           :debug => opts[:debug]
-         })
-         estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-         progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-         push_data_from_file(stream, progress)
-       end
-     end
-
-     def push_data_parallel
-       log.info "Using #{parallel_workers} parallel workers"
-
-       multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
-       table_queue = Queue.new
-
-       tables.each do |table_name, count|
-         data_file = File.join(dump_path, "data", "#{table_name}.json")
-         next unless File.exist?(data_file) && count > 0
-         table_queue << [table_name, count]
-       end
-
-       workers = (1..parallel_workers).map do
-         Thread.new do
-           loop do
-             break if table_queue.empty?
-
-             table_name, count = table_queue.pop(true) rescue break
-
-             # Each thread gets its own connection from the pool
-             db[table_name.to_sym].truncate if @opts[:purge]
-             stream = Tapsoob::DataStream.factory(db, {
-               :table_name => table_name,
-               :chunksize => default_chunksize
-             }, {
-               :"skip-duplicates" => opts[:"skip-duplicates"] || false,
-               :"discard-identity" => opts[:"discard-identity"] || false,
-               :purge => opts[:purge] || false,
-               :debug => opts[:debug]
-             })
-
-             estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-             progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
-
-             push_data_from_file(stream, progress)
-           end
-         end
-       end
-
-       workers.each(&:join)
-       multi_progress.stop if multi_progress
-     end
-
-     def push_data_from_file(stream, progress)
-       loop do
-         if exiting?
-           store_session
-           exit 0
-         end
-
-         row_size = 0
-         chunksize = stream.state[:chunksize]
-
-         begin
-           chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
-             stream.state[:chunksize] = c.to_i
-             encoded_data, row_size, elapsed_time = nil
-             d1 = c.time_delta do
-               encoded_data, row_size, elapsed_time = stream.fetch({ :type => "file", :source => dump_path })
-             end
-
-             data = nil
-             d2 = c.time_delta do
-               data = {
-                 :state => stream.to_hash,
-                 :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
-                 :encoded_data => encoded_data
-               }
-             end
-
-             stream.fetch_data_to_database(data)
-             log.debug "row size: #{row_size}"
-             self.stream_state = stream.to_hash
-
-             c.idle_secs = (d1 + d2)
-
-             elapsed_time
-           end
-         rescue Tapsoob::CorruptedData => e
-           # retry the same data, it got corrupted somehow.
-           next
-         rescue Tapsoob::DuplicatePrimaryKeyError => e
-           # verify the stream and retry it
-           stream.verify_stream
-           stream = JSON.generate({ :state => stream.to_hash })
-           next
-         end
-         stream.state[:chunksize] = chunksize
-
-         # Update progress bar by 1 chunk
-         progress.inc(1) if progress
-
-         break if stream.complete?
-       end
-
-       progress.finish if progress
-       add_completed_table(stream.table_name)
-       self.stream_state = {}
-     end
-
-     def local_tables_info
-       opts[:local_tables_info] ||= fetch_local_tables_info
-     end
-
-     def tables
-       h = {}
-       local_tables_info.each do |table_name, count|
-         next if completed_tables.include?(table_name.to_s)
-         h[table_name.to_s] = count
-       end
-       h
-     end
-
-     def record_count
-       @record_count ||= local_tables_info.values.inject(0) { |a,c| a += c }
-     end
-
-     def fetch_local_tables_info
-       tables_with_counts = {}
-       tbls = Dir.glob(File.join(dump_path, "schemas", "*")).map { |path| File.basename(path, ".rb") }
-       tbls.each do |table|
-         if File.exist?(File.join(dump_path, "data", "#{table}.json"))
-           # Read NDJSON format - each line is a separate JSON chunk
-           total_rows = 0
-           File.readlines(File.join(dump_path, "data", "#{table}.json")).each do |line|
-             chunk = JSON.parse(line.strip)
-             total_rows += chunk["data"].size if chunk["data"]
-           end
-           tables_with_counts[table] = total_rows
-         else
-           tables_with_counts[table] = 0
-         end
-       end
-       apply_table_filter(tables_with_counts)
-     end
+   module Operation
+     # Require all Operation classes
+     require 'tapsoob/operation/base'
+     require 'tapsoob/operation/pull'
+     require 'tapsoob/operation/push'
    end
  end
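
For orientation, a minimal usage sketch of the refactored layout follows. This is not code taken from the 0.7.0 sources: it assumes, based only on the new require paths above, that the former Tapsoob::Pull and Tapsoob::Push classes now live under the Tapsoob::Operation namespace (alongside a Base class) and keep the (database_url, dump_path, opts) constructor and #run entry point shown in the removed code. The option keys come from the removed code; the connection URL and dump path are placeholder values.

# Hypothetical example, assuming the 0.7.0 layout implied by the new requires above.
require 'tapsoob/operation'

opts = {
  data: true,              # dump table rows      (see data? in the removed code)
  schema: true,            # dump table schemas   (see schema?)
  progress: true,          # show progress bars   (see opts[:progress])
  parallel: 4,             # worker threads       (see parallel_workers)
  default_chunksize: 1000  # rows per chunk       (see default_chunksize)
}

# 0.6.1 (removed above): Tapsoob::Pull.new(database_url, dump_path, opts).run
# 0.7.0 (assumed):       Tapsoob::Operation::Pull.new(database_url, dump_path, opts).run
Tapsoob::Operation::Pull.new("postgres://localhost/app_db", "./dump", opts).run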