tapsoob 0.6.2-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +18 -2
- data/lib/tapsoob/cli/data_stream.rb +3 -3
- data/lib/tapsoob/cli/root.rb +2 -3
- data/lib/tapsoob/data_stream/base.rb +315 -0
- data/lib/tapsoob/data_stream/file_partition.rb +87 -0
- data/lib/tapsoob/data_stream/interleaved.rb +80 -0
- data/lib/tapsoob/data_stream/keyed.rb +124 -0
- data/lib/tapsoob/data_stream/keyed_partition.rb +64 -0
- data/lib/tapsoob/data_stream.rb +7 -378
- data/lib/tapsoob/operation/base.rb +240 -0
- data/lib/tapsoob/operation/pull.rb +419 -0
- data/lib/tapsoob/operation/push.rb +446 -0
- data/lib/tapsoob/operation.rb +5 -664
- data/lib/tapsoob/progress/bar.rb +0 -4
- data/lib/tapsoob/progress/multi_bar.rb +90 -58
- data/lib/tapsoob/progress/thread_safe_bar.rb +0 -3
- data/lib/tapsoob/progress_event.rb +109 -0
- data/lib/tapsoob/version.rb +1 -1
- data/lib/tasks/tapsoob.rake +2 -2
- metadata +11 -2
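The headline change is that the two largest files become thin loaders for new namespaced sub-files. For operation.rb this is fully visible in the hunk below: after the change the whole file reduces to the loader sketched here (reconstructed from the context and added lines of that hunk). data_stream.rb presumably follows the same pattern with the five data_stream/* files listed above, though its hunk is not shown in this section.

```ruby
# -*- encoding : utf-8 -*-
# data/lib/tapsoob/operation.rb as of 0.7.0 (reconstructed from the hunk below)
module Tapsoob
  module Operation
    # Require all Operation classes
    require 'tapsoob/operation/base'
    require 'tapsoob/operation/pull'
    require 'tapsoob/operation/push'
  end
end
```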
data/lib/tapsoob/operation.rb
CHANGED
@@ -1,669 +1,10 @@
 # -*- encoding : utf-8 -*-
-require 'sequel'
-require 'thread'
-
-require 'tapsoob/data_stream'
-require 'tapsoob/log'
-require 'tapsoob/progress'
-require 'tapsoob/schema'
 
 module Tapsoob
-
-
-
-
-
-      @dump_path = dump_path
-      @opts = opts
-      @exiting = false
-    end
-
-    def file_prefix
-      "op"
-    end
-
-    def data?
-      opts[:data]
-    end
-
-    def schema?
-      opts[:schema]
-    end
-
-    def indexes_first?
-      !!opts[:indexes_first]
-    end
-
-    def table_filter
-      opts[:tables] || []
-    end
-
-    def exclude_tables
-      opts[:exclude_tables] || []
-    end
-
-    def apply_table_filter(tables)
-      return tables if table_filter.empty? && exclude_tables.empty?
-
-      if tables.kind_of?(Hash)
-        ntables = {}
-        tables.each do |t, d|
-          if !exclude_tables.include?(t.to_s) && (!table_filter.empty? && table_filter.include?(t.to_s))
-            ntables[t] = d
-          end
-        end
-        ntables
-      else
-        tables.reject { |t| exclude_tables.include?(t.to_s) }.select { |t| table_filter.include?(t.to_s) }
-      end
-    end
-
-    def log
-      Tapsoob.log.level = Logger::DEBUG if opts[:debug]
-      Tapsoob.log
-    end
-
-    def store_session
-      file = "#{file_prefix}_#{Time.now.strftime("%Y%m%d%H%M")}.dat"
-      log.info "\nSaving session to #{file}..."
-      File.open(file, 'w') do |f|
-        f.write(JSON.generate(to_hash))
-      end
-    end
-
-    def to_hash
-      {
-        :klass => self.class.to_s,
-        :database_url => database_url,
-        :stream_state => stream_state,
-        :completed_tables => completed_tables,
-        :table_filter => table_filter,
-      }
-    end
-
-    def exiting?
-      !!@exiting
-    end
-
-    def setup_signal_trap
-      trap("INT") {
-        puts "\nCompleting current action..."
-        @exiting = true
-      }
-
-      trap("TERM") {
-        puts "\nCompleting current action..."
-        @exiting = true
-      }
-    end
-
-    def resuming?
-      opts[:resume] == true
-    end
-
-    def default_chunksize
-      opts[:default_chunksize]
-    end
-
-    def completed_tables
-      opts[:completed_tables] ||= []
-    end
-
-    def stream_state
-      opts[:stream_state] ||= {}
-    end
-
-    def stream_state=(val)
-      opts[:stream_state] = val
-    end
-
-    def db
-      @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
-      @db.extension :schema_dumper
-      @db.loggers << Tapsoob.log if opts[:debug]
-
-      # Set parameters
-      if @db.uri =~ /oracle/i
-        @db << "ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD HH24:MI:SS'"
-        @db << "ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD HH24:MI:SS:FF6'"
-      end
-
-      @db
-    end
-
-    def parallel?
-      parallel_workers > 1
-    end
-
-    def parallel_workers
-      @parallel_workers ||= [opts[:parallel].to_i, 1].max
-    end
-
-    def completed_tables_mutex
-      @completed_tables_mutex ||= Mutex.new
-    end
-
-    def add_completed_table(table_name)
-      completed_tables_mutex.synchronize do
-        completed_tables << table_name.to_s
-      end
-    end
-
-    def format_number(num)
-      num.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
-    end
-
-    def catch_errors(&blk)
-      begin
-        blk.call
-      rescue Exception => e
-        raise e
-      end
-    end
-
-    def self.factory(type, database_url, dump_path, opts)
-      type = :resume if opts[:resume]
-      klass = case type
-        when :pull then Tapsoob::Pull
-        when :push then Tapsoob::Push
-        when :resume then eval(opts[:klass])
-        else raise "Unknown Operation Type -> #{type}"
-      end
-
-      klass.new(database_url, dump_path, opts)
-    end
-  end
-
-  class Pull < Operation
-    def file_prefix
-      "pull"
-    end
-
-    def to_hash
-      super.merge(:remote_tables_info => remote_tables_info)
-    end
-
-    def run
-      catch_errors do
-        unless resuming?
-          pull_schema if schema?
-          pull_indexes if indexes_first? && schema?
-        end
-        setup_signal_trap
-        pull_partial_data if data? && resuming?
-        pull_data if data?
-        pull_indexes if !indexes_first? && schema?
-        pull_reset_sequences
-      end
-    end
-
-    def pull_schema
-      log.info "Receiving schema"
-
-      progress = ProgressBar.new('Schema', tables.size)
-      tables.each do |table_name, count|
-        # Reuse existing db connection for better performance
-        schema_data = Tapsoob::Schema.dump_table(db, table_name, @opts.slice(:indexes, :same_db))
-        log.debug "Table: #{table_name}\n#{schema_data}\n"
-        output = Tapsoob::Utils.export_schema(dump_path, table_name, schema_data)
-        puts output if dump_path.nil? && output
-        progress.inc(1)
-      end
-      progress.finish
-    end
-
-    def pull_data
-      log.info "Receiving data"
-
-      log.info "#{tables.size} tables, #{format_number(record_count)} records"
-
-      if parallel?
-        pull_data_parallel
-      else
-        pull_data_serial
-      end
-    end
-
-    def pull_data_serial
-      tables.each do |table_name, count|
-        stream = Tapsoob::DataStream.factory(db, {
-          :chunksize => default_chunksize,
-          :table_name => table_name
-        }, { :debug => opts[:debug] })
-        estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-        progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-        pull_data_from_table(stream, progress)
-      end
-    end
-
-    def pull_data_parallel
-      log.info "Using #{parallel_workers} parallel workers"
-
-      multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
-      table_queue = Queue.new
-      tables.each { |table_name, count| table_queue << [table_name, count] }
-
-      workers = (1..parallel_workers).map do
-        Thread.new do
-          loop do
-            break if table_queue.empty?
-
-            table_name, count = table_queue.pop(true) rescue break
-
-            # Each thread gets its own connection from the pool
-            stream = Tapsoob::DataStream.factory(db, {
-              :chunksize => default_chunksize,
-              :table_name => table_name
-            }, { :debug => opts[:debug] })
-
-            estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-            progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
-
-            pull_data_from_table(stream, progress)
-          end
-        end
-      end
-
-      workers.each(&:join)
-      multi_progress.stop if multi_progress
-    end
-
-    def pull_partial_data
-      return if stream_state == {}
-
-      table_name = stream_state[:table_name]
-      record_count = tables[table_name.to_s]
-      log.info "Resuming #{table_name}, #{format_number(record_count)} records"
-
-      stream = Tapsoob::DataStream.factory(db, stream_state)
-      chunksize = stream_state[:chunksize] || default_chunksize
-      estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
-      progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-      pull_data_from_table(stream, progress)
-    end
-
-    def pull_data_from_table(stream, progress)
-      loop do
-        if exiting?
-          store_session
-          exit 0
-        end
-
-        row_size = 0
-        chunksize = stream.state[:chunksize]
-
-        begin
-          chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
-            stream.state[:chunksize] = c.to_i
-            encoded_data, row_size, elapsed_time = nil
-            d1 = c.time_delta do
-              encoded_data, row_size, elapsed_time = stream.fetch
-            end
-
-            data = nil
-            d2 = c.time_delta do
-              data = {
-                :state => stream.to_hash,
-                :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
-                :encoded_data => encoded_data
-              }
-            end
-
-            stream.fetch_data_from_database(data) do |rows|
-              next if rows == {}
-
-              # Update progress bar by 1 chunk
-              progress.inc(1) if progress
-
-              if dump_path.nil?
-                puts JSON.generate(rows)
-              else
-                Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
-              end
-            end
-            log.debug "row size: #{row_size}"
-            stream.error = false
-            self.stream_state = stream.to_hash
-
-            c.idle_secs = (d1 + d2)
-
-            elapsed_time
-          end
-        rescue Tapsoob::CorruptedData => e
-          log.info "Corrupted Data Received #{e.message}, retrying..."
-          stream.error = true
-          next
-        end
-
-        break if stream.complete?
-      end
-
-      progress.finish if progress
-      add_completed_table(stream.table_name)
-      self.stream_state = {}
-    end
-
-    def tables
-      h = {}
-      tables_info.each do |table_name, count|
-        next if completed_tables.include?(table_name.to_s)
-        h[table_name.to_s] = count
-      end
-      h
-    end
-
-    def record_count
-      tables_info.values.inject(:+)
-    end
-
-    def tables_info
-      opts[:tables_info] ||= fetch_tables_info
-    end
-
-    def fetch_tables_info
-      tables = db.send(:sort_dumped_tables, db.tables, {})
-
-      data = {}
-      apply_table_filter(tables).each do |table_name|
-        data[table_name] = db[table_name].count
-      end
-      data
-    end
-
-    def self.factory(db, state)
-      if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
-        Sequel::MySQL.convert_invalid_date_time = :nil
-      end
-
-      if state.has_key?(:klass)
-        return eval(state[:klass]).new(db, state)
-      end
-
-      if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
-        DataStreamKeyed.new(db, state)
-      else
-        DataStream.new(db, state)
-      end
-    end
-
-    def pull_indexes
-      log.info "Receiving indexes"
-
-      raw_idxs = Tapsoob::Schema.indexes_individual(database_url)
-      idxs = (raw_idxs && raw_idxs.length >= 2 ? JSON.parse(raw_idxs) : {})
-
-      # Calculate max title width for consistent alignment
-      filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
-      max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
-
-      filtered_idxs.each do |table, indexes|
-        progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
-        indexes.each do |idx|
-          output = Tapsoob::Utils.export_indexes(dump_path, table, idx)
-          puts output if dump_path.nil? && output
-          progress.inc(1)
-        end
-        progress.finish
-      end
-    end
-
-    def pull_reset_sequences
-      log.info "Resetting sequences"
-
-      output = Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
-      puts output if dump_path.nil? && output
-    end
-  end
-
-  class Push < Operation
-    def file_prefix
-      "push"
-    end
-
-    def to_hash
-      super.merge(:local_tables_info => local_tables_info)
-    end
-
-    def run
-      catch_errors do
-        unless resuming?
-          push_schema if schema?
-          push_indexes if indexes_first? && schema?
-        end
-        setup_signal_trap
-        push_partial_data if data? && resuming?
-        push_data if data?
-        push_indexes if !indexes_first? && schema?
-        push_reset_sequences
-      end
-    end
-
-    def push_indexes
-      idxs = {}
-      table_idxs = Dir.glob(File.join(dump_path, "indexes", "*.json")).map { |path| File.basename(path, '.json') }
-      table_idxs.each do |table_idx|
-        # Read NDJSON format - each line is a separate index
-        index_file = File.join(dump_path, "indexes", "#{table_idx}.json")
-        idxs[table_idx] = File.readlines(index_file).map { |line| JSON.parse(line.strip) }
-      end
-
-      return unless idxs.size > 0
-
-      log.info "Sending indexes"
-
-      # Calculate max title width for consistent alignment
-      filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
-      max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
-
-      filtered_idxs.each do |table, indexes|
-        progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
-        indexes.each do |idx|
-          Tapsoob::Utils.load_indexes(database_url, idx)
-          progress.inc(1)
-        end
-        progress.finish
-      end
-    end
-
-    def push_schema
-      log.info "Sending schema"
-
-      progress = ProgressBar.new('Schema', tables.size)
-      tables.each do |table, count|
-        log.debug "Loading '#{table}' schema\n"
-        # Reuse existing db connection for better performance
-        Tapsoob::Utils.load_schema(dump_path, db, table)
-        progress.inc(1)
-      end
-      progress.finish
-    end
-
-    def push_reset_sequences
-      log.info "Resetting sequences"
-
-      Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
-    end
-
-    def push_partial_data
-      return if stream_state == {}
-
-      table_name = stream_state[:table_name]
-      record_count = tables[table_name.to_s]
-      log.info "Resuming #{table_name}, #{format_number(record_count)} records"
-      stream = Tapsoob::DataStream.factory(db, stream_state)
-      chunksize = stream_state[:chunksize] || default_chunksize
-      estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
-      progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-      push_data_from_file(stream, progress)
-    end
-
-    def push_data
-      log.info "Sending data"
-
-      log.info "#{tables.size} tables, #{format_number(record_count)} records"
-
-      if parallel?
-        push_data_parallel
-      else
-        push_data_serial
-      end
-    end
-
-    def push_data_serial
-      tables.each do |table_name, count|
-        # Skip if data file doesn't exist or has no data
-        data_file = File.join(dump_path, "data", "#{table_name}.json")
-        next unless File.exist?(data_file) && count > 0
-        db[table_name.to_sym].truncate if @opts[:purge]
-        stream = Tapsoob::DataStream.factory(db, {
-          :table_name => table_name,
-          :chunksize => default_chunksize
-        }, {
-          :"skip-duplicates" => opts[:"skip-duplicates"] || false,
-          :"discard-identity" => opts[:"discard-identity"] || false,
-          :purge => opts[:purge] || false,
-          :debug => opts[:debug]
-        })
-        estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-        progress = (opts[:progress] ? ProgressBar.new(table_name.to_s, estimated_chunks) : nil)
-        push_data_from_file(stream, progress)
-      end
-    end
-
-    def push_data_parallel
-      log.info "Using #{parallel_workers} parallel workers"
-
-      multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
-      table_queue = Queue.new
-
-      tables.each do |table_name, count|
-        data_file = File.join(dump_path, "data", "#{table_name}.json")
-        next unless File.exist?(data_file) && count > 0
-        table_queue << [table_name, count]
-      end
-
-      workers = (1..parallel_workers).map do
-        Thread.new do
-          loop do
-            break if table_queue.empty?
-
-            table_name, count = table_queue.pop(true) rescue break
-
-            # Each thread gets its own connection from the pool
-            db[table_name.to_sym].truncate if @opts[:purge]
-            stream = Tapsoob::DataStream.factory(db, {
-              :table_name => table_name,
-              :chunksize => default_chunksize
-            }, {
-              :"skip-duplicates" => opts[:"skip-duplicates"] || false,
-              :"discard-identity" => opts[:"discard-identity"] || false,
-              :purge => opts[:purge] || false,
-              :debug => opts[:debug]
-            })
-
-            estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
-            progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
-
-            push_data_from_file(stream, progress)
-          end
-        end
-      end
-
-      workers.each(&:join)
-      multi_progress.stop if multi_progress
-    end
-
-    def push_data_from_file(stream, progress)
-      loop do
-        if exiting?
-          store_session
-          exit 0
-        end
-
-        row_size = 0
-        chunksize = stream.state[:chunksize]
-
-        begin
-          chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
-            stream.state[:chunksize] = c.to_i
-            encoded_data, row_size, elapsed_time = nil
-            d1 = c.time_delta do
-              encoded_data, row_size, elapsed_time = stream.fetch({ :type => "file", :source => dump_path })
-            end
-
-            data = nil
-            d2 = c.time_delta do
-              data = {
-                :state => stream.to_hash,
-                :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
-                :encoded_data => encoded_data
-              }
-            end
-
-            stream.fetch_data_to_database(data)
-            log.debug "row size: #{row_size}"
-            self.stream_state = stream.to_hash
-
-            c.idle_secs = (d1 + d2)
-
-            elapsed_time
-          end
-        rescue Tapsoob::CorruptedData => e
-          # retry the same data, it got corrupted somehow.
-          next
-        rescue Tapsoob::DuplicatePrimaryKeyError => e
-          # verify the stream and retry it
-          stream.verify_stream
-          stream = JSON.generate({ :state => stream.to_hash })
-          next
-        end
-        stream.state[:chunksize] = chunksize
-
-        # Update progress bar by 1 chunk
-        progress.inc(1) if progress
-
-        break if stream.complete?
-      end
-
-      progress.finish if progress
-      add_completed_table(stream.table_name)
-      self.stream_state = {}
-    end
-
-    def local_tables_info
-      opts[:local_tables_info] ||= fetch_local_tables_info
-    end
-
-    def tables
-      h = {}
-      local_tables_info.each do |table_name, count|
-        next if completed_tables.include?(table_name.to_s)
-        h[table_name.to_s] = count
-      end
-      h
-    end
-
-    def record_count
-      @record_count ||= local_tables_info.values.inject(0) { |a,c| a += c }
-    end
-
-    def fetch_local_tables_info
-      tables_with_counts = {}
-      tbls = Dir.glob(File.join(dump_path, "schemas", "*")).map { |path| File.basename(path, ".rb") }
-      tbls.each do |table|
-        if File.exist?(File.join(dump_path, "data", "#{table}.json"))
-          # Read NDJSON format - each line is a separate JSON chunk
-          total_rows = 0
-          File.readlines(File.join(dump_path, "data", "#{table}.json")).each do |line|
-            chunk = JSON.parse(line.strip)
-            total_rows += chunk["data"].size if chunk["data"]
-          end
-          tables_with_counts[table] = total_rows
-        else
-          tables_with_counts[table] = 0
-        end
-      end
-      apply_table_filter(tables_with_counts)
-    end
+  module Operation
+    # Require all Operation classes
+    require 'tapsoob/operation/base'
+    require 'tapsoob/operation/pull'
+    require 'tapsoob/operation/push'
   end
 end
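For reference, the hunk above removes the monolithic Operation/Pull/Push classes; the sketch below shows how the removed 0.6.2 entry point was invoked, using only the factory signature and option keys visible in the deleted code. The connection URL, dump path, and option values are illustrative placeholders, and whether 0.7.0 keeps this exact entry point is not visible from this hunk (only the new requires of operation/base, operation/pull, and operation/push are).

```ruby
# Usage sketch against the 0.6.2 API removed above; values are placeholders.
require 'tapsoob/operation'

op = Tapsoob::Operation.factory(
  :pull,                                   # :pull or :push (:resume is selected automatically when opts[:resume] is set)
  "postgres://user:secret@localhost/app",  # source (pull) or target (push) database URL
  "./dump",                                # dump directory on disk
  {
    :schema            => true,   # dump/load schema files
    :data              => true,   # dump/load table data
    :progress          => true,   # show per-table progress bars
    :parallel          => 4,      # values > 1 enable the threaded pull/push paths
    :default_chunksize => 1000
  }
)
op.run
```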