tapsoob 0.6.2-java → 0.7.0-java
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +18 -2
- data/lib/tapsoob/cli/data_stream.rb +3 -3
- data/lib/tapsoob/cli/root.rb +2 -3
- data/lib/tapsoob/data_stream/base.rb +315 -0
- data/lib/tapsoob/data_stream/file_partition.rb +87 -0
- data/lib/tapsoob/data_stream/interleaved.rb +80 -0
- data/lib/tapsoob/data_stream/keyed.rb +124 -0
- data/lib/tapsoob/data_stream/keyed_partition.rb +64 -0
- data/lib/tapsoob/data_stream.rb +7 -378
- data/lib/tapsoob/operation/base.rb +240 -0
- data/lib/tapsoob/operation/pull.rb +419 -0
- data/lib/tapsoob/operation/push.rb +446 -0
- data/lib/tapsoob/operation.rb +5 -664
- data/lib/tapsoob/progress/bar.rb +0 -4
- data/lib/tapsoob/progress/multi_bar.rb +90 -58
- data/lib/tapsoob/progress/thread_safe_bar.rb +0 -3
- data/lib/tapsoob/progress_event.rb +109 -0
- data/lib/tapsoob/version.rb +1 -1
- data/lib/tasks/tapsoob.rake +2 -2
- metadata +11 -2
data/lib/tapsoob/operation/base.rb

@@ -0,0 +1,240 @@
+# -*- encoding : utf-8 -*-
+require 'sequel'
+require 'thread'
+require 'etc'
+
+require 'tapsoob/data_stream'
+require 'tapsoob/log'
+require 'tapsoob/progress'
+require 'tapsoob/progress_event'
+require 'tapsoob/schema'
+
+module Tapsoob
+  module Operation
+    class Base
+      attr_reader :database_url, :dump_path, :opts
+
+      def initialize(database_url, dump_path = nil, opts={})
+        @database_url = database_url
+        @dump_path = dump_path
+        @opts = opts
+        @exiting = false
+
+        # Enable JSON progress events only when:
+        # 1. CLI progress bars are disabled (--progress=false), AND
+        # 2. Not piping (dump_path is provided)
+        # This prevents STDERR noise when piping and when using visual progress bars
+        Tapsoob::ProgressEvent.enabled = !opts[:progress] && !dump_path.nil?
+      end
+
+      def file_prefix
+        "op"
+      end
+
+      def data?
+        opts[:data]
+      end
+
+      def schema?
+        opts[:schema]
+      end
+
+      def indexes_first?
+        !!opts[:indexes_first]
+      end
+
+      def table_filter
+        opts[:tables] || []
+      end
+
+      def exclude_tables
+        opts[:exclude_tables] || []
+      end
+
+      def apply_table_filter(tables)
+        return tables if table_filter.empty? && exclude_tables.empty?
+
+        if tables.kind_of?(Hash)
+          ntables = {}
+          tables.each do |t, d|
+            if !exclude_tables.include?(t.to_s) && (!table_filter.empty? && table_filter.include?(t.to_s))
+              ntables[t] = d
+            end
+          end
+          ntables
+        else
+          tables.reject { |t| exclude_tables.include?(t.to_s) }.select { |t| table_filter.include?(t.to_s) }
+        end
+      end
+
+      def log
+        Tapsoob.log.level = Logger::DEBUG if opts[:debug]
+        Tapsoob.log
+      end
+
+      def store_session
+        file = "#{file_prefix}_#{Time.now.strftime("%Y%m%d%H%M")}.dat"
+        log.info "\nSaving session to #{file}..."
+        File.open(file, 'w') do |f|
+          f.write(JSON.generate(to_hash))
+        end
+      end
+
+      def to_hash
+        {
+          :klass => self.class.to_s,
+          :database_url => database_url,
+          :stream_state => stream_state,
+          :completed_tables => completed_tables,
+          :table_filter => table_filter,
+        }
+      end
+
+      def exiting?
+        !!@exiting
+      end
+
+      def setup_signal_trap
+        trap("INT") {
+          puts "\nCompleting current action..."
+          @exiting = true
+        }
+
+        trap("TERM") {
+          puts "\nCompleting current action..."
+          @exiting = true
+        }
+      end
+
+      def resuming?
+        opts[:resume] == true
+      end
+
+      def default_chunksize
+        opts[:default_chunksize]
+      end
+
+      def completed_tables
+        opts[:completed_tables] ||= []
+      end
+
+      def stream_state
+        opts[:stream_state] ||= {}
+      end
+
+      def stream_state=(val)
+        opts[:stream_state] = val
+      end
+
+      def db
+        @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
+        @db.extension :schema_dumper
+        @db.loggers << Tapsoob.log if opts[:debug]
+
+        # Set parameters
+        if @db.uri =~ /oracle/i
+          @db << "ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD HH24:MI:SS'"
+          @db << "ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD HH24:MI:SS:FF6'"
+        end
+
+        @db
+      end
+
+      def parallel?
+        parallel_workers > 1
+      end
+
+      def parallel_workers
+        @parallel_workers ||= [opts[:parallel].to_i, 1].max
+      end
+
+      # Auto-detect number of workers for intra-table parallelization
+      def table_parallel_workers(table_name, row_count)
+        # Disable intra-table parallelization when piping to STDOUT
+        # (no dump_path means we're outputting JSON directly, which can't be safely parallelized)
+        return 1 if dump_path.nil?
+
+        # TEMPORARILY RE-ENABLED for debugging
+        # return 1 if self.is_a?(Tapsoob::Operation::Push)
+
+        # Minimum threshold for parallelization (100K rows by default)
+        threshold = 100_000
+        return 1 if row_count < threshold
+
+        # Detect available CPU cores
+        available_cpus = Etc.nprocessors rescue 4
+
+        # Use up to 50% of CPUs for single table, max 8 workers
+        max_workers = [available_cpus / 2, 8, 2].max
+
+        # Scale based on table size
+        if row_count >= 5_000_000
+          max_workers
+        elsif row_count >= 1_000_000
+          [max_workers / 2, 2].max
+        elsif row_count >= 500_000
+          [max_workers / 4, 2].max
+        else
+          2 # Minimum 2 workers for tables over threshold
+        end
+      end
+
+      # Check if table can use efficient PK-based partitioning
+      def can_use_pk_partitioning?(table_name)
+        Tapsoob::Utils.single_integer_primary_key(db, table_name.to_sym)
+      end
+
+      def completed_tables_mutex
+        @completed_tables_mutex ||= Mutex.new
+      end
+
+      def add_completed_table(table_name)
+        completed_tables_mutex.synchronize do
+          completed_tables << table_name.to_s
+        end
+      end
+
+      def format_number(num)
+        num.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
+      end
+
+      def save_table_order(table_names)
+        return unless dump_path
+
+        metadata_file = File.join(dump_path, "table_order.txt")
+        File.open(metadata_file, 'w') do |file|
+          table_names.each { |table| file.puts(table) }
+        end
+      end
+
+      def load_table_order
+        return nil unless dump_path
+
+        metadata_file = File.join(dump_path, "table_order.txt")
+        return nil unless File.exist?(metadata_file)
+
+        File.readlines(metadata_file).map(&:strip).reject(&:empty?)
+      end
+
+      def catch_errors(&blk)
+        begin
+          blk.call
+        rescue Exception => e
+          raise e
+        end
+      end
+
+      def self.factory(type, database_url, dump_path, opts)
+        type = :resume if opts[:resume]
+        klass = case type
+          when :pull then Tapsoob::Operation::Pull
+          when :push then Tapsoob::Operation::Push
+          when :resume then eval(opts[:klass])
+          else raise "Unknown Operation Type -> #{type}"
+        end
+
+        klass.new(database_url, dump_path, opts)
+      end
+    end
+  end
+end
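For orientation only (this note and the snippet below are not part of the diff): based on the `self.factory` defined at the end of the hunk above and the option keys the `Base` accessors read (`:data`, `:schema`, `:progress`, `:parallel`, `:default_chunksize`, `:debug`), a pull would presumably be constructed and run roughly as follows. The connection URL and dump path are placeholders, and `Pull#run` appears in the next hunk.

    require 'tapsoob/operation'

    # Hypothetical driver sketch, not shipped code.
    op = Tapsoob::Operation::Base.factory(:pull, "postgres://user:pass@localhost/mydb", "./dump", {
      :data => true, :schema => true,
      :progress => false,       # no bars; JSON progress events are emitted when a dump_path is set
      :parallel => 4,           # table-level workers (parallel? is true when > 1)
      :default_chunksize => 1000,
      :debug => false
    })
    op.run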
data/lib/tapsoob/operation/pull.rb

@@ -0,0 +1,419 @@
+# -*- encoding : utf-8 -*-
+require 'fileutils'
+require 'tapsoob/operation/base'
+require 'tapsoob/progress_event'
+
+module Tapsoob
+  module Operation
+    class Pull < Base
+      def file_prefix
+        "pull"
+      end
+
+      def to_hash
+        super.merge(:remote_tables_info => remote_tables_info)
+      end
+
+      def run
+        catch_errors do
+          unless resuming?
+            initialize_dump_directory if dump_path
+            pull_schema if schema?
+            pull_indexes if indexes_first? && schema?
+          end
+          setup_signal_trap
+          pull_partial_data if data? && resuming?
+          pull_data if data?
+          pull_indexes if !indexes_first? && schema?
+          pull_reset_sequences
+        end
+      end
+
+      def initialize_dump_directory
+        %w[data schemas indexes].each do |subdir|
+          dir_path = File.join(dump_path, subdir)
+          FileUtils.rm_rf(dir_path)
+          FileUtils.mkdir_p(dir_path)
+        end
+
+        FileUtils.rm_f(File.join(dump_path, "table_order.txt"))
+      end
+
+      def pull_schema
+        log.info "Receiving schema"
+        Tapsoob::ProgressEvent.schema_start(tables.size)
+
+        progress = opts[:progress] ? Tapsoob::Progress::Bar.new('Schema', tables.size) : nil
+        tables.each do |table_name, count|
+          # Reuse existing db connection for better performance
+          schema_data = Tapsoob::Schema.dump_table(db, table_name, @opts.slice(:indexes, :same_db))
+          log.debug "Table: #{table_name}\n#{schema_data}\n"
+          output = Tapsoob::Utils.export_schema(dump_path, table_name, schema_data)
+          puts output if dump_path.nil? && output
+          progress.inc(1) if progress
+        end
+        progress.finish if progress
+        Tapsoob::ProgressEvent.schema_complete(tables.size)
+
+        # Save table order for dependency-aware schema loading during push
+        save_table_order(tables.keys) if dump_path
+      end
+
+      def pull_data
+        log.info "Receiving data"
+
+        log.info "#{tables.size} tables, #{format_number(record_count)} records"
+        Tapsoob::ProgressEvent.data_start(tables.size, record_count)
+
+        if parallel?
+          pull_data_parallel
+        else
+          pull_data_serial
+        end
+
+        Tapsoob::ProgressEvent.data_complete(tables.size, record_count)
+      end
+
+      def pull_data_serial
+        tables.each do |table_name, count|
+          # Auto-detect if we should use intra-table parallelization
+          table_workers = table_parallel_workers(table_name, count)
+
+          if table_workers > 1
+            log.info "Table #{table_name}: using #{table_workers} workers for #{format_number(count)} records"
+            Tapsoob::ProgressEvent.table_start(table_name, count, workers: table_workers)
+            pull_data_from_table_parallel(table_name, count, table_workers)
+          else
+            Tapsoob::ProgressEvent.table_start(table_name, count)
+            stream = Tapsoob::DataStream::Base.factory(db, {
+              :chunksize => default_chunksize,
+              :table_name => table_name
+            }, { :debug => opts[:debug] })
+            estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+            progress = (opts[:progress] ? Tapsoob::Progress::Bar.new(table_name.to_s, estimated_chunks) : nil)
+            pull_data_from_table(stream, progress, count)
+          end
+        end
+      end
+
+      def pull_data_parallel
+        log.info "Using #{parallel_workers} parallel workers for table-level parallelization"
+
+        # Reserve space for both table-level and intra-table workers
+        # With 4 table workers and potentially 8 intra-table workers per table,
+        # we could have many concurrent progress bars. Show up to 8 at once.
+        max_visible_bars = 8
+        multi_progress = opts[:progress] ? Tapsoob::Progress::MultiBar.new(max_visible_bars) : nil
+        table_queue = Queue.new
+        tables.each { |table_name, count| table_queue << [table_name, count] }
+
+        workers = (1..parallel_workers).map do
+          Thread.new do
+            loop do
+              break if table_queue.empty?
+
+              table_name, count = table_queue.pop(true) rescue break
+
+              # Check if this table should use intra-table parallelization
+              table_workers = table_parallel_workers(table_name, count)
+
+              if table_workers > 1
+                # Large table - use intra-table parallelization
+                info_msg = "Table #{table_name}: using #{table_workers} workers for #{format_number(count)} records"
+                if multi_progress
+                  multi_progress.set_info(info_msg)
+                else
+                  log.info info_msg
+                end
+
+                Tapsoob::ProgressEvent.table_start(table_name, count, workers: table_workers)
+                # Run intra-table parallelization, passing parent progress bar
+                pull_data_from_table_parallel(table_name, count, table_workers, multi_progress)
+              else
+                # Small table - use single-threaded processing
+                Tapsoob::ProgressEvent.table_start(table_name, count)
+                stream = Tapsoob::DataStream::Base.factory(db, {
+                  :chunksize => default_chunksize,
+                  :table_name => table_name
+                }, { :debug => opts[:debug] })
+
+                estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
+                progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+
+                pull_data_from_table(stream, progress, count)
+              end
+            end
+          end
+        end
+
+        workers.each(&:join)
+        multi_progress.stop if multi_progress
+      end
+
+      def pull_partial_data
+        return if stream_state == {}
+
+        table_name = stream_state[:table_name]
+        record_count = tables[table_name.to_s]
+        log.info "Resuming #{table_name}, #{format_number(record_count)} records"
+
+        stream = Tapsoob::DataStream::Base.factory(db, stream_state)
+        chunksize = stream_state[:chunksize] || default_chunksize
+        estimated_chunks = [(record_count.to_f / chunksize).ceil, 1].max
+        progress = (opts[:progress] ? Tapsoob::Progress::Bar.new(table_name.to_s, estimated_chunks) : nil)
+        pull_data_from_table(stream, progress)
+      end
+
+      def pull_data_from_table(stream, progress, total_records = nil)
+        records_processed = 0
+
+        loop do
+          if exiting?
+            store_session
+            exit 0
+          end
+
+          row_size = 0
+          chunksize = stream.state[:chunksize]
+
+          begin
+            chunksize = Tapsoob::Utils.calculate_chunksize(chunksize) do |c|
+              stream.state[:chunksize] = c.to_i
+              encoded_data, row_size, elapsed_time = nil
+              d1 = c.time_delta do
+                encoded_data, row_size, elapsed_time = stream.fetch
+              end
+
+              data = nil
+              d2 = c.time_delta do
+                data = {
+                  :state => stream.to_hash,
+                  :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
+                  :encoded_data => encoded_data
+                }
+              end
+
+              stream.fetch_data_from_database(data) do |rows|
+                next if rows == {}
+
+                # Update progress bar by 1 chunk
+                progress.inc(1) if progress
+
+                # Track records processed for progress events
+                if rows[:data]
+                  records_processed += rows[:data].size
+                  Tapsoob::ProgressEvent.table_progress(stream.table_name, records_processed, total_records) if total_records
+                end
+
+                if dump_path.nil?
+                  puts JSON.generate(rows)
+                else
+                  Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
+                end
+              end
+              log.debug "row size: #{row_size}"
+              stream.error = false
+              self.stream_state = stream.to_hash
+
+              c.idle_secs = (d1 + d2)
+
+              elapsed_time
+            end
+          rescue Tapsoob::CorruptedData => e
+            log.info "Corrupted Data Received #{e.message}, retrying..."
+            stream.error = true
+            next
+          end
+
+          break if stream.complete?
+        end
+
+        progress.finish if progress
+        add_completed_table(stream.table_name)
+        self.stream_state = {}
+
+        # Emit final table complete event
+        Tapsoob::ProgressEvent.table_complete(stream.table_name, records_processed)
+      end
+
+      def pull_data_from_table_parallel(table_name, row_count, num_workers, parent_progress = nil)
+        # Mutex for coordinating file writes and progress tracking
+        write_mutex = Mutex.new
+        records_processed = 0
+
+        begin
+          # Determine partitioning strategy
+          use_pk_partitioning = can_use_pk_partitioning?(table_name)
+
+          if use_pk_partitioning
+            # PK-based partitioning for efficient range queries
+            ranges = Tapsoob::DataStream::Keyed.calculate_pk_ranges(db, table_name, num_workers)
+            log.debug "Table #{table_name}: using PK-based partitioning with #{ranges.size} ranges"
+          else
+            # Interleaved chunking for tables without integer PK
+            log.debug "Table #{table_name}: using interleaved chunking with #{num_workers} workers"
+          end
+
+          # Progress tracking - create ONE shared progress bar for the entire table
+          estimated_chunks = [(row_count.to_f / default_chunksize).ceil, 1].max
+          shared_progress = parent_progress ? parent_progress.create_bar(table_name.to_s, estimated_chunks) : nil
+
+          workers = (0...num_workers).map do |worker_id|
+            Thread.new do
+              # Create worker-specific stream
+              if use_pk_partitioning
+                min_pk, max_pk = ranges[worker_id]
+                stream = Tapsoob::DataStream::KeyedPartition.new(db, {
+                  :table_name => table_name,
+                  :chunksize => default_chunksize,
+                  :partition_range => [min_pk, max_pk]
+                }, { :debug => opts[:debug] })
+              else
+                stream = Tapsoob::DataStream::Interleaved.new(db, {
+                  :table_name => table_name,
+                  :chunksize => default_chunksize,
+                  :worker_id => worker_id,
+                  :num_workers => num_workers
+                }, { :debug => opts[:debug] })
+              end
+
+              # Process data chunks
+              loop do
+                break if exiting? || stream.complete?
+
+                begin
+                  encoded_data, row_size, elapsed_time = stream.fetch
+
+                  # Skip processing empty results
+                  if row_size.positive?
+                    data = {
+                      :state => stream.to_hash,
+                      :checksum => Tapsoob::Utils.checksum(encoded_data).to_s,
+                      :encoded_data => encoded_data
+                    }
+
+                    stream.fetch_data_from_database(data) do |rows|
+                      next if rows == {}
+
+                      # Thread-safe file write and progress tracking
+                      write_mutex.synchronize do
+                        if dump_path.nil?
+                          puts JSON.generate(rows)
+                        else
+                          Tapsoob::Utils.export_rows(dump_path, stream.table_name, rows)
+                        end
+
+                        # Track records for progress events
+                        if rows[:data]
+                          records_processed += rows[:data].size
+                          Tapsoob::ProgressEvent.table_progress(table_name, records_processed, row_count)
+                        end
+                      end
+
+                      shared_progress.inc(1) if shared_progress
+                    end
+                  end
+
+                  # Check completion AFTER processing data to avoid losing the last chunk
+                  break if stream.complete?
+
+                rescue Tapsoob::CorruptedData => e
+                  log.info "Worker #{worker_id}: Corrupted Data Received #{e.message}, retrying..."
+                  next
+                rescue StandardError => e
+                  log.error "Worker #{worker_id} error: #{e.message}"
+                  log.error e.backtrace.join("\n")
+                  raise
+                end
+              end
+            end
+          end
+
+          workers.each(&:join)
+          shared_progress.finish if shared_progress
+
+          add_completed_table(table_name)
+        ensure
+          # Always emit table_complete event, even if there was an error
+          Tapsoob::ProgressEvent.table_complete(table_name, records_processed)
+        end
+      end
+
+      def tables
+        h = {}
+        tables_info.each do |table_name, count|
+          next if completed_tables.include?(table_name.to_s)
+          h[table_name.to_s] = count
+        end
+        h
+      end
+
+      def record_count
+        tables_info.values.inject(:+)
+      end
+
+      def tables_info
+        opts[:tables_info] ||= fetch_tables_info
+      end
+
+      def fetch_tables_info
+        tables = db.send(:sort_dumped_tables, db.tables, {})
+
+        data = {}
+        apply_table_filter(tables).each do |table_name|
+          data[table_name] = db[table_name].count
+        end
+        data
+      end
+
+      def self.factory(db, state)
+        if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+          Sequel::MySQL.convert_invalid_date_time = :nil
+        end
+
+        if state.has_key?(:klass)
+          return eval(state[:klass]).new(db, state)
+        end
+
+        if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+          Tapsoob::DataStream::Keyed.new(db, state)
+        else
+          Tapsoob::DataStream::Base.new(db, state)
+        end
+      end
+
+      def pull_indexes
+        log.info "Receiving indexes"
+
+        raw_idxs = Tapsoob::Schema.indexes_individual(database_url)
+        idxs = (raw_idxs && raw_idxs.length >= 2 ? JSON.parse(raw_idxs) : {})
+
+        # Calculate max title width for consistent alignment
+        filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
+        Tapsoob::ProgressEvent.indexes_start(filtered_idxs.size)
+        max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
+
+        filtered_idxs.each do |table, indexes|
+          progress = opts[:progress] ? Tapsoob::Progress::Bar.new("#{table} indexes", indexes.size, STDOUT, max_title_width) : nil
+          indexes.each do |idx|
+            output = Tapsoob::Utils.export_indexes(dump_path, table, idx)
+            puts output if dump_path.nil? && output
+            progress.inc(1) if progress
+          end
+          progress.finish if progress
+        end
+        Tapsoob::ProgressEvent.indexes_complete(filtered_idxs.size)
+      end
+
+      def pull_reset_sequences
+        log.info "Resetting sequences"
+        Tapsoob::ProgressEvent.sequences_start
+
+        output = Tapsoob::Utils.schema_bin(:reset_db_sequences, database_url)
+        puts output if dump_path.nil? && output
+
+        Tapsoob::ProgressEvent.sequences_complete
+      end
+    end
+  end
+end
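A quick worked example (not part of the diff) of the auto-scaling that both `pull_data_serial` and `pull_data_parallel` delegate to via `table_parallel_workers` in the base.rb hunk above; the 16-core host is hypothetical, while the thresholds are the ones in the code.

    # Assuming Etc.nprocessors == 16 and a dump_path is set (illustration only):
    #   max_workers = [16 / 2, 8, 2].max     # => 8
    #   >= 5_000_000 rows -> 8 workers
    #   2_000_000 rows    -> [8 / 2, 2].max  # => 4 workers
    #   600_000 rows      -> [8 / 4, 2].max  # => 2 workers
    #   150_000 rows      -> 2 workers (above the 100_000 threshold)
    #   50_000 rows       -> 1 worker (below threshold; likewise when piping, i.e. dump_path is nil)
    # Tables with a single integer primary key are then split into that many PK
    # ranges via Keyed.calculate_pk_ranges; all other tables fall back to Interleaved streams.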