tapsoob 0.5.31 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -1
- data/lib/tapsoob/cli/data_stream.rb +45 -20
- data/lib/tapsoob/cli/root.rb +3 -0
- data/lib/tapsoob/multi_progress_bar.rb +234 -0
- data/lib/tapsoob/operation.rb +126 -9
- data/lib/tapsoob/progress_bar.rb +3 -2
- data/lib/tapsoob/version.rb +1 -1
- data/lib/tasks/tapsoob.rake +16 -4
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d509bbdc385a715e9784dc63e823d50d11cb179f3cf4c78fa229643550b6bd84
|
|
4
|
+
data.tar.gz: c6d37e2166cbf5be914945cec18e166f5a2ba8d51f7ea71ef85088def0499256
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ed60501ee636716fd3f34e3d6291725ab418c0a18bc5aba52cfd736ab937bb83b61e3070a140de49650966c2244aeb0d195c13f4b6f4e1ea48f60db8e8572f69
|
|
7
|
+
data.tar.gz: e117b807f9a425034928c758ea8bd1937702cacaf9d26a88124f4da02fd1587f5970be9e748f4ddff509ad030304ad15722d904d7cc38925661f9526a6c3227a
|
data/README.md
CHANGED
|
@@ -36,7 +36,7 @@ You can list all available options using the command:
|
|
|
36
36
|
tapsoob push -h
|
|
37
37
|
|
|
38
38
|
|
|
39
|
-
##
|
|
39
|
+
## Piping your schema/indexes/data
|
|
40
40
|
|
|
41
41
|
To cover additional use cases, we added ways to pipe your schema/indexes/data directly from one database to another. Here is an equivalent of the export/import process described above using this technique:
|
|
42
42
|
|
|
@@ -60,6 +60,16 @@ If you're using Rails, there's also two Rake tasks provided:
|
|
|
60
60
|
* `tapsoob:pull` which dumps the database into a new folder under the `db` folder
|
|
61
61
|
* `tapsoob:push` which reads the last dump you made from `tapsoob:pull` from the `db` folder
|
|
62
62
|
|
|
63
|
+
## NEW : Full parallelization support from 0.6.1 onwards
|
|
64
|
+
|
|
65
|
+
You can now dump/load a full database (or only its data) in parallel to speed up the process, at the cost of higher memory usage and database load, like so:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
tapsoob pull [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
|
|
69
|
+
tapsoob push [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
It defaults to a single thread, as in versions before 0.6.1. It is also applicable to `tapsoob data pull/push`, but only when dumping to files: parallelization cannot be combined with piping, so for safety it falls back to a single thread with a warning if you try to do both.
|
|
63
73
|
|
|
64
74
|
## Notes
|
|
65
75
|
|
|
@@ -14,10 +14,20 @@ module Tapsoob
|
|
|
14
14
|
option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
|
|
15
15
|
option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
|
|
16
16
|
option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
|
|
17
|
+
option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
|
|
17
18
|
option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
|
|
18
19
|
option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
|
|
19
20
|
def pull(database_url, dump_path = nil)
|
|
20
|
-
|
|
21
|
+
opts = parse_opts(options)
|
|
22
|
+
|
|
23
|
+
# Force serial mode when outputting to STDOUT (for piping)
|
|
24
|
+
# Parallel mode would interleave output and corrupt the JSON stream
|
|
25
|
+
if dump_path.nil? && opts[:parallel] && opts[:parallel] > 1
|
|
26
|
+
STDERR.puts "Warning: Parallel mode disabled when outputting to STDOUT (for piping)"
|
|
27
|
+
opts[:parallel] = 1
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
op = Tapsoob::Operation.factory(:pull, database_url, dump_path, opts)
|
|
21
31
|
op.pull_data
|
|
22
32
|
end
|
|
23
33
|
|
|
@@ -25,36 +35,48 @@ module Tapsoob
|
|
|
25
35
|
option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
|
|
26
36
|
option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
|
|
27
37
|
option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
|
|
38
|
+
option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
|
|
28
39
|
option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
|
|
29
40
|
option :purge, desc: "Purge data in tables prior to performing the import", default: false, type: :boolean
|
|
30
41
|
option :"skip-duplicates", desc: "Remove duplicates when loading data", default: false, type: :boolean
|
|
31
42
|
option :"discard-identity", desc: "Remove identity when pushing data (may result in creating duplicates)", default: false, type: :boolean
|
|
32
43
|
option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
|
|
33
44
|
def push(database_url, dump_path = nil)
|
|
34
|
-
# instantiate stuff
|
|
35
|
-
data = []
|
|
36
45
|
opts = parse_opts(options)
|
|
37
46
|
|
|
38
|
-
#
|
|
47
|
+
# If dump_path is provided, use the Operation class for proper parallel support
|
|
39
48
|
if dump_path && Dir.exist?(dump_path)
|
|
40
|
-
|
|
41
|
-
|
|
49
|
+
op = Tapsoob::Operation.factory(:push, database_url, dump_path, opts)
|
|
50
|
+
op.push_data
|
|
42
51
|
else
|
|
52
|
+
# STDIN mode: read and import data directly (no parallel support for STDIN)
|
|
53
|
+
if opts[:parallel] && opts[:parallel] > 1
|
|
54
|
+
STDERR.puts "Warning: Parallel mode not supported when reading from STDIN"
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
data = []
|
|
43
58
|
STDIN.each_line { |line| data << JSON.parse(line, symbolize_names: true) }
|
|
44
|
-
end
|
|
45
59
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
60
|
+
# import data
|
|
61
|
+
data.each do |table|
|
|
62
|
+
table_name = table[:table_name]
|
|
63
|
+
|
|
64
|
+
# Truncate table if purge option is enabled
|
|
65
|
+
if opts[:purge]
|
|
66
|
+
db(database_url, opts)[table_name.to_sym].truncate
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
stream = Tapsoob::DataStream.factory(db(database_url, opts), {
|
|
70
|
+
table_name: table_name,
|
|
71
|
+
chunksize: opts[:default_chunksize]
|
|
72
|
+
}, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
|
|
73
|
+
|
|
74
|
+
begin
|
|
75
|
+
stream.import_rows(table)
|
|
76
|
+
rescue Exception => e
|
|
77
|
+
stream.log.debug e.message
|
|
78
|
+
STDERR.puts "Error loading data in #{table_name} : #{e.message}"
|
|
79
|
+
end
|
|
58
80
|
end
|
|
59
81
|
end
|
|
60
82
|
end
|
|
@@ -65,6 +87,7 @@ module Tapsoob
|
|
|
65
87
|
opts = {
|
|
66
88
|
progress: options[:progress],
|
|
67
89
|
tables: options[:tables],
|
|
90
|
+
parallel: options[:parallel],
|
|
68
91
|
debug: options[:debug]
|
|
69
92
|
}
|
|
70
93
|
|
|
@@ -85,7 +108,9 @@ module Tapsoob
|
|
|
85
108
|
end
|
|
86
109
|
|
|
87
110
|
def db(database_url, opts = {})
|
|
88
|
-
|
|
111
|
+
# Support connection pooling for parallel operations
|
|
112
|
+
parallel_workers = opts[:parallel] || 1
|
|
113
|
+
@db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
|
|
89
114
|
@db.loggers << Tapsoob.log if opts[:debug]
|
|
90
115
|
|
|
91
116
|
# Set parameters
|
data/lib/tapsoob/cli/root.rb
CHANGED
|
@@ -22,6 +22,7 @@ module Tapsoob
|
|
|
22
22
|
option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
|
|
23
23
|
option :"indexes", type: :boolean, default: false
|
|
24
24
|
option :"same-db", type: :boolean, default: false
|
|
25
|
+
option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
|
|
25
26
|
option :progress, desc: "Show progress", default: true, type: :boolean
|
|
26
27
|
option :debug, desc: "Enable debug messages", default: false, type: :boolean
|
|
27
28
|
def pull(dump_path, database_url)
|
|
@@ -47,6 +48,7 @@ module Tapsoob
|
|
|
47
48
|
option :purge, desc: "Purge data in tables prior to performing the import", default: false, type: :boolean, aliases: "-p"
|
|
48
49
|
option :"skip-duplicates", desc: "Remove duplicates when loading data", default: false, type: :boolean
|
|
49
50
|
option :"discard-identity", desc: "Remove identity when pushing data (may result in creating duplicates)", default: false, type: :boolean
|
|
51
|
+
option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
|
|
50
52
|
option :progress, desc: "Show progress", default: true, type: :boolean
|
|
51
53
|
option :debug, desc: "Enable debug messages", default: false, type: :boolean
|
|
52
54
|
def push(dump_path, database_url)
|
|
@@ -82,6 +84,7 @@ module Tapsoob
|
|
|
82
84
|
indexes_first: options[:"indexes_first"],
|
|
83
85
|
disable_compression: options[:"disable-compression"],
|
|
84
86
|
tables: options[:tables],
|
|
87
|
+
parallel: options[:parallel],
|
|
85
88
|
progress: options[:progress],
|
|
86
89
|
debug: options[:debug]
|
|
87
90
|
})
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
|
2
|
+
require 'tapsoob/progress_bar'
|
|
3
|
+
|
|
4
|
+
# MultiProgressBar manages multiple progress bars in parallel
|
|
5
|
+
# Each bar gets its own line in the terminal
|
|
6
|
+
class MultiProgressBar
|
|
7
|
+
def initialize(max_bars = 4)
|
|
8
|
+
@max_bars = max_bars
|
|
9
|
+
@bars = []
|
|
10
|
+
@mutex = Mutex.new
|
|
11
|
+
@active = true
|
|
12
|
+
@out = STDOUT
|
|
13
|
+
@last_update = Time.now
|
|
14
|
+
@reserved_lines = 0 # Track how many lines we've actually reserved
|
|
15
|
+
@max_title_width = 14 # Minimum width, will grow with longer titles
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Create a new progress bar and return it
|
|
19
|
+
def create_bar(title, total)
|
|
20
|
+
@mutex.synchronize do
|
|
21
|
+
# Remove any existing bar with the same title to prevent duplicates
|
|
22
|
+
@bars.reject! { |b| b.title == title }
|
|
23
|
+
|
|
24
|
+
# Update max title width to accommodate longer titles
|
|
25
|
+
@max_title_width = [@max_title_width, title.length].max
|
|
26
|
+
|
|
27
|
+
bar = ThreadSafeProgressBar.new(title, total, self)
|
|
28
|
+
|
|
29
|
+
# Reserve a line for this new bar during active updates
|
|
30
|
+
# Cap at 2 * max_bars to show active workers + some recent finished bars
|
|
31
|
+
if @reserved_lines < @max_bars * 2
|
|
32
|
+
@out.print "\n"
|
|
33
|
+
@out.flush
|
|
34
|
+
@reserved_lines += 1
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
@bars << bar
|
|
38
|
+
bar
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
# Get the current maximum title width for alignment
|
|
43
|
+
# Note: Always called from within synchronized methods, so no mutex needed
|
|
44
|
+
def max_title_width
|
|
45
|
+
@max_title_width
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Called by individual bars when they update
|
|
49
|
+
def update
|
|
50
|
+
@mutex.synchronize do
|
|
51
|
+
return unless @active
|
|
52
|
+
return unless should_redraw?
|
|
53
|
+
|
|
54
|
+
@last_update = Time.now
|
|
55
|
+
redraw_all
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Finish a specific bar - mark it as completed
|
|
60
|
+
def finish_bar(bar)
|
|
61
|
+
@mutex.synchronize do
|
|
62
|
+
return unless @active
|
|
63
|
+
|
|
64
|
+
bar.mark_finished
|
|
65
|
+
|
|
66
|
+
# Respect throttle when finishing to avoid spamming redraws
|
|
67
|
+
if should_redraw?
|
|
68
|
+
@last_update = Time.now
|
|
69
|
+
redraw_all
|
|
70
|
+
end
|
|
71
|
+
# If throttled, the next regular update will show the finished state
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Stop all progress bars and keep them visible
|
|
76
|
+
def stop
|
|
77
|
+
@mutex.synchronize do
|
|
78
|
+
@active = false
|
|
79
|
+
|
|
80
|
+
# Final cleanup: remove any duplicate titles (keep the last occurrence of each unique title)
|
|
81
|
+
@bars = @bars.reverse.uniq { |bar| bar.title }.reverse
|
|
82
|
+
|
|
83
|
+
# Final redraw to show completed state (skip active check)
|
|
84
|
+
redraw_all(true)
|
|
85
|
+
# Move cursor past all bars
|
|
86
|
+
@out.print "\n"
|
|
87
|
+
@out.flush
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
private
|
|
92
|
+
|
|
93
|
+
# Check if enough time has passed to redraw (throttle to 10 updates/sec)
|
|
94
|
+
def should_redraw?
|
|
95
|
+
Time.now - @last_update >= 0.1
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def redraw_all(force = false)
|
|
99
|
+
return unless force || @active
|
|
100
|
+
return if @bars.empty?
|
|
101
|
+
|
|
102
|
+
if force && !@active
|
|
103
|
+
render_final_display
|
|
104
|
+
else
|
|
105
|
+
render_active_display
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# Final display: show all completed bars
|
|
110
|
+
def render_final_display
|
|
111
|
+
# Clear the reserved lines first
|
|
112
|
+
if @reserved_lines > 0
|
|
113
|
+
@out.print "\r\e[#{@reserved_lines}A"
|
|
114
|
+
@reserved_lines.times { @out.print "\r\e[K\n" }
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
# Print all bars (adds new lines as needed)
|
|
118
|
+
@bars.each do |bar|
|
|
119
|
+
@out.print "\r\e[K"
|
|
120
|
+
bar.render_to(@out)
|
|
121
|
+
@out.print "\n"
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
@out.flush
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
# Normal operation: show active bars + recent finished in reserved space
|
|
128
|
+
def render_active_display
|
|
129
|
+
return if @reserved_lines == 0
|
|
130
|
+
|
|
131
|
+
# Partition bars in a single pass for efficiency
|
|
132
|
+
active_bars, finished_bars = @bars.partition { |b| !b.finished? }
|
|
133
|
+
|
|
134
|
+
# Build display: active bars first, then recent finished to fill remaining space
|
|
135
|
+
# Ensure we don't request negative count from .last()
|
|
136
|
+
remaining_space = [@reserved_lines - active_bars.length, 0].max
|
|
137
|
+
bars_to_draw = active_bars + finished_bars.last(remaining_space)
|
|
138
|
+
|
|
139
|
+
# If we have more bars than reserved lines, show only the most recent
|
|
140
|
+
bars_to_draw = bars_to_draw.last(@reserved_lines) if bars_to_draw.length > @reserved_lines
|
|
141
|
+
|
|
142
|
+
# Move up and redraw in reserved space
|
|
143
|
+
@out.print "\r\e[#{@reserved_lines}A"
|
|
144
|
+
@reserved_lines.times do |i|
|
|
145
|
+
@out.print "\r\e[K"
|
|
146
|
+
bars_to_draw[i].render_to(@out) if i < bars_to_draw.length
|
|
147
|
+
@out.print "\n"
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
@out.flush
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
# Thread-safe progress bar that reports to a MultiProgressBar
|
|
155
|
+
class ThreadSafeProgressBar < ProgressBar
|
|
156
|
+
attr_reader :title
|
|
157
|
+
|
|
158
|
+
def initialize(title, total, multi_progress_bar)
|
|
159
|
+
@multi_progress_bar = multi_progress_bar
|
|
160
|
+
@out = STDOUT # Need this for get_width to work
|
|
161
|
+
# Don't call parent initialize, we'll manage output ourselves
|
|
162
|
+
@title = title
|
|
163
|
+
@total = total
|
|
164
|
+
@terminal_width = 80
|
|
165
|
+
@bar_mark = "="
|
|
166
|
+
@current = 0
|
|
167
|
+
@previous = 0
|
|
168
|
+
@finished_p = false
|
|
169
|
+
@start_time = ::Time.now
|
|
170
|
+
@previous_time = @start_time
|
|
171
|
+
@format_arguments = [:title, :percentage, :bar, :stat]
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
# Override show to notify multi-progress instead of direct output
|
|
175
|
+
def show
|
|
176
|
+
@previous_time = ::Time.now # Update to prevent time-based refresh spam
|
|
177
|
+
@multi_progress_bar.update
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
# Render this bar to the given output stream
|
|
181
|
+
def render_to(out)
|
|
182
|
+
# Get dynamic title width from MultiProgressBar for consistent alignment
|
|
183
|
+
# Store as instance variable so parent class fmt_* methods can use it
|
|
184
|
+
@title_width = @multi_progress_bar.max_title_width
|
|
185
|
+
|
|
186
|
+
# Recalculate terminal width to handle resizes and use full width
|
|
187
|
+
width = get_width
|
|
188
|
+
# Calculate bar width: total_width - fixed_elements - padding
|
|
189
|
+
# Fixed: title(variable) + " "(1) + percentage(4) + " "(1) + "|"(1) + "|"(1) + " "(1) + timer(15) = title_width + 25
|
|
190
|
+
# Padding: +3 for timer fluctuations and safety
|
|
191
|
+
fixed_chars = @title_width + 28
|
|
192
|
+
@terminal_width = [width - fixed_chars, 20].max
|
|
193
|
+
|
|
194
|
+
# Build format string with dynamic title width
|
|
195
|
+
format = "%-#{@title_width}s %3d%% %s %s"
|
|
196
|
+
arguments = @format_arguments.map { |method| send("fmt_#{method}") }
|
|
197
|
+
line = sprintf(format, *arguments)
|
|
198
|
+
|
|
199
|
+
# Ensure line doesn't exceed terminal width to prevent wrapping
|
|
200
|
+
# Leave 2 chars margin for safety
|
|
201
|
+
line = line[0, width - 2] if line.length > width - 2
|
|
202
|
+
|
|
203
|
+
out.print(line)
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
# Override clear to do nothing (managed by MultiProgressBar)
|
|
207
|
+
def clear
|
|
208
|
+
# no-op
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
# Mark this bar as finished (for tracking)
|
|
212
|
+
def mark_finished
|
|
213
|
+
@finished_p = true
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
# Override to use the same @finished_p flag
|
|
217
|
+
def finished?
|
|
218
|
+
@finished_p
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
# Override finish to notify multi-progress
|
|
222
|
+
def finish
|
|
223
|
+
@current = @total
|
|
224
|
+
@multi_progress_bar.finish_bar(self)
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
# Override inc to check if we need to update
|
|
228
|
+
def inc(step = 1)
|
|
229
|
+
@current += step
|
|
230
|
+
@current = @total if @current > @total
|
|
231
|
+
show_if_needed
|
|
232
|
+
@previous = @current
|
|
233
|
+
end
|
|
234
|
+
end
|
data/lib/tapsoob/operation.rb
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
|
2
2
|
require 'sequel'
|
|
3
|
+
require 'thread'
|
|
3
4
|
|
|
4
5
|
require 'tapsoob/data_stream'
|
|
5
6
|
require 'tapsoob/log'
|
|
6
7
|
require 'tapsoob/progress_bar'
|
|
8
|
+
require 'tapsoob/multi_progress_bar'
|
|
7
9
|
require 'tapsoob/schema'
|
|
8
10
|
|
|
9
11
|
module Tapsoob
|
|
@@ -117,7 +119,7 @@ module Tapsoob
|
|
|
117
119
|
end
|
|
118
120
|
|
|
119
121
|
def db
|
|
120
|
-
@db ||= Sequel.connect(database_url)
|
|
122
|
+
@db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
|
|
121
123
|
@db.extension :schema_dumper
|
|
122
124
|
@db.loggers << Tapsoob.log if opts[:debug]
|
|
123
125
|
|
|
@@ -130,6 +132,24 @@ module Tapsoob
|
|
|
130
132
|
@db
|
|
131
133
|
end
|
|
132
134
|
|
|
135
|
+
def parallel?
|
|
136
|
+
parallel_workers > 1
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def parallel_workers
|
|
140
|
+
@parallel_workers ||= [opts[:parallel].to_i, 1].max
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
def completed_tables_mutex
|
|
144
|
+
@completed_tables_mutex ||= Mutex.new
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def add_completed_table(table_name)
|
|
148
|
+
completed_tables_mutex.synchronize do
|
|
149
|
+
completed_tables << table_name.to_s
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
|
|
133
153
|
def format_number(num)
|
|
134
154
|
num.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")
|
|
135
155
|
end
|
|
@@ -198,6 +218,14 @@ module Tapsoob
|
|
|
198
218
|
|
|
199
219
|
log.info "#{tables.size} tables, #{format_number(record_count)} records"
|
|
200
220
|
|
|
221
|
+
if parallel?
|
|
222
|
+
pull_data_parallel
|
|
223
|
+
else
|
|
224
|
+
pull_data_serial
|
|
225
|
+
end
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
def pull_data_serial
|
|
201
229
|
tables.each do |table_name, count|
|
|
202
230
|
stream = Tapsoob::DataStream.factory(db, {
|
|
203
231
|
:chunksize => default_chunksize,
|
|
@@ -209,6 +237,38 @@ module Tapsoob
|
|
|
209
237
|
end
|
|
210
238
|
end
|
|
211
239
|
|
|
240
|
+
def pull_data_parallel
|
|
241
|
+
log.info "Using #{parallel_workers} parallel workers"
|
|
242
|
+
|
|
243
|
+
multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
|
|
244
|
+
table_queue = Queue.new
|
|
245
|
+
tables.each { |table_name, count| table_queue << [table_name, count] }
|
|
246
|
+
|
|
247
|
+
workers = (1..parallel_workers).map do
|
|
248
|
+
Thread.new do
|
|
249
|
+
loop do
|
|
250
|
+
break if table_queue.empty?
|
|
251
|
+
|
|
252
|
+
table_name, count = table_queue.pop(true) rescue break
|
|
253
|
+
|
|
254
|
+
# Each thread gets its own connection from the pool
|
|
255
|
+
stream = Tapsoob::DataStream.factory(db, {
|
|
256
|
+
:chunksize => default_chunksize,
|
|
257
|
+
:table_name => table_name
|
|
258
|
+
}, { :debug => opts[:debug] })
|
|
259
|
+
|
|
260
|
+
estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
|
|
261
|
+
progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
|
|
262
|
+
|
|
263
|
+
pull_data_from_table(stream, progress)
|
|
264
|
+
end
|
|
265
|
+
end
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
workers.each(&:join)
|
|
269
|
+
multi_progress.stop if multi_progress
|
|
270
|
+
end
|
|
271
|
+
|
|
212
272
|
def pull_partial_data
|
|
213
273
|
return if stream_state == {}
|
|
214
274
|
|
|
@@ -280,7 +340,7 @@ module Tapsoob
|
|
|
280
340
|
end
|
|
281
341
|
|
|
282
342
|
progress.finish if progress
|
|
283
|
-
|
|
343
|
+
add_completed_table(stream.table_name)
|
|
284
344
|
self.stream_state = {}
|
|
285
345
|
end
|
|
286
346
|
|
|
@@ -333,9 +393,12 @@ module Tapsoob
|
|
|
333
393
|
raw_idxs = Tapsoob::Schema.indexes_individual(database_url)
|
|
334
394
|
idxs = (raw_idxs && raw_idxs.length >= 2 ? JSON.parse(raw_idxs) : {})
|
|
335
395
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
396
|
+
# Calculate max title width for consistent alignment
|
|
397
|
+
filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
|
|
398
|
+
max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
|
|
399
|
+
|
|
400
|
+
filtered_idxs.each do |table, indexes|
|
|
401
|
+
progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
|
|
339
402
|
indexes.each do |idx|
|
|
340
403
|
output = Tapsoob::Utils.export_indexes(dump_path, table, idx)
|
|
341
404
|
puts output if dump_path.nil? && output
|
|
@@ -389,9 +452,12 @@ module Tapsoob
|
|
|
389
452
|
|
|
390
453
|
log.info "Sending indexes"
|
|
391
454
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
455
|
+
# Calculate max title width for consistent alignment
|
|
456
|
+
filtered_idxs = apply_table_filter(idxs).select { |table, indexes| indexes.size > 0 }
|
|
457
|
+
max_title_width = filtered_idxs.keys.map { |table| "#{table} indexes".length }.max || 14
|
|
458
|
+
|
|
459
|
+
filtered_idxs.each do |table, indexes|
|
|
460
|
+
progress = ProgressBar.new("#{table} indexes", indexes.size, STDOUT, max_title_width)
|
|
395
461
|
indexes.each do |idx|
|
|
396
462
|
Tapsoob::Utils.load_indexes(database_url, idx)
|
|
397
463
|
progress.inc(1)
|
|
@@ -437,6 +503,14 @@ module Tapsoob
|
|
|
437
503
|
|
|
438
504
|
log.info "#{tables.size} tables, #{format_number(record_count)} records"
|
|
439
505
|
|
|
506
|
+
if parallel?
|
|
507
|
+
push_data_parallel
|
|
508
|
+
else
|
|
509
|
+
push_data_serial
|
|
510
|
+
end
|
|
511
|
+
end
|
|
512
|
+
|
|
513
|
+
def push_data_serial
|
|
440
514
|
tables.each do |table_name, count|
|
|
441
515
|
# Skip if data file doesn't exist or has no data
|
|
442
516
|
data_file = File.join(dump_path, "data", "#{table_name}.json")
|
|
@@ -457,6 +531,49 @@ module Tapsoob
|
|
|
457
531
|
end
|
|
458
532
|
end
|
|
459
533
|
|
|
534
|
+
def push_data_parallel
|
|
535
|
+
log.info "Using #{parallel_workers} parallel workers"
|
|
536
|
+
|
|
537
|
+
multi_progress = opts[:progress] ? MultiProgressBar.new(parallel_workers) : nil
|
|
538
|
+
table_queue = Queue.new
|
|
539
|
+
|
|
540
|
+
tables.each do |table_name, count|
|
|
541
|
+
data_file = File.join(dump_path, "data", "#{table_name}.json")
|
|
542
|
+
next unless File.exist?(data_file) && count > 0
|
|
543
|
+
table_queue << [table_name, count]
|
|
544
|
+
end
|
|
545
|
+
|
|
546
|
+
workers = (1..parallel_workers).map do
|
|
547
|
+
Thread.new do
|
|
548
|
+
loop do
|
|
549
|
+
break if table_queue.empty?
|
|
550
|
+
|
|
551
|
+
table_name, count = table_queue.pop(true) rescue break
|
|
552
|
+
|
|
553
|
+
# Each thread gets its own connection from the pool
|
|
554
|
+
db[table_name.to_sym].truncate if @opts[:purge]
|
|
555
|
+
stream = Tapsoob::DataStream.factory(db, {
|
|
556
|
+
:table_name => table_name,
|
|
557
|
+
:chunksize => default_chunksize
|
|
558
|
+
}, {
|
|
559
|
+
:"skip-duplicates" => opts[:"skip-duplicates"] || false,
|
|
560
|
+
:"discard-identity" => opts[:"discard-identity"] || false,
|
|
561
|
+
:purge => opts[:purge] || false,
|
|
562
|
+
:debug => opts[:debug]
|
|
563
|
+
})
|
|
564
|
+
|
|
565
|
+
estimated_chunks = [(count.to_f / default_chunksize).ceil, 1].max
|
|
566
|
+
progress = multi_progress ? multi_progress.create_bar(table_name.to_s, estimated_chunks) : nil
|
|
567
|
+
|
|
568
|
+
push_data_from_file(stream, progress)
|
|
569
|
+
end
|
|
570
|
+
end
|
|
571
|
+
end
|
|
572
|
+
|
|
573
|
+
workers.each(&:join)
|
|
574
|
+
multi_progress.stop if multi_progress
|
|
575
|
+
end
|
|
576
|
+
|
|
460
577
|
def push_data_from_file(stream, progress)
|
|
461
578
|
loop do
|
|
462
579
|
if exiting?
|
|
@@ -510,7 +627,7 @@ module Tapsoob
|
|
|
510
627
|
end
|
|
511
628
|
|
|
512
629
|
progress.finish if progress
|
|
513
|
-
|
|
630
|
+
add_completed_table(stream.table_name)
|
|
514
631
|
self.stream_state = {}
|
|
515
632
|
end
|
|
516
633
|
|
data/lib/tapsoob/progress_bar.rb
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
class ProgressBar
|
|
14
14
|
VERSION = "0.9"
|
|
15
15
|
|
|
16
|
-
def initialize (title, total, out = STDOUT)
|
|
16
|
+
def initialize (title, total, out = STDOUT, title_width = nil)
|
|
17
17
|
@title = title
|
|
18
18
|
@total = total
|
|
19
19
|
@out = out
|
|
@@ -24,7 +24,8 @@ class ProgressBar
|
|
|
24
24
|
@finished_p = false
|
|
25
25
|
@start_time = ::Time.now
|
|
26
26
|
@previous_time = @start_time
|
|
27
|
-
|
|
27
|
+
# Set title width: use provided width, or accommodate the title, with a minimum of 14
|
|
28
|
+
@title_width = title_width || [title.length, 14].max
|
|
28
29
|
@format = "%-#{@title_width}s %3d%% %s %s"
|
|
29
30
|
@format_arguments = [:title, :percentage, :bar, :stat]
|
|
30
31
|
clear
|
data/lib/tapsoob/version.rb
CHANGED
data/lib/tasks/tapsoob.rake
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
namespace :tapsoob do
|
|
2
|
-
desc "Pulls a database to your filesystem"
|
|
2
|
+
desc "Pulls a database to your filesystem (PARALLEL=4 for 4 workers)"
|
|
3
3
|
task :pull => :environment do
|
|
4
4
|
# Default options
|
|
5
|
-
opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
|
|
5
|
+
opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
|
|
6
|
+
|
|
7
|
+
# Allow overriding parallel workers via PARALLEL environment variable
|
|
8
|
+
if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
|
|
9
|
+
opts[:parallel] = ENV['PARALLEL'].to_i
|
|
10
|
+
puts "Using #{opts[:parallel]} parallel workers"
|
|
11
|
+
end
|
|
6
12
|
|
|
7
13
|
# Get the dump_path
|
|
8
14
|
dump_path = File.expand_path(Rails.root.join("db", Time.now.strftime("%Y%m%d%I%M%S%p"))).to_s
|
|
@@ -20,10 +26,16 @@ namespace :tapsoob do
|
|
|
20
26
|
Rake::Task["tapsoob:clean"].invoke
|
|
21
27
|
end
|
|
22
28
|
|
|
23
|
-
desc "Push a compatible dump on your filesystem to a database"
|
|
29
|
+
desc "Push a compatible dump on your filesystem to a database (PARALLEL=4 for 4 workers)"
|
|
24
30
|
task :push, [:timestamp] => :environment do |t, args|
|
|
25
31
|
# Default options
|
|
26
|
-
opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
|
|
32
|
+
opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
|
|
33
|
+
|
|
34
|
+
# Allow overriding parallel workers via PARALLEL environment variable
|
|
35
|
+
if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
|
|
36
|
+
opts[:parallel] = ENV['PARALLEL'].to_i
|
|
37
|
+
puts "Using #{opts[:parallel]} parallel workers"
|
|
38
|
+
end
|
|
27
39
|
|
|
28
40
|
# Get the dumps
|
|
29
41
|
dumps = Dir[Rails.root.join("db", "*/")].select { |e| e =~ /([0-9]{14})([A-Z]{2})/ }.sort
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tapsoob
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Félix Bellanger
|
|
@@ -104,6 +104,7 @@ files:
|
|
|
104
104
|
- lib/tapsoob/data_stream.rb
|
|
105
105
|
- lib/tapsoob/errors.rb
|
|
106
106
|
- lib/tapsoob/log.rb
|
|
107
|
+
- lib/tapsoob/multi_progress_bar.rb
|
|
107
108
|
- lib/tapsoob/operation.rb
|
|
108
109
|
- lib/tapsoob/progress_bar.rb
|
|
109
110
|
- lib/tapsoob/railtie.rb
|