tapsoob 0.6.0-java → 0.6.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -1
- data/lib/tapsoob/cli/data_stream.rb +45 -20
- data/lib/tapsoob/version.rb +1 -1
- data/lib/tasks/tapsoob.rake +16 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4496a325b1c666f3184aac7b24f0344c7c895bf3481c04faa0326bab6f8ac9c0
+  data.tar.gz: 4794ef68add8faa325f7eb3212e40213e43a946b3947342aafa49290b2815aa3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9da5a4494551b52576443de7f48ec8f2e27df98422c10a8d1e222ba6182d4ffab59fbd6a3bfd3b7f42fc64ded0f02c6e51056168e0bb9214e52e874c7765bdc5
+  data.tar.gz: f72add9544b1ec901579f8a661e6157e27f2ea905854c0c4addc074debe736498963d0500d5a73dce986a18b4a7391677c0e9d9560556ca5ba6c551108f006b1
data/README.md
CHANGED
@@ -36,7 +36,7 @@ You can list all available options using the command:
     tapsoob push -h


-##
+## Piping your schema/indexes/data

 Due to some needs we added ways to pipe your schema/indexes/data directly from one database to another, here's an equivalent of the export/import process described above using this technique :

@@ -60,6 +60,16 @@ If you're using Rails, there's also two Rake tasks provided:
 * `tapsoob:pull` which dumps the database into a new folder under the `db` folder
 * `tapsoob:push` which reads the last dump you made from `tapsoob:pull` from the `db` folder

+## NEW: Full parallelization support from 0.6.1 onwards
+
+You can now dump/load a full database (or just its data) in parallel to speed up the process, at the cost of extra memory usage and database load:
+
+```
+tapsoob pull [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
+tapsoob push [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
+```
+
+It defaults to a single thread, as before 0.6.1. The option also applies to `tapsoob data pull/push`, but only when dumping to files: you can't parallelize and pipe at the same time, so for safety it falls back to a single thread with a warning if you try.

 ## Notes
data/lib/tapsoob/cli/data_stream.rb
CHANGED

@@ -14,10 +14,20 @@ module Tapsoob
     option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
     option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
     option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
+    option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
     option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
     option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
     def pull(database_url, dump_path = nil)
-
+      opts = parse_opts(options)
+
+      # Force serial mode when outputting to STDOUT (for piping)
+      # Parallel mode would interleave output and corrupt the JSON stream
+      if dump_path.nil? && opts[:parallel] && opts[:parallel] > 1
+        STDERR.puts "Warning: Parallel mode disabled when outputting to STDOUT (for piping)"
+        opts[:parallel] = 1
+      end
+
+      op = Tapsoob::Operation.factory(:pull, database_url, dump_path, opts)
       op.pull_data
     end
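The hunk above only covers the CLI side of `--parallel`; the actual fan-out lives in `Tapsoob::Operation`, which is not part of this diff. As a rough, self-contained sketch of the general pattern such a table-level worker pool follows (illustrative names only, not the gem's implementation), each worker takes table names off a shared queue and writes to its own file, which is also why the STDOUT/piping path above has to stay single-threaded:

```ruby
require "json"

# Illustrative sketch only (not Tapsoob's Operation code): fan table dumps out
# to a fixed number of worker threads, the way a --parallel/-j option might.
def dump_tables(tables, workers: 1, dir: ".")
  queue = Queue.new
  tables.each { |t| queue << t }

  Array.new(workers) do
    Thread.new do
      loop do
        table =
          begin
            queue.pop(true) # non-blocking pop; raises ThreadError when empty
          rescue ThreadError
            break
          end
        # Each worker writes to its own file; sharing STDOUT between workers
        # would interleave lines and corrupt a piped JSON stream.
        File.write(File.join(dir, "#{table}.json"), JSON.generate({ table_name: table }))
      end
    end
  end.each(&:join)
end

dump_tables(%w[users orders items], workers: 4)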
@@ -25,36 +35,48 @@ module Tapsoob
     option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
     option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
     option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
+    option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
     option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
     option :purge, desc: "Purge data in tables prior to performing the import", default: false, type: :boolean
     option :"skip-duplicates", desc: "Remove duplicates when loading data", default: false, type: :boolean
     option :"discard-identity", desc: "Remove identity when pushing data (may result in creating duplicates)", default: false, type: :boolean
     option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
     def push(database_url, dump_path = nil)
-      # instantiate stuff
-      data = []
       opts = parse_opts(options)

-      #
+      # If dump_path is provided, use the Operation class for proper parallel support
       if dump_path && Dir.exist?(dump_path)
-
-
+        op = Tapsoob::Operation.factory(:push, database_url, dump_path, opts)
+        op.push_data
       else
+        # STDIN mode: read and import data directly (no parallel support for STDIN)
+        if opts[:parallel] && opts[:parallel] > 1
+          STDERR.puts "Warning: Parallel mode not supported when reading from STDIN"
+        end
+
+        data = []
         STDIN.each_line { |line| data << JSON.parse(line, symbolize_names: true) }
-      end

-
-
-
-
-
-
-
-
-
-
-
-
+        # import data
+        data.each do |table|
+          table_name = table[:table_name]
+
+          # Truncate table if purge option is enabled
+          if opts[:purge]
+            db(database_url, opts)[table_name.to_sym].truncate
+          end
+
+          stream = Tapsoob::DataStream.factory(db(database_url, opts), {
+            table_name: table_name,
+            chunksize: opts[:default_chunksize]
+          }, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
+
+          begin
+            stream.import_rows(table)
+          rescue Exception => e
+            stream.log.debug e.message
+            STDERR.puts "Error loading data in #{table_name} : #{e.message}"
+          end
         end
       end
     end
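The `else` branch above is the piping path: `push` reads newline-delimited JSON from STDIN, one document per table, and the only key the CLI itself relies on is `:table_name`. A self-contained sketch of that parsing step (the extra fields in the sample payload are made up, not the gem's exact stream format):

```ruby
require "json"
require "stringio"

# Stand-in for STDIN: two newline-delimited JSON documents, one per table.
# Field names other than table_name are illustrative only.
input = StringIO.new(<<~NDJSON)
  {"table_name":"users","header":["id","name"],"data":[[1,"Ada"],[2,"Grace"]]}
  {"table_name":"orders","header":["id","total"],"data":[[1,9.99]]}
NDJSON

data = []
input.each_line { |line| data << JSON.parse(line, symbolize_names: true) }

data.each do |table|
  puts "#{table[:table_name]}: #{table[:data].length} row(s)"
end
```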
@@ -65,6 +87,7 @@ module Tapsoob
       opts = {
         progress: options[:progress],
         tables: options[:tables],
+        parallel: options[:parallel],
         debug: options[:debug]
       }
@@ -85,7 +108,9 @@ module Tapsoob
     end

     def db(database_url, opts = {})
-
+      # Support connection pooling for parallel operations
+      parallel_workers = opts[:parallel] || 1
+      @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
       @db.loggers << Tapsoob.log if opts[:debug]

       # Set parameters
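Sizing `max_connections` at twice the worker count gives each thread its own connection with some headroom, since Sequel's default threaded connection pool checks a connection out per thread on demand. A minimal sketch of that behaviour (requires the `sequel` and `sqlite3` gems; the table and worker count are made-up examples, not anything from the gem):

```ruby
require "sequel"
require "tmpdir"

workers = 4

Dir.mktmpdir do |dir|
  # Pool sized the same way the diff does it: two connections per worker.
  db = Sequel.sqlite(File.join(dir, "demo.db"), max_connections: workers * 2)
  db.create_table(:items) do
    primary_key :id
    Integer :worker
  end

  # Each thread checks its own connection out of the threaded pool.
  Array.new(workers) { |i| Thread.new { db[:items].insert(worker: i) } }.each(&:join)

  puts db[:items].count # => 4
end
```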
data/lib/tapsoob/version.rb
CHANGED
data/lib/tasks/tapsoob.rake
CHANGED
@@ -1,8 +1,14 @@
 namespace :tapsoob do
-  desc "Pulls a database to your filesystem"
+  desc "Pulls a database to your filesystem (PARALLEL=4 for 4 workers)"
   task :pull => :environment do
     # Default options
-    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
+    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
+
+    # Allow overriding parallel workers via PARALLEL environment variable
+    if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
+      opts[:parallel] = ENV['PARALLEL'].to_i
+      puts "Using #{opts[:parallel]} parallel workers"
+    end

     # Get the dump_path
     dump_path = File.expand_path(Rails.root.join("db", Time.now.strftime("%Y%m%d%I%M%S%p"))).to_s
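With the env-var check above, the worker count is chosen at invocation time, e.g. `PARALLEL=4 bundle exec rake tapsoob:pull` from a shell. A sketch of the programmatic equivalent, assuming a loaded Rails application (for example in a console or script):

```ruby
# Sketch: run the pull task from Ruby with four workers. Assumes a Rails app
# whose rake tasks have been loaded.
require "rake"
Rails.application.load_tasks

ENV["PARALLEL"] = "4" # read by the task's env-var check above
Rake::Task["tapsoob:pull"].invoke
```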
@@ -20,10 +26,16 @@ namespace :tapsoob do
     Rake::Task["tapsoob:clean"].invoke
   end

-  desc "Push a compatible dump on your filesystem to a database"
+  desc "Push a compatible dump on your filesystem to a database (PARALLEL=4 for 4 workers)"
   task :push, [:timestamp] => :environment do |t, args|
     # Default options
-    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
+    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
+
+    # Allow overriding parallel workers via PARALLEL environment variable
+    if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
+      opts[:parallel] = ENV['PARALLEL'].to_i
+      puts "Using #{opts[:parallel]} parallel workers"
+    end

     # Get the dumps
     dumps = Dir[Rails.root.join("db", "*/")].select { |e| e =~ /([0-9]{14})([A-Z]{2})/ }.sort
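The push task also takes an optional `[:timestamp]` argument, presumably to select a specific dump folder; folder names follow the `%Y%m%d%I%M%S%p` pattern used by `tapsoob:pull`. A sketch of pushing one particular dump with four workers (the timestamp value is made up, and the same Rails/Rake setup as above is assumed):

```ruby
# Sketch: push one specific dump folder by its timestamp argument.
ENV["PARALLEL"] = "4"
Rake::Task["tapsoob:push"].invoke("20240101063000AM")
```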