tapsoob 0.6.0-java → 0.6.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b2d30be0a3150a1592ece7ced339c7bf56c1de3f3b4c0fe9468b2370e8e6b855
4
- data.tar.gz: e7ceee5ebe56f0fa375bb76e5ab360604b9231ce8a5c1263fabe24c84abc15e5
3
+ metadata.gz: 4496a325b1c666f3184aac7b24f0344c7c895bf3481c04faa0326bab6f8ac9c0
4
+ data.tar.gz: 4794ef68add8faa325f7eb3212e40213e43a946b3947342aafa49290b2815aa3
5
5
  SHA512:
6
- metadata.gz: d19b9ad09c465356dcfa49af92ded16dcf496c9743f1a345a5ca3b6afb46570ae8c1d646400a98b504e3bc86ff8e59e083fb673b960da6b6b616ff04ca6d218f
7
- data.tar.gz: 15ce04aad9422147b81b2b51fee2e9d270ab824680bb777e8ea5014c9f11da1b83cfc426352a9db5cfb9b47fc0fb9ce355e2eee43ad68d3103c4d6e220d042dc
6
+ metadata.gz: 9da5a4494551b52576443de7f48ec8f2e27df98422c10a8d1e222ba6182d4ffab59fbd6a3bfd3b7f42fc64ded0f02c6e51056168e0bb9214e52e874c7765bdc5
7
+ data.tar.gz: f72add9544b1ec901579f8a661e6157e27f2ea905854c0c4addc074debe736498963d0500d5a73dce986a18b4a7391677c0e9d9560556ca5ba6c551108f006b1
data/README.md CHANGED
@@ -36,7 +36,7 @@ You can list all available options using the command:
36
36
  tapsoob push -h
37
37
 
38
38
 
39
- ## NEW : Piping your schema/indexes/data
39
+ ## Piping your schema/indexes/data
40
40
 
41
41
  Due to some needs we added ways to pipe your schema/indexes/data directly from one database to another, here's an equivalent of the export/import process described above using this technique :
42
42
 
@@ -60,6 +60,16 @@ If you're using Rails, there's also two Rake tasks provided:
60
60
  * `tapsoob:pull` which dumps the database into a new folder under the `db` folder
61
61
  * `tapsoob:push` which reads the last dump you made from `tapsoob:pull` from the `db` folder
62
62
 
63
+ ## NEW : Full parallelization support from 0.6.1 onwards
64
+
65
+ You can now dump/load a full database, or only its data, in parallel to speed up the process, at the cost of higher memory usage and database load, like so:
66
+
67
+ ```
68
+ tapsoob pull [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
69
+ tapsoob push [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
70
+ ```
71
+
72
+ It defaults to a single thread, as in versions before 0.6.1. It is also applicable to `tapsoob data pull/push`, but only when dumping to files: you can't combine parallelization with piping, because parallel output would interleave and corrupt the stream. If you try, it falls back to a single thread with a warning, for safety.
63
73
 
64
74
  ## Notes
65
75
 
@@ -14,10 +14,20 @@ module Tapsoob
14
14
  option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
15
15
  option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
16
16
  option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
17
+ option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
17
18
  option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
18
19
  option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
19
20
  def pull(database_url, dump_path = nil)
20
- op = Tapsoob::Operation.factory(:pull, database_url, dump_path, parse_opts(options))
21
+ opts = parse_opts(options)
22
+
23
+ # Force serial mode when outputting to STDOUT (for piping)
24
+ # Parallel mode would interleave output and corrupt the JSON stream
25
+ if dump_path.nil? && opts[:parallel] && opts[:parallel] > 1
26
+ STDERR.puts "Warning: Parallel mode disabled when outputting to STDOUT (for piping)"
27
+ opts[:parallel] = 1
28
+ end
29
+
30
+ op = Tapsoob::Operation.factory(:pull, database_url, dump_path, opts)
21
31
  op.pull_data
22
32
  end
23
33
 
@@ -25,36 +35,48 @@ module Tapsoob
25
35
  option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
26
36
  option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
27
37
  option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
38
+ option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
28
39
  option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
29
40
  option :purge, desc: "Purge data in tables prior to performing the import", default: false, type: :boolean
30
41
  option :"skip-duplicates", desc: "Remove duplicates when loading data", default: false, type: :boolean
31
42
  option :"discard-identity", desc: "Remove identity when pushing data (may result in creating duplicates)", default: false, type: :boolean
32
43
  option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
33
44
  def push(database_url, dump_path = nil)
34
- # instantiate stuff
35
- data = []
36
45
  opts = parse_opts(options)
37
46
 
38
- # read data from dump_path or from STDIN
47
+ # If dump_path is provided, use the Operation class for proper parallel support
39
48
  if dump_path && Dir.exist?(dump_path)
40
- files = Dir[Pathname.new(dump_path).join("*.json")]
41
- files.each { |file| data << JSON.parse(File.read(file), symbolize_names: true) }
49
+ op = Tapsoob::Operation.factory(:push, database_url, dump_path, opts)
50
+ op.push_data
42
51
  else
52
+ # STDIN mode: read and import data directly (no parallel support for STDIN)
53
+ if opts[:parallel] && opts[:parallel] > 1
54
+ STDERR.puts "Warning: Parallel mode not supported when reading from STDIN"
55
+ end
56
+
57
+ data = []
43
58
  STDIN.each_line { |line| data << JSON.parse(line, symbolize_names: true) }
44
- end
45
59
 
46
- # import data
47
- data.each do |table|
48
- stream = Tapsoob::DataStream.factory(db(database_url, opts), {
49
- table_name: table[:table_name],
50
- chunksize: opts[:default_chunksize]
51
- }, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
52
-
53
- begin
54
- stream.import_rows(table)
55
- rescue Exception => e
56
- stream.log.debug e.message
57
- STDERR.puts "Error loading data in #{table[:table_name]} : #{e.message}"
60
+ # import data
61
+ data.each do |table|
62
+ table_name = table[:table_name]
63
+
64
+ # Truncate table if purge option is enabled
65
+ if opts[:purge]
66
+ db(database_url, opts)[table_name.to_sym].truncate
67
+ end
68
+
69
+ stream = Tapsoob::DataStream.factory(db(database_url, opts), {
70
+ table_name: table_name,
71
+ chunksize: opts[:default_chunksize]
72
+ }, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
73
+
74
+ begin
75
+ stream.import_rows(table)
76
+ rescue Exception => e
77
+ stream.log.debug e.message
78
+ STDERR.puts "Error loading data in #{table_name} : #{e.message}"
79
+ end
58
80
  end
59
81
  end
60
82
  end
@@ -65,6 +87,7 @@ module Tapsoob
65
87
  opts = {
66
88
  progress: options[:progress],
67
89
  tables: options[:tables],
90
+ parallel: options[:parallel],
68
91
  debug: options[:debug]
69
92
  }
70
93
 
@@ -85,7 +108,9 @@ module Tapsoob
85
108
  end
86
109
 
87
110
  def db(database_url, opts = {})
88
- @db ||= Sequel.connect(database_url)
111
+ # Support connection pooling for parallel operations
112
+ parallel_workers = opts[:parallel] || 1
113
+ @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
89
114
  @db.loggers << Tapsoob.log if opts[:debug]
90
115
 
91
116
  # Set parameters
@@ -1,4 +1,4 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
  module Tapsoob
3
- VERSION = "0.6.0".freeze
3
+ VERSION = "0.6.1".freeze
4
4
  end
@@ -1,8 +1,14 @@
1
1
  namespace :tapsoob do
2
- desc "Pulls a database to your filesystem"
2
+ desc "Pulls a database to your filesystem (PARALLEL=4 for 4 workers)"
3
3
  task :pull => :environment do
4
4
  # Default options
5
- opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
5
+ opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
6
+
7
+ # Allow overriding parallel workers via PARALLEL environment variable
8
+ if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
9
+ opts[:parallel] = ENV['PARALLEL'].to_i
10
+ puts "Using #{opts[:parallel]} parallel workers"
11
+ end
6
12
 
7
13
  # Get the dump_path
8
14
  dump_path = File.expand_path(Rails.root.join("db", Time.now.strftime("%Y%m%d%I%M%S%p"))).to_s
@@ -20,10 +26,16 @@ namespace :tapsoob do
20
26
  Rake::Task["tapsoob:clean"].invoke
21
27
  end
22
28
 
23
- desc "Push a compatible dump on your filesystem to a database"
29
+ desc "Push a compatible dump on your filesystem to a database (PARALLEL=4 for 4 workers)"
24
30
  task :push, [:timestamp] => :environment do |t, args|
25
31
  # Default options
26
- opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
32
+ opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
33
+
34
+ # Allow overriding parallel workers via PARALLEL environment variable
35
+ if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
36
+ opts[:parallel] = ENV['PARALLEL'].to_i
37
+ puts "Using #{opts[:parallel]} parallel workers"
38
+ end
27
39
 
28
40
  # Get the dumps
29
41
  dumps = Dir[Rails.root.join("db", "*/")].select { |e| e =~ /([0-9]{14})([A-Z]{2})/ }.sort
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tapsoob
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: java
6
6
  authors:
7
7
  - Félix Bellanger