tapsoob 0.6.0-java → 0.6.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -1
- data/lib/tapsoob/cli/data_stream.rb +45 -20
- data/lib/tapsoob/version.rb +1 -1
- data/lib/tasks/tapsoob.rake +16 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4496a325b1c666f3184aac7b24f0344c7c895bf3481c04faa0326bab6f8ac9c0
+  data.tar.gz: 4794ef68add8faa325f7eb3212e40213e43a946b3947342aafa49290b2815aa3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9da5a4494551b52576443de7f48ec8f2e27df98422c10a8d1e222ba6182d4ffab59fbd6a3bfd3b7f42fc64ded0f02c6e51056168e0bb9214e52e874c7765bdc5
+  data.tar.gz: f72add9544b1ec901579f8a661e6157e27f2ea905854c0c4addc074debe736498963d0500d5a73dce986a18b4a7391677c0e9d9560556ca5ba6c551108f006b1
data/README.md
CHANGED
@@ -36,7 +36,7 @@ You can list all available options using the command:
     tapsoob push -h


-##
+## Piping your schema/indexes/data

 Due to some needs we added ways to pipe your schema/indexes/data directly from one database to another, here's an equivalent of the export/import process described above using this technique :

@@ -60,6 +60,16 @@ If you're using Rails, there's also two Rake tasks provided:
 * `tapsoob:pull` which dumps the database into a new folder under the `db` folder
 * `tapsoob:push` which reads the last dump you made from `tapsoob:pull` from the `db` folder

+## NEW: Full parallelization support from 0.6.1 onwards
+
+You can now dump/load a full database (or just its data) in parallel to speed up the process, at the cost of extra memory usage and database load:
+
+```
+tapsoob pull [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
+tapsoob push [OPTIONS] <dump_path> <database_url> -j <number_of_threads>
+```
+
+It defaults to a single thread, as before 0.6.1. The option also applies to `tapsoob data pull/push`, but only when dumping to files: you can't parallelize and pipe at the same time, so for safety it falls back to a single thread with a warning if you try.

 ## Notes
data/lib/tapsoob/cli/data_stream.rb
CHANGED

@@ -14,10 +14,20 @@ module Tapsoob
     option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
     option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
     option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
+    option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
     option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
     option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
     def pull(database_url, dump_path = nil)
-
+      opts = parse_opts(options)
+
+      # Force serial mode when outputting to STDOUT (for piping)
+      # Parallel mode would interleave output and corrupt the JSON stream
+      if dump_path.nil? && opts[:parallel] && opts[:parallel] > 1
+        STDERR.puts "Warning: Parallel mode disabled when outputting to STDOUT (for piping)"
+        opts[:parallel] = 1
+      end
+
+      op = Tapsoob::Operation.factory(:pull, database_url, dump_path, opts)
       op.pull_data
     end
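The hunk above only covers the CLI side of `--parallel`; the actual fan-out lives in `Tapsoob::Operation`, which is not part of this diff. As a rough, self-contained sketch of the general pattern such a table-level worker pool follows (illustrative names only, not the gem's implementation), each worker takes table names off a shared queue and writes to its own file, which is also why the STDOUT/piping path above has to stay single-threaded:

```ruby
require "json"

# Illustrative sketch only (not Tapsoob's Operation code): fan table dumps out
# to a fixed number of worker threads, the way a --parallel/-j option might.
def dump_tables(tables, workers: 1, dir: ".")
  queue = Queue.new
  tables.each { |t| queue << t }

  Array.new(workers) do
    Thread.new do
      loop do
        table =
          begin
            queue.pop(true) # non-blocking pop; raises ThreadError when empty
          rescue ThreadError
            break
          end
        # Each worker writes to its own file; sharing STDOUT between workers
        # would interleave lines and corrupt a piped JSON stream.
        File.write(File.join(dir, "#{table}.json"), JSON.generate({ table_name: table }))
      end
    end
  end.each(&:join)
end

dump_tables(%w[users orders items], workers: 4)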
@@ -25,36 +35,48 @@ module Tapsoob
     option :chunksize, desc: "Initial chunksize", default: 1000, type: :numeric, aliases: "-c"
     option :tables, desc: "Shortcut to filter on a list of tables", type: :array, aliases: "-t"
     option :"exclude-tables", desc: "Shortcut to exclude a list of tables", type: :array, aliases: "-e"
+    option :parallel, desc: "Number of parallel workers for table processing (default: 1)", default: 1, type: :numeric, aliases: "-j"
     option :progress, desc: "Show progress", default: true, type: :boolean, aliases: "-p"
     option :purge, desc: "Purge data in tables prior to performing the import", default: false, type: :boolean
     option :"skip-duplicates", desc: "Remove duplicates when loading data", default: false, type: :boolean
     option :"discard-identity", desc: "Remove identity when pushing data (may result in creating duplicates)", default: false, type: :boolean
     option :debug, desc: "Enable debug messages", default: false, type: :boolean, aliases: "-d"
     def push(database_url, dump_path = nil)
-      # instantiate stuff
-      data = []
       opts = parse_opts(options)

-      #
+      # If dump_path is provided, use the Operation class for proper parallel support
       if dump_path && Dir.exist?(dump_path)
-
-
+        op = Tapsoob::Operation.factory(:push, database_url, dump_path, opts)
+        op.push_data
       else
+        # STDIN mode: read and import data directly (no parallel support for STDIN)
+        if opts[:parallel] && opts[:parallel] > 1
+          STDERR.puts "Warning: Parallel mode not supported when reading from STDIN"
+        end
+
+        data = []
         STDIN.each_line { |line| data << JSON.parse(line, symbolize_names: true) }
-      end

-
-
-
-
-
-
-
-
-
-
-
-
+        # import data
+        data.each do |table|
+          table_name = table[:table_name]
+
+          # Truncate table if purge option is enabled
+          if opts[:purge]
+            db(database_url, opts)[table_name.to_sym].truncate
+          end
+
+          stream = Tapsoob::DataStream.factory(db(database_url, opts), {
+            table_name: table_name,
+            chunksize: opts[:default_chunksize]
+          }, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
+
+          begin
+            stream.import_rows(table)
+          rescue Exception => e
+            stream.log.debug e.message
+            STDERR.puts "Error loading data in #{table_name} : #{e.message}"
+          end
         end
       end
     end
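The `else` branch above is the piping path: `push` reads newline-delimited JSON from STDIN, one document per table, and the only key the CLI itself relies on is `:table_name`. A self-contained sketch of that parsing step (the extra fields in the sample payload are made up, not the gem's exact stream format):

```ruby
require "json"
require "stringio"

# Stand-in for STDIN: two newline-delimited JSON documents, one per table.
# Field names other than table_name are illustrative only.
input = StringIO.new(<<~NDJSON)
  {"table_name":"users","header":["id","name"],"data":[[1,"Ada"],[2,"Grace"]]}
  {"table_name":"orders","header":["id","total"],"data":[[1,9.99]]}
NDJSON

data = []
input.each_line { |line| data << JSON.parse(line, symbolize_names: true) }

data.each do |table|
  puts "#{table[:table_name]}: #{table[:data].length} row(s)"
end
```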
@@ -65,6 +87,7 @@ module Tapsoob
       opts = {
         progress: options[:progress],
         tables: options[:tables],
+        parallel: options[:parallel],
         debug: options[:debug]
       }
@@ -85,7 +108,9 @@ module Tapsoob
     end

     def db(database_url, opts = {})
-
+      # Support connection pooling for parallel operations
+      parallel_workers = opts[:parallel] || 1
+      @db ||= Sequel.connect(database_url, max_connections: parallel_workers * 2)
       @db.loggers << Tapsoob.log if opts[:debug]

       # Set parameters
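Sizing `max_connections` at twice the worker count gives each thread its own connection with some headroom, since Sequel's default threaded connection pool checks a connection out per thread on demand. A minimal sketch of that behaviour (requires the `sequel` and `sqlite3` gems; the table and worker count are made-up examples, not anything from the gem):

```ruby
require "sequel"
require "tmpdir"

workers = 4

Dir.mktmpdir do |dir|
  # Pool sized the same way the diff does it: two connections per worker.
  db = Sequel.sqlite(File.join(dir, "demo.db"), max_connections: workers * 2)
  db.create_table(:items) do
    primary_key :id
    Integer :worker
  end

  # Each thread checks its own connection out of the threaded pool.
  Array.new(workers) { |i| Thread.new { db[:items].insert(worker: i) } }.each(&:join)

  puts db[:items].count # => 4
end
```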
data/lib/tapsoob/version.rb
CHANGED
data/lib/tasks/tapsoob.rake
CHANGED
@@ -1,8 +1,14 @@
 namespace :tapsoob do
-  desc "Pulls a database to your filesystem"
+  desc "Pulls a database to your filesystem (PARALLEL=4 for 4 workers)"
   task :pull => :environment do
     # Default options
-    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
+    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
+
+    # Allow overriding parallel workers via PARALLEL environment variable
+    if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
+      opts[:parallel] = ENV['PARALLEL'].to_i
+      puts "Using #{opts[:parallel]} parallel workers"
+    end

     # Get the dump_path
     dump_path = File.expand_path(Rails.root.join("db", Time.now.strftime("%Y%m%d%I%M%S%p"))).to_s
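With the env-var check above, the worker count is chosen at invocation time, e.g. `PARALLEL=4 bundle exec rake tapsoob:pull` from a shell. A sketch of the programmatic equivalent, assuming a loaded Rails application (for example in a console or script):

```ruby
# Sketch: run the pull task from Ruby with four workers. Assumes a Rails app
# whose rake tasks have been loaded.
require "rake"
Rails.application.load_tasks

ENV["PARALLEL"] = "4" # read by the task's env-var check above
Rake::Task["tapsoob:pull"].invoke
```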
@@ -20,10 +26,16 @@ namespace :tapsoob do
     Rake::Task["tapsoob:clean"].invoke
   end

-  desc "Push a compatible dump on your filesystem to a database"
+  desc "Push a compatible dump on your filesystem to a database (PARALLEL=4 for 4 workers)"
   task :push, [:timestamp] => :environment do |t, args|
     # Default options
-    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true}
+    opts={:default_chunksize => 1000, :debug => false, :resume_filename => nil, :disable_compression => false, :schema => true, :data => true, :indexes_first => false, :progress => true, :parallel => 1}
+
+    # Allow overriding parallel workers via PARALLEL environment variable
+    if ENV['PARALLEL'] && ENV['PARALLEL'].to_i > 0
+      opts[:parallel] = ENV['PARALLEL'].to_i
+      puts "Using #{opts[:parallel]} parallel workers"
+    end

     # Get the dumps
     dumps = Dir[Rails.root.join("db", "*/")].select { |e| e =~ /([0-9]{14})([A-Z]{2})/ }.sort
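The push task also takes an optional `[:timestamp]` argument, presumably to select a specific dump folder; folder names follow the `%Y%m%d%I%M%S%p` pattern used by `tapsoob:pull`. A sketch of pushing one particular dump with four workers (the timestamp value is made up, and the same Rails/Rake setup as above is assumed):

```ruby
# Sketch: push one specific dump folder by its timestamp argument.
ENV["PARALLEL"] = "4"
Rake::Task["tapsoob:push"].invoke("20240101063000AM")
```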