tapsoob 0.6.2-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 4a60a9f569ac31e7a9390328af8974ee95a056a3d35b510eedafb13fd9ab3331
- data.tar.gz: 5f6a3ca22a9b95bff31adf8a9921bfc79e3768047fbe769a0c9c3adbfe72c7f7
+ metadata.gz: 87a418bd365385b576c8eaeac8c5a54dec4b2254a3f7e30aa89a1e62bb0bb691
+ data.tar.gz: 3191066263768280a015c824411a797a5642052238c48f89cb55d84368bdb13b
  SHA512:
- metadata.gz: c7f62ebad5ce2449ae283a22f5ae0e63faaa84cdce808537eda8e35f60346ff782be1826c3614fd4636787b67df8e1cc78f1ce1d74103d33c78b2ff41146dbec
- data.tar.gz: 91911b166a9bae36b504dfa2112088fc11033f36f8b100ded1e9e9d94ad409c9bfb23a7dffe5afb5ec0d6e1adcc619a7923cf774bc9856c0013e6a78b2ae16fd
+ metadata.gz: 5318c4e1212ed88cdb4e9468f7cc7e39024074fc34d76a00b972758d00a1e9ab1f6fb91e7ee15b65b5bc3c1c657509f1aa5db11d37b5113bc191e55f5a6606de
+ data.tar.gz: b6545013c50c34b73d9c0a62d8b2995749187aee051bc296a1f4d2d4754c0504a24d215554b436b4a958e3b46da833603ec291039d2b94b2f81d0ffecd5b5d02
data/README.md CHANGED
@@ -14,6 +14,23 @@ Tapsoob currently rely on the Sequel ORM (<http://sequel.rubyforge.org/>) so we
  If you're using either Oracle or Oracle XE you will need some extra requirements. If you're using Ruby you'll need to have your ORACLE_HOME environment variable set properly and the `ruby-oci8` gem installed. However, if you're using JRuby you'll need the official Oracle JDBC driver (see here for more information: <http://www.oracle.com/technetwork/articles/dsl/jruby-oracle11g-330825.html>), and it should be loaded prior to using Tapsoob, otherwise you won't be able to connect to the database.
 
 
+ ## Recent changes
+
+ ### 0.7.0
+
+ #### Features
+
+ * A new CLI interface when dumping/loading data with parallelization, which is a much cleaner way to keep track of what's going on.
+ * De facto intra-table parallelization for large tables, which makes a tremendous difference (a 10x speed boost in most cases).
+ * When using the `--progress=false` or `--no-progress` option, PROGRESS data is now output to STDERR (only when dumping/loading to/from a directory).
+
+ #### Internal changes
+
+ * Moved all DataStream* classes into a dedicated module.
+ * Moved all Operation classes into a dedicated module.
+ * Moved all progress bar classes into a dedicated module.
+
+
  ## Exporting your data
 
  tapsoob pull [OPTIONS] <dump_path> <database_url>
@@ -60,7 +77,7 @@ If you're using Rails, there's also two Rake tasks provided:
  * `tapsoob:pull` which dumps the database into a new folder under the `db` folder
  * `tapsoob:push` which reads the last dump you made from `tapsoob:pull` from the `db` folder
 
- ## NEW : Full parallelization support from 0.6.1 onwards
+ ## Parallelization support from 0.6.1 onwards
 
  You can now dump/load a full database or its data using parallelization to speed up the process, at the cost of extra memory and database load, like so:
 
@@ -78,7 +95,6 @@ Your exports can be moved from one machine to another for backups or replication
 
  ## ToDo
 
- * Add a compression layer
  * Tests (in progress)
 
 
@@ -27,7 +27,7 @@ module Tapsoob
  opts[:parallel] = 1
  end
 
- op = Tapsoob::Operation.factory(:pull, database_url, dump_path, opts)
+ op = Tapsoob::Operation::Base.factory(:pull, database_url, dump_path, opts)
  op.pull_data
  end
 
@@ -46,7 +46,7 @@ module Tapsoob
 
  # If dump_path is provided, use the Operation class for proper parallel support
  if dump_path && Dir.exist?(dump_path)
- op = Tapsoob::Operation.factory(:push, database_url, dump_path, opts)
+ op = Tapsoob::Operation::Base.factory(:push, database_url, dump_path, opts)
  op.push_data
  else
  # STDIN mode: read and import data directly (no parallel support for STDIN)
@@ -66,7 +66,7 @@ module Tapsoob
  db(database_url, opts)[table_name.to_sym].truncate
  end
 
- stream = Tapsoob::DataStream.factory(db(database_url, opts), {
+ stream = Tapsoob::DataStream::Base.factory(db(database_url, opts), {
  table_name: table_name,
  chunksize: opts[:default_chunksize]
  }, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
@@ -125,7 +125,7 @@ module Tapsoob
  FileUtils.mkpath "#{dump_path}/data"
  FileUtils.mkpath "#{dump_path}/indexes"
 
- Tapsoob::Operation.factory(method, database_url, dump_path, opts).run
+ Tapsoob::Operation::Base.factory(method, database_url, dump_path, opts).run
  end
 
  def clientresumexfer(method, dump_path, database_url, opts)
@@ -134,7 +134,6 @@ module Tapsoob
 
  dump_path = dump_path || session.delete(:dump_path)
 
- require 'taps/operation'
 
  newsession = session.merge({
  :default_chunksize => opts[:default_chunksize],
@@ -142,7 +141,7 @@ module Tapsoob
  :resume => true
  })
 
- Tapsoob::Operation.factory(method, database_url, dump_path, newsession).run
+ Tapsoob::Operation::Base.factory(method, database_url, dump_path, newsession).run
  end
  end
  end
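
Taken together, the hunks above are a namespace change: the factory entry points that previously lived on `Tapsoob::Operation` and `Tapsoob::DataStream` now live on `Tapsoob::Operation::Base` and `Tapsoob::DataStream::Base`, with the same arguments. Below is a minimal sketch of the updated call pattern, based only on the calls visible in this diff; the connection URL, dump path, table name and option values are illustrative placeholders, and any `require` path other than `tapsoob/data_stream/base` is assumed.

```ruby
require 'sequel'
require 'tapsoob/data_stream/base'   # new path introduced in this release

database_url = "sqlite://example.db"                     # placeholder URL
dump_path    = "db/dump"                                 # placeholder dump directory
opts         = { parallel: 1, default_chunksize: 1000 }  # illustrative options

# 0.6.x: Tapsoob::Operation.factory(:pull, ...)
# 0.7.0: the same factory, now on Operation::Base
op = Tapsoob::Operation::Base.factory(:pull, database_url, dump_path, opts)
op.pull_data

# Same move for data streams: DataStream.factory becomes DataStream::Base.factory
db = Sequel.connect(database_url)
stream = Tapsoob::DataStream::Base.factory(db, {
  table_name: "users",                  # placeholder table
  chunksize:  opts[:default_chunksize]
}, { :"discard-identity" => false, :purge => false })
```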
@@ -0,0 +1,315 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/log'
+ require 'tapsoob/utils'
+
+ module Tapsoob
+   module DataStream
+     class Base
+       DEFAULT_CHUNKSIZE = 1000
+
+       attr_reader :db, :state, :options
+
+       def initialize(db, state, opts = {})
+         @db = db
+         @state = {
+           :offset => 0,
+           :avg_chunksize => 0,
+           :num_chunksize => 0,
+           :total_chunksize => 0
+         }.merge(state)
+         @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+         @options = opts
+         @complete = false
+       end
+
+       def log
+         Tapsoob.log.level = Logger::DEBUG if state[:debug]
+         Tapsoob.log
+       end
+
+       def error=(val)
+         state[:error] = val
+       end
+
+       def error
+         state[:error] || false
+       end
+
+       def table_name
+         state[:table_name].to_sym
+       end
+
+       def table_name_sql
+         table_name
+       end
+
+       def to_hash
+         state.merge(:klass => self.class.to_s)
+       end
+
+       def to_json
+         JSON.generate(to_hash)
+       end
+
+       def string_columns
+         @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
+       end
+
+       def table
+         @table ||= db[table_name_sql]
+       end
+
+       def order_by(name=nil)
+         @order_by ||= begin
+           name ||= table_name
+           Tapsoob::Utils.order_by(db, name)
+         end
+       end
+
+       def increment(row_count)
+         state[:offset] += row_count
+       end
+
+       # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+       # goes below 100 or maybe if offset is > 1000
+       def fetch_rows
+         # Only count once on first fetch
+         state[:size] ||= table.count
+
+         ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+         log.debug "DataStream::Base#fetch_rows SQL -> #{ds.sql}"
+         rows = Tapsoob::Utils.format_data(db, ds.all,
+           :string_columns => string_columns,
+           :schema => db.schema(table_name),
+           :table => table_name
+         )
+         update_chunksize_stats
+         rows
+       end
+
+       def fetch_file(dump_path)
+         # Stream NDJSON format - read line by line without loading entire file
+         file_path = File.join(dump_path, "data", "#{table_name}.json")
+
+         # Initialize state on first call
+         unless state[:file_initialized]
+           state[:file_initialized] = true
+           state[:lines_read] = 0
+           state[:total_lines] = File.foreach(file_path).count
+         end
+
+         table_name_val = nil
+         header_val = nil
+         types_val = nil
+         data_batch = []
+
+         # Read from current offset
+         File.open(file_path, 'r') do |file|
+           # Skip to current offset
+           state[:lines_read].times { file.gets }
+
+           # Read chunksize worth of lines
+           state[:chunksize].times do
+             break if file.eof?
+             line = file.gets
+             next unless line
+
+             chunk = JSON.parse(line.strip)
+             table_name_val ||= chunk["table_name"]
+             header_val ||= chunk["header"]
+             types_val ||= chunk["types"]
+             data_batch.concat(chunk["data"]) if chunk["data"]
+
+             state[:lines_read] += 1
+           end
+         end
+
+         # Apply skip-duplicates if needed
+         data_batch = data_batch.uniq if @options[:"skip-duplicates"]
+
+         # Don't set state[:size] or state[:offset] here - they're managed separately
+         # for completion tracking based on actual data rows imported
+         log.debug "DataStream::Base#fetch_file: read #{data_batch.size} rows from #{state[:lines_read]} lines (total #{state[:total_lines]} lines in file)"
+
+         rows = {
+           :table_name => table_name_val,
+           :header => header_val,
+           :data => data_batch,
+           :types => types_val
+         }
+         update_chunksize_stats
+         rows
+       end
+
+       def max_chunksize_training
+         20
+       end
+
+       def update_chunksize_stats
+         return if state[:num_chunksize] >= max_chunksize_training
+         state[:total_chunksize] += state[:chunksize]
+         state[:num_chunksize] += 1
+         state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+       end
+
+       def encode_rows(rows)
+         Tapsoob::Utils.base64encode(Marshal.dump(rows))
+       end
+
+       def fetch(opts = {})
+         opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
+
+         log.debug "DataStream::Base#fetch state -> #{state.inspect}"
+
+         t1 = Time.now
+         rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
+         encoded_data = encode_rows(rows)
+         t2 = Time.now
+         elapsed_time = t2 - t1
+
+         # Only increment offset for database fetches
+         # For file fetches, offset is managed by fetch_file (tracks lines read, not rows)
+         if opts[:type] != "file"
+           state[:offset] += (rows == {} ? 0 : rows[:data].size)
+         end
+
+         [encoded_data, (rows == {} ? 0 : rows[:data].size), elapsed_time]
+       end
+
+       def complete?
+         # For file-based loading, check if we've read all lines
+         if state[:file_initialized]
+           result = state[:lines_read] >= state[:total_lines]
+           log.debug "DataStream::Base#complete? (file) lines_read=#{state[:lines_read]} total_lines=#{state[:total_lines]} result=#{result} table=#{table_name}"
+           result
+         else
+           # For database fetching, check offset vs size
+           result = state[:offset] >= state[:size]
+           log.debug "DataStream::Base#complete? (db) offset=#{state[:offset]} size=#{state[:size]} result=#{result} table=#{table_name}"
+           result
+         end
+       end
+
+       def fetch_data_from_database(params)
+         encoded_data = params[:encoded_data]
+
+         rows = parse_encoded_data(encoded_data, params[:checksum])
+
+         # update local state
+         state.merge!(params[:state].merge(:chunksize => state[:chunksize]))
+
+         yield rows if block_given?
+         (rows == {} ? 0 : rows[:data].size)
+       end
+
+       def fetch_data_to_database(params)
+         encoded_data = params[:encoded_data]
+
+         rows = parse_encoded_data(encoded_data, params[:checksum])
+
+         log.debug "DataStream::Base#fetch_data_to_database: importing #{rows[:data] ? rows[:data].size : 0} rows for table #{table_name rescue 'unknown'}"
+         import_rows(rows)
+         (rows == {} ? 0 : rows[:data].size)
+       end
+
+       def self.parse_json(json)
+         hash = JSON.parse(json).symbolize_keys
+         hash[:state].symbolize_keys! if hash.has_key?(:state)
+         hash
+       end
+
+       def parse_encoded_data(encoded_data, checksum)
+         raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
+
+         begin
+           return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
+         rescue Object => e
+           unless ENV['NO_DUMP_MARSHAL_ERRORS']
+             puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+             File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+           end
+           raise e
+         end
+       end
+
+       def import_rows(rows)
+         columns = rows[:header]
+         data = rows[:data]
+
+         # Only import existing columns
+         if table.columns.size != columns.size
+           existing_columns = table.columns.map(&:to_s)
+           additional_columns = columns - existing_columns
+           additional_columns_idxs = additional_columns.map { |c| columns.index(c) }
+           additional_columns_idxs.reverse.each do |idx|
+             columns.delete_at(idx)
+             rows[:types].delete_at(idx)
+           end
+           data.each_index { |didx| additional_columns_idxs.reverse.each { |idx| data[didx].delete_at(idx) } }
+         end
+
+         # Decode blobs
+         if rows.has_key?(:types) && rows[:types].include?("blob")
+           blob_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == "blob" }
+           data.each_index do |idx|
+             blob_indices.each do |bi|
+               data[idx][bi] = Sequel::SQL::Blob.new(Tapsoob::Utils.base64decode(data[idx][bi])) unless data[idx][bi].nil?
+             end
+           end
+         end
+
+         # Parse date/datetime/time columns
+         if rows.has_key?(:types)
+           %w(date datetime time).each do |type|
+             if rows[:types].include?(type)
+               type_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == type }
+               data.each_index do |idx|
+                 type_indices.each do |ti|
+                   data[idx][ti] = Sequel.send("string_to_#{type}".to_sym, data[idx][ti]) unless data[idx][ti].nil?
+                 end
+               end
+             end
+           end
+         end
+
+         # Remove id column
+         if @options[:"discard-identity"] && rows[:header].include?("id")
+           columns = rows[:header] - ["id"]
+           data = data.map { |d| d[1..-1] }
+         end
+
+         table.import(columns, data, :commit_every => 100)
+       rescue Exception => ex
+         case ex.message
+         when /integer out of range/ then
+           raise Tapsoob::InvalidData, <<-ERROR, []
+ \nDetected integer data that exceeds the maximum allowable size for an integer type.
+ This generally occurs when importing from SQLite due to the fact that SQLite does
+ not enforce maximum values on integer types.
+           ERROR
+         else raise ex
+         end
+       end
+
+       def verify_stream
+         state[:offset] = table.count
+       end
+
+       def self.factory(db, state, opts)
+         if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+           Sequel::MySQL.convert_invalid_date_time = :nil
+         end
+
+         if state.has_key?(:klass)
+           return eval(state[:klass]).new(db, state, opts)
+         end
+
+         if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+           Tapsoob::DataStream::Keyed.new(db, state, opts)
+         else
+           Tapsoob::DataStream::Base.new(db, state, opts)
+         end
+       end
+     end
+   end
+ end
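
As a rough orientation for the class above: `factory` picks `Keyed` or `Base` depending on whether the table has a single integer primary key, and the caller loops on `fetch` until `complete?` turns true. The real driving loop lives in the Operation classes, which are not part of this diff, so treat the following as a sketch; the connection URL and table name are placeholders.

```ruby
require 'sequel'
require 'tapsoob/data_stream/base'

db = Sequel.connect("sqlite://example.db")  # placeholder connection
stream = Tapsoob::DataStream::Base.factory(db, { table_name: "users", chunksize: 1000 }, {})

until stream.complete?
  # fetch returns [encoded_chunk, row_count, elapsed_time]; the chunk is a
  # base64-encoded Marshal dump of { :table_name, :header, :data, :types }
  encoded, row_count, elapsed = stream.fetch
  puts "fetched #{row_count} rows in #{elapsed.round(2)}s"
  break if row_count.zero?  # defensive stop in addition to complete?
end
```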
@@ -0,0 +1,87 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/data_stream/base'
+
+ module Tapsoob
+   module DataStream
+     # DataStream variant for file-based parallelized loading
+     # Each worker reads a different portion of the NDJSON file
+     class FilePartition < Base
+       def initialize(db, state, opts = {})
+         super(db, state, opts)
+         @state = {
+           :line_range => nil, # [start_line, end_line]
+           :lines_read => 0
+         }.merge(@state)
+
+         # Initialize current_line from line_range if provided
+         if @state[:line_range]
+           start_line, end_line = @state[:line_range]
+           @state[:current_line] = start_line
+         end
+       end
+
+       def fetch_file(dump_path)
+         return {} if state[:line_range].nil?
+
+         file_path = File.join(dump_path, "data", "#{table_name}.json")
+         start_line, end_line = state[:line_range]
+
+         table_name_val = nil
+         header_val = nil
+         types_val = nil
+         data_batch = []
+
+         # Read lines in this worker's range
+         File.open(file_path, 'r') do |file|
+           # Skip to current position
+           state[:current_line].times { file.gets }
+
+           # Read up to chunksize lines, but don't exceed end_line
+           lines_to_read = [state[:chunksize], end_line - state[:current_line] + 1].min
+           log.debug "DataStream::FilePartition#fetch_file: current_line=#{state[:current_line]} end_line=#{end_line} lines_to_read=#{lines_to_read} chunksize=#{state[:chunksize]} table=#{table_name}"
+
+           lines_to_read.times do
+             break if file.eof? || state[:current_line] > end_line
+
+             line = file.gets
+             next unless line
+
+             chunk = JSON.parse(line.strip)
+             table_name_val ||= chunk["table_name"]
+             header_val ||= chunk["header"]
+             types_val ||= chunk["types"]
+             data_batch.concat(chunk["data"]) if chunk["data"]
+
+             state[:current_line] += 1
+           end
+         end
+
+         log.debug "DataStream::FilePartition#fetch_file: read #{data_batch.size} rows in #{state[:current_line] - start_line} lines table=#{table_name}"
+
+         # Apply skip-duplicates if needed
+         data_batch = data_batch.uniq if @options[:"skip-duplicates"]
+
+         state[:size] = end_line - start_line + 1
+         state[:offset] = state[:current_line] - start_line
+
+         rows = {
+           :table_name => table_name_val,
+           :header => header_val,
+           :data => data_batch,
+           :types => types_val
+         }
+
+         update_chunksize_stats
+         rows
+       end
+
+       def complete?
+         return true if state[:line_range].nil?
+         start_line, end_line = state[:line_range]
+         result = state[:current_line] && state[:current_line] > end_line
+         log.debug "DataStream::FilePartition#complete? current_line=#{state[:current_line]} end_line=#{end_line} result=#{result} table=#{table_name}"
+         result
+       end
+     end
+   end
+ end
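
The `line_range` state above is what gives each worker its own slice of the NDJSON dump. How those ranges are computed lives in the Operation classes, which are not shown in this diff, so the helper below is a hypothetical illustration of one way to split a table file into contiguous ranges and build one `FilePartition` stream per worker; the require path, connection URL, dump path and table name are all assumptions.

```ruby
require 'sequel'
require 'tapsoob/data_stream/file_partition'  # assumed path for the class above

# Hypothetical helper: split an NDJSON file into contiguous, inclusive
# [start_line, end_line] ranges, one per worker (nil when a worker gets nothing).
def line_ranges(file_path, num_workers)
  total = File.foreach(file_path).count
  per   = (total.to_f / num_workers).ceil
  (0...num_workers).map do |w|
    start_line = w * per
    end_line   = [start_line + per - 1, total - 1].min
    start_line <= end_line ? [start_line, end_line] : nil
  end
end

db        = Sequel.connect("sqlite://example.db")  # placeholder connection
dump_path = "db/dump"                              # placeholder dump directory
ranges    = line_ranges(File.join(dump_path, "data", "users.json"), 4)

streams = ranges.map do |range|
  Tapsoob::DataStream::FilePartition.new(db, {
    table_name: "users", chunksize: 1000, line_range: range
  }, {})
end
# Each stream only ever reads its own slice via fetch_file(dump_path),
# so the streams can be driven from separate threads or processes.
```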
@@ -0,0 +1,80 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/data_stream/base'
+
+ module Tapsoob
+   module DataStream
+     # DataStream variant for interleaved chunk-based partitioning (for tables without integer PK)
+     class Interleaved < Base
+       def initialize(db, state, opts = {})
+         super(db, state, opts)
+         # :worker_id = which worker this is (0-indexed)
+         # :num_workers = total number of workers
+         # :chunk_number = current chunk number for this worker
+         @state = {
+           :worker_id => 0,
+           :num_workers => 1,
+           :chunk_number => 0
+         }.merge(@state)
+       end
+
+       def fetch_rows
+         worker_id = state[:worker_id]
+         num_workers = state[:num_workers]
+         chunk_number = state[:chunk_number]
+         chunksize = state[:chunksize]
+
+         # Only count once on first fetch
+         state[:size] ||= table.count
+
+         # Calculate which global chunk this worker should fetch
+         # Worker 0: chunks 0, num_workers, 2*num_workers, ...
+         # Worker 1: chunks 1, num_workers+1, 2*num_workers+1, ...
+         global_chunk_index = (chunk_number * num_workers) + worker_id
+         offset = global_chunk_index * chunksize
+
+         ds = table.order(*order_by).limit(chunksize, offset)
+         log.debug "DataStream::Interleaved#fetch_rows SQL -> #{ds.sql} (worker #{worker_id}/#{num_workers}, chunk #{chunk_number})"
+
+         rows = Tapsoob::Utils.format_data(db, ds.all,
+           :string_columns => string_columns,
+           :schema => db.schema(table_name),
+           :table => table_name
+         )
+
+         update_chunksize_stats
+         rows
+       end
+
+       def fetch(opts = {})
+         opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
+
+         log.debug "DataStream::Interleaved#fetch state -> #{state.inspect}"
+
+         t1 = Time.now
+         rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
+         encoded_data = encode_rows(rows)
+         t2 = Time.now
+         elapsed_time = t2 - t1
+
+         row_count = (rows == {} ? 0 : rows[:data].size)
+
+         # Always increment chunk number to avoid infinite loops
+         # Even if we got 0 rows, move to the next chunk position
+         state[:chunk_number] += 1
+         state[:offset] += row_count
+
+         [encoded_data, row_count, elapsed_time]
+       end
+
+       def increment(row_count)
+         # This is called by the old code path - not used in new parallel implementation
+         state[:chunk_number] += 1
+         state[:offset] += row_count
+       end
+
+       def complete?
+         state[:offset] >= state[:size]
+       end
+     end
+   end
+ end
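
The offset arithmetic in `fetch_rows` above interleaves chunks across workers rather than giving each worker one contiguous block of the table. A small worked example of the resulting offsets (chunk size and worker count are arbitrary here):

```ruby
# offset = ((chunk_number * num_workers) + worker_id) * chunksize
chunksize   = 1000
num_workers = 3

(0...num_workers).each do |worker_id|
  offsets = (0..2).map { |chunk_number| ((chunk_number * num_workers) + worker_id) * chunksize }
  puts "worker #{worker_id}: offsets #{offsets.inspect}"
end
# => worker 0: offsets [0, 3000, 6000]
#    worker 1: offsets [1000, 4000, 7000]
#    worker 2: offsets [2000, 5000, 8000]
```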