tapsoob 0.6.2-java → 0.7.0-java

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,124 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/data_stream/base'
+
+ module Tapsoob
+   module DataStream
+     class Keyed < Base
+       attr_accessor :buffer
+
+       def initialize(db, state, opts = {})
+         super(db, state, opts)
+         @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
+         @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+         @buffer = []
+       end
+
+       def primary_key
+         state[:primary_key].to_sym
+       end
+
+       def buffer_limit
+         if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+           state[:last_fetched]
+         else
+           state[:filter]
+         end
+       end
+
+       def calc_limit(chunksize)
+         # we want to not fetch more than is needed while we're
+         # inside sinatra but locally we can select more than
+         # is strictly needed
+         if defined?(Sinatra)
+           (chunksize * 1.1).ceil
+         else
+           (chunksize * 3).ceil
+         end
+       end
+
+       def load_buffer(chunksize)
+         num = 0
+         loop do
+           limit = calc_limit(chunksize)
+           # we have to use local variables in order for the virtual row filter to work correctly
+           key = primary_key
+           buf_limit = buffer_limit
+           ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+           log.debug "DataStream::Keyed#load_buffer SQL -> #{ds.sql}"
+           data = ds.all
+           self.buffer += data
+           num += data.size
+           if data.any?
+             # keep a record of the last primary key value in the buffer
+             state[:filter] = self.buffer.last[primary_key]
+           end
+
+           break if num >= chunksize || data.empty?
+         end
+       end
+
+       def fetch_buffered(chunksize)
+         load_buffer(chunksize) if buffer.size < chunksize
+         rows = buffer.slice(0, chunksize)
+         state[:last_fetched] = rows.any? ? rows.last[primary_key] : nil
+         rows
+       end
+
+       def increment(row_count)
+         # pop the rows we just successfully sent off the buffer
+         @buffer.slice!(0, row_count)
+       end
+
+       def verify_stream
+         key = primary_key
+         ds = table.order(*order_by)
+         current_filter = ds.max(key.sql_number)
+
+         # set the current filter to the max of the primary key
+         state[:filter] = current_filter
+         # clear out the last_fetched value so it can restart from scratch
+         state[:last_fetched] = nil
+
+         log.debug "DataStream::Keyed#verify_stream -> state: #{state.inspect}"
+       end
+
+       # Calculate PK range for partitioning
+       def self.calculate_pk_ranges(db, table_name, num_partitions)
+         key = Tapsoob::Utils.order_by(db, table_name).first
+         ds = db[table_name.to_sym]
+
+         # Get total row count
+         total_rows = ds.count
+         return [[ds.min(key) || 0, ds.max(key) || 0]] if total_rows == 0 || num_partitions <= 1
+
+         # Calculate target rows per partition
+         rows_per_partition = (total_rows.to_f / num_partitions).ceil
+
+         # Find PK boundaries at percentiles using OFFSET
+         # This ensures even distribution of ROWS, not PK values
+         ranges = []
+         (0...num_partitions).each do |i|
+           # Calculate row offset for this partition's start
+           start_offset = i * rows_per_partition
+           end_offset = [(i + 1) * rows_per_partition - 1, total_rows - 1].min
+
+           # Get the PK value at this row offset
+           start_pk = ds.order(key).limit(1, start_offset).select(key).first
+           start_pk = start_pk ? start_pk[key] : (ds.min(key) || 0)
+
+           # Get the PK value at the end offset (or max for last partition)
+           if i == num_partitions - 1
+             end_pk = ds.max(key) || start_pk
+           else
+             end_pk_row = ds.order(key).limit(1, end_offset).select(key).first
+             end_pk = end_pk_row ? end_pk_row[key] : start_pk
+           end
+
+           ranges << [start_pk, end_pk]
+         end
+
+         ranges
+       end
+     end
+   end
+ end
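The hunk above appears to add Tapsoob::DataStream::Keyed (presumably lib/tapsoob/data_stream/keyed.rb). It carries over the old keyed-buffer streaming logic and gains a calculate_pk_ranges class method that splits a table into row-balanced primary-key ranges using OFFSET lookups. A minimal usage sketch, assuming a Sequel connection and a :users table with a single integer primary key (both hypothetical, not part of the diff):

    # Hypothetical sketch -- the connection URL and :users table are assumptions.
    require 'sequel'
    require 'tapsoob/data_stream/keyed'

    db = Sequel.connect('postgres://localhost/mydb')

    # Split :users into 4 ranges holding roughly equal numbers of rows.
    ranges = Tapsoob::DataStream::Keyed.calculate_pk_ranges(db, :users, 4)

    ranges.each_with_index do |(min_pk, max_pk), i|
      puts "partition #{i}: pk #{min_pk}..#{max_pk}"
    end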
@@ -0,0 +1,64 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/data_stream/base'
+
+ module Tapsoob
+   module DataStream
+     # DataStream variant for PK-based range partitioning
+     class KeyedPartition < Base
+       def initialize(db, state, opts = {})
+         super(db, state, opts)
+         # :partition_range = [min_pk, max_pk] for this partition
+         # :last_pk = last primary key value fetched
+         @state = {
+           :partition_range => nil,
+           :last_pk => nil
+         }.merge(@state)
+       end
+
+       def primary_key
+         @primary_key ||= Tapsoob::Utils.order_by(db, table_name).first
+       end
+
+       def fetch_rows
+         return {} if state[:partition_range].nil?
+
+         # Only count once on first fetch
+         state[:size] ||= table.count
+
+         min_pk, max_pk = state[:partition_range]
+         chunksize = state[:chunksize]
+
+         # Build query with PK range filter
+         key = primary_key
+         last = state[:last_pk] || (min_pk - 1)
+
+         ds = table.order(*order_by).filter do
+           (Sequel.identifier(key) > last) & (Sequel.identifier(key) >= min_pk) & (Sequel.identifier(key) <= max_pk)
+         end.limit(chunksize)
+
+         data = ds.all
+
+         # Update last_pk for next fetch
+         if data.any?
+           state[:last_pk] = data.last[primary_key]
+         else
+           # No data found in this range - mark partition as complete
+           state[:last_pk] = max_pk
+         end
+
+         Tapsoob::Utils.format_data(db, data,
+           :string_columns => string_columns,
+           :schema => db.schema(table_name),
+           :table => table_name
+         )
+       end
+
+       def complete?
+         return true if state[:partition_range].nil?
+         min_pk, max_pk = state[:partition_range]
+         # Complete when we've fetched past the max PK
+         state[:last_pk] && state[:last_pk] >= max_pk
+       end
+     end
+   end
+ end
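This second hunk appears to add Tapsoob::DataStream::KeyedPartition, which walks exactly one of those PK ranges: fetch_rows pulls the next chunk after state[:last_pk] and complete? reports true once last_pk reaches the range's maximum. A sketch of driving a single partition follows; the :table_name, :chunksize and :partition_range state keys are taken from the code above, while the connection, table, and chunk handling are assumptions:

    # Hypothetical sketch -- connection, table and chunk handling are assumptions.
    require 'sequel'
    require 'tapsoob/data_stream/keyed_partition'

    db = Sequel.connect('postgres://localhost/mydb')
    min_pk, max_pk = 1, 2_500   # one range from Keyed.calculate_pk_ranges

    stream = Tapsoob::DataStream::KeyedPartition.new(db, {
      :table_name      => :users,
      :chunksize       => 500,
      :partition_range => [min_pk, max_pk]
    })

    until stream.complete?
      chunk = stream.fetch_rows   # formatted rows for this PK range (via Tapsoob::Utils.format_data)
      # ... hand the chunk to a writer/importer here ...
    end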
@@ -1,383 +1,12 @@
  # -*- encoding : utf-8 -*-
- require 'tapsoob/log'
- require 'tapsoob/utils'

  module Tapsoob
-   class DataStream
-     DEFAULT_CHUNKSIZE = 1000
-
-     attr_reader :db, :state, :options
-
-     def initialize(db, state, opts = {})
-       @db = db
-       @state = {
-         :offset => 0,
-         :avg_chunksize => 0,
-         :num_chunksize => 0,
-         :total_chunksize => 0
-       }.merge(state)
-       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
-       @options = opts
-       @complete = false
-     end
-
-     def log
-       Tapsoob.log.level = Logger::DEBUG if state[:debug]
-       Tapsoob.log
-     end
-
-     def error=(val)
-       state[:error] = val
-     end
-
-     def error
-       state[:error] || false
-     end
-
-     def table_name
-       state[:table_name].to_sym
-     end
-
-     def table_name_sql
-       table_name
-     end
-
-     def to_hash
-       state.merge(:klass => self.class.to_s)
-     end
-
-     def to_json
-       JSON.generate(to_hash)
-     end
-
-     def string_columns
-       @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
-     end
-
-     def table
-       @table ||= db[table_name_sql]
-     end
-
-     def order_by(name=nil)
-       @order_by ||= begin
-         name ||= table_name
-         Tapsoob::Utils.order_by(db, name)
-       end
-     end
-
-     def increment(row_count)
-       state[:offset] += row_count
-     end
-
-     # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
-     # goes below 100 or maybe if offset is > 1000
-     def fetch_rows
-       #state[:chunksize] = fetch_chunksize
-       ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
-       state[:size] = table.count
-       log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
-       rows = Tapsoob::Utils.format_data(db, ds.all,
-         :string_columns => string_columns,
-         :schema => db.schema(table_name),
-         :table => table_name
-       )
-       update_chunksize_stats
-       rows
-     end
-
-     def fetch_file(dump_path)
-       #state[:chunksize] = fetch_chunksize
-       # Read NDJSON format - each line is a separate JSON chunk
-       file_path = File.join(dump_path, "data", "#{table_name}.json")
-
-       # Parse all chunks and combine them
-       all_data = []
-       table_name_val = nil
-       header_val = nil
-       types_val = nil
-
-       File.readlines(file_path).each do |line|
-         chunk = JSON.parse(line.strip)
-         table_name_val ||= chunk["table_name"]
-         header_val ||= chunk["header"]
-         types_val ||= chunk["types"]
-         all_data.concat(chunk["data"]) if chunk["data"]
-       end
-
-       # Apply skip-duplicates if needed
-       all_data = all_data.uniq if @options[:"skip-duplicates"]
-
-       state[:size] = all_data.size
-       log.debug "DataStream#fetch_file"
-
-       rows = {
-         :table_name => table_name_val,
-         :header => header_val,
-         :data => (all_data[state[:offset], state[:chunksize]] || []),
-         :types => types_val
-       }
-       update_chunksize_stats
-       rows
-     end
-
-     def max_chunksize_training
-       20
-     end
-
-     #def fetch_chunksize
-     #  chunksize = state[:chunksize]
-     #  return chunksize if state[:num_chunksize] < max_chunksize_training
-     #  return chunksize if state[:avg_chunksize] == 0
-     #  return chunksize if state[:error]
-     #  state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
-     #end
-
-     def update_chunksize_stats
-       return if state[:num_chunksize] >= max_chunksize_training
-       state[:total_chunksize] += state[:chunksize]
-       state[:num_chunksize] += 1
-       state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
-     end
-
-     def encode_rows(rows)
-       Tapsoob::Utils.base64encode(Marshal.dump(rows))
-     end
-
-     def fetch(opts = {})
-       opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
-
-       log.debug "DataStream#fetch state -> #{state.inspect}"
-
-       t1 = Time.now
-       rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
-       encoded_data = encode_rows(rows)
-       t2 = Time.now
-       elapsed_time = t2 - t1
-
-       state[:offset] += (rows == {} ? 0 : rows[:data].size)
-
-       [encoded_data, (rows == {} ? 0 : rows[:data].size), elapsed_time]
-     end
-
-     def complete?
-       state[:offset] >= state[:size]
-     end
-
-     def fetch_data_from_database(params)
-       encoded_data = params[:encoded_data]
-
-       rows = parse_encoded_data(encoded_data, params[:checksum])
-
-       # update local state
-       state.merge!(params[:state].merge(:chunksize => state[:chunksize]))
-
-       yield rows if block_given?
-       (rows == {} ? 0 : rows[:data].size)
-     end
-
-     def fetch_data_to_database(params)
-       encoded_data = params[:encoded_data]
-
-       rows = parse_encoded_data(encoded_data, params[:checksum])
-
-       import_rows(rows)
-       (rows == {} ? 0 : rows[:data].size)
-     end
-
-     def self.parse_json(json)
-       hash = JSON.parse(json).symbolize_keys
-       hash[:state].symbolize_keys! if hash.has_key?(:state)
-       hash
-     end
-
-     def parse_encoded_data(encoded_data, checksum)
-       raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
-
-       begin
-         return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
-       rescue Object => e
-         unless ENV['NO_DUMP_MARSHAL_ERRORS']
-           puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
-           File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
-         end
-         raise e
-       end
-     end
-
-     def import_rows(rows)
-       columns = rows[:header]
-       data = rows[:data]
-
-       # Only import existing columns
-       if table.columns.size != columns.size
-         existing_columns = table.columns.map(&:to_s)
-         additional_columns = columns - existing_columns
-         additional_columns_idxs = additional_columns.map { |c| columns.index(c) }
-         additional_columns_idxs.reverse.each do |idx|
-           columns.delete_at(idx)
-           rows[:types].delete_at(idx)
-         end
-         data.each_index { |didx| additional_columns_idxs.reverse.each { |idx| data[didx].delete_at(idx) } }
-       end
-
-       # Decode blobs
-       if rows.has_key?(:types) && rows[:types].include?("blob")
-         blob_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == "blob" }
-         data.each_index do |idx|
-           blob_indices.each do |bi|
-             data[idx][bi] = Sequel::SQL::Blob.new(Tapsoob::Utils.base64decode(data[idx][bi])) unless data[idx][bi].nil?
-           end
-         end
-       end
-
-       # Parse date/datetime/time columns
-       if rows.has_key?(:types)
-         %w(date datetime time).each do |type|
-           if rows[:types].include?(type)
-             type_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == type }
-             data.each_index do |idx|
-               type_indices.each do |ti|
-                 data[idx][ti] = Sequel.send("string_to_#{type}".to_sym, data[idx][ti]) unless data[idx][ti].nil?
-               end
-             end
-           end
-         end
-       end
-
-       # Remove id column
-       if @options[:"discard-identity"] && rows[:header].include?("id")
-         columns = rows[:header] - ["id"]
-         data = data.map { |d| d[1..-1] }
-       end
-
-       table.import(columns, data, :commit_every => 100)
-     rescue Exception => ex
-       case ex.message
-       when /integer out of range/ then
-         raise Tapsoob::InvalidData, <<-ERROR, []
-           \nDetected integer data that exceeds the maximum allowable size for an integer type.
-           This generally occurs when importing from SQLite due to the fact that SQLite does
-           not enforce maximum values on integer types.
-         ERROR
-       else raise ex
-       end
-     end
-
-     def verify_stream
-       state[:offset] = table.count
-     end
-
-     def self.factory(db, state, opts)
-       if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
-         Sequel::MySQL.convert_invalid_date_time = :nil
-       end
-
-       if state.has_key?(:klass)
-         return eval(state[:klass]).new(db, state, opts)
-       end
-
-       if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
-         DataStreamKeyed.new(db, state, opts)
-       else
-         DataStream.new(db, state, opts)
-       end
-     end
-   end
-
-   class DataStreamKeyed < DataStream
-     attr_accessor :buffer
-
-     def initialize(db, state, opts = {})
-       super(db, state, opts)
-       @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
-       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
-       @buffer = []
-     end
-
-     def primary_key
-       state[:primary_key].to_sym
-     end
-
-     def buffer_limit
-       if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
-         state[:last_fetched]
-       else
-         state[:filter]
-       end
-     end
-
-     def calc_limit(chunksize)
-       # we want to not fetch more than is needed while we're
-       # inside sinatra but locally we can select more than
-       # is strictly needed
-       if defined?(Sinatra)
-         (chunksize * 1.1).ceil
-       else
-         (chunksize * 3).ceil
-       end
-     end
-
-     def load_buffer(chunksize)
-       # make sure BasicObject is not polluted by subsequent requires
-       Sequel::BasicObject.remove_methods!
-
-       num = 0
-       loop do
-         limit = calc_limit(chunksize)
-         # we have to use local variables in order for the virtual row filter to work correctly
-         key = primary_key
-         buf_limit = buffer_limit
-         ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
-         log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
-         data = ds.all
-         self.buffer += data
-         num += data.size
-         if data.size > 0
-           # keep a record of the last primary key value in the buffer
-           state[:filter] = self.buffer.last[ primary_key ]
-         end
-
-         break if num >= chunksize or data.size == 0
-       end
-     end
-
-     def fetch_buffered(chunksize)
-       load_buffer(chunksize) if self.buffer.size < chunksize
-       rows = buffer.slice(0, chunksize)
-       state[:last_fetched] = if rows.size > 0
-         rows.last[ primary_key ]
-       else
-         nil
-       end
-       rows
-     end
-
-     #def import_rows(rows)
-     #  table.import(rows[:header], rows[:data])
-     #end
-
-     #def fetch_rows
-     #  chunksize = state[:chunksize]
-     #  Tapsoob::Utils.format_data(fetch_buffered(chunksize) || [],
-     #    :string_columns => string_columns)
-     #end
-
-     def increment(row_count)
-       # pop the rows we just successfully sent off the buffer
-       @buffer.slice!(0, row_count)
-     end
-
-     def verify_stream
-       key = primary_key
-       ds = table.order(*order_by)
-       current_filter = ds.max(key.sql_number)
-
-       # set the current filter to the max of the primary key
-       state[:filter] = current_filter
-       # clear out the last_fetched value so it can restart from scratch
-       state[:last_fetched] = nil
-
-       log.debug "DataStreamKeyed#verify_stream -> state: #{state.inspect}"
-     end
+   module DataStream
+     # Require all DataStream classes
+     require 'tapsoob/data_stream/base'
+     require 'tapsoob/data_stream/keyed'
+     require 'tapsoob/data_stream/keyed_partition'
+     require 'tapsoob/data_stream/interleaved'
+     require 'tapsoob/data_stream/file_partition'
    end
  end
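The final hunk reduces the tapsoob/data_stream entry point to a namespace: the old Tapsoob::DataStream class and its DataStreamKeyed subclass are removed, replaced by a Tapsoob::DataStream module that simply requires the split-out base, keyed, keyed_partition, interleaved and file_partition files. Callers that referenced the old constants would presumably migrate to the namespaced classes; a hedged sketch, assuming Base keeps the old DataStream constructor signature (base.rb is not shown in this diff) and using a hypothetical connection and table:

    # Hypothetical migration sketch -- assumes Base preserves the old DataStream API.
    require 'sequel'
    require 'tapsoob/data_stream'   # loads base, keyed, keyed_partition, interleaved, file_partition

    db = Sequel.connect('postgres://localhost/mydb')

    # 0.6.x (removed by this diff):
    #   Tapsoob::DataStream.new(db, :table_name => :users)
    #   Tapsoob::DataStreamKeyed.new(db, :table_name => :users)

    # 0.7.0 namespaced equivalents (Base behaviour assumed; Keyed is shown in the first hunk):
    plain = Tapsoob::DataStream::Base.new(db, :table_name => :users)
    keyed = Tapsoob::DataStream::Keyed.new(db, :table_name => :users)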