wyrm 0.4.1 → 0.4.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: da207f92151b080d31039a364c1a2c50022f01ff
- data.tar.gz: ddf38f48b42597ed08671cb67fadd8457e425e07
+ metadata.gz: c40184e0e1f6175ad0447494ff5bf367c39292db
+ data.tar.gz: c7b927a63887f83ba35b6c3be3c11fb412a2212a
  SHA512:
- metadata.gz: 03e699a00d14fa7baacc286b886cf35074766b7b6b3b8e6e10fde08779ded7fda4930f9666bad95274bf773f8fe33f1916f3836414e98c188c81963b3a01459c
- data.tar.gz: 0052a0b096e62662223f9e4a9da2cfd79e2908f033f83a7a9201463aac3ec9f56407299b68e461b9b850f39ea8c3da5e53c48ad7510561d680b601a38ce739ca
+ metadata.gz: cd762e971e8fb35f4147b4657b5fbb67fb1de1ef26ec4d8ef7af2dac2a9f6532cf8bce4e02587021e261e302e133d6312caad46cf6e06924d3701a25dc8bb2a1
+ data.tar.gz: 7c38e0d0f186e78e58639220b21755b219e85ef15b3acbe8c920e145c70f1715702b4c4fd060abebad767732469296dd855af198349b27be97d70fd419060e47
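
The digests above cover the metadata.gz and data.tar.gz entries stored inside the .gem archive. A minimal verification sketch, assuming a locally downloaded wyrm-0.4.2.gem (filename and approach are illustrative, not part of this release):

    # Recompute the SHA512 digests listed in checksums.yaml for a local gem file.
    require 'rubygems/package'
    require 'digest'

    File.open('wyrm-0.4.2.gem', 'rb') do |gem_io|
      tar = Gem::Package::TarReader.new(gem_io)
      tar.each do |entry|
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
        puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
      end
    end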
@@ -1,4 +1,6 @@
  language: ruby
- rvm: []
- # - 2.3 not supported as of 16-Mar-2016. srsly, 3 months after release
+ rvm:
+ - 2.3.0
+ - 2.3.1
+ # - jruby-9.1.0.0 fails because of db drivers
  script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -1,32 +1,24 @@
- raise "You need >= ruby-2.3 (or maybe a Queue with close would work)" unless RUBY_VERSION >= '2.3.0'
+ source 'https://rubygems.org'

- # TODO this is for older versions of bundler
- def from_gemrc
- # auto-load from ~/.gemrc
- home_gemrc = Pathname('~/.gemrc').expand_path
-
- if home_gemrc.exist?
- require 'yaml'
- # use all the sources specified in .gemrc
- YAML.load_file(home_gemrc)[:sources]
- end
- end
-
- # Use the gemrc source if defined, unless CANON is set,
- # otherwise just use the default.
- def preferred_sources
- rv = from_gemrc unless eval(ENV['CANON']||'')
- rv ||= []
- rv << 'http://rubygems.org' if rv.empty?
- rv
- end
-
- preferred_sources.each{|src| source src}
+ raise "You need >= ruby-2.3 for wyrm" unless RUBY_VERSION >= '2.3.0'

  # Specify your gem's dependencies in wyrm.gemspec
  gemspec

- if Pathname('/usr/include/mysql').exist?
- # version is for mysql streaming result sets
- gem "mysql2", '>= 0.3.12'
+ platforms :ruby do
+ gem 'pg'
+ gem 'sequel_pg'
+ gem 'sqlite3'
+ gem 'pry-byebug'
+
+ if Pathname('/usr/include/mysql').exist?
+ # version is for mysql streaming result sets
+ gem "mysql2", '>= 0.3.12'
+ end
+ end
+
+ platforms :jruby do
+ # gem "pg"
+ gem 'jdbc-sqlite3'
+ gem 'jdbc-postgres'
  end
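
The new platform groups line up with the Sequel adapters each interpreter uses; roughly something like this (a sketch only, connection URLs are placeholders):

    require 'sequel'

    # pg/sqlite3 back Sequel's native adapters on MRI,
    # while the jdbc-* gems back the jdbc adapters on JRuby.
    db =
      if RUBY_ENGINE == 'jruby'
        Sequel.connect('jdbc:postgresql://localhost/mydb')
      else
        Sequel.connect('postgres://localhost/mydb')
      end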
@@ -1,3 +1,7 @@
+ == 0.4.2
+ * special case for jruby closing popen stream
+ * use modules better
+
  == 0.4.1
  * Improve docs and examples
  * make pbzip2 somewhat configurable.
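
The jruby item refers to the write_through_bz2 change further down in this diff; distilled, the shutdown difference looks roughly like this (a sketch, the command is illustrative):

    # On MRI, closing just the write end signals EOF to the child process;
    # on JRuby (observed around 9.1.x) an explicit flush and full close is used instead.
    zio = IO.popen('pbzip2 -c', 'r+')
    zio.write 'some data'

    if RUBY_ENGINE == 'jruby'
      zio.flush
      zio.close
    else
      zio.close_write
    end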
data/README.md CHANGED
@@ -30,6 +30,8 @@ Wyrm because:

  ## Dependencies

+ Ruby >= 2.3.0, for Queue#close
+
  You must have a working
  [pbzip2](http://compression.ca/pbzip2/ "Will use all your cores")
  on your path. If you really have to use something else,
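
The new Ruby floor is there because wyrm relies on Queue#close, which only arrived in Ruby 2.3. A minimal illustration of the semantics involved (not wyrm code):

    # Queue#close: once closed, pop drains the remaining items and then
    # returns nil, letting a consumer thread finish cleanly.
    queue = Queue.new

    consumer = Thread.new do
      while (item = queue.pop)
        puts "got #{item}"
      end
    end

    3.times { |i| queue.push i }
    queue.close
    consumer.join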
@@ -6,104 +6,129 @@ require 'wyrm/schema_tools'
  require 'wyrm/logger'

  # Dump a schema and compressed data from a db to a set of files
- # src_db = Sequel.connect "postgres://localhost:5454/lots"
- # ds = DumpSchema.new src_db, Pathname('/var/data/lots')
- # ds.call
+ #
+ # Dump["postgres://localhost:5454/lots", '/var/data/lots']
+ #
  # TODO possibly use Gem::Package::TarWriter to write tar files
- class Wyrm::Dump
- include Wyrm::PumpMaker
- include Wyrm::SchemaTools
- include Wyrm::Logger
-
- def initialize( src_db, container = nil, pump: nil )
- @container = Pathname.new container || '.'
- raise "#{@container} does not exist" unless @container.exist?
+ module Wyrm
+ class Dump
+ include Wyrm::PumpMaker
+ include Wyrm::SchemaTools
+ include Wyrm::Logger
+
+ def self.[]( *args )
+ new(*args).call
+ end

- @src_db = maybe_deebe src_db
- @pump = make_pump( @src_db, pump )
+ def call
+ dump_schema
+ dump_tables
+ dump_indexes
+ end

- @src_db.extension :schema_dumper
- end
+ def initialize( src_db, container = nil, pump: nil )
+ @container = Pathname.new container || '.'
+ raise "#{@container} does not exist" unless @container.exist?

- attr_reader :src_db, :container, :pump
+ @src_db = maybe_deebe src_db
+ @pump = make_pump( @src_db, pump )

- def same_db; false end
+ @src_db.extension :schema_dumper
+ end

- def numbering
- @numbering ||= '000'
- end
+ attr_reader :src_db, :container, :pump

- def dump_schema
- (container + "#{numbering.next!}_schema.rb").open('w') do |io|
- io.write schema_migration
- end
- end
+ def same_db; false end

- def dump_indexes
- (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
- io.write index_migration
+ def numbering
+ @numbering ||= '000'
  end

- (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
- io.write fk_migration
+ def dump_table_schemas( *tables )
+ (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+ tables.each do |table|
+ logger.debug "schema for #{table}"
+ io.puts table_migration table
+ end
+ end
  end
- end

- def write_through_bz2( pathname )
- fio = pathname.open('w')
- # open subprocess in read-write mode
- zio = IO.popen( STREAM_COMP, 'r+' )
- copier = Thread.new do
- begin
- IO.copy_stream zio, fio
- logger.debug "finished stream copy"
- ensure
- fio.close
+ def dump_schema
+ (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+ io.write schema_migration
  end
  end

- yield zio
+ def dump_indexes
+ (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
+ io.write index_migration
+ end

- # signal the copier thread to stop
- zio.close_write
- logger.debug 'finished dumping'
+ (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
+ io.write fk_migration
+ end
+ end

- # wait for copier thread to finish
- copier.join
- logger.debug 'stream copy thread finished'
- ensure
- zio.close unless zio.closed?
- fio.close unless fio.closed?
- end
+ def write_through_bz2( pathname )
+ fio = pathname.open('w')
+ # open subprocess in read-write mode
+ zio = IO.popen( STREAM_COMP, 'r+' )
+ copier = Thread.new do
+ begin
+ IO.copy_stream zio, fio
+ logger.debug "finished stream copy"
+ ensure
+ fio.close
+ end
+ end

- def dump_table( table_name, &io_block )
- pump.table_name = table_name
- if pump.table_dataset.empty?
- logger.info "No records in #{table_name}"
- return
+ # block receiving zio will write to it.
+ yield zio
+
+ # signal the copier thread to stop
+ logger.debug 'flushing'
+ if RUBY_ENGINE == 'jruby'
+ # seems to be required for jruby, at least 9.1.2.0
+ logger.debug 'jruby flushing'
+ zio.flush
+ logger.debug 'jruby close'
+ zio.close
+ else
+ zio.close_write
+ end
+ logger.debug 'finished dumping'
+
+ # wait for copier thread to finish
+ copier.join
+ logger.debug 'stream copy thread finished'
+ ensure
+ zio.close if zio && !zio.closed?
+ fio.close if fio && !fio.closed?
  end

- filename = container + "#{table_name}.dbp.bz2"
- logger.info "dumping #{table_name} to #{filename}"
+ def dump_table( table_name, &io_block )
+ pump.table_name = table_name
+ if pump.table_dataset.empty?
+ logger.info "No records in #{table_name}"
+ return
+ end

- write_through_bz2 filename do |zio|
- # generate the dump
- pump.io = zio
- pump.dump
- end
- rescue
- logger.error "failed dumping #{table_name}: #{$!.message}"
- end
+ filename = container + "#{table_name}.dbp.bz2"
+ logger.info "dumping #{table_name} to #{filename}"

- def dump_tables
- src_db.tables.each do |table_name|
- dump_table table_name
+ write_through_bz2 filename do |zio|
+ # generate the dump
+ pump.io = zio
+ pump.dump
+ end
+ rescue
+ logger.error "failed dumping #{table_name}: #{$!.message}"
  end
- end

- def call
- dump_schema
- dump_tables
- dump_indexes
+ def dump_tables
+ src_db.tables.each do |table_name|
+ dump_table table_name
+ end
+ end
  end
  end
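
As the updated comment shows, 0.4.2 adds a class-level shorthand via def self.[]. Both calls below should be equivalent (the connection URL, path and require line are illustrative):

    require 'wyrm/dump' # require path assumed

    # one-shot shorthand added in 0.4.2
    Wyrm::Dump['postgres://localhost:5454/lots', '/var/data/lots']

    # equivalent long form
    Wyrm::Dump.new('postgres://localhost:5454/lots', '/var/data/lots').call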
@@ -4,7 +4,7 @@ module Wyrm
  module Logger
  def logger
  @logger ||= ::Logger.new( STDERR ).tap do |lgr|
- lgr.level = ::Logger::INFO
+ lgr.level = ::Logger::DEBUG
  end
  end
  end
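
With the default level now DEBUG, callers who want quieter output can hand in their own logger; Pump accepts one at construction and via attr_writer :logger (db and table name below are placeholders):

    require 'logger'

    quiet = ::Logger.new(STDERR).tap { |lgr| lgr.level = ::Logger::INFO }
    pump = Wyrm::Pump.new(db: db, table_name: :things, logger: quiet)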
@@ -8,304 +8,306 @@ require 'wyrm/module'
  # TODO need to version the dumps, or something like that.
  # TODO looks like io should belong to codec. Hmm. Not sure.
  # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
- class Wyrm::Pump
- def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
- self.codec = codec
- self.db = db
- self.table_name = table_name
- self.io = io
- self.page_size = page_size
- self.dry_run = dry_run
- self.logger = logger
- yield self if block_given?
- end
+ module Wyrm
+ class Pump
+ def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
+ self.codec = codec
+ self.db = db
+ self.table_name = table_name
+ self.io = io
+ self.page_size = page_size
+ self.dry_run = dry_run
+ self.logger = logger
+ yield self if block_given?
+ end

- include Wyrm::Logger
- attr_writer :logger
+ include Wyrm::Logger
+ attr_writer :logger

- attr_accessor :io, :page_size, :dry_run
- def dry_run?; dry_run; end
+ attr_accessor :io, :page_size, :dry_run
+ def dry_run?; dry_run; end

- # These are affected by cached values
- attr_reader :db, :table_name
+ # These are affected by cached values
+ attr_reader :db, :table_name

- def invalidate_cached_members
- @primary_keys = nil
- @table_dataset = nil
- end
+ def invalidate_cached_members
+ @primary_keys = nil
+ @table_dataset = nil
+ end

- def table_name=( name_sym )
- invalidate_cached_members
- @table_name = name_sym
- end
+ def table_name=( name_sym )
+ invalidate_cached_members
+ @table_name = name_sym
+ end

- def db=( other_db )
- invalidate_cached_members
+ def db=( other_db )
+ invalidate_cached_members

- @db = other_db
- return unless other_db
+ @db = other_db
+ return unless other_db

- # add extensions
- @db.extension :pagination
+ # add extensions
+ @db.extension :pagination

- # turn on postgres streaming if available
- # also gets called for non-postgres dbs, but that seems to be fine.
- if defined?( Sequel::Postgres ) && @db.is_a?(Sequel::Postgres::Database) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
- @db.extension :pg_streaming
- logger.info "Streaming for #{@db.uri}"
- else
- logger.info "No streaming for #{@db.uri}"
+ # turn on postgres streaming if available
+ # also gets called for non-postgres dbs, but that seems to be fine.
+ if defined?( Sequel::Postgres::Database ) && @db.is_a?(Sequel::Postgres::Database) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+ @db.extension :pg_streaming
+ logger.info "Streaming for #{@db.uri}"
+ else
+ logger.info "No streaming for #{@db.uri}"
+ end
  end
- end

- # return an object that responds to ===
- # which returns true if ==='s parameter
- # responds to all the methods
- def self.quacks_like( *methods )
- @quacks_like ||= {}
- @quacks_like[methods] ||= lambda do |inst|
- methods.all?{|m| inst.respond_to? m}
+ # return an object that responds to ===
+ # which returns true if ==='s parameter
+ # responds to all the methods
+ def self.quacks_like( *methods )
+ @quacks_like ||= {}
+ @quacks_like[methods] ||= lambda do |inst|
+ methods.all?{|m| inst.respond_to? m}
+ end
  end
- end

- def quacks_like( *methods )
- self.class.quacks_like( *methods )
- end
+ def quacks_like( *methods )
+ self.class.quacks_like( *methods )
+ end

- def codec=( codec_thing )
- @codec =
- case codec_thing
- when :yaml; YamlCodec.new
- when :marshal; MarshalCodec.new
- when Class
- codec_thing.new
- when quacks_like(:encode,:decode)
- codec_thing
- else
- raise "unknown codec #{codec_thing.inspect}"
+ def codec=( codec_thing )
+ @codec =
+ case codec_thing
+ when :yaml; YamlCodec.new
+ when :marshal; MarshalCodec.new
+ when Class
+ codec_thing.new
+ when quacks_like(:encode,:decode)
+ codec_thing
+ else
+ raise "unknown codec #{codec_thing.inspect}"
+ end
  end
- end

- attr_reader :codec
+ attr_reader :codec

- class MarshalCodec
- def encode( obj, io )
- Marshal.dump obj, io
- end
+ class MarshalCodec
+ def encode( obj, io )
+ Marshal.dump obj, io
+ end

- def decode( io, &block )
- obj = Marshal.load(io)
- yield obj if block_given?
- obj
+ def decode( io, &block )
+ obj = Marshal.load(io)
+ yield obj if block_given?
+ obj
+ end
  end
- end

- class YamlCodec
- def encode( obj, io )
- YAML.dump obj, io
- end
+ class YamlCodec
+ def encode( obj, io )
+ YAML.dump obj, io
+ end

- def decode( io, &block )
- obj = YAML.load(io)
- yield obj if block_given?
- obj
+ def decode( io, &block )
+ obj = YAML.load(io)
+ yield obj if block_given?
+ obj
+ end
  end
- end

- def primary_keys
- # each_with_object([]){...} is only faster for < 3 items in 100000
- @primary_keys ||= db.schema(table_name).map{|name,column_info| name if column_info[:primary_key]}.compact
- end
+ def primary_keys
+ # each_with_object([]){...} is only faster for < 3 items in 100000
+ @primary_keys ||= db.schema(table_name).map{|name,column_info| name if column_info[:primary_key]}.compact
+ end

- def table_dataset
- @table_dataset ||= db[table_name.to_sym]
- end
+ def table_dataset
+ @table_dataset ||= db[table_name.to_sym]
+ end

- # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
- def paginated_dump( &encode_block )
- records_count = 0
- table_dataset.order(*primary_keys).each_page(page_size) do |page|
- logger.info "#{__method__} #{table_name} #{records_count}"
- logger.debug page.sql
- page.each &encode_block
- records_count += page_size
+ # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
+ def paginated_dump( &encode_block )
+ records_count = 0
+ table_dataset.order(*primary_keys).each_page(page_size) do |page|
+ logger.info "#{__method__} #{table_name} #{records_count}"
+ logger.debug page.sql
+ page.each &encode_block
+ records_count += page_size
+ end
  end
- end

- # Use limit / offset, but not for all fields.
- # The idea is that large offsets are expensive in the db because the db server has to read
- # through the data set to reach the required offset. So make that only ids need to be read,
- # and then do the main select from the limited id list.
- # select * from massive as full
- # inner join (select id from massive order by whatever limit m, n) limit
- # on full.id = limit.id
- # order by full.whatever
- # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
- def inner_dump( &encode_block )
- # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
- on_conditions = primary_keys.map{|f| [f,f]}.to_h
- (0..table_dataset.count).step(page_size).each do |offset|
- limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
- page = table_dataset.join( limit_dataset, on_conditions ).order( *primary_keys ).qualify(table_name)
- logger.info "#{__method__} #{table_name} #{offset}"
- logger.debug page.sql
- page.each &encode_block
+ # Use limit / offset, but not for all fields.
+ # The idea is that large offsets are expensive in the db because the db server has to read
+ # through the data set to reach the required offset. So make that only ids need to be read,
+ # and then do the main select from the limited id list.
+ # select * from massive as full
+ # inner join (select id from massive order by whatever limit m, n) limit
+ # on full.id = limit.id
+ # order by full.whatever
+ # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
+ def inner_dump( &encode_block )
+ # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
+ on_conditions = primary_keys.map{|f| [f,f]}.to_h
+ (0..table_dataset.count).step(page_size).each do |offset|
+ limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
+ page = table_dataset.join( limit_dataset, on_conditions ).order( *primary_keys ).qualify(table_name)
+ logger.info "#{__method__} #{table_name} #{offset}"
+ logger.debug page.sql
+ page.each &encode_block
+ end
  end
- end

- # Selects pages by a range of ids, using >= and <.
- # Use this for integer pks
- def min_max_dump( &encode_block )
- # select max(id), min(id) from table
- # and then split that up into 10000 size chunks.
- # Not really important if there aren't exactly 10000
- min, max = table_dataset.select{[min(id), max(id)]}.first.values
- return unless min && max
-
- # will always include the last item because page_size will be
- # bigger than max for the last page
- (min..max).step(page_size).each do |offset|
- page = table_dataset.where( id: offset...(offset + page_size) )
- logger.info "#{__method__} #{table_name} #{offset}"
- logger.debug page.sql
- page.each &encode_block
+ # Selects pages by a range of ids, using >= and <.
+ # Use this for integer pks
+ def min_max_dump( &encode_block )
+ # select max(id), min(id) from table
+ # and then split that up into 10000 size chunks.
+ # Not really important if there aren't exactly 10000
+ min, max = table_dataset.select{[min(id), max(id)]}.first.values
+ return unless min && max
+
+ # will always include the last item because page_size will be
+ # bigger than max for the last page
+ (min..max).step(page_size).each do |offset|
+ page = table_dataset.where( id: offset...(offset + page_size) )
+ logger.info "#{__method__} #{table_name} #{offset}"
+ logger.debug page.sql
+ page.each &encode_block
+ end
  end
- end

- def stream_dump( &encode_block )
- logger.info "using result set streaming"
-
- # I want to output progress every page_size records,
- # without doing a records_count % page_size every iteration.
- # So define an external enumerator
- # TODO should really performance test the options here.
- records_count = 0
- enum = table_dataset.stream.enum_for
- loop do
- begin
- page_size.times do
- encode_block.call enum.next
- records_count += 1
+ def stream_dump( &encode_block )
+ logger.info "using result set streaming"
+
+ # I want to output progress every page_size records,
+ # without doing a records_count % page_size every iteration.
+ # So define an external enumerator
+ # TODO should really performance test the options here.
+ records_count = 0
+ enum = table_dataset.stream.enum_for
+ loop do
+ begin
+ page_size.times do
+ encode_block.call enum.next
+ records_count += 1
+ end
+ ensure
+ logger.info "#{__method__} #{table_name} #{records_count}" if records_count < page_size
+ logger.debug " #{records_count} from #{table_dataset.sql}"
  end
- ensure
- logger.info "#{__method__} #{table_name} #{records_count}" if records_count < page_size
- logger.debug " #{records_count} from #{table_dataset.sql}"
  end
  end
- end

- # Dump the serialization of the table to the specified io.
- #
- # TODO need to also dump a first row containing useful stuff:
- # - source table name
- # - number of rows
- # - source db url
- # - permissions?
- # These should all be in one object that can be Marshall.load-ed easily.
- #
- # TODO could speed this up by have a query thread which runs the next page-query while
- # the current one is being written/compressed.
- def dump
- _dump do |row|
- codec.encode( row.values, io ) unless dry_run?
+ # Dump the serialization of the table to the specified io.
+ #
+ # TODO need to also dump a first row containing useful stuff:
+ # - source table name
+ # - number of rows
+ # - source db url
+ # - permissions?
+ # These should all be in one object that can be Marshall.load-ed easily.
+ #
+ # TODO could speed this up by have a query thread which runs the next page-query while
+ # the current one is being written/compressed.
+ def dump
+ _dump do |row|
+ codec.encode( row.values, io ) unless dry_run?
+ end
+ ensure
+ io.flush
  end
- ensure
- io.flush
- end

- # decide which kind of paged iteration will be best for this table.
- # Return an iterator, or yield row hashes to the block
- def _dump( &encode_block )
- return enum_for(__method__) unless block_given?
- case
- when table_dataset.respond_to?( :stream )
- stream_dump &encode_block
+ # decide which kind of paged iteration will be best for this table.
+ # Return an iterator, or yield row hashes to the block
+ def _dump( &encode_block )
+ return enum_for(__method__) unless block_given?
+ case
+ when table_dataset.respond_to?( :stream )
+ stream_dump &encode_block

- when primary_keys.empty?
- paginated_dump &encode_block
+ when primary_keys.empty?
+ paginated_dump &encode_block

- when primary_keys.all?{|i| i == :id }
- min_max_dump &encode_block
+ when primary_keys.all?{|i| i == :id }
+ min_max_dump &encode_block

- else
- inner_dump &encode_block
+ else
+ inner_dump &encode_block
+ end
  end
- end

- def dump_matches_columns?( row_enum, columns )
- raise "schema mismatch" unless row_enum.peek.size == columns.size
- true
- rescue StopIteration
- # peek threw a StopIteration, so there's no data
- false
- end
+ def dump_matches_columns?( row_enum, columns )
+ raise "schema mismatch" unless row_enum.peek.size == columns.size
+ true
+ rescue StopIteration
+ # peek threw a StopIteration, so there's no data
+ false
+ end

- # start_row is zero-based
- #
- # TODO don't generate the full insert, ie leave out the fields
- # because we've already checked that the columns and the table
- # match.
- # TODO generate column names in insert, they might still work
- # if columns have been added to the db, but not the dump.
- def restore( start_row: 0, filename: 'io' )
- columns = table_dataset.columns
- row_enum = each_row
-
- return unless dump_matches_columns?( row_enum, columns )
-
- logger.info "#{__method__} inserting to #{table_name} from #{start_row}"
- logger.debug " #{columns.inspect}"
- rows_restored = 0
-
- if start_row != 0
- logger.debug{ "skipping #{start_row} rows from #{filename}" }
- start_row.times do |i|
- row_enum.next
- logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ # start_row is zero-based
+ #
+ # TODO don't generate the full insert, ie leave out the fields
+ # because we've already checked that the columns and the table
+ # match.
+ # TODO generate column names in insert, they might still work
+ # if columns have been added to the db, but not the dump.
+ def restore( start_row: 0, filename: 'io' )
+ columns = table_dataset.columns
+ row_enum = each_row
+
+ return unless dump_matches_columns?( row_enum, columns )
+
+ logger.info "#{__method__} inserting to #{table_name} from #{start_row}"
+ logger.debug " #{columns.inspect}"
+ rows_restored = 0
+
+ if start_row != 0
+ logger.debug{ "skipping #{start_row} rows from #{filename}" }
+ start_row.times do |i|
+ row_enum.next
+ logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ end
+ logger.debug{ "skipped #{start_row} from #{filename}" }
+ rows_restored += start_row
  end
- logger.debug{ "skipped #{start_row} from #{filename}" }
- rows_restored += start_row
- end

- loop do
- db.transaction do
- begin
- page_size.times do
- # This skips all the checks in the Sequel code. Basically we want
- # to generate the
- # insert into (field1,field2) values (value1,value2)
- # statement as quickly as possible.
- #
- # Uses a private method so it will need to be updated repeatedly.
- sql = table_dataset.clone( columns: columns, values: row_enum.next ).send(:_insert_sql)
- db.execute sql unless dry_run?
- rows_restored += 1
+ loop do
+ db.transaction do
+ begin
+ page_size.times do
+ # This skips all the checks in the Sequel code. Basically we want
+ # to generate the
+ # insert into (field1,field2) values (value1,value2)
+ # statement as quickly as possible.
+ #
+ # Uses a private method so it will need to be updated repeatedly.
+ sql = table_dataset.clone( columns: columns, values: row_enum.next ).send(:_insert_sql)
+ db.execute sql unless dry_run?
+ rows_restored += 1
+ end
+ rescue StopIteration
+ # reached the end of the inout stream.
+ # So commit this transaction, and then re-raise
+ # StopIteration to get out of the loop{} statement
+ db.after_commit{ raise StopIteration }
  end
- rescue StopIteration
- # reached the end of the inout stream.
- # So commit this transaction, and then re-raise
- # StopIteration to get out of the loop{} statement
- db.after_commit{ raise StopIteration }
  end
  end
+ logger.info "#{__method__} #{table_name} done. Inserted #{rows_restored}."
+ rows_restored
  end
- logger.info "#{__method__} #{table_name} done. Inserted #{rows_restored}."
- rows_restored
- end

- # Enumerate through the given io at its current position.
- # Can raise StopIteration (ie when eof is not detected)
- # MAYBE don't check for io.eof here, leave that to the codec
- def each_row
- return enum_for(__method__) unless block_given?
- yield codec.decode( io ) until io.eof?
- end
+ # Enumerate through the given io at its current position.
+ # Can raise StopIteration (ie when eof is not detected)
+ # MAYBE don't check for io.eof here, leave that to the codec
+ def each_row
+ return enum_for(__method__) unless block_given?
+ yield codec.decode( io ) until io.eof?
+ end

- # Enumerate sql insert statements from the dump
- def insert_sql_each
- return enum_for(__method__) unless block_given?
- each_row do |row|
- yield table_dataset.insert_sql( row )
+ # Enumerate sql insert statements from the dump
+ def insert_sql_each
+ return enum_for(__method__) unless block_given?
+ each_row do |row|
+ yield table_dataset.insert_sql( row )
+ end
  end
  end
  end
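
Since codec= accepts anything that quacks like encode/decode, a custom codec can be dropped in without touching the built-ins; a rough sketch (JSON lines chosen arbitrarily, db and table name are placeholders):

    require 'json'

    # Any class (or instance responding to encode/decode) is accepted by Pump#codec=.
    class JsonLinesCodec
      def encode( obj, io )
        io.puts JSON.generate(obj)
      end

      def decode( io, &block )
        obj = JSON.parse(io.readline)
        yield obj if block_given?
        obj
      end
    end

    pump = Wyrm::Pump.new(db: db, table_name: :things, codec: JsonLinesCodec)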