wyrm 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: da207f92151b080d31039a364c1a2c50022f01ff
- data.tar.gz: ddf38f48b42597ed08671cb67fadd8457e425e07
+ metadata.gz: c40184e0e1f6175ad0447494ff5bf367c39292db
+ data.tar.gz: c7b927a63887f83ba35b6c3be3c11fb412a2212a
  SHA512:
- metadata.gz: 03e699a00d14fa7baacc286b886cf35074766b7b6b3b8e6e10fde08779ded7fda4930f9666bad95274bf773f8fe33f1916f3836414e98c188c81963b3a01459c
- data.tar.gz: 0052a0b096e62662223f9e4a9da2cfd79e2908f033f83a7a9201463aac3ec9f56407299b68e461b9b850f39ea8c3da5e53c48ad7510561d680b601a38ce739ca
+ metadata.gz: cd762e971e8fb35f4147b4657b5fbb67fb1de1ef26ec4d8ef7af2dac2a9f6532cf8bce4e02587021e261e302e133d6312caad46cf6e06924d3701a25dc8bb2a1
+ data.tar.gz: 7c38e0d0f186e78e58639220b21755b219e85ef15b3acbe8c920e145c70f1715702b4c4fd060abebad767732469296dd855af198349b27be97d70fd419060e47
@@ -1,4 +1,6 @@
  language: ruby
- rvm: []
- # - 2.3 not supported as of 16-Mar-2016. srsly, 3 months after release
+ rvm:
+ - 2.3.0
+ - 2.3.1
+ # - jruby-9.1.0.0 fails because of db drivers
  script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -1,32 +1,24 @@
- raise "You need >= ruby-2.3 (or maybe a Queue with close would work)" unless RUBY_VERSION >= '2.3.0'
+ source 'https://rubygems.org'

- # TODO this is for older versions of bundler
- def from_gemrc
- # auto-load from ~/.gemrc
- home_gemrc = Pathname('~/.gemrc').expand_path
-
- if home_gemrc.exist?
- require 'yaml'
- # use all the sources specified in .gemrc
- YAML.load_file(home_gemrc)[:sources]
- end
- end
-
- # Use the gemrc source if defined, unless CANON is set,
- # otherwise just use the default.
- def preferred_sources
- rv = from_gemrc unless eval(ENV['CANON']||'')
- rv ||= []
- rv << 'http://rubygems.org' if rv.empty?
- rv
- end
-
- preferred_sources.each{|src| source src}
+ raise "You need >= ruby-2.3 for wyrm" unless RUBY_VERSION >= '2.3.0'

  # Specify your gem's dependencies in wyrm.gemspec
  gemspec

- if Pathname('/usr/include/mysql').exist?
- # version is for mysql streaming result sets
- gem "mysql2", '>= 0.3.12'
+ platforms :ruby do
+ gem 'pg'
+ gem 'sequel_pg'
+ gem 'sqlite3'
+ gem 'pry-byebug'
+
+ if Pathname('/usr/include/mysql').exist?
+ # version is for mysql streaming result sets
+ gem "mysql2", '>= 0.3.12'
+ end
+ end
+
+ platforms :jruby do
+ # gem "pg"
+ gem 'jdbc-sqlite3'
+ gem 'jdbc-postgres'
  end
@@ -1,3 +1,7 @@
+ == 0.4.2
+ * special case for jruby closing popen stream
+ * use modules better
+
  == 0.4.1
  * Improve docs and examples
  * make pbzip2 somewhat configurable.
data/README.md CHANGED
@@ -30,6 +30,8 @@ Wyrm because:

  ## Dependencies

+ Ruby >= 2.3.0, for Queue#close
+
  You must have a working
  [pbzip2](http://compression.ca/pbzip2/ "Will use all your cores")
  on your path. If you really have to use something else,
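
For context on the new Ruby >= 2.3.0 requirement above: `Queue#close` lets a producer signal end-of-stream to a consumer thread without pushing a sentinel object. A minimal sketch of the pattern (illustrative only, not taken from the wyrm source):

```ruby
queue = Queue.new

consumer = Thread.new do
  # Once the queue is closed and drained, pop returns nil and the loop ends.
  while (row = queue.pop)
    # ... write row somewhere ...
  end
end

10.times { |i| queue.push(i) }
queue.close    # no more rows coming
consumer.join
```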
@@ -6,104 +6,129 @@ require 'wyrm/schema_tools'
  require 'wyrm/logger'

  # Dump a schema and compressed data from a db to a set of files
- # src_db = Sequel.connect "postgres://localhost:5454/lots"
- # ds = DumpSchema.new src_db, Pathname('/var/data/lots')
- # ds.call
+ #
+ # Dump["postgres://localhost:5454/lots", '/var/data/lots']
+ #
  # TODO possibly use Gem::Package::TarWriter to write tar files
- class Wyrm::Dump
- include Wyrm::PumpMaker
- include Wyrm::SchemaTools
- include Wyrm::Logger
-
- def initialize( src_db, container = nil, pump: nil )
- @container = Pathname.new container || '.'
- raise "#{@container} does not exist" unless @container.exist?
+ module Wyrm
+ class Dump
+ include Wyrm::PumpMaker
+ include Wyrm::SchemaTools
+ include Wyrm::Logger
+
+ def self.[]( *args )
+ new(*args).call
+ end

- @src_db = maybe_deebe src_db
- @pump = make_pump( @src_db, pump )
+ def call
+ dump_schema
+ dump_tables
+ dump_indexes
+ end

- @src_db.extension :schema_dumper
- end
+ def initialize( src_db, container = nil, pump: nil )
+ @container = Pathname.new container || '.'
+ raise "#{@container} does not exist" unless @container.exist?

- attr_reader :src_db, :container, :pump
+ @src_db = maybe_deebe src_db
+ @pump = make_pump( @src_db, pump )

- def same_db; false end
+ @src_db.extension :schema_dumper
+ end

- def numbering
- @numbering ||= '000'
- end
+ attr_reader :src_db, :container, :pump

- def dump_schema
- (container + "#{numbering.next!}_schema.rb").open('w') do |io|
- io.write schema_migration
- end
- end
+ def same_db; false end

- def dump_indexes
- (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
- io.write index_migration
+ def numbering
+ @numbering ||= '000'
  end

- (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
- io.write fk_migration
+ def dump_table_schemas( *tables )
+ (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+ tables.each do |table|
+ logger.debug "schema for #{table}"
+ io.puts table_migration table
+ end
+ end
  end
- end

- def write_through_bz2( pathname )
- fio = pathname.open('w')
- # open subprocess in read-write mode
- zio = IO.popen( STREAM_COMP, 'r+' )
- copier = Thread.new do
- begin
- IO.copy_stream zio, fio
- logger.debug "finished stream copy"
- ensure
- fio.close
+ def dump_schema
+ (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+ io.write schema_migration
  end
  end

- yield zio
+ def dump_indexes
+ (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
+ io.write index_migration
+ end

- # signal the copier thread to stop
- zio.close_write
- logger.debug 'finished dumping'
+ (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
+ io.write fk_migration
+ end
+ end

- # wait for copier thread to finish
- copier.join
- logger.debug 'stream copy thread finished'
- ensure
- zio.close unless zio.closed?
- fio.close unless fio.closed?
- end
+ def write_through_bz2( pathname )
+ fio = pathname.open('w')
+ # open subprocess in read-write mode
+ zio = IO.popen( STREAM_COMP, 'r+' )
+ copier = Thread.new do
+ begin
+ IO.copy_stream zio, fio
+ logger.debug "finished stream copy"
+ ensure
+ fio.close
+ end
+ end

- def dump_table( table_name, &io_block )
- pump.table_name = table_name
- if pump.table_dataset.empty?
- logger.info "No records in #{table_name}"
- return
+ # block receiving zio will write to it.
+ yield zio
+
+ # signal the copier thread to stop
+ logger.debug 'flushing'
+ if RUBY_ENGINE == 'jruby'
+ # seems to be required for jruby, at least 9.1.2.0
+ logger.debug 'jruby flushing'
+ zio.flush
+ logger.debug 'jruby close'
+ zio.close
+ else
+ zio.close_write
+ end
+ logger.debug 'finished dumping'
+
+ # wait for copier thread to finish
+ copier.join
+ logger.debug 'stream copy thread finished'
+ ensure
+ zio.close if zio && !zio.closed?
+ fio.close if fio && !fio.closed?
  end

- filename = container + "#{table_name}.dbp.bz2"
- logger.info "dumping #{table_name} to #{filename}"
+ def dump_table( table_name, &io_block )
+ pump.table_name = table_name
+ if pump.table_dataset.empty?
+ logger.info "No records in #{table_name}"
+ return
+ end

- write_through_bz2 filename do |zio|
- # generate the dump
- pump.io = zio
- pump.dump
- end
- rescue
- logger.error "failed dumping #{table_name}: #{$!.message}"
- end
+ filename = container + "#{table_name}.dbp.bz2"
+ logger.info "dumping #{table_name} to #{filename}"

- def dump_tables
- src_db.tables.each do |table_name|
- dump_table table_name
+ write_through_bz2 filename do |zio|
+ # generate the dump
+ pump.io = zio
+ pump.dump
+ end
+ rescue
+ logger.error "failed dumping #{table_name}: #{$!.message}"
  end
- end

- def call
- dump_schema
- dump_tables
- dump_indexes
+ def dump_tables
+ src_db.tables.each do |table_name|
+ dump_table table_name
+ end
+ end
  end
  end
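
For reference, the `Dump.[]` convenience method added above wraps `new(*args).call`, so a whole dump (schema, compressed table data, then indexes and foreign keys) can be kicked off in one line. A usage sketch based on the doc comment in the diff (the require path is an assumption):

```ruby
# require path assumed; adjust to however the gem is loaded in your setup
require 'wyrm/dump'

# Equivalent to Wyrm::Dump.new(src, dest).call, which runs
# dump_schema, dump_tables and dump_indexes in order.
Wyrm::Dump['postgres://localhost:5454/lots', '/var/data/lots']
```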
@@ -4,7 +4,7 @@ module Wyrm
  module Logger
  def logger
  @logger ||= ::Logger.new( STDERR ).tap do |lgr|
- lgr.level = ::Logger::INFO
+ lgr.level = ::Logger::DEBUG
  end
  end
  end
@@ -8,304 +8,306 @@ require 'wyrm/module'
  # TODO need to version the dumps, or something like that.
  # TODO looks like io should belong to codec. Hmm. Not sure.
  # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
- class Wyrm::Pump
- def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
- self.codec = codec
- self.db = db
- self.table_name = table_name
- self.io = io
- self.page_size = page_size
- self.dry_run = dry_run
- self.logger = logger
- yield self if block_given?
- end
+ module Wyrm
+ class Pump
+ def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
+ self.codec = codec
+ self.db = db
+ self.table_name = table_name
+ self.io = io
+ self.page_size = page_size
+ self.dry_run = dry_run
+ self.logger = logger
+ yield self if block_given?
+ end

- include Wyrm::Logger
- attr_writer :logger
+ include Wyrm::Logger
+ attr_writer :logger

- attr_accessor :io, :page_size, :dry_run
- def dry_run?; dry_run; end
+ attr_accessor :io, :page_size, :dry_run
+ def dry_run?; dry_run; end

- # These are affected by cached values
- attr_reader :db, :table_name
+ # These are affected by cached values
+ attr_reader :db, :table_name

- def invalidate_cached_members
- @primary_keys = nil
- @table_dataset = nil
- end
+ def invalidate_cached_members
+ @primary_keys = nil
+ @table_dataset = nil
+ end

- def table_name=( name_sym )
- invalidate_cached_members
- @table_name = name_sym
- end
+ def table_name=( name_sym )
+ invalidate_cached_members
+ @table_name = name_sym
+ end

- def db=( other_db )
- invalidate_cached_members
+ def db=( other_db )
+ invalidate_cached_members

- @db = other_db
- return unless other_db
+ @db = other_db
+ return unless other_db

- # add extensions
- @db.extension :pagination
+ # add extensions
+ @db.extension :pagination

- # turn on postgres streaming if available
- # also gets called for non-postgres dbs, but that seems to be fine.
- if defined?( Sequel::Postgres ) && @db.is_a?(Sequel::Postgres::Database) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
- @db.extension :pg_streaming
- logger.info "Streaming for #{@db.uri}"
- else
- logger.info "No streaming for #{@db.uri}"
+ # turn on postgres streaming if available
+ # also gets called for non-postgres dbs, but that seems to be fine.
+ if defined?( Sequel::Postgres::Database ) && @db.is_a?(Sequel::Postgres::Database) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+ @db.extension :pg_streaming
+ logger.info "Streaming for #{@db.uri}"
+ else
+ logger.info "No streaming for #{@db.uri}"
+ end
  end
- end

- # return an object that responds to ===
- # which returns true if ==='s parameter
- # responds to all the methods
- def self.quacks_like( *methods )
- @quacks_like ||= {}
- @quacks_like[methods] ||= lambda do |inst|
- methods.all?{|m| inst.respond_to? m}
+ # return an object that responds to ===
+ # which returns true if ==='s parameter
+ # responds to all the methods
+ def self.quacks_like( *methods )
+ @quacks_like ||= {}
+ @quacks_like[methods] ||= lambda do |inst|
+ methods.all?{|m| inst.respond_to? m}
+ end
  end
- end

- def quacks_like( *methods )
- self.class.quacks_like( *methods )
- end
+ def quacks_like( *methods )
+ self.class.quacks_like( *methods )
+ end

- def codec=( codec_thing )
- @codec =
- case codec_thing
- when :yaml; YamlCodec.new
- when :marshal; MarshalCodec.new
- when Class
- codec_thing.new
- when quacks_like(:encode,:decode)
- codec_thing
- else
- raise "unknown codec #{codec_thing.inspect}"
+ def codec=( codec_thing )
+ @codec =
+ case codec_thing
+ when :yaml; YamlCodec.new
+ when :marshal; MarshalCodec.new
+ when Class
+ codec_thing.new
+ when quacks_like(:encode,:decode)
+ codec_thing
+ else
+ raise "unknown codec #{codec_thing.inspect}"
+ end
  end
- end

- attr_reader :codec
+ attr_reader :codec

- class MarshalCodec
- def encode( obj, io )
- Marshal.dump obj, io
- end
+ class MarshalCodec
+ def encode( obj, io )
+ Marshal.dump obj, io
+ end

- def decode( io, &block )
- obj = Marshal.load(io)
- yield obj if block_given?
- obj
+ def decode( io, &block )
+ obj = Marshal.load(io)
+ yield obj if block_given?
+ obj
+ end
  end
- end

- class YamlCodec
- def encode( obj, io )
- YAML.dump obj, io
- end
+ class YamlCodec
+ def encode( obj, io )
+ YAML.dump obj, io
+ end

- def decode( io, &block )
- obj = YAML.load(io)
- yield obj if block_given?
- obj
+ def decode( io, &block )
+ obj = YAML.load(io)
+ yield obj if block_given?
+ obj
+ end
  end
- end

- def primary_keys
- # each_with_object([]){...} is only faster for < 3 items in 100000
- @primary_keys ||= db.schema(table_name).map{|name,column_info| name if column_info[:primary_key]}.compact
- end
+ def primary_keys
+ # each_with_object([]){...} is only faster for < 3 items in 100000
+ @primary_keys ||= db.schema(table_name).map{|name,column_info| name if column_info[:primary_key]}.compact
+ end

- def table_dataset
- @table_dataset ||= db[table_name.to_sym]
- end
+ def table_dataset
+ @table_dataset ||= db[table_name.to_sym]
+ end

- # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
- def paginated_dump( &encode_block )
- records_count = 0
- table_dataset.order(*primary_keys).each_page(page_size) do |page|
- logger.info "#{__method__} #{table_name} #{records_count}"
- logger.debug page.sql
- page.each &encode_block
- records_count += page_size
+ # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
+ def paginated_dump( &encode_block )
+ records_count = 0
+ table_dataset.order(*primary_keys).each_page(page_size) do |page|
+ logger.info "#{__method__} #{table_name} #{records_count}"
+ logger.debug page.sql
+ page.each &encode_block
+ records_count += page_size
+ end
  end
- end

- # Use limit / offset, but not for all fields.
- # The idea is that large offsets are expensive in the db because the db server has to read
- # through the data set to reach the required offset. So make that only ids need to be read,
- # and then do the main select from the limited id list.
- # select * from massive as full
- # inner join (select id from massive order by whatever limit m, n) limit
- # on full.id = limit.id
- # order by full.whatever
- # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
- def inner_dump( &encode_block )
- # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
- on_conditions = primary_keys.map{|f| [f,f]}.to_h
- (0..table_dataset.count).step(page_size).each do |offset|
- limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
- page = table_dataset.join( limit_dataset, on_conditions ).order( *primary_keys ).qualify(table_name)
- logger.info "#{__method__} #{table_name} #{offset}"
- logger.debug page.sql
- page.each &encode_block
+ # Use limit / offset, but not for all fields.
+ # The idea is that large offsets are expensive in the db because the db server has to read
+ # through the data set to reach the required offset. So make that only ids need to be read,
+ # and then do the main select from the limited id list.
+ # select * from massive as full
+ # inner join (select id from massive order by whatever limit m, n) limit
+ # on full.id = limit.id
+ # order by full.whatever
+ # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
+ def inner_dump( &encode_block )
+ # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
+ on_conditions = primary_keys.map{|f| [f,f]}.to_h
+ (0..table_dataset.count).step(page_size).each do |offset|
+ limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
+ page = table_dataset.join( limit_dataset, on_conditions ).order( *primary_keys ).qualify(table_name)
+ logger.info "#{__method__} #{table_name} #{offset}"
+ logger.debug page.sql
+ page.each &encode_block
+ end
  end
- end

- # Selects pages by a range of ids, using >= and <.
- # Use this for integer pks
- def min_max_dump( &encode_block )
- # select max(id), min(id) from table
- # and then split that up into 10000 size chunks.
- # Not really important if there aren't exactly 10000
- min, max = table_dataset.select{[min(id), max(id)]}.first.values
- return unless min && max
-
- # will always include the last item because page_size will be
- # bigger than max for the last page
- (min..max).step(page_size).each do |offset|
- page = table_dataset.where( id: offset...(offset + page_size) )
- logger.info "#{__method__} #{table_name} #{offset}"
- logger.debug page.sql
- page.each &encode_block
+ # Selects pages by a range of ids, using >= and <.
+ # Use this for integer pks
+ def min_max_dump( &encode_block )
+ # select max(id), min(id) from table
+ # and then split that up into 10000 size chunks.
+ # Not really important if there aren't exactly 10000
+ min, max = table_dataset.select{[min(id), max(id)]}.first.values
+ return unless min && max
+
+ # will always include the last item because page_size will be
+ # bigger than max for the last page
+ (min..max).step(page_size).each do |offset|
+ page = table_dataset.where( id: offset...(offset + page_size) )
+ logger.info "#{__method__} #{table_name} #{offset}"
+ logger.debug page.sql
+ page.each &encode_block
+ end
  end
- end

- def stream_dump( &encode_block )
- logger.info "using result set streaming"
-
- # I want to output progress every page_size records,
- # without doing a records_count % page_size every iteration.
- # So define an external enumerator
- # TODO should really performance test the options here.
- records_count = 0
- enum = table_dataset.stream.enum_for
- loop do
- begin
- page_size.times do
- encode_block.call enum.next
- records_count += 1
+ def stream_dump( &encode_block )
+ logger.info "using result set streaming"
+
+ # I want to output progress every page_size records,
+ # without doing a records_count % page_size every iteration.
+ # So define an external enumerator
+ # TODO should really performance test the options here.
+ records_count = 0
+ enum = table_dataset.stream.enum_for
+ loop do
+ begin
+ page_size.times do
+ encode_block.call enum.next
+ records_count += 1
+ end
+ ensure
+ logger.info "#{__method__} #{table_name} #{records_count}" if records_count < page_size
+ logger.debug " #{records_count} from #{table_dataset.sql}"
  end
- ensure
- logger.info "#{__method__} #{table_name} #{records_count}" if records_count < page_size
- logger.debug " #{records_count} from #{table_dataset.sql}"
  end
  end
- end

- # Dump the serialization of the table to the specified io.
- #
- # TODO need to also dump a first row containing useful stuff:
- # - source table name
- # - number of rows
- # - source db url
- # - permissions?
- # These should all be in one object that can be Marshall.load-ed easily.
- #
- # TODO could speed this up by have a query thread which runs the next page-query while
- # the current one is being written/compressed.
- def dump
- _dump do |row|
- codec.encode( row.values, io ) unless dry_run?
+ # Dump the serialization of the table to the specified io.
+ #
+ # TODO need to also dump a first row containing useful stuff:
+ # - source table name
+ # - number of rows
+ # - source db url
+ # - permissions?
+ # These should all be in one object that can be Marshall.load-ed easily.
+ #
+ # TODO could speed this up by have a query thread which runs the next page-query while
+ # the current one is being written/compressed.
+ def dump
+ _dump do |row|
+ codec.encode( row.values, io ) unless dry_run?
+ end
+ ensure
+ io.flush
  end
- ensure
- io.flush
- end

- # decide which kind of paged iteration will be best for this table.
- # Return an iterator, or yield row hashes to the block
- def _dump( &encode_block )
- return enum_for(__method__) unless block_given?
- case
- when table_dataset.respond_to?( :stream )
- stream_dump &encode_block
+ # decide which kind of paged iteration will be best for this table.
+ # Return an iterator, or yield row hashes to the block
+ def _dump( &encode_block )
+ return enum_for(__method__) unless block_given?
+ case
+ when table_dataset.respond_to?( :stream )
+ stream_dump &encode_block

- when primary_keys.empty?
- paginated_dump &encode_block
+ when primary_keys.empty?
+ paginated_dump &encode_block

- when primary_keys.all?{|i| i == :id }
- min_max_dump &encode_block
+ when primary_keys.all?{|i| i == :id }
+ min_max_dump &encode_block

- else
- inner_dump &encode_block
+ else
+ inner_dump &encode_block
+ end
  end
- end

- def dump_matches_columns?( row_enum, columns )
- raise "schema mismatch" unless row_enum.peek.size == columns.size
- true
- rescue StopIteration
- # peek threw a StopIteration, so there's no data
- false
- end
+ def dump_matches_columns?( row_enum, columns )
+ raise "schema mismatch" unless row_enum.peek.size == columns.size
+ true
+ rescue StopIteration
+ # peek threw a StopIteration, so there's no data
+ false
+ end

- # start_row is zero-based
- #
- # TODO don't generate the full insert, ie leave out the fields
- # because we've already checked that the columns and the table
- # match.
- # TODO generate column names in insert, they might still work
- # if columns have been added to the db, but not the dump.
- def restore( start_row: 0, filename: 'io' )
- columns = table_dataset.columns
- row_enum = each_row
-
- return unless dump_matches_columns?( row_enum, columns )
-
- logger.info "#{__method__} inserting to #{table_name} from #{start_row}"
- logger.debug " #{columns.inspect}"
- rows_restored = 0
-
- if start_row != 0
- logger.debug{ "skipping #{start_row} rows from #{filename}" }
- start_row.times do |i|
- row_enum.next
- logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ # start_row is zero-based
+ #
+ # TODO don't generate the full insert, ie leave out the fields
+ # because we've already checked that the columns and the table
+ # match.
+ # TODO generate column names in insert, they might still work
+ # if columns have been added to the db, but not the dump.
+ def restore( start_row: 0, filename: 'io' )
+ columns = table_dataset.columns
+ row_enum = each_row
+
+ return unless dump_matches_columns?( row_enum, columns )
+
+ logger.info "#{__method__} inserting to #{table_name} from #{start_row}"
+ logger.debug " #{columns.inspect}"
+ rows_restored = 0
+
+ if start_row != 0
+ logger.debug{ "skipping #{start_row} rows from #{filename}" }
+ start_row.times do |i|
+ row_enum.next
+ logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ end
+ logger.debug{ "skipped #{start_row} from #{filename}" }
+ rows_restored += start_row
  end
- logger.debug{ "skipped #{start_row} from #{filename}" }
- rows_restored += start_row
- end

- loop do
- db.transaction do
- begin
- page_size.times do
- # This skips all the checks in the Sequel code. Basically we want
- # to generate the
- # insert into (field1,field2) values (value1,value2)
- # statement as quickly as possible.
- #
- # Uses a private method so it will need to be updated repeatedly.
- sql = table_dataset.clone( columns: columns, values: row_enum.next ).send(:_insert_sql)
- db.execute sql unless dry_run?
- rows_restored += 1
+ loop do
+ db.transaction do
+ begin
+ page_size.times do
+ # This skips all the checks in the Sequel code. Basically we want
+ # to generate the
+ # insert into (field1,field2) values (value1,value2)
+ # statement as quickly as possible.
+ #
+ # Uses a private method so it will need to be updated repeatedly.
+ sql = table_dataset.clone( columns: columns, values: row_enum.next ).send(:_insert_sql)
+ db.execute sql unless dry_run?
+ rows_restored += 1
+ end
+ rescue StopIteration
+ # reached the end of the inout stream.
+ # So commit this transaction, and then re-raise
+ # StopIteration to get out of the loop{} statement
+ db.after_commit{ raise StopIteration }
  end
- rescue StopIteration
- # reached the end of the inout stream.
- # So commit this transaction, and then re-raise
- # StopIteration to get out of the loop{} statement
- db.after_commit{ raise StopIteration }
  end
  end
+ logger.info "#{__method__} #{table_name} done. Inserted #{rows_restored}."
+ rows_restored
  end
- logger.info "#{__method__} #{table_name} done. Inserted #{rows_restored}."
- rows_restored
- end

- # Enumerate through the given io at its current position.
- # Can raise StopIteration (ie when eof is not detected)
- # MAYBE don't check for io.eof here, leave that to the codec
- def each_row
- return enum_for(__method__) unless block_given?
- yield codec.decode( io ) until io.eof?
- end
+ # Enumerate through the given io at its current position.
+ # Can raise StopIteration (ie when eof is not detected)
+ # MAYBE don't check for io.eof here, leave that to the codec
+ def each_row
+ return enum_for(__method__) unless block_given?
+ yield codec.decode( io ) until io.eof?
+ end

- # Enumerate sql insert statements from the dump
- def insert_sql_each
- return enum_for(__method__) unless block_given?
- each_row do |row|
- yield table_dataset.insert_sql( row )
+ # Enumerate sql insert statements from the dump
+ def insert_sql_each
+ return enum_for(__method__) unless block_given?
+ each_row do |row|
+ yield table_dataset.insert_sql( row )
+ end
  end
  end
  end
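
As the `codec=` setter above shows, `Wyrm::Pump` accepts `:marshal`, `:yaml`, a codec class, or any object that `quacks_like(:encode, :decode)`. A hypothetical custom codec to illustrate the duck-typed interface (`JsonCodec` is not part of wyrm; `src_db` stands for a Sequel connection):

```ruby
require 'json'

# Hypothetical codec: anything responding to encode(obj, io) and
# decode(io) satisfies the quacks_like(:encode, :decode) check.
class JsonCodec
  def encode( obj, io )
    io.puts JSON.generate(obj)
  end

  def decode( io, &block )
    obj = JSON.parse(io.readline)
    yield obj if block_given?
    obj
  end
end

# Passing the Class hits the `when Class` branch; passing an instance
# would match the quacks_like(:encode, :decode) branch instead.
pump = Wyrm::Pump.new( db: src_db, table_name: :users, codec: JsonCodec )
```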