wyrm 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +22 -3
- data/README.md +12 -5
- data/lib/wyrm/db_pump.rb +102 -62
- data/lib/wyrm/dump_schema.rb +11 -32
- data/lib/wyrm/pump_maker.rb +2 -4
- data/lib/wyrm/restore_schema.rb +45 -15
- data/lib/wyrm/version.rb +1 -1
- data/snippets/console.rb +0 -2
- data/wyrm.gemspec +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6c0bb0fe99a301ead2da2ce8a64dc1eb20c925b0
+  data.tar.gz: 031b66ab01f20c5ebad94dbfa3c50338dbd15cba
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5feadc5c19a9df8417414cb91270ae55b8d114cf94fc9daa33b01b2f23292e858a109b0d703cfc7474952b963c26876d7e732f12c0802146cd5ef7838e803629
+  data.tar.gz: a73a7c30e43a430fb05d22552f5b0f10c81cdcd99ec818ff8f838f6ff1a6f59f3c9d71a9d8eacdd26ff4066f1140e82e0236db0e13c2421fba8b1800b7d35710
data/Gemfile
CHANGED
@@ -1,7 +1,26 @@
-
-#
+def from_gemrc
+  # auto-load from ~/.gemrc
+  home_gemrc = Pathname('~/.gemrc').expand_path
 
-
+  if home_gemrc.exist?
+    require 'yaml'
+    # use all the sources specified in .gemrc
+    YAML.load_file(home_gemrc)[:sources]
+  end
+end
+
+# Use the gemrc source if defined, unless CANON is set,
+# otherwise just use the default.
+def preferred_sources
+  rv = from_gemrc unless eval(ENV['CANON']||'')
+  rv ||= []
+  rv << 'http://rubygems.org' if rv.empty?
+  rv
+end
+
+preferred_sources.each{|src| source src}
+
+gem 'sequel'
 gem 'fastandand'
 
 # Specify your gem's dependencies in wyrm.gemspec
data/README.md
CHANGED
@@ -3,16 +3,23 @@
 Transfer data from one database to another. Has been used to dump > 100M dbs,
 and one 850G db. Should theoretically work for any dbs supported by Sequel.
 
+Dumps are compressed with bz2, using pbzip2. Fast *and* small :-D For example:
+mysqldump | bzip2 for a certain 850G db comes to 127G. With wyrm it
+comes to 134G.
+
 Currently transfers tables and views only. Does not attempt to transfer
 stored procs, permissions, triggers etc.
 
-
-
+Handles tables with a single numeric key, single non-numeric key, and no
+primary key. Haven't tried with compound primary key.
+
+Depending on table keys will use different strategies to keep memory usage small.
+Will use result set streaming if available.
 
 Wyrm because:
 
 - I like dragons
-- I can (eventually) have a Wyrm::Hole to transfer data through
+- I can (eventually) have a Wyrm::Hole to transfer data through ;-)
 
 ## Dependencies
 
@@ -37,7 +44,7 @@ Or install it yourself as:
 
 Make sure you install the db gems, typically
 
-    $ gem install pg sequel_pg mysql2
+    $ gem install pg sequel_pg mysql2 sqlite3
 
 ## Usage
 
@@ -77,7 +84,7 @@ require 'wyrm/db_pump'
 
 db = Sequel.connect 'postgres://postgres@localhost/other_db'
 dbp = DbPump.new db, :things
-dbp.
+dbp.io = IO.popen 'pbzip2 -d -c /mnt/disk/wyrm/things.dbp.bz2'
 dbp.each_row do |row|
   puts row.inspect
 end
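To see both halves of the workflow the README describes in one place, here is a minimal end-to-end sketch. It assumes the DumpSchema / RestoreSchema API exactly as documented in the file comments further down this diff (ds.dump_schema, ds.dump_tables, rs.create, rs.restore_tables); the connection strings, paths and require lines are illustrative placeholders, not part of the release.

    require 'sequel'
    require 'pathname'
    require 'wyrm/dump_schema.rb'     # path as used in data/snippets/console.rb
    require 'wyrm/restore_schema.rb'  # assumed to follow the same layout

    # dump side: write the schema migrations plus one <table>.dbp.bz2 per table
    src_db = Sequel.connect 'postgres://postgres@localhost/lots'
    ds = DumpSchema.new src_db, Pathname('/var/data/lots')
    ds.dump_schema
    ds.dump_tables

    # restore side: create the schema, then reload the table dumps
    dst_db = Sequel.connect 'postgres://postgres@localhost:5454/lots'
    rs = RestoreSchema.new dst_db, '/var/data/lots'
    rs.create
    rs.restore_tables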
data/lib/wyrm/db_pump.rb
CHANGED
@@ -1,20 +1,16 @@
 require 'sequel'
 require 'yaml'
-require 'ostruct'
 require 'logger'
-require 'fastandand'
 
 Sequel.extension :migration
 
-# TODO possibly use Gem::Package::TarWriter to write tar files
 # TODO when restoring, could use a SizeQueue to make sure the db is kept busy
-
 # TODO need to version the dumps, or something like that.
-# TODO
-#
+# TODO looks like io should belong to codec. Hmm. Not sure.
+# TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
 class DbPump
   # some codecs might ignore io, eg if a dbpump is talking to another dbpump
-  def initialize( db, table_name, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false )
+  def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false )
     self.codec = codec
     self.db = db
     self.table_name = table_name
@@ -42,14 +38,24 @@ class DbPump
 
   def db=( other_db )
     invalidate_cached_members
+
     @db = other_db
+    return unless other_db
+
+    # add extensions
     @db.extension :pagination
+
+    # turn on postgres streaming if available
+    if defined?( Sequel::Postgres ) && Sequel::Postgres.supports_streaming?
+      logger.info "Turn streaming on for postgres"
+      @db.extension :pg_streaming
+    end
   end
 
   # return an object that responds to ===
   # which returns true if ==='s parameter
   # responds to all the methods
-  def quacks_like( *methods )
+  def self.quacks_like( *methods )
     @quacks_like ||= {}
     @quacks_like[methods] ||= Object.new.tap do |obj|
       obj.define_singleton_method(:===) do |instance|
@@ -58,6 +64,10 @@ class DbPump
     end
   end
 
+  def quacks_like( *methods )
+    self.class.quacks_like( *methods )
+  end
+
   def codec=( codec_thing )
     @codec =
     case codec_thing
@@ -68,7 +78,7 @@ class DbPump
     when quacks_like( :encode, :decode )
       codec_thing
     else
-      raise "unknown codec #{codec_thing}"
+      raise "unknown codec #{codec_thing.inspect}"
     end
   end
 
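quacks_like, used in the case expression above, is a small duck-typing helper: it returns an object whose === is true when its argument responds to every named method, which is what lets it sit in a when clause. A self-contained sketch of the pattern follows; the body of the === block is an assumption (it lies outside the hunks shown here), and OpenStruct merely stands in for a real codec.

    require 'ostruct'

    # Illustrative stand-in for DbPump.quacks_like; the real === body is not shown in this diff.
    def quacks_like( *methods )
      Object.new.tap do |obj|
        obj.define_singleton_method(:===) do |instance|
          methods.all?{|m| instance.respond_to?(m)}
        end
      end
    end

    fake_codec = OpenStruct.new( encode: nil, decode: nil )
    case fake_codec
    when quacks_like( :encode, :decode ) then puts 'accepted as a codec'
    else puts 'rejected'
    end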
@@ -110,44 +120,75 @@ class DbPump
     @table_dataset ||= db[table_name.to_sym]
   end
 
-  #
-
-  # because mysql is useless
-  def paginated_dump
+  # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
+  def paginated_dump( &encode_block )
     table_dataset.order(*primary_keys).each_page(page_size) do |page|
       logger.info page.sql
-      page.each
-        unless dry_run?
-          codec.encode row.values, io
-        end
-      end
+      page.each &encode_block
     end
   end
 
-  #
+  # Use limit / offset, but not for all fields.
   # The idea is that large offsets are expensive in the db because the db server has to read
-  # through the data set to reach the required offset. So make that only ids
-  # do the main select from the limited id list.
+  # through the data set to reach the required offset. So make that only ids need to be read,
+  # and then do the main select from the limited id list.
   # TODO could speed this up by have a query thread which runs the next page-query while
   # the current one is being written/compressed.
   # select * from massive as full
   # inner join (select id from massive order by whatever limit m, n) limit
   # on full.id = limit.id
   # order by full.whatever
-
+  # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
+  def inner_dump( &encode_block )
     # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
     0.step(table_dataset.count, page_size).each do |offset|
       limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
       page = table_dataset.join( limit_dataset, Hash[ primary_keys.map{|f| [f,f]} ] ).order( *primary_keys ).qualify(table_name)
       logger.info page.sql
-      page.each
-
-
+      page.each &encode_block
+    end
+  end
+
+  # Selects pages by a range of ids, using >= and <.
+  # Use this for integer pks
+  def min_max_dump( &encode_block )
+    # select max(id), min(id) from table
+    # and then split that up into 10000 size chunks.
+    # Not really important if there aren't exactly 10000
+    min, max = table_dataset.select{[min(id), max(id)]}.first.values
+    return unless min && max
+
+    # will always include the last item because page_size will be
+    # bigger than max for the last page
+    (min..max).step(page_size).each do |offset|
+      page = table_dataset.where( id: offset...(offset + page_size) )
+      logger.info page.sql
+      page.each &encode_block
+    end
+  end
+
+  def stream_dump( &encode_block )
+    logger.info "using result set streaming"
+
+    # I want to output progress every page_size records,
+    # without doing a records_count % page_size every iteration.
+    # So define an external enumerator
+    # TODO should really performance test the options here.
+    records_count = 0
+    enum = table_dataset.stream.enum_for
+    loop do
+      begin
+        page_size.times do
+          encode_block.call enum.next
+          records_count += 1
         end
+      ensure
+        logger.info "#{records_count} from #{table_dataset.sql}"
       end
     end
   end
 
+  # Dump the serialization of the table to the specified io.
   # TODO need to also dump a first row containing useful stuff:
   # - source table name
   # - number of rows
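The loop / begin / ensure shape in stream_dump above works because Kernel#loop ends cleanly when enum.next raises StopIteration, while the ensure clause still runs for the final, partially filled page, so progress is reported once per page_size chunk without a per-row modulo check. A plain-Ruby sketch of just that pattern, with no wyrm or Sequel API involved:

    enum = (1..25).each    # stands in for table_dataset.stream.enum_for
    page_size = 10
    count = 0
    loop do
      begin
        page_size.times do
          enum.next          # raises StopIteration when the rows run out
          count += 1
        end
      ensure
        puts "#{count} rows so far"   # runs for full pages and for the last partial page
      end
    end
    # prints "10 rows so far", "20 rows so far", "25 rows so far", then the loop ends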
@@ -155,50 +196,50 @@ class DbPump
   # - permissions?
   # These should all be in one object that can be Marshall.load-ed easily.
   def dump
+    _dump do |row|
+      codec.encode( row.values, io ) unless dry_run?
+    end
+  ensure
+    io.flush
+  end
+
+  # decide which kind of paged iteration will be best for this table.
+  # Return an iterator, or yield row hashes to the block
+  def _dump( &encode_block )
+    return enum_for(__method__) unless block_given?
     case
+    when table_dataset.respond_to?( :stream )
+      stream_dump &encode_block
     when primary_keys.empty?
-      paginated_dump
+      paginated_dump &encode_block
     when primary_keys.all?{|i| i == :id }
-      min_max_dump
+      min_max_dump &encode_block
     else
-      inner_dump
+      inner_dump &encode_block
     end
-    io.flush
   end
 
-
-
-
-
-
-
-    # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
-    # TODO definitely need to refactor this
-
-    # will always include the last item because
-    (min..max).step(page_size).each do |offset|
-      page = table_dataset.where( id: offset...(offset + page_size) )
-      logger.info page.sql
-      page.each do |row|
-        unless dry_run?
-          codec.encode row.values, io
-        end
-      end
-    end
+  def dump_matches_columns?( row_enum, columns )
+    raise "schema mismatch" unless row_enum.peek.size == columns.size
+    true
+  rescue StopIteration
+    # peek threw a StopIteration, so there's no data
+    false
   end
 
-  # TODO
+  # TODO don't generate the full insert, ie leave out the fields
+  # because we've already checked that the columns and the table
+  # match.
+  # TODO generate column names in insert, they might still work
+  # if columns have been added to the db, but not the dump.
   # start_row is zero-based
   def restore( start_row: 0, filename: 'io' )
     columns = table_dataset.columns
-    logger.info{ "inserting to #{table_name} #{columns.inspect}" }
-
-    # get the Enumerator
     row_enum = each_row
 
-
-    raise "schema mismatch" if row_enum.peek.size != columns.size
+    return unless dump_matches_columns?( row_enum, columns )
 
+    logger.info{ "inserting to #{table_name} #{columns.inspect}" }
     rows_restored = 0
 
     if start_row != 0
@@ -217,7 +258,10 @@ class DbPump
     db.transaction do
       begin
         page_size.times do
-          # This skips all the checks in the Sequel code
+          # This skips all the checks in the Sequel code. Basically we want
+          # to generate the
+          # insert into (field1,field2) values (value1,value2)
+          # statement as quickly as possible.
           sql = table_dataset.clone( columns: columns, values: row_enum.next ).send( :clause_sql, :insert )
           db.execute sql unless dry_run?
           rows_restored += 1
@@ -235,18 +279,14 @@ class DbPump
     rows_restored
   end
 
-  #
-
-    io.andand.close if io != STDOUT && !io.andand.closed?
-    self.io = IO.popen( "pbzip2 -d -c #{filename}" )
-  end
-
-  # enumerate through the given io at its current position
+  # Enumerate through the given io at its current position
+  # TODO don't check for io.eof here, leave that to the codec
   def each_row
     return enum_for(__method__) unless block_given?
     yield codec.decode( io ) until io.eof?
   end
 
+  # Enumerate sql insert statements from the dump
   def insert_sql_each
     return enum_for(__method__) unless block_given?
     each_row do |row|
data/lib/wyrm/dump_schema.rb
CHANGED
@@ -6,6 +6,7 @@ require 'wyrm/pump_maker'
 # ds = DumpSchema.new src_db, Pathname('/var/data/lots')
 # ds.dump_schema
 # ds.dump_tables
+# TODO possibly use Gem::Package::TarWriter to write tar files
 class DumpSchema
   include PumpMaker
 
@@ -31,27 +32,6 @@ class DumpSchema
     @fk_migration ||= src_db.dump_foreign_key_migration(:same_db => same_db)
   end
 
-  def restore_migration
-    <<-EOF
-      require 'restore_migration'
-      Sequel.migration do
-        def db_pump
-        end
-
-        up do
-          restore_tables
-        end
-
-        down do
-          # from each table clear table
-          each_table do |table_name|
-            db_pump.restore table_name, io: io, db: db
-          end
-        end
-      end
-    EOF
-  end
-
   def same_db
     false
   end
@@ -61,24 +41,22 @@ class DumpSchema
   end
 
   def dump_schema
-
-    io.write schema_migration
-  end
+    numbering = '000'
 
-    (container +
-      io.write
+    (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+      io.write schema_migration
     end
 
-    (container +
+    (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
       io.write index_migration
     end
 
-    (container +
+    (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
       io.write fk_migration
     end
   end
 
-  def
+  def write_through_bz2( pathname )
     fio = pathname.open('w')
     # open subprocess in read-write mode
     zio = IO.popen( "pbzip2 -z", 'r+' )
@@ -96,7 +74,8 @@ class DumpSchema
     # signal the copier thread to stop
     zio.close_write
     logger.debug 'finished dumping'
-
+
+    # wait for copier thread to finish
     copier.join
     logger.debug 'stream copy thread finished'
   ensure
@@ -104,7 +83,7 @@ class DumpSchema
     fio.close unless fio.closed?
   end
 
-  def dump_table( table_name )
+  def dump_table( table_name, &io_block )
     pump.table_name = table_name
     if pump.table_dataset.empty?
       logger.info "No records in #{table_name}"
@@ -114,7 +93,7 @@ class DumpSchema
     filename = container + "#{table_name}.dbp.bz2"
     logger.info "dumping #{table_name} to #{filename}"
 
-
+    write_through_bz2 filename do |zio|
       # generate the dump
       pump.io = zio
       pump.dump
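dump_table hands the pump an io produced by write_through_bz2, which pipes every byte through a pbzip2 subprocess while a second thread drains the compressed output into the destination file. A standalone sketch of that pattern follows; it assumes pbzip2 is on the PATH, and the copier thread body (not visible in the hunks above) is assumed to be an IO.copy_stream.

    require 'pathname'

    def write_through_bz2( pathname )
      fio = pathname.open('w')
      # open the compressor in read-write mode so we can feed it and read it back
      zio = IO.popen( 'pbzip2 -z', 'r+' )
      # drain compressed bytes into the destination file as pbzip2 produces them
      copier = Thread.new{ IO.copy_stream zio, fio }
      yield zio          # caller writes uncompressed data into the pipe
      zio.close_write    # signal end of input; pbzip2 flushes and exits
      copier.join        # wait for the copier thread to finish
    ensure
      zio.close if zio && !zio.closed?
      fio.close if fio && !fio.closed?
    end

    write_through_bz2 Pathname('/tmp/example.dbp.bz2') do |zio|
      zio.write "rows would be serialised into zio here\n"
    end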
data/lib/wyrm/pump_maker.rb
CHANGED
@@ -1,6 +1,6 @@
 require 'wyrm/db_pump'
 
-
+module PumpMaker
   def call_or_self( maybe_callable )
     if maybe_callable.respond_to? :call
       maybe_callable.call( self )
@@ -8,11 +8,9 @@ class Object
       maybe_callable
     end
   end
-end
 
-module PumpMaker
   def make_pump( db, pump_thing )
-    call_or_self(pump_thing) || DbPump.new( db
+    call_or_self(pump_thing) || DbPump.new( db: db )
   end
 
   def maybe_deebe( db_or_string )
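make_pump's call_or_self means the pump: option can be either a ready-made pump or a callable that receives the dumper/restorer and builds one. A hedged example of the callable form, adapted from data/snippets/console.rb to the keyword-argument DbPump signature introduced in this release; the sqlite URL and output path are placeholders.

    require 'sequel'
    require 'wyrm/dump_schema.rb'

    db = Sequel.connect 'sqlite://example.sqlite3'
    # the lambda receives the DumpSchema instance, so it can read src_db from it
    dumper = DumpSchema.new db, '/tmp/test',
      pump: ->(dump_schema){ DbPump.new db: dump_schema.src_db, codec: :yaml }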
data/lib/wyrm/restore_schema.rb
CHANGED
@@ -2,9 +2,9 @@ require 'logger'
 require 'wyrm/pump_maker'
 
 # Load a schema from a set of dump files (from DumpSchema)
-# and restore the table data
+# and restore the table data.
 # dst_db = Sequel.connect "postgres://localhost:5454/lots"
-# rs = RestoreSchema.new dst_db,
+# rs = RestoreSchema.new dst_db, '/var/data/lots'
 # rs.create
 # rs.restore_tables
 class RestoreSchema
@@ -14,23 +14,39 @@ class RestoreSchema
     @container = Pathname.new container
     @dst_db = maybe_deebe dst_db
     @pump = make_pump( @dst_db, pump )
-
-    load_migrations
   end
 
   attr_reader :pump
   attr_reader :dst_db
   attr_reader :container
-  attr_reader :schema_migration, :index_migration, :fk_migration
 
-
-
+  # sequel wants migrations numbered, but it's a bit of an annoyance for this.
+  def find_single( glob )
+    candidates =Pathname.glob container + glob
+    raise "too many #{candidates.inspect} for #{glob}" unless candidates.size == 1
+    candidates.first
+  end
+
+  def schema_migration
+    @schema_migration ||= find_single( '*schema.rb' ).read
+  end
+
+  def index_migration
+    @index_migration ||= find_single( '*indexes.rb' ).read
+  end
+
+  def fk_migration
+    @fk_migration ||= find_single( '*foreign_keys.rb' ).read
   end
 
-  def
-  @
-  @index_migration =
-  @
+  def reload_migrations
+    @fk_migration = nil
+    @index_migration = nil
+    @schema_migration = nil
+  end
+
+  def logger
+    @logger ||= Logger.new STDERR
   end
 
   # create indexes and foreign keys, and reset sequences
@@ -53,20 +69,34 @@ class RestoreSchema
     eval( schema_migration ).apply dst_db, :up
   end
 
-  # assume the table name is the base name of table_file
+  # assume the table name is the base name of table_file pathname
   def restore_table( table_file )
     logger.info "restoring from #{table_file}"
     pump.table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
     # TODO check if table has been restored already, and has the correct rows,
-
-    IO.popen( "pbzip2 -d -c #{table_file}" ) do |io|
+    open_bz2 table_file do |io|
       pump.io = io
       pump.restore
     end
   end
 
+  # open a dbp.bz2 file and either yield or return an io of the uncompressed contents
+  def open_bz2( table_name, &block )
+    table_file =
+    case table_name
+    when Symbol
+      container + "#{table_name}.dbp.bz2"
+    when Pathname
+      table_name
+    else
+      raise "Don't know what to do with #{table_name.inspect}"
+    end
+
+    IO.popen "pbzip2 -d -c #{table_file}", &block
+  end
+
   def restore_tables
-    table_files = Pathname.glob
+    table_files = Pathname.glob container + '*.dbp.bz2'
     table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_table table_file}
   end
 end
data/lib/wyrm/version.rb
CHANGED
data/snippets/console.rb
CHANGED
@@ -3,8 +3,6 @@ require 'sqlite3'
 require 'pathname'
 require 'wyrm/dump_schema.rb'
 
-db = Sequel.connect 'sqlite:/home/panic/.qtstalker/new-trading.sqlite3'
-
 # pump = DbPump.new db, :positions, codec: :yaml
 dumper = DumpSchema.new db, '/tmp/test', pump: lambda{|_| DbPump.new db, nil, codec: :yaml}
 dumper = DumpSchema.new db, '/tmp/test', pump: ->(dump_schema){ DbPump.new dump_schema.src_db, nil, codec: :yaml}
data/wyrm.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
 
-  spec.add_runtime_dependency 'sequel'
+  spec.add_runtime_dependency 'sequel'
   spec.add_runtime_dependency "fastandand"
 
   spec.add_development_dependency "bundler", "~> 1.3"
metadata
CHANGED
@@ -1,29 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: wyrm
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - John Anderson
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-08-
+date: 2013-08-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sequel
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version:
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version:
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: fastandand
   requirement: !ruby/object:Gem::Requirement
|