wyrm 0.1.4 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: d1446fe2b49cd863188938f3aebbccda3bb41f47
- data.tar.gz: 265fcd26522e424398b5c9584e8a15687d595431
+ metadata.gz: c432ee798bf7c5208a16696daa5741d64351721d
+ data.tar.gz: 51e22092611ef48f16d4757eb8c327f96e6847f5
  SHA512:
- metadata.gz: 8014ba8205ad7e2d1af85539291fc9e95d0af94bd1561a59368fdd8264246c7566fd5cd7dd45271fad0f41fc001d3dad811840922e03a5965e8cd9e2997514a7
- data.tar.gz: 1c450d1b9efd49717e81b67c03109b6670a3e6215cf60f3b0df38f8fb05e89dbc71e6d83527e87e8d3a038e92d93ca0a5b2ad2c7c66473a5434de072e88b4013
+ metadata.gz: b907a9adbd5b47ac9847a0aeaa95e5318eff307735947f8e4dfd4aa35e819f8ff873cbf53053202a4a944f2961fe6a3254f33bd4816753386580e2049c13e186
+ data.tar.gz: 65e512436c2991f2b9786c8e9e2587854b227e95bac2dcc1ec2f87b3d0840e786ac4e13a454197cdeb38a43eae948c9fbaa743b3c1f3b21d02714b398fd56894
data/Gemfile CHANGED
@@ -1,10 +1,8 @@
  source 'https://rubygems.org'
  # source 'file:///var/cache/rubygems'

- gem 'sequel'
+ gem 'sequel', '~> 4.0.0'
  gem 'fastandand'
- gem 'pry'
- gem 'pry-debundle'

  # Specify your gem's dependencies in wyrm.gemspec
  gemspec
data/README.md CHANGED
@@ -12,10 +12,17 @@ handle compound primary keys and tables without primary keys.
  Wyrm because:

  - I like dragons
- - I can have a Wyrm::Hole to transfer data through :-D
+ - I can (eventually) have a Wyrm::Hole to transfer data through :-D
+
+ ## Dependencies
+
+ You must have a working
+ [pbzip2](http://compression.ca/pbzip2/ "Will use all your cores")
+ on your path.

  ## Installation

+
  Add this line to your application's Gemfile:

  gem 'wyrm'
@@ -30,38 +37,52 @@ Or install it yourself as:

  Make sure you install the db gems, typically

- $ gem install pg mysql2
+ $ gem install pg sequel_pg mysql2

  ## Usage

- This is mostly a toolkit right now. To transfer from mysql to postgres do:
- ```ruby
- require 'sequel'
- require 'pathname'
+ ### CLI
+
+ Very basic cli at this point.
+
+ From the source db to the file system
+
+ $ wyrm mysql2://localhost/beeg_data_bays /tmp/lots_fs_space

- # on the source host
- # dump tables from mysql
- require 'wyrm/dump_schema'
- src_db = Sequel.connect "mysql2://localhost/lots"
- ds = DumpSchema.new src_db, Pathname('/tmp/lots')
- ds.dump_schema
+ Optionally transfer data. Already compressed, so no -z

- # this might take a while ;-)
- ds.dump_tables
+ $ rsync -var /tmp/lots_fs_space user@host:/tmp/lots_fs_space

- # transfer data. Already compressed, so no -z
- # rsync -var /tmp/lots user@host:/var/data/
+ On the destination host

- # on the destination host
- # restore tables to postgres
+ $ wyrm /tmp/lots_fs_space postgres://localhost/betta_dee_bee
+
+ ### irb / pry
+
+ For restoring. dump will be similar.
+
+ ``` ruby
  require 'wyrm/restore_schema'
- dst_db = Sequel.connect "postgres://localhost/lots"
- rs = RestoreSchema.new dst_db, Pathname('/var/data/lots')
+ rs = RestoreSchema.new 'postgres://postgres@localhost/your_db', '/mnt/disk/wyrm'
  rs.create
  rs.restore_tables
  rs.index
  ```

+ Or for the lower-level stuff
+
+ ``` ruby
+ require 'sequel'
+ require 'wyrm/db_pump'
+
+ db = Sequel.connect 'postgres://postgres@localhost/other_db'
+ dbp = DbPump.new db, :things
+ dbp.open_bz2 '/mnt/disk/wyrm/things.dbp.bz2'
+ dbp.each_row do |row|
+ puts row.inspect
+ end
+ ```
+
  ## Contributing

  1. Fork it
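The irb / pry example in the Usage section above only covers the restore side and notes that dumping "will be similar". For reference, a minimal sketch of the dump side, assuming the string-accepting DumpSchema constructor introduced in lib/wyrm/dump_schema.rb below; the connection URL and output directory are placeholders:

``` ruby
# Sketch only: dump-side counterpart of the restore example above.
# The db url and output directory are placeholders.
require 'wyrm/dump_schema'

ds = DumpSchema.new 'mysql2://localhost/beeg_data_bays', '/tmp/lots_fs_space'
ds.dump_schema   # writes the schema migration files
ds.dump_tables   # writes one <table>.dbp.bz2 per table, via pbzip2
```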
data/bin/wyrm ADDED
@@ -0,0 +1,36 @@
+ #! /usr/bin/env ruby
+
+ require 'pathname'
+ require 'uri'
+
+ def restore( db, directory )
+   require 'wyrm/restore_schema'
+   rs = RestoreSchema.new db, directory
+   rs.create
+   rs.restore_tables
+   rs.index
+ end
+
+ def dump( db, directory )
+   require 'wyrm/dump_schema'
+   ds = DumpSchema.new db, directory
+   ds.dump_schema
+   ds.dump_tables
+ end
+
+ if ARGV.empty?
+   puts "Provide source and destination"
+   puts "Either can be a sequel db string or a directory"
+ end
+
+ src, dst = ARGV.map{|arg| URI.parse arg}
+
+ if src.scheme && Pathname(dst.to_s).exist?
+   # src is a db path, so dump from it
+   dump( src.to_s, dst.to_s )
+ elsif dst.scheme && Pathname(src.to_s).exist?
+   # dst is a path and src is a url, so restore
+   restore( dst.to_s, src.to_s )
+ else
+   puts "Don't know how to handle #{src} -> #{dst}"
+ end
data/lib/wyrm/db_pump.rb CHANGED
@@ -10,12 +10,6 @@ Sequel.extension :migration
  # TODO when restoring, could use a SizeQueue to make sure the db is kept busy

  # TODO need to version the dumps, or something like that.
- # So the slowest-changing variables are the db, the io stream
- # and the page size.
- # table will change every call. Will IO stream change between
- # table changes? No. So a currying type approach will work.
- # Somebody must have done this before.
- # But table and io are often related (ie table going to one file)
  # TODO This really should be Wyrm::Hole. Or maybe Wyrm::Hole should
  # be the codec that connects two DbPumps, for direct transfer?
  class DbPump
@@ -31,32 +25,36 @@ class DbPump
  end

  attr_accessor :io, :page_size, :dry_run
+ def dry_run?; dry_run; end

  # These affect cached values
  attr_reader :db, :table_name

- def table_name=( name_sym )
+ def invalidate_cached_members
  @primary_keys = nil
  @table_dataset = nil
+ end
+
+ def table_name=( name_sym )
+ invalidate_cached_members
  @table_name = name_sym
  end

  def db=( other_db )
- @primary_keys = nil
- @table_dataset = nil
+ invalidate_cached_members
  @db = other_db
  @db.extension :pagination
  end

- def dry_run?; dry_run; end
-
- class RespondsTo
- def initialize( *methods )
- @methods = methods
- end
-
- def ===( instance )
- @methods.all?{|m| instance.respond_to? m}
+ # return an object that responds to ===
+ # which returns true if ==='s parameter
+ # responds to all the methods
+ def quacks_like( *methods )
+ @quacks_like ||= {}
+ @quacks_like[methods] ||= Object.new.tap do |obj|
+ obj.define_singleton_method(:===) do |instance|
+ methods.all?{|m| instance.respond_to? m}
+ end
  end
  end

@@ -66,18 +64,16 @@ class DbPump
  when :yaml; YamlCodec.new
  when :marshal; MarshalCodec.new
  when Class
- codec.new
- when RespondsTo.new( :encode, :decode )
- codec
+ codec_thing.new
+ when quacks_like( :encode, :decode )
+ codec_thing
  else
- raise "unknown codec #{codec}"
+ raise "unknown codec #{codec_thing}"
  end
  end

  attr_reader :codec

- # TODO could use msgpack as serialization here, but its API is unpleasant.
-
  class MarshalCodec
  def encode( obj, io )
  Marshal.dump obj, io
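The two hunks above replace the RespondsTo helper class with quacks_like, a memoized duck-type matcher: it returns an object whose === checks that its argument responds to the given methods, so it can sit directly in the case/when of codec=. A rough standalone illustration of that pattern, assuming a made-up MyCodec class that is not part of the gem:

``` ruby
# Illustration only: the duck-type === matcher used by codec=.
# MyCodec is a hypothetical class, not part of wyrm.
class MyCodec
  def encode( obj, io ); end
  def decode( io ); end
end

codec_matcher = Object.new.tap do |obj|
  obj.define_singleton_method(:===) do |instance|
    [:encode, :decode].all?{|m| instance.respond_to? m}
  end
end

case MyCodec.new
when codec_matcher then puts "quacks like a codec"
else puts "unknown codec"
end
# => quacks like a codec
```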
@@ -90,18 +86,6 @@ class DbPump
  end
  end

- class MsgPackCodec
- def encode( obj, io )
- Marshal.dump obj, io
- end
-
- def decode( io, &block )
- obj = Marshal.load(io)
- yield obj if block_given?
- obj
- end
- end
-
  class YamlCodec
  def encode( obj, io )
  YAML.dump obj, io
@@ -203,51 +187,70 @@ class DbPump
  end
  end

- # TODO possible memory issues here if the rows are big. May need to fork this.
- # TODO lazy evaluation
- def restore( start_row: 0 )
- logger.info "restoring #{table_name}"
- # destination db should be same structure as incoming data
- column_names = db.schema(table_name.to_sym).map( &:first )
- first = ->(row){raise "schema mismatch" if row.size != column_names.size}
+ # TODO lazy evaluation / streaming
+ # start_row is zero-based
+ def restore( start_row: 0, filename: 'io' )
+ columns = table_dataset.columns
+ logger.info{ "inserting to #{table_name} #{columns.inspect}" }
+
+ # get the Enumerator
+ row_enum = each_row
+
+ # check that columns match
+ raise "schema mismatch" if row_enum.peek.size != columns.size

  rows_restored = 0

- # skip this many rows
- start_row.times do
- codec.decode( io ) {|row|}
+ if start_row != 0
+ logger.info{ "skipping #{start_row} rows from #{filename}" }
+ start_row.times do |i|
+ row_enum.next
+ logger.info{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ end
+ logger.info{ "skipped #{start_row} from #{filename}" }
+ rows_restored += start_row
  end

- # copy rows into db
- while !io.eof?
- # fetch a page of rows
- rows_ary = []
- begin
- page_size.times do |i|
- codec.decode( io ) do |row|
- rows_ary << row
- end
- rows_restored += 1
- end
- rescue EOFError => e
- # ran out of rows, so just use the ones we have so far
- end
+ logger.info{ "inserting to #{table_name} from #{rows_restored}" }

- # insert to db. Hopeful db support bulk insert, which Sequel will figure out
+ loop do
  db.transaction do
- table_dataset.import column_names, rows_ary
- yield rows_restored if block_given?
- logger.info "restored #{rows_restored}"
+ begin
+ page_size.times do
+ # This skips all the checks in the Sequel code
+ sql = table_dataset.clone( columns: columns, values: row_enum.next ).send( :clause_sql, :insert )
+ db.execute sql unless dry_run?
+ rows_restored += 1
+ end
+ rescue StopIteration
+ # reached the end of the input stream.
+ # So commit this transaction, and then re-raise
+ # StopIteration to get out of the loop{} statement
+ db.after_commit{ raise StopIteration }
+ end
+ logger.info{ "#{table_name} inserted #{rows_restored}" }
  end
  end
-
+ logger.info{ "#{table_name} done. Inserted #{rows_restored}." }
  rows_restored
  end

- def self.from_bz2( filename, db, table_name, options = {} )
- IO.popen( "pbzip2 -d -c #{filename}" ) do |io|
- dbpump = DbPump.new db, table_name, io: io
- dbpump.restore
+ # this doesn't really belong here, but it will do for now.
+ def open_bz2( filename )
+ io.andand.close if io != STDOUT && !io.andand.closed?
+ self.io = IO.popen( "pbzip2 -d -c #{filename}" )
+ end
+
+ # enumerate through the given io at its current position
+ def each_row
+ return enum_for(__method__) unless block_given?
+ yield codec.decode( io ) until io.eof?
+ end
+
+ def insert_sql_each
+ return enum_for(__method__) unless block_given?
+ each_row do |row|
+ yield table_dataset.insert_sql( row )
  end
  end
  end
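restore now pulls rows through the each_row Enumerator, inserts them one page per transaction, and uses after_commit plus StopIteration to commit the final partial page cleanly. One practical consequence is that a partially restored table can be resumed by skipping rows. A minimal sketch of that, assuming the connection URL, path, table name and row count are all placeholders:

``` ruby
# Sketch only: resuming a partially restored table.
# Connection url, path, table name and row count are placeholders.
require 'sequel'
require 'wyrm/db_pump'

db = Sequel.connect 'postgres://postgres@localhost/other_db'
pump = DbPump.new db, :things
pump.open_bz2 '/mnt/disk/wyrm/things.dbp.bz2'

# skip the rows that were already inserted, then carry on
pump.restore start_row: 100_000, filename: 'things.dbp.bz2'
```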
data/lib/wyrm/dump_schema.rb CHANGED
@@ -1,15 +1,5 @@
  require 'logger'
- require 'wyrm/db_pump'
-
- class Object
- def call_or_self( maybe_callable )
- if maybe_callable.respond_to? :call
- maybe_callable.call( self )
- else
- maybe_callable
- end
- end
- end
+ require 'wyrm/pump_maker'

  # Dump a schema and compressed data from a db to a set of files
  # src_db = Sequel.connect "postgres://localhost:5454/lots"
@@ -17,19 +7,18 @@ end
  # ds.dump_schema
  # ds.dump_tables
  class DumpSchema
+ include PumpMaker
+
  def initialize( src_db, container = nil, pump: nil )
- src_db.extension :schema_dumper
- @src_db = src_db
- @container = Pathname(container)
- @pump = make_pump( pump )
+ @src_db = maybe_deebe src_db
+ @container = Pathname.new container
+ @pump = make_pump( @src_db, pump )
+
+ @src_db.extension :schema_dumper
  end

  attr_reader :src_db, :container, :pump

- def make_pump( pump_thing )
- call_or_self(pump_thing) || DbPump.new( src_db, nil )
- end
-
  def schema_migration
  @schema_migration ||= src_db.dump_schema_migration(:indexes=>false, :same_db => same_db)
  end
@@ -116,11 +105,17 @@ class DumpSchema
  end

  def dump_table( table_name )
+ pump.table_name = table_name
+ if pump.table_dataset.empty?
+ logger.info "No records in #{table_name}"
+ return
+ end
+
  filename = container + "#{table_name}.dbp.bz2"
  logger.info "dumping #{table_name} to #{filename}"
+
  open_bz2 filename do |zio|
  # generate the dump
- pump.table_name = table_name
  pump.io = zio
  pump.dump
  end
data/lib/wyrm/pump_maker.rb ADDED
@@ -0,0 +1,28 @@
+ require 'wyrm/db_pump'
+
+ class Object
+   def call_or_self( maybe_callable )
+     if maybe_callable.respond_to? :call
+       maybe_callable.call( self )
+     else
+       maybe_callable
+     end
+   end
+ end
+
+ module PumpMaker
+   def make_pump( db, pump_thing )
+     call_or_self(pump_thing) || DbPump.new( db, nil )
+   end
+
+   def maybe_deebe( db_or_string )
+     case db_or_string
+     when String
+       Sequel.connect db_or_string
+     when Sequel::Database
+       db_or_string
+     else
+       raise "Don't know how to db-ify #{db_or_string.inspect}"
+     end
+   end
+ end
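PumpMaker is the new home for the two conversions DumpSchema and RestoreSchema share: maybe_deebe accepts either a Sequel connection string or an already-connected Sequel::Database, and make_pump accepts nil (build a default DbPump) or a ready-made pump. A rough sketch of the accepted inputs; the including class and connection string are placeholders:

``` ruby
# Sketch only: inputs accepted by PumpMaker. Example class is hypothetical.
require 'sequel'
require 'wyrm/pump_maker'

class Example
  include PumpMaker
end

ex = Example.new

db = ex.maybe_deebe 'postgres://postgres@localhost/your_db' # String -> Sequel::Database
db = ex.maybe_deebe db                                      # a Database passes straight through

ex.make_pump( db, nil )                     # falls back to DbPump.new( db, nil )
ex.make_pump( db, DbPump.new( db, nil ) )   # a ready-made pump is used as-is
```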
data/lib/wyrm/restore_schema.rb CHANGED
@@ -1,5 +1,5 @@
  require 'logger'
- require 'wyrm/db_pump'
+ require 'wyrm/pump_maker'

  # Load a schema from a set of dump files (from DumpSchema)
  # and restore the table data
@@ -8,15 +8,18 @@ require 'wyrm/db_pump'
  # rs.create
  # rs.restore_tables
  class RestoreSchema
- def initialize( dst_db, container )
- @container = container
- @dst_db = dst_db
- @options = {:codec => :marshal}
- load_migrations @container
+ include PumpMaker
+
+ def initialize( dst_db, container, pump: nil )
+ @container = Pathname.new container
+ @dst_db = maybe_deebe dst_db
+ @pump = make_pump( @dst_db, pump )
+
+ load_migrations
  end

+ attr_reader :pump
  attr_reader :dst_db
- attr_reader :options
  attr_reader :container
  attr_reader :schema_migration, :index_migration, :fk_migration

@@ -24,7 +27,7 @@ class RestoreSchema
  @logger ||= Logger.new STDERR
  end

- def load_migrations( container )
+ def load_migrations
  @schema_migration = (container + '001_schema.rb').read
  @index_migration = (container + '003_indexes.rb').read
  @fk_migration = (container + '004_foreign_keys.rb').read
@@ -46,19 +49,24 @@ class RestoreSchema

  # create the destination schema
  def create
+ logger.info "creating tables"
  eval( schema_migration ).apply dst_db, :up
  end

- def restore_one_table( table_file )
+ # assume the table name is the base name of table_file
+ def restore_table( table_file )
  logger.info "restoring from #{table_file}"
- table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
- # check if table has been restored already, and has the correct rows,
+ pump.table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
+ # TODO check if table has been restored already, and has the correct rows,
  # otherwise pass in a start row.
- DbPump.from_bz2 table_file, dst_db, table_name
+ IO.popen( "pbzip2 -d -c #{table_file}" ) do |io|
+ pump.io = io
+ pump.restore
+ end
  end

  def restore_tables
  table_files = Pathname.glob Pathname(container) + '*dbp.bz2'
- table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_one_table table_file}
+ table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_table table_file}
  end
  end
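RestoreSchema now accepts a connection string or a Sequel::Database, builds its pump through PumpMaker, and exposes a pump: keyword, so a pre-configured DbPump can be injected instead of the default. A minimal sketch, with placeholder connection URL and dump directory:

``` ruby
# Sketch only: injecting a pre-built pump into RestoreSchema.
# Connection url and dump directory are placeholders.
require 'sequel'
require 'wyrm/restore_schema'

db = Sequel.connect 'postgres://postgres@localhost/your_db'
rs = RestoreSchema.new db, '/mnt/disk/wyrm', pump: DbPump.new( db, nil )
rs.create
rs.restore_tables
rs.index
```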
data/lib/wyrm/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Wyrm
- VERSION = "0.1.4"
+ VERSION = "0.2.0"
  end
data/wyrm.gemspec CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

- spec.add_runtime_dependency "sequel"
+ spec.add_runtime_dependency 'sequel', '~> 4.0.0'
  spec.add_runtime_dependency "fastandand"

  spec.add_development_dependency "bundler", "~> 1.3"
metadata CHANGED
@@ -1,29 +1,29 @@
  --- !ruby/object:Gem::Specification
  name: wyrm
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 0.2.0
  platform: ruby
  authors:
  - John Anderson
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-08-02 00:00:00.000000000 Z
+ date: 2013-08-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: sequel
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ~>
  - !ruby/object:Gem::Version
- version: '0'
+ version: 4.0.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ~>
  - !ruby/object:Gem::Version
- version: '0'
+ version: 4.0.0
  - !ruby/object:Gem::Dependency
  name: fastandand
  requirement: !ruby/object:Gem::Requirement
@@ -69,7 +69,8 @@ dependencies:
  description: Transfer from one SQL database to another
  email:
  - panic@semiosix.com
- executables: []
+ executables:
+ - wyrm
  extensions: []
  extra_rdoc_files: []
  files:
@@ -79,10 +80,12 @@ files:
  - LICENSE.txt
  - README.md
  - Rakefile
+ - bin/wyrm
  - lib/wyrm.rb
  - lib/wyrm/db_pump.rb
  - lib/wyrm/dump_schema.rb
  - lib/wyrm/other_schema.rb
+ - lib/wyrm/pump_maker.rb
  - lib/wyrm/restore_schema.rb
  - lib/wyrm/transferer.rb
  - lib/wyrm/version.rb