wyrm 0.2.1 → 0.3.0

@@ -0,0 +1,11 @@
+ require 'logger'
+
+ module Wyrm
+   module Logger
+     def logger
+       @logger ||= ::Logger.new( STDERR ).tap do |lgr|
+         lgr.level = ::Logger::INFO
+       end
+     end
+   end
+ end
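
This new file (wyrm/logger.rb, judging by the require 'wyrm/logger' lines elsewhere in this diff) centralises the Logger.new STDERR that 0.2.1 repeated per class, and defaults the level to INFO. A minimal sketch of how an including class picks it up; the ThingWithLogging class is purely illustrative:

    require 'wyrm/logger'

    class ThingWithLogging
      include Wyrm::Logger
    end

    thing = ThingWithLogging.new
    thing.logger.info  'visible at the default INFO level'
    thing.logger.debug 'suppressed until the level is lowered'
    thing.logger.level = ::Logger::DEBUG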
@@ -0,0 +1 @@
+ module Wyrm end
@@ -1,25 +1,28 @@
  require 'sequel'
  require 'yaml'
- require 'logger'

- Sequel.extension :migration
+ require 'wyrm/logger'
+ require 'wyrm/module'

  # TODO when restoring, could use a SizeQueue to make sure the db is kept busy
  # TODO need to version the dumps, or something like that.
  # TODO looks like io should belong to codec. Hmm. Not sure.
  # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
- class DbPump
-   # some codecs might ignore io, eg if a dbpump is talking to another dbpump
-   def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false )
+ class Wyrm::Pump
+   def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
      self.codec = codec
      self.db = db
      self.table_name = table_name
      self.io = io
      self.page_size = page_size
      self.dry_run = dry_run
+     self.logger = logger
      yield self if block_given?
    end

+   include Wyrm::Logger
+   attr_writer :logger
+
    attr_accessor :io, :page_size, :dry_run
    def dry_run?; dry_run; end

@@ -46,9 +49,11 @@ class DbPump
      @db.extension :pagination

      # turn on postgres streaming if available
-     if defined?( Sequel::Postgres ) && Sequel::Postgres.supports_streaming?
-       logger.info "Turn streaming on for postgres"
+     if defined?( Sequel::Postgres ) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+       logger.debug "Streaming for postgres"
        @db.extension :pg_streaming
+     else
+       logger.info "No streaming for postgres"
      end
    end

@@ -57,10 +62,8 @@ class DbPump
    # responds to all the methods
    def self.quacks_like( *methods )
      @quacks_like ||= {}
-     @quacks_like[methods] ||= Object.new.tap do |obj|
-       obj.define_singleton_method(:===) do |instance|
-         methods.all?{|m| instance.respond_to? m}
-       end
+     @quacks_like[methods] ||= lambda do |inst|
+       methods.all?{|m| inst.respond_to? m}
      end
    end

@@ -75,7 +78,7 @@ class DbPump
      when :marshal; MarshalCodec.new
      when Class
        codec_thing.new
-     when quacks_like( :encode, :decode )
+     when quacks_like(:encode,:decode)
        codec_thing
      else
        raise "unknown codec #{codec_thing.inspect}"
@@ -108,10 +111,6 @@ class DbPump
      end
    end

-   def logger
-     @logger ||= Logger.new STDERR
-   end
-
    def primary_keys
      @primary_keys ||= db.schema(table_name).select{|df| df.last[:primary_key]}.map{|df| df.first}
    end
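
With the ad-hoc logger method gone, a pump either uses the shared Wyrm::Logger default or takes an injected logger via the new logger: keyword. A hedged construction sketch; the connection URL, table name and output file are examples only:

    require 'logger'
    require 'sequel'
    require 'wyrm/pump'

    db = Sequel.connect 'postgres://localhost/widgets_production'

    File.open( 'widgets.dbp', 'wb' ) do |io|
      pump = Wyrm::Pump.new db: db, table_name: :widgets, io: io, page_size: 10000, logger: Logger.new(STDOUT)
      pump.dump
    end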
@@ -122,9 +121,12 @@ class DbPump

    # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
    def paginated_dump( &encode_block )
+     records_count = 0
      table_dataset.order(*primary_keys).each_page(page_size) do |page|
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{records_count}" }
+       logger.debug{ page.sql }
        page.each &encode_block
+       records_count += page_size
      end
    end

@@ -132,8 +134,6 @@ class DbPump
    # The idea is that large offsets are expensive in the db because the db server has to read
    # through the data set to reach the required offset. So make that only ids need to be read,
    # and then do the main select from the limited id list.
-   # TODO could speed this up by have a query thread which runs the next page-query while
-   # the current one is being written/compressed.
    # select * from massive as full
    # inner join (select id from massive order by whatever limit m, n) limit
    # on full.id = limit.id
@@ -144,7 +144,8 @@ class DbPump
      0.step(table_dataset.count, page_size).each do |offset|
        limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
        page = table_dataset.join( limit_dataset, Hash[ primary_keys.map{|f| [f,f]} ] ).order( *primary_keys ).qualify(table_name)
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{offset}" }
+       logger.debug{ page.sql }
        page.each &encode_block
      end
    end
@@ -162,13 +163,14 @@ class DbPump
      # bigger than max for the last page
      (min..max).step(page_size).each do |offset|
        page = table_dataset.where( id: offset...(offset + page_size) )
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{offset}" }
+       logger.debug{ page.sql }
        page.each &encode_block
      end
    end

    def stream_dump( &encode_block )
-     logger.info "using result set streaming"
+     logger.debug{ "using result set streaming" }

      # I want to output progress every page_size records,
      # without doing a records_count % page_size every iteration.
@@ -183,18 +185,23 @@ class DbPump
            records_count += 1
          end
        ensure
-         logger.info "#{records_count} from #{table_dataset.sql}"
+         logger.info{ "#{__method__} #{table_name} #{records_count}" if records_count < page_size }
+         logger.debug{ " from #{table_dataset.sql}" }
        end
      end
    end

    # Dump the serialization of the table to the specified io.
+   #
    # TODO need to also dump a first row containing useful stuff:
    # - source table name
    # - number of rows
    # - source db url
    # - permissions?
    # These should all be in one object that can be Marshall.load-ed easily.
+   #
+   # TODO could speed this up by have a query thread which runs the next page-query while
+   # the current one is being written/compressed.
    def dump
      _dump do |row|
        codec.encode( row.values, io ) unless dry_run?
@@ -239,21 +246,20 @@ class DbPump

      return unless dump_matches_columns?( row_enum, columns )

-     logger.info{ "inserting to #{table_name} #{columns.inspect}" }
+     logger.info{ "#{__method__} inserting to #{table_name} from #{start_row}" }
+     logger.debug{ " #{columns.inspect}" }
      rows_restored = 0

      if start_row != 0
-       logger.info{ "skipping #{start_row} rows from #{filename}" }
+       logger.debug{ "skipping #{start_row} rows from #{filename}" }
        start_row.times do |i|
          row_enum.next
-         logger.info{ "skipped #{i} from #{filename}" } if i % page_size == 0
+         logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
        end
-       logger.info{ "skipped #{start_row} from #{filename}" }
+       logger.debug{ "skipped #{start_row} from #{filename}" }
        rows_restored += start_row
      end

-     logger.info{ "inserting to #{table_name} from #{rows_restored}" }
-
      loop do
        db.transaction do
          begin
@@ -267,20 +273,20 @@ class DbPump
              rows_restored += 1
            end
          rescue StopIteration
-           # er reached the end of the inout stream.
+           # reached the end of the inout stream.
            # So commit this transaction, and then re-raise
            # StopIteration to get out of the loop{} statement
            db.after_commit{ raise StopIteration }
          end
-         logger.info{ "#{table_name} inserted #{rows_restored}" }
        end
      end
-     logger.info{ "#{table_name} done. Inserted #{rows_restored}." }
+     logger.info{ "#{__method__} #{table_name} done. Inserted #{rows_restored}." }
      rows_restored
    end

-   # Enumerate through the given io at its current position
-   # TODO don't check for io.eof here, leave that to the codec
+   # Enumerate through the given io at its current position.
+   # Can raise StopIteration (ie when eof is not detected)
+   # MAYBE don't check for io.eof here, leave that to the codec
    def each_row
      return enum_for(__method__) unless block_given?
      yield codec.decode( io ) until io.eof?
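
Taken together, dump and restore can round-trip a table through any IO. A hedged sketch, assuming restore only needs the filename: keyword seen later in this diff (used for log messages) and that the destination table already exists; src_db and dst_db stand in for real Sequel connections:

    require 'stringio'
    require 'wyrm/pump'

    buffer = StringIO.new

    # serialise :widgets from the source db into the buffer
    Wyrm::Pump.new( db: src_db, table_name: :widgets, io: buffer ).dump

    # replay the same rows into the destination db
    buffer.rewind
    Wyrm::Pump.new( db: dst_db, table_name: :widgets, io: buffer ).restore filename: 'widgets.dbp'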
@@ -1,6 +1,7 @@
- require 'wyrm/db_pump'
+ require 'wyrm/pump'
+ require 'wyrm/module'

- module PumpMaker
+ module Wyrm::PumpMaker
    def call_or_self( maybe_callable )
      if maybe_callable.respond_to? :call
        maybe_callable.call( self )
@@ -10,13 +11,18 @@ module PumpMaker
    end

    def make_pump( db, pump_thing )
-     call_or_self(pump_thing) || DbPump.new( db: db )
+     call_or_self(pump_thing) || Pump.new( db: db )
    end

    def maybe_deebe( db_or_string )
      case db_or_string
      when String
-       Sequel.connect db_or_string
+       begin
+         Sequel.connect db_or_string
+       rescue Sequel::AdapterNotFound
+         puts "\nCan't find db driver for #{db_or_string}. It might work to do\n\n gem install #{db_or_string.split(?:).first}\n\n"
+         exit(1)
+       end
      when Sequel::Database
        db_or_string
      else
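
maybe_deebe accepts either a connection URL or an existing Sequel::Database, and now exits with an install hint instead of a bare Sequel::AdapterNotFound when the driver gem is missing. A hedged sketch of both call styles; ConnectsThings and the URL are illustrative:

    require 'sequel'
    require 'wyrm/pump_maker'

    class ConnectsThings
      include Wyrm::PumpMaker
    end

    helper = ConnectsThings.new

    # a URL string is connected for you...
    db = helper.maybe_deebe 'sqlite://widgets.sqlite3'

    # ...while an existing connection is passed straight through
    same_db = helper.maybe_deebe db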
@@ -1,28 +1,44 @@
- require 'logger'
+ require 'ostruct'
+ require 'pathname'
+
+ require 'wyrm/logger'
+ require 'wyrm/module'
  require 'wyrm/pump_maker'
+ require 'wyrm/schema_tools'

  # Load a schema from a set of dump files (from DumpSchema)
  # and restore the table data.
  # dst_db = Sequel.connect "postgres://localhost:5454/lots"
  # rs = RestoreSchema.new dst_db, '/var/data/lots'
- # rs.create
- # rs.restore_tables
- class RestoreSchema
+ # rs.call
+ # TODO the problem with lazy loading the schema files is that
+ # errors in indexes and foreign keys will only be picked up at the
+ # end of they probably lengthy table restore process.
+ # TODO check if table has been restored already, and has the correct rows,
+ class Wyrm::Restore
    include PumpMaker
+   include SchemaTools
+   include Wyrm::Logger

-   def initialize( dst_db, container, pump: nil )
+   def initialize( container, dst_db, pump: nil, drop_tables: false )
      @container = Pathname.new container
      @dst_db = maybe_deebe dst_db
      @pump = make_pump( @dst_db, pump )
+
+     options.drop_tables = drop_tables
    end

    attr_reader :pump
    attr_reader :dst_db
    attr_reader :container

+   def options
+     @options ||= OpenStruct.new
+   end
+
    # sequel wants migrations numbered, but it's a bit of an annoyance for this.
    def find_single( glob )
-     candidates =Pathname.glob container + glob
+     candidates = Pathname.glob container + glob
      raise "too many #{candidates.inspect} for #{glob}" unless candidates.size == 1
      candidates.first
    end
@@ -45,38 +61,13 @@ class RestoreSchema
      @schema_migration = nil
    end

-   def logger
-     @logger ||= Logger.new STDERR
-   end
-
-   # create indexes and foreign keys, and reset sequences
-   def index
-     logger.info "creating indexes"
-     eval( index_migration ).apply dst_db, :up
-     logger.info "creating foreign keys"
-     eval( fk_migration ).apply dst_db, :up
-
-     if dst_db.database_type == :postgres
-       logger.info "reset primary key sequences"
-       dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
-       logger.info "Primary key sequences reset successfully"
-     end
-   end
-
-   # create the destination schema
-   def create
-     logger.info "creating tables"
-     eval( schema_migration ).apply dst_db, :up
-   end
-
    # assume the table name is the base name of table_file pathname
    def restore_table( table_file )
      logger.info "restoring from #{table_file}"
      pump.table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
-     # TODO check if table has been restored already, and has the correct rows,
      open_bz2 table_file do |io|
        pump.io = io
-       pump.restore
+       pump.restore filename: table_file
      end
    end

@@ -95,8 +86,24 @@ class RestoreSchema
      IO.popen "pbzip2 -d -c #{table_file}", &block
    end

+   def table_files
+     Pathname.glob container + '*.dbp.bz2'
+   end
+
    def restore_tables
-     table_files = Pathname.glob container + '*.dbp.bz2'
      table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_table table_file}
    end
+
+   def table_names
+     table_files.map do |path|
+       path.basename.to_s.split(?.)[0...-2].last.to_sym
+     end
+   end
+
+   def call
+     drop_tables(table_names) if options.drop_tables
+     create_tables
+     restore_tables
+     create_indexes
+   end
  end
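
The old create / index / restore_tables sequence is now wrapped up in call, with optional dropping of existing tables first. A hedged usage sketch; the require path, container directory and URL are examples (the container is expected to hold the *.dbp.bz2 files written by the dump side):

    require 'wyrm/restore'   # assumed path for Wyrm::Restore

    # note the new argument order: container first, then destination db
    restore = Wyrm::Restore.new '/var/data/lots', 'postgres://localhost:5454/lots', drop_tables: true
    restore.call   # drop_tables, create_tables, restore_tables, create_indexes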
@@ -0,0 +1,91 @@
+ require 'fastandand'
+ Sequel.extension :migration
+ require 'wyrm/module'
+
+ # needs dst_db for mutate operations
+ # and src_db for fetch operations
+ # src_db must have extension(:schema_dumper)
+ module Wyrm::SchemaTools
+   # some includers will need to provide a different implementation for this.
+   def same_db
+     respond_to?( :dst_db ) && respond_to?( :src_db ) && dst_db.andand.database_type == src_db.andand.database_type
+   end
+
+   def schema_migration
+     @schema_migration ||= src_db.dump_schema_migration(:indexes=>false, :same_db => same_db)
+   end
+
+   def index_migration
+     @index_migration ||= src_db.dump_indexes_migration(:same_db => same_db)
+   end
+
+   def fk_migration
+     @fk_migration ||= src_db.dump_foreign_key_migration(:same_db => same_db)
+   end
+
+   def drop_table_options
+     @drop_table_options ||=
+       begin
+         if dst_db.database_type == :postgres
+           {cascade: true}
+         else
+           {}
+         end
+       end
+   end
+
+   # Delete given tables.
+   # Recurse if there are foreign keys preventing table deletion.
+   # This implementation will fail for tables with mutual foreign keys.
+   # TODO maybe this should use the schema down migration?
+   def drop_tables( tables )
+     foreign_keyed_tables = []
+     tables.each do |table_name|
+       begin
+         logger.debug "dropping #{table_name}"
+         dst_db.drop_table? table_name, drop_table_options
+
+       rescue Sequel::ForeignKeyConstraintViolation => ex
+         foreign_keyed_tables << table_name
+
+       rescue Sequel::DatabaseError => ex
+         # Mysql2::Error: Cannot delete or update a parent row: a foreign key constraint fails
+         # SQLite3::ConstraintException: FOREIGN KEY constraint failed
+         if ex.message =~ /foreign key constraint fail/i
+           foreign_keyed_tables << table_name
+         else
+           raise
+         end
+
+       end
+     end
+
+     # this should be temporary
+     if tables.sort == foreign_keyed_tables.sort
+       raise "can't remove #{tables.inspect} because they have mutual foreign keys"
+     end
+
+     # recursively delete tables
+     drop_tables foreign_keyed_tables.shuffle unless foreign_keyed_tables.empty?
+   end
+
+   def create_tables
+     logger.info "creating tables"
+     eval( schema_migration ).apply dst_db, :up
+   end
+
+   def create_indexes
+     # create indexes and foreign keys, and reset sequences
+     logger.info "creating indexes"
+     eval( index_migration ).apply dst_db, :up
+
+     logger.info "creating foreign keys"
+     eval( fk_migration ).apply dst_db, :up
+
+     if dst_db.database_type == :postgres
+       logger.info "reset primary key sequences"
+       dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
+       logger.info "Primary key sequences reset successfully"
+     end
+   end
+ end
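
Wyrm::SchemaTools only assumes its includer provides dst_db, src_db (with the schema_dumper extension, for the *_migration methods) and a logger. A hedged sketch of a minimal includer that copies a schema between two connections; SchemaCopier and the URLs are illustrative:

    require 'sequel'
    require 'wyrm/logger'
    require 'wyrm/schema_tools'

    class SchemaCopier
      include Wyrm::Logger
      include Wyrm::SchemaTools

      attr_reader :src_db, :dst_db

      def initialize( src_db, dst_db )
        @src_db, @dst_db = src_db, dst_db
        @src_db.extension :schema_dumper   # needed by schema_migration and friends
      end

      def call
        create_tables    # apply the dumped schema migration to dst_db
        create_indexes   # then indexes, foreign keys and sequence resets
      end
    end

    SchemaCopier.new( Sequel.connect('sqlite://src.sqlite3'), Sequel.connect('sqlite://dst.sqlite3') ).call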