wyrm 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
+ require 'logger'
+
+ module Wyrm
+   module Logger
+     def logger
+       @logger ||= ::Logger.new( STDERR ).tap do |lgr|
+         lgr.level = ::Logger::INFO
+       end
+     end
+   end
+ end
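
The new Wyrm::Logger mixin above replaces the per-class `Logger.new STDERR` methods that the Pump and Restore hunks below delete. A minimal usage sketch, assuming only what the hunk shows (the Widget class is hypothetical, not part of the gem):

    require 'wyrm/logger'

    class Widget
      include Wyrm::Logger   # provides a memoised #logger writing to STDERR at INFO level
    end

    widget = Widget.new
    widget.logger.info  "visible"     # INFO is the default level, so this is logged
    widget.logger.debug "suppressed"  # below the default level, so this is not logged
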
@@ -0,0 +1 @@
+ module Wyrm end
@@ -1,25 +1,28 @@
  require 'sequel'
  require 'yaml'
- require 'logger'
 
- Sequel.extension :migration
+ require 'wyrm/logger'
+ require 'wyrm/module'
 
  # TODO when restoring, could use a SizeQueue to make sure the db is kept busy
  # TODO need to version the dumps, or something like that.
  # TODO looks like io should belong to codec. Hmm. Not sure.
  # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
- class DbPump
-   # some codecs might ignore io, eg if a dbpump is talking to another dbpump
-   def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false )
+ class Wyrm::Pump
+   def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
      self.codec = codec
      self.db = db
      self.table_name = table_name
      self.io = io
      self.page_size = page_size
      self.dry_run = dry_run
+     self.logger = logger
      yield self if block_given?
    end
 
+   include Wyrm::Logger
+   attr_writer :logger
+
    attr_accessor :io, :page_size, :dry_run
    def dry_run?; dry_run; end
 
@@ -46,9 +49,11 @@ class DbPump
      @db.extension :pagination
 
      # turn on postgres streaming if available
-     if defined?( Sequel::Postgres ) && Sequel::Postgres.supports_streaming?
-       logger.info "Turn streaming on for postgres"
+     if defined?( Sequel::Postgres ) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+       logger.debug "Streaming for postgres"
        @db.extension :pg_streaming
+     else
+       logger.info "No streaming for postgres"
      end
    end
 
@@ -57,10 +62,8 @@ class DbPump
    # responds to all the methods
    def self.quacks_like( *methods )
      @quacks_like ||= {}
-     @quacks_like[methods] ||= Object.new.tap do |obj|
-       obj.define_singleton_method(:===) do |instance|
-         methods.all?{|m| instance.respond_to? m}
-       end
+     @quacks_like[methods] ||= lambda do |inst|
+       methods.all?{|m| inst.respond_to? m}
      end
    end
 
@@ -75,7 +78,7 @@ class DbPump
      when :marshal; MarshalCodec.new
      when Class
        codec_thing.new
-     when quacks_like( :encode, :decode )
+     when quacks_like(:encode,:decode)
        codec_thing
      else
        raise "unknown codec #{codec_thing.inspect}"
@@ -108,10 +111,6 @@ class DbPump
      end
    end
 
-   def logger
-     @logger ||= Logger.new STDERR
-   end
-
    def primary_keys
      @primary_keys ||= db.schema(table_name).select{|df| df.last[:primary_key]}.map{|df| df.first}
    end
@@ -122,9 +121,12 @@ class DbPump
 
    # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
    def paginated_dump( &encode_block )
+     records_count = 0
      table_dataset.order(*primary_keys).each_page(page_size) do |page|
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{records_count}" }
+       logger.debug{ page.sql }
        page.each &encode_block
+       records_count += page_size
      end
    end
 
@@ -132,8 +134,6 @@ class DbPump
    # The idea is that large offsets are expensive in the db because the db server has to read
    # through the data set to reach the required offset. So make that only ids need to be read,
    # and then do the main select from the limited id list.
-   # TODO could speed this up by have a query thread which runs the next page-query while
-   # the current one is being written/compressed.
    # select * from massive as full
    # inner join (select id from massive order by whatever limit m, n) limit
    # on full.id = limit.id
@@ -144,7 +144,8 @@ class DbPump
      0.step(table_dataset.count, page_size).each do |offset|
        limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
        page = table_dataset.join( limit_dataset, Hash[ primary_keys.map{|f| [f,f]} ] ).order( *primary_keys ).qualify(table_name)
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{offset}" }
+       logger.debug{ page.sql }
        page.each &encode_block
      end
    end
@@ -162,13 +163,14 @@ class DbPump
      # bigger than max for the last page
      (min..max).step(page_size).each do |offset|
        page = table_dataset.where( id: offset...(offset + page_size) )
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{offset}" }
+       logger.debug{ page.sql }
        page.each &encode_block
      end
    end
 
    def stream_dump( &encode_block )
-     logger.info "using result set streaming"
+     logger.debug{ "using result set streaming" }
 
      # I want to output progress every page_size records,
      # without doing a records_count % page_size every iteration.
@@ -183,18 +185,23 @@ class DbPump
            records_count += 1
          end
        ensure
-         logger.info "#{records_count} from #{table_dataset.sql}"
+         logger.info{ "#{__method__} #{table_name} #{records_count}" if records_count < page_size }
+         logger.debug{ " from #{table_dataset.sql}" }
        end
      end
    end
 
    # Dump the serialization of the table to the specified io.
+   #
    # TODO need to also dump a first row containing useful stuff:
    # - source table name
    # - number of rows
    # - source db url
    # - permissions?
    # These should all be in one object that can be Marshall.load-ed easily.
+   #
+   # TODO could speed this up by have a query thread which runs the next page-query while
+   # the current one is being written/compressed.
    def dump
      _dump do |row|
        codec.encode( row.values, io ) unless dry_run?
@@ -239,21 +246,20 @@ class DbPump
 
      return unless dump_matches_columns?( row_enum, columns )
 
-     logger.info{ "inserting to #{table_name} #{columns.inspect}" }
+     logger.info{ "#{__method__} inserting to #{table_name} from #{start_row}" }
+     logger.debug{ " #{columns.inspect}" }
      rows_restored = 0
 
      if start_row != 0
-       logger.info{ "skipping #{start_row} rows from #{filename}" }
+       logger.debug{ "skipping #{start_row} rows from #{filename}" }
        start_row.times do |i|
          row_enum.next
-         logger.info{ "skipped #{i} from #{filename}" } if i % page_size == 0
+         logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
        end
-       logger.info{ "skipped #{start_row} from #{filename}" }
+       logger.debug{ "skipped #{start_row} from #{filename}" }
        rows_restored += start_row
      end
 
-     logger.info{ "inserting to #{table_name} from #{rows_restored}" }
-
      loop do
        db.transaction do
          begin
@@ -267,20 +273,20 @@ class DbPump
              rows_restored += 1
            end
          rescue StopIteration
-           # er reached the end of the inout stream.
+           # reached the end of the inout stream.
            # So commit this transaction, and then re-raise
            # StopIteration to get out of the loop{} statement
            db.after_commit{ raise StopIteration }
          end
-         logger.info{ "#{table_name} inserted #{rows_restored}" }
        end
      end
-     logger.info{ "#{table_name} done. Inserted #{rows_restored}." }
+     logger.info{ "#{__method__} #{table_name} done. Inserted #{rows_restored}." }
      rows_restored
    end
 
-   # Enumerate through the given io at its current position
-   # TODO don't check for io.eof here, leave that to the codec
+   # Enumerate through the given io at its current position.
+   # Can raise StopIteration (ie when eof is not detected)
+   # MAYBE don't check for io.eof here, leave that to the codec
    def each_row
      return enum_for(__method__) unless block_given?
      yield codec.decode( io ) until io.eof?
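
Taken together, these hunks rename DbPump to Wyrm::Pump, make the logger injectable, and demote the per-page SQL output to debug level. A rough construction sketch, assuming a Sequel handle and a writable io (the database, table name and file name are illustrative, not from the diff):

    require 'logger'
    require 'sequel'
    require 'wyrm/pump'

    db    = Sequel.sqlite                                        # illustrative in-memory db holding a :things table
    quiet = Logger.new( STDERR ).tap{|lgr| lgr.level = Logger::WARN}

    File.open 'things.dbp', 'wb' do |io|
      pump = Wyrm::Pump.new db: db, table_name: :things, io: io, logger: quiet
      pump.dump   # encodes each row of :things to the io with the default marshal codec
    end
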
@@ -1,6 +1,7 @@
- require 'wyrm/db_pump'
+ require 'wyrm/pump'
+ require 'wyrm/module'
 
- module PumpMaker
+ module Wyrm::PumpMaker
    def call_or_self( maybe_callable )
      if maybe_callable.respond_to? :call
        maybe_callable.call( self )
@@ -10,13 +11,18 @@ module PumpMaker
    end
 
    def make_pump( db, pump_thing )
-     call_or_self(pump_thing) || DbPump.new( db: db )
+     call_or_self(pump_thing) || Pump.new( db: db )
    end
 
    def maybe_deebe( db_or_string )
      case db_or_string
      when String
-       Sequel.connect db_or_string
+       begin
+         Sequel.connect db_or_string
+       rescue Sequel::AdapterNotFound
+         puts "\nCan't find db driver for #{db_or_string}. It might work to do\n\n gem install #{db_or_string.split(?:).first}\n\n"
+         exit(1)
+       end
      when Sequel::Database
        db_or_string
      else
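
The maybe_deebe change swaps a raw Sequel::AdapterNotFound backtrace for a hint and a clean exit. A sketch of the intended behaviour, assuming the relevant adapter gem is not installed (output paraphrased from the message built above):

    require 'wyrm/pump_maker'
    include Wyrm::PumpMaker

    maybe_deebe "postgres://localhost/lots"
    # Prints roughly:
    #   Can't find db driver for postgres://localhost/lots. It might work to do
    #
    #    gem install postgres
    #
    # and then calls exit(1) instead of raising.
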
@@ -1,28 +1,44 @@
- require 'logger'
+ require 'ostruct'
+ require 'pathname'
+
+ require 'wyrm/logger'
+ require 'wyrm/module'
  require 'wyrm/pump_maker'
+ require 'wyrm/schema_tools'
 
  # Load a schema from a set of dump files (from DumpSchema)
  # and restore the table data.
  # dst_db = Sequel.connect "postgres://localhost:5454/lots"
  # rs = RestoreSchema.new dst_db, '/var/data/lots'
- # rs.create
- # rs.restore_tables
- class RestoreSchema
+ # rs.call
+ # TODO the problem with lazy loading the schema files is that
+ # errors in indexes and foreign keys will only be picked up at the
+ # end of they probably lengthy table restore process.
+ # TODO check if table has been restored already, and has the correct rows,
+ class Wyrm::Restore
    include PumpMaker
+   include SchemaTools
+   include Wyrm::Logger
 
-   def initialize( dst_db, container, pump: nil )
+   def initialize( container, dst_db, pump: nil, drop_tables: false )
      @container = Pathname.new container
      @dst_db = maybe_deebe dst_db
      @pump = make_pump( @dst_db, pump )
+
+     options.drop_tables = drop_tables
    end
 
    attr_reader :pump
    attr_reader :dst_db
    attr_reader :container
 
+   def options
+     @options ||= OpenStruct.new
+   end
+
    # sequel wants migrations numbered, but it's a bit of an annoyance for this.
    def find_single( glob )
-     candidates =Pathname.glob container + glob
+     candidates = Pathname.glob container + glob
      raise "too many #{candidates.inspect} for #{glob}" unless candidates.size == 1
      candidates.first
    end
@@ -45,38 +61,13 @@ class RestoreSchema
      @schema_migration = nil
    end
 
-   def logger
-     @logger ||= Logger.new STDERR
-   end
-
-   # create indexes and foreign keys, and reset sequences
-   def index
-     logger.info "creating indexes"
-     eval( index_migration ).apply dst_db, :up
-     logger.info "creating foreign keys"
-     eval( fk_migration ).apply dst_db, :up
-
-     if dst_db.database_type == :postgres
-       logger.info "reset primary key sequences"
-       dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
-       logger.info "Primary key sequences reset successfully"
-     end
-   end
-
-   # create the destination schema
-   def create
-     logger.info "creating tables"
-     eval( schema_migration ).apply dst_db, :up
-   end
-
    # assume the table name is the base name of table_file pathname
    def restore_table( table_file )
      logger.info "restoring from #{table_file}"
      pump.table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
-     # TODO check if table has been restored already, and has the correct rows,
      open_bz2 table_file do |io|
        pump.io = io
-       pump.restore
+       pump.restore filename: table_file
      end
    end
 
@@ -95,8 +86,24 @@ class RestoreSchema
      IO.popen "pbzip2 -d -c #{table_file}", &block
    end
 
+   def table_files
+     Pathname.glob container + '*.dbp.bz2'
+   end
+
    def restore_tables
-     table_files = Pathname.glob container + '*.dbp.bz2'
      table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_table table_file}
    end
+
+   def table_names
+     table_files.map do |path|
+       path.basename.to_s.split(?.)[0...-2].last.to_sym
+     end
+   end
+
+   def call
+     drop_tables(table_names) if options.drop_tables
+     create_tables
+     restore_tables
+     create_indexes
+   end
  end
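
RestoreSchema becomes Wyrm::Restore with a swapped argument order (container first, then dst_db) and a single call entry point that optionally drops tables first. A usage sketch based on the constructor and #call shown above; the path and URL are illustrative, and the wyrm/restore require path is inferred from the class name rather than shown in the diff:

    require 'wyrm/restore'

    restore = Wyrm::Restore.new '/var/data/lots', 'postgres://localhost/lots', drop_tables: true
    restore.call   # drop_tables, create_tables, restore_tables, create_indexes
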
@@ -0,0 +1,91 @@
+ require 'fastandand'
+ Sequel.extension :migration
+ require 'wyrm/module'
+
+ # needs dst_db for mutate operations
+ # and src_db for fetch operations
+ # src_db must have extension(:schema_dumper)
+ module Wyrm::SchemaTools
+   # some includers will need to provide a different implementation for this.
+   def same_db
+     respond_to?( :dst_db ) && respond_to?( :src_db ) && dst_db.andand.database_type == src_db.andand.database_type
+   end
+
+   def schema_migration
+     @schema_migration ||= src_db.dump_schema_migration(:indexes=>false, :same_db => same_db)
+   end
+
+   def index_migration
+     @index_migration ||= src_db.dump_indexes_migration(:same_db => same_db)
+   end
+
+   def fk_migration
+     @fk_migration ||= src_db.dump_foreign_key_migration(:same_db => same_db)
+   end
+
+   def drop_table_options
+     @drop_table_options ||=
+       begin
+         if dst_db.database_type == :postgres
+           {cascade: true}
+         else
+           {}
+         end
+       end
+   end
+
+   # Delete given tables.
+   # Recurse if there are foreign keys preventing table deletion.
+   # This implementation will fail for tables with mutual foreign keys.
+   # TODO maybe this should use the schema down migration?
+   def drop_tables( tables )
+     foreign_keyed_tables = []
+     tables.each do |table_name|
+       begin
+         logger.debug "dropping #{table_name}"
+         dst_db.drop_table? table_name, drop_table_options
+
+       rescue Sequel::ForeignKeyConstraintViolation => ex
+         foreign_keyed_tables << table_name
+
+       rescue Sequel::DatabaseError => ex
+         # Mysql2::Error: Cannot delete or update a parent row: a foreign key constraint fails
+         # SQLite3::ConstraintException: FOREIGN KEY constraint failed
+         if ex.message =~ /foreign key constraint fail/i
+           foreign_keyed_tables << table_name
+         else
+           raise
+         end
+
+       end
+     end
+
+     # this should be temporary
+     if tables.sort == foreign_keyed_tables.sort
+       raise "can't remove #{tables.inspect} because they have mutual foreign keys"
+     end
+
+     # recursively delete tables
+     drop_tables foreign_keyed_tables.shuffle unless foreign_keyed_tables.empty?
+   end
+
+   def create_tables
+     logger.info "creating tables"
+     eval( schema_migration ).apply dst_db, :up
+   end
+
+   def create_indexes
+     # create indexes and foreign keys, and reset sequences
+     logger.info "creating indexes"
+     eval( index_migration ).apply dst_db, :up
+
+     logger.info "creating foreign keys"
+     eval( fk_migration ).apply dst_db, :up
+
+     if dst_db.database_type == :postgres
+       logger.info "reset primary key sequences"
+       dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
+       logger.info "Primary key sequences reset successfully"
+     end
+   end
+ end
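
Wyrm::SchemaTools only expects its includer to provide dst_db, src_db (with the schema_dumper extension loaded) and a logger, so it can back both the dump and restore sides. A hypothetical includer sketch under those assumptions; the SchemaCopier class is illustrative and not part of the gem:

    require 'sequel'
    require 'wyrm/logger'
    require 'wyrm/schema_tools'

    # Hypothetical copier: supplies the src_db, dst_db and logger that SchemaTools calls.
    class SchemaCopier
      include Wyrm::Logger
      include Wyrm::SchemaTools

      def initialize( src_db, dst_db )
        @src_db, @dst_db = src_db, dst_db
        @src_db.extension :schema_dumper   # schema_migration / index_migration / fk_migration need this
      end

      attr_reader :src_db, :dst_db

      def call
        drop_tables dst_db.tables   # recursive drop from SchemaTools
        create_tables
        create_indexes
      end
    end
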