wyrm 0.2.1 → 0.3.0

@@ -0,0 +1,11 @@
+ require 'logger'
+
+ module Wyrm
+   module Logger
+     def logger
+       @logger ||= ::Logger.new( STDERR ).tap do |lgr|
+         lgr.level = ::Logger::INFO
+       end
+     end
+   end
+ end
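
This new file (wyrm/logger.rb, judging by the require 'wyrm/logger' lines elsewhere in this diff) centralises the Logger.new STDERR that 0.2.1 repeated per class, and defaults the level to INFO. A minimal sketch of how an including class picks it up; the ThingWithLogging class is purely illustrative:

    require 'wyrm/logger'

    class ThingWithLogging
      include Wyrm::Logger
    end

    thing = ThingWithLogging.new
    thing.logger.info  'visible at the default INFO level'
    thing.logger.debug 'suppressed until the level is lowered'
    thing.logger.level = ::Logger::DEBUG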
@@ -0,0 +1 @@
+ module Wyrm end
@@ -1,25 +1,28 @@
  require 'sequel'
  require 'yaml'
- require 'logger'

- Sequel.extension :migration
+ require 'wyrm/logger'
+ require 'wyrm/module'

  # TODO when restoring, could use a SizeQueue to make sure the db is kept busy
  # TODO need to version the dumps, or something like that.
  # TODO looks like io should belong to codec. Hmm. Not sure.
  # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
- class DbPump
-   # some codecs might ignore io, eg if a dbpump is talking to another dbpump
-   def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false )
+ class Wyrm::Pump
+   def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
      self.codec = codec
      self.db = db
      self.table_name = table_name
      self.io = io
      self.page_size = page_size
      self.dry_run = dry_run
+     self.logger = logger
      yield self if block_given?
    end

+   include Wyrm::Logger
+   attr_writer :logger
+
    attr_accessor :io, :page_size, :dry_run
    def dry_run?; dry_run; end

@@ -46,9 +49,11 @@ class DbPump
      @db.extension :pagination

      # turn on postgres streaming if available
-     if defined?( Sequel::Postgres ) && Sequel::Postgres.supports_streaming?
-       logger.info "Turn streaming on for postgres"
+     if defined?( Sequel::Postgres ) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+       logger.debug "Streaming for postgres"
        @db.extension :pg_streaming
+     else
+       logger.info "No streaming for postgres"
      end
    end

@@ -57,10 +62,8 @@ class DbPump
    # responds to all the methods
    def self.quacks_like( *methods )
      @quacks_like ||= {}
-     @quacks_like[methods] ||= Object.new.tap do |obj|
-       obj.define_singleton_method(:===) do |instance|
-         methods.all?{|m| instance.respond_to? m}
-       end
+     @quacks_like[methods] ||= lambda do |inst|
+       methods.all?{|m| inst.respond_to? m}
      end
    end

@@ -75,7 +78,7 @@ class DbPump
      when :marshal; MarshalCodec.new
      when Class
        codec_thing.new
-     when quacks_like( :encode, :decode )
+     when quacks_like(:encode,:decode)
        codec_thing
      else
        raise "unknown codec #{codec_thing.inspect}"
@@ -108,10 +111,6 @@ class DbPump
      end
    end

-   def logger
-     @logger ||= Logger.new STDERR
-   end
-
    def primary_keys
      @primary_keys ||= db.schema(table_name).select{|df| df.last[:primary_key]}.map{|df| df.first}
    end
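
With the ad-hoc logger method gone, a pump either uses the shared Wyrm::Logger default or takes an injected logger via the new logger: keyword. A hedged construction sketch; the connection URL, table name and output file are examples only:

    require 'logger'
    require 'sequel'
    require 'wyrm/pump'

    db = Sequel.connect 'postgres://localhost/widgets_production'

    File.open( 'widgets.dbp', 'wb' ) do |io|
      pump = Wyrm::Pump.new db: db, table_name: :widgets, io: io, page_size: 10000, logger: Logger.new(STDOUT)
      pump.dump
    end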
@@ -122,9 +121,12 @@ class DbPump

    # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
    def paginated_dump( &encode_block )
+     records_count = 0
      table_dataset.order(*primary_keys).each_page(page_size) do |page|
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{records_count}" }
+       logger.debug{ page.sql }
        page.each &encode_block
+       records_count += page_size
      end
    end

@@ -132,8 +134,6 @@ class DbPump
    # The idea is that large offsets are expensive in the db because the db server has to read
    # through the data set to reach the required offset. So make that only ids need to be read,
    # and then do the main select from the limited id list.
-   # TODO could speed this up by have a query thread which runs the next page-query while
-   # the current one is being written/compressed.
    # select * from massive as full
    # inner join (select id from massive order by whatever limit m, n) limit
    # on full.id = limit.id
@@ -144,7 +144,8 @@ class DbPump
      0.step(table_dataset.count, page_size).each do |offset|
        limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
        page = table_dataset.join( limit_dataset, Hash[ primary_keys.map{|f| [f,f]} ] ).order( *primary_keys ).qualify(table_name)
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{offset}" }
+       logger.debug{ page.sql }
        page.each &encode_block
      end
    end
@@ -162,13 +163,14 @@ class DbPump
      # bigger than max for the last page
      (min..max).step(page_size).each do |offset|
        page = table_dataset.where( id: offset...(offset + page_size) )
-       logger.info page.sql
+       logger.info{ "#{__method__} #{table_name} #{offset}" }
+       logger.debug{ page.sql }
        page.each &encode_block
      end
    end

    def stream_dump( &encode_block )
-     logger.info "using result set streaming"
+     logger.debug{ "using result set streaming" }

      # I want to output progress every page_size records,
      # without doing a records_count % page_size every iteration.
@@ -183,18 +185,23 @@ class DbPump
            records_count += 1
          end
        ensure
-         logger.info "#{records_count} from #{table_dataset.sql}"
+         logger.info{ "#{__method__} #{table_name} #{records_count}" if records_count < page_size }
+         logger.debug{ " from #{table_dataset.sql}" }
        end
      end
    end

    # Dump the serialization of the table to the specified io.
+   #
    # TODO need to also dump a first row containing useful stuff:
    # - source table name
    # - number of rows
    # - source db url
    # - permissions?
    # These should all be in one object that can be Marshall.load-ed easily.
+   #
+   # TODO could speed this up by have a query thread which runs the next page-query while
+   # the current one is being written/compressed.
    def dump
      _dump do |row|
        codec.encode( row.values, io ) unless dry_run?
@@ -239,21 +246,20 @@ class DbPump

      return unless dump_matches_columns?( row_enum, columns )

-     logger.info{ "inserting to #{table_name} #{columns.inspect}" }
+     logger.info{ "#{__method__} inserting to #{table_name} from #{start_row}" }
+     logger.debug{ " #{columns.inspect}" }
      rows_restored = 0

      if start_row != 0
-       logger.info{ "skipping #{start_row} rows from #{filename}" }
+       logger.debug{ "skipping #{start_row} rows from #{filename}" }
        start_row.times do |i|
          row_enum.next
-         logger.info{ "skipped #{i} from #{filename}" } if i % page_size == 0
+         logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
        end
-       logger.info{ "skipped #{start_row} from #{filename}" }
+       logger.debug{ "skipped #{start_row} from #{filename}" }
        rows_restored += start_row
      end

-     logger.info{ "inserting to #{table_name} from #{rows_restored}" }
-
      loop do
        db.transaction do
          begin
@@ -267,20 +273,20 @@ class DbPump
              rows_restored += 1
            end
          rescue StopIteration
-           # er reached the end of the inout stream.
+           # reached the end of the inout stream.
            # So commit this transaction, and then re-raise
            # StopIteration to get out of the loop{} statement
            db.after_commit{ raise StopIteration }
          end
-         logger.info{ "#{table_name} inserted #{rows_restored}" }
        end
      end
-     logger.info{ "#{table_name} done. Inserted #{rows_restored}." }
+     logger.info{ "#{__method__} #{table_name} done. Inserted #{rows_restored}." }
      rows_restored
    end

-   # Enumerate through the given io at its current position
-   # TODO don't check for io.eof here, leave that to the codec
+   # Enumerate through the given io at its current position.
+   # Can raise StopIteration (ie when eof is not detected)
+   # MAYBE don't check for io.eof here, leave that to the codec
    def each_row
      return enum_for(__method__) unless block_given?
      yield codec.decode( io ) until io.eof?
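
Taken together, dump and restore can round-trip a table through any IO. A hedged sketch, assuming restore only needs the filename: keyword seen later in this diff (used for log messages) and that the destination table already exists; src_db and dst_db stand in for real Sequel connections:

    require 'stringio'
    require 'wyrm/pump'

    buffer = StringIO.new

    # serialise :widgets from the source db into the buffer
    Wyrm::Pump.new( db: src_db, table_name: :widgets, io: buffer ).dump

    # replay the same rows into the destination db
    buffer.rewind
    Wyrm::Pump.new( db: dst_db, table_name: :widgets, io: buffer ).restore filename: 'widgets.dbp'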
@@ -1,6 +1,7 @@
- require 'wyrm/db_pump'
+ require 'wyrm/pump'
+ require 'wyrm/module'

- module PumpMaker
+ module Wyrm::PumpMaker
    def call_or_self( maybe_callable )
      if maybe_callable.respond_to? :call
        maybe_callable.call( self )
@@ -10,13 +11,18 @@ module PumpMaker
    end

    def make_pump( db, pump_thing )
-     call_or_self(pump_thing) || DbPump.new( db: db )
+     call_or_self(pump_thing) || Pump.new( db: db )
    end

    def maybe_deebe( db_or_string )
      case db_or_string
      when String
-       Sequel.connect db_or_string
+       begin
+         Sequel.connect db_or_string
+       rescue Sequel::AdapterNotFound
+         puts "\nCan't find db driver for #{db_or_string}. It might work to do\n\n gem install #{db_or_string.split(?:).first}\n\n"
+         exit(1)
+       end
      when Sequel::Database
        db_or_string
      else
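
maybe_deebe accepts either a connection URL or an existing Sequel::Database, and now exits with an install hint instead of a bare Sequel::AdapterNotFound when the driver gem is missing. A hedged sketch of both call styles; ConnectsThings and the URL are illustrative:

    require 'sequel'
    require 'wyrm/pump_maker'

    class ConnectsThings
      include Wyrm::PumpMaker
    end

    helper = ConnectsThings.new

    # a URL string is connected for you...
    db = helper.maybe_deebe 'sqlite://widgets.sqlite3'

    # ...while an existing connection is passed straight through
    same_db = helper.maybe_deebe db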
@@ -1,28 +1,44 @@
- require 'logger'
+ require 'ostruct'
+ require 'pathname'
+
+ require 'wyrm/logger'
+ require 'wyrm/module'
  require 'wyrm/pump_maker'
+ require 'wyrm/schema_tools'

  # Load a schema from a set of dump files (from DumpSchema)
  # and restore the table data.
  # dst_db = Sequel.connect "postgres://localhost:5454/lots"
  # rs = RestoreSchema.new dst_db, '/var/data/lots'
- # rs.create
- # rs.restore_tables
- class RestoreSchema
+ # rs.call
+ # TODO the problem with lazy loading the schema files is that
+ # errors in indexes and foreign keys will only be picked up at the
+ # end of they probably lengthy table restore process.
+ # TODO check if table has been restored already, and has the correct rows,
+ class Wyrm::Restore
    include PumpMaker
+   include SchemaTools
+   include Wyrm::Logger

-   def initialize( dst_db, container, pump: nil )
+   def initialize( container, dst_db, pump: nil, drop_tables: false )
      @container = Pathname.new container
      @dst_db = maybe_deebe dst_db
      @pump = make_pump( @dst_db, pump )
+
+     options.drop_tables = drop_tables
    end

    attr_reader :pump
    attr_reader :dst_db
    attr_reader :container

+   def options
+     @options ||= OpenStruct.new
+   end
+
    # sequel wants migrations numbered, but it's a bit of an annoyance for this.
    def find_single( glob )
-     candidates =Pathname.glob container + glob
+     candidates = Pathname.glob container + glob
      raise "too many #{candidates.inspect} for #{glob}" unless candidates.size == 1
      candidates.first
    end
@@ -45,38 +61,13 @@ class RestoreSchema
      @schema_migration = nil
    end

-   def logger
-     @logger ||= Logger.new STDERR
-   end
-
-   # create indexes and foreign keys, and reset sequences
-   def index
-     logger.info "creating indexes"
-     eval( index_migration ).apply dst_db, :up
-     logger.info "creating foreign keys"
-     eval( fk_migration ).apply dst_db, :up
-
-     if dst_db.database_type == :postgres
-       logger.info "reset primary key sequences"
-       dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
-       logger.info "Primary key sequences reset successfully"
-     end
-   end
-
-   # create the destination schema
-   def create
-     logger.info "creating tables"
-     eval( schema_migration ).apply dst_db, :up
-   end
-
    # assume the table name is the base name of table_file pathname
    def restore_table( table_file )
      logger.info "restoring from #{table_file}"
      pump.table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
-     # TODO check if table has been restored already, and has the correct rows,
      open_bz2 table_file do |io|
        pump.io = io
-       pump.restore
+       pump.restore filename: table_file
      end
    end

@@ -95,8 +86,24 @@ class RestoreSchema
      IO.popen "pbzip2 -d -c #{table_file}", &block
    end

+   def table_files
+     Pathname.glob container + '*.dbp.bz2'
+   end
+
    def restore_tables
-     table_files = Pathname.glob container + '*.dbp.bz2'
      table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_table table_file}
    end
+
+   def table_names
+     table_files.map do |path|
+       path.basename.to_s.split(?.)[0...-2].last.to_sym
+     end
+   end
+
+   def call
+     drop_tables(table_names) if options.drop_tables
+     create_tables
+     restore_tables
+     create_indexes
+   end
  end
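
The old create / index / restore_tables sequence is now wrapped up in call, with optional dropping of existing tables first. A hedged usage sketch; the require path, container directory and URL are examples (the container is expected to hold the *.dbp.bz2 files written by the dump side):

    require 'wyrm/restore'   # assumed path for Wyrm::Restore

    # note the new argument order: container first, then destination db
    restore = Wyrm::Restore.new '/var/data/lots', 'postgres://localhost:5454/lots', drop_tables: true
    restore.call   # drop_tables, create_tables, restore_tables, create_indexes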
@@ -0,0 +1,91 @@
+ require 'fastandand'
+ Sequel.extension :migration
+ require 'wyrm/module'
+
+ # needs dst_db for mutate operations
+ # and src_db for fetch operations
+ # src_db must have extension(:schema_dumper)
+ module Wyrm::SchemaTools
+   # some includers will need to provide a different implementation for this.
+   def same_db
+     respond_to?( :dst_db ) && respond_to?( :src_db ) && dst_db.andand.database_type == src_db.andand.database_type
+   end
+
+   def schema_migration
+     @schema_migration ||= src_db.dump_schema_migration(:indexes=>false, :same_db => same_db)
+   end
+
+   def index_migration
+     @index_migration ||= src_db.dump_indexes_migration(:same_db => same_db)
+   end
+
+   def fk_migration
+     @fk_migration ||= src_db.dump_foreign_key_migration(:same_db => same_db)
+   end
+
+   def drop_table_options
+     @drop_table_options ||=
+       begin
+         if dst_db.database_type == :postgres
+           {cascade: true}
+         else
+           {}
+         end
+       end
+   end
+
+   # Delete given tables.
+   # Recurse if there are foreign keys preventing table deletion.
+   # This implementation will fail for tables with mutual foreign keys.
+   # TODO maybe this should use the schema down migration?
+   def drop_tables( tables )
+     foreign_keyed_tables = []
+     tables.each do |table_name|
+       begin
+         logger.debug "dropping #{table_name}"
+         dst_db.drop_table? table_name, drop_table_options
+
+       rescue Sequel::ForeignKeyConstraintViolation => ex
+         foreign_keyed_tables << table_name
+
+       rescue Sequel::DatabaseError => ex
+         # Mysql2::Error: Cannot delete or update a parent row: a foreign key constraint fails
+         # SQLite3::ConstraintException: FOREIGN KEY constraint failed
+         if ex.message =~ /foreign key constraint fail/i
+           foreign_keyed_tables << table_name
+         else
+           raise
+         end
+
+       end
+     end
+
+     # this should be temporary
+     if tables.sort == foreign_keyed_tables.sort
+       raise "can't remove #{tables.inspect} because they have mutual foreign keys"
+     end
+
+     # recursively delete tables
+     drop_tables foreign_keyed_tables.shuffle unless foreign_keyed_tables.empty?
+   end
+
+   def create_tables
+     logger.info "creating tables"
+     eval( schema_migration ).apply dst_db, :up
+   end
+
+   def create_indexes
+     # create indexes and foreign keys, and reset sequences
+     logger.info "creating indexes"
+     eval( index_migration ).apply dst_db, :up
+
+     logger.info "creating foreign keys"
+     eval( fk_migration ).apply dst_db, :up
+
+     if dst_db.database_type == :postgres
+       logger.info "reset primary key sequences"
+       dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
+       logger.info "Primary key sequences reset successfully"
+     end
+   end
+ end
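
Wyrm::SchemaTools only assumes its includer provides dst_db, src_db (with the schema_dumper extension, for the *_migration methods) and a logger. A hedged sketch of a minimal includer that copies a schema between two connections; SchemaCopier and the URLs are illustrative:

    require 'sequel'
    require 'wyrm/logger'
    require 'wyrm/schema_tools'

    class SchemaCopier
      include Wyrm::Logger
      include Wyrm::SchemaTools

      attr_reader :src_db, :dst_db

      def initialize( src_db, dst_db )
        @src_db, @dst_db = src_db, dst_db
        @src_db.extension :schema_dumper   # needed by schema_migration and friends
      end

      def call
        create_tables    # apply the dumped schema migration to dst_db
        create_indexes   # then indexes, foreign keys and sequence resets
      end
    end

    SchemaCopier.new( Sequel.connect('sqlite://src.sqlite3'), Sequel.connect('sqlite://dst.sqlite3') ).call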