wyrm 0.2.1 → 0.3.0
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rvmrc +1 -1
- data/Gemfile +0 -3
- data/README.md +18 -11
- data/bin/wyrm +40 -24
- data/bin/wyrm-view +34 -0
- data/lib/wyrm.rb +3 -5
- data/lib/wyrm/cli.rb +9 -0
- data/lib/wyrm/core_extensions.rb +10 -0
- data/lib/wyrm/{dump_schema.rb → dump.rb} +22 -21
- data/lib/wyrm/hole.rb +164 -0
- data/lib/wyrm/logger.rb +11 -0
- data/lib/wyrm/module.rb +1 -0
- data/lib/wyrm/{db_pump.rb → pump.rb} +40 -34
- data/lib/wyrm/pump_maker.rb +10 -4
- data/lib/wyrm/{restore_schema.rb → restore.rb} +40 -33
- data/lib/wyrm/schema_tools.rb +91 -0
- data/lib/wyrm/version.rb +1 -1
- data/snippets/console.rb +5 -3
- data/spec/core_extensions_spec.rb +50 -0
- data/spec/hole_mouth_spec.rb +176 -0
- data/spec/pump_spec.rb +62 -0
- data/spec/schema_tools_spec.rb +201 -0
- data/wyrm.gemspec +12 -3
- metadata +135 -23
- data/lib/wyrm/other_schema.rb +0 -6
- data/lib/wyrm/transferer.rb +0 -32
data/lib/wyrm/logger.rb
ADDED
data/lib/wyrm/module.rb
ADDED
@@ -0,0 +1 @@
+module Wyrm end
data/lib/wyrm/{db_pump.rb → pump.rb}
CHANGED
@@ -1,25 +1,28 @@
 require 'sequel'
 require 'yaml'
-require 'logger'
 
-
+require 'wyrm/logger'
+require 'wyrm/module'
 
 # TODO when restoring, could use a SizeQueue to make sure the db is kept busy
 # TODO need to version the dumps, or something like that.
 # TODO looks like io should belong to codec. Hmm. Not sure.
 # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
-class DbPump
-
-  def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false )
+class Wyrm::Pump
+  def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
     self.codec = codec
     self.db = db
     self.table_name = table_name
     self.io = io
     self.page_size = page_size
     self.dry_run = dry_run
+    self.logger = logger
     yield self if block_given?
   end
 
+  include Wyrm::Logger
+  attr_writer :logger
+
   attr_accessor :io, :page_size, :dry_run
   def dry_run?; dry_run; end
 
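For orientation, constructing the renamed pump by hand now looks like this (a sketch; the database URL, table name and output file are illustrative, and the injectable logger is the new keyword in this release):

    require 'logger'
    require 'wyrm/pump'

    pump = Wyrm::Pump.new(
      db: Sequel.connect('sqlite://src.sqlite3'),   # illustrative
      table_name: :users,                           # illustrative
      io: File.open('users.dbp', 'wb'),
      codec: :marshal,
      page_size: 10_000,
      logger: Logger.new($stderr)                   # new in 0.3.0: injectable logger
    )
    pump.dump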
@@ -46,9 +49,11 @@ class DbPump
     @db.extension :pagination
 
     # turn on postgres streaming if available
-    if defined?( Sequel::Postgres ) && Sequel::Postgres.supports_streaming?
-      logger.
+    if defined?( Sequel::Postgres ) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+      logger.debug "Streaming for postgres"
       @db.extension :pg_streaming
+    else
+      logger.info "No streaming for postgres"
     end
   end
 
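The doubled-up guard is deliberate: defined? on a method call returns nil unless the receiver resolves and the method exists, so the check cannot raise NoMethodError when sequel_pg is missing or predates supports_streaming?. A quick sketch of the semantics:

    defined?( Sequel::Postgres )                      # => "constant" or nil; never raises
    defined?( Sequel::Postgres.supports_streaming? )  # => "method" only when the call would work, else nil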
@@ -57,10 +62,8 @@ class DbPump
   # responds to all the methods
   def self.quacks_like( *methods )
     @quacks_like ||= {}
-    @quacks_like[methods] ||=
-
-      methods.all?{|m| instance.respond_to? m}
-    end
+    @quacks_like[methods] ||= lambda do |inst|
+      methods.all?{|m| inst.respond_to? m}
     end
   end
 
@@ -75,7 +78,7 @@ class DbPump
     when :marshal; MarshalCodec.new
     when Class
       codec_thing.new
-    when quacks_like(
+    when quacks_like(:encode,:decode)
       codec_thing
     else
       raise "unknown codec #{codec_thing.inspect}"
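Returning a lambda from quacks_like is what lets it sit in a when clause: case dispatches through #===, and Proc#=== calls the proc with the value under test. A minimal sketch of the mechanism (the codec class here is hypothetical):

    quacks_like = lambda{|inst| [:encode, :decode].all?{|m| inst.respond_to? m} }

    class InspectCodec   # hypothetical duck-typed codec
      def encode( thing, io ) io.write thing.inspect end
      def decode( io ) io.read end
    end

    case InspectCodec.new
    when quacks_like then :codec    # Proc#=== invokes the lambda here
    else :unknown
    end                             # => :codec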
@@ -108,10 +111,6 @@ class DbPump
     end
   end
 
-  def logger
-    @logger ||= Logger.new STDERR
-  end
-
   def primary_keys
     @primary_keys ||= db.schema(table_name).select{|df| df.last[:primary_key]}.map{|df| df.first}
   end
@@ -122,9 +121,12 @@ class DbPump
 
   # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
   def paginated_dump( &encode_block )
+    records_count = 0
     table_dataset.order(*primary_keys).each_page(page_size) do |page|
-      logger.info
+      logger.info{ "#{__method__} #{table_name} #{records_count}" }
+      logger.debug{ page.sql }
       page.each &encode_block
+      records_count += page_size
     end
   end
 
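A pattern repeated through this file: bare logger calls become block form. The block, and therefore the string interpolation and any page.sql rendering, only runs when the severity is enabled. A self-contained sketch:

    require 'logger'

    logger = Logger.new($stderr)
    logger.level = Logger::INFO

    expensive = ->{ sleep 1; 'rendered SQL' }       # stands in for page.sql
    logger.debug{ "page sql: #{expensive.call}" }   # block skipped at INFO: no sleep, no interpolation
    logger.info{ "this one is emitted" }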
@@ -132,8 +134,6 @@ class DbPump
   # The idea is that large offsets are expensive in the db because the db server has to read
   # through the data set to reach the required offset. So make that only ids need to be read,
   # and then do the main select from the limited id list.
-  # TODO could speed this up by have a query thread which runs the next page-query while
-  # the current one is being written/compressed.
   # select * from massive as full
   # inner join (select id from massive order by whatever limit m, n) limit
   # on full.id = limit.id
@@ -144,7 +144,8 @@ class DbPump
     0.step(table_dataset.count, page_size).each do |offset|
       limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
       page = table_dataset.join( limit_dataset, Hash[ primary_keys.map{|f| [f,f]} ] ).order( *primary_keys ).qualify(table_name)
-      logger.info
+      logger.info{ "#{__method__} #{table_name} #{offset}" }
+      logger.debug{ page.sql }
       page.each &encode_block
     end
   end
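This hunk is the Sequel rendering of the inner-join comment above: page through a cheap id-only subselect, then join back for the full rows. Roughly, for a hypothetical massive table keyed on id:

    limit_ds = db[:massive].select(:id).order(:id).limit(10_000, 200_000)
    page = db[:massive].join( limit_ds, id: :id ).order(:id).qualify(:massive)
    # SELECT massive.* FROM massive
    # INNER JOIN (SELECT id FROM massive ORDER BY id LIMIT 10000 OFFSET 200000) AS t1
    # ON (t1.id = massive.id) ORDER BY id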
@@ -162,13 +163,14 @@ class DbPump
     # bigger than max for the last page
     (min..max).step(page_size).each do |offset|
       page = table_dataset.where( id: offset...(offset + page_size) )
-      logger.info
+      logger.info{ "#{__method__} #{table_name} #{offset}" }
+      logger.debug{ page.sql }
       page.each &encode_block
     end
   end
 
   def stream_dump( &encode_block )
-    logger.
+    logger.debug{ "using result set streaming" }
 
     # I want to output progress every page_size records,
     # without doing a records_count % page_size every iteration.
@@ -183,18 +185,23 @@ class DbPump
           records_count += 1
         end
       ensure
-        logger.info "#{
+        logger.info{ "#{__method__} #{table_name} #{records_count}" if records_count < page_size }
+        logger.debug{ " from #{table_dataset.sql}" }
       end
     end
   end
 
   # Dump the serialization of the table to the specified io.
+  #
   # TODO need to also dump a first row containing useful stuff:
   # - source table name
   # - number of rows
   # - source db url
   # - permissions?
   # These should all be in one object that can be Marshall.load-ed easily.
+  #
+  # TODO could speed this up by have a query thread which runs the next page-query while
+  # the current one is being written/compressed.
   def dump
     _dump do |row|
       codec.encode( row.values, io ) unless dry_run?
@@ -239,21 +246,20 @@ class DbPump
 
     return unless dump_matches_columns?( row_enum, columns )
 
-    logger.info{ "inserting to #{table_name} #{
+    logger.info{ "#{__method__} inserting to #{table_name} from #{start_row}" }
+    logger.debug{ " #{columns.inspect}" }
     rows_restored = 0
 
    if start_row != 0
-      logger.
+      logger.debug{ "skipping #{start_row} rows from #{filename}" }
       start_row.times do |i|
         row_enum.next
-        logger.
+        logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
       end
-      logger.
+      logger.debug{ "skipped #{start_row} from #{filename}" }
       rows_restored += start_row
     end
 
-    logger.info{ "inserting to #{table_name} from #{rows_restored}" }
-
     loop do
       db.transaction do
         begin
@@ -267,20 +273,20 @@ class DbPump
            rows_restored += 1
          end
        rescue StopIteration
-          #
+          # reached the end of the input stream.
          # So commit this transaction, and then re-raise
          # StopIteration to get out of the loop{} statement
          db.after_commit{ raise StopIteration }
        end
-        logger.info{ "#{table_name} inserted #{rows_restored}" }
      end
    end
-    logger.info{ "#{table_name} done. Inserted #{rows_restored}." }
+    logger.info{ "#{__method__} #{table_name} done. Inserted #{rows_restored}." }
    rows_restored
  end
 
-  # Enumerate through the given io at its current position
-  #
+  # Enumerate through the given io at its current position.
+  # Can raise StopIteration (ie when eof is not detected)
+  # MAYBE don't check for io.eof here, leave that to the codec
  def each_row
    return enum_for(__method__) unless block_given?
    yield codec.decode( io ) until io.eof?
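The after_commit dance in the restore loop deserves a note: Kernel#loop rescues StopIteration silently, and re-raising it from an after_commit hook means the final short page commits before the loop exits. A stripped-down sketch of the control flow (insert_row and row_enum are stand-ins for the real methods):

    loop do                                           # loop{} exits cleanly on StopIteration
      db.transaction do
        begin
          page_size.times{ insert_row row_enum.next } # #next raises StopIteration at end of stream
        rescue StopIteration
          db.after_commit{ raise StopIteration }      # re-raised only once the COMMIT has happened
        end
      end
    end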
data/lib/wyrm/pump_maker.rb
CHANGED
@@ -1,6 +1,7 @@
-require 'wyrm/
+require 'wyrm/pump'
+require 'wyrm/module'
 
-module PumpMaker
+module Wyrm::PumpMaker
   def call_or_self( maybe_callable )
     if maybe_callable.respond_to? :call
       maybe_callable.call( self )
@@ -10,13 +11,18 @@ module PumpMaker
   end
 
   def make_pump( db, pump_thing )
-    call_or_self(pump_thing) ||
+    call_or_self(pump_thing) || Pump.new( db: db )
   end
 
   def maybe_deebe( db_or_string )
     case db_or_string
     when String
-      Sequel.connect db_or_string
+      begin
+        Sequel.connect db_or_string
+      rescue Sequel::AdapterNotFound
+        puts "\nCan't find db driver for #{db_or_string}. It might work to do\n\n  gem install #{db_or_string.split(?:).first}\n\n"
+        exit(1)
+      end
     when Sequel::Database
       db_or_string
     else
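maybe_deebe normalises either form of database argument, and a missing adapter now exits with a gem install hint rather than an unhandled exception. Usage sketch (URLs illustrative):

    include Wyrm::PumpMaker

    maybe_deebe Sequel.sqlite                    # a Sequel::Database passes straight through
    maybe_deebe 'sqlite://dump_src.sqlite3'      # a String gets Sequel.connect-ed
    maybe_deebe 'oracle://prod/big'              # no driver: prints the "gem install oracle" hint, exit(1)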
data/lib/wyrm/{restore_schema.rb → restore.rb}
CHANGED
@@ -1,28 +1,44 @@
-require '
+require 'ostruct'
+require 'pathname'
+
+require 'wyrm/logger'
+require 'wyrm/module'
 require 'wyrm/pump_maker'
+require 'wyrm/schema_tools'
 
 # Load a schema from a set of dump files (from DumpSchema)
 # and restore the table data.
 #  dst_db = Sequel.connect "postgres://localhost:5454/lots"
 #  rs = RestoreSchema.new dst_db, '/var/data/lots'
-#  rs.
-#
-
+#  rs.call
+# TODO the problem with lazy loading the schema files is that
+# errors in indexes and foreign keys will only be picked up at the
+# end of the probably lengthy table restore process.
+# TODO check if table has been restored already, and has the correct rows,
+class Wyrm::Restore
   include PumpMaker
+  include SchemaTools
+  include Wyrm::Logger
 
-  def initialize(
+  def initialize( container, dst_db, pump: nil, drop_tables: false )
     @container = Pathname.new container
     @dst_db = maybe_deebe dst_db
     @pump = make_pump( @dst_db, pump )
+
+    options.drop_tables = drop_tables
   end
 
   attr_reader :pump
   attr_reader :dst_db
   attr_reader :container
 
+  def options
+    @options ||= OpenStruct.new
+  end
+
   # sequel wants migrations numbered, but it's a bit of an annoyance for this.
   def find_single( glob )
-    candidates =Pathname.glob container + glob
+    candidates = Pathname.glob container + glob
     raise "too many #{candidates.inspect} for #{glob}" unless candidates.size == 1
     candidates.first
   end
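Putting the new constructor and #call together, a restore now boils down to the following (the path and URL are the ones from the comment above):

    require 'wyrm/restore'

    restore = Wyrm::Restore.new '/var/data/lots', 'postgres://localhost:5454/lots', drop_tables: true
    restore.call   # drop_tables, then create_tables, restore_tables, create_indexes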
@@ -45,38 +61,13 @@ class RestoreSchema
     @schema_migration = nil
   end
 
-  def logger
-    @logger ||= Logger.new STDERR
-  end
-
-  # create indexes and foreign keys, and reset sequences
-  def index
-    logger.info "creating indexes"
-    eval( index_migration ).apply dst_db, :up
-    logger.info "creating foreign keys"
-    eval( fk_migration ).apply dst_db, :up
-
-    if dst_db.database_type == :postgres
-      logger.info "reset primary key sequences"
-      dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
-      logger.info "Primary key sequences reset successfully"
-    end
-  end
-
-  # create the destination schema
-  def create
-    logger.info "creating tables"
-    eval( schema_migration ).apply dst_db, :up
-  end
-
   # assume the table name is the base name of table_file pathname
   def restore_table( table_file )
     logger.info "restoring from #{table_file}"
     pump.table_name = table_file.basename.sub_ext('').sub_ext('').to_s.to_sym
-    # TODO check if table has been restored already, and has the correct rows,
     open_bz2 table_file do |io|
       pump.io = io
-      pump.restore
+      pump.restore filename: table_file
     end
   end
 
|
@@ -95,8 +86,24 @@ class RestoreSchema
|
|
95
86
|
IO.popen "pbzip2 -d -c #{table_file}", &block
|
96
87
|
end
|
97
88
|
|
89
|
+
def table_files
|
90
|
+
Pathname.glob container + '*.dbp.bz2'
|
91
|
+
end
|
92
|
+
|
98
93
|
def restore_tables
|
99
|
-
table_files = Pathname.glob container + '*.dbp.bz2'
|
100
94
|
table_files.sort_by{|tf| tf.stat.size}.each{|table_file| restore_table table_file}
|
101
95
|
end
|
96
|
+
|
97
|
+
def table_names
|
98
|
+
table_files.map do |path|
|
99
|
+
path.basename.to_s.split(?.)[0...-2].last.to_sym
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def call
|
104
|
+
drop_tables(table_names) if options.drop_tables
|
105
|
+
create_tables
|
106
|
+
restore_tables
|
107
|
+
create_indexes
|
108
|
+
end
|
102
109
|
end
|
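table_names recovers table symbols from dump filenames by peeling the two extensions; for a hypothetical dump file:

    path = Pathname.new '/var/data/lots/users.dbp.bz2'
    path.basename.to_s.split(?.)[0...-2].last.to_sym
    # "users.dbp.bz2" -> ["users", "dbp", "bz2"] -> ["users"] -> :users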
data/lib/wyrm/schema_tools.rb
ADDED
@@ -0,0 +1,91 @@
+require 'fastandand'
+Sequel.extension :migration
+require 'wyrm/module'
+
+# needs dst_db for mutate operations
+# and src_db for fetch operations
+# src_db must have extension(:schema_dumper)
+module Wyrm::SchemaTools
+  # some includers will need to provide a different implementation for this.
+  def same_db
+    respond_to?( :dst_db ) && respond_to?( :src_db ) && dst_db.andand.database_type == src_db.andand.database_type
+  end
+
+  def schema_migration
+    @schema_migration ||= src_db.dump_schema_migration(:indexes=>false, :same_db => same_db)
+  end
+
+  def index_migration
+    @index_migration ||= src_db.dump_indexes_migration(:same_db => same_db)
+  end
+
+  def fk_migration
+    @fk_migration ||= src_db.dump_foreign_key_migration(:same_db => same_db)
+  end
+
+  def drop_table_options
+    @drop_table_options ||=
+    begin
+      if dst_db.database_type == :postgres
+        {cascade: true}
+      else
+        {}
+      end
+    end
+  end
+
+  # Delete given tables.
+  # Recurse if there are foreign keys preventing table deletion.
+  # This implementation will fail for tables with mutual foreign keys.
+  # TODO maybe this should use the schema down migration?
+  def drop_tables( tables )
+    foreign_keyed_tables = []
+    tables.each do |table_name|
+      begin
+        logger.debug "dropping #{table_name}"
+        dst_db.drop_table? table_name, drop_table_options
+
+      rescue Sequel::ForeignKeyConstraintViolation => ex
+        foreign_keyed_tables << table_name
+
+      rescue Sequel::DatabaseError => ex
+        # Mysql2::Error: Cannot delete or update a parent row: a foreign key constraint fails
+        # SQLite3::ConstraintException: FOREIGN KEY constraint failed
+        if ex.message =~ /foreign key constraint fail/i
+          foreign_keyed_tables << table_name
+        else
+          raise
+        end
+
+      end
+    end
+
+    # this should be temporary
+    if tables.sort == foreign_keyed_tables.sort
+      raise "can't remove #{tables.inspect} because they have mutual foreign keys"
+    end
+
+    # recursively delete tables
+    drop_tables foreign_keyed_tables.shuffle unless foreign_keyed_tables.empty?
+  end
+
+  def create_tables
+    logger.info "creating tables"
+    eval( schema_migration ).apply dst_db, :up
+  end
+
+  def create_indexes
+    # create indexes and foreign keys, and reset sequences
+    logger.info "creating indexes"
+    eval( index_migration ).apply dst_db, :up
+
+    logger.info "creating foreign keys"
+    eval( fk_migration ).apply dst_db, :up
+
+    if dst_db.database_type == :postgres
+      logger.info "reset primary key sequences"
+      dst_db.tables.each{|t| dst_db.reset_primary_key_sequence(t)}
+      logger.info "Primary key sequences reset successfully"
+    end
+  end
+end
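SchemaTools is a bare mixin: it expects the includer to supply src_db, dst_db and a logger, as Wyrm::Restore does above. A minimal hypothetical includer that copies only the schema between two databases might look like:

    require 'wyrm/logger'
    require 'wyrm/schema_tools'

    class SchemaCopier                    # hypothetical
      include Wyrm::Logger
      include Wyrm::SchemaTools

      attr_reader :src_db, :dst_db

      def initialize( src_db, dst_db )
        @src_db, @dst_db = src_db, dst_db
        @src_db.extension :schema_dumper  # required by the dump_*_migration calls
      end

      def call
        create_tables    # schema_migration applied to dst_db
        create_indexes   # indexes, foreign keys, postgres sequence reset
      end
    end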