wyrm 0.4.1 → 0.4.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: da207f92151b080d31039a364c1a2c50022f01ff
- data.tar.gz: ddf38f48b42597ed08671cb67fadd8457e425e07
+ metadata.gz: c40184e0e1f6175ad0447494ff5bf367c39292db
+ data.tar.gz: c7b927a63887f83ba35b6c3be3c11fb412a2212a
  SHA512:
- metadata.gz: 03e699a00d14fa7baacc286b886cf35074766b7b6b3b8e6e10fde08779ded7fda4930f9666bad95274bf773f8fe33f1916f3836414e98c188c81963b3a01459c
- data.tar.gz: 0052a0b096e62662223f9e4a9da2cfd79e2908f033f83a7a9201463aac3ec9f56407299b68e461b9b850f39ea8c3da5e53c48ad7510561d680b601a38ce739ca
+ metadata.gz: cd762e971e8fb35f4147b4657b5fbb67fb1de1ef26ec4d8ef7af2dac2a9f6532cf8bce4e02587021e261e302e133d6312caad46cf6e06924d3701a25dc8bb2a1
+ data.tar.gz: 7c38e0d0f186e78e58639220b21755b219e85ef15b3acbe8c920e145c70f1715702b4c4fd060abebad767732469296dd855af198349b27be97d70fd419060e47
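
The digests above cover the metadata.gz and data.tar.gz entries stored inside the .gem archive. A minimal verification sketch, assuming a locally downloaded wyrm-0.4.2.gem (filename and approach are illustrative, not part of this release):

    # Recompute the SHA512 digests listed in checksums.yaml for a local gem file.
    require 'rubygems/package'
    require 'digest'

    File.open('wyrm-0.4.2.gem', 'rb') do |gem_io|
      tar = Gem::Package::TarReader.new(gem_io)
      tar.each do |entry|
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
        puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
      end
    end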
@@ -1,4 +1,6 @@
  language: ruby
- rvm: []
- # - 2.3 not supported as of 16-Mar-2016. srsly, 3 months after release
+ rvm:
+ - 2.3.0
+ - 2.3.1
+ # - jruby-9.1.0.0 fails because of db drivers
  script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -1,32 +1,24 @@
- raise "You need >= ruby-2.3 (or maybe a Queue with close would work)" unless RUBY_VERSION >= '2.3.0'
+ source 'https://rubygems.org'

- # TODO this is for older versions of bundler
- def from_gemrc
- # auto-load from ~/.gemrc
- home_gemrc = Pathname('~/.gemrc').expand_path
-
- if home_gemrc.exist?
- require 'yaml'
- # use all the sources specified in .gemrc
- YAML.load_file(home_gemrc)[:sources]
- end
- end
-
- # Use the gemrc source if defined, unless CANON is set,
- # otherwise just use the default.
- def preferred_sources
- rv = from_gemrc unless eval(ENV['CANON']||'')
- rv ||= []
- rv << 'http://rubygems.org' if rv.empty?
- rv
- end
-
- preferred_sources.each{|src| source src}
+ raise "You need >= ruby-2.3 for wyrm" unless RUBY_VERSION >= '2.3.0'

  # Specify your gem's dependencies in wyrm.gemspec
  gemspec

- if Pathname('/usr/include/mysql').exist?
- # version is for mysql streaming result sets
- gem "mysql2", '>= 0.3.12'
+ platforms :ruby do
+ gem 'pg'
+ gem 'sequel_pg'
+ gem 'sqlite3'
+ gem 'pry-byebug'
+
+ if Pathname('/usr/include/mysql').exist?
+ # version is for mysql streaming result sets
+ gem "mysql2", '>= 0.3.12'
+ end
+ end
+
+ platforms :jruby do
+ # gem "pg"
+ gem 'jdbc-sqlite3'
+ gem 'jdbc-postgres'
  end
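
The new platform groups line up with the Sequel adapters each interpreter uses; roughly something like this (a sketch only, connection URLs are placeholders):

    require 'sequel'

    # pg/sqlite3 back Sequel's native adapters on MRI,
    # while the jdbc-* gems back the jdbc adapters on JRuby.
    db =
      if RUBY_ENGINE == 'jruby'
        Sequel.connect('jdbc:postgresql://localhost/mydb')
      else
        Sequel.connect('postgres://localhost/mydb')
      end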
@@ -1,3 +1,7 @@
+ == 0.4.2
+ * special case for jruby closing popen stream
+ * use modules better
+
  == 0.4.1
  * Improve docs and examples
  * make pbzip2 somewhat configurable.
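
The jruby item refers to the write_through_bz2 change further down in this diff; distilled, the shutdown difference looks roughly like this (a sketch, the command is illustrative):

    # On MRI, closing just the write end signals EOF to the child process;
    # on JRuby (observed around 9.1.x) an explicit flush and full close is used instead.
    zio = IO.popen('pbzip2 -c', 'r+')
    zio.write 'some data'

    if RUBY_ENGINE == 'jruby'
      zio.flush
      zio.close
    else
      zio.close_write
    end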
data/README.md CHANGED
@@ -30,6 +30,8 @@ Wyrm because:

  ## Dependencies

+ Ruby >= 2.3.0, for Queue#close
+
  You must have a working
  [pbzip2](http://compression.ca/pbzip2/ "Will use all your cores")
  on your path. If you really have to use something else,
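
The new Ruby floor is there because wyrm relies on Queue#close, which only arrived in Ruby 2.3. A minimal illustration of the semantics involved (not wyrm code):

    # Queue#close: once closed, pop drains the remaining items and then
    # returns nil, letting a consumer thread finish cleanly.
    queue = Queue.new

    consumer = Thread.new do
      while (item = queue.pop)
        puts "got #{item}"
      end
    end

    3.times { |i| queue.push i }
    queue.close
    consumer.join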
@@ -6,104 +6,129 @@ require 'wyrm/schema_tools'
  require 'wyrm/logger'

  # Dump a schema and compressed data from a db to a set of files
- # src_db = Sequel.connect "postgres://localhost:5454/lots"
- # ds = DumpSchema.new src_db, Pathname('/var/data/lots')
- # ds.call
+ #
+ # Dump["postgres://localhost:5454/lots", '/var/data/lots']
+ #
  # TODO possibly use Gem::Package::TarWriter to write tar files
- class Wyrm::Dump
- include Wyrm::PumpMaker
- include Wyrm::SchemaTools
- include Wyrm::Logger
-
- def initialize( src_db, container = nil, pump: nil )
- @container = Pathname.new container || '.'
- raise "#{@container} does not exist" unless @container.exist?
+ module Wyrm
+ class Dump
+ include Wyrm::PumpMaker
+ include Wyrm::SchemaTools
+ include Wyrm::Logger
+
+ def self.[]( *args )
+ new(*args).call
+ end

- @src_db = maybe_deebe src_db
- @pump = make_pump( @src_db, pump )
+ def call
+ dump_schema
+ dump_tables
+ dump_indexes
+ end

- @src_db.extension :schema_dumper
- end
+ def initialize( src_db, container = nil, pump: nil )
+ @container = Pathname.new container || '.'
+ raise "#{@container} does not exist" unless @container.exist?

- attr_reader :src_db, :container, :pump
+ @src_db = maybe_deebe src_db
+ @pump = make_pump( @src_db, pump )

- def same_db; false end
+ @src_db.extension :schema_dumper
+ end

- def numbering
- @numbering ||= '000'
- end
+ attr_reader :src_db, :container, :pump

- def dump_schema
- (container + "#{numbering.next!}_schema.rb").open('w') do |io|
- io.write schema_migration
- end
- end
+ def same_db; false end

- def dump_indexes
- (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
- io.write index_migration
+ def numbering
+ @numbering ||= '000'
  end

- (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
- io.write fk_migration
+ def dump_table_schemas( *tables )
+ (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+ tables.each do |table|
+ logger.debug "schema for #{table}"
+ io.puts table_migration table
+ end
+ end
  end
- end

- def write_through_bz2( pathname )
- fio = pathname.open('w')
- # open subprocess in read-write mode
- zio = IO.popen( STREAM_COMP, 'r+' )
- copier = Thread.new do
- begin
- IO.copy_stream zio, fio
- logger.debug "finished stream copy"
- ensure
- fio.close
+ def dump_schema
+ (container + "#{numbering.next!}_schema.rb").open('w') do |io|
+ io.write schema_migration
  end
  end

- yield zio
+ def dump_indexes
+ (container + "#{numbering.next!}_indexes.rb").open('w') do |io|
+ io.write index_migration
+ end

- # signal the copier thread to stop
- zio.close_write
- logger.debug 'finished dumping'
+ (container + "#{numbering.next!}_foreign_keys.rb").open('w') do |io|
+ io.write fk_migration
+ end
+ end

- # wait for copier thread to finish
- copier.join
- logger.debug 'stream copy thread finished'
- ensure
- zio.close unless zio.closed?
- fio.close unless fio.closed?
- end
+ def write_through_bz2( pathname )
+ fio = pathname.open('w')
+ # open subprocess in read-write mode
+ zio = IO.popen( STREAM_COMP, 'r+' )
+ copier = Thread.new do
+ begin
+ IO.copy_stream zio, fio
+ logger.debug "finished stream copy"
+ ensure
+ fio.close
+ end
+ end

- def dump_table( table_name, &io_block )
- pump.table_name = table_name
- if pump.table_dataset.empty?
- logger.info "No records in #{table_name}"
- return
+ # block receiving zio will write to it.
+ yield zio
+
+ # signal the copier thread to stop
+ logger.debug 'flushing'
+ if RUBY_ENGINE == 'jruby'
+ # seems to be required for jruby, at least 9.1.2.0
+ logger.debug 'jruby flushing'
+ zio.flush
+ logger.debug 'jruby close'
+ zio.close
+ else
+ zio.close_write
+ end
+ logger.debug 'finished dumping'
+
+ # wait for copier thread to finish
+ copier.join
+ logger.debug 'stream copy thread finished'
+ ensure
+ zio.close if zio && !zio.closed?
+ fio.close if fio && !fio.closed?
  end

- filename = container + "#{table_name}.dbp.bz2"
- logger.info "dumping #{table_name} to #{filename}"
+ def dump_table( table_name, &io_block )
+ pump.table_name = table_name
+ if pump.table_dataset.empty?
+ logger.info "No records in #{table_name}"
+ return
+ end

- write_through_bz2 filename do |zio|
- # generate the dump
- pump.io = zio
- pump.dump
- end
- rescue
- logger.error "failed dumping #{table_name}: #{$!.message}"
- end
+ filename = container + "#{table_name}.dbp.bz2"
+ logger.info "dumping #{table_name} to #{filename}"

- def dump_tables
- src_db.tables.each do |table_name|
- dump_table table_name
+ write_through_bz2 filename do |zio|
+ # generate the dump
+ pump.io = zio
+ pump.dump
+ end
+ rescue
+ logger.error "failed dumping #{table_name}: #{$!.message}"
  end
- end

- def call
- dump_schema
- dump_tables
- dump_indexes
+ def dump_tables
+ src_db.tables.each do |table_name|
+ dump_table table_name
+ end
+ end
  end
  end
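
As the updated comment shows, 0.4.2 adds a class-level shorthand via def self.[]. Both calls below should be equivalent (the connection URL, path and require line are illustrative):

    require 'wyrm/dump' # require path assumed

    # one-shot shorthand added in 0.4.2
    Wyrm::Dump['postgres://localhost:5454/lots', '/var/data/lots']

    # equivalent long form
    Wyrm::Dump.new('postgres://localhost:5454/lots', '/var/data/lots').call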
@@ -4,7 +4,7 @@ module Wyrm
  module Logger
  def logger
  @logger ||= ::Logger.new( STDERR ).tap do |lgr|
- lgr.level = ::Logger::INFO
+ lgr.level = ::Logger::DEBUG
  end
  end
  end
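
With the default level now DEBUG, callers who want quieter output can hand in their own logger; Pump accepts one at construction and via attr_writer :logger (db and table name below are placeholders):

    require 'logger'

    quiet = ::Logger.new(STDERR).tap { |lgr| lgr.level = ::Logger::INFO }
    pump = Wyrm::Pump.new(db: db, table_name: :things, logger: quiet)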
@@ -8,304 +8,306 @@ require 'wyrm/module'
  # TODO need to version the dumps, or something like that.
  # TODO looks like io should belong to codec. Hmm. Not sure.
  # TODO table_name table_dataset need some thinking about. Dataset would encapsulate both. But couldn't change db then, and primary_keys would be hard.
- class Wyrm::Pump
- def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
- self.codec = codec
- self.db = db
- self.table_name = table_name
- self.io = io
- self.page_size = page_size
- self.dry_run = dry_run
- self.logger = logger
- yield self if block_given?
- end
+ module Wyrm
+ class Pump
+ def initialize( db: nil, table_name: nil, io: STDOUT, codec: :marshal, page_size: 10000, dry_run: false, logger: nil )
+ self.codec = codec
+ self.db = db
+ self.table_name = table_name
+ self.io = io
+ self.page_size = page_size
+ self.dry_run = dry_run
+ self.logger = logger
+ yield self if block_given?
+ end

- include Wyrm::Logger
- attr_writer :logger
+ include Wyrm::Logger
+ attr_writer :logger

- attr_accessor :io, :page_size, :dry_run
- def dry_run?; dry_run; end
+ attr_accessor :io, :page_size, :dry_run
+ def dry_run?; dry_run; end

- # These are affected by cached values
- attr_reader :db, :table_name
+ # These are affected by cached values
+ attr_reader :db, :table_name

- def invalidate_cached_members
- @primary_keys = nil
- @table_dataset = nil
- end
+ def invalidate_cached_members
+ @primary_keys = nil
+ @table_dataset = nil
+ end

- def table_name=( name_sym )
- invalidate_cached_members
- @table_name = name_sym
- end
+ def table_name=( name_sym )
+ invalidate_cached_members
+ @table_name = name_sym
+ end

- def db=( other_db )
- invalidate_cached_members
+ def db=( other_db )
+ invalidate_cached_members

- @db = other_db
- return unless other_db
+ @db = other_db
+ return unless other_db

- # add extensions
- @db.extension :pagination
+ # add extensions
+ @db.extension :pagination

- # turn on postgres streaming if available
- # also gets called for non-postgres dbs, but that seems to be fine.
- if defined?( Sequel::Postgres ) && @db.is_a?(Sequel::Postgres::Database) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
- @db.extension :pg_streaming
- logger.info "Streaming for #{@db.uri}"
- else
- logger.info "No streaming for #{@db.uri}"
+ # turn on postgres streaming if available
+ # also gets called for non-postgres dbs, but that seems to be fine.
+ if defined?( Sequel::Postgres::Database ) && @db.is_a?(Sequel::Postgres::Database) && defined?(Sequel::Postgres.supports_streaming?) && Sequel::Postgres.supports_streaming?
+ @db.extension :pg_streaming
+ logger.info "Streaming for #{@db.uri}"
+ else
+ logger.info "No streaming for #{@db.uri}"
+ end
  end
- end

- # return an object that responds to ===
- # which returns true if ==='s parameter
- # responds to all the methods
- def self.quacks_like( *methods )
- @quacks_like ||= {}
- @quacks_like[methods] ||= lambda do |inst|
- methods.all?{|m| inst.respond_to? m}
+ # return an object that responds to ===
+ # which returns true if ==='s parameter
+ # responds to all the methods
+ def self.quacks_like( *methods )
+ @quacks_like ||= {}
+ @quacks_like[methods] ||= lambda do |inst|
+ methods.all?{|m| inst.respond_to? m}
+ end
  end
- end

- def quacks_like( *methods )
- self.class.quacks_like( *methods )
- end
+ def quacks_like( *methods )
+ self.class.quacks_like( *methods )
+ end

- def codec=( codec_thing )
- @codec =
- case codec_thing
- when :yaml; YamlCodec.new
- when :marshal; MarshalCodec.new
- when Class
- codec_thing.new
- when quacks_like(:encode,:decode)
- codec_thing
- else
- raise "unknown codec #{codec_thing.inspect}"
+ def codec=( codec_thing )
+ @codec =
+ case codec_thing
+ when :yaml; YamlCodec.new
+ when :marshal; MarshalCodec.new
+ when Class
+ codec_thing.new
+ when quacks_like(:encode,:decode)
+ codec_thing
+ else
+ raise "unknown codec #{codec_thing.inspect}"
+ end
  end
- end

- attr_reader :codec
+ attr_reader :codec

- class MarshalCodec
- def encode( obj, io )
- Marshal.dump obj, io
- end
+ class MarshalCodec
+ def encode( obj, io )
+ Marshal.dump obj, io
+ end

- def decode( io, &block )
- obj = Marshal.load(io)
- yield obj if block_given?
- obj
+ def decode( io, &block )
+ obj = Marshal.load(io)
+ yield obj if block_given?
+ obj
+ end
  end
- end

- class YamlCodec
- def encode( obj, io )
- YAML.dump obj, io
- end
+ class YamlCodec
+ def encode( obj, io )
+ YAML.dump obj, io
+ end

- def decode( io, &block )
- obj = YAML.load(io)
- yield obj if block_given?
- obj
+ def decode( io, &block )
+ obj = YAML.load(io)
+ yield obj if block_given?
+ obj
+ end
  end
- end

- def primary_keys
- # each_with_object([]){...} is only faster for < 3 items in 100000
- @primary_keys ||= db.schema(table_name).map{|name,column_info| name if column_info[:primary_key]}.compact
- end
+ def primary_keys
+ # each_with_object([]){...} is only faster for < 3 items in 100000
+ @primary_keys ||= db.schema(table_name).map{|name,column_info| name if column_info[:primary_key]}.compact
+ end

- def table_dataset
- @table_dataset ||= db[table_name.to_sym]
- end
+ def table_dataset
+ @table_dataset ||= db[table_name.to_sym]
+ end

- # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
- def paginated_dump( &encode_block )
- records_count = 0
- table_dataset.order(*primary_keys).each_page(page_size) do |page|
- logger.info "#{__method__} #{table_name} #{records_count}"
- logger.debug page.sql
- page.each &encode_block
- records_count += page_size
+ # Use limit / offset. Last fallback if there are no keys (or a compound primary key?).
+ def paginated_dump( &encode_block )
+ records_count = 0
+ table_dataset.order(*primary_keys).each_page(page_size) do |page|
+ logger.info "#{__method__} #{table_name} #{records_count}"
+ logger.debug page.sql
+ page.each &encode_block
+ records_count += page_size
+ end
  end
- end

- # Use limit / offset, but not for all fields.
- # The idea is that large offsets are expensive in the db because the db server has to read
- # through the data set to reach the required offset. So make that only ids need to be read,
- # and then do the main select from the limited id list.
- # select * from massive as full
- # inner join (select id from massive order by whatever limit m, n) limit
- # on full.id = limit.id
- # order by full.whatever
- # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
- def inner_dump( &encode_block )
- # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
- on_conditions = primary_keys.map{|f| [f,f]}.to_h
- (0..table_dataset.count).step(page_size).each do |offset|
- limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
- page = table_dataset.join( limit_dataset, on_conditions ).order( *primary_keys ).qualify(table_name)
- logger.info "#{__method__} #{table_name} #{offset}"
- logger.debug page.sql
- page.each &encode_block
+ # Use limit / offset, but not for all fields.
+ # The idea is that large offsets are expensive in the db because the db server has to read
+ # through the data set to reach the required offset. So make that only ids need to be read,
+ # and then do the main select from the limited id list.
+ # select * from massive as full
+ # inner join (select id from massive order by whatever limit m, n) limit
+ # on full.id = limit.id
+ # order by full.whatever
+ # http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
+ def inner_dump( &encode_block )
+ # could possibly overrride Dataset#paginate(page_no, page_size, record_count=nil)
+ on_conditions = primary_keys.map{|f| [f,f]}.to_h
+ (0..table_dataset.count).step(page_size).each do |offset|
+ limit_dataset = table_dataset.select( *primary_keys ).limit( page_size, offset ).order( *primary_keys )
+ page = table_dataset.join( limit_dataset, on_conditions ).order( *primary_keys ).qualify(table_name)
+ logger.info "#{__method__} #{table_name} #{offset}"
+ logger.debug page.sql
+ page.each &encode_block
+ end
  end
- end

- # Selects pages by a range of ids, using >= and <.
- # Use this for integer pks
- def min_max_dump( &encode_block )
- # select max(id), min(id) from table
- # and then split that up into 10000 size chunks.
- # Not really important if there aren't exactly 10000
- min, max = table_dataset.select{[min(id), max(id)]}.first.values
- return unless min && max
-
- # will always include the last item because page_size will be
- # bigger than max for the last page
- (min..max).step(page_size).each do |offset|
- page = table_dataset.where( id: offset...(offset + page_size) )
- logger.info "#{__method__} #{table_name} #{offset}"
- logger.debug page.sql
- page.each &encode_block
+ # Selects pages by a range of ids, using >= and <.
+ # Use this for integer pks
+ def min_max_dump( &encode_block )
+ # select max(id), min(id) from table
+ # and then split that up into 10000 size chunks.
+ # Not really important if there aren't exactly 10000
+ min, max = table_dataset.select{[min(id), max(id)]}.first.values
+ return unless min && max
+
+ # will always include the last item because page_size will be
+ # bigger than max for the last page
+ (min..max).step(page_size).each do |offset|
+ page = table_dataset.where( id: offset...(offset + page_size) )
+ logger.info "#{__method__} #{table_name} #{offset}"
+ logger.debug page.sql
+ page.each &encode_block
+ end
  end
- end

- def stream_dump( &encode_block )
- logger.info "using result set streaming"
-
- # I want to output progress every page_size records,
- # without doing a records_count % page_size every iteration.
- # So define an external enumerator
- # TODO should really performance test the options here.
- records_count = 0
- enum = table_dataset.stream.enum_for
- loop do
- begin
- page_size.times do
- encode_block.call enum.next
- records_count += 1
+ def stream_dump( &encode_block )
+ logger.info "using result set streaming"
+
+ # I want to output progress every page_size records,
+ # without doing a records_count % page_size every iteration.
+ # So define an external enumerator
+ # TODO should really performance test the options here.
+ records_count = 0
+ enum = table_dataset.stream.enum_for
+ loop do
+ begin
+ page_size.times do
+ encode_block.call enum.next
+ records_count += 1
+ end
+ ensure
+ logger.info "#{__method__} #{table_name} #{records_count}" if records_count < page_size
+ logger.debug " #{records_count} from #{table_dataset.sql}"
  end
- ensure
- logger.info "#{__method__} #{table_name} #{records_count}" if records_count < page_size
- logger.debug " #{records_count} from #{table_dataset.sql}"
  end
  end
- end

- # Dump the serialization of the table to the specified io.
- #
- # TODO need to also dump a first row containing useful stuff:
- # - source table name
- # - number of rows
- # - source db url
- # - permissions?
- # These should all be in one object that can be Marshall.load-ed easily.
- #
- # TODO could speed this up by have a query thread which runs the next page-query while
- # the current one is being written/compressed.
- def dump
- _dump do |row|
- codec.encode( row.values, io ) unless dry_run?
+ # Dump the serialization of the table to the specified io.
+ #
+ # TODO need to also dump a first row containing useful stuff:
+ # - source table name
+ # - number of rows
+ # - source db url
+ # - permissions?
+ # These should all be in one object that can be Marshall.load-ed easily.
+ #
+ # TODO could speed this up by have a query thread which runs the next page-query while
+ # the current one is being written/compressed.
+ def dump
+ _dump do |row|
+ codec.encode( row.values, io ) unless dry_run?
+ end
+ ensure
+ io.flush
  end
- ensure
- io.flush
- end

- # decide which kind of paged iteration will be best for this table.
- # Return an iterator, or yield row hashes to the block
- def _dump( &encode_block )
- return enum_for(__method__) unless block_given?
- case
- when table_dataset.respond_to?( :stream )
- stream_dump &encode_block
+ # decide which kind of paged iteration will be best for this table.
+ # Return an iterator, or yield row hashes to the block
+ def _dump( &encode_block )
+ return enum_for(__method__) unless block_given?
+ case
+ when table_dataset.respond_to?( :stream )
+ stream_dump &encode_block

- when primary_keys.empty?
- paginated_dump &encode_block
+ when primary_keys.empty?
+ paginated_dump &encode_block

- when primary_keys.all?{|i| i == :id }
- min_max_dump &encode_block
+ when primary_keys.all?{|i| i == :id }
+ min_max_dump &encode_block

- else
- inner_dump &encode_block
+ else
+ inner_dump &encode_block
+ end
  end
- end

- def dump_matches_columns?( row_enum, columns )
- raise "schema mismatch" unless row_enum.peek.size == columns.size
- true
- rescue StopIteration
- # peek threw a StopIteration, so there's no data
- false
- end
+ def dump_matches_columns?( row_enum, columns )
+ raise "schema mismatch" unless row_enum.peek.size == columns.size
+ true
+ rescue StopIteration
+ # peek threw a StopIteration, so there's no data
+ false
+ end

- # start_row is zero-based
- #
- # TODO don't generate the full insert, ie leave out the fields
- # because we've already checked that the columns and the table
- # match.
- # TODO generate column names in insert, they might still work
- # if columns have been added to the db, but not the dump.
- def restore( start_row: 0, filename: 'io' )
- columns = table_dataset.columns
- row_enum = each_row
-
- return unless dump_matches_columns?( row_enum, columns )
-
- logger.info "#{__method__} inserting to #{table_name} from #{start_row}"
- logger.debug " #{columns.inspect}"
- rows_restored = 0
-
- if start_row != 0
- logger.debug{ "skipping #{start_row} rows from #{filename}" }
- start_row.times do |i|
- row_enum.next
- logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ # start_row is zero-based
+ #
+ # TODO don't generate the full insert, ie leave out the fields
+ # because we've already checked that the columns and the table
+ # match.
+ # TODO generate column names in insert, they might still work
+ # if columns have been added to the db, but not the dump.
+ def restore( start_row: 0, filename: 'io' )
+ columns = table_dataset.columns
+ row_enum = each_row
+
+ return unless dump_matches_columns?( row_enum, columns )
+
+ logger.info "#{__method__} inserting to #{table_name} from #{start_row}"
+ logger.debug " #{columns.inspect}"
+ rows_restored = 0
+
+ if start_row != 0
+ logger.debug{ "skipping #{start_row} rows from #{filename}" }
+ start_row.times do |i|
+ row_enum.next
+ logger.debug{ "skipped #{i} from #{filename}" } if i % page_size == 0
+ end
+ logger.debug{ "skipped #{start_row} from #{filename}" }
+ rows_restored += start_row
  end
- logger.debug{ "skipped #{start_row} from #{filename}" }
- rows_restored += start_row
- end

- loop do
- db.transaction do
- begin
- page_size.times do
- # This skips all the checks in the Sequel code. Basically we want
- # to generate the
- # insert into (field1,field2) values (value1,value2)
- # statement as quickly as possible.
- #
- # Uses a private method so it will need to be updated repeatedly.
- sql = table_dataset.clone( columns: columns, values: row_enum.next ).send(:_insert_sql)
- db.execute sql unless dry_run?
- rows_restored += 1
+ loop do
+ db.transaction do
+ begin
+ page_size.times do
+ # This skips all the checks in the Sequel code. Basically we want
+ # to generate the
+ # insert into (field1,field2) values (value1,value2)
+ # statement as quickly as possible.
+ #
+ # Uses a private method so it will need to be updated repeatedly.
+ sql = table_dataset.clone( columns: columns, values: row_enum.next ).send(:_insert_sql)
+ db.execute sql unless dry_run?
+ rows_restored += 1
+ end
+ rescue StopIteration
+ # reached the end of the inout stream.
+ # So commit this transaction, and then re-raise
+ # StopIteration to get out of the loop{} statement
+ db.after_commit{ raise StopIteration }
  end
- rescue StopIteration
- # reached the end of the inout stream.
- # So commit this transaction, and then re-raise
- # StopIteration to get out of the loop{} statement
- db.after_commit{ raise StopIteration }
  end
  end
+ logger.info "#{__method__} #{table_name} done. Inserted #{rows_restored}."
+ rows_restored
  end
- logger.info "#{__method__} #{table_name} done. Inserted #{rows_restored}."
- rows_restored
- end

- # Enumerate through the given io at its current position.
- # Can raise StopIteration (ie when eof is not detected)
- # MAYBE don't check for io.eof here, leave that to the codec
- def each_row
- return enum_for(__method__) unless block_given?
- yield codec.decode( io ) until io.eof?
- end
+ # Enumerate through the given io at its current position.
+ # Can raise StopIteration (ie when eof is not detected)
+ # MAYBE don't check for io.eof here, leave that to the codec
+ def each_row
+ return enum_for(__method__) unless block_given?
+ yield codec.decode( io ) until io.eof?
+ end

- # Enumerate sql insert statements from the dump
- def insert_sql_each
- return enum_for(__method__) unless block_given?
- each_row do |row|
- yield table_dataset.insert_sql( row )
+ # Enumerate sql insert statements from the dump
+ def insert_sql_each
+ return enum_for(__method__) unless block_given?
+ each_row do |row|
+ yield table_dataset.insert_sql( row )
+ end
  end
  end
  end
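
Since codec= accepts anything that quacks like encode/decode, a custom codec can be dropped in without touching the built-ins; a rough sketch (JSON lines chosen arbitrarily, db and table name are placeholders):

    require 'json'

    # Any class (or instance responding to encode/decode) is accepted by Pump#codec=.
    class JsonLinesCodec
      def encode( obj, io )
        io.puts JSON.generate(obj)
      end

      def decode( io, &block )
        obj = JSON.parse(io.readline)
        yield obj if block_given?
        obj
      end
    end

    pump = Wyrm::Pump.new(db: db, table_name: :things, codec: JsonLinesCodec)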