split_pgdump 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +55 -0
- data/bin/split_pgdump +340 -0
- metadata +52 -0
data/README
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= Tool for splitting postgresql dump in a set of files
|
2
|
+
|
3
|
+
I wish to use git or mercurial for managing my database history.
|
4
|
+
Unfortunately, every single data change force them to store whole dump again.
|
5
|
+
Even if your data has not actually changed, row order is not guaranteed to be stable.
|
6
|
+
|
7
|
+
split_pgdump splits dump in a set of small sorted files, so that git could track
|
8
|
+
changes only of actually changed data.
|
9
|
+
|
10
|
+
Also, it allows rsync to effectively transmit backup changes over the network.
|
11
|
+
|
12
|
+
== Usage
|
13
|
+
|
14
|
+
Simplest example:
|
15
|
+
|
16
|
+
> pg_dump my_base | split_pgdump
|
17
|
+
|
18
|
+
It produces:
|
19
|
+
`dump.sql` - file with schema and psql copy instructions,
|
20
|
+
`dump.sql-tables/#{table}.dat` - 'copy data' for each table in a dump,
|
21
|
+
sorted numerically (I hope, it is `id`)
|
22
|
+
|
23
|
+
You can change file name by `-f` option.
|
24
|
+
|
25
|
+
=== Rules
|
26
|
+
Rules are read from `split.rules` file (could be changed by `-r` option).
|
27
|
+
File could contain set of lines:
|
28
|
+
|
29
|
+
table_regexp {split:<Split expr>} {sort:<Sort expr>}
|
30
|
+
|
31
|
+
<Split expr> examples:
|
32
|
+
split:$field_name!
|
33
|
+
split:$field_name!_$other_field!
|
34
|
+
split:$client_id%00100!-$id%0025000!
|
35
|
+
split:$some_field[2..-1]!/$other_field[10..30]%0005!
|
36
|
+
|
37
|
+
<Sort expr> is space separated list of fields, optionally with options for
|
38
|
+
gnu `sort` --key parameters (on my machine they are MbdfghinRrV):
|
39
|
+
sort:client_id uid
|
40
|
+
sort:client_id:n id:n
|
41
|
+
|
42
|
+
Example for redmines wiki_content_versions:
|
43
|
+
|
44
|
+
wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n
|
45
|
+
|
46
|
+
Either `split:` or `sort:` option could be skipped.
|
47
|
+
|
48
|
+
== Author and Copyright
|
49
|
+
|
50
|
+
Copyright (c) 2011 by Sokolov Yura (funny.falcon@gmail.com)
|
51
|
+
Released under the same terms of license as Ruby
|
52
|
+
|
53
|
+
== Homepage
|
54
|
+
|
55
|
+
https://github.com/funny-falcon/split_pgdump
|
data/bin/split_pgdump
ADDED
@@ -0,0 +1,340 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# vim: set syntax=ruby shiftwidth=2 softtabstop=2 tabstop=8 expandtab
|
3
|
+
require 'optparse'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'strscan'
|
6
|
+
|
7
|
+
$debug = false
|
8
|
+
|
9
|
+
# Coordinates the whole split: loads the rules file, streams the pg_dump
# output from STDIN, writes schema statements to the main output file and
# routes COPY data into per-table .dat files (via Table).
class CWorker
  attr_accessor :rules_file, :output_file, :sorter, :rules

  def initialize
    @rules_file = 'split.rules'
    @output_file = 'dump.sql'
    # Path to a gnu-sort-compatible binary; overridable with the -s option.
    @sorter = `which sort`.chomp
    @rules = []
  end

  # Directory holding the per-table data files, derived from the output name.
  def tables_dir
    output_file + '-tables'
  end

  # Remove artifacts of a previous run and recreate the tables directory.
  def clear_files
    FileUtils.rm_f output_file
    FileUtils.rm_rf Dir[File.join(tables_dir, '*')]
    FileUtils.mkdir_p tables_dir
  end

  # Load split/sort rules from rules_file; a missing file is not an error
  # (the tool then falls back to one numerically-sorted file per table).
  def parse_rules
    # File.exists? is deprecated and removed in Ruby 3.2 — use File.exist?.
    if File.exist?(rules_file)
      File.open(rules_file) do |f|
        f.each_line do |line|
          if rule = Rule.parse(line)
            @rules << rule
          end
        end
      end
    else
      puts "NO FILE #{rules_file}" if $debug
    end
  end

  # First rule whose table regexp matches the table name, or nil.
  def find_rule(table)
    @rules.find{|rule| table =~ rule.regex}
  end

  # Schema mode: detect the start of a COPY block and switch to :table
  # state; any other line is passed through to the main output file.
  def process_schema_line(out, line)
    if line =~ /^COPY (\w+) \(([^)]+)\) FROM stdin;/
      table_name, columns = $1, $2.split(', ')
      @table = Table.new(tables_dir, table_name, columns)
      @tables << @table
      puts "Start to write table #{table_name}" if $debug
      @state = :table
    else
      out.write line
    end
  end

  # Copy mode: "\." terminates the COPY data; until then every line is a
  # data row belonging to the current table.
  def process_copy_line(out, line)
    if line =~ /^\\\.[\r\n]/
      @table.flush_all
      # emit \copy instructions into the main dump so psql can reload the data
      @table.copy_lines{|l| out.puts l}
      @table = nil
      @state = :schema
    else
      @table.add_line(line)
    end
  end

  # Main loop: run STDIN through a two-state machine (:schema / :table),
  # then sort and finalize every table file.
  def work
    @state = :schema
    @table = nil
    @tables = []

    File.open(output_file, 'w') do |out|
      STDIN.each_line do |line|
        case @state
        when :schema
          process_schema_line(out, line)
        when :table
          process_copy_line(out, line)
        end
      end
    end

    @tables.each{|table| table.finish_all}
  end
end
|
88
|
+
|
89
|
+
# Single global worker instance; option handlers below configure it and
# Table/OneFile read Worker.sorter / Worker.find_rule from it.
Worker = CWorker.new
|
90
|
+
|
91
|
+
# One parsed line of the split.rules file: a table-name regexp plus
# optional "split:<expr>" and "sort:<keys>" clauses.
class Rule
  class ParseError < StandardError; end

  attr_reader :regex, :split_parts, :sort_keys

  # Parse one rules-file line. Returns nil for blank/comment-only lines,
  # a Rule for well-formed ones, and raises ParseError otherwise.
  def self.parse(line)
    # strip trailing comments introduced by ';', '#' or '//'
    line = line.sub(%r{(;|#|//).*$},'').strip
    return if line.empty?

    if line =~ /^(\S+)(?:\s+split:(\S+))?(?:\s+sort:((?:(?:[^\s:]+)(?::[MbdfghinRrV]+)?(?:\s+|\s*$))+))?$/
      puts "#$1 split:#$2 sort:#$3" if $debug
      new($1, $2, $3)
    else
      raise ParseError, "Wrong rule line #{line}"
    end
  end

  def initialize(table_regex, split_expr, sort_keys)
    @regex = Regexp.new table_regex
    parse_split_expr(split_expr)
    parse_sort_keys(sort_keys)
  end

  # Tokenize a split expression such as "$client_id%00100!-$id%0025000!"
  # into an ordered list of parts:
  #   {:type => :field, :field => name, :actions => [{:mod=>..}|{:range=>..}]}
  #   {:type => :sep,   :sep   => text}
  def parse_split_expr(split_expr)
    s = StringScanner.new(split_expr || '')
    parts = []
    while !s.eos?
      # FIX: '!' terminates a field name, so it must be excluded from the
      # field character class. The previous /\$[^\[%]+/ swallowed the '!'
      # (and any following separator/field), which made the documented
      # simplest form "split:$field_name!" raise ParseError.
      if field = s.scan(/\$[^\[%!]+/)
        field = field[1..-1]
        part = {:type => :field, :field => field, :actions => []}
        while !s.eos?
          if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
            part[:actions] << {:range => range}
          elsif mod = s.scan(/%\d+/)
            part[:actions] << {:mod => mod[1..-1]}
          else
            break
          end
        end
        parts << part
        # every field must be closed by '!'; any characters after the '!'
        # (up to the next '$field') form a literal separator
        if sep = s.scan(/![^$\s#\\]*/)
          if sep > '!'
            parts << {:type => :sep, :sep => sep[1..-1]}
          end
          next
        end
      end
      raise ParseError, "Wrong format of split expr #{split_expr} (rest: #{s.rest})"
    end
    @split_parts = parts
  end

  # "client_id:n id" -> [{:field=>'client_id', :flags=>'n'},
  #                      {:field=>'id',        :flags=>nil}]
  # Flags are passed through to gnu sort's --key option.
  def parse_sort_keys(sort_keys)
    @sort_keys = (sort_keys || '').scan(/([^\s:]+)(?::([MbdfghinRrV]+))?/).map do |key, flags|
      {:field => key, :flags => flags}
    end
  end
end
|
148
|
+
|
149
|
+
# Receives the COPY rows of one table and distributes them among one or
# more buffered .dat files according to the matching Rule (if any).
class Table
  # Raised when a rule references a column the COPY statement doesn't have.
  class NoColumn < StandardError; end
  # Flush a single file's buffer once it exceeds this many bytes.
  ONE_FILE_CACHE_SIZE = 128 * 1024
  # Flush every buffer once their combined size exceeds this.
  TOTAL_CACHE_SIZE = 5 * 1024 * 1024

  # Append-only buffer around one output .dat file on disk.
  class OneFile
    attr_reader :file_name, :cache_size
    def initialize(dir, name)
      @file_name = File.join(dir, name)
      @cache_lines = []
      @cache_size = 0
    end

    # Buffer a raw line; spill to disk when the per-file cap is reached.
    def add_line(line)
      @cache_lines << line
      @cache_size += line.size
      flush if @cache_size > ONE_FILE_CACHE_SIZE
    end

    # Append buffered lines to the file (creating parent dirs lazily,
    # since split expressions may contain '/' and produce subdirectories).
    def flush
      dir = File.dirname(@file_name)
      unless File.directory?(dir)
        FileUtils.mkdir_p(dir)
      end
      File.open(@file_name, 'a') do |f|
        @cache_lines.each{|l| f.write(l)}
      end
      @cache_lines.clear
      @cache_size = 0
    end

    # Terminate the file with the COPY end-of-data marker "\.".
    def write_finish
      File.open(@file_name, 'a') do |f|
        f.puts('\\.')
      end
    end

    # Sort the file in place with the external sort binary.
    # Falls back to plain numeric sort (-n) when no keys are given —
    # note sort_line may be nil here (table without a rule) and the
    # `sort_line && ...` guard deliberately covers that.
    def sort(sort_line = [])
      args = [Worker.sorter]
      if sort_line && !sort_line.empty?
        args.concat sort_line
      else
        args << '-n'
      end
      # gnu sort allows output file == input file with -o
      args.push '-o', @file_name, @file_name
      puts args.join(' ') if $debug
      system *args
    end
  end

  # NOTE(review): :name and :sort_line readers are never backed by
  # assignments below (@table / @sort_args are used instead), so they
  # return nil — looks like leftovers; confirm before relying on them.
  attr_reader :name, :columns, :files, :sort_line
  def initialize(dir, name, columns)
    @dir = dir
    @table = name
    # strip surrounding double quotes from quoted column identifiers
    @columns = columns.map{|c| c.sub(/^"(.+)"$/, '\\1')}
    if @rule = Worker.find_rule(name)
      apply_rule
    else
      # NOTE(review): @split_args is never read anywhere; @sort_args stays
      # nil on this branch and OneFile#sort's nil-guard handles it.
      @split_args = []
    end
    @files = {}
    @total_cache_size = 0
  end

  # Bucket helper used by generated file_name methods: floor the value to
  # a multiple of `mod`, zero-padded to `len` digits.
  def _mod(s, len, mod)
    "%0#{len}d" % (s.to_i / mod * mod)
  end

  # Compile the matched rule: build an interpolation template from the
  # split parts and eval a singleton file_name(values) method from it;
  # also translate the sort keys into gnu sort --key arguments.
  def apply_rule
    split_string = ''
    @rule.split_parts.each do |part|
      case part[:type]
      when :sep
        split_string << part[:sep]
      when :field
        i = @columns.find_index(part[:field])
        raise NoColumn, part[:field] unless i
        field = "values[#{i}]"
        part[:actions].each do |action|
          if action[:mod]
            mod_s = action[:mod]
            mod = mod_s.to_i
            # padding width is taken from the literal's digit count
            field = "_mod(#{field},#{mod_s.size},#{mod})"
          elsif action[:range]
            field << "#{action[:range]}"
          end
        end
        split_string << "\#{#{field}}"
      end
    end

    # Define a per-instance file_name that overrides the default below.
    # The gsub sanitizes '..', whitespace and glob characters out of the
    # generated path component.
    eval <<-"EOF"
      def self.file_name(values)
        name = %{#{split_string}}.gsub(/\\.\\.|\\s|\\?|\\*/, '_')
        "\#@table/\#{name}.dat"
      end
    EOF

    @sort_args = @rule.sort_keys.map do |key|
      i = @columns.find_index(key[:field])
      raise NoColumn, key[:field] unless i
      i += 1  # sort's --key positions are 1-based
      "--key=#{i},#{i}#{key[:flags]}"
    end
  end

  # Default (no rule): everything goes into a single <table>.dat file.
  def file_name(values)
    "#@table.dat"
  end

  # Route one COPY data row to its OneFile, keeping the global cache
  # accounting consistent (the per-file buffer may auto-flush inside
  # add_line, hence subtract-before / add-after).
  def add_line(line)
    values = line.chomp.split("\t")
    fname = file_name(values)
    one_file = @files[fname] ||= OneFile.new(@dir, fname)
    @total_cache_size -= one_file.cache_size
    one_file.add_line(line)
    @total_cache_size += one_file.cache_size
    flush_all if @total_cache_size > TOTAL_CACHE_SIZE
  end

  # Spill every buffered file to disk.
  def flush_all
    @files.each{|name, one_file| one_file.flush}
    @total_cache_size = 0
  end

  # Yield one psql \copy instruction per produced file (or return an
  # enumerator when no block is given).
  def copy_lines
    if block_given?
      @files.each do |name, one_file|
        yield "\\copy #{@table} (#{@columns.join(', ')}) from #{one_file.file_name}"
      end
    else
      to_enum(:copy_lines)
    end
  end

  # Sort each produced file and append its end-of-data marker.
  def finish_all
    @files.each do |name, one_file|
      one_file.sort(@sort_args)
      one_file.write_finish
    end
  end
end
|
290
|
+
|
291
|
+
# Command-line interface: parse options into the global Worker, then run
# the three phases (load rules, reset output files, stream the dump).
opts = OptionParser.new do |opts|
  opts.banner = "\
Usage: pg_dump my_base | split_pgdump [-r RULES_FILE] [-f DUMP_FILE] [-s SORT_BIN] [-d]

split_pgdump intend for producing stable set of small files instead of one
big dump file. Such set is suitable for being source for SCM systems, being
effectivly transmitted using rsync, repacking by 7z and other.

"

  opts.separator("Options:")

  opts.on("-r", "--rules=RULES_FILE", "File with rules on table splitting (default 'split.rules')") do |v|
    Worker.rules_file = v
  end
  opts.on("-f", "--file=FILE", "main file name (default 'dump.sql').",
          "Table content will be storred in FILE-tables directory") do |v|
    Worker.output_file = v
  end
  opts.on("-s", "--sort=SORT_BIN", "sort executable compatible with gnu coreutils sort") do |v|
    Worker.sorter = v
  end
  opts.on("-d", "--debug", "debug"){|v| $debug = true}
  opts.on_tail("-h", "--help", "this message"){|v| puts opts; exit}

  # Extended help text appended after the option list (duplicates the
  # rules-file documentation from the README).
  opts.on_tail("\
Rules file format:
table_regexp {split:<Split expr>} {sort:<Sort expr>}

<Split expr> examples:
split:$field_name!
split:$field_name!_$other_field!
split:$client_id%00100!-$id%0025000!
split:$some_field[2..-1]!/$other_field[10..30]%0005!

<Sort expr> is space separated list of fields, optionally with options for
gnu `sort` --key parameters (on my machine they are MbdfghinRrV):
sort:client_id uid
sort:client_id:n id:n

Example for redmines wiki_content_versions:

wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n
")

end.parse!

# Run order matters: rules must be known before tables are processed, and
# stale output must be cleared before writing begins.
Worker.parse_rules
Worker.clear_files
Worker.work
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: split_pgdump
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Sokolov Yura aka funny_falcon
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-11-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! 'split_pgdump aimed to produce set of small sorted files from one big
|
15
|
+
dump file.
|
16
|
+
|
17
|
+
'
|
18
|
+
email: funny.falcon@gmail.com
|
19
|
+
executables:
|
20
|
+
- split_pgdump
|
21
|
+
extensions: []
|
22
|
+
extra_rdoc_files: []
|
23
|
+
files:
|
24
|
+
- bin/split_pgdump
|
25
|
+
- README
|
26
|
+
homepage: https://github.com/funny-falcon/split_pgdump
|
27
|
+
licenses:
|
28
|
+
- GPL
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 1.8.10
|
48
|
+
signing_key:
|
49
|
+
specification_version: 3
|
50
|
+
summary: split_pgdump is a tool for splitting postgresql dump in a managable set of
|
51
|
+
files
|
52
|
+
test_files: []
|