split_pgdump 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +55 -0
- data/bin/split_pgdump +340 -0
- metadata +52 -0
data/README
ADDED
@@ -0,0 +1,55 @@
= Tool for splitting a PostgreSQL dump into a set of files

I wish to use git or mercurial for managing my database history.
Unfortunately, every single data change forces them to store the whole dump again.
Even if your data has not actually changed, row order is not guaranteed to be stable.

split_pgdump splits a dump into a set of small sorted files, so that git can track
changes only for data that actually changed.

Also, it allows rsync to transmit backup changes over the network effectively.

== Usage

Simplest example:

  > pg_dump my_base | split_pgdump

It produces:
`dump.sql` - a file with the schema and psql copy instructions,
`dump.sql-tables/#{table}.dat` - 'copy data' for each table in the dump,
sorted numerically (hopefully by `id`)

You can change the file name with the `-f` option.

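For illustration, given a hypothetical database with tables `users` and
`posts` (file names are taken from the tables actually present in your dump),
the result is laid out as:

  dump.sql
  dump.sql-tables/users.dat
  dump.sql-tables/posts.dat
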
=== Rules
Rules are read from the `split.rules` file (can be changed with the `-r` option).
The file may contain a set of lines:

  table_regexp {split:<Split expr>} {sort:<Sort expr>}

<Split expr> examples:
  split:$field_name!
  split:$field_name!_$other_field!
  split:$client_id%00100!-$id%0025000!
  split:$some_field[2..-1]!/$other_field[10..30]%0005!

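In a split expr, `$field!` substitutes a field's value into the target file
name, and any text between `!` and the next `$` is kept as a literal
separator. A `%` action buckets the value: it is rounded down to a multiple
of the given number, and the digit count of the literal sets the zero-padded
width. A `[a..b]` action takes a substring of the value (Ruby range
semantics). A minimal Ruby sketch of the bucketing arithmetic (the `bucket`
name is illustrative; the script does the same in its `_mod` helper):

  # '%00100' means: bucket size 100, pad to 5 digits
  def bucket(value, literal)
    mod = literal.to_i
    "%0#{literal.size}d" % (value.to_i / mod * mod)
  end

  bucket(163,   '00100')  # => "00100"
  bucket(99,    '00100')  # => "00000"
  bucket(25042, '00100')  # => "25000"

So `split:$client_id%00100!-$id%0025000!` writes a row with client_id=163 and
id=7000000 to the file `00100-7000000.dat` inside the table's directory.
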
<Sort expr> is a space separated list of fields, optionally with flags for
gnu `sort` --key parameters (on my machine they are MbdfghinRrV):
  sort:client_id uid
  sort:client_id:n id:n

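Sort keys are translated into gnu `sort` --key options by the field's column
position in the COPY statement. For a hypothetical table with columns
`id, client_id, uid` (in that order), `sort:client_id:n id:n` makes the
script run roughly:

  sort --key=2,2n --key=1,1n -o <file> <file>

Files of tables without a sort expression are sorted with plain `sort -n`.
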
Example for Redmine's wiki_content_versions:

  wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n

Either the `split:` or the `sort:` option may be omitted.

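A complete `split.rules` might look like this (table names here are
hypothetical; `#`, `;` and `//` start comments, which are stripped before a
line is parsed):

  # bucket log rows by the date part of a timestamp, sort by id
  logs  split:$created_at[0..9]!  sort:id:n
  ; sort-only rule: the whole table stays in one file
  users sort:id:n
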
== Author and Copyright

Copyright (c) 2011 by Sokolov Yura (funny.falcon@gmail.com)
Released under the same license terms as Ruby.

== Homepage

https://github.com/funny-falcon/split_pgdump
data/bin/split_pgdump
ADDED
@@ -0,0 +1,340 @@
#!/usr/bin/env ruby
# vim: set syntax=ruby shiftwidth=2 softtabstop=2 tabstop=8 expandtab
require 'optparse'
require 'fileutils'
require 'strscan'

$debug = false

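# CWorker drives a run: it loads split.rules, copies schema statements
# through to the main dump file, and routes the rows of each COPY block
# to a per-table writer.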
class CWorker
  attr_accessor :rules_file, :output_file, :sorter, :rules
  def initialize
    @rules_file = 'split.rules'
    @output_file = 'dump.sql'
    @sorter = `which sort`.chomp
    @rules = []
  end

  def tables_dir
    output_file + '-tables'
  end

  def clear_files
    FileUtils.rm_f output_file
    FileUtils.rm_rf Dir[File.join(tables_dir, '*')]
    FileUtils.mkdir_p tables_dir
  end

  def parse_rules
    if File.exists?(rules_file)
      File.open(rules_file) do |f|
        f.each_line do |line|
          if rule = Rule.parse(line)
            @rules << rule
          end
        end
      end
    else
      puts "NO FILE #{rules_file}" if $debug
    end
  end

  def find_rule(table)
    @rules.find{|rule| table =~ rule.regex}
  end

  def process_schema_line(out, line)
    if line =~ /^COPY (\w+) \(([^)]+)\) FROM stdin;/
      table_name, columns = $1, $2.split(', ')
      @table = Table.new(tables_dir, table_name, columns)
      @tables << @table
      puts "Start to write table #{table_name}" if $debug
      @state = :table
    else
      out.write line
    end
  end

  def process_copy_line(out, line)
    if line =~ /^\\\.[\r\n]/
      @table.flush_all
      @table.copy_lines{|l| out.puts l}
      @table = nil
      @state = :schema
    else
      @table.add_line(line)
    end
  end

  def work
    @state = :schema
    @table = nil
    @tables = []

    File.open(output_file, 'w') do |out|
      STDIN.each_line do |line|
        case @state
        when :schema
          process_schema_line(out, line)
        when :table
          process_copy_line(out, line)
        end
      end
    end

    @tables.each{|table| table.finish_all}
  end
end

Worker = CWorker.new

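# A Rule is one parsed line of split.rules: a table regexp plus an optional
# split: expression (how rows are bucketed into files) and optional sort:
# keys (how rows are ordered within each file).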
class Rule
  class ParseError < StandardError; end

  attr_reader :regex, :split_parts, :sort_keys
  def self.parse(line)
    line = line.sub(%r{(;|#|//).*$},'').strip
    return if line.empty?

    if line =~ /^(\S+)(?:\s+split:(\S+))?(?:\s+sort:((?:(?:[^\s:]+)(?::[MbdfghinRrV]+)?(?:\s+|\s*$))+))?$/
      puts "#$1 split:#$2 sort:#$3" if $debug
      new($1, $2, $3)
    else
      raise ParseError, "Wrong rule line #{line}"
    end
  end

  def initialize(table_regex, split_expr, sort_keys)
    @regex = Regexp.new table_regex
    parse_split_expr(split_expr)
    parse_sort_keys(sort_keys)
  end

  def parse_split_expr(split_expr)
    s = StringScanner.new(split_expr || '')
    parts = []
    while !s.eos?
      if field = s.scan(/\$[^\[%!]+/)
        field = field[1..-1]
        part = {:type => :field, :field => field, :actions => []}
        while !s.eos?
          if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
            part[:actions] << {:range => range}
          elsif mod = s.scan(/%\d+/)
            part[:actions] << {:mod => mod[1..-1]}
          else
            break
          end
        end
        parts << part
        if sep = s.scan(/![^$\s#\\]*/)
          if sep > '!'
            parts << {:type => :sep, :sep => sep[1..-1]}
          end
          next
        end
      end
      raise ParseError, "Wrong format of split expr #{split_expr} (rest: #{s.rest})"
    end
    @split_parts = parts
  end

  def parse_sort_keys(sort_keys)
    @sort_keys = (sort_keys || '').scan(/([^\s:]+)(?::([MbdfghinRrV]+))?/).map do |key, flags|
      {:field => key, :flags => flags}
    end
  end
end

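# A Table buffers COPY rows for one table. Rows are appended to per-file
# caches (OneFile) that are flushed to disk when a single file's cache
# exceeds ONE_FILE_CACHE_SIZE or the table's combined caches exceed
# TOTAL_CACHE_SIZE.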
class Table
  class NoColumn < StandardError; end
  ONE_FILE_CACHE_SIZE = 128 * 1024
  TOTAL_CACHE_SIZE = 5 * 1024 * 1024
  class OneFile
    attr_reader :file_name, :cache_size
    def initialize(dir, name)
      @file_name = File.join(dir, name)
      @cache_lines = []
      @cache_size = 0
    end

    def add_line(line)
      @cache_lines << line
      @cache_size += line.size
      flush if @cache_size > ONE_FILE_CACHE_SIZE
    end

    def flush
      dir = File.dirname(@file_name)
      unless File.directory?(dir)
        FileUtils.mkdir_p(dir)
      end
      File.open(@file_name, 'a') do |f|
        @cache_lines.each{|l| f.write(l)}
      end
      @cache_lines.clear
      @cache_size = 0
    end

    def write_finish
      File.open(@file_name, 'a') do |f|
        f.puts('\\.')
      end
    end

    def sort(sort_line = [])
      args = [Worker.sorter]
      if sort_line && !sort_line.empty?
        args.concat sort_line
      else
        args << '-n'
      end
      args.push '-o', @file_name, @file_name
      puts args.join(' ') if $debug
      system *args
    end
  end

  attr_reader :name, :columns, :files, :sort_line
  def initialize(dir, name, columns)
    @dir = dir
    @table = name
    @columns = columns.map{|c| c.sub(/^"(.+)"$/, '\\1')}
    if @rule = Worker.find_rule(name)
      apply_rule
    else
      @sort_args = []
    end
    @files = {}
    @total_cache_size = 0
  end

  def _mod(s, len, mod)
    "%0#{len}d" % (s.to_i / mod * mod)
  end

  def apply_rule
    split_string = ''
    @rule.split_parts.each do |part|
      case part[:type]
      when :sep
        split_string << part[:sep]
      when :field
        i = @columns.find_index(part[:field])
        raise NoColumn, part[:field] unless i
        field = "values[#{i}]"
        part[:actions].each do |action|
          if action[:mod]
            mod_s = action[:mod]
            mod = mod_s.to_i
            field = "_mod(#{field},#{mod_s.size},#{mod})"
          elsif action[:range]
            field << "#{action[:range]}"
          end
        end
        split_string << "\#{#{field}}"
      end
    end

    eval <<-"EOF"
      def self.file_name(values)
        name = %{#{split_string}}.gsub(/\\.\\.|\\s|\\?|\\*/, '_')
        "\#@table/\#{name}.dat"
      end
    EOF

    @sort_args = @rule.sort_keys.map do |key|
      i = @columns.find_index(key[:field])
      raise NoColumn, key[:field] unless i
      i += 1
      "--key=#{i},#{i}#{key[:flags]}"
    end
  end

  def file_name(values)
    "#@table.dat"
  end

  def add_line(line)
    values = line.chomp.split("\t")
    fname = file_name(values)
    one_file = @files[fname] ||= OneFile.new(@dir, fname)
    @total_cache_size -= one_file.cache_size
    one_file.add_line(line)
    @total_cache_size += one_file.cache_size
    flush_all if @total_cache_size > TOTAL_CACHE_SIZE
  end

  def flush_all
    @files.each{|name, one_file| one_file.flush}
    @total_cache_size = 0
  end

  def copy_lines
    if block_given?
      @files.each do |name, one_file|
        yield "\\copy #{@table} (#{@columns.join(', ')}) from #{one_file.file_name}"
      end
    else
      to_enum(:copy_lines)
    end
  end

  def finish_all
    @files.each do |name, one_file|
      one_file.sort(@sort_args)
      one_file.write_finish
    end
  end
end

opts = OptionParser.new do |opts|
  opts.banner = "\
Usage: pg_dump my_base | split_pgdump [-r RULES_FILE] [-f DUMP_FILE] [-s SORT_BIN] [-d]

split_pgdump is intended to produce a stable set of small files instead of one
big dump file. Such a set is suitable as a source for SCM systems, can be
transmitted effectively using rsync, repacked by 7z, and so on.

"

  opts.separator("Options:")

  opts.on("-r", "--rules=RULES_FILE", "File with rules on table splitting (default 'split.rules')") do |v|
    Worker.rules_file = v
  end
  opts.on("-f", "--file=FILE", "main file name (default 'dump.sql').",
          "Table content will be stored in the FILE-tables directory") do |v|
    Worker.output_file = v
  end
  opts.on("-s", "--sort=SORT_BIN", "sort executable compatible with gnu coreutils sort") do |v|
    Worker.sorter = v
  end
  opts.on("-d", "--debug", "debug"){|v| $debug = true}
  opts.on_tail("-h", "--help", "this message"){|v| puts opts; exit}

  opts.on_tail("\
Rules file format:
  table_regexp {split:<Split expr>} {sort:<Sort expr>}

<Split expr> examples:
  split:$field_name!
  split:$field_name!_$other_field!
  split:$client_id%00100!-$id%0025000!
  split:$some_field[2..-1]!/$other_field[10..30]%0005!

<Sort expr> is a space separated list of fields, optionally with options for
gnu `sort` --key parameters (on my machine they are MbdfghinRrV):
  sort:client_id uid
  sort:client_id:n id:n

Example for Redmine's wiki_content_versions:

  wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n
")

end.parse!

Worker.parse_rules
Worker.clear_files
Worker.work
metadata
ADDED
@@ -0,0 +1,52 @@
--- !ruby/object:Gem::Specification
name: split_pgdump
version: !ruby/object:Gem::Version
  version: 0.1.0
prerelease:
platform: ruby
authors:
- Sokolov Yura aka funny_falcon
autorequire:
bindir: bin
cert_chain: []
date: 2011-11-22 00:00:00.000000000 Z
dependencies: []
description: ! 'split_pgdump is aimed at producing a set of small sorted files from
  one big dump file.

'
email: funny.falcon@gmail.com
executables:
- split_pgdump
extensions: []
extra_rdoc_files: []
files:
- bin/split_pgdump
- README
homepage: https://github.com/funny-falcon/split_pgdump
licenses:
- GPL
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.10
signing_key:
specification_version: 3
summary: split_pgdump is a tool for splitting a PostgreSQL dump into a manageable
  set of files
test_files: []