split_pgdump 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/split_pgdump +22 -333
- data/lib/split_pgdump.rb +346 -0
- metadata +26 -43
data/bin/split_pgdump
CHANGED
@@ -1,332 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# vim: set syntax=ruby shiftwidth=2 softtabstop=2 tabstop=8 expandtab
|
3
3
|
require 'optparse'
|
4
|
-
require 'fileutils'
|
5
|
-
require 'strscan'
|
4
|
+
require 'split_pgdump'
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
class Worker
|
10
|
-
attr_accessor :rules_file, :output_file, :sorter, :rules, :num_sorters
|
11
|
-
attr_accessor :could_fork
|
12
|
-
def initialize
|
13
|
-
@rules_file = 'split.rules'
|
14
|
-
@output_file = 'dump.sql'
|
15
|
-
@sorter = `which sort`.chomp
|
16
|
-
@rules = []
|
17
|
-
@num_sorters = 0
|
18
|
-
@could_fork = true
|
19
|
-
end
|
20
|
-
|
21
|
-
def tables_dir
|
22
|
-
output_file + '-tables'
|
23
|
-
end
|
24
|
-
|
25
|
-
def clear_files
|
26
|
-
FileUtils.rm_f output_file
|
27
|
-
FileUtils.rm_rf Dir[File.join(tables_dir, '*')]
|
28
|
-
FileUtils.mkdir_p tables_dir
|
29
|
-
end
|
30
|
-
|
31
|
-
def parse_rules
|
32
|
-
if File.exists?(rules_file)
|
33
|
-
File.open(rules_file) do |f|
|
34
|
-
f.each_line do |line|
|
35
|
-
if rule = Rule.parse(line)
|
36
|
-
@rules << rule
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
40
|
-
else
|
41
|
-
puts "NO FILE #{rules_file}" if $debug
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
def find_rule(table)
|
46
|
-
@rules.find{|rule| table =~ rule.regex}
|
47
|
-
end
|
48
|
-
|
49
|
-
def process_schema_line(out, line)
|
50
|
-
if line =~ /^COPY (\w+) \(([^)]+)\) FROM stdin;/
|
51
|
-
table_name, columns = $1, $2.split(', ')
|
52
|
-
rule = find_rule("#@schema.#{table_name}")
|
53
|
-
@table = Table.new(tables_dir, @schema, table_name, columns, rule)
|
54
|
-
@tables << @table
|
55
|
-
puts "Start to write table #{table_name}" if $debug
|
56
|
-
@start_time = Time.now
|
57
|
-
@state = :table
|
58
|
-
else
|
59
|
-
if line =~ /^SET search_path = ([^,]+)/
|
60
|
-
@schema = $1
|
61
|
-
end
|
62
|
-
out.write line
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def process_copy_line(out, line)
|
67
|
-
if line =~ /^\\\.[\r\n]/
|
68
|
-
@table.flush_all
|
69
|
-
@table.copy_lines{|l| out.puts l}
|
70
|
-
puts "Table #{@table.table} copied in #{Time.now - @start_time}s" if $debug
|
71
|
-
@table = nil
|
72
|
-
@state = :schema
|
73
|
-
else
|
74
|
-
@table.add_line(line)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def work(in_stream)
|
79
|
-
@state = :schema
|
80
|
-
@table = nil
|
81
|
-
@tables = []
|
82
|
-
@schema = 'public'
|
83
|
-
|
84
|
-
File.open(output_file, 'w') do |out|
|
85
|
-
in_stream.each_line do |line|
|
86
|
-
case @state
|
87
|
-
when :schema
|
88
|
-
process_schema_line(out, line)
|
89
|
-
when :table
|
90
|
-
process_copy_line(out, line)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
@start_time = Time.now
|
96
|
-
sort_and_finish
|
97
|
-
puts "Finished in #{Time.now - @start_time}s #{Process.pid}" if $debug
|
98
|
-
end
|
99
|
-
|
100
|
-
def sort_and_finish
|
101
|
-
files = []
|
102
|
-
for table in @tables
|
103
|
-
for one_file in table.files.values
|
104
|
-
files << [table.sort_args, one_file]
|
105
|
-
end
|
106
|
-
end
|
107
|
-
if @num_sorters > 1
|
108
|
-
files.each_slice(@num_sorters) do |one_files|
|
109
|
-
cmd = one_files.map{|sort_args, one_file|
|
110
|
-
one_file.sort_args(sort_args).unshift(@sorter).map{|a|"'#{a}'"}.join(' ')
|
111
|
-
}
|
112
|
-
cmd = cmd.map{|c| "{ #{c} & }"} if @could_fork
|
113
|
-
cmd = cmd.join(' ; ')
|
114
|
-
cmd += ' ; wait ' if @could_fork
|
115
|
-
system cmd
|
116
|
-
one_files.each{|sort_args, one_file| one_file.write_finish}
|
117
|
-
end
|
118
|
-
else
|
119
|
-
files.each do |sort_args, one_file|
|
120
|
-
system(sorter, *one_file.sort_args(sort_args))
|
121
|
-
one_file.write_finish
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
class Rule
|
128
|
-
class ParseError < StandardError; end
|
129
|
-
|
130
|
-
attr_reader :regex, :split_parts, :sort_keys
|
131
|
-
def self.parse(line)
|
132
|
-
line = line.sub(%r{(;|#|//).*$},'').strip
|
133
|
-
return if line.empty?
|
134
|
-
|
135
|
-
if line =~ /^(\S+)(?:\s+split:(\S+))?(?:\s+sort:((?:(?:[^\s:]+)(?::[MbdfghinRrV]+)?(?:\s+|\s*$))+))?$/
|
136
|
-
puts "#$1 split:#$2 sort:#$3" if $debug
|
137
|
-
new($1, $2, $3)
|
138
|
-
else
|
139
|
-
raise ParseError, "Wrong rule line #{line}"
|
140
|
-
end
|
141
|
-
end
|
142
|
-
|
143
|
-
def initialize(table_regex, split_expr, sort_keys)
|
144
|
-
@regex = Regexp.new table_regex
|
145
|
-
parse_split_expr(split_expr)
|
146
|
-
parse_sort_keys(sort_keys)
|
147
|
-
end
|
148
|
-
|
149
|
-
def parse_split_expr(split_expr)
|
150
|
-
s = StringScanner.new(split_expr || '')
|
151
|
-
parts = []
|
152
|
-
while !s.eos?
|
153
|
-
if field = s.scan(/\$[^\[%!]+/)
|
154
|
-
field = field[1..-1]
|
155
|
-
part = {:type => :field, :field => field, :actions => []}
|
156
|
-
while !s.eos?
|
157
|
-
if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
|
158
|
-
part[:actions] << {:range => range}
|
159
|
-
elsif mod = s.scan(/%\d+/)
|
160
|
-
part[:actions] << {:mod => mod[1..-1]}
|
161
|
-
else
|
162
|
-
break
|
163
|
-
end
|
164
|
-
end
|
165
|
-
parts << part
|
166
|
-
if sep = s.scan(/![^$\s#\\]*/)
|
167
|
-
if sep > '!'
|
168
|
-
parts << {:type => :sep, :sep => sep[1..-1]}
|
169
|
-
end
|
170
|
-
next
|
171
|
-
end
|
172
|
-
end
|
173
|
-
raise ParseError, "Wrong format of split expr #{split_expr} (rest: '#{s.rest}')"
|
174
|
-
end
|
175
|
-
@split_parts = parts
|
176
|
-
end
|
177
|
-
|
178
|
-
def parse_sort_keys(sort_keys)
|
179
|
-
@sort_keys = (sort_keys || '').scan(/([^\s:]+)(?::([MbdfghinRrV]+))?/).map do |key, flags|
|
180
|
-
{:field => key, :flags => flags}
|
181
|
-
end
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
class Table
|
186
|
-
class NoColumn < StandardError; end
|
187
|
-
ONE_FILE_CACHE_SIZE = 256 * 1024
|
188
|
-
TOTAL_CACHE_SIZE = 5 * 1024 * 1024
|
189
|
-
class OneFile
|
190
|
-
attr_reader :file_name, :cache_size
|
191
|
-
def initialize(dir, name)
|
192
|
-
@file_name = File.join(dir, name)
|
193
|
-
@cache_lines = []
|
194
|
-
@cache_size = 0
|
195
|
-
end
|
196
|
-
|
197
|
-
def add_line(line)
|
198
|
-
@cache_lines << line
|
199
|
-
@cache_size += line.size
|
200
|
-
end
|
201
|
-
|
202
|
-
def flush(&block)
|
203
|
-
@cache_size = 0
|
204
|
-
dir = File.dirname(@file_name)
|
205
|
-
unless File.directory?(dir)
|
206
|
-
FileUtils.mkdir_p(dir)
|
207
|
-
end
|
208
|
-
content = @cache_lines.join
|
209
|
-
File.open(@file_name, 'a'){|f| f.write(content)}
|
210
|
-
@cache_lines.clear
|
211
|
-
end
|
212
|
-
|
213
|
-
def write_finish
|
214
|
-
File.open(@file_name, 'a') do |f|
|
215
|
-
f.puts('\\.')
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
def sort_args(sort_line = [])
|
220
|
-
args = []
|
221
|
-
if sort_line && !sort_line.empty?
|
222
|
-
args.concat sort_line
|
223
|
-
else
|
224
|
-
args << '-n'
|
225
|
-
end
|
226
|
-
args.push '-o', @file_name, @file_name
|
227
|
-
puts args.join(' ') if $debug
|
228
|
-
args
|
229
|
-
end
|
230
|
-
end
|
231
|
-
|
232
|
-
attr_reader :table, :columns, :files, :sort_line, :sort_args
|
233
|
-
def initialize(dir, schema, name, columns, rule)
|
234
|
-
@dir = dir
|
235
|
-
@table = name
|
236
|
-
@schema = schema
|
237
|
-
@columns = columns.map{|c| c.sub(/^"(.+)"$/, '\\1')}
|
238
|
-
apply_rule rule
|
239
|
-
@files = {}
|
240
|
-
@total_cache_size = 0
|
241
|
-
end
|
242
|
-
|
243
|
-
def _mod(s, len, mod)
|
244
|
-
"%0#{len}d" % (s.to_i / mod * mod)
|
245
|
-
end
|
246
|
-
|
247
|
-
def apply_rule(rule)
|
248
|
-
if rule
|
249
|
-
split_string = ''
|
250
|
-
rule.split_parts.each do |part|
|
251
|
-
case part[:type]
|
252
|
-
when :sep
|
253
|
-
split_string << part[:sep]
|
254
|
-
when :field
|
255
|
-
i = @columns.find_index(part[:field])
|
256
|
-
raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[:field]} for use in split" unless i
|
257
|
-
field = "values[#{i}]"
|
258
|
-
part[:actions].each do |action|
|
259
|
-
if action[:mod]
|
260
|
-
mod_s = action[:mod]
|
261
|
-
mod = mod_s.to_i
|
262
|
-
field = "_mod(#{field},#{mod_s.size},#{mod})"
|
263
|
-
elsif action[:range]
|
264
|
-
field << "#{action[:range]}"
|
265
|
-
end
|
266
|
-
end
|
267
|
-
split_string << "\#{#{field}}"
|
268
|
-
end
|
269
|
-
end
|
270
|
-
|
271
|
-
eval <<-"EOF"
|
272
|
-
def self.file_name(values)
|
273
|
-
name = %{#{split_string}}.gsub(/\\.\\.|\\s|\\?|\\*|'|"/, '_')
|
274
|
-
"\#{table_schema}/\#{name}.dat"
|
275
|
-
end
|
276
|
-
EOF
|
277
|
-
|
278
|
-
@sort_args = rule.sort_keys.map do |key|
|
279
|
-
i = @columns.find_index(key[:field])
|
280
|
-
raise NoColumn, "Table #{@schema}.#{@table} has no column #{key[:field]} for use in sort" unless i
|
281
|
-
i += 1
|
282
|
-
"--key=#{i},#{i}#{key[:flags]}"
|
283
|
-
end
|
284
|
-
else
|
285
|
-
@sort_args = []
|
286
|
-
end
|
287
|
-
end
|
288
|
-
|
289
|
-
def table_schema
|
290
|
-
@schema == 'public' ? @table : "#@schema/#@table"
|
291
|
-
end
|
292
|
-
|
293
|
-
def file_name(values)
|
294
|
-
"#{table_schema}.dat"
|
295
|
-
end
|
296
|
-
|
297
|
-
def add_line(line)
|
298
|
-
values = line.chomp.split("\t")
|
299
|
-
fname = file_name(values)
|
300
|
-
one_file = @files[fname] ||= OneFile.new(@dir, fname)
|
301
|
-
one_file.add_line(line)
|
302
|
-
@total_cache_size += line.size
|
303
|
-
if one_file.cache_size > ONE_FILE_CACHE_SIZE
|
304
|
-
@total_cache_size -= one_file.cache_size
|
305
|
-
one_file.flush
|
306
|
-
end
|
307
|
-
flush_all if @total_cache_size > TOTAL_CACHE_SIZE
|
308
|
-
end
|
309
|
-
|
310
|
-
def flush_all
|
311
|
-
@files.each{|name, one_file| one_file.flush}
|
312
|
-
@total_cache_size = 0
|
313
|
-
end
|
314
|
-
|
315
|
-
def copy_lines
|
316
|
-
if block_given?
|
317
|
-
@files.each do |name, one_file|
|
318
|
-
yield "\\copy #{@table} (#{@columns.join(', ')}) from #{one_file.file_name}"
|
319
|
-
end
|
320
|
-
else
|
321
|
-
to_enum(:copy_lines)
|
322
|
-
end
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
class ComandLineWorker < Worker
|
6
|
+
class SplitPgDump::ComandLineWorker < SplitPgDump::Worker
|
327
7
|
def parse_comand_line
|
328
8
|
opts = OptionParser.new do |opts|
|
9
|
+
opts.version = SplitPgDump::VERSION
|
329
10
|
opts.banner = "\
|
11
|
+
#{opts.program_name} #{opts.version}
|
330
12
|
Usage: pg_dump my_base | split_pgdump [-r RULES_FILE] [-f DUMP_FILE] [-s SORT_BIN] [-d]
|
331
13
|
|
332
14
|
split_pgdump intend for producing stable set of small files instead of one
|
@@ -347,14 +29,23 @@ effectivly transmitted using rsync, repacking by 7z and other.
|
|
347
29
|
opts.on("-s", "--sort=SORT_BIN", "sort executable compatible with gnu coreutils sort (default `which sort`)") do |v|
|
348
30
|
self.sorter = v
|
349
31
|
end
|
350
|
-
opts.on("-n", "--sorters=NUM", Integer, "number of sorters started in a shell"
|
351
|
-
|
352
|
-
self.num_sorters = n
|
32
|
+
opts.on("-n", "--sorters=NUM", Integer, "number of sorters started in a shell") do |n|
|
33
|
+
self.num_sorters = n.to_i
|
353
34
|
end
|
354
35
|
opts.on("--no-shell-fork", "could not use shell & for parrallel execution of sorters") do
|
355
|
-
self.could_fork =
|
36
|
+
self.could_fork = false
|
37
|
+
end
|
38
|
+
opts.on("-x", "--xargs=XARGS_BIN", "xargs executable (-L and -P options used) (default `which xargs`)") do |v|
|
39
|
+
self.xargs = v
|
40
|
+
end
|
41
|
+
opts.on("--no-xargs", 'explicitly disable xargs') do
|
42
|
+
self.xargs = ''
|
356
43
|
end
|
357
44
|
opts.on("-d", "--debug", "debug"){|v| $debug = true}
|
45
|
+
opts.on_tail("-v", "--version", "show version") do
|
46
|
+
puts opts.version
|
47
|
+
exit
|
48
|
+
end
|
358
49
|
opts.on_tail("-h", "--help", "this message"){|v| puts opts; exit}
|
359
50
|
|
360
51
|
opts.on_tail("\
|
@@ -381,10 +72,8 @@ wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n
|
|
381
72
|
end
|
382
73
|
end
|
383
74
|
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
worker.work(STDIN)
|
390
|
-
end
|
75
|
+
worker = SplitPgDump::ComandLineWorker.new
|
76
|
+
worker.parse_comand_line
|
77
|
+
worker.parse_rules
|
78
|
+
worker.clear_files
|
79
|
+
worker.work(STDIN)
|
data/lib/split_pgdump.rb
ADDED
@@ -0,0 +1,346 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# vim: set syntax=ruby shiftwidth=2 softtabstop=2 tabstop=8 expandtab
|
3
|
+
require 'fileutils'
|
4
|
+
require 'strscan'
|
5
|
+
require 'shellwords'
|
6
|
+
|
7
|
+
$debug = false
|
8
|
+
|
9
|
+
module SplitPgDump
|
10
|
+
VERSION = '0.3.5'
|
11
|
+
end
|
12
|
+
|
13
|
+
class SplitPgDump::Worker
|
14
|
+
attr_accessor :rules_file, :output_file, :sorter, :rules, :num_sorters
|
15
|
+
attr_accessor :could_fork, :xargs
|
16
|
+
def initialize
|
17
|
+
@rules_file = 'split.rules'
|
18
|
+
@output_file = 'dump.sql'
|
19
|
+
@sorter = `which sort`.chomp
|
20
|
+
@xargs = `which xargs`.chomp
|
21
|
+
@rules = []
|
22
|
+
@num_sorters = 0
|
23
|
+
@could_fork = true
|
24
|
+
end
|
25
|
+
|
26
|
+
def tables_dir
|
27
|
+
output_file + '-tables'
|
28
|
+
end
|
29
|
+
|
30
|
+
def clear_files
|
31
|
+
FileUtils.rm_f output_file
|
32
|
+
FileUtils.rm_rf Dir[File.join(tables_dir, '*')]
|
33
|
+
FileUtils.mkdir_p tables_dir
|
34
|
+
end
|
35
|
+
|
36
|
+
def parse_rules
|
37
|
+
if File.exists?(rules_file)
|
38
|
+
File.open(rules_file) do |f|
|
39
|
+
f.each_line do |line|
|
40
|
+
if rule = SplitPgDump::Rule.parse(line)
|
41
|
+
@rules << rule
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
else
|
46
|
+
puts "NO FILE #{rules_file}" if $debug
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def find_rule(table)
|
51
|
+
@rules.find{|rule| table =~ rule.regex}
|
52
|
+
end
|
53
|
+
|
54
|
+
def process_schema_line(out, line)
|
55
|
+
if line =~ /^COPY (\w+) \(([^)]+)\) FROM stdin;/
|
56
|
+
table_name, columns = $1, $2.split(', ')
|
57
|
+
rule = find_rule("#@schema.#{table_name}")
|
58
|
+
@table = SplitPgDump::Table.new(tables_dir, @schema, table_name, columns, rule)
|
59
|
+
@tables << @table
|
60
|
+
puts "Start to write table #{table_name}" if $debug
|
61
|
+
@start_time = Time.now
|
62
|
+
@state = :table
|
63
|
+
else
|
64
|
+
if line =~ /^SET search_path = ([^,]+)/
|
65
|
+
@schema = $1
|
66
|
+
end
|
67
|
+
out.write line
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def process_copy_line(out, line)
|
72
|
+
if line =~ /^\\\.[\r\n]/
|
73
|
+
@table.flush_all
|
74
|
+
@table.copy_lines{|l| out.puts l}
|
75
|
+
puts "Table #{@table.table} copied in #{Time.now - @start_time}s" if $debug
|
76
|
+
@table = nil
|
77
|
+
@state = :schema
|
78
|
+
else
|
79
|
+
@table.add_line(line)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def work(in_stream)
|
84
|
+
@state = :schema
|
85
|
+
@table = nil
|
86
|
+
@tables = []
|
87
|
+
@schema = 'public'
|
88
|
+
|
89
|
+
File.open(output_file, 'w') do |out|
|
90
|
+
in_stream.each_line do |line|
|
91
|
+
case @state
|
92
|
+
when :schema
|
93
|
+
process_schema_line(out, line)
|
94
|
+
when :table
|
95
|
+
process_copy_line(out, line)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
@start_time = Time.now
|
101
|
+
sort_and_finish
|
102
|
+
puts "Finished in #{Time.now - @start_time}s #{Process.pid}" if $debug
|
103
|
+
end
|
104
|
+
|
105
|
+
def sort_and_finish
|
106
|
+
files = []
|
107
|
+
for table in @tables
|
108
|
+
for one_file in table.files.values
|
109
|
+
sort_args = one_file.sort_args(table.sort_args).shelljoin
|
110
|
+
files << [one_file, sort_args]
|
111
|
+
end
|
112
|
+
end
|
113
|
+
unless @xargs.empty?
|
114
|
+
num_sorters = [@num_sorters, 1].max
|
115
|
+
xargs_cmd = [@xargs, '-L1', '-P', num_sorters.to_s, @sorter].shelljoin
|
116
|
+
puts xargs_cmd if $debug
|
117
|
+
IO.popen(xargs_cmd, 'w+') do |io|
|
118
|
+
files.each{|one_file, sort_args|
|
119
|
+
puts sort_args if $debug
|
120
|
+
io.puts sort_args
|
121
|
+
}
|
122
|
+
io.close_write
|
123
|
+
io.each_line{|l|
|
124
|
+
puts l if $debug
|
125
|
+
}
|
126
|
+
end
|
127
|
+
else
|
128
|
+
sorter = @sorter.shellescape
|
129
|
+
commands = files.map{|one_file, sort_args| "#{sorter} #{sort_args}" }
|
130
|
+
if @num_sorters > 1
|
131
|
+
commands.each_slice(@num_sorters) do |cmd|
|
132
|
+
cmd = cmd.map{|c| "{ #{c} & }"} if @could_fork
|
133
|
+
cmd = cmd.join(' ; ')
|
134
|
+
cmd += ' ; wait ' if @could_fork
|
135
|
+
puts cmd if $debug
|
136
|
+
system cmd
|
137
|
+
end
|
138
|
+
else
|
139
|
+
commands.each do |cmd|
|
140
|
+
puts cmd if $debug
|
141
|
+
system cmd
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
145
|
+
files.each{|one_file, sort_args| one_file.write_finish}
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
class SplitPgDump::Rule
|
150
|
+
class ParseError < StandardError; end
|
151
|
+
|
152
|
+
attr_reader :regex, :split_parts, :sort_keys
|
153
|
+
def self.parse(line)
|
154
|
+
line = line.sub(%r{(;|#|//).*$},'').strip
|
155
|
+
return if line.empty?
|
156
|
+
|
157
|
+
if line =~ /^(\S+)(?:\s+split:(\S+))?(?:\s+sort:((?:(?:[^\s:]+)(?::[MbdfghinRrV]+)?(?:\s+|\s*$))+))?$/
|
158
|
+
puts "#$1 split:#$2 sort:#$3" if $debug
|
159
|
+
new($1, $2, $3)
|
160
|
+
else
|
161
|
+
raise ParseError, "Wrong rule line #{line}"
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def initialize(table_regex, split_expr, sort_keys)
|
166
|
+
@regex = Regexp.new table_regex
|
167
|
+
parse_split_expr(split_expr)
|
168
|
+
parse_sort_keys(sort_keys)
|
169
|
+
end
|
170
|
+
|
171
|
+
def parse_split_expr(split_expr)
|
172
|
+
s = StringScanner.new(split_expr || '')
|
173
|
+
parts = []
|
174
|
+
while !s.eos?
|
175
|
+
if field = s.scan(/\$[^\[%!]+/)
|
176
|
+
field = field[1..-1]
|
177
|
+
part = {:type => :field, :field => field, :actions => []}
|
178
|
+
while !s.eos?
|
179
|
+
if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
|
180
|
+
part[:actions] << {:range => range}
|
181
|
+
elsif mod = s.scan(/%\d+/)
|
182
|
+
part[:actions] << {:mod => mod[1..-1]}
|
183
|
+
else
|
184
|
+
break
|
185
|
+
end
|
186
|
+
end
|
187
|
+
parts << part
|
188
|
+
if sep = s.scan(/![^$\s#\\]*/)
|
189
|
+
if sep > '!'
|
190
|
+
parts << {:type => :sep, :sep => sep[1..-1]}
|
191
|
+
end
|
192
|
+
next
|
193
|
+
end
|
194
|
+
end
|
195
|
+
raise ParseError, "Wrong format of split expr #{split_expr} (rest: '#{s.rest}')"
|
196
|
+
end
|
197
|
+
@split_parts = parts
|
198
|
+
end
|
199
|
+
|
200
|
+
def parse_sort_keys(sort_keys)
|
201
|
+
@sort_keys = (sort_keys || '').scan(/([^\s:]+)(?::([MbdfghinRrV]+))?/).map do |key, flags|
|
202
|
+
{:field => key, :flags => flags}
|
203
|
+
end
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
class SplitPgDump::Table
|
208
|
+
class NoColumn < StandardError; end
|
209
|
+
ONE_FILE_CACHE_SIZE = 256 * 1024
|
210
|
+
TOTAL_CACHE_SIZE = 5 * 1024 * 1024
|
211
|
+
|
212
|
+
class OneFile
|
213
|
+
attr_reader :file_name, :cache_size
|
214
|
+
def initialize(dir, name)
|
215
|
+
@file_name = File.join(dir, name)
|
216
|
+
@cache_lines = []
|
217
|
+
@cache_size = 0
|
218
|
+
end
|
219
|
+
|
220
|
+
def add_line(line)
|
221
|
+
@cache_lines << line
|
222
|
+
@cache_size += line.size
|
223
|
+
end
|
224
|
+
|
225
|
+
def flush(&block)
|
226
|
+
@cache_size = 0
|
227
|
+
dir = File.dirname(@file_name)
|
228
|
+
unless File.directory?(dir)
|
229
|
+
FileUtils.mkdir_p(dir)
|
230
|
+
end
|
231
|
+
content = @cache_lines.join
|
232
|
+
File.open(@file_name, 'a'){|f| f.write(content)}
|
233
|
+
@cache_lines.clear
|
234
|
+
end
|
235
|
+
|
236
|
+
def write_finish
|
237
|
+
File.open(@file_name, 'a') do |f|
|
238
|
+
f.puts('\\.')
|
239
|
+
end
|
240
|
+
end
|
241
|
+
|
242
|
+
def sort_args(sort_line = [])
|
243
|
+
args = []
|
244
|
+
if sort_line && !sort_line.empty?
|
245
|
+
args.concat sort_line
|
246
|
+
else
|
247
|
+
args << '-n'
|
248
|
+
end
|
249
|
+
args.push '-o', @file_name, @file_name
|
250
|
+
args
|
251
|
+
end
|
252
|
+
end
|
253
|
+
|
254
|
+
attr_reader :table, :columns, :files, :sort_line, :sort_args
|
255
|
+
def initialize(dir, schema, name, columns, rule)
|
256
|
+
@dir = dir
|
257
|
+
@table = name
|
258
|
+
@schema = schema
|
259
|
+
@columns = columns.map{|c| c.sub(/^"(.+)"$/, '\\1')}
|
260
|
+
apply_rule rule
|
261
|
+
@files = {}
|
262
|
+
@total_cache_size = 0
|
263
|
+
end
|
264
|
+
|
265
|
+
def _mod(s, len, mod)
|
266
|
+
"%0#{len}d" % (s.to_i / mod * mod)
|
267
|
+
end
|
268
|
+
|
269
|
+
def apply_rule(rule)
|
270
|
+
if rule
|
271
|
+
split_string = ''
|
272
|
+
rule.split_parts.each do |part|
|
273
|
+
case part[:type]
|
274
|
+
when :sep
|
275
|
+
split_string << part[:sep]
|
276
|
+
when :field
|
277
|
+
i = @columns.find_index(part[:field])
|
278
|
+
raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[:field]} for use in split" unless i
|
279
|
+
field = "values[#{i}]"
|
280
|
+
part[:actions].each do |action|
|
281
|
+
if action[:mod]
|
282
|
+
mod_s = action[:mod]
|
283
|
+
mod = mod_s.to_i
|
284
|
+
field = "_mod(#{field},#{mod_s.size},#{mod})"
|
285
|
+
elsif action[:range]
|
286
|
+
field << "#{action[:range]}"
|
287
|
+
end
|
288
|
+
end
|
289
|
+
split_string << "\#{#{field}}"
|
290
|
+
end
|
291
|
+
end
|
292
|
+
|
293
|
+
eval <<-"EOF"
|
294
|
+
def self.file_name(values)
|
295
|
+
name = %{#{split_string}}.gsub(/\\.\\.|\\s|\\?|\\*|'|"/, '_')
|
296
|
+
"\#{table_schema}/\#{name}.dat"
|
297
|
+
end
|
298
|
+
EOF
|
299
|
+
|
300
|
+
@sort_args = rule.sort_keys.map do |key|
|
301
|
+
i = @columns.find_index(key[:field])
|
302
|
+
raise NoColumn, "Table #{@schema}.#{@table} has no column #{key[:field]} for use in sort" unless i
|
303
|
+
i += 1
|
304
|
+
"--key=#{i},#{i}#{key[:flags]}"
|
305
|
+
end
|
306
|
+
else
|
307
|
+
@sort_args = []
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
def table_schema
|
312
|
+
@schema == 'public' ? @table : "#@schema/#@table"
|
313
|
+
end
|
314
|
+
|
315
|
+
def file_name(values)
|
316
|
+
"#{table_schema}.dat"
|
317
|
+
end
|
318
|
+
|
319
|
+
def add_line(line)
|
320
|
+
values = line.chomp.split("\t")
|
321
|
+
fname = file_name(values)
|
322
|
+
one_file = @files[fname] ||= OneFile.new(@dir, fname)
|
323
|
+
one_file.add_line(line)
|
324
|
+
@total_cache_size += line.size
|
325
|
+
if one_file.cache_size > ONE_FILE_CACHE_SIZE
|
326
|
+
@total_cache_size -= one_file.cache_size
|
327
|
+
one_file.flush
|
328
|
+
end
|
329
|
+
flush_all if @total_cache_size > TOTAL_CACHE_SIZE
|
330
|
+
end
|
331
|
+
|
332
|
+
def flush_all
|
333
|
+
@files.each{|name, one_file| one_file.flush}
|
334
|
+
@total_cache_size = 0
|
335
|
+
end
|
336
|
+
|
337
|
+
def copy_lines
|
338
|
+
if block_given?
|
339
|
+
@files.each do |name, one_file|
|
340
|
+
yield "\\copy #{@table} (#{@columns.join(', ')}) from #{one_file.file_name}"
|
341
|
+
end
|
342
|
+
else
|
343
|
+
to_enum(:copy_lines)
|
344
|
+
end
|
345
|
+
end
|
346
|
+
end
|
metadata
CHANGED
@@ -1,70 +1,53 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: split_pgdump
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.5
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 3
|
9
|
-
- 3
|
10
|
-
version: 0.3.3
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Sokolov Yura aka funny_falcon
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
date: 2011-11-22 00:00:00 +04:00
|
19
|
-
default_executable:
|
12
|
+
date: 2011-11-22 00:00:00.000000000 Z
|
20
13
|
dependencies: []
|
14
|
+
description: ! 'split_pgdump aimed to produce set of small sorted files from one big
|
15
|
+
dump file.
|
21
16
|
|
22
|
-
|
23
|
-
split_pgdump aimed to produce set of small sorted files from one big dump file.
|
24
|
-
|
17
|
+
'
|
25
18
|
email: funny.falcon@gmail.com
|
26
|
-
executables:
|
19
|
+
executables:
|
27
20
|
- split_pgdump
|
28
21
|
extensions: []
|
29
|
-
|
30
22
|
extra_rdoc_files: []
|
31
|
-
|
32
|
-
files:
|
23
|
+
files:
|
33
24
|
- bin/split_pgdump
|
34
25
|
- README
|
35
|
-
|
26
|
+
- lib/split_pgdump.rb
|
36
27
|
homepage: https://github.com/funny-falcon/split_pgdump
|
37
|
-
licenses:
|
28
|
+
licenses:
|
38
29
|
- GPL
|
39
30
|
post_install_message:
|
40
31
|
rdoc_options: []
|
41
|
-
|
42
|
-
require_paths:
|
32
|
+
require_paths:
|
43
33
|
- lib
|
44
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
35
|
none: false
|
46
|
-
requirements:
|
47
|
-
- -
|
48
|
-
- !ruby/object:Gem::Version
|
49
|
-
|
50
|
-
|
51
|
-
- 0
|
52
|
-
version: "0"
|
53
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ! '>='
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
41
|
none: false
|
55
|
-
requirements:
|
56
|
-
- -
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
|
59
|
-
segments:
|
60
|
-
- 0
|
61
|
-
version: "0"
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
62
46
|
requirements: []
|
63
|
-
|
64
47
|
rubyforge_project:
|
65
|
-
rubygems_version: 1.
|
48
|
+
rubygems_version: 1.8.12
|
66
49
|
signing_key:
|
67
50
|
specification_version: 3
|
68
|
-
summary: split_pgdump is a tool for splitting postgresql dump in a managable set of
|
51
|
+
summary: split_pgdump is a tool for splitting postgresql dump in a managable set of
|
52
|
+
files
|
69
53
|
test_files: []
|
70
|
-
|