split_pgdump 0.3.6 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+ if RUBY_ENGINE == 'ruby'
2
+ require 'mkmf'
3
+ create_makefile('native_compute_name')
4
+ else
5
+ File.open(File.dirname(__FILE__) + "/Makefile", "w") do |f|
6
+ f.write("install:\n\t#nothing to build")
7
+ end
8
+ end
@@ -0,0 +1,117 @@
1
+ #include "ruby.h"
2
+ #include "ruby/intern.h"
3
+ #include "ruby/defines.h"
4
+ #include "ruby/encoding.h"
5
+
6
+ static ID idDiv;
7
+ static ID idMul;
8
+
9
+ static VALUE
10
+ apply_actions(VALUE field, VALUE actions)
11
+ {
12
+ long j, actions_len = RARRAY_LEN(actions);
13
+ long beg, len;
14
+ VALUE num = 0, modi = 0;
15
+ for (j = 0; j < actions_len; j++) {
16
+ VALUE action = rb_ary_entry(actions, j);
17
+ VALUE klass = rb_class_of(action);
18
+ if (klass == rb_cRange) {
19
+ /* copied from rb_str_aref */
20
+ len = rb_str_strlen(field);
21
+ if (RTEST(rb_range_beg_len(action, &beg, &len, len, 0)))
22
+ field = rb_str_substr(field, beg, len);
23
+ } else if (klass == rb_cArray) {
24
+ num = rb_str_to_inum(field, 10, 0);
25
+ modi = rb_ary_entry(action, 1);
26
+ if ( (FIXNUM_P(num) ||
27
+ TYPE(num) == T_BIGNUM &&
28
+ RBIGNUM_LEN(num) <= (SIZEOF_LONG/SIZEOF_BDIGITS)
29
+ ) &&
30
+ FIXNUM_P(modi) &&
31
+ FIX2LONG(modi)) {
32
+ long modl = NUM2LONG(modi);
33
+ long numl = (NUM2LONG(num) / modl) * modl;
34
+ char buf[30];
35
+
36
+ int wrtn = snprintf(buf, 30,
37
+ RSTRING_PTR(rb_ary_entry(action, 0)),
38
+ numl);
39
+ if (wrtn < 30) {
40
+ field = rb_str_new(buf, wrtn);
41
+ continue;
42
+ }
43
+ }
44
+ else {
45
+ num = rb_funcall2(num, idDiv, 1, &modi);
46
+ num = rb_funcall2(num, idMul, 1, &modi);
47
+ }
48
+ field = rb_str_format(1, &num, rb_ary_entry(action, 0));
49
+ }
50
+ }
51
+ return field;
52
+ }
53
+
54
+ #define INITIAL_CAPA 32
55
+ static VALUE
56
+ spgd_compute_name(VALUE self, VALUE split_rule, VALUE values)
57
+ {
58
+ VALUE res = 0;
59
+ int encoding = -1;
60
+ char *result = (char*) xmalloc(INITIAL_CAPA);
61
+ int pos = 0, capa = INITIAL_CAPA;
62
+ long i, rule_len = RARRAY_LEN(split_rule);
63
+ if (!result) {
64
+ rb_memerror();
65
+ }
66
+ for (i = 0; i < rule_len; i++) {
67
+ VALUE rule = rb_ary_entry(split_rule, i);
68
+ if (rb_class_of(rule) == rb_cArray) {
69
+ long fieldnum = NUM2LONG(rb_ary_entry(rule, 0));
70
+ VALUE actions = rb_ary_entry(rule, 1);
71
+ rule = rb_ary_entry(values, fieldnum);
72
+ encoding = ENCODING_GET(rule);
73
+ if (RTEST(actions) && RARRAY_LEN(actions)) {
74
+ rule = apply_actions(rule, actions);
75
+ }
76
+ }
77
+ if (rb_class_of(rule) == rb_cString) {
78
+ long size = RSTRING_LEN(rule);
79
+ if (capa < pos + size + 1) {
80
+ char *tmp;
81
+ capa = pos + size + 1;
82
+ if (i + 1 != rule_len) capa = (capa * 3) >> 1;
83
+ tmp = (char*) xrealloc(result, capa);
84
+ if (!tmp) {
85
+ xfree(result);
86
+ rb_memerror();
87
+ }
88
+ result = tmp;
89
+ }
90
+ if (encoding == -1) encoding = ENCODING_GET(rule);
91
+ strncpy(result + pos, RSTRING_PTR(rule), size + 1);
92
+ pos += size;
93
+ }
94
+ }
95
+ res = rb_str_new(result, pos);
96
+ ENCODING_SET(res, encoding);
97
+ ENC_CODERANGE_CLEAR(res);
98
+ xfree(result);
99
+ return res;
100
+ }
101
+
102
+ static VALUE
103
+ spgd_native_compute_name(VALUE self)
104
+ {
105
+ return Qtrue;
106
+ }
107
+
108
+ void Init_native_compute_name() {
109
+ VALUE split_pgdump = rb_define_module("SplitPgDump");
110
+ VALUE native_compute = rb_define_module_under(split_pgdump, "NativeComputeName");
111
+
112
+ rb_define_method(native_compute, "compute_name", spgd_compute_name, 2);
113
+ rb_define_method(native_compute, "native_compute_name?", spgd_native_compute_name, 0);
114
+
115
+ CONST_ID(idDiv, "/");
116
+ CONST_ID(idMul, "*");
117
+ }
@@ -7,7 +7,12 @@ require 'shellwords'
7
7
  $debug = false
8
8
 
9
9
  module SplitPgDump
10
- VERSION = '0.3.6'
10
+ VERSION = '0.4.0'
11
+ end
12
+
13
+ begin
14
+ require 'split_pgdump/native_compute_name'
15
+ rescue LoadError
11
16
  end
12
17
 
13
18
  class SplitPgDump::Worker
@@ -57,7 +62,7 @@ class SplitPgDump::Worker
57
62
  rule = find_rule("#@schema.#{table_name}")
58
63
  @table = SplitPgDump::Table.new(tables_dir, @schema, table_name, columns, rule)
59
64
  @tables << @table
60
- puts "Start to write table #{table_name}" if $debug
65
+ puts "Start to write table \t#{table_name}" if $debug
61
66
  @start_time = Time.now
62
67
  @state = :table
63
68
  else
@@ -72,7 +77,7 @@ class SplitPgDump::Worker
72
77
  if line =~ /^\\\.[\r\n]/
73
78
  @table.flush_all
74
79
  @table.copy_lines{|l| out.puts l}
75
- puts "Table #{@table.table} copied in #{Time.now - @start_time}s" if $debug
80
+ puts "Table #{@table.table} copied in \t#{"%.2f" % (Time.now - @start_time)}s" if $debug
76
81
  @table = nil
77
82
  @state = :schema
78
83
  else
@@ -120,7 +125,7 @@ class SplitPgDump::Worker
120
125
  io.puts sort_args
121
126
  }
122
127
  io.close_write
123
- io.each_line{|l|
128
+ io.each_line{|l|
124
129
  puts l if $debug
125
130
  }
126
131
  end
@@ -174,23 +179,24 @@ class SplitPgDump::Rule
174
179
  while !s.eos?
175
180
  if field = s.scan(/\$[^\[%!]+/)
176
181
  field = field[1..-1]
177
- part = {:type => :field, :field => field, :actions => []}
182
+ part = [field]
178
183
  while !s.eos?
179
184
  if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
180
- part[:actions] << {:range => range}
185
+ part << eval(range[1...-1])
181
186
  elsif mod = s.scan(/%\d+/)
182
- part[:actions] << {:mod => mod[1..-1]}
187
+ mod = mod[1..-1]
188
+ format = "%0#{mod.size}d"
189
+ modi = mod.to_i
190
+ part << [format, modi]
183
191
  else
184
192
  break
185
193
  end
186
194
  end
187
195
  parts << part
188
- if sep = s.scan(/![^$\s#\\]*/)
189
- if sep > '!'
190
- parts << {:type => :sep, :sep => sep[1..-1]}
191
- end
192
- next
193
- end
196
+ next if s.scan(/!/)
197
+ elsif sep = s.scan(/[^$\s#\\]+/)
198
+ parts << sep
199
+ next
194
200
  end
195
201
  raise ParseError, "Wrong format of split expr #{split_expr} (rest: '#{s.rest}')"
196
202
  end
@@ -215,6 +221,10 @@ class SplitPgDump::Table
215
221
  @file_name = File.join(dir, name)
216
222
  @cache_lines = []
217
223
  @cache_size = 0
224
+ dir = File.dirname(@file_name)
225
+ unless File.directory?(dir)
226
+ FileUtils.mkdir_p(dir)
227
+ end
218
228
  end
219
229
 
220
230
  def add_line(line)
@@ -223,14 +233,12 @@ class SplitPgDump::Table
223
233
  end
224
234
 
225
235
  def flush(&block)
226
- @cache_size = 0
227
- dir = File.dirname(@file_name)
228
- unless File.directory?(dir)
229
- FileUtils.mkdir_p(dir)
236
+ if @cache_size > 0
237
+ @cache_size = 0
238
+ content = @cache_lines.join
239
+ File.open(@file_name, 'a'){|f| f.write(content)}
240
+ @cache_lines.clear
230
241
  end
231
- content = @cache_lines.join
232
- File.open(@file_name, 'a'){|f| f.write(content)}
233
- @cache_lines.clear
234
242
  end
235
243
 
236
244
  def write_finish
@@ -251,23 +259,39 @@ class SplitPgDump::Table
251
259
  end
252
260
  end
253
261
 
254
- module DefaultName
255
- def file_name(line)
256
- @file_name
257
- end
258
- end
259
- include DefaultName
260
-
261
262
  module ComputeName
262
- def file_name(line)
263
- values = line.chomp.split("\t")
264
- name = compute_name(values)
265
- @file_name[name] ||= begin
266
- name_strip = name.gsub(/\.\.|\s|\?|\*|'|"/, '_')
267
- "#{table_schema}/#{name_strip}.dat"
263
+ def compute_name(split_rule, values)
264
+ result = ''
265
+ split_rule.each do |rule|
266
+ case rule
267
+ when String
268
+ result << rule
269
+ when Array
270
+ field = values[rule[0]]
271
+ rule[1].each do |action|
272
+ case action
273
+ when Range
274
+ field = field[action]
275
+ when Array # take modulo
276
+ v = field.to_i
277
+ field = action[0] % (v - v % action[1])
278
+ end
279
+ end
280
+ result << field
281
+ end
268
282
  end
283
+ result
284
+ end
285
+
286
+ def native_compute_name?
287
+ false
269
288
  end
270
289
  end
290
+ if defined?(SplitPgDump::NativeComputeName)
291
+ include SplitPgDump::NativeComputeName
292
+ else
293
+ include ComputeName
294
+ end
271
295
 
272
296
  attr_reader :table, :columns, :files, :sort_line, :sort_args
273
297
  def initialize(dir, schema, name, columns, rule)
@@ -278,6 +302,7 @@ class SplitPgDump::Table
278
302
  @file_name = "#{table_schema}.dat"
279
303
  apply_rule rule
280
304
  @files = {}
305
+ @files_to_flush = {}
281
306
  @total_cache_size = 0
282
307
  end
283
308
 
@@ -287,36 +312,53 @@ class SplitPgDump::Table
287
312
 
288
313
  def apply_rule(rule)
289
314
  if rule
290
- split_string = ''
291
- rule.split_parts.each do |part|
292
- case part[:type]
293
- when :sep
294
- split_string << part[:sep]
295
- when :field
296
- i = @columns.find_index(part[:field])
297
- raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[:field]} for use in split" unless i
298
- field = "values[#{i}]"
299
- part[:actions].each do |action|
300
- if action[:mod]
301
- mod_s = action[:mod]
302
- mod = mod_s.to_i
303
- field = "_mod(#{field}, '%0#{mod_s.size}d', #{mod})"
304
- elsif action[:range]
305
- field << "#{action[:range]}"
315
+ unless rule.split_parts.empty?
316
+ if native_compute_name?
317
+ @split_rule = rule.split_parts.map do |part|
318
+ case part
319
+ when Array # field manipulations
320
+ unless i = @columns.index(part[0])
321
+ raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[0]} for use in split"
322
+ end
323
+ [i, part[1..-1]]
324
+ else
325
+ part
326
+ end
327
+ end
328
+ else
329
+ split_string = ''
330
+ split_rule = []
331
+ rule.split_parts.map do |part|
332
+ case part
333
+ when Array #field manipulation
334
+ unless i = @columns.index(part[0])
335
+ raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[0]} for use in split"
336
+ end
337
+ field = "values[#{i}]"
338
+ part[1..-1].each do |action|
339
+ ssize = split_rule.size
340
+ case action
341
+ when Range
342
+ field << "[split_rule[#{ssize}]]"
343
+ split_rule << action
344
+ when Array # take module
345
+ field = "_mod(#{field}, split_rule[#{ssize}], split_rule[#{ssize+1}])"
346
+ split_rule.concat action
347
+ end
348
+ end
349
+ split_string << "\#{#{field}}"
350
+ when String
351
+ split_string << part
306
352
  end
307
353
  end
308
- split_string << "\#{#{field}}"
354
+ @split_rule = split_rule
355
+ eval <<-"EOF"
356
+ def self.compute_name(split_rule, values)
357
+ %{#{split_string}}
358
+ end
359
+ EOF
309
360
  end
310
- end
311
-
312
- if split_string > ''
313
361
  @file_name = {}
314
- eval <<-"EOF"
315
- def self.compute_name(values)
316
- %{#{split_string}}
317
- end
318
- EOF
319
- extend ComputeName
320
362
  end
321
363
 
322
364
  @sort_args = rule.sort_keys.map do |key|
@@ -335,12 +377,21 @@ class SplitPgDump::Table
335
377
  end
336
378
 
337
379
  def file_name(line)
338
- @file_name
380
+ values = line.split("\t")
381
+ values.last.chomp!
382
+ name = compute_name(@split_rule, values)
383
+ @file_name[name] ||= begin
384
+ name_strip = name.gsub(/\.\.|\s|\?|\*|'|"/, '_')
385
+ "#{table_schema}/#{name_strip}.dat"
386
+ end
339
387
  end
340
388
 
341
389
  def add_line(line)
342
- fname = file_name(line)
390
+ fname = @split_rule ? file_name(line) : @file_name
343
391
  one_file = @files[fname] ||= OneFile.new(@dir, fname)
392
+
393
+ @files_to_flush[one_file] = true if one_file.cache_size == 0
394
+
344
395
  one_file.add_line(line)
345
396
  @total_cache_size += line.size
346
397
  if one_file.cache_size > ONE_FILE_CACHE_SIZE
@@ -351,7 +402,8 @@ class SplitPgDump::Table
351
402
  end
352
403
 
353
404
  def flush_all
354
- @files.each{|name, one_file| one_file.flush}
405
+ @files_to_flush.each{|one_file, _| one_file.flush }
406
+ @files_to_flush.clear
355
407
  @total_cache_size = 0
356
408
  end
357
409
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: split_pgdump
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -18,12 +18,15 @@ description: ! 'split_pgdump aimed to produce set of small sorted files from one
18
18
  email: funny.falcon@gmail.com
19
19
  executables:
20
20
  - split_pgdump
21
- extensions: []
21
+ extensions:
22
+ - ext/split_pgdump/extconf.rb
22
23
  extra_rdoc_files: []
23
24
  files:
24
25
  - bin/split_pgdump
25
26
  - README
26
27
  - lib/split_pgdump.rb
28
+ - ext/split_pgdump/extconf.rb
29
+ - ext/split_pgdump/native_compute_name.c
27
30
  homepage: https://github.com/funny-falcon/split_pgdump
28
31
  licenses:
29
32
  - GPL
@@ -31,6 +34,7 @@ post_install_message:
31
34
  rdoc_options: []
32
35
  require_paths:
33
36
  - lib
37
+ - ext
34
38
  required_ruby_version: !ruby/object:Gem::Requirement
35
39
  none: false
36
40
  requirements: