RubyGems - split_pgdump - Versions diffs - 0.3.6 → 0.4.0 - Mend

split_pgdump 0.3.6 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

data/ext/split_pgdump/extconf.rb +8 -0
data/ext/split_pgdump/native_compute_name.c +117 -0
data/lib/split_pgdump.rb +114 -62
metadata +6 -2

data/ext/split_pgdump/extconf.rb ADDED

@@ -0,0 +1,8 @@
+if RUBY_ENGINE == 'ruby'
+  require 'mkmf'
+  create_makefile('native_compute_name')
+else
+  File.open(File.dirname(__FILE__) + "/Makefile", "w") do |f|
+    f.write("install:\n\t#nothing to build")
+  end
+end

data/ext/split_pgdump/native_compute_name.c ADDED

@@ -0,0 +1,117 @@
+#include "ruby.h"
+#include "ruby/intern.h"
+#include "ruby/defines.h"
+#include "ruby/encoding.h"
+static ID idDiv;
+static ID idMul;
+static VALUE
+apply_actions(VALUE field, VALUE actions)
+{
+    long j, actions_len = RARRAY_LEN(actions);
+    long beg, len;
+    VALUE num = 0, modi = 0;
+    for (j = 0; j < actions_len; j++) {
+	VALUE action = rb_ary_entry(actions, j);
+	VALUE klass = rb_class_of(action);
+	if (klass == rb_cRange) {
+	    /* copied from rb_str_aref */
+	    len = rb_str_strlen(field);
+	    if (RTEST(rb_range_beg_len(action, &beg, &len, len, 0)))
+		field = rb_str_substr(field, beg, len);
+	} else if (klass == rb_cArray) {
+	    num = rb_str_to_inum(field, 10, 0);
+	    modi = rb_ary_entry(action, 1);
+	    if ( (FIXNUM_P(num) ||
+		      TYPE(num) == T_BIGNUM &&
+		      RBIGNUM_LEN(num) <= (SIZEOF_LONG/SIZEOF_BDIGITS)
+		  ) &&
+		  FIXNUM_P(modi) &&
+		  FIX2LONG(modi)) {
+		long modl = NUM2LONG(modi);
+		long numl = (NUM2LONG(num) / modl) * modl;
+		char buf[30];
+		int wrtn = snprintf(buf, 30,
+			RSTRING_PTR(rb_ary_entry(action, 0)),
+			numl);
+		if (wrtn < 30) {
+		    field = rb_str_new(buf, wrtn);
+		    continue;
+		}
+	    }
+	    else {
+		num = rb_funcall2(num, idDiv, 1, &modi);
+		num = rb_funcall2(num, idMul, 1, &modi);
+	    }
+	    field = rb_str_format(1, &num, rb_ary_entry(action, 0));
+	}
+    }
+    return field;
+}
+#define INITIAL_CAPA 32
+static VALUE
+spgd_compute_name(VALUE self, VALUE split_rule, VALUE values)
+{
+    VALUE res = 0;
+    int encoding = -1;
+    char *result = (char*) xmalloc(INITIAL_CAPA);
+    int pos = 0, capa = INITIAL_CAPA;
+    long i, rule_len = RARRAY_LEN(split_rule);
+    if (!result) {
+	rb_memerror();
+    }
+    for (i = 0; i < rule_len; i++) {
+	VALUE rule = rb_ary_entry(split_rule, i);
+	if (rb_class_of(rule) == rb_cArray) {
+	    long fieldnum = NUM2LONG(rb_ary_entry(rule, 0));
+	    VALUE actions = rb_ary_entry(rule, 1);
+	    rule = rb_ary_entry(values, fieldnum);
+	    encoding = ENCODING_GET(rule);
+	    if (RTEST(actions) && RARRAY_LEN(actions)) {
+		rule = apply_actions(rule, actions);
+	    }
+	}
+	if (rb_class_of(rule) == rb_cString) {
+	    long size = RSTRING_LEN(rule);
+	    if (capa < pos + size + 1) {
+		char *tmp;
+		capa = pos + size + 1;
+		if (i + 1 != rule_len) capa = (capa * 3) >> 1;
+		tmp = (char*) xrealloc(result, capa);
+		if (!tmp) {
+		    xfree(result);
+		    rb_memerror();
+		}
+		result = tmp;
+	    }
+	    if (encoding == -1) encoding = ENCODING_GET(rule);
+	    strncpy(result + pos, RSTRING_PTR(rule), size + 1);
+	    pos += size;
+	}
+    }
+    res = rb_str_new(result, pos);
+    ENCODING_SET(res, encoding);
+    ENC_CODERANGE_CLEAR(res);
+    xfree(result);
+    return res;
+}
+static VALUE
+spgd_native_compute_name(VALUE self)
+{
+    return Qtrue;
+}
+void Init_native_compute_name() {
+    VALUE split_pgdump = rb_define_module("SplitPgDump");
+    VALUE native_compute = rb_define_module_under(split_pgdump, "NativeComputeName");
+    rb_define_method(native_compute, "compute_name", spgd_compute_name, 2);
+    rb_define_method(native_compute, "native_compute_name?", spgd_native_compute_name, 0);
+    CONST_ID(idDiv, "/");
+    CONST_ID(idMul, "*");
+}

data/lib/split_pgdump.rb CHANGED

@@ -7,7 +7,12 @@ require 'shellwords'
 $debug = false
 module SplitPgDump
-  VERSION = '0.3.6'
+  VERSION = '0.4.0'
+end
+begin
+  require 'split_pgdump/native_compute_name'
+rescue LoadError
 end
 class SplitPgDump::Worker
@@ -57,7 +62,7 @@ class SplitPgDump::Worker
       rule = find_rule("#@schema.#{table_name}")
       @table = SplitPgDump::Table.new(tables_dir, @schema, table_name, columns, rule)
       @tables << @table
-      puts "Start to write table #{table_name}" if $debug
+      puts "Start to write table \t#{table_name}" if $debug
       @start_time = Time.now
       @state = :table
     else
@@ -72,7 +77,7 @@ class SplitPgDump::Worker
     if line =~ /^\\\.[\r\n]/
       @table.flush_all
       @table.copy_lines{|l| out.puts l}
-      puts "Table #{@table.table} copied in #{Time.now - @start_time}s" if $debug
+      puts "Table #{@table.table} copied in \t#{"%.2f" % (Time.now - @start_time)}s" if $debug
       @table = nil
       @state = :schema
     else
@@ -120,7 +125,7 @@ class SplitPgDump::Worker
           io.puts sort_args
         }
         io.close_write
-        io.each_line{|l|
+        io.each_line{|l|
           puts l  if $debug
         }
       end
@@ -174,23 +179,24 @@ class SplitPgDump::Rule
     while !s.eos?
       if field = s.scan(/\$[^\[%!]+/)
         field = field[1..-1]
-        part = {:type => :field, :field => field, :actions => []}
+        part = [field]
         while !s.eos?
           if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
-            part[:actions] << {:range => range}
+            part << eval(range[1...-1])
           elsif mod = s.scan(/%\d+/)
-            part[:actions] << {:mod => mod[1..-1]}
+            mod = mod[1..-1]
+            format = "%0#{mod.size}d"
+            modi = mod.to_i
+            part << [format, modi]
           else
             break
           end
         end
         parts << part
-        if sep = s.scan(/![^$\s#\\]*/)
-          if sep > '!'
-            parts << {:type => :sep, :sep => sep[1..-1]}
-          end
-          next
-        end
+        next if s.scan(/!/)
+      elsif sep = s.scan(/[^$\s#\\]+/)
+        parts << sep
+        next
       end
       raise ParseError, "Wrong format of split expr #{split_expr} (rest: '#{s.rest}')"
     end
@@ -215,6 +221,10 @@ class SplitPgDump::Table
       @file_name = File.join(dir, name)
       @cache_lines = []
       @cache_size = 0
+      dir = File.dirname(@file_name)
+      unless File.directory?(dir)
+        FileUtils.mkdir_p(dir)
+      end
     end
     def add_line(line)
@@ -223,14 +233,12 @@ class SplitPgDump::Table
     end
     def flush(&block)
-      @cache_size = 0
-      dir = File.dirname(@file_name)
-      unless File.directory?(dir)
-        FileUtils.mkdir_p(dir)
+      if @cache_size > 0
+        @cache_size = 0
+        content = @cache_lines.join
+        File.open(@file_name, 'a'){|f| f.write(content)}
+        @cache_lines.clear
       end
-      content = @cache_lines.join
-      File.open(@file_name, 'a'){|f| f.write(content)}
-      @cache_lines.clear
     end
     def write_finish
@@ -251,23 +259,39 @@ class SplitPgDump::Table
     end
   end
-  module DefaultName
-    def file_name(line)
-      @file_name
-    end
-  end
-  include DefaultName
   module ComputeName
-    def file_name(line)
-      values = line.chomp.split("\t")
-      name = compute_name(values)
-      @file_name[name] ||= begin
-        name_strip = name.gsub(/\.\.|\s|\?|\*|'|"/, '_')
-        "#{table_schema}/#{name_strip}.dat"
+    def compute_name(split_rule, values)
+      result = ''
+      split_rule.each do |rule|
+        case rule
+        when String
+          result << rule
+        when Array
+          field = values[rule[0]]
+          rule[1].each do |action|
+            case action
+            when Range
+              field = field[action]
+            when Array # take modulo
+              v = field.to_i
+              field = action[0] % (v - v % action[1])
+            end
+          end
+          result << field
+        end
       end
+      result
+    end
+    def native_compute_name?
+      false
     end
   end
+  if defined?(SplitPgDump::NativeComputeName)
+    include SplitPgDump::NativeComputeName
+  else
+    include ComputeName
+  end
   attr_reader :table, :columns, :files, :sort_line, :sort_args
   def initialize(dir, schema, name, columns, rule)
@@ -278,6 +302,7 @@ class SplitPgDump::Table
     @file_name = "#{table_schema}.dat"
     apply_rule rule
     @files = {}
+    @files_to_flush = {}
     @total_cache_size = 0
   end
@@ -287,36 +312,53 @@ class SplitPgDump::Table
   def apply_rule(rule)
     if rule
-      split_string = ''
-      rule.split_parts.each do |part|
-        case part[:type]
-        when :sep
-          split_string << part[:sep]
-        when :field
-          i = @columns.find_index(part[:field])
-          raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[:field]} for use in split"  unless i
-          field = "values[#{i}]"
-          part[:actions].each do |action|
-            if action[:mod]
-              mod_s = action[:mod]
-              mod = mod_s.to_i
-              field = "_mod(#{field}, '%0#{mod_s.size}d', #{mod})"
-            elsif action[:range]
-              field << "#{action[:range]}"
+      unless rule.split_parts.empty?
+        if native_compute_name?
+          @split_rule = rule.split_parts.map do |part|
+            case part
+            when Array # field manipulations
+              unless i = @columns.index(part[0])
+                raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[0]} for use in split"
+              end
+              [i, part[1..-1]]
+            else
+              part
+            end
+          end
+        else
+          split_string = ''
+          split_rule = []
+          rule.split_parts.map do |part|
+            case part
+            when Array #field manipulation
+              unless i = @columns.index(part[0])
+                raise NoColumn, "Table #{@schema}.#{@table} has no column #{part[0]} for use in split"
+              end
+              field = "values[#{i}]"
+              part[1..-1].each do |action|
+                ssize = split_rule.size
+                case action
+                when Range
+                  field << "[split_rule[#{ssize}]]"
+                  split_rule << action
+                when Array # take module
+                  field = "_mod(#{field}, split_rule[#{ssize}], split_rule[#{ssize+1}])"
+                  split_rule.concat action
+                end
+              end
+              split_string << "\#{#{field}}"
+            when String
+              split_string << part
             end
           end
-          split_string << "\#{#{field}}"
+          @split_rule = split_rule
+          eval <<-"EOF"
+            def self.compute_name(split_rule, values)
+              %{#{split_string}}
+            end
+          EOF
         end
-      end
-      if split_string > ''
         @file_name = {}
-        eval <<-"EOF"
-          def self.compute_name(values)
-            %{#{split_string}}
-          end
-        EOF
-        extend ComputeName
       end
       @sort_args = rule.sort_keys.map do |key|
@@ -335,12 +377,21 @@ class SplitPgDump::Table
   end
   def file_name(line)
-    @file_name
+    values = line.split("\t")
+    values.last.chomp!
+    name = compute_name(@split_rule, values)
+    @file_name[name] ||= begin
+      name_strip = name.gsub(/\.\.|\s|\?|\*|'|"/, '_')
+      "#{table_schema}/#{name_strip}.dat"
+    end
   end
   def add_line(line)
-    fname = file_name(line)
+    fname = @split_rule ? file_name(line) : @file_name
     one_file = @files[fname] ||= OneFile.new(@dir, fname)
+    @files_to_flush[one_file] = true  if one_file.cache_size == 0
     one_file.add_line(line)
     @total_cache_size += line.size
     if one_file.cache_size > ONE_FILE_CACHE_SIZE
@@ -351,7 +402,8 @@ class SplitPgDump::Table
   end
   def flush_all
-    @files.each{|name, one_file| one_file.flush}
+    @files_to_flush.each{|one_file, _| one_file.flush }
+    @files_to_flush.clear
     @total_cache_size = 0
   end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: split_pgdump
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 0.4.0
   prerelease:
 platform: ruby
 authors:
@@ -18,12 +18,15 @@ description: ! 'split_pgdump aimed to produce set of small sorted files from one
 email: funny.falcon@gmail.com
 executables:
 - split_pgdump
-extensions: []
+extensions:
+- ext/split_pgdump/extconf.rb
 extra_rdoc_files: []
 files:
 - bin/split_pgdump
 - README
 - lib/split_pgdump.rb
+- ext/split_pgdump/extconf.rb
+- ext/split_pgdump/native_compute_name.c
 homepage: https://github.com/funny-falcon/split_pgdump
 licenses:
 - GPL
@@ -31,6 +34,7 @@ post_install_message:
 rdoc_options: []
 require_paths:
 - lib
+- ext
 required_ruby_version: !ruby/object:Gem::Requirement
   none: false
   requirements: