split_pgdump 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/README +55 -0
  2. data/bin/split_pgdump +340 -0
  3. metadata +52 -0
data/README ADDED
@@ -0,0 +1,55 @@
+ = Tool for splitting a PostgreSQL dump into a set of files
+
+ I wish to use git or mercurial for managing my database history.
+ Unfortunately, every single data change forces them to store the whole dump again.
+ Even if your data has not actually changed, row order is not promised to be stable.
+
+ split_pgdump splits a dump into a set of small sorted files, so that git can track
+ changes only for the data that actually changed.
+
+ It also allows rsync to transmit backup changes over the network effectively.
+
+ == Usage
+
+ Simplest example:
+
+ > pg_dump my_base | split_pgdump
+
+ It produces:
+ `dump.sql` - a file with the schema and psql copy instructions,
+ `dump.sql-tables/#{table}.dat` - 'copy data' for each table in the dump,
+ sorted numerically (I hope the first column is `id`)
+
+ You can change the file name with the `-f` option.
+
+ === Rules
+ Rules are read from the `split.rules` file (this can be changed with the `-r` option).
+ The file may contain a set of lines:
+
+ table_regexp {split:<Split expr>} {sort:<Sort expr>}
+
+ <Split expr> examples:
+ split:$field_name!
+ split:$field_name!_$other_field!
+ split:$client_id%00100!-$id%0025000!
+ split:$some_field[2..-1]!/$other_field[10..30]%0005!
+
+ <Sort expr> is a space separated list of fields, optionally with flags passed to
+ GNU `sort`'s --key parameter (on my machine they are MbdfghinRrV):
+ sort:client_id uid
+ sort:client_id:n id:n
+
+ Example for Redmine's wiki_content_versions:
+
+ wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n
+
+ Either the `split:` or the `sort:` option may be omitted.
+
+ == Author and Copyright
+
+ Copyright (c) 2011 by Sokolov Yura (funny.falcon@gmail.com)
+ Released under the same license terms as Ruby
+
+ == Homepage
+
+ https://github.com/funny-falcon/split_pgdump
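The `%NNN` modifier described above rounds a numeric value down to a multiple of NNN and zero-pads it to the literal's width, so nearby ids land in the same file. A minimal sketch of that documented behavior (not the gem's implementation; `bucket` is a hypothetical helper):

```ruby
# Hypothetical sketch of how a split expr like
# "$client_id%00100!-$id%0025000!" is expected to name files:
# each %NNN buckets the value to a multiple of NNN,
# zero-padded to the width of the literal.
def bucket(value, mod_literal)
  mod = mod_literal.to_i
  format("%0#{mod_literal.size}d", value.to_i / mod * mod)
end

row = { 'client_id' => '1234', 'id' => '567890' }
name = "#{bucket(row['client_id'], '00100')}-#{bucket(row['id'], '0025000')}"
name # => "01200-0550000"
```

All rows with client_id 1200..1299 and id 550000..574999 would thus share one data file, keeping diffs local when only a few rows change.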
data/bin/split_pgdump ADDED
@@ -0,0 +1,340 @@
+ #!/usr/bin/env ruby
+ # vim: set syntax=ruby shiftwidth=2 softtabstop=2 tabstop=8 expandtab
+ require 'optparse'
+ require 'fileutils'
+ require 'strscan'
+
+ $debug = false
+
+ class CWorker
+   attr_accessor :rules_file, :output_file, :sorter, :rules
+   def initialize
+     @rules_file = 'split.rules'
+     @output_file = 'dump.sql'
+     @sorter = `which sort`.chomp
+     @rules = []
+   end
+
+   def tables_dir
+     output_file + '-tables'
+   end
+
+   def clear_files
+     FileUtils.rm_f output_file
+     FileUtils.rm_rf Dir[File.join(tables_dir, '*')]
+     FileUtils.mkdir_p tables_dir
+   end
+
+   def parse_rules
+     if File.exist?(rules_file)
+       File.open(rules_file) do |f|
+         f.each_line do |line|
+           if rule = Rule.parse(line)
+             @rules << rule
+           end
+         end
+       end
+     else
+       puts "NO FILE #{rules_file}" if $debug
+     end
+   end
+
+   def find_rule(table)
+     @rules.find{|rule| table =~ rule.regex}
+   end
+
+   def process_schema_line(out, line)
+     if line =~ /^COPY (\w+) \(([^)]+)\) FROM stdin;/
+       table_name, columns = $1, $2.split(', ')
+       @table = Table.new(tables_dir, table_name, columns)
+       @tables << @table
+       puts "Start to write table #{table_name}" if $debug
+       @state = :table
+     else
+       out.write line
+     end
+   end
+
+   def process_copy_line(out, line)
+     if line =~ /^\\\.[\r\n]/
+       @table.flush_all
+       @table.copy_lines{|l| out.puts l}
+       @table = nil
+       @state = :schema
+     else
+       @table.add_line(line)
+     end
+   end
+
+   def work
+     @state = :schema
+     @table = nil
+     @tables = []
+
+     File.open(output_file, 'w') do |out|
+       STDIN.each_line do |line|
+         case @state
+         when :schema
+           process_schema_line(out, line)
+         when :table
+           process_copy_line(out, line)
+         end
+       end
+     end
+
+     @tables.each{|table| table.finish_all}
+   end
+ end
+
+ Worker = CWorker.new
+
+ class Rule
+   class ParseError < StandardError; end
+
+   attr_reader :regex, :split_parts, :sort_keys
+   def self.parse(line)
+     line = line.sub(%r{(;|#|//).*$},'').strip
+     return if line.empty?
+
+     if line =~ /^(\S+)(?:\s+split:(\S+))?(?:\s+sort:((?:(?:[^\s:]+)(?::[MbdfghinRrV]+)?(?:\s+|\s*$))+))?$/
+       puts "#$1 split:#$2 sort:#$3" if $debug
+       new($1, $2, $3)
+     else
+       raise ParseError, "Wrong rule line #{line}"
+     end
+   end
+
+   def initialize(table_regex, split_expr, sort_keys)
+     @regex = Regexp.new table_regex
+     parse_split_expr(split_expr)
+     parse_sort_keys(sort_keys)
+   end
+
+   def parse_split_expr(split_expr)
+     s = StringScanner.new(split_expr || '')
+     parts = []
+     while !s.eos?
+       # field name ends at '[' (range), '%' (mod) or '!' (terminator)
+       if field = s.scan(/\$[^\[%!]+/)
+         field = field[1..-1]
+         part = {:type => :field, :field => field, :actions => []}
+         while !s.eos?
+           if range = s.scan(/\[[+-]?\d+\.\.\.?[+-]?\d+\]/)
+             part[:actions] << {:range => range}
+           elsif mod = s.scan(/%\d+/)
+             part[:actions] << {:mod => mod[1..-1]}
+           else
+             break
+           end
+         end
+         parts << part
+         if sep = s.scan(/![^$\s#\\]*/)
+           if sep > '!'
+             parts << {:type => :sep, :sep => sep[1..-1]}
+           end
+           next
+         end
+       end
+       raise ParseError, "Wrong format of split expr #{split_expr} (rest: #{s.rest})"
+     end
+     @split_parts = parts
+   end
+
+   def parse_sort_keys(sort_keys)
+     @sort_keys = (sort_keys || '').scan(/([^\s:]+)(?::([MbdfghinRrV]+))?/).map do |key, flags|
+       {:field => key, :flags => flags}
+     end
+   end
+ end
+
+ class Table
+   class NoColumn < StandardError; end
+   ONE_FILE_CACHE_SIZE = 128 * 1024
+   TOTAL_CACHE_SIZE = 5 * 1024 * 1024
+   class OneFile
+     attr_reader :file_name, :cache_size
+     def initialize(dir, name)
+       @file_name = File.join(dir, name)
+       @cache_lines = []
+       @cache_size = 0
+     end
+
+     def add_line(line)
+       @cache_lines << line
+       @cache_size += line.size
+       flush if @cache_size > ONE_FILE_CACHE_SIZE
+     end
+
+     def flush
+       dir = File.dirname(@file_name)
+       unless File.directory?(dir)
+         FileUtils.mkdir_p(dir)
+       end
+       File.open(@file_name, 'a') do |f|
+         @cache_lines.each{|l| f.write(l)}
+       end
+       @cache_lines.clear
+       @cache_size = 0
+     end
+
+     def write_finish
+       File.open(@file_name, 'a') do |f|
+         f.puts('\\.')
+       end
+     end
+
+     def sort(sort_line = [])
+       args = [Worker.sorter]
+       if sort_line && !sort_line.empty?
+         args.concat sort_line
+       else
+         args << '-n'
+       end
+       args.push '-o', @file_name, @file_name
+       puts args.join(' ') if $debug
+       system(*args)
+     end
+   end
+
+   attr_reader :name, :columns, :files, :sort_line
+   def initialize(dir, name, columns)
+     @dir = dir
+     @name = @table = name
+     @columns = columns.map{|c| c.sub(/^"(.+)"$/, '\\1')}
+     if @rule = Worker.find_rule(name)
+       apply_rule
+     else
+       @sort_args = []
+     end
+     @files = {}
+     @total_cache_size = 0
+   end
+
+   def _mod(s, len, mod)
+     "%0#{len}d" % (s.to_i / mod * mod)
+   end
+
+   def apply_rule
+     split_string = ''
+     @rule.split_parts.each do |part|
+       case part[:type]
+       when :sep
+         split_string << part[:sep]
+       when :field
+         i = @columns.find_index(part[:field])
+         raise NoColumn, part[:field] unless i
+         field = "values[#{i}]"
+         part[:actions].each do |action|
+           if action[:mod]
+             mod_s = action[:mod]
+             mod = mod_s.to_i
+             field = "_mod(#{field},#{mod_s.size},#{mod})"
+           elsif action[:range]
+             field << "#{action[:range]}"
+           end
+         end
+         split_string << "\#{#{field}}"
+       end
+     end
+
+     eval <<-"EOF"
+       def self.file_name(values)
+         name = %{#{split_string}}.gsub(/\\.\\.|\\s|\\?|\\*/, '_')
+         "\#@table/\#{name}.dat"
+       end
+     EOF
+
+     @sort_args = @rule.sort_keys.map do |key|
+       i = @columns.find_index(key[:field])
+       raise NoColumn, key[:field] unless i
+       i += 1
+       "--key=#{i},#{i}#{key[:flags]}"
+     end
+   end
+
+   def file_name(values)
+     "#@table.dat"
+   end
+
+   def add_line(line)
+     values = line.chomp.split("\t")
+     fname = file_name(values)
+     one_file = @files[fname] ||= OneFile.new(@dir, fname)
+     @total_cache_size -= one_file.cache_size
+     one_file.add_line(line)
+     @total_cache_size += one_file.cache_size
+     flush_all if @total_cache_size > TOTAL_CACHE_SIZE
+   end
+
+   def flush_all
+     @files.each{|name, one_file| one_file.flush}
+     @total_cache_size = 0
+   end
+
+   def copy_lines
+     if block_given?
+       @files.each do |name, one_file|
+         yield "\\copy #{@table} (#{@columns.join(', ')}) from #{one_file.file_name}"
+       end
+     else
+       to_enum(:copy_lines)
+     end
+   end
+
+   def finish_all
+     @files.each do |name, one_file|
+       one_file.sort(@sort_args)
+       one_file.write_finish
+     end
+   end
+ end
+
+ opts = OptionParser.new do |opts|
+   opts.banner = "\
+ Usage: pg_dump my_base | split_pgdump [-r RULES_FILE] [-f DUMP_FILE] [-s SORT_BIN] [-d]
+
+ split_pgdump is intended to produce a stable set of small files instead of one
+ big dump file. Such a set is suitable as a source for SCM systems, and can be
+ transmitted effectively using rsync, repacked by 7z, and so on.
+
+ "
+
+   opts.separator("Options:")
+
+   opts.on("-r", "--rules=RULES_FILE", "File with rules on table splitting (default 'split.rules')") do |v|
+     Worker.rules_file = v
+   end
+   opts.on("-f", "--file=FILE", "main file name (default 'dump.sql').",
+           "Table content will be stored in the FILE-tables directory") do |v|
+     Worker.output_file = v
+   end
+   opts.on("-s", "--sort=SORT_BIN", "sort executable compatible with GNU coreutils sort") do |v|
+     Worker.sorter = v
+   end
+   opts.on("-d", "--debug", "debug"){|v| $debug = true}
+   opts.on_tail("-h", "--help", "this message"){|v| puts opts; exit}
+
+   opts.on_tail("\
+ Rules file format:
+   table_regexp {split:<Split expr>} {sort:<Sort expr>}
+
+ <Split expr> examples:
+   split:$field_name!
+   split:$field_name!_$other_field!
+   split:$client_id%00100!-$id%0025000!
+   split:$some_field[2..-1]!/$other_field[10..30]%0005!
+
+ <Sort expr> is a space separated list of fields, optionally with flags passed to
+ GNU `sort`'s --key parameter (on my machine they are MbdfghinRrV):
+   sort:client_id uid
+   sort:client_id:n id:n
+
+ Example for Redmine's wiki_content_versions:
+
+   wiki_content_versions split:$page_id%0025!/$id%0000250! sort:page_id:n id:n
+ ")
+
+ end.parse!
+
+ Worker.parse_rules
+ Worker.clear_files
+ Worker.work
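The `sort:field:flags` keys in a rule become 1-based `--key=F,F[flags]` arguments for GNU sort, as `Table#apply_rule` builds them from the table's column order. A standalone sketch of that mapping (the column list here is an assumed example, not taken from the gem):

```ruby
# Sketch of how parsed sort keys map to GNU sort --key arguments,
# mirroring the @sort_args construction in Table#apply_rule.
# Assumed example: a table whose COPY columns are client_id, id, data,
# with the rule "sort:client_id:n id:n" already parsed into hashes.
columns = %w[client_id id data]
sort_keys = [{ field: 'client_id', flags: 'n' }, { field: 'id', flags: 'n' }]

sort_args = sort_keys.map do |key|
  i = columns.index(key[:field])
  raise "no column #{key[:field]}" unless i
  i += 1  # GNU sort field numbers are 1-based
  "--key=#{i},#{i}#{key[:flags]}"
end
sort_args # => ["--key=1,1n", "--key=2,2n"]
```

Sorting each data file with fixed keys is what makes the output stable across dumps, so diffs stay small even when PostgreSQL emits rows in a different order.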
metadata ADDED
@@ -0,0 +1,52 @@
+ --- !ruby/object:Gem::Specification
+ name: split_pgdump
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Sokolov Yura aka funny_falcon
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2011-11-22 00:00:00.000000000 Z
+ dependencies: []
+ description: ! 'split_pgdump aimed to produce set of small sorted files from one big
+   dump file.
+
+ '
+ email: funny.falcon@gmail.com
+ executables:
+ - split_pgdump
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/split_pgdump
+ - README
+ homepage: https://github.com/funny-falcon/split_pgdump
+ licenses:
+ - GPL
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.10
+ signing_key:
+ specification_version: 3
+ summary: split_pgdump is a tool for splitting a postgresql dump into a manageable
+   set of files
+ test_files: []