data_transport 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/data_transport/data_store/active_record.rb ADDED
@@ -0,0 +1,210 @@
+ module DataTransport
+   class DataStore
+     # Data store that reads and writes records in a database via ActiveRecord.
+     # This class is specifically optimized for reading and writing large
+     # numbers of records, providing a significant advantage over using
+     # ActiveRecord directly.
+     class ActiveRecord < DataStore
+       # There are two ways to initialize this data store. The first is by
+       # specifying one of your ActiveRecord models:
+       #
+       #   DataTransport::DataStore::ActiveRecord.new :class => MyModel
+       #
+       # The second is by providing an ActiveRecord database specification (as
+       # read from database.yml, for example) and a table name:
+       #
+       #   db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
+       #   DataTransport::DataStore::ActiveRecord.new(
+       #     :connection => db_spec,
+       #     :table_name => "sprockets"
+       #   )
+       #
+       # The second form is useful for importing or exporting data in non-Rails
+       # applications.
+       #
+       # In addition, the following options are accepted:
+       #
+       # conditions::     Conditions describing which records to read. This can
+       #                  be anything that ActiveRecord will recognize, such as
+       #                  a hash table, an array with substitutions, or raw SQL.
+       #                  Default is nil (no conditions, read all records).
+       # truncate::       If true, the table will be truncated before any
+       #                  records are written. On databases that support it,
+       #                  this is performed by executing a TRUNCATE TABLE query;
+       #                  all other databases use ActiveRecord's delete_all
+       #                  method.
+       # ignore_errors::  If true, errors that occur during record insertion
+       #                  will be ignored. This is useful if your table has a
+       #                  unique index and you want to silently drop records
+       #                  with duplicate keys. Currently this only works on
+       #                  MySQL. Default is false.
+       # max_sql_length:: Maximum permissible length of an SQL query, in bytes.
+       #                  Rows to be inserted are buffered until the largest
+       #                  possible INSERT statement has been generated, at which
+       #                  point the statement is executed and a new INSERT
+       #                  statement begins. The default value varies depending
+       #                  on what type of database you're connected to. With
+       #                  SQLite, the default is 1,000,000. With MySQL, the
+       #                  default is the value of the +max_allowed_packet+
+       #                  variable minus 512. With all other databases, the
+       #                  default is 16,777,216.
+       def initialize(options = {})
+         super()
+         # Extract options.
+         @class = options.delete(:class)
+         @connection = options.delete(:connection)
+         @table_name = options.delete(:table_name)
+         @conditions = options.delete(:conditions)
+         @truncate = options.delete(:truncate)
+         @ignore_errors = options.delete(:ignore_errors)
+         @max_sql_length = options.delete(:max_sql_length)
+         # Make sure a class or connection and table name was provided.
+         if @class.nil? && (@connection.nil? || @table_name.nil?)
+           raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
+         end
+         raise(TypeError, "class must be a class") if @class && !@class.is_a?(Class)
+         # If connection specs were provided instead of a class, make an
+         # anonymous ActiveRecord subclass.
+         unless @class
+           @class = Class.new(::ActiveRecord::Base)
+           @class.set_table_name @table_name
+           @class.establish_connection @connection
+         end
+         # Make sure the class descends from ActiveRecord::Base.
+         klass = @class.superclass
+         is_active_record = false
+         while klass
+           if klass == ::ActiveRecord::Base
+             is_active_record = true
+             break
+           end
+           klass = klass.superclass
+         end
+         raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
+         # If ignore_errors is true, make sure we're connected to a MySQL
+         # database. We don't use is_a? because if the MySQL adapter isn't
+         # loaded, referencing its class throws a NameError.
+         if @ignore_errors
+           unless @class.connection.class.to_s ==
+                  "ActiveRecord::ConnectionAdapters::MysqlAdapter"
+             raise ArgumentError, "ignore_errors can only be used with a MySQL database"
+           end
+         end
+         # Check for unknown options.
+         unless options.empty?
+           raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
+         end
+         # Figure out how much data the database can handle in one query. See
+         # the note above in the ignore_errors compatibility check about using
+         # stringified class names.
+         if @max_sql_length
+           @max_sql_length = @max_sql_length.to_i
+         else
+           case @class.connection.class.to_s
+           when "ActiveRecord::ConnectionAdapters::MysqlAdapter"
+             rows = @class.connection.select_all("SHOW VARIABLES LIKE 'max_allowed_packet'")
+             @max_sql_length = rows.first["Value"].to_i - 512
+           when /\AActiveRecord::ConnectionAdapters::SQLite3?Adapter\Z/
+             @max_sql_length = 1_000_000
+           else
+             @max_sql_length = 16_777_216
+           end
+         end
+         # Fetch column information
+         @columns = {}
+         @class.columns.each {|c| @columns[c.name.to_sym] = c}
+       end
+
+       def klass # :nodoc:
+         @class
+       end
+
+       # Returns the number of records in the table that match the data store's
+       # conditions.
+       def count
+         @class.count(:conditions => @conditions)
+       end
+
+       def each_record(batch_size = nil) # :nodoc:
+         conn = @class.connection
+         column_names = conn.columns(@class.table_name).collect {|c| c.name}
+
+         offset = 0
+         record = {}
+         base_query = "SELECT * FROM #{conn.quote_table_name(@class.table_name)}"
+         @class.send(:add_conditions!, base_query, @conditions) unless @conditions.nil?
+         while true
+           sql = base_query.dup
+           conn.add_limit_offset!(sql, :limit => batch_size, :offset => offset)
+           offset += batch_size
+           rows = conn.select_rows(sql)
+           break if rows.empty?
+           rows.each do |row|
+             record.clear
+             column_names.each_with_index do |column_name, i|
+               column_name = column_name.to_sym
+               record[column_name] = @columns[column_name].type_cast(row[i])
+             end
+             yield record
+           end
+         end
+       end
+
+       def write_record(record) # :nodoc:
+         conn = @class.connection
+         # If no SQL has been produced yet, start an INSERT statement.
+         @sql_buffer ||= start_insert_sql(record)
+         # Convert the record into a string of quoted values.
+         values = []
+         record.each {|k, v| values << conn.quote(v, @columns[k])}
+         values = "(#{values.join ","}),"
+         # Write the record.
+         if @max_sql_length.nil?
+           # We have no information on the database's maximum allowed packet
+           # size, so it's safest to write the record immediately.
+           @sql_buffer << values
+           finalize
+         elsif @sql_buffer.length + values.length > @max_sql_length
+           # Appending this record to the SQL buffer will exceed the maximum
+           # allowed packet size. Send the buffer to the database and start a
+           # new statement with this record.
+           finalize
+           @sql_buffer = start_insert_sql(record)
+           @sql_buffer << values
+         else
+           # This record will not cause the SQL buffer to exceed the maximum
+           # allowed packet size. Append it to the SQL buffer.
+           @sql_buffer << values
+         end
+       end
+
+       def finalize # :nodoc:
+         if @truncate
+           conn = @class.connection
+           begin
+             conn.execute("TRUNCATE TABLE #{conn.quote_table_name(@class.table_name)}")
+           rescue
+             @class.delete_all
+           end
+           @truncate = false
+         end
+         if @sql_buffer && @sql_buffer[-1,1] == ","
+           @sql_buffer.chop!
+           @class.connection.execute(@sql_buffer)
+         end
+       end
+
+       def reset # :nodoc:
+         @sql_buffer = nil
+       end
+
+       private
+
+       def start_insert_sql(record)
+         "INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
+           "#{@class.connection.quote_table_name(@class.table_name)} " +
+           "(#{record.keys.join ","}) VALUES "
+       end
+     end
+   end
+ end
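For orientation, here is a minimal sketch of how the options documented above might be combined. The Product model and the condition are hypothetical; only :class (or :connection plus :table_name) is actually required.

  store = DataTransport::DataStore::ActiveRecord.new(
    :class         => Product,                    # hypothetical ActiveRecord model
    :conditions    => {:discontinued => false},   # applies when reading
    :truncate      => true,                       # empty the table before writing
    :ignore_errors => true                        # MySQL only: silently drop duplicate-key rows
  )
  store.count   # number of rows matching :conditions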
lib/data_transport/data_store/csv_file.rb ADDED
@@ -0,0 +1,25 @@
+ module DataTransport
+   class DataStore
+     # Identical to the File data store, except that it is preconfigured to read
+     # and write CSV files.
+     class CSVFile < File
+
+       # Accepts the same options as the File data store, except that the
+       # following options have different defaults:
+       #
+       # delimiter:: ","
+       # enclosure:: "\""
+       # escape:: :double
+       #
+       # These defaults describe the CSV format.
+       def initialize(options = {})
+         super({
+           :delimiter => ",",
+           :enclosure => "\"",
+           :escape => :double
+         }.merge(options))
+       end
+
+     end
+   end
+ end
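Since CSVFile only changes the File defaults, constructing one is a matter of pointing it at a path; the file name below is hypothetical.

  csv = DataTransport::DataStore::CSVFile.new(
    :path   => "products.csv",   # hypothetical file
    :header => true              # first row holds the field names
  )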
lib/data_transport/data_store/file.rb ADDED
@@ -0,0 +1,186 @@
+ module DataTransport
+   class DataStore
+     # Data store that reads and writes records in a flat text file.
+     #
+     # Although this class can read and write CSV files, you should use the
+     # CSVFile data store for that instead of this one.
+     class File < DataStore
+       attr_reader :mode # :nodoc:
+
+       # Accepts the following options:
+       #
+       # header::    If true, the file has a header row that contains the names
+       #             of each field. Default is false.
+       # delimiter:: String that separates individual fields in a row. Default
+       #             is "\t".
+       # enclosure:: String that encloses individual fields. For example, if
+       #             this is set to "\"", fields will be enclosed in double
+       #             quotes. Default is nil (no enclosure).
+       # escape::    Escape sequence for occurrences of the enclosure string in
+       #             field values. Set this to the special value :double if
+       #             enclosure characters are escaped by doubling them (like in
+       #             CSV and SQL). Default is nil.
+       # path::      Path to the file.
+       # null::      String that represents fields whose value is nil (but not
+       #             blank). Default is "".
+       # keys::      Array of field names. Not necessary for files with a header
+       #             row. Default for files without a header row is fieldXX,
+       #             where XX is numbered sequentially starting from 00.
+       def initialize(options = {})
+         super()
+         # Extract options.
+         @header = options.delete(:header)
+         @delimiter = options.delete(:delimiter) || "\t"
+         @enclosure = options.delete(:enclosure)
+         @escape = options.delete(:escape)
+         @path = options.delete(:path)
+         @null = options.delete(:null) || ""
+         @keys = options.delete(:keys)
+         # Validate options.
+         raise(ArgumentError, "missing required option `path'") if @path.nil?
+         if @escape && @enclosure.nil?
+           raise(ArgumentError, "`escape' cannot be used without `enclosure'")
+         end
+         unless options.empty?
+           raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
+         end
+         # Handle the special :double escape sequence.
+         @escape = @enclosure if @escape == :double
+         # Create an enclosure placeholder, which is used to avoid clobbering
+         # escaped enclosure characters during parsing.
+         if @escape
+           if @enclosure == 0.chr
+             safe_ch = 1.chr
+           else
+             safe_ch = 0.chr
+           end
+           @placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
+         end
+       end
+
+       # Returns the number of lines in the file (not counting the header, if
+       # there is one).
+       def count
+         return @count if @count
+         self.mode = :input
+         line_count = 0
+         rewind_and_restore do
+           io.readline if @header
+           until io.eof?
+             io.gets
+             line_count += 1
+           end
+         end
+         @count = line_count
+       end
+
+       def each_record(batch_size = nil) # :nodoc:
+         self.mode = :input
+         io.rewind
+         io.readline if @header
+         until io.eof?
+           line = io.gets || break
+           line.chomp!
+           values = values_from_s(line)
+           if keys.length != values.length
+             raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
+           end
+           record = {}
+           keys.length.times {|i| record[keys[i]] = values[i]}
+           yield record
+         end
+       end
+
+       def write_record(record) # :nodoc:
+         self.mode = :output
+         # If no key order was ever specified, make one up.
+         @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
+         # Write the header if this is the first record.
+         if @header && io.pos == 0
+           io.puts(values_to_s(keys))
+         end
+         # Write the values in a predictable order.
+         values = keys.collect do |k|
+           record[k].nil? ? @null : record[k]
+         end
+         io.puts(values_to_s(values))
+       end
+
+       private
+
+       def values_to_s(values)
+         if @escape
+           values = values.collect do |v|
+             @enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
+           end
+         elsif @enclosure
+           values = values.collect {|v| @enclosure + v.to_s + @enclosure}
+         end
+         values.join(@delimiter)
+       end
+
+       def values_from_s(str)
+         if @escape
+           str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
+           values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
+           values.first.sub!(/^#{@enclosure}/, "")
+           values.last.sub!(/#{@enclosure}$/, "")
+           values.each do |v|
+             v.gsub!(/#{@placeholder}/, @enclosure)
+           end
+         elsif @enclosure
+           values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
+           values.first.sub!(/^#{@enclosure}/, "")
+           values.last.sub!(/#{@enclosure}$/, "")
+         else
+           values = str.split(/#{@delimiter}/)
+         end
+         values
+       end
+
+       def mode=(new_mode)
+         if !@mode.nil? && @mode != new_mode
+           raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
+         end
+         unless [:input, :output].include?(new_mode)
+           raise ArgumentError, "unknown mode `#{new_mode}'"
+         end
+         @mode = new_mode
+       end
+
+       def io
+         return @io if @io
+         if mode == :output
+           @io = ::File.open(@path, "w")
+           @io.rewind
+           @io
+         else
+           @io = ::File.open(@path, "r")
+         end
+       end
+
+       def keys
+         return @keys if @keys
+         return [] if mode == :output
+         line = rewind_and_restore { io.readline }
+         line.chomp!
+         fields = values_from_s(line)
+         if @header
+           @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
+         else
+           @keys = (0..(fields.length - 1)).to_a.collect! do |i|
+             sprintf("field%02d", i).to_sym
+           end
+         end
+       end
+
+       def rewind_and_restore
+         pos = io.pos
+         io.rewind
+         result = yield
+         io.seek(pos)
+         result
+       end
+     end
+   end
+ end
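As a rough illustration of the options documented above, this sketch builds a pipe-delimited store with quoted fields; the path and field names are hypothetical.

  flat = DataTransport::DataStore::File.new(
    :path      => "products.txt",        # hypothetical path
    :delimiter => "|",
    :enclosure => "\"",
    :escape    => :double,               # enclosure characters are doubled, CSV/SQL style
    :null      => "NULL",
    :keys      => [:id, :name, :price]   # needed when there is no header row
  )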
lib/data_transport/data_store.rb ADDED
@@ -0,0 +1,27 @@
+ require "data_transport/data_store/file"
+ require "data_transport/data_store/csv_file"
+ require "data_transport/data_store/active_record"
+
+ module DataTransport
+   class DataStore # :nodoc:
+     def count
+       raise NotImplementedError
+     end
+
+     def each_record(batch_size = nil)
+       raise NotImplementedError
+     end
+
+     def write_record(record)
+       raise NotImplementedError
+     end
+
+     def finalize
+       # Do nothing by default.
+     end
+
+     def reset
+       # Do nothing by default.
+     end
+   end
+ end
lib/data_transport/record/destination.rb ADDED
@@ -0,0 +1,28 @@
+ module DataTransport
+   module Record # :nodoc:
+     class Destination # :nodoc:
+       attr_reader :record
+
+       def initialize
+         @record = {}
+       end
+
+       def reset!
+         @record.clear
+       end
+
+       def method_missing(name, *args)
+         name_s = name.to_s
+         if name_s[-1,1] == "="
+           unless args.length == 1
+             raise ArgumentError, "wrong number of arguments (#{args.length} for 1)"
+           end
+           name_s.chop!
+           @record[name_s.to_sym] = args.first
+         else
+           super
+         end
+       end
+     end
+   end
+ end
lib/data_transport/record/source.rb ADDED
@@ -0,0 +1,24 @@
+ module DataTransport
+   module Record # :nodoc:
+     class Source # :nodoc:
+       def record=(record)
+         @record = record
+       end
+
+       def id
+         method_missing :id
+       end
+
+       def method_missing(name, *args)
+         if @record.has_key?(name)
+           unless args.empty?
+             raise ArgumentError, "wrong number of arguments (#{args.length} for 0)"
+           end
+           @record[name]
+         else
+           super
+         end
+       end
+     end
+   end
+ end
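These two small Record classes are what give the map block its attribute-style accessors: Source turns src.some_field into a lookup in the current record hash, and Destination turns dst.some_field = value into an assignment. A quick illustration with hypothetical field names:

  src = DataTransport::Record::Source.new
  src.record = {:name => "Widget", :price => 1.50}
  src.name     # => "Widget"

  dst = DataTransport::Record::Destination.new
  dst.name = "widget"
  dst.record   # => {:name => "widget"}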
lib/data_transport.rb ADDED
@@ -0,0 +1,52 @@
+ require "data_transport/data_store"
+ require "data_transport/record/source"
+ require "data_transport/record/destination"
+
+ module DataTransport
+   DEFAULT_BATCH_SIZE = 100_000 # :nodoc:
+
+   # Reads records from an input data source, processes them with the supplied
+   # block, and writes them to an output data source. Accepts the following
+   # options:
+   #
+   # batch_size:: Records are read from the input in batches. This option sets
+   #              the number of records in a single batch. Default is 100,000.
+   #
+   # The block is passed two objects that represent the source and destination
+   # record. These objects have methods that reflect the attributes of the
+   # records. The following example reads the +name+ and +price+ attributes from
+   # input records, downcases the name, multiplies the price by 100, and writes
+   # them to the output:
+   #
+   #   # input = DataTransport::DataStore:: ...
+   #   # output = DataTransport::DataStore:: ...
+   #
+   #   DataTransport.map(input, output) do |src, dst|
+   #     dst.name = src.name.downcase
+   #     dst.price = (src.price * 100).to_i
+   #   end
+   #
+   # The destination doesn't necessarily have to have the same attributes as the
+   # source (or even the same number of attributes). The transformations that
+   # can be accomplished are limited only by what you can do in a block of Ruby.
+   def self.map(input, output, options = {}, &block)
+     # Extract options.
+     batch_size = options.delete(:batch_size) || DEFAULT_BATCH_SIZE
+     raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
+     raise(RangeError, "batch size must be greater than zero") if batch_size < 1
+     unless options.empty?
+       raise(ArgumentError, "unrecognized options: `#{options.keys.join("', `")}'")
+     end
+     # Run the transport.
+     output.reset
+     source = DataTransport::Record::Source.new
+     destination = DataTransport::Record::Destination.new
+     input.each_record(batch_size) do |record|
+       source.record = record
+       destination.reset!
+       yield source, destination
+       output.write_record(destination.record)
+     end
+     output.finalize
+   end
+ end
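Putting the pieces together, a sketch of a complete transport from a CSV file into a database table; the file name, Product model, and field names are hypothetical.

  input  = DataTransport::DataStore::CSVFile.new(:path => "products.csv", :header => true)
  output = DataTransport::DataStore::ActiveRecord.new(:class => Product, :truncate => true)

  DataTransport.map(input, output, :batch_size => 10_000) do |src, dst|
    dst.name  = src.name.strip
    dst.price = (src.price.to_f * 100).to_i
  end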
metadata ADDED
@@ -0,0 +1,68 @@
+ --- !ruby/object:Gem::Specification
+ name: data_transport
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 3
+   - 3
+   version: 0.3.3
+ platform: ruby
+ authors:
+ - Dana Contreras
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-03-27 00:00:00 -04:00
+ default_executable:
+ dependencies: []
+
+ description:
+ email:
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/data_transport.rb
+ - lib/data_transport/data_store.rb
+ - lib/data_transport/data_store/active_record.rb
+ - lib/data_transport/data_store/csv_file.rb
+ - lib/data_transport/data_store/file.rb
+ - lib/data_transport/record/destination.rb
+ - lib/data_transport/record/source.rb
+ has_rdoc: true
+ homepage: http://github.com/DanaDanger/data_transport
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: A gem for importing and exporting large quantities of data.
+ test_files: []
+