RubyGems - DanaDanger-data_transport - Versions diffs - 0.1.1 → 0.2 - Mend

DanaDanger-data_transport 0.1.1 → 0.2

Files changed (8) hide show

data/lib/data_transport.rb +36 -14
data/lib/data_transport/data_store.rb +4 -8
data/lib/data_transport/data_store/active_record.rb +61 -17
data/lib/data_transport/data_store/file.rb +82 -13
data/lib/data_transport/record/destination.rb +2 -2
data/lib/data_transport/record/source.rb +2 -2
metadata +2 -3
data/lib/data_transport/map.rb +0 -21

data/lib/data_transport.rb CHANGED Viewed

@@ -1,29 +1,51 @@
-require "data_transport/map"
 require "data_transport/data_store"
+require "data_transport/record/source"
+require "data_transport/record/destination"
 module DataTransport
-  def self.default_batch_size
-    1000
-  end
+  DEFAULT_BATCH_SIZE = 1000 # :nodoc:
+  # Reads records from an input data source, processes them with the supplied
+  # block, and writes them to an output data source. Accepts the following
+  # options:
+  #
+  # batch_size:: Records are read from the input in batches. This option sets
+  #              the number of records in a single batch. Default is 1000.
+  #
+  # The block is passed two objects that represent the source and destination
+  # record. These objects have methods that reflect the attributes of the
+  # records. The following example reads the +name+ and +price+ attributes from
+  # input records, downcases the name, multiplies the price by 100, and writes
+  # them to the output:
+  #
+  #   # input  = DataTransport::DataStore:: ...
+  #   # output = DataTransport::DataStore:: ...
+  #
+  #   DataTransport.map(input, output) do |src, dst|
+  #     dst.name  = src.name.downcase
+  #     dst.price = (src.price * 100).to_i
+  #   end
+  #
+  # The destination doesn't necessarily have to have the same attributes as the
+  # source (or even the same number of attributes). The transformations that
+  # can be accomplished are limited only by what you can do in a block of Ruby.
   def self.map(input, output, options = {}, &block)
     # Extract options.
-    ignore_duplicates = options.delete(:ignore_duplicates)
+    batch_size = options[:batch_size] || DEFAULT_BATCH_SIZE
+    raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
+    raise(RangeError, "batch size must be greater than zero") if batch_size < 1
     unless options.empty?
       raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
     end
-    # If ignore_duplicates is true, make sure the output is a MySQL database.
-    if ignore_duplicates
-      unless output.is_a?(DataStore::ActiveRecord) && output.klass.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
-        raise ArgumentError, "ignore_duplicates can only be used with an ActiveRecord data store connected to a MySQL database"
-      end
-    end
     # Run the transport.
     output.reset
-    output.ignore_duplicates = true if ignore_duplicates
-    map = DataTransport::Map.new(&block)
+    source = DataTransport::Record::Source.new
+    destination = DataTransport::Record::Destination.new
     input.each_record do |record|
-      output.write_record(map.map(record))
+      source.record = record
+      destination.reset!
+      yield source, destination
+      output.write_record(destination.record)
     end
     output.finalize
   end

data/lib/data_transport/data_store.rb CHANGED Viewed

@@ -1,8 +1,9 @@
 require "data_transport/data_store/file"
+require "data_transport/data_store/csv_file"
 require "data_transport/data_store/active_record"
 module DataTransport
-  class DataStore
+  class DataStore # :nodoc:
     def count
       raise NotImplementedError
     end
@@ -19,13 +20,8 @@ module DataTransport
       # Do nothing by default.
     end
-  protected
-    def check_batch_size(batch_size)
-      batch_size ||= DataTransport.default_batch_size
-      raise(TypeError, "batch size must be an integer") unless batch_size.is_a?(Integer)
-      raise(RangeError, "batch size must be greater than zero") if batch_size < 1
-      batch_size
+    def reset
+      # Do nothing by default.
     end
   end
 end

data/lib/data_transport/data_store/active_record.rb CHANGED Viewed

@@ -1,16 +1,54 @@
 module DataTransport
   class DataStore
+    # Data store that reads and writes records in a database via ActiveRecord.
+    # This class is specifically optimized for reading and writing large
+    # numbers of records, providing a significant advantage over using
+    # ActiveRecord directly.
+    #
+    # On MySQL databases, records are written in batches of the largest size
+    # possible instead of being inserted one by one.
     class ActiveRecord < DataStore
-      attr_accessor :ignore_duplicates
+      # There are two ways to initialize this data store. The first is by
+      # specifying one of your ActiveRecord models:
+      #
+      #   DataTransport::DataStore::ActiveRecord.new :class => MyModel
+      #
+      # The second is by providing an ActiveRecord database specification (as
+      # read from database.yml, for example) and a table name:
+      #
+      #   db_spec = ActiveRecord::Base.configurations["other_app_#{RAILS_ENV}"]
+      #   DataTransport::DataStore::ActiveRecord.new(
+      #     :connection => db_spec,
+      #     :table_name => "sprockets"
+      #   )
+      #
+      # The second form is useful for importing or exporting data in non-Rails
+      # applications.
+      #
+      # In addition, the following options are accepted:
+      #
+      # conditions::    Conditions describing which records to read. This can
+      #                 be anything that ActiveRecord will recognize, such as
+      #                 a hash table, an array with substitutions, or raw SQL.
+      #                 Default is nil (no conditions, read all records).
+      # truncate::      If true, the table will be truncated before any records
+      #                 are written. On MySQL databases, this is performed by
+      #                 executing a TRUNCATE TABLE query; all other databases
+      #                 use ActiveRecord's delete_all method.
+      # ignore_errors:: If true, errors that occur during record insertion will
+      #                 be ignored. This is useful if your table has a unique
+      #                 index and you want to silently drop records with
+      #                 duplicate keys. Currently this only works on MySQL.
+      #                 Default is false.
       def initialize(options = {})
         super()
         # Extract options.
-        @class      = options.delete(:class)
-        @connection = options.delete(:connection)
-        @table_name = options.delete(:table_name)
-        @conditions = options.delete(:conditions)
-        @truncate   = options.delete(:truncate)
+        @class         = options.delete(:class)
+        @connection    = options.delete(:connection)
+        @table_name    = options.delete(:table_name)
+        @conditions    = options.delete(:conditions)
+        @truncate      = options.delete(:truncate)
+        @ignore_errors = options.delete(:ignore_errors)
         # Make sure a class or connection and table name was provided.
         if @class.nil? && (@connection.nil? || @table_name.nil?)
           raise(ArgumentError, "missing required option `class', or `connection' and `table_name'")
@@ -34,6 +72,13 @@ module DataTransport
           klass = klass.superclass
         end
         raise(TypeError, "class must descend from ActiveRecord::Base") unless is_active_record
+        # If ignore_errors is true, make sure we're connected to a MySQL
+        # database.
+        if @ignore_errors
+          unless @class.connection.is_a?(::ActiveRecord::ConnectionAdapters::MysqlAdapter)
+            raise ArgumentError, "ignore_errors can only be used with a MySQL database"
+          end
+        end
         # Check for unknown options.
         unless options.empty?
           raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
@@ -50,17 +95,17 @@ module DataTransport
         @class.columns.each {|c| @columns[c.name.to_sym] = c}
       end
-      def klass
+      def klass # :nodoc:
         @class
       end
+      # Returns the number of records in the table that match the data store's
+      # conditions.
       def count
         @class.count(:conditions => @conditions)
       end
-      def each_record(batch_size = nil)
-        batch_size = check_batch_size(batch_size)
+      def each_record(batch_size) # :nodoc:
         conn = @class.connection
         column_names = conn.columns(@class.table_name).collect {|c| c.name}
@@ -85,7 +130,7 @@ module DataTransport
         end
       end
-      def write_record(record)
+      def write_record(record) # :nodoc:
         conn = @class.connection
         # If no SQL has been produced yet, start an INSERT statement.
         @sql_buffer ||= start_insert_sql(record)
@@ -113,7 +158,7 @@ module DataTransport
         end
       end
-      def finalize
+      def finalize # :nodoc:
         if @truncate
           conn = @class.connection
           begin
@@ -123,21 +168,20 @@ module DataTransport
           end
           @truncate = false
         end
-        if @sql_buffer[-1,1] == ","
+        if @sql_buffer && @sql_buffer[-1,1] == ","
           @sql_buffer.chop!
           @class.connection.execute(@sql_buffer)
         end
       end
-      def reset
-        self.ignore_duplicates = false
+      def reset # :nodoc:
         @sql_buffer = nil
       end
     private
       def start_insert_sql(record)
-        "INSERT #{ignore_duplicates ? "IGNORE " : " "}INTO " +
+        "INSERT #{@ignore_errors ? "IGNORE " : " "}INTO " +
           "#{@class.connection.quote_table_name(@class.table_name)} " +
           "(#{record.keys.join ","}) VALUES "
       end

data/lib/data_transport/data_store/file.rb CHANGED Viewed

@@ -1,23 +1,65 @@
 module DataTransport
   class DataStore
+    # Data store that reads and writes records in a flat text file.
+    #
+    # Although this class can read and write CSV files, you should use the
+    # CSVFile data store for that instead of this one.
     class File < DataStore
-      attr_reader :mode
+      attr_reader :mode # :nodoc:
+      # Accepts the following options:
+      #
+      # header::    If true, the file has a header row that contains the names
+      #             of each field. Default is false.
+      # delimiter:: String that separates individual fields in a row. Default
+      #             is "\t".
+      # enclosure:: String that encloses individual fields. For example, if
+      #             this is set to "\"", fields will be enclosed in double
+      #             quotes. Default is nil (no enclosure).
+      # escape::    Escape sequence for occurrences of the enclosure string in
+      #             field values. Set this to the special value :double if
+      #             enclosure characters are escaped by doubling them (like in
+      #             CSV and SQL). Default is nil.
+      # path::      Path to the file.
+      # null::      String that represents fields whose value is nil (but not
+      #             blank). Default is "".
+      # keys::      Array of field names. Not necessary for files with a header
+      #             row. Default for files without a header row is fieldXX,
+      #             where XX is numbered sequentially starting from 00.
       def initialize(options = {})
         super()
+        # Extract options.
         @header    = options.delete(:header)
         @delimiter = options.delete(:delimiter) || "\t"
+        @enclosure = options.delete(:enclosure)
+        @escape    = options.delete(:escape)
         @path      = options.delete(:path)
         @null      = options.delete(:null) || ""
         @keys      = options.delete(:keys)
+        # Validate options.
         raise(ArgumentError, "missing required option `path'") if @path.nil?
+        if @escape && @enclosure.nil?
+          raise(ArgumentError, "`escape' cannot be used without `enclosure'")
+        end
         unless options.empty?
           raise(ArgumentError, "unrecognized options: `#{options.join("', `")}'")
         end
+        # Handle the special :double escape sequence.
+        @escape = @enclosure if @escape == :double
+        # Create an enclosure placeholder, which is used to avoid clobbering
+        # escaped enclosure characters during parsing.
+        if @escape
+          if @enclosure == 0.chr
+            safe_ch = 1.chr
+          else
+            safe_ch = 0.chr
+          end
+          @placeholder = "#{safe_ch}__ENCLOSURE_PLACEHOLDER__#{safe_ch}"
+        end
       end
+      # Returns the number of lines in the file (not counting the header, if
+      # there is one).
       def count
         return @count if @count
         self.mode = :input
@@ -32,17 +74,14 @@ module DataTransport
         @count = line_count
       end
-      def each_record(batch_size = nil)
+      def each_record(batch_size = nil) # :nodoc:
         self.mode = :input
-        batch_size = check_batch_size(batch_size)
         io.rewind
         io.readline if @header
         until io.eof?
           line = io.gets || break
           line.chomp!
-          values = line.split(/#{@delimiter}/)
+          values = values_from_s(line)
           if keys.length != values.length
             raise RuntimeError, "wrong number of fields (#{values.length} for #{keys.length})"
           end
@@ -52,23 +91,53 @@ module DataTransport
         end
       end
-      def write_record(record)
+      def write_record(record) # :nodoc:
         self.mode = :output
         # If no key order was ever specified, make one up.
         @keys ||= record.keys.sort {|a,b| a.to_s <=> b.to_s}
         # Write the header if this is the first record.
         if @header && io.pos == 0
-          io.puts(keys.join(@delimiter))
+          io.puts(values_to_s(keys))
         end
         # Write the values in a predictable order.
         values = keys.collect do |k|
           record[k].nil?? @null : record[k]
         end
-        io.puts(values.join(@delimiter))
+        io.puts(values_to_s(values))
       end
     private
+      def values_to_s(values)
+        if @escape
+          values = values.collect do |v|
+            @enclosure + v.to_s.gsub(/#{@enclosure}/, @escape + @enclosure) + @enclosure
+          end
+        elsif @enclosure
+          values = values.collect {|v| @enclosure + v.to_s + @enclosure}
+        end
+        values.join(@delimiter)
+      end
+      def values_from_s(str)
+        if @escape
+          str = str.gsub(/#{@escape}#{@enclosure}/, @placeholder)
+          values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
+          values.first.sub!(/^#{@enclosure}/, "")
+          values.last.sub!(/#{@enclosure}$/, "")
+          values.each do |v|
+            v.gsub!(/#{@placeholder}/, @enclosure)
+          end
+        elsif @enclosure
+          values = str.split(/#{@enclosure + @delimiter + @enclosure}/)
+          values.first.sub!(/^#{@enclosure}/, "")
+          values.last.sub!(/#{@enclosure}$/, "")
+        else
+          values = str.split(/#{@delimiter}/)
+        end
+        values
+      end
       def mode=(new_mode)
         if !@mode.nil? && @mode != new_mode
           raise RuntimeError, "can't switch mode from #{@mode} to #{new_mode}"
@@ -95,12 +164,12 @@ module DataTransport
         return [] if mode == :output
         line = rewind_and_restore { io.readline }
         line.chomp!
-        fields = line.split(/#{@delimiter}/)
+        fields = values_from_s(line)
         if @header
           @keys = fields.collect! {|hdr| hdr.downcase.to_sym}
         else
           @keys = (0..(fields.length - 1)).to_a.collect! do |i|
-            sprintf("column%02d", i).to_sym
+            sprintf("field%02d", i).to_sym
           end
         end
       end

data/lib/data_transport/record/destination.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module DataTransport
-  module Record
-    class Destination
+  module Record # :nodoc:
+    class Destination # :nodoc:
       attr_reader :record
       def initialize

data/lib/data_transport/record/source.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module DataTransport
-  module Record
-    class Source
+  module Record # :nodoc:
+    class Source # :nodoc:
       def record=(record)
         @record = record
       end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: DanaDanger-data_transport
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: "0.2"
 platform: ruby
 authors:
 - Dana Danger
@@ -23,13 +23,12 @@ extra_rdoc_files: []
 files:
 - lib/data_transport.rb
-- lib/data_transport/map.rb
 - lib/data_transport/data_store.rb
 - lib/data_transport/data_store/active_record.rb
 - lib/data_transport/data_store/file.rb
 - lib/data_transport/record/destination.rb
 - lib/data_transport/record/source.rb
-has_rdoc: false
+has_rdoc: true
 homepage: http://github.com/DanaDanger/data_transport
 post_install_message:
 rdoc_options: []

data/lib/data_transport/map.rb DELETED Viewed

@@ -1,21 +0,0 @@
-require "data_transport/record/source"
-require "data_transport/record/destination"
-module DataTransport
-  class Map
-    attr_reader :source, :destination
-    def initialize(&block)
-      @block = block
-      @source = DataTransport::Record::Source.new
-      @destination = DataTransport::Record::Destination.new
-    end
-    def map(record)
-      @source.record = record
-      @destination.reset!
-      @block.call(@source, @destination)
-      @destination.record
-    end
-  end
-end