RubyGems - fastercsv - Versions diffs - 1.4.0 → 1.5.0 - Mend

fastercsv 1.4.0 → 1.5.0

This diff shows the changes between two publicly released versions of this package, exactly as they were published to their public registry. It is provided for informational purposes only.

Files changed (7)

data/CHANGELOG CHANGED Viewed

@@ -2,6 +2,14 @@
 Below is a complete listing of changes for each revision of FasterCSV.
+== 1.5.0
+* The main parser has been rewritten by Timothy Elliott to avoid big input
+  issues with Ruby 1.8's regex engine.  This makes FasterCSV handle more inputs
+  gracefully.
+* FasterCSV will now exit with a notice to upgrade if required in Ruby 1.9.
+* Included a missing file so the tests will run in source packages.
 == 1.4.0
 * Added encoding support patch from Michael Reinsch.

data/Rakefile CHANGED Viewed

@@ -12,8 +12,8 @@ task :default => [:test]
 Rake::TestTask.new do |test|
 	test.libs       << "test"
-	test.test_files = [ "test/ts_all.rb" ]
-	test.verbose    = true
+	test.test_files =  %w[test/ts_all.rb]
+	test.verbose    =  true
 end
 Rake::RDocTask.new do |rdoc|
@@ -45,8 +45,6 @@ task :benchmark do
   path = "test/test_data.csv"
 	sh %Q{time ruby -r csv -e } +
 	   %Q{'#{TESTS}.times { CSV.foreach("#{path}") { |row| } }'}
-	sh %Q{time ruby -r lightcsv -e } +
-	   %Q{'#{TESTS}.times { LightCsv.foreach("#{path}") { |row| } }'}
 	sh %Q{time ruby -r lib/faster_csv -e } +
 	   %Q{'#{TESTS}.times { FasterCSV.foreach("#{path}") { |row| } }'}
 end
@@ -58,12 +56,12 @@ spec = Gem::Specification.new do |spec|
 	spec.platform = Gem::Platform::RUBY
 	spec.summary  = "FasterCSV is CSV, but faster, smaller, and cleaner."
-	spec.test_suite_file = "test/ts_all.rb"
+	spec.test_files      = %w[test/ts_all.rb]
 	spec.files           = Dir.glob("{lib,test,examples}/**/*.rb").
 	                           reject { |item| item.include?(".svn") } +
 	                       Dir.glob("{test,examples}/**/*.csv").
 	                           reject { |item| item.include?(".svn") } +
-	                           ["Rakefile", "setup.rb"]
+	                       %w[Rakefile setup.rb test/line_endings.gz]
 	spec.has_rdoc         = true
 	spec.extra_rdoc_files = %w[ AUTHORS COPYING README INSTALL TODO CHANGELOG

data/lib/faster_csv.rb CHANGED Viewed

@@ -7,6 +7,13 @@
 #
 # See FasterCSV for documentation.
+if RUBY_VERSION >= "1.9"
+  abort <<-VERSION_WARNING.gsub(/^\s+/, "")
+  Please switch to Ruby 1.9's standard CSV library.  It's FasterCSV plus
+  support for Ruby 1.9's m17n encoding engine.
+  VERSION_WARNING
+end
 require "forwardable"
 require "English"
 require "enumerator"
@@ -75,7 +82,7 @@ require "stringio"
 #
 class FasterCSV
   # The version of the installed library.
-  VERSION = "1.4.0".freeze
+  VERSION = "1.5.0".freeze
   #
   # A FasterCSV::Row is part Array and part Hash.  It retains an order for the
@@ -1559,7 +1566,7 @@ class FasterCSV
     end
     # begin with a blank line, so we can always add to it
-    line = ""
+    line = String.new
     #
     # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
@@ -1567,7 +1574,11 @@ class FasterCSV
     #
     loop do
       # add another read to the line
-      line  += @io.gets(@row_sep) rescue return nil
+      begin
+        line  += @io.gets(@row_sep)
+      rescue
+        return nil
+      end
       # copy the line so we can chop it up in parsing
       parse =  line.dup
       parse.sub!(@parsers[:line_end], "")
@@ -1590,41 +1601,37 @@ class FasterCSV
         end
       end
-      #
-      # shave leading empty fields if needed, because the main parser chokes
-      # on these
-      #
-      csv = if parse.sub!(@parsers[:leading_fields], "")
-        [nil] * ($&.length / @col_sep.length)
-      else
-        Array.new
-      end
-      #
-      # then parse the main fields with a hyper-tuned Regexp from
-      # Mastering Regular Expressions, Second Edition
-      #
-      parse.gsub!(@parsers[:csv_row]) do
-        csv << if $1.nil?     # we found an unquoted field
-          if $2.empty?        # switch empty unquoted fields to +nil+...
-            nil               # for CSV compatibility
-          else
-            # I decided to take a strict approach to CSV parsing...
-            if $2.count("\r\n").zero?  # verify correctness of field...
-              $2
-            else
-              # or throw an Exception
-              raise MalformedCSVError, "Unquoted fields do not allow " +
-                                       "\\r or \\n (line #{lineno + 1})."
-            end
+      # parse the fields with a mix of String#split and regular expressions
+      csv           = Array.new
+      current_field = String.new
+      field_quotes  = 0
+      parse.split(@col_sep, -1).each do |match|
+        if current_field.empty? && match.count(@quote_and_newlines).zero?
+          csv           << (match.empty? ? nil : match)
+        elsif(current_field.empty? ? match[0] : current_field[0]) == @quote_char[0]
+          current_field << match
+          field_quotes += match.count(@quote_char)
+          if field_quotes % 2 == 0
+            in_quotes = current_field[@parsers[:quoted_field], 1]
+            raise MalformedCSVError unless in_quotes
+            current_field = in_quotes
+            current_field.gsub!(@quote_char * 2, @quote_char) # unescape contents
+            csv           << current_field
+            current_field =  String.new
+            field_quotes  =  0
+          else # we found a quoted field that spans multiple lines
+            current_field << @col_sep
           end
-        else                  # we found a quoted field...
-          $1.gsub(@quote_char * 2, @quote_char)  # unescape contents
+        elsif match.count("\r\n").zero?
+          raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
+        else
+          raise MalformedCSVError, "Unquoted fields do not allow " +
+                                   "\\r or \\n (line #{lineno + 1})."
         end
-        ""  # gsub!'s replacement, clear the field
       end
       # if parse is empty?(), we found all the fields on the line...
-      if parse.empty?
+      if field_quotes % 2 == 0
         @lineno += 1
         # save fields unconverted fields, if needed...
@@ -1646,9 +1653,7 @@ class FasterCSV
       # if we're not empty?() but at eof?(), a quoted field wasn't closed...
       if @io.eof?
         raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
-      elsif parse =~ @parsers[:bad_field]
-        raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
-      elsif @field_size_limit and parse.length >= @field_size_limit
+      elsif @field_size_limit and current_field.size >= @field_size_limit
         raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
       end
       # otherwise, we need to loop and pull some more data to complete the row
@@ -1697,9 +1702,10 @@ class FasterCSV
   #
   def init_separators(options)
     # store the selected separators
-    @col_sep    = options.delete(:col_sep)
-    @row_sep    = options.delete(:row_sep)
-    @quote_char = options.delete(:quote_char)
+    @col_sep            = options.delete(:col_sep)
+    @row_sep            = options.delete(:row_sep)
+    @quote_char         = options.delete(:quote_char)
+    @quote_and_newlines = "#{@quote_char}\r\n"
     if @quote_char.length != 1
       raise ArgumentError, ":quote_char has to be a single character String"
@@ -1785,31 +1791,12 @@ class FasterCSV
     esc_row_sep = Regexp.escape(@row_sep)
     esc_quote   = Regexp.escape(@quote_char)
     @parsers = {
-      # for empty leading fields
-      :leading_fields => Regexp.new("\\A(?:#{esc_col_sep})+", nil, @encoding),
-      # The Primary Parser
-      :csv_row        => Regexp.new(<<-END_PARSER, Regexp::EXTENDED, @encoding),
-      \\G(?:\\A|#{esc_col_sep})              # anchor the match
-      (?: #{esc_quote}( (?>[^#{esc_quote}]*) # find quoted fields
-                        (?> #{esc_quote*2}
-                            [^#{esc_quote}]* )* )#{esc_quote}
-          |                                  # ... or ...
-          ([^#{esc_quote}#{esc_col_sep}]*)   # unquoted fields
-          )
-      (?=#{esc_col_sep}|\\z)                 # ensure we are at field's end
-      END_PARSER
-      # a test for unescaped quotes
-      :bad_field      => Regexp.new(<<-END_BAD, Regexp::EXTENDED, @encoding),
-      \\A#{esc_col_sep}?                    # starts with an optional comma
-      (?: #{esc_quote} (?>[^#{esc_quote}]*) # an extra quote
-                       (?> #{esc_quote*2}
-                           [^#{esc_quote}]* )*
-                       #{esc_quote}[^#{esc_quote}]
-          |                                 # ... or ...
-          [^#{esc_quote}#{esc_col_sep}]+
-          #{esc_quote}                      # unescaped quote
-          )
-      END_BAD
+      :any_field      => Regexp.new( "[^#{esc_col_sep}]+",
+                                     Regexp::MULTILINE,
+                                     @encoding ),
+      :quoted_field   => Regexp.new( "^#{esc_quote}(.*)#{esc_quote}$",
+                                     Regexp::MULTILINE,
+                                     @encoding ),
       # safer than chomp!()
       :line_end       => Regexp.new("#{esc_row_sep}\\z", nil, @encoding)
     }

data/test/line_endings.gz ADDED Viewed

Binary file

data/test/tc_csv_parsing.rb CHANGED Viewed

@@ -108,6 +108,13 @@ class TestCSVParsing < Test::Unit::TestCase
     #
     assert_equal(Array.new, FasterCSV.parse_line("\n1,2,3\n"))
   end
+  def test_non_regex_edge_cases
+    # An early version of the non-regex parser fails this test
+    [["foo,\"foo,bar,baz,foo\",\"foo\"", ["foo", "foo,bar,baz,foo", "foo"]]].each do |edge_case|
+      assert_equal(edge_case.last, FasterCSV.parse_line(edge_case.first))
+    end
+  end
   def test_malformed_csv
     assert_raise(FasterCSV::MalformedCSVError) do

data/test/tc_interface.rb CHANGED Viewed

@@ -103,6 +103,17 @@ class TestFasterCSVInterface < Test::Unit::TestCase
       assert_equal(nil, csv.shift)
     end
   end
+  def test_long_line # ruby's regex parser may have problems with long rows
+    File.unlink(@path)
+    long_field_length = 2800
+    File.open(@path, "w") do |file|
+      file << "1\t2\t#{'3' * long_field_length}\r\n"
+    end
+    @expected = [%w{1 2} + ['3' * long_field_length]]
+    test_shift
+  end
   ### Test Write Interface ###

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fastercsv
 version: !ruby/object:Gem::Version
-  version: 1.4.0
+  version: 1.5.0
 platform: ruby
 authors:
 - James Edward Gray II
@@ -9,11 +9,15 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-09-10 00:00:00 -05:00
+date: 2009-06-15 00:00:00 -05:00
 default_executable:
 dependencies: []
-description: FasterCSV is intended as a complete replacement to the CSV standard library. It is significantly faster and smaller while still being pure Ruby code. It also strives for a better interface.
+description: |
+  FasterCSV is intended as a complete replacement to the CSV standard library. It
+  is significantly faster and smaller while still being pure Ruby code. It also
+  strives for a better interface.
 email: james@grayproductions.net
 executables: []
@@ -52,6 +56,7 @@ files:
 - examples/purchase.csv
 - Rakefile
 - setup.rb
+- test/line_endings.gz
 - AUTHORS
 - COPYING
 - README
@@ -61,6 +66,8 @@ files:
 - LICENSE
 has_rdoc: true
 homepage: http://fastercsv.rubyforge.org
+licenses: []
 post_install_message:
 rdoc_options:
 - --title
@@ -84,9 +91,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: fastercsv
-rubygems_version: 1.2.0
+rubygems_version: 1.3.4
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: FasterCSV is CSV, but faster, smaller, and cleaner.
 test_files:
 - test/ts_all.rb