RubyGems - fastercsv - Versions diffs - 1.4.0 → 1.5.0 - Mend

fastercsv 1.4.0 → 1.5.0

Files changed (7) hide show

data/CHANGELOG CHANGED Viewed

@@ -2,6 +2,14 @@
 Below is a complete listing of changes for each revision of FasterCSV.
+== 1.5.0
+* The main parser has been rewritten by Timothy Elliott to avoid big input
+  issues with Ruby 1.8's regex engine.  This makes FasterCSV handle more inputs
+  gracefully.
+* FasterCSV will now exit with a notice to upgrade if required in Ruby 1.9.
+* Included a missing file so the tests will run in source packages.
 == 1.4.0
 * Added encoding support patch from Michael Reinsch.

data/Rakefile CHANGED Viewed

@@ -12,8 +12,8 @@ task :default => [:test]
 Rake::TestTask.new do |test|
 	test.libs       << "test"
-	test.test_files = [ "test/ts_all.rb" ]
-	test.verbose    = true
+	test.test_files =  %w[test/ts_all.rb]
+	test.verbose    =  true
 end
 Rake::RDocTask.new do |rdoc|
@@ -45,8 +45,6 @@ task :benchmark do
   path = "test/test_data.csv"
 	sh %Q{time ruby -r csv -e } +
 	   %Q{'#{TESTS}.times { CSV.foreach("#{path}") { |row| } }'}
-	sh %Q{time ruby -r lightcsv -e } +
-	   %Q{'#{TESTS}.times { LightCsv.foreach("#{path}") { |row| } }'}
 	sh %Q{time ruby -r lib/faster_csv -e } +
 	   %Q{'#{TESTS}.times { FasterCSV.foreach("#{path}") { |row| } }'}
 end
@@ -58,12 +56,12 @@ spec = Gem::Specification.new do |spec|
 	spec.platform = Gem::Platform::RUBY
 	spec.summary  = "FasterCSV is CSV, but faster, smaller, and cleaner."
-	spec.test_suite_file = "test/ts_all.rb"
+	spec.test_files      = %w[test/ts_all.rb]
 	spec.files           = Dir.glob("{lib,test,examples}/**/*.rb").
 	                           reject { |item| item.include?(".svn") } +
 	                       Dir.glob("{test,examples}/**/*.csv").
 	                           reject { |item| item.include?(".svn") } +
-	                           ["Rakefile", "setup.rb"]
+	                       %w[Rakefile setup.rb test/line_endings.gz]
 	spec.has_rdoc         = true
 	spec.extra_rdoc_files = %w[ AUTHORS COPYING README INSTALL TODO CHANGELOG

data/lib/faster_csv.rb CHANGED Viewed

@@ -7,6 +7,13 @@
 #
 # See FasterCSV for documentation.
+if RUBY_VERSION >= "1.9"
+  abort <<-VERSION_WARNING.gsub(/^\s+/, "")
+  Please switch to Ruby 1.9's standard CSV library.  It's FasterCSV plus
+  support for Ruby 1.9's m17n encoding engine.
+  VERSION_WARNING
+end
 require "forwardable"
 require "English"
 require "enumerator"
@@ -75,7 +82,7 @@ require "stringio"
 #
 class FasterCSV
   # The version of the installed library.
-  VERSION = "1.4.0".freeze
+  VERSION = "1.5.0".freeze
   #
   # A FasterCSV::Row is part Array and part Hash.  It retains an order for the
@@ -1559,7 +1566,7 @@ class FasterCSV
     end
     # begin with a blank line, so we can always add to it
-    line = ""
+    line = String.new
     #
     # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
@@ -1567,7 +1574,11 @@ class FasterCSV
     #
     loop do
       # add another read to the line
-      line  += @io.gets(@row_sep) rescue return nil
+      begin
+        line  += @io.gets(@row_sep)
+      rescue
+        return nil
+      end
       # copy the line so we can chop it up in parsing
       parse =  line.dup
       parse.sub!(@parsers[:line_end], "")
@@ -1590,41 +1601,37 @@ class FasterCSV
         end
       end
-      #
-      # shave leading empty fields if needed, because the main parser chokes
-      # on these
-      #
-      csv = if parse.sub!(@parsers[:leading_fields], "")
-        [nil] * ($&.length / @col_sep.length)
-      else
-        Array.new
-      end
-      #
-      # then parse the main fields with a hyper-tuned Regexp from
-      # Mastering Regular Expressions, Second Edition
-      #
-      parse.gsub!(@parsers[:csv_row]) do
-        csv << if $1.nil?     # we found an unquoted field
-          if $2.empty?        # switch empty unquoted fields to +nil+...
-            nil               # for CSV compatibility
-          else
-            # I decided to take a strict approach to CSV parsing...
-            if $2.count("\r\n").zero?  # verify correctness of field...
-              $2
-            else
-              # or throw an Exception
-              raise MalformedCSVError, "Unquoted fields do not allow " +
-                                       "\\r or \\n (line #{lineno + 1})."
-            end
+      # parse the fields with a mix of String#split and regular expressions
+      csv           = Array.new
+      current_field = String.new
+      field_quotes  = 0
+      parse.split(@col_sep, -1).each do |match|
+        if current_field.empty? && match.count(@quote_and_newlines).zero?
+          csv           << (match.empty? ? nil : match)
+        elsif(current_field.empty? ? match[0] : current_field[0]) == @quote_char[0]
+          current_field << match
+          field_quotes += match.count(@quote_char)
+          if field_quotes % 2 == 0
+            in_quotes = current_field[@parsers[:quoted_field], 1]
+            raise MalformedCSVError unless in_quotes
+            current_field = in_quotes
+            current_field.gsub!(@quote_char * 2, @quote_char) # unescape contents
+            csv           << current_field
+            current_field =  String.new
+            field_quotes  =  0
+          else # quoted field still open: put back the @col_sep that split() removed
+            current_field << @col_sep
           end
-        else                  # we found a quoted field...
-          $1.gsub(@quote_char * 2, @quote_char)  # unescape contents
+        elsif match.count("\r\n").zero?
+          raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
+        else
+          raise MalformedCSVError, "Unquoted fields do not allow " +
+                                   "\\r or \\n (line #{lineno + 1})."
         end
-        ""  # gsub!'s replacement, clear the field
       end
       # if no quoted field is still open, we found all the fields on the line...
-      if parse.empty?
+      if field_quotes % 2 == 0
         @lineno += 1
         # save unconverted fields, if needed...
@@ -1646,9 +1653,7 @@ class FasterCSV
       # if a quoted field is still open at eof?(), it was never closed...
       if @io.eof?
         raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
-      elsif parse =~ @parsers[:bad_field]
-        raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
-      elsif @field_size_limit and parse.length >= @field_size_limit
+      elsif @field_size_limit and current_field.size >= @field_size_limit
         raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
       end
       # otherwise, we need to loop and pull some more data to complete the row
@@ -1697,9 +1702,10 @@ class FasterCSV
   #
   def init_separators(options)
     # store the selected separators
-    @col_sep    = options.delete(:col_sep)
-    @row_sep    = options.delete(:row_sep)
-    @quote_char = options.delete(:quote_char)
+    @col_sep            = options.delete(:col_sep)
+    @row_sep            = options.delete(:row_sep)
+    @quote_char         = options.delete(:quote_char)
+    @quote_and_newlines = "#{@quote_char}\r\n"
     if @quote_char.length != 1
       raise ArgumentError, ":quote_char has to be a single character String"
@@ -1785,31 +1791,12 @@ class FasterCSV
     esc_row_sep = Regexp.escape(@row_sep)
     esc_quote   = Regexp.escape(@quote_char)
     @parsers = {
-      # for empty leading fields
-      :leading_fields => Regexp.new("\\A(?:#{esc_col_sep})+", nil, @encoding),
-      # The Primary Parser
-      :csv_row        => Regexp.new(<<-END_PARSER, Regexp::EXTENDED, @encoding),
-      \\G(?:\\A|#{esc_col_sep})              # anchor the match
-      (?: #{esc_quote}( (?>[^#{esc_quote}]*) # find quoted fields
-                        (?> #{esc_quote*2}
-                            [^#{esc_quote}]* )* )#{esc_quote}
-          |                                  # ... or ...
-          ([^#{esc_quote}#{esc_col_sep}]*)   # unquoted fields
-          )
-      (?=#{esc_col_sep}|\\z)                 # ensure we are at field's end
-      END_PARSER
-      # a test for unescaped quotes
-      :bad_field      => Regexp.new(<<-END_BAD, Regexp::EXTENDED, @encoding),
-      \\A#{esc_col_sep}?                    # starts with an optional comma
-      (?: #{esc_quote} (?>[^#{esc_quote}]*) # an extra quote
-                       (?> #{esc_quote*2}
-                           [^#{esc_quote}]* )*
-                       #{esc_quote}[^#{esc_quote}]
-          |                                 # ... or ...
-          [^#{esc_quote}#{esc_col_sep}]+
-          #{esc_quote}                      # unescaped quote
-          )
-      END_BAD
+      :any_field      => Regexp.new( "[^#{esc_col_sep}]+",
+                                     Regexp::MULTILINE,
+                                     @encoding ),
+      :quoted_field   => Regexp.new( "^#{esc_quote}(.*)#{esc_quote}$",
+                                     Regexp::MULTILINE,
+                                     @encoding ),
       # safer than chomp!()
       :line_end       => Regexp.new("#{esc_row_sep}\\z", nil, @encoding)
     }

data/test/line_endings.gz ADDED Viewed

Binary file

data/test/tc_csv_parsing.rb CHANGED Viewed

@@ -108,6 +108,13 @@ class TestCSVParsing < Test::Unit::TestCase
     #
     assert_equal(Array.new, FasterCSV.parse_line("\n1,2,3\n"))
   end
+  def test_non_regex_edge_cases
+    # An early version of the non-regex parser fails this test
+    [["foo,\"foo,bar,baz,foo\",\"foo\"", ["foo", "foo,bar,baz,foo", "foo"]]].each do |edge_case|
+      assert_equal(edge_case.last, FasterCSV.parse_line(edge_case.first))
+    end
+  end
   def test_malformed_csv
     assert_raise(FasterCSV::MalformedCSVError) do

data/test/tc_interface.rb CHANGED Viewed

@@ -103,6 +103,17 @@ class TestFasterCSVInterface < Test::Unit::TestCase
       assert_equal(nil, csv.shift)
     end
   end
+  def test_long_line # ruby's regex parser may have problems with long rows
+    File.unlink(@path)
+    long_field_length = 2800
+    File.open(@path, "w") do |file|
+      file << "1\t2\t#{'3' * long_field_length}\r\n"
+    end
+    @expected = [%w{1 2} + ['3' * long_field_length]]
+    test_shift
+  end
   ### Test Write Interface ###

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fastercsv
 version: !ruby/object:Gem::Version
-  version: 1.4.0
+  version: 1.5.0
 platform: ruby
 authors:
 - James Edward Gray II
@@ -9,11 +9,15 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-09-10 00:00:00 -05:00
+date: 2009-06-15 00:00:00 -05:00
 default_executable:
 dependencies: []
-description: FasterCSV is intended as a complete replacement to the CSV standard library. It is significantly faster and smaller while still being pure Ruby code. It also strives for a better interface.
+description: |
+  FasterCSV is intended as a complete replacement to the CSV standard library. It
+  is significantly faster and smaller while still being pure Ruby code. It also
+  strives for a better interface.
 email: james@grayproductions.net
 executables: []
@@ -52,6 +56,7 @@ files:
 - examples/purchase.csv
 - Rakefile
 - setup.rb
+- test/line_endings.gz
 - AUTHORS
 - COPYING
 - README
@@ -61,6 +66,8 @@ files:
 - LICENSE
 has_rdoc: true
 homepage: http://fastercsv.rubyforge.org
+licenses: []
 post_install_message:
 rdoc_options:
 - --title
@@ -84,9 +91,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: fastercsv
-rubygems_version: 1.2.0
+rubygems_version: 1.3.4
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: FasterCSV is CSV, but faster, smaller, and cleaner.
 test_files:
 - test/ts_all.rb