RubyGems - fastest-csv - Versions diffs - 0.0.1 → 0.0.4 - Mend

fastest-csv 0.0.1 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

data/.gitignore +3 -0
data/README.md +2 -2
data/Rakefile +20 -0
data/ext/csv_parser/CsvParser.java +89 -0
data/ext/csv_parser/CsvParserService.java +115 -0
data/ext/csv_parser/extconf.rb +7 -1
data/ext/csv_parser/parser.c +21 -20
data/fastest-csv.gemspec +10 -3
data/lib/fastest-csv/version.rb +1 -1
data/lib/fastest_csv.rb +42 -8
data/test/tc_csv_parsing.rb +11 -7
data/test/tc_interface.rb +9 -0
metadata +24 -6

data/.gitignore CHANGED

@@ -3,6 +3,7 @@
 .bundle
 .config
 .yardoc
+.DS_Store
 Gemfile.lock
 InstalledFiles
 _yardoc
@@ -15,3 +16,5 @@ spec/reports
 test/tmp
 test/version_tmp
 tmp
+lib/*.bundle
+lib/*.jar

data/README.md CHANGED

@@ -1,8 +1,8 @@
 # FastestCSV
-Fastest CSV class for MRI Ruby. Faster than faster_csv and fasterer-csv.
+Fastest CSV class for MRI Ruby and JRuby. Faster than faster_csv and fasterer-csv.
-Uses native C code to parse CSV lines. Not (yet) compatible with JRuby.
+Uses native C code to parse CSV lines in MRI Ruby and Java in JRuby.
 Supports standard CSV according to RFC4180. Not the so-called "csv" from Excel.

data/Rakefile CHANGED

@@ -1,2 +1,22 @@
 #!/usr/bin/env rake
 require "bundler/gem_tasks"
+# Load the gemspec so the extension tasks below can be configured from it
+spec = Gem::Specification.load('fastest-csv.gemspec')
+# Register the 'csv_parser' extension task: the Java extension task when
+# RUBY_PLATFORM matches /java/ (JRuby), the regular extension task otherwise
+if RUBY_PLATFORM =~ /java/
+  require 'rake/javaextensiontask'
+  Rake::JavaExtensionTask.new('csv_parser', spec)
+else
+  require 'rake/extensiontask'
+  Rake::ExtensionTask.new('csv_parser', spec)
+end
+# Test task: adds "test" to the load path and runs every test/tc_*.rb file
+require 'rake/testtask'
+Rake::TestTask.new do |t|
+  t.libs << "test"
+  t.test_files = FileList['test/tc_*.rb']
+  #test.libs << 'lib' << 'test'
+  #test.pattern = 'test/**/test_*.rb'
+  #test.verbose = true
+end

data/ext/csv_parser/CsvParser.java ADDED

@@ -0,0 +1,89 @@
+//
+// Copyright (c) Maarten Oelering, BrightCode BV
+//
+package org.brightcode;
+import java.util.ArrayList;
+import java.util.List;
+/**
+ * CSV line parser implemented as a small state machine with three states:
+ * outside quotes, inside a quoted field, and "quote just seen inside a
+ * quoted field" (which is either the closing quote or the first half of an
+ * escaped quote).
+ */
+public class CsvParser {
+    private static final int DEF_ARRAY_LEN = 32;      // initial capacity of the result list
+    private static final int UNQUOTED = 0;            // outside a quoted field
+    private static final int IN_QUOTED = 1;           // inside a quoted field
+    private static final int QUOTE_IN_QUOTED = 2;     // saw '"' while inside a quoted field
+    /**
+     * Parses the first CSV record contained in {@code line}.
+     *
+     * An empty unquoted field is returned as {@code null}; an empty quoted
+     * field ({@code ""}) is returned as an empty string. Parsing stops at the
+     * first CR or LF that is not inside a quoted field. If the input ends
+     * while still inside a quoted field, that unterminated field is not added.
+     *
+     * @param line text to parse
+     * @return list of field values, or {@code null} when {@code line} is empty
+     */
+    public static List parseLine(String line) {
+        int length = line.length();
+        if (length == 0)
+            return null;
+        int state = UNQUOTED;
+        StringBuilder value = new StringBuilder(length);   // field value, no longer than line
+        List<String> array = new ArrayList<String>(DEF_ARRAY_LEN);
+        for (int i = 0; i < length; i++) {
+            char c = line.charAt(i);
+            switch (c) {
+                case ',':
+                    if (state == UNQUOTED) {
+                        if (value.length() == 0) {
+                            array.add(null);
+                        }
+                        else {
+                            array.add(value.toString());
+                            value.setLength(0);
+                        }
+                    }
+                    else if (state == IN_QUOTED) {
+                        value.append(c);   // comma is data inside quotes
+                    }
+                    else if (state == QUOTE_IN_QUOTED) {
+                        // previous quote closed the field; keep empty strings as ""
+                        array.add(value.toString());
+                        value.setLength(0);
+                        state = UNQUOTED;
+                    }
+                    break;
+                case '"':
+                    if (state == UNQUOTED) {
+                        state = IN_QUOTED;
+                    }
+                    else if (state == IN_QUOTED) {
+                        state = QUOTE_IN_QUOTED;
+                    }
+                    else if (state == QUOTE_IN_QUOTED) {
+                        value.append(c);   // escaped quote
+                        state = IN_QUOTED;
+                    }
+                    break;
+                case '\r':
+                case '\n':
+                    if (state == IN_QUOTED) {
+                        value.append(c);   // line break is data inside quotes
+                    }
+                    else {
+                        i = length;  // only parse first line if multiline
+                    }
+                    break;
+                default:
+                    value.append(c);
+                    break;
+            }
+        }
+        // flush the last field; nothing is added when still IN_QUOTED
+        if (state == UNQUOTED) {
+            if (value.length() == 0) {
+                array.add(null);
+            }
+            else {
+                array.add(value.toString());
+                value.setLength(0);
+            }
+        }
+        else if (state == QUOTE_IN_QUOTED) {
+            array.add(value.toString());
+        }
+        return array;
+    }
+}

data/ext/csv_parser/CsvParserService.java ADDED

@@ -0,0 +1,115 @@
+//
+// Copyright (c) Maarten Oelering, BrightCode BV
+//
+package org.brightcode;
+import java.io.IOException;
+import org.jruby.Ruby;
+import org.jruby.RubyArray;
+import org.jruby.RubyModule;
+import org.jruby.RubyString;
+import org.jruby.runtime.Block;
+import org.jruby.runtime.CallbackFactory;
+import org.jruby.runtime.builtin.IRubyObject;
+import org.jruby.runtime.load.BasicLibraryService;
+/**
+ * JRuby library service that defines a Ruby module <tt>CsvParser</tt> with a
+ * <tt>parse_line</tt> module function backed by {@link #parseLine}.
+ */
+public class CsvParserService implements BasicLibraryService {
+    private static final int DEF_ARRAY_LEN = 32;      // initial capacity of the result array
+    private static final int UNQUOTED = 0;            // outside a quoted field
+    private static final int IN_QUOTED = 1;           // inside a quoted field
+    private static final int QUOTE_IN_QUOTED = 2;     // saw '"' while inside a quoted field
+    // Initial setup function. Takes a reference to the current JRuby runtime and
+    // sets up our modules.
+    public boolean basicLoad(Ruby runtime) throws IOException {
+        RubyModule mCsvParser = runtime.defineModule("CsvParser");
+        // TODO: CallbackFactory#getSingletonMethod is deprecated
+        CallbackFactory callbackFactory = runtime.callbackFactory(CsvParserService.class);
+        mCsvParser.defineModuleFunction("parse_line",
+            callbackFactory.getSingletonMethod("parseLine", RubyString.class));
+        return true;
+    }
+    /**
+     * Parses the first CSV record contained in {@code line}.
+     *
+     * An empty unquoted field becomes nil; an empty quoted field becomes an
+     * empty string. Parsing stops at the first CR or LF that is not inside a
+     * quoted field. If the input ends while still inside a quoted field, that
+     * unterminated field is not added.
+     *
+     * @param recv receiver; only used to obtain the runtime
+     * @param line text to parse
+     * @param unusedBlock ignored
+     * @return a Ruby array of fields, or nil when {@code line} is empty
+     */
+    public static IRubyObject parseLine(IRubyObject recv, RubyString line, Block unusedBlock) {
+        Ruby runtime = recv.getRuntime();
+        CharSequence seq = line.getValue();
+        int length = seq.length();
+        if (length == 0)
+            return runtime.getNil();
+        int state = UNQUOTED;
+        StringBuilder value = new StringBuilder(length);   // field value, no longer than line
+        RubyArray array = RubyArray.newArray(runtime, DEF_ARRAY_LEN);
+        for (int i = 0; i < length; i++) {
+            char c = seq.charAt(i);
+            switch (c) {
+                case ',':
+                    if (state == UNQUOTED) {
+                        if (value.length() == 0) {
+                            array.append(runtime.getNil());
+                        }
+                        else {
+                            array.append(RubyString.newString(runtime, value));
+                            value.setLength(0);
+                        }
+                    }
+                    else if (state == IN_QUOTED) {
+                        value.append(c);   // comma is data inside quotes
+                    }
+                    else if (state == QUOTE_IN_QUOTED) {
+                        // previous quote closed the field; keep empty strings as ""
+                        array.append(RubyString.newString(runtime, value));
+                        value.setLength(0);
+                        state = UNQUOTED;
+                    }
+                    break;
+                case '"':
+                    if (state == UNQUOTED) {
+                        state = IN_QUOTED;
+                    }
+                    else if (state == IN_QUOTED) {
+                        state = QUOTE_IN_QUOTED;
+                    }
+                    else if (state == QUOTE_IN_QUOTED) {
+                        value.append(c);   // escaped quote
+                        state = IN_QUOTED;
+                    }
+                    break;
+                case '\r':
+                case '\n':
+                    if (state == IN_QUOTED) {
+                        value.append(c);   // line break is data inside quotes
+                    }
+                    else {
+                        i = length;  // only parse first line if multiline
+                    }
+                    break;
+                default:
+                    value.append(c);
+                    break;
+            }
+        }
+        // flush the last field; nothing is added when still IN_QUOTED
+        if (state == UNQUOTED) {
+            if (value.length() == 0) {
+                array.append(runtime.getNil());
+            }
+            else {
+                array.append(RubyString.newString(runtime, value));
+                value.setLength(0);
+            }
+        }
+        else if (state == QUOTE_IN_QUOTED) {
+            array.append(RubyString.newString(runtime, value));
+        }
+        return array;
+    }
+}

data/ext/csv_parser/extconf.rb CHANGED

@@ -1,9 +1,15 @@
 #!/usr/bin/ruby -w
 require 'mkmf'
+extension_name = 'csv_parser'
+#dir_config(extension_name)
-if RUBY_VERSION =~ /1.8/ then
+# Define RUBY_18 only on Ruby 1.8.x. The pattern is anchored and the dots are
+# escaped so that versions such as "2.1.8" are not mistaken for 1.8.
+if RUBY_VERSION =~ /\A1\.8\./ then
   $CPPFLAGS += " -DRUBY_18"
 end
-create_makefile('csv_parser')
+#if CONFIG["arch"] =~ /mswin32|mingw/
+#  $CFLAGS += " -march=i686"
+#end
+create_makefile(extension_name)

data/ext/csv_parser/parser.c CHANGED

@@ -9,10 +9,13 @@
   #include "ruby/io.h"
 #endif
-/* default allocated size is 16 */
 #define DEF_ARRAY_LEN 32
-static VALUE cFastestCSV;
+#define UNQUOTED 0
+#define IN_QUOTED 1
+#define QUOTE_IN_QUOTED 2
+static VALUE mCsvParser;
 static VALUE parse_line(VALUE self, VALUE str)
 {
@@ -25,7 +28,7 @@ static VALUE parse_line(VALUE self, VALUE str)
     if (len == 0)
         return Qnil;
-    VALUE array = rb_ary_new2(DEF_ARRAY_LEN);
+    VALUE array = rb_ary_new2(DEF_ARRAY_LEN); /* default allocated size is 16 */
     char value[len];  /* field value, no longer than line */
     int state = 0;
     int index = 0;
@@ -37,51 +40,49 @@ static VALUE parse_line(VALUE self, VALUE str)
         switch (c)
         {
             case ',':
-                if (state == 0) {
+                if (state == UNQUOTED) {
                     rb_ary_push(array, (index == 0 ? Qnil: rb_str_new(value, index)));
                     index = 0;
                 }
-                else if (state == 1) {
+                else if (state == IN_QUOTED) {
                     value[index++] = c;
                 }
-                else if (state == 2) {
+                else if (state == QUOTE_IN_QUOTED) {
                     rb_ary_push(array, rb_str_new(value, index));
                     index = 0;
-                    state = 0;  /* outside quoted */
+                    state = UNQUOTED;
                 }
                 break;
             case '"':
-                if (state == 0) {
-                    state = 1;  /* in quoted */
+                if (state == UNQUOTED) {
+                    state = IN_QUOTED;
                 }
                 else if (state == 1) {
-                    state = 2;  /* quote in quoted */
+                    state = QUOTE_IN_QUOTED;
                 }
-                else if (state == 2) {
+                else if (state == QUOTE_IN_QUOTED) {
                     value[index++] = c;  /* escaped quote */
-                    state = 1;  /* in quoted */
+                    state = IN_QUOTED;
                 }
                 break;
             case 13:  /* \r */
             case 10:  /* \n */
-                if (state == 1) { /* quoted */
+                if (state == IN_QUOTED) {
                     value[index++] = c;
                 }
                 else {
-                    /* only do first line */
-                    i = len;
+                    i = len;  /* only parse first line if multiline */
                 }
-                /* else eat it ??? or return so far */
                 break;
             default:
                 value[index++] = c;
         }
     }
-    if (state == 0) {
+    if (state == UNQUOTED) {
         rb_ary_push(array, (index == 0 ? Qnil: rb_str_new(value, index)));
     }
-    else if (state == 2) {
+    else if (state == QUOTE_IN_QUOTED) {
         rb_ary_push(array, rb_str_new(value, index));
     }
     return array;
@@ -89,6 +90,6 @@ static VALUE parse_line(VALUE self, VALUE str)
 void Init_csv_parser()
 {
-    cFastestCSV = rb_define_class("FastestCSV", rb_cObject);
-    rb_define_singleton_method(cFastestCSV, "parse_line", parse_line, 1);
+    mCsvParser = rb_define_module("CsvParser");
+    rb_define_module_function(mCsvParser, "parse_line", parse_line, 1);
 }

data/fastest-csv.gemspec CHANGED

@@ -4,8 +4,8 @@ require File.expand_path('../lib/fastest-csv/version', __FILE__)
 Gem::Specification.new do |gem|
   gem.authors       = ["Maarten Oelering"]
   gem.email         = ["maarten@brightcode.nl"]
-  gem.description   = %q{Fastest standard CSV parser for MRI Ruby}
-  gem.summary       = %q{Fastest standard CSV parser for MRI Ruby}
+  gem.description   = %q{Fastest standard CSV parser for MRI Ruby and JRuby}
+  gem.summary       = %q{Fastest standard CSV parser for MRI Ruby and JRuby}
   gem.homepage      = "https://github.com/brightcode/fastest-csv"
   gem.files         = `git ls-files`.split($\)
@@ -14,6 +14,13 @@ Gem::Specification.new do |gem|
   gem.name          = "fastest-csv"
   gem.require_paths = ["lib"]
   gem.version       = FastestCSV::VERSION
+  if RUBY_PLATFORM =~ /java/
+    gem.platform = "java"
+    gem.files << "lib/csv_parser.jar"
+  else
+    gem.extensions  = ['ext/csv_parser/extconf.rb']
+  end
-  gem.extensions    = ['ext/csv_parser/extconf.rb']
+  gem.add_development_dependency "rake-compiler"
 end

data/lib/fastest-csv/version.rb CHANGED

@@ -1,3 +1,3 @@
 class FastestCSV
+  # Gem version string; fastest-csv.gemspec reads it to set gem.version
-  VERSION = "0.0.1"
+  VERSION = "0.0.4"
 end

data/lib/fastest_csv.rb CHANGED

@@ -1,17 +1,30 @@
+# This loads either csv_parser.so, csv_parser.bundle or
+# csv_parser.jar, depending on your Ruby platform and OS
 require 'csv_parser'
 require 'stringio'
+# Fast CSV parser using native code
 class FastestCSV
+  include Enumerable
-  # This method opens an accounting file and passes each record to the provided +block+.
+  if RUBY_PLATFORM =~ /java/
+    if JRUBY_VERSION =~ /^1\.[0-6]/
+      require 'jruby'
+      org.brightcode.CsvParserService.new.basicLoad(JRuby.runtime)
+    else
+      include_package "org.brightcode"
+    end
+  end
+  # Pass each line of the specified +path+ as array to the provided +block+
   def self.foreach(path, &block)
     open(path) do |reader|
       reader.each(&block)
     end
   end
-  # This method opens a csv file. It will pass a Reader object to the provided block,
-  # or return a Reader object when no block is provided.
+  # Opens a csv file. Pass a FastestCSV instance to the provided block,
+  # or return it when no block is provided
   def self.open(path, mode = "rb")
     csv = new(File.open(path, mode))
     if block_given?
@@ -25,14 +38,17 @@ class FastestCSV
     end
   end
+  # Read all lines from the specified +path+ into an array of arrays
   def self.read(path)
     open(path, "rb") { |csv| csv.read }
   end
+  # Alias for FastestCSV.read
   def self.readlines(path)
     read(path)
   end
+  # Read all lines from the specified String into an array of arrays
   def self.parse(data, &block)
     csv = new(StringIO.new(data))
     if block.nil?
@@ -45,27 +61,44 @@ class FastestCSV
       csv.each(&block)
     end
   end
+  # Parse a single CSV line into an array of fields by delegating to
+  # CsvParser.parse_line
+  def self.parse_line(line)
+    CsvParser.parse_line(line)
+  end
+  # Create new FastestCSV wrapping the specified IO object
   def initialize(io)
     @io = io
   end
+  # Read from the wrapped IO passing each line as array to the specified block
   def each
-    while row = shift
-      yield row
+    if block_given?
+      while row = shift
+        yield row
+      end
+    else
+      to_enum # return enumerator
     end
   end
+  # Read all remaining lines from the wrapped IO into an array of arrays
   def read
     table = Array.new
     each {|row| table << row}
     table
   end
   alias_method :readlines, :read
+  # Rewind the wrapped IO object so that reading starts again from its beginning
+  def rewind
+    @io.rewind
+  end
+  # Read next line from the wrapped IO and return as array or nil at EOF
   def shift
     if line = @io.gets
-      FastestCSV.parse_line(line)
+      CsvParser.parse_line(line)
     else
       nil
     end
@@ -73,6 +106,7 @@ class FastestCSV
   alias_method :gets,     :shift
   alias_method :readline, :shift
+  # Close the wrapped IO
   def close
     @io.close
   end
@@ -83,9 +117,9 @@ class FastestCSV
 end
 class String
-  # Equivalent to <tt>FasterCSV::parse_line(self, options)</tt>.
+  # Equivalent to <tt>FastestCSV.parse_line(self)</tt>.
+  # Goes through FastestCSV rather than referencing CsvParser directly: on
+  # JRuby the CsvParser constant may only be resolvable inside FastestCSV
+  # (via include_package), not from String.
   def parse_csv
     FastestCSV.parse_line(self)
   end
 end

data/test/tc_csv_parsing.rb CHANGED

@@ -13,11 +13,15 @@ require 'fastest_csv'
 #
 class TestCSVParsing < Test::Unit::TestCase
+  if RUBY_PLATFORM =~ /java/
+    include_package "org.brightcode"
+  end
   def test_mastering_regex_example
     ex = %Q{Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K}
     assert_equal( [ "Ten Thousand", "10000", " 2710 ", nil, "10,000",
                     "It's \"10 Grand\", baby", "10K" ],
-                  FastestCSV.parse_line(ex) )
+                  CsvParser.parse_line(ex) )
   end
   # Pulled from:  http://www.ruby-lang.org/cgi-bin/cvsweb.cgi/ruby/test/csv/test_csv.rb?rev=1.12.2.2;content-type=text%2Fplain
@@ -49,7 +53,7 @@ class TestCSVParsing < Test::Unit::TestCase
       ["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]],
       ["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]],
       [";,;", [";", ";"]] ].each do |csv_test|
-      assert_equal(csv_test.last, FastestCSV.parse_line(csv_test.first))
+      assert_equal(csv_test.last, CsvParser.parse_line(csv_test.first))
     end
     [ ["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]],
@@ -68,7 +72,7 @@ class TestCSVParsing < Test::Unit::TestCase
       ["foo,bar", ["foo", "bar"]],
       ["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]],
       ["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]] ].each do |csv_test|
-      assert_equal(csv_test.last, FastestCSV.parse_line(csv_test.first))
+      assert_equal(csv_test.last, CsvParser.parse_line(csv_test.first))
      end
   end
@@ -91,20 +95,20 @@ class TestCSVParsing < Test::Unit::TestCase
       [%Q{,"\r"},             [nil,"\r"]],
       [%Q{"\r\n,"},           ["\r\n,"]],
       [%Q{"\r\n,",},          ["\r\n,", nil]] ].each do |edge_case|
-        assert_equal(edge_case.last, FastestCSV.parse_line(edge_case.first))
+        assert_equal(edge_case.last, CsvParser.parse_line(edge_case.first))
       end
   end
   def test_james_edge_cases
     # A read at eof? should return nil.
-    assert_equal(nil, FastestCSV.parse_line(""))
+    assert_equal(nil, CsvParser.parse_line(""))
     #
     # With CSV it's impossible to tell an empty line from a line containing a
     # single +nil+ field.  The standard CSV library returns <tt>[nil]</tt>
     # in these cases, but <tt>Array.new</tt> makes more sense to me.
     #
     #assert_equal(Array.new, FastestCSV.parse_line("\n1,2,3\n"))
-    assert_equal([nil], FastestCSV.parse_line("\n1,2,3\n"))
+    assert_equal([nil], CsvParser.parse_line("\n1,2,3\n"))
   end
   def test_rob_edge_cases
@@ -119,7 +123,7 @@ class TestCSVParsing < Test::Unit::TestCase
       [%Q{"a\r\n\r\na","two CRLFs"},       ["a\r\n\r\na", 'two CRLFs']],
       [%Q{with blank,"start\n\nfinish"\n}, ['with blank', "start\n\nfinish"]],
     ].each do |edge_case|
-      assert_equal(edge_case.last, FastestCSV.parse_line(edge_case.first))
+      assert_equal(edge_case.last, CsvParser.parse_line(edge_case.first))
     end
   end

data/test/tc_interface.rb CHANGED

@@ -116,4 +116,13 @@ class TestFastestCSVInterface < Test::Unit::TestCase
     @expected = [%w{1 2} + ['3' * long_field_length]]
     test_shift
   end
+  # FastestCSV includes Enumerable: check include? on an open file and,
+  # after rewinding, that to_a returns all rows
+  def test_enumerable
+    FastestCSV.open(@path) do |csv|
+      assert(csv.include?(["1", "2", "3"]))
+      csv.rewind
+      assert_equal([["1", "2", "3"], ["4", "5"]], csv.to_a)
+    end
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fastest-csv
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.4
   prerelease:
 platform: ruby
 authors:
@@ -9,9 +9,25 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-06-28 00:00:00.000000000 Z
-dependencies: []
-description: Fastest standard CSV parser for MRI Ruby
+date: 2013-08-15 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Fastest standard CSV parser for MRI Ruby and JRuby
 email:
 - maarten@brightcode.nl
 executables: []
@@ -24,6 +40,8 @@ files:
 - LICENSE
 - README.md
 - Rakefile
+- ext/csv_parser/CsvParser.java
+- ext/csv_parser/CsvParserService.java
 - ext/csv_parser/extconf.rb
 - ext/csv_parser/parser.c
 - fastest-csv.gemspec
@@ -52,10 +70,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 1.8.25
 signing_key:
 specification_version: 3
-summary: Fastest standard CSV parser for MRI Ruby
+summary: Fastest standard CSV parser for MRI Ruby and JRuby
 test_files:
 - test/tc_csv_parsing.rb
 - test/tc_interface.rb