RubyGems - csvreader - Versions diffs - 0.5.0 → 0.6.0 - Mend

csvreader 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/Manifest.txt +7 -1
data/lib/csvreader.rb +12 -2
data/lib/csvreader/buffer.rb +8 -3
data/lib/csvreader/parser.rb +47 -336
data/lib/csvreader/parser_std.rb +255 -0
data/lib/csvreader/parser_strict.rb +269 -0
data/lib/csvreader/parser_tab.rb +57 -0
data/lib/csvreader/reader.rb +40 -100
data/lib/csvreader/reader_hash.rb +88 -0
data/lib/csvreader/version.rb +1 -1
data/test/helper.rb +4 -0
data/test/test_parser.rb +0 -3
data/test/test_parser_formats.rb +8 -11
data/test/test_parser_java.rb +219 -0
data/test/{test_parser_rfc4180.rb → test_parser_strict.rb} +17 -20
data/test/test_parser_tab.rb +48 -0
data/test/test_reader.rb +15 -16
metadata +9 -3

data/lib/csvreader/parser_tab.rb ADDED Viewed

@@ -0,0 +1,57 @@
+# encoding: utf-8
+class CsvReader
+class ParserTab   ## tab-separated values parser - splits every line on "\t"; no quote or escape rules get applied
+def parse( data, **kwargs, &block )   ## with a block: passes every record on to the block; without: returns all records as an array
+  ## note: data must respond to each_line (a string or an io/file handle for example)
+  ## note: kwargs are accepted but NOT used for now (kept for "protocol/interface" compatibility with the other parsers)
+  input = data   ## assume it's a string or io/file handle
+  if block_given?
+    parse_lines( input, &block )   ## streaming mode - every record is an array of (string) values
+  else
+    records = []
+    parse_lines( input ) do |record|
+      records << record
+    end
+    records   ## no block given - return the collected records (array of arrays)
+  end
+end ## method parse
+private
+def parse_lines( input, &block )
+  ## note: each_line only works with \n (unix) or \r\n (windows) line endings
+  ##   will NOT work with \r only (old mac, any others?) !!!!
+  input.each_line do |line|
+    ## puts "line:"
+    ## pp line
+    ##  note: chomp('') - if the argument is an empty string,
+    ##    chomp will remove all trailing newlines (\n or \r\n) from the string.
+    ##    use line.sub(/[\n\r]*$/, '') or similar instead - why? why not?
+    line = line.chomp( '' )
+    ## pp line
+    # note: trailing empty fields get (auto-)trimmed by split !!! e.g. "a\t\t" results in ["a"] - NOTE(review): split( "\t", -1 ) would keep them; confirm what is wanted
+    values = line.split( "\t" )
+    ## pp values
+    ## note: requires a block (block.call fails if block is nil); parse always passes one in - enforce? how? why? why not?
+    block.call( values )
+  end
+end # method parse_lines
+end # class ParserTab
+end # class CsvReader

data/lib/csvreader/reader.rb CHANGED Viewed

@@ -9,35 +9,28 @@ class CsvReader
   end
   DEFAULT = new( Parser::DEFAULT )
+  STRICT  = new( Parser::STRICT )
   RFC4180 = new( Parser::RFC4180 )
   EXCEL   = new( Parser::EXCEL )
+  TAB     = new( Parser::TAB )
   def self.default()  DEFAULT; end    ## alternative alias for DEFAULT
+  def self.strict()   STRICT; end     ## alternative alias for STRICT
   def self.rfc4180()  RFC4180; end    ## alternative alias for RFC4180
   def self.excel()    EXCEL; end      ## alternative alias for EXCEL
+  def self.tab()      TAB; end        ## alternative alias for TAB
   #####################
   ## convenience helpers defaulting to default csv dialect/format reader
   ##
-  ##   CsvReader.parse_line is the same as
-  ##     CsvReader::DEFAULT.parse_line or CsvReader.default.parse_line
+  ##   CsvReader.parse is the same as
+  ##     CsvReader::DEFAULT.parse or CsvReader.default.parse
   ##
-  def self.parse_line( data, sep: nil,
-                             converters: nil )
-     DEFAULT.parse_line( data, sep: sep, converters: converters )
-  end
   def self.parse( data, sep: nil,
-                        converters: nil )
-     DEFAULT.parse( data, sep: sep, converters: converters )
-  end
-  #### fix!!! remove - replace with parse with (optional) block!!!!!
-  def self.parse_lines( data, sep: nil,
-                              converters: nil, &block )
-     DEFAULT.parse_lines( data, sep: sep, converters: nil, &block )
+                        converters: nil, &block )
+     DEFAULT.parse( data, sep: sep, converters: converters, &block )
   end
   def self.read( path, sep: nil,
@@ -55,6 +48,20 @@ class CsvReader
   end
+  ############################
+  ## note: only add parse_line convenience helper for default
+  ##   always use parse (do NOT use parse_line)  - why? why not?
+  def self.parse_line( data, sep: nil,   ## parses the first record only (using the DEFAULT reader)
+                             converters: nil )
+     records = []
+     DEFAULT.parse( data, sep: sep, converters: converters ) do |record|
+       records << record
+       break   # only parse first record - stops the parser; the rest of data is NOT read
+     end
+     records.size == 0 ? nil : records.first   ## returns nil if data holds no record otherwise the first record
+  end
   #############################
   ## all "high-level" reader methods
@@ -62,33 +69,15 @@ class CsvReader
   ## note: allow "overriding" of separator
   ##    if sep is not nil otherwise use default dialect/format separator
+  def parse( data, sep: nil,
+                   converters: nil, &block )
+    kwargs = {
+      ##  converters: converters  ## todo: add converters
+    }
+    ## note: only add separator if present/defined (not nil)
+    kwargs[:sep] = sep    if sep && @parser.respond_to?( :'sep=' )
-  ##
-  ##  todo/fix: "unify" parse and parse_lines  !!!
-  ##    check for block_given? - why? why not?
-  def parse( data, sep: nil, limit: nil,
-                   converters: nil )
-    sep = @parser.config[:sep]  if sep.nil?
-    @parser.parse( data, sep: sep, limit: limit )
-  end
-  #### fix!!! remove - replace with parse with (optional) block!!!!!
-  def parse_lines( data, sep: nil,
-                         converters: nil, &block )
-    sep = @parser.config[:sep]  if sep.nil?
-    @parser.parse_lines( data, sep: sep, &block )
-  end
-  def parse_line( data, sep: nil,
-                        converters: nil )
-    records = parse( data, sep: sep, limit: 1 )
-    ## unwrap record if empty return nil - why? why not?
-    ##  return empty record e.g. [] - why? why not?
-    records.size == 0 ? nil : records.first
+    @parser.parse( data, kwargs, &block )
   end
   def read( path, sep: nil,
@@ -103,75 +92,26 @@ class CsvReader
   def foreach( path, sep: nil,
                      converters: nil, &block )
     File.open( path, 'r:bom|utf-8' ) do |file|
-      parse_lines( file, sep: sep, &block )
+      parse( file, sep: sep, &block )
     end
   end
   def header( path, sep: nil )   ## use header or headers - or use both (with alias)?
      # read first lines (only)
      #  and parse with csv to get header from csv library itself
-     record = nil
+     records = []
      File.open( path, 'r:bom|utf-8' ) do |file|
-        record = parse_line( file, sep: sep )
+        parse( file, sep: sep ) do |record|
+          records << record
+          break   ## only parse/read first record
+        end
      end
-     record  ## todo/fix: returns nil for empty - why? why not?
+     ## unwrap record if empty return nil - why? why not?
+     ##  return empty record e.g. [] - why? why not?
+     ##  returns nil for empty (for now) - why? why not?
+     records.size == 0 ? nil : records.first
   end  # method self.header
 end # class CsvReader
-class CsvHashReader
-def self.parse( data, sep: nil, headers: nil )
-  ## pass in headers as array e.g. ['A', 'B', 'C']
-  names = headers ? headers : nil
-  records = []
-  CsvReader.parse_lines( data ) do |values|     # sep: sep
-    if names.nil?
-      names = values   ## store header row / a.k.a. field/column names
-    else
-      record = names.zip( values ).to_h    ## todo/fix: check for more values than names/headers!!!
-      records << record
-    end
-  end
-  records
-end
-def self.read( path, sep: nil, headers: nil )
-  txt = File.open( path, 'r:bom|utf-8' ).read
-  parse( txt, sep: sep, headers: headers )
-end
-def self.foreach( path, sep: nil, headers: nil, &block )
-  ## pass in headers as array e.g. ['A', 'B', 'C']
-  names = headers ? headers : nil
-  CsvReader.foreach( path ) do |values|     # sep: sep
-    if names.nil?
-      names = values   ## store header row / a.k.a. field/column names
-    else
-      record = names.zip( values ).to_h    ## todo/fix: check for more values than names/headers!!!
-      block.call( record )
-    end
-  end
-end
-def self.header( path, sep: nil )   ## add header too? why? why not?
-  ## same as "classic" header method - delegate/reuse :-)
-  CsvReader.header( path, sep: sep )
-end
-end # class CsvHashReader

data/lib/csvreader/reader_hash.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# encoding: utf-8
+class CsvHashReader   ## reads records as hashes (header/field names => values) on top of a (low-level) parser
+  def initialize( parser )
+    @parser = parser   ## parser must respond to parse( data, ... ) and pass every row (array of values) to the block
+  end
+  DEFAULT = new( CsvReader::Parser::DEFAULT )
+  STRICT  = new( CsvReader::Parser::STRICT )
+  RFC4180 = new( CsvReader::Parser::RFC4180 )
+  def self.default()  DEFAULT; end    ## alternative alias for DEFAULT
+  def self.strict()   STRICT;  end    ## alternative alias for STRICT
+  def self.rfc4180()  RFC4180; end    ## alternative alias for RFC4180
+  def self.parse( data, sep: nil, headers: nil, &block )   ## convenience helper - same as CsvHashReader::DEFAULT.parse
+    DEFAULT.parse( data, sep: sep, headers: headers, &block )
+  end
+  def self.read( path, sep: nil, headers: nil )   ## convenience helper - same as CsvHashReader::DEFAULT.read
+    DEFAULT.read( path, sep: sep, headers: headers )
+  end
+  def self.foreach( path, sep: nil, headers: nil, &block )   ## convenience helper - same as CsvHashReader::DEFAULT.foreach
+    DEFAULT.foreach( path,sep: sep, headers: headers, &block )
+  end
+#############################
+## all "high-level" reader methods
+##   note: sep overrides the separator of the parser (if supported); headers is an (optional) array of field names
+def parse( data, sep: nil, headers: nil, &block )
+  if block_given?
+    parse_lines( data, sep: sep, headers: headers, &block )   ## streaming mode - every record (hash) gets passed to the block
+  else
+    records = []
+    parse_lines( data, sep: sep, headers: headers ) do |record|
+      records << record
+    end
+    records   ## no block given - return all records as an array of hashes
+  end
+end
+def read( path, sep: nil, headers: nil )   ## reads the complete file into memory and returns all records
+  txt = File.open( path, 'r:bom|utf-8' ).read   ## NOTE(review): file handle never gets closed explicitly; File.open( path, 'r:bom|utf-8' ) { |f| f.read } would close it
+  parse( txt, sep: sep, headers: headers )
+end
+def foreach( path, sep: nil, headers: nil, &block )   ## note: requires a block (parse_lines uses block.call)
+  File.open( path, 'r:bom|utf-8' ) do |file|
+    parse_lines( file, sep: sep, headers: headers, &block )
+  end
+end
+private
+####################
+## parse_lines helper method to keep in one (central) place only (for easy editing/changing)
+##   - builds key/value pairs (hash records) by zipping the field names with the values of every row; the first row gets used as header if no headers passed in
+def parse_lines( data, sep: nil, headers: nil, &block)
+  ## pass in headers as array e.g. ['A', 'B', 'C']
+  names = headers ? headers : nil
+  kwargs = {
+    ##  converters: converters  ## todo: add converters
+  }
+  kwargs[:sep] = sep   if sep && @parser.respond_to?( :'sep=' )   ## note: only add separator if present/defined (not nil) and the parser has a sep= setter
+  @parser.parse( data, kwargs ) do |values|     # sep: sep - NOTE(review): kwargs gets passed as a positional hash; with ruby 3+ a parser declaring **kwargs needs **kwargs here - confirm
+    if names.nil?
+      names = values   ## store header row / a.k.a. field/column names
+    else
+      record = names.zip( values ).to_h    ## todo/fix: check for more values than names/headers!!! (zip drops extra values; missing values become nil)
+      block.call( record )
+    end
+  end
+end
+end # class CsvHashReader

data/lib/csvreader/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@
 class CsvReader   ## note: uses a class for now - change to module - why? why not?
   MAJOR = 0    ## todo: namespace inside version or something - why? why not??
-  MINOR = 5
+  MINOR = 6
   PATCH = 0
   VERSION = [MAJOR,MINOR,PATCH].join('.')

data/test/helper.rb CHANGED Viewed

@@ -14,3 +14,7 @@ class CsvReader
     "#{root}/test/data"
   end
 end
+## CsvReader::ParserStd.logger.level    = :debug   ## turn on "global" logging
+## CsvReader::ParserStrict.logger.level = :debug   ## turn on "global" logging

data/test/test_parser.rb CHANGED Viewed

@@ -9,9 +9,6 @@ require 'helper'
 class TestParser < MiniTest::Test
-def setup
-  CsvReader::Parser.logger.level = :debug   ## turn on "global" logging - move to helper - why? why not?
-end
 def parser
   parser = CsvReader::Parser::DEFAULT

data/test/test_parser_formats.rb CHANGED Viewed

@@ -9,9 +9,6 @@ require 'helper'
 class TestParserFormats < MiniTest::Test
-def setup
-  CsvReader::Parser.logger.level = :debug   ## turn on "global" logging - move to helper - why? why not?
-end
 def parser
   CsvReader::Parser
@@ -37,17 +34,17 @@ def test_parse_whitespace
    ## strict rfc4180 - no trim leading or trailing spaces or blank lines
-   assert_equal records,   parser.rfc4180.parse( "a,b,c\n1,2,3" )
+   assert_equal records,   parser.strict.parse( "a,b,c\n1,2,3" )
    assert_equal [["a", "b", "c"],
                  [""],
-                 ["1", "2", "3"]], parser.rfc4180.parse( "a,b,c\n\n1,2,3" )
+                 ["1", "2", "3"]], parser.strict.parse( "a,b,c\n\n1,2,3" )
    assert_equal [[" a", " b ", "c "],
                  [""],
-                 ["1", "2", "3"]], parser.rfc4180.parse( " a, b ,c \n\n1,2,3" )
+                 ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n\n1,2,3" )
     assert_equal [[" a", " b ", "c "],
                   [" "],
                   ["",""],
-                  ["1", "2", "3"]], parser.rfc4180.parse( " a, b ,c \n \n,\n1,2,3" )
+                  ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n \n,\n1,2,3" )
 end
@@ -57,13 +54,13 @@ def test_parse_empties
     ## strict rfc4180 - no trim leading or trailing spaces or blank lines
     assert_equal [[""],
                   [" "],
-                  [" "]], parser.rfc4180.parse( "\n \n \n" )
+                  [" "]], parser.strict.parse( "\n \n \n" )
     assert_equal [[""],
                   [" "],
-                  [" "]], parser.rfc4180.parse( "\n \n " )
+                  [" "]], parser.strict.parse( "\n \n " )
-    assert_equal [[""]], parser.rfc4180.parse( "\n" )
-    assert_equal [],     parser.rfc4180.parse( "" )
+    assert_equal [[""]], parser.strict.parse( "\n" )
+    assert_equal [],     parser.strict.parse( "" )
 end
 end # class TestParserFormats

data/test/test_parser_java.rb ADDED Viewed

@@ -0,0 +1,219 @@
+# encoding: utf-8
+###
+#  to run use
+#     ruby -I ./lib -I ./test test/test_parser_java.rb
+require 'helper'
+##########################
+# try some tests from apache java (commons) csv reader
+#  see https://github.com/apache/commons-csv/blob/master/src/test/java/org/apache/commons/csv/LexerTest.java
+class TestParserJava < MiniTest::Test
+LF	= "\n"     ##   \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
+CR	= "\r"     ##   \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
+def parser   ## shortcut - the tests below use parser.default and parser.strict
+  CsvReader::Parser
+end
+def test_surrounding_spaces_are_deleted   ## default parser trims leading and trailing spaces of unquoted values
+  assert_equal [["noSpaces",
+                 "leadingSpaces",
+                 "trailingSpaces",
+                 "surroundingSpaces",
+                 "",
+                 "",
+                 ""]], parser.default.parse( "noSpaces,  leadingSpaces,trailingSpaces  ,  surroundingSpaces  ,  ,," )
+end
+def test_surrounding_tabs_are_deleted   ## default parser trims leading and trailing tabs of unquoted values
+  assert_equal [["noTabs",
+                 "leadingTab",
+                 "trailingTab",
+                 "surroundingTabs",
+                 "",
+                 "",
+                 ""]], parser.default.parse( "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,," )
+end
+def test_ignore_empty_lines   ## default parser skips blank lines
+  assert_equal [[ "first", "line", "" ],
+                [ "second", "line" ],
+                [ "third line" ],
+                [ "last", "line" ]],
+                parser.default.parse( "first,line,\n" + "\n" + "\n" +
+                              "second,line\n" + "\n" + "\n" +
+                              "third line \n" + "\n" + "\n" +
+                              "last, line \n" + "\n" + "\n" + "\n" )
+end
+def test_comments   ## default parser skips comment lines; a hash sign inside a value is kept as-is
+  assert_equal [["first",  "line", "" ],
+                ["second", "line", "tokenWith#no-comment" ],
+                ["third",  "line", "#no-comment" ]],
+                parser.default.parse( "first,line,\n" +
+                              "second,line,tokenWith#no-comment\n" +
+                              "# comment line \n" +
+                              "third,line,#no-comment\n" +
+                              "# penultimate comment\n" +
+                              "# Final comment\n" )
+end
+def test_comments_and_empty_lines   ## strict parser with comments turned on - comment lines get skipped, blank lines are kept as [""]
+  parser.strict.comment = '#'
+  assert_equal [[ "1", "2", "3", "" ], ## 1
+                [ "" ], ## 1b
+                [ "" ], ## 1c
+                [ "a", "b x", "c#no-comment" ], ## 2
+                [ "" ],  ## 4
+                [ "" ],  ## 4b
+                [ "d", "e", "#no-comment" ], ## 5
+                [ "" ], ## 5b
+                [ "" ], ## 5c
+                [ "" ], ## 6b
+                [ "" ]  ## 6c
+               ],
+               parser.strict.parse(
+                  "1,2,3,\n" + ## 1
+                  "\n" +       ## 1b
+                  "\n" +       ## 1c
+                  "a,b x,c#no-comment\n" + ## 2
+                  "#foo\n" + ## 3
+                  "\n" + ## 4
+                  "\n" + ## 4b
+                  "d,e,#no-comment\n" + ## 5
+                  "\n" + ## 5b
+                  "\n" + ## 5c
+                  "# penultimate comment\n" + ## 6
+                  "\n" + ## 6b
+                  "\n" + ## 6c
+                  "# Final comment\n" ## 7
+              )
+  parser.strict.comment = false    ## reset to defaults
+end
+def test_backslash_with_escaping
+  ## simple tokens with backslash escaping enabled - works out-of-the-box with the default parser; gets turned on with escape= for the strict parser
+  assert_equal [[ "a", ",", "b\\" ],
+                [ ",", "\nc", "d\r" ],
+                [ "e" ]], parser.default.parse( "a,\\,,b\\\\\n" +
+                                                "\\,,\\\nc,d\\\r\n" +
+                                                "e" )
+  parser.strict.escape = "\\"
+  assert_equal [[ "a", ",", "b\\" ],
+                [ ",", "\nc", "d\r" ],
+                [ "e" ]], parser.strict.parse( "a,\\,,b\\\\\n" +
+                                               "\\,,\\\nc,d\\\r\n" +
+                                               "e" )
+  parser.strict.escape = false   ## reset - turn escaping off again
+end
+def test_backslash_without_escaping
+  ## simple tokens with escaping NOT enabled (strict parser) - backslashes are kept as-is
+  assert_equal [[ "a",
+                  "\\", ## an unquoted single backslash is not an escape char
+                  "",
+                  "b\\" ## an unquoted single backslash is not an escape char
+                ],
+                [ "\\", "", "" ]], parser.strict.parse( "a,\\,,b\\\n" +
+                                                        "\\,," )
+end
+def test_next_token4
+  ## quoted values (single line) - spaces outside of the quotes get trimmed, spaces inside the quotes are kept
+  assert_equal [[ "a", "foo", "b" ],
+                [ "a", " foo", "b" ],
+                [ "a", "foo ", "b" ],
+                [ "a", " foo ", "b" ]],
+                parser.default.parse( "a,\"foo\",b\n" +
+                                      "a,   \" foo\",b\n" +
+                                      "a,\"foo \"  ,b\n" +
+                                      "a,  \" foo \"  ,b" )
+end
+def test_next_token5
+    ## quoted values (multi line) - newlines and separators inside of the quotes are part of the value
+   assert_equal [[ "a", "foo\n", "b" ],
+                 [ "foo\n  baar ,,," ],
+                 [ "\n\t \n" ]],
+                 parser.default.parse( "a,\"foo\n\",b\n" +
+                                       "\"foo\n  baar ,,,\"\n" +
+                                       "\"\n\t \n\"" )
+end
+def test_separator_is_tab   ## strict parser with tab as separator - spaces around values are kept
+  parser.strict.sep = "\t"
+  assert_equal [["one",
+                 "two",
+                 "",
+                 "four ",
+                 " five",
+                 " six" ]], parser.strict.parse( "one\ttwo\t\tfour \t five\t six" )
+  parser.strict.sep = ","   ## reset back to comma
+end
+def test_escaped_cr   ## backslash followed by CR - the CR becomes part of the value
+    assert_equal [[ "character" + CR + "Escaped" ]],
+                 parser.default.parse( "character\\" + CR + "Escaped" )
+end
+def test_cr   ## unescaped CR ends the record
+   assert_equal [[ "character"  ],
+                 [ "NotEscaped" ]],
+                parser.default.parse( "character" + CR + "NotEscaped" )
+end
+def test_escaped_lf   ## backslash followed by LF - the LF becomes part of the value
+    assert_equal [[ "character" + LF + "Escaped" ]],
+                 parser.default.parse( "character\\" + LF + "Escaped" )
+end
+def test_lf   ## unescaped LF ends the record
+   assert_equal [[ "character" ],
+                 [ "NotEscaped" ]],
+                 parser.default.parse( "character" + LF + "NotEscaped" )
+end
+def test_escaped_mysql_null_value
+  ## MySQL uses \N to represent null values - the backslash has to be kept (NOT dropped as an escape char)
+    ## note: unknown escape sequences e.g. \N get passed "through" as-is (unescaped)
+    ##   only supports \n \r  (sep e.g. \, or \t)  (quote e.g. \") for now - any others?
+    assert_equal [[ "character\\NEscaped" ]],
+                 parser.default.parse( "character\\NEscaped" )
+end
+end # class TestParserJava