RubyGems - csvreader - Versions diffs - 1.1.1 → 1.1.2 - Mend

csvreader 1.1.1 → 1.1.2

Files changed (10) hide show

checksums.yaml +4 -4
data/Manifest.txt +3 -0
data/README.md +30 -0
data/lib/csvreader/base.rb +2 -1
data/lib/csvreader/parser_std.rb +56 -21
data/lib/csvreader/version.rb +1 -1
data/test/data/iris.attrib.csv +25 -0
data/test/data/lcc.attrib.csv +14 -0
data/test/test_parser_directive.rb +68 -0
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c522e332ef3c1fead487b99d5fe147ba43ad2090
-  data.tar.gz: 51dd6d88ef8dc35615513961bab7e0e1c3b3512b
+  metadata.gz: cf620967ec1983a211f8e2436a4b50aca3bbe023
+  data.tar.gz: 76da0bbce4a76c4b60e37f1cb93be23d2aec504e
 SHA512:
-  metadata.gz: 7e563f75e916829e8de1b0a3b1208dd089de9a7907d010e3ba2cd23f1a70fedcb8d98c95e65c15ab7d3ad8705ae41a4ad6cd543ba20d6a72dc67f27b0060286b
-  data.tar.gz: 57036e2457b4dc1837748538062150650b47abef3d2493f4c4f42db4291fdd3001cb6fb218eca38c7c11816c67360cfea74030e137ff7edf8de1fb9e47f991ec
+  metadata.gz: 6024f630a6c982beffd597107cfa75c1e2d6e86e174408632f4e31aa8d4c5a2ea6be8608f678f64da6bd6ba914e9f3ed55fce044a25593bd92757a82bb0d082e
+  data.tar.gz: 98bed6e7938399640d942d5c8d9f420d01f4d048d06c09dec2f1e6e7e833a8c38c42419a520445b13166743615de7bd120eec20a4c607d377ebf40a0109bcc47

data/Manifest.txt CHANGED Viewed

@@ -22,12 +22,15 @@ test/data/beer11.csv
 test/data/cars11.csv
 test/data/cities11.csv
 test/data/customers11.csv
+test/data/iris.attrib.csv
 test/data/iris11.csv
+test/data/lcc.attrib.csv
 test/data/shakespeare.csv
 test/helper.rb
 test/test_buffer.rb
 test/test_converter.rb
 test/test_parser.rb
+test/test_parser_directive.rb
 test/test_parser_fixed.rb
 test/test_parser_formats.rb
 test/test_parser_java.rb

data/README.md CHANGED Viewed

@@ -8,6 +8,36 @@
 * forum :: [wwwmake](http://groups.google.com/group/wwwmake)
+## What's New?
+**v1.1.2**: Added built-in support for single quotes (`'`) to default parser ("The Right Way").
+Now you can use either single quotes (`'...'`) or double quotes (`"..."`)
+like in ruby (or javascript or html or ...) :-).
+**v1.1.1**: Added built-in support for (optional) alternative comments (`%`) - used by
+ARFF (attribute relation file format) -
+and support for (optional) directives (`@`) in header (that is, before any records)
+to default parser ("The Right Way").
+Now you can use either `#` or `%` for comments, the first one "wins" - you CANNOT use both.
+Now you can use either a front matter (`---`) block
+or directives (e.g. `@attribute`, `@relation`, etc.)
+for meta data, the first one "wins" - you CANNOT use both.
+**v1.1.0**: Added new fixed width field (fwf) parser (see `ParserFixed`) for supporting fields with fixed width (and no separator)
+e.g. `Csv.fixed.parse( txt, width: [8,-2,8,-3,32,-2,14] )`.
+**v1.0.3**: Added built-in support for an (optional) front matter (`---`) meta data block
+in header (that is, before any records)
+to default parser ("The Right Way"). See [CSVY.org](http://csvy.org) for more.
+Use `Csv.parser.meta` to get the parsed meta data block hash (or `nil` if none).
 ## Usage

data/lib/csvreader/base.rb CHANGED Viewed

@@ -165,4 +165,5 @@ end # class CsvHashReader
-puts CsvReader.banner   # say hello
+# say hello
+puts CsvReader.banner    if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)

data/lib/csvreader/parser_std.rb CHANGED Viewed

@@ -10,13 +10,17 @@ class ParserStd
 ## char constants
-DOUBLE_QUOTE = "\""
-BACKSLASH    = "\\"    ## use BACKSLASH_ESCAPE ??
-COMMENT      = "#"      ## use COMMENT_HASH or HASH or ??
-SPACE        = " "      ##   \s == ASCII 32 (dec)            =    (Space)
-TAB          = "\t"     ##   \t == ASCII 0x09 (hex)          = HT (Tab/horizontal tab)
-LF	         = "\n"     ##   \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
-CR	         = "\r"     ##   \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
+DOUBLE_QUOTE  = "\""
+SINGLE_QUOTE  = "'"
+BACKSLASH     = "\\"    ## use BACKSLASH_ESCAPE ??
+COMMENT1      = "#"      ## use COMMENT_HASH or HASH or ??
+COMMENT2      = "%"      ## use COMMENT_PERCENT or PERCENT or ??
+DIRECTIVE     = "@"     ## use a different name e.g. AT or ??
+SPACE         = " "      ##   \s == ASCII 32 (dec)            =    (Space)
+TAB           = "\t"     ##   \t == ASCII 0x09 (hex)          = HT (Tab/horizontal tab)
+LF	          = "\n"     ##   \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
+CR	          = "\r"     ##   \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
 ###################################
@@ -101,13 +105,14 @@ end ## method parse
 private
 def parse_escape( input )
   value = ""
   if input.peek == BACKSLASH
     input.getc ## eat-up backslash
-    if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c=='"' )
+    if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
       logger.debug "  add escaped char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
       value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
     else
@@ -122,23 +127,24 @@ def parse_escape( input )
 end
-def parse_doublequote( input )
+def parse_quote( input, quote:)
   value = ""
-  if input.peek == DOUBLE_QUOTE
-    input.getc  ## eat-up double_quote
+  if input.peek == quote
+    input.getc  ## eat-up quote
     loop do
-      while (c=input.peek; !(c==DOUBLE_QUOTE || c==BACKSLASH || input.eof?))
-        value << input.getc   ## eat-up everything until hitting double_quote (") or backslash (escape)
+      while (c=input.peek; !(c==quote || c==BACKSLASH || input.eof?))
+        value << input.getc   ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
       end
       if input.eof?
         break
       elsif input.peek == BACKSLASH
         value << parse_escape( input )
-      else   ## assume input.peek == DOUBLE_QUOTE
-        input.getc ## eat-up double_quote
-        if input.peek == DOUBLE_QUOTE  ## doubled up quote?
+      else   ## assume input.peek == quote
+        input.getc ## eat-up quote
+        if input.peek == quote  ## doubled up quote?
           value << input.getc   ## add doube quote and continue!!!!
         else
           break
@@ -146,13 +152,14 @@ def parse_doublequote( input )
       end
     end
   else
-    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
+    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
   end
   value
 end
 def parse_field( input )
   value = ""
@@ -175,11 +182,18 @@ def parse_field( input )
     end
   elsif input.peek == DOUBLE_QUOTE
     logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
-    value << parse_doublequote( input )
+    value << parse_quote( input, quote: DOUBLE_QUOTE )
     ## note: always eat-up all trailing spaces (" ") and tabs (\t)
     skip_spaces( input )
     logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
+  elsif input.peek == SINGLE_QUOTE    ## allow single quote too (by default)
+    logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
+    value << parse_quote( input, quote: SINGLE_QUOTE )
+    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
+    skip_spaces( input )
+    logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
   else
     logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
     ## consume simple value
@@ -349,20 +363,41 @@ def parse_lines( input, &block )
   ##   used for meta block (can only start before any records e.g. if record_num == 0)
   record_num = 0
+  ## note: can either use '#' or '%' but NOT both; first one "wins"
+  comment = nil
+  ## note: can either use directives (@) or frontmatter (---) block; first one "wins"
+  has_seen_directive   = false
+  has_seen_frontmatter = false   ## - renameto  has_seen_dash (---) - why? why not???
   loop do
     break if input.eof?
     skipped_spaces = skip_spaces( input )
-    if input.peek == COMMENT        ## comment line
-      logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
+    if comment.nil? && (c=input.peek; c==COMMENT1 || c==COMMENT2)
+      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
+      comment = input.getc  ## first comment line (determines/fixes "allowed" comment-style)
+      skip_until_eol( input )
+      skip_newline( input )
+    elsif comment && input.peek == comment        ## (anther) comment line
+      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
       skip_until_eol( input )
       skip_newline( input )
     elsif (c=input.peek; c==LF || c==CR || input.eof?)
       logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
       skip_newline( input )
-    elsif record_num == 0 && skipped_spaces == 0 && meta.nil? && input.peekn(4) =~ /^---[\n\r \t]$/
+    elsif record_num == 0 && has_seen_frontmatter == false && input.peek==DIRECTIVE
+      ## note: "skip" directives for now
+      has_seen_directive = true
+      logger.debug "skip directive"  if logger.debug?
+      skip_until_eol( input )
+      skip_newline( input )
+    elsif record_num == 0 && has_seen_directive == false && has_seen_frontmatter == false &&
+          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
       ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
+      has_seen_frontmatter = true
       logger.debug "start meta block"  if logger.debug?
       ## note: meta gets stored as object attribute (state/state/state!!)
       ##   use meta attribute to get meta data after reading first record

data/lib/csvreader/version.rb CHANGED Viewed

@@ -5,7 +5,7 @@ class CsvReader   ## note: uses a class for now - change to module - why? why no
   MAJOR = 1    ## todo: namespace inside version or something - why? why not??
   MINOR = 1
-  PATCH = 1
+  PATCH = 2
   VERSION = [MAJOR,MINOR,PATCH].join('.')

data/test/data/iris.attrib.csv ADDED Viewed

@@ -0,0 +1,25 @@
+% 1. Title: Iris Plants Database
+%
+% 2. Sources:
+% (a) Creator: R.A. Fisher
+@RELATION iris
+@ATTRIBUTE sepallength NUMERIC
+@ATTRIBUTE sepalwidth NUMERIC
+@ATTRIBUTE petallength NUMERIC
+@ATTRIBUTE petalwidth NUMERIC
+@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
+@DATA
+5.1,3.5,1.4,0.2,Iris-setosa
+4.9,3.0,1.4,0.2,Iris-setosa
+4.7,3.2,1.3,0.2,Iris-setosa
+4.6,3.1,1.5,0.2,Iris-setosa
+5.0,3.6,1.4,0.2,Iris-setosa
+5.4,3.9,1.7,0.4,Iris-setosa
+4.6,3.4,1.4,0.3,Iris-setosa
+5.0,3.4,1.5,0.2,Iris-setosa
+4.4,2.9,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa

data/test/data/lcc.attrib.csv ADDED Viewed

@@ -0,0 +1,14 @@
+%  Attribute-Relation File Format (ARFF) Example
+%    see https://www.cs.waikato.ac.nz/ml/weka/arff.html
+@relation LCCvsLCSH
+@attribute LCC string
+@attribute LCSH string
+@data
+AG5,   'Encyclopedias and dictionaries.;Twentieth century.'
+AS262, 'Science -- Soviet Union -- History.'
+AE5,   'Encyclopedias and dictionaries.'
+AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
+AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'

data/test/test_parser_directive.rb ADDED Viewed

@@ -0,0 +1,68 @@
+# encoding: utf-8
+###
+#  to run use
+#     ruby -I ./lib -I ./test test/test_parser_directive.rb
+require 'helper'
+class TestParserDirective < MiniTest::Test
+def parser
+  parser = CsvReader::Parser::DEFAULT
+end
+def test_iris
+  records = [["5.1","3.5","1.4","0.2","Iris-setosa"],
+             ["4.9","3.0","1.4","0.2","Iris-setosa"]]
+  assert_equal records, parser.parse( <<TXT )
+% with meta data - arff (attribute relation file format)-style
+%
+@RELATION iris
+@ATTRIBUTE sepallength NUMERIC
+@ATTRIBUTE sepalwidth NUMERIC
+@ATTRIBUTE petallength NUMERIC
+@ATTRIBUTE petalwidth NUMERIC
+@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
+@DATA
+5.1,3.5,1.4,0.2,Iris-setosa
+4.9,3.0,1.4,0.2,Iris-setosa
+TXT
+end
+def test_lcc
+  records = [['AG5',   'Encyclopedias and dictionaries.;Twentieth century.'],
+             ['AS262', 'Science -- Soviet Union -- History.'],
+             ['AE5',   'Encyclopedias and dictionaries.'],
+             ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'],
+             ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Tables.']]
+  assert_equal records, parser.parse( <<TXT )
+%  Attribute-Relation File Format (ARFF) Example
+%    see https://www.cs.waikato.ac.nz/ml/weka/arff.html
+@relation LCCvsLCSH
+@attribute LCC string
+@attribute LCSH string
+@data
+AG5,   'Encyclopedias and dictionaries.;Twentieth century.'
+AS262, 'Science -- Soviet Union -- History.'
+AE5,   'Encyclopedias and dictionaries.'
+AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
+AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
+TXT
+end
+end # class TestParserDirective

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: csvreader
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.1.2
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-10-23 00:00:00.000000000 Z
+date: 2018-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rdoc
@@ -73,12 +73,15 @@ files:
 - test/data/cars11.csv
 - test/data/cities11.csv
 - test/data/customers11.csv
+- test/data/iris.attrib.csv
 - test/data/iris11.csv
+- test/data/lcc.attrib.csv
 - test/data/shakespeare.csv
 - test/helper.rb
 - test/test_buffer.rb
 - test/test_converter.rb
 - test/test_parser.rb
+- test/test_parser_directive.rb
 - test/test_parser_fixed.rb
 - test/test_parser_formats.rb
 - test/test_parser_java.rb