csvreader 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c522e332ef3c1fead487b99d5fe147ba43ad2090
4
- data.tar.gz: 51dd6d88ef8dc35615513961bab7e0e1c3b3512b
3
+ metadata.gz: cf620967ec1983a211f8e2436a4b50aca3bbe023
4
+ data.tar.gz: 76da0bbce4a76c4b60e37f1cb93be23d2aec504e
5
5
  SHA512:
6
- metadata.gz: 7e563f75e916829e8de1b0a3b1208dd089de9a7907d010e3ba2cd23f1a70fedcb8d98c95e65c15ab7d3ad8705ae41a4ad6cd543ba20d6a72dc67f27b0060286b
7
- data.tar.gz: 57036e2457b4dc1837748538062150650b47abef3d2493f4c4f42db4291fdd3001cb6fb218eca38c7c11816c67360cfea74030e137ff7edf8de1fb9e47f991ec
6
+ metadata.gz: 6024f630a6c982beffd597107cfa75c1e2d6e86e174408632f4e31aa8d4c5a2ea6be8608f678f64da6bd6ba914e9f3ed55fce044a25593bd92757a82bb0d082e
7
+ data.tar.gz: 98bed6e7938399640d942d5c8d9f420d01f4d048d06c09dec2f1e6e7e833a8c38c42419a520445b13166743615de7bd120eec20a4c607d377ebf40a0109bcc47
data/Manifest.txt CHANGED
@@ -22,12 +22,15 @@ test/data/beer11.csv
22
22
  test/data/cars11.csv
23
23
  test/data/cities11.csv
24
24
  test/data/customers11.csv
25
+ test/data/iris.attrib.csv
25
26
  test/data/iris11.csv
27
+ test/data/lcc.attrib.csv
26
28
  test/data/shakespeare.csv
27
29
  test/helper.rb
28
30
  test/test_buffer.rb
29
31
  test/test_converter.rb
30
32
  test/test_parser.rb
33
+ test/test_parser_directive.rb
31
34
  test/test_parser_fixed.rb
32
35
  test/test_parser_formats.rb
33
36
  test/test_parser_java.rb
data/README.md CHANGED
@@ -8,6 +8,36 @@
8
8
  * forum :: [wwwmake](http://groups.google.com/group/wwwmake)
9
9
 
10
10
 
11
+ ## What's New?
12
+
13
+
14
+
15
+ **v1.1.2**: Added built-in support for single quotes (`'`) to default parser ("The Right Way").
16
+ Now you can use both, that is, single (`'...'`) or double quotes (`"..."`)
17
+ like in ruby (or javascript or html or ...) :-).
18
+
19
+
20
+ **v1.1.1**: Added built-in support for (optional) alternative comments (`%`) - used by
21
+ ARFF (attribute relation file format) -
22
+ and support for (optional) directives (`@`) in header (that is, before any records)
23
+ to default parser ("The Right Way").
24
+ Now you can use either `#` or `%` for comments, the first one "wins" - you CANNOT use both.
25
+ Now you can use either a front matter (`---`) block
26
+ or directives (e.g. `@attribute`, `@relation`, etc.)
27
+ for meta data, the first one "wins" - you CANNOT use both.
28
+
29
+
30
+ **v1.1.0**: Added new fixed width field (fwf) parser (see `ParserFixed`) for supporting fields with fixed width (and no separator)
31
+ e.g. `Csv.fixed.parse( txt, width: [8,-2,8,-3,32,-2,14] )`.
32
+
33
+
34
+ **v1.0.3**: Added built-in support for an (optional) front matter (`---`) meta data block
35
+ in header (that is, before any records)
36
+ to default parser ("The Right Way"). See [CSVY.org](http://csvy.org) for more.
37
+ Use `Csv.parser.meta` to get the parsed meta data block hash (or `nil` if none).
38
+
39
+
40
+
11
41
 
12
42
  ## Usage
13
43
 
@@ -165,4 +165,5 @@ end # class CsvHashReader
165
165
 
166
166
 
167
167
 
168
- puts CsvReader.banner # say hello
168
+ # say hello
169
+ puts CsvReader.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
@@ -10,13 +10,17 @@ class ParserStd
10
10
 
11
11
 
12
12
  ## char constants
13
- DOUBLE_QUOTE = "\""
14
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
- COMMENT = "#" ## use COMMENT_HASH or HASH or ??
16
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
17
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
18
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
19
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
13
+ DOUBLE_QUOTE = "\""
14
+ SINGLE_QUOTE = "'"
15
+ BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
16
+ COMMENT1 = "#" ## use COMMENT_HASH or HASH or ??
17
+ COMMENT2 = "%" ## use COMMENT_PERCENT or PERCENT or ??
18
+ DIRECTIVE = "@" ## use a different name e.g. AT or ??
19
+ SPACE = " " ## \s == ASCII 32 (dec) = (Space)
20
+ TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
21
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
22
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
23
+
20
24
 
21
25
 
22
26
  ###################################
@@ -101,13 +105,14 @@ end ## method parse
101
105
 
102
106
 
103
107
 
108
+
104
109
  private
105
110
 
106
111
  def parse_escape( input )
107
112
  value = ""
108
113
  if input.peek == BACKSLASH
109
114
  input.getc ## eat-up backslash
110
- if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c=='"' )
115
+ if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
111
116
  logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
112
117
  value << input.getc ## add escaped char (e.g. lf, cr, etc.)
113
118
  else
@@ -122,23 +127,24 @@ def parse_escape( input )
122
127
  end
123
128
 
124
129
 
125
- def parse_doublequote( input )
130
+
131
+ def parse_quote( input, quote:)
126
132
  value = ""
127
- if input.peek == DOUBLE_QUOTE
128
- input.getc ## eat-up double_quote
133
+ if input.peek == quote
134
+ input.getc ## eat-up quote
129
135
 
130
136
  loop do
131
- while (c=input.peek; !(c==DOUBLE_QUOTE || c==BACKSLASH || input.eof?))
132
- value << input.getc ## eat-up everything until hitting double_quote (") or backslash (escape)
137
+ while (c=input.peek; !(c==quote || c==BACKSLASH || input.eof?))
138
+ value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
133
139
  end
134
140
 
135
141
  if input.eof?
136
142
  break
137
143
  elsif input.peek == BACKSLASH
138
144
  value << parse_escape( input )
139
- else ## assume input.peek == DOUBLE_QUOTE
140
- input.getc ## eat-up double_quote
141
- if input.peek == DOUBLE_QUOTE ## doubled up quote?
145
+ else ## assume input.peek == quote
146
+ input.getc ## eat-up quote
147
+ if input.peek == quote ## doubled up quote?
142
148
  value << input.getc ## add doube quote and continue!!!!
143
149
  else
144
150
  break
@@ -146,13 +152,14 @@ def parse_doublequote( input )
146
152
  end
147
153
  end
148
154
  else
149
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
155
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
150
156
  end
151
157
  value
152
158
  end
153
159
 
154
160
 
155
161
 
162
+
156
163
  def parse_field( input )
157
164
  value = ""
158
165
 
@@ -175,11 +182,18 @@ def parse_field( input )
175
182
  end
176
183
  elsif input.peek == DOUBLE_QUOTE
177
184
  logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
178
- value << parse_doublequote( input )
185
+ value << parse_quote( input, quote: DOUBLE_QUOTE )
179
186
 
180
187
  ## note: always eat-up all trailing spaces (" ") and tabs (\t)
181
188
  skip_spaces( input )
182
189
  logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
190
+ elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
191
+ logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
192
+ value << parse_quote( input, quote: SINGLE_QUOTE )
193
+
194
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
195
+ skip_spaces( input )
196
+ logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
183
197
  else
184
198
  logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
185
199
  ## consume simple value
@@ -349,20 +363,41 @@ def parse_lines( input, &block )
349
363
  ## used for meta block (can only start before any records e.g. if record_num == 0)
350
364
  record_num = 0
351
365
 
366
+ ## note: can either use '#' or '%' but NOT both; first one "wins"
367
+ comment = nil
368
+
369
+ ## note: can either use directives (@) or frontmatter (---) block; first one "wins"
370
+ has_seen_directive = false
371
+ has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
372
+
373
+
352
374
  loop do
353
375
  break if input.eof?
354
376
 
355
377
  skipped_spaces = skip_spaces( input )
356
378
 
357
- if input.peek == COMMENT ## comment line
358
- logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
379
+ if comment.nil? && (c=input.peek; c==COMMENT1 || c==COMMENT2)
380
+ logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
381
+ comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
382
+ skip_until_eol( input )
383
+ skip_newline( input )
384
+ elsif comment && input.peek == comment ## (anther) comment line
385
+ logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
359
386
  skip_until_eol( input )
360
387
  skip_newline( input )
361
388
  elsif (c=input.peek; c==LF || c==CR || input.eof?)
362
389
  logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
363
390
  skip_newline( input )
364
- elsif record_num == 0 && skipped_spaces == 0 && meta.nil? && input.peekn(4) =~ /^---[\n\r \t]$/
391
+ elsif record_num == 0 && has_seen_frontmatter == false && input.peek==DIRECTIVE
392
+ ## note: "skip" directives for now
393
+ has_seen_directive = true
394
+ logger.debug "skip directive" if logger.debug?
395
+ skip_until_eol( input )
396
+ skip_newline( input )
397
+ elsif record_num == 0 && has_seen_directive == false && has_seen_frontmatter == false &&
398
+ skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
365
399
  ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
400
+ has_seen_frontmatter = true
366
401
  logger.debug "start meta block" if logger.debug?
367
402
  ## note: meta gets stored as object attribute (state/state/state!!)
368
403
  ## use meta attribute to get meta data after reading first record
@@ -5,7 +5,7 @@ class CsvReader ## note: uses a class for now - change to module - why? why no
5
5
 
6
6
  MAJOR = 1 ## todo: namespace inside version or something - why? why not??
7
7
  MINOR = 1
8
- PATCH = 1
8
+ PATCH = 2
9
9
  VERSION = [MAJOR,MINOR,PATCH].join('.')
10
10
 
11
11
 
@@ -0,0 +1,25 @@
1
+ % 1. Title: Iris Plants Database
2
+ %
3
+ % 2. Sources:
4
+ % (a) Creator: R.A. Fisher
5
+
6
+
7
+ @RELATION iris
8
+
9
+ @ATTRIBUTE sepallength NUMERIC
10
+ @ATTRIBUTE sepalwidth NUMERIC
11
+ @ATTRIBUTE petallength NUMERIC
12
+ @ATTRIBUTE petalwidth NUMERIC
13
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
14
+
15
+ @DATA
16
+ 5.1,3.5,1.4,0.2,Iris-setosa
17
+ 4.9,3.0,1.4,0.2,Iris-setosa
18
+ 4.7,3.2,1.3,0.2,Iris-setosa
19
+ 4.6,3.1,1.5,0.2,Iris-setosa
20
+ 5.0,3.6,1.4,0.2,Iris-setosa
21
+ 5.4,3.9,1.7,0.4,Iris-setosa
22
+ 4.6,3.4,1.4,0.3,Iris-setosa
23
+ 5.0,3.4,1.5,0.2,Iris-setosa
24
+ 4.4,2.9,1.4,0.2,Iris-setosa
25
+ 4.9,3.1,1.5,0.1,Iris-setosa
@@ -0,0 +1,14 @@
1
+ % Attribute-Relation File Format (ARFF) Example
2
+ % see https://www.cs.waikato.ac.nz/ml/weka/arff.html
3
+
4
+ @relation LCCvsLCSH
5
+
6
+ @attribute LCC string
7
+ @attribute LCSH string
8
+
9
+ @data
10
+ AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
11
+ AS262, 'Science -- Soviet Union -- History.'
12
+ AE5, 'Encyclopedias and dictionaries.'
13
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
14
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
@@ -0,0 +1,68 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_parser_directive.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ class TestParserDirective < MiniTest::Test
11
+
12
+
13
+ def parser
14
+ parser = CsvReader::Parser::DEFAULT
15
+ end
16
+
17
+
18
+ def test_iris
19
+ records = [["5.1","3.5","1.4","0.2","Iris-setosa"],
20
+ ["4.9","3.0","1.4","0.2","Iris-setosa"]]
21
+
22
+
23
+ assert_equal records, parser.parse( <<TXT )
24
+ % with meta data - arff (attribute relation file format)-style
25
+ %
26
+
27
+ @RELATION iris
28
+
29
+ @ATTRIBUTE sepallength NUMERIC
30
+ @ATTRIBUTE sepalwidth NUMERIC
31
+ @ATTRIBUTE petallength NUMERIC
32
+ @ATTRIBUTE petalwidth NUMERIC
33
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
34
+
35
+ @DATA
36
+ 5.1,3.5,1.4,0.2,Iris-setosa
37
+ 4.9,3.0,1.4,0.2,Iris-setosa
38
+ TXT
39
+ end
40
+
41
+
42
+ def test_lcc
43
+ records = [['AG5', 'Encyclopedias and dictionaries.;Twentieth century.'],
44
+ ['AS262', 'Science -- Soviet Union -- History.'],
45
+ ['AE5', 'Encyclopedias and dictionaries.'],
46
+ ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'],
47
+ ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Tables.']]
48
+
49
+
50
+ assert_equal records, parser.parse( <<TXT )
51
+ % Attribute-Relation File Format (ARFF) Example
52
+ % see https://www.cs.waikato.ac.nz/ml/weka/arff.html
53
+
54
+ @relation LCCvsLCSH
55
+
56
+ @attribute LCC string
57
+ @attribute LCSH string
58
+
59
+ @data
60
+ AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
61
+ AS262, 'Science -- Soviet Union -- History.'
62
+ AE5, 'Encyclopedias and dictionaries.'
63
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
64
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
65
+ TXT
66
+ end
67
+
68
+ end # class TestParserDirective
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvreader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-10-23 00:00:00.000000000 Z
11
+ date: 2018-10-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdoc
@@ -73,12 +73,15 @@ files:
73
73
  - test/data/cars11.csv
74
74
  - test/data/cities11.csv
75
75
  - test/data/customers11.csv
76
+ - test/data/iris.attrib.csv
76
77
  - test/data/iris11.csv
78
+ - test/data/lcc.attrib.csv
77
79
  - test/data/shakespeare.csv
78
80
  - test/helper.rb
79
81
  - test/test_buffer.rb
80
82
  - test/test_converter.rb
81
83
  - test/test_parser.rb
84
+ - test/test_parser_directive.rb
82
85
  - test/test_parser_fixed.rb
83
86
  - test/test_parser_formats.rb
84
87
  - test/test_parser_java.rb