csvreader 1.1.1 → 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c522e332ef3c1fead487b99d5fe147ba43ad2090
4
- data.tar.gz: 51dd6d88ef8dc35615513961bab7e0e1c3b3512b
3
+ metadata.gz: cf620967ec1983a211f8e2436a4b50aca3bbe023
4
+ data.tar.gz: 76da0bbce4a76c4b60e37f1cb93be23d2aec504e
5
5
  SHA512:
6
- metadata.gz: 7e563f75e916829e8de1b0a3b1208dd089de9a7907d010e3ba2cd23f1a70fedcb8d98c95e65c15ab7d3ad8705ae41a4ad6cd543ba20d6a72dc67f27b0060286b
7
- data.tar.gz: 57036e2457b4dc1837748538062150650b47abef3d2493f4c4f42db4291fdd3001cb6fb218eca38c7c11816c67360cfea74030e137ff7edf8de1fb9e47f991ec
6
+ metadata.gz: 6024f630a6c982beffd597107cfa75c1e2d6e86e174408632f4e31aa8d4c5a2ea6be8608f678f64da6bd6ba914e9f3ed55fce044a25593bd92757a82bb0d082e
7
+ data.tar.gz: 98bed6e7938399640d942d5c8d9f420d01f4d048d06c09dec2f1e6e7e833a8c38c42419a520445b13166743615de7bd120eec20a4c607d377ebf40a0109bcc47
data/Manifest.txt CHANGED
@@ -22,12 +22,15 @@ test/data/beer11.csv
22
22
  test/data/cars11.csv
23
23
  test/data/cities11.csv
24
24
  test/data/customers11.csv
25
+ test/data/iris.attrib.csv
25
26
  test/data/iris11.csv
27
+ test/data/lcc.attrib.csv
26
28
  test/data/shakespeare.csv
27
29
  test/helper.rb
28
30
  test/test_buffer.rb
29
31
  test/test_converter.rb
30
32
  test/test_parser.rb
33
+ test/test_parser_directive.rb
31
34
  test/test_parser_fixed.rb
32
35
  test/test_parser_formats.rb
33
36
  test/test_parser_java.rb
data/README.md CHANGED
@@ -8,6 +8,36 @@
8
8
  * forum :: [wwwmake](http://groups.google.com/group/wwwmake)
9
9
 
10
10
 
11
+ ## What's New?
12
+
13
+
14
+
15
+ **v1.1.2**: Added built-in support for single quotes (`'`) to default parser ("The Right Way").
16
+ Now you can use both, that is, single (`'...'`) or double quotes (`"..."`)
17
+ like in ruby (or javascript or html or ...) :-).
18
+
19
+
20
+ **v1.1.1**: Added built-in support for (optional) alternative comments (`%`) - used by
21
+ ARFF (attribute relation file format) -
22
+ and support for (optional) directives (`@`) in header (that is, before any records)
23
+ to default parser ("The Right Way").
24
+ Now you can use either `#` or `%` for comments, the first one "wins" - you CANNOT use both.
25
+ Now you can use either a front matter (`---`) block
26
+ or directives (e.g. `@attribute`, `@relation`, etc.)
27
+ for meta data, the first one "wins" - you CANNOT use both.
28
+
29
+
30
+ **v1.1.0**: Added new fixed width field (fwf) parser (see `ParserFixed`) for supporting fields with fixed width (and no separator)
31
+ e.g. `Csv.fixed.parse( txt, width: [8,-2,8,-3,32,-2,14] )`.
32
+
33
+
34
+ **v1.0.3**: Added built-in support for an (optional) front matter (`---`) meta data block
35
+ in header (that is, before any records)
36
+ to default parser ("The Right Way"). See [CSVY.org](http://csvy.org) for more.
37
+ Use `Csv.parser.meta` to get the parsed meta data block hash (or `nil` if none).
38
+
39
+
40
+
11
41
 
12
42
  ## Usage
13
43
 
@@ -165,4 +165,5 @@ end # class CsvHashReader
165
165
 
166
166
 
167
167
 
168
- puts CsvReader.banner # say hello
168
+ # say hello
169
+ puts CsvReader.banner if $DEBUG || (defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG)
@@ -10,13 +10,17 @@ class ParserStd
10
10
 
11
11
 
12
12
  ## char constants
13
- DOUBLE_QUOTE = "\""
14
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
- COMMENT = "#" ## use COMMENT_HASH or HASH or ??
16
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
17
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
18
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
19
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
13
+ DOUBLE_QUOTE = "\""
14
+ SINGLE_QUOTE = "'"
15
+ BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
16
+ COMMENT1 = "#" ## use COMMENT_HASH or HASH or ??
17
+ COMMENT2 = "%" ## use COMMENT_PERCENT or PERCENT or ??
18
+ DIRECTIVE = "@" ## use a different name e.g. AT or ??
19
+ SPACE = " " ## \s == ASCII 32 (dec) = (Space)
20
+ TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
21
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
22
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
23
+
20
24
 
21
25
 
22
26
  ###################################
@@ -101,13 +105,14 @@ end ## method parse
101
105
 
102
106
 
103
107
 
108
+
104
109
  private
105
110
 
106
111
  def parse_escape( input )
107
112
  value = ""
108
113
  if input.peek == BACKSLASH
109
114
  input.getc ## eat-up backslash
110
- if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c=='"' )
115
+ if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
111
116
  logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
112
117
  value << input.getc ## add escaped char (e.g. lf, cr, etc.)
113
118
  else
@@ -122,23 +127,24 @@ def parse_escape( input )
122
127
  end
123
128
 
124
129
 
125
- def parse_doublequote( input )
130
+
131
+ def parse_quote( input, quote:)
126
132
  value = ""
127
- if input.peek == DOUBLE_QUOTE
128
- input.getc ## eat-up double_quote
133
+ if input.peek == quote
134
+ input.getc ## eat-up quote
129
135
 
130
136
  loop do
131
- while (c=input.peek; !(c==DOUBLE_QUOTE || c==BACKSLASH || input.eof?))
132
- value << input.getc ## eat-up everything until hitting double_quote (") or backslash (escape)
137
+ while (c=input.peek; !(c==quote || c==BACKSLASH || input.eof?))
138
+ value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
133
139
  end
134
140
 
135
141
  if input.eof?
136
142
  break
137
143
  elsif input.peek == BACKSLASH
138
144
  value << parse_escape( input )
139
- else ## assume input.peek == DOUBLE_QUOTE
140
- input.getc ## eat-up double_quote
141
- if input.peek == DOUBLE_QUOTE ## doubled up quote?
145
+ else ## assume input.peek == quote
146
+ input.getc ## eat-up quote
147
+ if input.peek == quote ## doubled up quote?
142
148
  value << input.getc ## add doube quote and continue!!!!
143
149
  else
144
150
  break
@@ -146,13 +152,14 @@ def parse_doublequote( input )
146
152
  end
147
153
  end
148
154
  else
149
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
155
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
150
156
  end
151
157
  value
152
158
  end
153
159
 
154
160
 
155
161
 
162
+
156
163
  def parse_field( input )
157
164
  value = ""
158
165
 
@@ -175,11 +182,18 @@ def parse_field( input )
175
182
  end
176
183
  elsif input.peek == DOUBLE_QUOTE
177
184
  logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
178
- value << parse_doublequote( input )
185
+ value << parse_quote( input, quote: DOUBLE_QUOTE )
179
186
 
180
187
  ## note: always eat-up all trailing spaces (" ") and tabs (\t)
181
188
  skip_spaces( input )
182
189
  logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
190
+ elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
191
+ logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
192
+ value << parse_quote( input, quote: SINGLE_QUOTE )
193
+
194
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
195
+ skip_spaces( input )
196
+ logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
183
197
  else
184
198
  logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
185
199
  ## consume simple value
@@ -349,20 +363,41 @@ def parse_lines( input, &block )
349
363
  ## used for meta block (can only start before any records e.g. if record_num == 0)
350
364
  record_num = 0
351
365
 
366
+ ## note: can either use '#' or '%' but NOT both; first one "wins"
367
+ comment = nil
368
+
369
+ ## note: can either use directives (@) or frontmatter (---) block; first one "wins"
370
+ has_seen_directive = false
371
+ has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
372
+
373
+
352
374
  loop do
353
375
  break if input.eof?
354
376
 
355
377
  skipped_spaces = skip_spaces( input )
356
378
 
357
- if input.peek == COMMENT ## comment line
358
- logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
379
+ if comment.nil? && (c=input.peek; c==COMMENT1 || c==COMMENT2)
380
+ logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
381
+ comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
382
+ skip_until_eol( input )
383
+ skip_newline( input )
384
+ elsif comment && input.peek == comment ## (anther) comment line
385
+ logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
359
386
  skip_until_eol( input )
360
387
  skip_newline( input )
361
388
  elsif (c=input.peek; c==LF || c==CR || input.eof?)
362
389
  logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
363
390
  skip_newline( input )
364
- elsif record_num == 0 && skipped_spaces == 0 && meta.nil? && input.peekn(4) =~ /^---[\n\r \t]$/
391
+ elsif record_num == 0 && has_seen_frontmatter == false && input.peek==DIRECTIVE
392
+ ## note: "skip" directives for now
393
+ has_seen_directive = true
394
+ logger.debug "skip directive" if logger.debug?
395
+ skip_until_eol( input )
396
+ skip_newline( input )
397
+ elsif record_num == 0 && has_seen_directive == false && has_seen_frontmatter == false &&
398
+ skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
365
399
  ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
400
+ has_seen_frontmatter = true
366
401
  logger.debug "start meta block" if logger.debug?
367
402
  ## note: meta gets stored as object attribute (state/state/state!!)
368
403
  ## use meta attribute to get meta data after reading first record
@@ -5,7 +5,7 @@ class CsvReader ## note: uses a class for now - change to module - why? why no
5
5
 
6
6
  MAJOR = 1 ## todo: namespace inside version or something - why? why not??
7
7
  MINOR = 1
8
- PATCH = 1
8
+ PATCH = 2
9
9
  VERSION = [MAJOR,MINOR,PATCH].join('.')
10
10
 
11
11
 
@@ -0,0 +1,25 @@
1
+ % 1. Title: Iris Plants Database
2
+ %
3
+ % 2. Sources:
4
+ % (a) Creator: R.A. Fisher
5
+
6
+
7
+ @RELATION iris
8
+
9
+ @ATTRIBUTE sepallength NUMERIC
10
+ @ATTRIBUTE sepalwidth NUMERIC
11
+ @ATTRIBUTE petallength NUMERIC
12
+ @ATTRIBUTE petalwidth NUMERIC
13
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
14
+
15
+ @DATA
16
+ 5.1,3.5,1.4,0.2,Iris-setosa
17
+ 4.9,3.0,1.4,0.2,Iris-setosa
18
+ 4.7,3.2,1.3,0.2,Iris-setosa
19
+ 4.6,3.1,1.5,0.2,Iris-setosa
20
+ 5.0,3.6,1.4,0.2,Iris-setosa
21
+ 5.4,3.9,1.7,0.4,Iris-setosa
22
+ 4.6,3.4,1.4,0.3,Iris-setosa
23
+ 5.0,3.4,1.5,0.2,Iris-setosa
24
+ 4.4,2.9,1.4,0.2,Iris-setosa
25
+ 4.9,3.1,1.5,0.1,Iris-setosa
@@ -0,0 +1,14 @@
1
+ % Attribute-Relation File Format (ARFF) Example
2
+ % see https://www.cs.waikato.ac.nz/ml/weka/arff.html
3
+
4
+ @relation LCCvsLCSH
5
+
6
+ @attribute LCC string
7
+ @attribute LCSH string
8
+
9
+ @data
10
+ AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
11
+ AS262, 'Science -- Soviet Union -- History.'
12
+ AE5, 'Encyclopedias and dictionaries.'
13
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
14
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
@@ -0,0 +1,68 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_parser_directive.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ class TestParserDirective < MiniTest::Test
11
+
12
+
13
+ def parser
14
+ parser = CsvReader::Parser::DEFAULT
15
+ end
16
+
17
+
18
+ def test_iris
19
+ records = [["5.1","3.5","1.4","0.2","Iris-setosa"],
20
+ ["4.9","3.0","1.4","0.2","Iris-setosa"]]
21
+
22
+
23
+ assert_equal records, parser.parse( <<TXT )
24
+ % with meta data - arff (attribute relation file format)-style
25
+ %
26
+
27
+ @RELATION iris
28
+
29
+ @ATTRIBUTE sepallength NUMERIC
30
+ @ATTRIBUTE sepalwidth NUMERIC
31
+ @ATTRIBUTE petallength NUMERIC
32
+ @ATTRIBUTE petalwidth NUMERIC
33
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
34
+
35
+ @DATA
36
+ 5.1,3.5,1.4,0.2,Iris-setosa
37
+ 4.9,3.0,1.4,0.2,Iris-setosa
38
+ TXT
39
+ end
40
+
41
+
42
+ def test_lcc
43
+ records = [['AG5', 'Encyclopedias and dictionaries.;Twentieth century.'],
44
+ ['AS262', 'Science -- Soviet Union -- History.'],
45
+ ['AE5', 'Encyclopedias and dictionaries.'],
46
+ ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'],
47
+ ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Tables.']]
48
+
49
+
50
+ assert_equal records, parser.parse( <<TXT )
51
+ % Attribute-Relation File Format (ARFF) Example
52
+ % see https://www.cs.waikato.ac.nz/ml/weka/arff.html
53
+
54
+ @relation LCCvsLCSH
55
+
56
+ @attribute LCC string
57
+ @attribute LCSH string
58
+
59
+ @data
60
+ AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
61
+ AS262, 'Science -- Soviet Union -- History.'
62
+ AE5, 'Encyclopedias and dictionaries.'
63
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
64
+ AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
65
+ TXT
66
+ end
67
+
68
+ end # class TestParserDirective
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvreader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-10-23 00:00:00.000000000 Z
11
+ date: 2018-10-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdoc
@@ -73,12 +73,15 @@ files:
73
73
  - test/data/cars11.csv
74
74
  - test/data/cities11.csv
75
75
  - test/data/customers11.csv
76
+ - test/data/iris.attrib.csv
76
77
  - test/data/iris11.csv
78
+ - test/data/lcc.attrib.csv
77
79
  - test/data/shakespeare.csv
78
80
  - test/helper.rb
79
81
  - test/test_buffer.rb
80
82
  - test/test_converter.rb
81
83
  - test/test_parser.rb
84
+ - test/test_parser_directive.rb
82
85
  - test/test_parser_fixed.rb
83
86
  - test/test_parser_formats.rb
84
87
  - test/test_parser_java.rb