csvreader 1.1.1 → 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -0
- data/README.md +30 -0
- data/lib/csvreader/base.rb +2 -1
- data/lib/csvreader/parser_std.rb +56 -21
- data/lib/csvreader/version.rb +1 -1
- data/test/data/iris.attrib.csv +25 -0
- data/test/data/lcc.attrib.csv +14 -0
- data/test/test_parser_directive.rb +68 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf620967ec1983a211f8e2436a4b50aca3bbe023
|
4
|
+
data.tar.gz: 76da0bbce4a76c4b60e37f1cb93be23d2aec504e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6024f630a6c982beffd597107cfa75c1e2d6e86e174408632f4e31aa8d4c5a2ea6be8608f678f64da6bd6ba914e9f3ed55fce044a25593bd92757a82bb0d082e
|
7
|
+
data.tar.gz: 98bed6e7938399640d942d5c8d9f420d01f4d048d06c09dec2f1e6e7e833a8c38c42419a520445b13166743615de7bd120eec20a4c607d377ebf40a0109bcc47
|
data/Manifest.txt
CHANGED
@@ -22,12 +22,15 @@ test/data/beer11.csv
|
|
22
22
|
test/data/cars11.csv
|
23
23
|
test/data/cities11.csv
|
24
24
|
test/data/customers11.csv
|
25
|
+
test/data/iris.attrib.csv
|
25
26
|
test/data/iris11.csv
|
27
|
+
test/data/lcc.attrib.csv
|
26
28
|
test/data/shakespeare.csv
|
27
29
|
test/helper.rb
|
28
30
|
test/test_buffer.rb
|
29
31
|
test/test_converter.rb
|
30
32
|
test/test_parser.rb
|
33
|
+
test/test_parser_directive.rb
|
31
34
|
test/test_parser_fixed.rb
|
32
35
|
test/test_parser_formats.rb
|
33
36
|
test/test_parser_java.rb
|
data/README.md
CHANGED
@@ -8,6 +8,36 @@
|
|
8
8
|
* forum :: [wwwmake](http://groups.google.com/group/wwwmake)
|
9
9
|
|
10
10
|
|
11
|
+
## What's New?
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
**v1.1.2**: Added built-in support for single quotes (`'`) to default parser ("The Right Way").
|
16
|
+
Now you can use either single (`'...'`) or double (`"..."`) quotes
|
17
|
+
like in ruby (or javascript or html or ...) :-).
|
18
|
+
|
19
|
+
|
20
|
+
**v1.1.1**: Added built-in support for (optional) alternative comments (`%`) - used by
|
21
|
+
ARFF (attribute relation file format) -
|
22
|
+
and support for (optional) directives (`@`) in header (that is, before any records)
|
23
|
+
to default parser ("The Right Way").
|
24
|
+
Now you can use either `#` or `%` for comments, the first one "wins" - you CANNOT use both.
|
25
|
+
Now you can use either a front matter (`---`) block
|
26
|
+
or directives (e.g. `@attribute`, `@relation`, etc.)
|
27
|
+
for meta data, the first one "wins" - you CANNOT use both.
|
28
|
+
|
29
|
+
|
30
|
+
**v1.1.0**: Added new fixed width field (fwf) parser (see `ParserFixed`) for supporting fields with fixed width (and no separator)
|
31
|
+
e.g. `Csv.fixed.parse( txt, width: [8,-2,8,-3,32,-2,14] )`.
|
32
|
+
|
33
|
+
|
34
|
+
**v1.0.3**: Added built-in support for an (optional) front matter (`---`) meta data block
|
35
|
+
in header (that is, before any records)
|
36
|
+
to default parser ("The Right Way"). See [CSVY.org](http://csvy.org) for more.
|
37
|
+
Use `Csv.parser.meta` to get the parsed meta data block hash (or `nil` if none).
|
38
|
+
|
39
|
+
|
40
|
+
|
11
41
|
|
12
42
|
## Usage
|
13
43
|
|
data/lib/csvreader/base.rb
CHANGED
data/lib/csvreader/parser_std.rb
CHANGED
@@ -10,13 +10,17 @@ class ParserStd
|
|
10
10
|
|
11
11
|
|
12
12
|
## char constants
|
13
|
-
DOUBLE_QUOTE
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
13
|
+
DOUBLE_QUOTE = "\""
|
14
|
+
SINGLE_QUOTE = "'"
|
15
|
+
BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
|
16
|
+
COMMENT1 = "#" ## use COMMENT_HASH or HASH or ??
|
17
|
+
COMMENT2 = "%" ## use COMMENT_PERCENT or PERCENT or ??
|
18
|
+
DIRECTIVE = "@" ## use a different name e.g. AT or ??
|
19
|
+
SPACE = " " ## \s == ASCII 32 (dec) = (Space)
|
20
|
+
TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
|
21
|
+
LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
|
22
|
+
CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
|
23
|
+
|
20
24
|
|
21
25
|
|
22
26
|
###################################
|
@@ -101,13 +105,14 @@ end ## method parse
|
|
101
105
|
|
102
106
|
|
103
107
|
|
108
|
+
|
104
109
|
private
|
105
110
|
|
106
111
|
def parse_escape( input )
|
107
112
|
value = ""
|
108
113
|
if input.peek == BACKSLASH
|
109
114
|
input.getc ## eat-up backslash
|
110
|
-
if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==
|
115
|
+
if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
|
111
116
|
logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
112
117
|
value << input.getc ## add escaped char (e.g. lf, cr, etc.)
|
113
118
|
else
|
@@ -122,23 +127,24 @@ def parse_escape( input )
|
|
122
127
|
end
|
123
128
|
|
124
129
|
|
125
|
-
|
130
|
+
|
131
|
+
def parse_quote( input, quote:)
|
126
132
|
value = ""
|
127
|
-
if input.peek ==
|
128
|
-
input.getc ## eat-up
|
133
|
+
if input.peek == quote
|
134
|
+
input.getc ## eat-up quote
|
129
135
|
|
130
136
|
loop do
|
131
|
-
while (c=input.peek; !(c==
|
132
|
-
value << input.getc ## eat-up everything until hitting
|
137
|
+
while (c=input.peek; !(c==quote || c==BACKSLASH || input.eof?))
|
138
|
+
value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
|
133
139
|
end
|
134
140
|
|
135
141
|
if input.eof?
|
136
142
|
break
|
137
143
|
elsif input.peek == BACKSLASH
|
138
144
|
value << parse_escape( input )
|
139
|
-
else ## assume input.peek ==
|
140
|
-
input.getc ## eat-up
|
141
|
-
if input.peek ==
|
145
|
+
else ## assume input.peek == quote
|
146
|
+
input.getc ## eat-up quote
|
147
|
+
if input.peek == quote ## doubled up quote?
|
142
148
|
value << input.getc ## add doube quote and continue!!!!
|
143
149
|
else
|
144
150
|
break
|
@@ -146,13 +152,14 @@ def parse_doublequote( input )
|
|
146
152
|
end
|
147
153
|
end
|
148
154
|
else
|
149
|
-
raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< -
|
155
|
+
raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
|
150
156
|
end
|
151
157
|
value
|
152
158
|
end
|
153
159
|
|
154
160
|
|
155
161
|
|
162
|
+
|
156
163
|
def parse_field( input )
|
157
164
|
value = ""
|
158
165
|
|
@@ -175,11 +182,18 @@ def parse_field( input )
|
|
175
182
|
end
|
176
183
|
elsif input.peek == DOUBLE_QUOTE
|
177
184
|
logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
178
|
-
value <<
|
185
|
+
value << parse_quote( input, quote: DOUBLE_QUOTE )
|
179
186
|
|
180
187
|
## note: always eat-up all trailing spaces (" ") and tabs (\t)
|
181
188
|
skip_spaces( input )
|
182
189
|
logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
190
|
+
elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
|
191
|
+
logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
192
|
+
value << parse_quote( input, quote: SINGLE_QUOTE )
|
193
|
+
|
194
|
+
## note: always eat-up all trailing spaces (" ") and tabs (\t)
|
195
|
+
skip_spaces( input )
|
196
|
+
logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
183
197
|
else
|
184
198
|
logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
185
199
|
## consume simple value
|
@@ -349,20 +363,41 @@ def parse_lines( input, &block )
|
|
349
363
|
## used for meta block (can only start before any records e.g. if record_num == 0)
|
350
364
|
record_num = 0
|
351
365
|
|
366
|
+
## note: can either use '#' or '%' but NOT both; first one "wins"
|
367
|
+
comment = nil
|
368
|
+
|
369
|
+
## note: can either use directives (@) or frontmatter (---) block; first one "wins"
|
370
|
+
has_seen_directive = false
|
371
|
+
has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
|
372
|
+
|
373
|
+
|
352
374
|
loop do
|
353
375
|
break if input.eof?
|
354
376
|
|
355
377
|
skipped_spaces = skip_spaces( input )
|
356
378
|
|
357
|
-
if input.peek ==
|
358
|
-
logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
379
|
+
if comment.nil? && (c=input.peek; c==COMMENT1 || c==COMMENT2)
|
380
|
+
logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
381
|
+
comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
|
382
|
+
skip_until_eol( input )
|
383
|
+
skip_newline( input )
|
384
|
+
elsif comment && input.peek == comment ## (anther) comment line
|
385
|
+
logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
359
386
|
skip_until_eol( input )
|
360
387
|
skip_newline( input )
|
361
388
|
elsif (c=input.peek; c==LF || c==CR || input.eof?)
|
362
389
|
logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
363
390
|
skip_newline( input )
|
364
|
-
elsif record_num == 0 &&
|
391
|
+
elsif record_num == 0 && has_seen_frontmatter == false && input.peek==DIRECTIVE
|
392
|
+
## note: "skip" directives for now
|
393
|
+
has_seen_directive = true
|
394
|
+
logger.debug "skip directive" if logger.debug?
|
395
|
+
skip_until_eol( input )
|
396
|
+
skip_newline( input )
|
397
|
+
elsif record_num == 0 && has_seen_directive == false && has_seen_frontmatter == false &&
|
398
|
+
skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
|
365
399
|
## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
|
400
|
+
has_seen_frontmatter = true
|
366
401
|
logger.debug "start meta block" if logger.debug?
|
367
402
|
## note: meta gets stored as object attribute (state/state/state!!)
|
368
403
|
## use meta attribute to get meta data after reading first record
|
data/lib/csvreader/version.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
% 1. Title: Iris Plants Database
|
2
|
+
%
|
3
|
+
% 2. Sources:
|
4
|
+
% (a) Creator: R.A. Fisher
|
5
|
+
|
6
|
+
|
7
|
+
@RELATION iris
|
8
|
+
|
9
|
+
@ATTRIBUTE sepallength NUMERIC
|
10
|
+
@ATTRIBUTE sepalwidth NUMERIC
|
11
|
+
@ATTRIBUTE petallength NUMERIC
|
12
|
+
@ATTRIBUTE petalwidth NUMERIC
|
13
|
+
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
|
14
|
+
|
15
|
+
@DATA
|
16
|
+
5.1,3.5,1.4,0.2,Iris-setosa
|
17
|
+
4.9,3.0,1.4,0.2,Iris-setosa
|
18
|
+
4.7,3.2,1.3,0.2,Iris-setosa
|
19
|
+
4.6,3.1,1.5,0.2,Iris-setosa
|
20
|
+
5.0,3.6,1.4,0.2,Iris-setosa
|
21
|
+
5.4,3.9,1.7,0.4,Iris-setosa
|
22
|
+
4.6,3.4,1.4,0.3,Iris-setosa
|
23
|
+
5.0,3.4,1.5,0.2,Iris-setosa
|
24
|
+
4.4,2.9,1.4,0.2,Iris-setosa
|
25
|
+
4.9,3.1,1.5,0.1,Iris-setosa
|
@@ -0,0 +1,14 @@
|
|
1
|
+
% Attribute-Relation File Format (ARFF) Example
|
2
|
+
% see https://www.cs.waikato.ac.nz/ml/weka/arff.html
|
3
|
+
|
4
|
+
@relation LCCvsLCSH
|
5
|
+
|
6
|
+
@attribute LCC string
|
7
|
+
@attribute LCSH string
|
8
|
+
|
9
|
+
@data
|
10
|
+
AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
|
11
|
+
AS262, 'Science -- Soviet Union -- History.'
|
12
|
+
AE5, 'Encyclopedias and dictionaries.'
|
13
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
|
14
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_parser_directive.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
class TestParserDirective < MiniTest::Test
|
11
|
+
|
12
|
+
|
13
|
+
def parser
|
14
|
+
parser = CsvReader::Parser::DEFAULT
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
def test_iris
|
19
|
+
records = [["5.1","3.5","1.4","0.2","Iris-setosa"],
|
20
|
+
["4.9","3.0","1.4","0.2","Iris-setosa"]]
|
21
|
+
|
22
|
+
|
23
|
+
assert_equal records, parser.parse( <<TXT )
|
24
|
+
% with meta data - arff (attribute relation file format)-style
|
25
|
+
%
|
26
|
+
|
27
|
+
@RELATION iris
|
28
|
+
|
29
|
+
@ATTRIBUTE sepallength NUMERIC
|
30
|
+
@ATTRIBUTE sepalwidth NUMERIC
|
31
|
+
@ATTRIBUTE petallength NUMERIC
|
32
|
+
@ATTRIBUTE petalwidth NUMERIC
|
33
|
+
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
|
34
|
+
|
35
|
+
@DATA
|
36
|
+
5.1,3.5,1.4,0.2,Iris-setosa
|
37
|
+
4.9,3.0,1.4,0.2,Iris-setosa
|
38
|
+
TXT
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
def test_lcc
|
43
|
+
records = [['AG5', 'Encyclopedias and dictionaries.;Twentieth century.'],
|
44
|
+
['AS262', 'Science -- Soviet Union -- History.'],
|
45
|
+
['AE5', 'Encyclopedias and dictionaries.'],
|
46
|
+
['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'],
|
47
|
+
['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Tables.']]
|
48
|
+
|
49
|
+
|
50
|
+
assert_equal records, parser.parse( <<TXT )
|
51
|
+
% Attribute-Relation File Format (ARFF) Example
|
52
|
+
% see https://www.cs.waikato.ac.nz/ml/weka/arff.html
|
53
|
+
|
54
|
+
@relation LCCvsLCSH
|
55
|
+
|
56
|
+
@attribute LCC string
|
57
|
+
@attribute LCSH string
|
58
|
+
|
59
|
+
@data
|
60
|
+
AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
|
61
|
+
AS262, 'Science -- Soviet Union -- History.'
|
62
|
+
AE5, 'Encyclopedias and dictionaries.'
|
63
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
|
64
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
|
65
|
+
TXT
|
66
|
+
end
|
67
|
+
|
68
|
+
end # class TestParserDirective
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-10-
|
11
|
+
date: 2018-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdoc
|
@@ -73,12 +73,15 @@ files:
|
|
73
73
|
- test/data/cars11.csv
|
74
74
|
- test/data/cities11.csv
|
75
75
|
- test/data/customers11.csv
|
76
|
+
- test/data/iris.attrib.csv
|
76
77
|
- test/data/iris11.csv
|
78
|
+
- test/data/lcc.attrib.csv
|
77
79
|
- test/data/shakespeare.csv
|
78
80
|
- test/helper.rb
|
79
81
|
- test/test_buffer.rb
|
80
82
|
- test/test_converter.rb
|
81
83
|
- test/test_parser.rb
|
84
|
+
- test/test_parser_directive.rb
|
82
85
|
- test/test_parser_fixed.rb
|
83
86
|
- test/test_parser_formats.rb
|
84
87
|
- test/test_parser_java.rb
|