csvreader 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -0
- data/README.md +30 -0
- data/lib/csvreader/base.rb +2 -1
- data/lib/csvreader/parser_std.rb +56 -21
- data/lib/csvreader/version.rb +1 -1
- data/test/data/iris.attrib.csv +25 -0
- data/test/data/lcc.attrib.csv +14 -0
- data/test/test_parser_directive.rb +68 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf620967ec1983a211f8e2436a4b50aca3bbe023
|
4
|
+
data.tar.gz: 76da0bbce4a76c4b60e37f1cb93be23d2aec504e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6024f630a6c982beffd597107cfa75c1e2d6e86e174408632f4e31aa8d4c5a2ea6be8608f678f64da6bd6ba914e9f3ed55fce044a25593bd92757a82bb0d082e
|
7
|
+
data.tar.gz: 98bed6e7938399640d942d5c8d9f420d01f4d048d06c09dec2f1e6e7e833a8c38c42419a520445b13166743615de7bd120eec20a4c607d377ebf40a0109bcc47
|
data/Manifest.txt
CHANGED
@@ -22,12 +22,15 @@ test/data/beer11.csv
|
|
22
22
|
test/data/cars11.csv
|
23
23
|
test/data/cities11.csv
|
24
24
|
test/data/customers11.csv
|
25
|
+
test/data/iris.attrib.csv
|
25
26
|
test/data/iris11.csv
|
27
|
+
test/data/lcc.attrib.csv
|
26
28
|
test/data/shakespeare.csv
|
27
29
|
test/helper.rb
|
28
30
|
test/test_buffer.rb
|
29
31
|
test/test_converter.rb
|
30
32
|
test/test_parser.rb
|
33
|
+
test/test_parser_directive.rb
|
31
34
|
test/test_parser_fixed.rb
|
32
35
|
test/test_parser_formats.rb
|
33
36
|
test/test_parser_java.rb
|
data/README.md
CHANGED
@@ -8,6 +8,36 @@
|
|
8
8
|
* forum :: [wwwmake](http://groups.google.com/group/wwwmake)
|
9
9
|
|
10
10
|
|
11
|
+
## What's New?
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
**v1.1.2**: Added built-in support for single quotes (`'`) to default parser ("The Right Way").
|
16
|
+
Now you can use both, that is, single (`'...'`) or double quotes (`"..."`)
|
17
|
+
like in ruby (or javascript or html or ...) :-).
|
18
|
+
|
19
|
+
|
20
|
+
**v1.1.1**: Added built-in support for (optional) alternative comments (`%`) - used by
|
21
|
+
ARFF (attribute relation file format) -
|
22
|
+
and support for (optional) directives (`@`) in header (that is, before any records)
|
23
|
+
to default parser ("The Right Way").
|
24
|
+
Now you can use either `#` or `%` for comments, the first one "wins" - you CANNOT use both.
|
25
|
+
Now you can use either a front matter (`---`) block
|
26
|
+
or directives (e.g. `@attribute`, `@relation`, etc.)
|
27
|
+
for meta data, the first one "wins" - you CANNOT use both.
|
28
|
+
|
29
|
+
|
30
|
+
**v1.1.0**: Added new fixed width field (fwf) parser (see `ParserFixed`) for supporting fields with fixed width (and no separator)
|
31
|
+
e.g. `Csv.fixed.parse( txt, width: [8,-2,8,-3,32,-2,14] )`.
|
32
|
+
|
33
|
+
|
34
|
+
**v1.0.3**: Added built-in support for an (optional) front matter (`---`) meta data block
|
35
|
+
in header (that is, before any records)
|
36
|
+
to default parser ("The Right Way"). See [CSVY.org](http://csvy.org) for more.
|
37
|
+
Use `Csv.parser.meta` to get the parsed meta data block hash (or `nil` if none).
|
38
|
+
|
39
|
+
|
40
|
+
|
11
41
|
|
12
42
|
## Usage
|
13
43
|
|
data/lib/csvreader/base.rb
CHANGED
data/lib/csvreader/parser_std.rb
CHANGED
@@ -10,13 +10,17 @@ class ParserStd
|
|
10
10
|
|
11
11
|
|
12
12
|
## char constants
|
13
|
-
DOUBLE_QUOTE
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
13
|
+
DOUBLE_QUOTE = "\""
|
14
|
+
SINGLE_QUOTE = "'"
|
15
|
+
BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
|
16
|
+
COMMENT1 = "#" ## use COMMENT_HASH or HASH or ??
|
17
|
+
COMMENT2 = "%" ## use COMMENT_PERCENT or PERCENT or ??
|
18
|
+
DIRECTIVE = "@" ## use a different name e.g. AT or ??
|
19
|
+
SPACE = " " ## \s == ASCII 32 (dec) = (Space)
|
20
|
+
TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
|
21
|
+
LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
|
22
|
+
CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
|
23
|
+
|
20
24
|
|
21
25
|
|
22
26
|
###################################
|
@@ -101,13 +105,14 @@ end ## method parse
|
|
101
105
|
|
102
106
|
|
103
107
|
|
108
|
+
|
104
109
|
private
|
105
110
|
|
106
111
|
def parse_escape( input )
|
107
112
|
value = ""
|
108
113
|
if input.peek == BACKSLASH
|
109
114
|
input.getc ## eat-up backslash
|
110
|
-
if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==
|
115
|
+
if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
|
111
116
|
logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
112
117
|
value << input.getc ## add escaped char (e.g. lf, cr, etc.)
|
113
118
|
else
|
@@ -122,23 +127,24 @@ def parse_escape( input )
|
|
122
127
|
end
|
123
128
|
|
124
129
|
|
125
|
-
|
130
|
+
|
131
|
+
def parse_quote( input, quote:)
|
126
132
|
value = ""
|
127
|
-
if input.peek ==
|
128
|
-
input.getc ## eat-up
|
133
|
+
if input.peek == quote
|
134
|
+
input.getc ## eat-up quote
|
129
135
|
|
130
136
|
loop do
|
131
|
-
while (c=input.peek; !(c==
|
132
|
-
value << input.getc ## eat-up everything until hitting
|
137
|
+
while (c=input.peek; !(c==quote || c==BACKSLASH || input.eof?))
|
138
|
+
value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
|
133
139
|
end
|
134
140
|
|
135
141
|
if input.eof?
|
136
142
|
break
|
137
143
|
elsif input.peek == BACKSLASH
|
138
144
|
value << parse_escape( input )
|
139
|
-
else ## assume input.peek ==
|
140
|
-
input.getc ## eat-up
|
141
|
-
if input.peek ==
|
145
|
+
else ## assume input.peek == quote
|
146
|
+
input.getc ## eat-up quote
|
147
|
+
if input.peek == quote ## doubled up quote?
|
142
148
|
value << input.getc ## add doube quote and continue!!!!
|
143
149
|
else
|
144
150
|
break
|
@@ -146,13 +152,14 @@ def parse_doublequote( input )
|
|
146
152
|
end
|
147
153
|
end
|
148
154
|
else
|
149
|
-
raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< -
|
155
|
+
raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
|
150
156
|
end
|
151
157
|
value
|
152
158
|
end
|
153
159
|
|
154
160
|
|
155
161
|
|
162
|
+
|
156
163
|
def parse_field( input )
|
157
164
|
value = ""
|
158
165
|
|
@@ -175,11 +182,18 @@ def parse_field( input )
|
|
175
182
|
end
|
176
183
|
elsif input.peek == DOUBLE_QUOTE
|
177
184
|
logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
178
|
-
value <<
|
185
|
+
value << parse_quote( input, quote: DOUBLE_QUOTE )
|
179
186
|
|
180
187
|
## note: always eat-up all trailing spaces (" ") and tabs (\t)
|
181
188
|
skip_spaces( input )
|
182
189
|
logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
190
|
+
elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
|
191
|
+
logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
192
|
+
value << parse_quote( input, quote: SINGLE_QUOTE )
|
193
|
+
|
194
|
+
## note: always eat-up all trailing spaces (" ") and tabs (\t)
|
195
|
+
skip_spaces( input )
|
196
|
+
logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
183
197
|
else
|
184
198
|
logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
185
199
|
## consume simple value
|
@@ -349,20 +363,41 @@ def parse_lines( input, &block )
|
|
349
363
|
## used for meta block (can only start before any records e.g. if record_num == 0)
|
350
364
|
record_num = 0
|
351
365
|
|
366
|
+
## note: can either use '#' or '%' but NOT both; first one "wins"
|
367
|
+
comment = nil
|
368
|
+
|
369
|
+
## note: can either use directives (@) or frontmatter (---) block; first one "wins"
|
370
|
+
has_seen_directive = false
|
371
|
+
has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
|
372
|
+
|
373
|
+
|
352
374
|
loop do
|
353
375
|
break if input.eof?
|
354
376
|
|
355
377
|
skipped_spaces = skip_spaces( input )
|
356
378
|
|
357
|
-
if input.peek ==
|
358
|
-
logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
379
|
+
if comment.nil? && (c=input.peek; c==COMMENT1 || c==COMMENT2)
|
380
|
+
logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
381
|
+
comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
|
382
|
+
skip_until_eol( input )
|
383
|
+
skip_newline( input )
|
384
|
+
elsif comment && input.peek == comment ## (anther) comment line
|
385
|
+
logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
359
386
|
skip_until_eol( input )
|
360
387
|
skip_newline( input )
|
361
388
|
elsif (c=input.peek; c==LF || c==CR || input.eof?)
|
362
389
|
logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
363
390
|
skip_newline( input )
|
364
|
-
elsif record_num == 0 &&
|
391
|
+
elsif record_num == 0 && has_seen_frontmatter == false && input.peek==DIRECTIVE
|
392
|
+
## note: "skip" directives for now
|
393
|
+
has_seen_directive = true
|
394
|
+
logger.debug "skip directive" if logger.debug?
|
395
|
+
skip_until_eol( input )
|
396
|
+
skip_newline( input )
|
397
|
+
elsif record_num == 0 && has_seen_directive == false && has_seen_frontmatter == false &&
|
398
|
+
skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
|
365
399
|
## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
|
400
|
+
has_seen_frontmatter = true
|
366
401
|
logger.debug "start meta block" if logger.debug?
|
367
402
|
## note: meta gets stored as object attribute (state/state/state!!)
|
368
403
|
## use meta attribute to get meta data after reading first record
|
data/lib/csvreader/version.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
% 1. Title: Iris Plants Database
|
2
|
+
%
|
3
|
+
% 2. Sources:
|
4
|
+
% (a) Creator: R.A. Fisher
|
5
|
+
|
6
|
+
|
7
|
+
@RELATION iris
|
8
|
+
|
9
|
+
@ATTRIBUTE sepallength NUMERIC
|
10
|
+
@ATTRIBUTE sepalwidth NUMERIC
|
11
|
+
@ATTRIBUTE petallength NUMERIC
|
12
|
+
@ATTRIBUTE petalwidth NUMERIC
|
13
|
+
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
|
14
|
+
|
15
|
+
@DATA
|
16
|
+
5.1,3.5,1.4,0.2,Iris-setosa
|
17
|
+
4.9,3.0,1.4,0.2,Iris-setosa
|
18
|
+
4.7,3.2,1.3,0.2,Iris-setosa
|
19
|
+
4.6,3.1,1.5,0.2,Iris-setosa
|
20
|
+
5.0,3.6,1.4,0.2,Iris-setosa
|
21
|
+
5.4,3.9,1.7,0.4,Iris-setosa
|
22
|
+
4.6,3.4,1.4,0.3,Iris-setosa
|
23
|
+
5.0,3.4,1.5,0.2,Iris-setosa
|
24
|
+
4.4,2.9,1.4,0.2,Iris-setosa
|
25
|
+
4.9,3.1,1.5,0.1,Iris-setosa
|
@@ -0,0 +1,14 @@
|
|
1
|
+
% Attribute-Relation File Format (ARFF) Example
|
2
|
+
% see https://www.cs.waikato.ac.nz/ml/weka/arff.html
|
3
|
+
|
4
|
+
@relation LCCvsLCSH
|
5
|
+
|
6
|
+
@attribute LCC string
|
7
|
+
@attribute LCSH string
|
8
|
+
|
9
|
+
@data
|
10
|
+
AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
|
11
|
+
AS262, 'Science -- Soviet Union -- History.'
|
12
|
+
AE5, 'Encyclopedias and dictionaries.'
|
13
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
|
14
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_parser_directive.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
class TestParserDirective < MiniTest::Test
|
11
|
+
|
12
|
+
|
13
|
+
def parser
|
14
|
+
parser = CsvReader::Parser::DEFAULT
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
def test_iris
|
19
|
+
records = [["5.1","3.5","1.4","0.2","Iris-setosa"],
|
20
|
+
["4.9","3.0","1.4","0.2","Iris-setosa"]]
|
21
|
+
|
22
|
+
|
23
|
+
assert_equal records, parser.parse( <<TXT )
|
24
|
+
% with meta data - arff (attribute relation file format)-style
|
25
|
+
%
|
26
|
+
|
27
|
+
@RELATION iris
|
28
|
+
|
29
|
+
@ATTRIBUTE sepallength NUMERIC
|
30
|
+
@ATTRIBUTE sepalwidth NUMERIC
|
31
|
+
@ATTRIBUTE petallength NUMERIC
|
32
|
+
@ATTRIBUTE petalwidth NUMERIC
|
33
|
+
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
|
34
|
+
|
35
|
+
@DATA
|
36
|
+
5.1,3.5,1.4,0.2,Iris-setosa
|
37
|
+
4.9,3.0,1.4,0.2,Iris-setosa
|
38
|
+
TXT
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
def test_lcc
|
43
|
+
records = [['AG5', 'Encyclopedias and dictionaries.;Twentieth century.'],
|
44
|
+
['AS262', 'Science -- Soviet Union -- History.'],
|
45
|
+
['AE5', 'Encyclopedias and dictionaries.'],
|
46
|
+
['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'],
|
47
|
+
['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Tables.']]
|
48
|
+
|
49
|
+
|
50
|
+
assert_equal records, parser.parse( <<TXT )
|
51
|
+
% Attribute-Relation File Format (ARFF) Example
|
52
|
+
% see https://www.cs.waikato.ac.nz/ml/weka/arff.html
|
53
|
+
|
54
|
+
@relation LCCvsLCSH
|
55
|
+
|
56
|
+
@attribute LCC string
|
57
|
+
@attribute LCSH string
|
58
|
+
|
59
|
+
@data
|
60
|
+
AG5, 'Encyclopedias and dictionaries.;Twentieth century.'
|
61
|
+
AS262, 'Science -- Soviet Union -- History.'
|
62
|
+
AE5, 'Encyclopedias and dictionaries.'
|
63
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'
|
64
|
+
AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.'
|
65
|
+
TXT
|
66
|
+
end
|
67
|
+
|
68
|
+
end # class TestParserDirective
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-10-
|
11
|
+
date: 2018-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdoc
|
@@ -73,12 +73,15 @@ files:
|
|
73
73
|
- test/data/cars11.csv
|
74
74
|
- test/data/cities11.csv
|
75
75
|
- test/data/customers11.csv
|
76
|
+
- test/data/iris.attrib.csv
|
76
77
|
- test/data/iris11.csv
|
78
|
+
- test/data/lcc.attrib.csv
|
77
79
|
- test/data/shakespeare.csv
|
78
80
|
- test/helper.rb
|
79
81
|
- test/test_buffer.rb
|
80
82
|
- test/test_converter.rb
|
81
83
|
- test/test_parser.rb
|
84
|
+
- test/test_parser_directive.rb
|
82
85
|
- test/test_parser_fixed.rb
|
83
86
|
- test/test_parser_formats.rb
|
84
87
|
- test/test_parser_java.rb
|