csvreader 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +2 -0
- data/README.md +113 -7
- data/lib/csvreader.rb +12 -138
- data/lib/csvreader/base.rb +144 -0
- data/lib/csvreader/builder.rb +8 -6
- data/lib/csvreader/parser.rb +6 -0
- data/lib/csvreader/parser_strict.rb +57 -6
- data/lib/csvreader/reader.rb +0 -3
- data/lib/csvreader/reader_hash.rb +9 -3
- data/lib/csvreader/version.rb +1 -1
- data/test/helper.rb +2 -0
- data/test/test_parser_numeric.rb +38 -0
- data/test/test_reader.rb +5 -3
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47c630dbfe75b03e4f2d03710cca0f4b3c66ea84
|
4
|
+
data.tar.gz: 7b604c9b9144190b1b8b4a4dcd848ea5b7f88aa4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dad1ae27b4273b8e5a22cf6eae60f141eca381229ed03bfbe6f403e99aae07a16aa4fc84e1f22a953cd9db5da1b9fa2fb6266666fdc6a756872e8bd4ec8dfb9
|
7
|
+
data.tar.gz: 38b0002ea3bdfff0b7ce994064d4fb4993e75a3b1225128a50ad9e18180c12cdae9d2a40b6f694e70ebccf4d65005fd5a2145450c20a565de8d3f5af7b398a58
|
data/Manifest.txt
CHANGED
@@ -4,6 +4,7 @@ Manifest.txt
|
|
4
4
|
README.md
|
5
5
|
Rakefile
|
6
6
|
lib/csvreader.rb
|
7
|
+
lib/csvreader/base.rb
|
7
8
|
lib/csvreader/buffer.rb
|
8
9
|
lib/csvreader/builder.rb
|
9
10
|
lib/csvreader/converter.rb
|
@@ -26,6 +27,7 @@ test/test_parser.rb
|
|
26
27
|
test/test_parser_formats.rb
|
27
28
|
test/test_parser_java.rb
|
28
29
|
test/test_parser_null.rb
|
30
|
+
test/test_parser_numeric.rb
|
29
31
|
test/test_parser_strict.rb
|
30
32
|
test/test_parser_tab.rb
|
31
33
|
test/test_reader.rb
|
data/README.md
CHANGED
@@ -21,14 +21,14 @@ TXT
|
|
21
21
|
records = Csv.parse( txt ) ## or CsvReader.parse
|
22
22
|
pp records
|
23
23
|
# => [["1","2","3"],
|
24
|
-
# ["
|
24
|
+
# ["4","5","6"]]
|
25
25
|
|
26
26
|
# -or-
|
27
27
|
|
28
28
|
records = Csv.read( "values.csv" ) ## or CsvReader.read
|
29
29
|
pp records
|
30
30
|
# => [["1","2","3"],
|
31
|
-
# ["
|
31
|
+
# ["4","5","6"]]
|
32
32
|
|
33
33
|
# -or-
|
34
34
|
|
@@ -36,11 +36,11 @@ Csv.foreach( "values.csv" ) do |rec| ## or CsvReader.foreach
|
|
36
36
|
pp rec
|
37
37
|
end
|
38
38
|
# => ["1","2","3"]
|
39
|
-
# => ["
|
39
|
+
# => ["4","5","6"]
|
40
40
|
```
|
41
41
|
|
42
42
|
|
43
|
-
### What about converters?
|
43
|
+
### What about type inference and data converters?
|
44
44
|
|
45
45
|
Use the converters keyword option to (auto-)convert strings to nulls, booleans, integers, floats, dates, etc.
|
46
46
|
Example:
|
@@ -72,6 +72,18 @@ Built-in converters include:
|
|
72
72
|
| `:all` | shortcut for `[:null, :boolean, :date_time, :numeric]` |
|
73
73
|
|
74
74
|
|
75
|
+
Or add your own converters. Example:
|
76
|
+
|
77
|
+
``` ruby
|
78
|
+
Csv.parse( 'Ruby, 2020-03-01, 100', converters: [->(v) { Time.parse(v) rescue v }] )
|
79
|
+
#=> [["Ruby", 2020-03-01 00:00:00 +0200, "100"]]
|
80
|
+
```
|
81
|
+
|
82
|
+
A custom converter is a callable (e.g. a lambda or a method) that gets the value passed in
|
83
|
+
and if successful returns a non-string type (e.g. integer, float, date, etc.)
|
84
|
+
or a string (for further processing with all other converters in the "pipeline" configuration).
|
85
|
+
|
86
|
+
|
75
87
|
|
76
88
|
### What about Enumerable?
|
77
89
|
|
@@ -94,7 +106,7 @@ it = csv.to_enum
|
|
94
106
|
pp it.next
|
95
107
|
# => ["1","2","3"]
|
96
108
|
pp it.next
|
97
|
-
# => ["
|
109
|
+
# => ["4","5","6"]
|
98
110
|
```
|
99
111
|
|
100
112
|
|
@@ -150,7 +162,7 @@ end
|
|
150
162
|
|
151
163
|
### What about symbol keys for hashes?
|
152
164
|
|
153
|
-
Yes,
|
165
|
+
Yes, you can use the header_converters keyword option.
|
154
166
|
Use `:symbol` for (auto-)converting header (strings) to symbols.
|
155
167
|
Note: the symbol converter will also downcase all letters and
|
156
168
|
remove all non-alphanumeric (e.g. `!?$%`) chars
|
@@ -169,6 +181,15 @@ records = CsvHash.parse( txt, :converters => :all, :header_converters => :symbol
|
|
169
181
|
pp records
|
170
182
|
# => [{a: 1, b: 2, c: 3},
|
171
183
|
# {a: true, b: false, c: nil}]
|
184
|
+
|
185
|
+
# -or-
|
186
|
+
options = { :converters => :all,
|
187
|
+
:header_converters => :symbol }
|
188
|
+
|
189
|
+
records = CsvHash.parse( txt, options )
|
190
|
+
pp records
|
191
|
+
# => [{a: 1, b: 2, c: 3},
|
192
|
+
# {a: true, b: false, c: nil}]
|
172
193
|
```
|
173
194
|
|
174
195
|
Built-in header converters include:
|
@@ -180,6 +201,91 @@ Built-in header converters include:
|
|
180
201
|
|
181
202
|
|
182
203
|
|
204
|
+
### What about (typed) structs?
|
205
|
+
|
206
|
+
See the [csvrecord library »](https://github.com/csv11/csvrecord)
|
207
|
+
|
208
|
+
Example from the csvrecord documentation:
|
209
|
+
|
210
|
+
Step 1: Define a (typed) struct for the comma-separated values (csv) records. Example:
|
211
|
+
|
212
|
+
```ruby
|
213
|
+
require 'csvrecord'
|
214
|
+
|
215
|
+
Beer = CsvRecord.define do
|
216
|
+
field :brewery ## note: default type is :string
|
217
|
+
field :city
|
218
|
+
field :name
|
219
|
+
field :abv, Float ## allows type specified as class (or use :float)
|
220
|
+
end
|
221
|
+
```
|
222
|
+
|
223
|
+
or in "classic" style:
|
224
|
+
|
225
|
+
```ruby
|
226
|
+
class Beer < CsvRecord::Base
|
227
|
+
field :brewery
|
228
|
+
field :city
|
229
|
+
field :name
|
230
|
+
field :abv, Float
|
231
|
+
end
|
232
|
+
```
|
233
|
+
|
234
|
+
|
235
|
+
Step 2: Read in the comma-separated values (csv) datafile. Example:
|
236
|
+
|
237
|
+
```ruby
|
238
|
+
beers = Beer.read( 'beer.csv' )
|
239
|
+
|
240
|
+
puts "#{beers.size} beers:"
|
241
|
+
pp beers
|
242
|
+
```
|
243
|
+
|
244
|
+
pretty prints (pp):
|
245
|
+
|
246
|
+
```
|
247
|
+
6 beers:
|
248
|
+
[#<Beer:0x302c760 @values=
|
249
|
+
["Andechser Klosterbrauerei", "Andechs", "Doppelbock Dunkel", 7.0]>,
|
250
|
+
#<Beer:0x3026fe8 @values=
|
251
|
+
["Augustiner Br\u00E4u M\u00FCnchen", "M\u00FCnchen", "Edelstoff", 5.6]>,
|
252
|
+
#<Beer:0x30257a0 @values=
|
253
|
+
["Bayerische Staatsbrauerei Weihenstephan", "Freising", "Hefe Weissbier", 5.4]>,
|
254
|
+
...
|
255
|
+
]
|
256
|
+
```
|
257
|
+
|
258
|
+
Or loop over the records. Example:
|
259
|
+
|
260
|
+
``` ruby
|
261
|
+
Beer.read( 'beer.csv' ).each do |rec|
|
262
|
+
puts "#{rec.name} (#{rec.abv}%) by #{rec.brewery}, #{rec.city}"
|
263
|
+
end
|
264
|
+
|
265
|
+
# -or-
|
266
|
+
|
267
|
+
Beer.foreach( 'beer.csv' ) do |rec|
|
268
|
+
puts "#{rec.name} (#{rec.abv}%) by #{rec.brewery}, #{rec.city}"
|
269
|
+
end
|
270
|
+
```
|
271
|
+
|
272
|
+
|
273
|
+
printing:
|
274
|
+
|
275
|
+
```
|
276
|
+
Doppelbock Dunkel (7.0%) by Andechser Klosterbrauerei, Andechs
|
277
|
+
Edelstoff (5.6%) by Augustiner Bräu München, München
|
278
|
+
Hefe Weissbier (5.4%) by Bayerische Staatsbrauerei Weihenstephan, Freising
|
279
|
+
Rauchbier Märzen (5.1%) by Brauerei Spezial, Bamberg
|
280
|
+
Münchner Dunkel (5.0%) by Hacker-Pschorr Bräu, München
|
281
|
+
Hofbräu Oktoberfestbier (6.3%) by Staatliches Hofbräuhaus München, München
|
282
|
+
```
|
283
|
+
|
284
|
+
|
285
|
+
### What about tabular data packages with pre-defined types / schemas?
|
286
|
+
|
287
|
+
See the [csvpack library »](https://github.com/csv11/csvpack)
|
288
|
+
|
183
289
|
|
184
290
|
|
185
291
|
|
@@ -319,7 +425,7 @@ Csv.strict.read( ..., sep: "\t" )
|
|
319
425
|
|
320
426
|
Two major design bugs and many, many minor ones.
|
321
427
|
|
322
|
-
(1) The CSV class uses [`line.split(',')`](https://github.com/ruby/csv/blob/master/lib/csv.rb#
|
428
|
+
(1) The CSV class uses [`line.split(',')`](https://github.com/ruby/csv/blob/master/lib/csv.rb#L1255) with some kludges (†) with the claim it's faster.
|
323
429
|
What?! The right way: CSV needs its own purpose-built parser. There's no other
|
324
430
|
way you can handle all the (edge) cases with double quotes and escaped doubled up
|
325
431
|
double quotes. Period.
|
data/lib/csvreader.rb
CHANGED
@@ -1,138 +1,12 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
require '
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
require 'csvreader/version' # let version always go first
|
14
|
-
require 'csvreader/buffer'
|
15
|
-
require 'csvreader/parser_std' # best practices pre-configured out-of-the-box
|
16
|
-
require 'csvreader/parser_strict' # flexible (strict - no leading/trailing space triming, blanks, etc.), configure for different formats/dialects
|
17
|
-
require 'csvreader/parser_tab'
|
18
|
-
require 'csvreader/parser'
|
19
|
-
require 'csvreader/builder'
|
20
|
-
require 'csvreader/reader'
|
21
|
-
require 'csvreader/reader_hash'
|
22
|
-
require 'csvreader/converter'
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
class CsvReader
|
27
|
-
class Parser
|
28
|
-
|
29
|
-
## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
|
30
|
-
## parser must support parse method (with and without block)
|
31
|
-
## e.g. records = parse( data )
|
32
|
-
## -or-
|
33
|
-
## parse( data ) do |record|
|
34
|
-
## end
|
35
|
-
|
36
|
-
DEFAULT = ParserStd.new
|
37
|
-
|
38
|
-
RFC4180 = ParserStrict.new
|
39
|
-
STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
|
40
|
-
EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
|
41
|
-
|
42
|
-
MYSQL = ParserStrict.new( sep: "\t",
|
43
|
-
quote: false,
|
44
|
-
escape: true,
|
45
|
-
null: "\\N" )
|
46
|
-
|
47
|
-
POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
|
48
|
-
escape: true,
|
49
|
-
null: "" )
|
50
|
-
|
51
|
-
POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
|
52
|
-
quote: false,
|
53
|
-
escape: true,
|
54
|
-
null: "\\N" )
|
55
|
-
|
56
|
-
TAB = ParserTab.new
|
57
|
-
|
58
|
-
|
59
|
-
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
60
|
-
def self.strict() STRICT; end ## alternative alias for STRICT
|
61
|
-
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
62
|
-
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
63
|
-
def self.mysql() MYSQL; end
|
64
|
-
def self.postgresql() POSTGRESQL; end
|
65
|
-
def self.postgres() postgresql; end
|
66
|
-
def self.postgresql_text() POSTGRESQL_TEXT; end
|
67
|
-
def self.postgres_text() postgresql_text; end
|
68
|
-
def self.tab() TAB; end
|
69
|
-
end # class Parser
|
70
|
-
end # class CsvReader
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
class CsvReader
|
75
|
-
### pre-define CsvReader (built-in) formats/dialect
|
76
|
-
DEFAULT = CsvBuilder.new( Parser::DEFAULT )
|
77
|
-
|
78
|
-
STRICT = CsvBuilder.new( Parser::STRICT )
|
79
|
-
RFC4180 = CsvBuilder.new( Parser::RFC4180 )
|
80
|
-
EXCEL = CsvBuilder.new( Parser::EXCEL )
|
81
|
-
|
82
|
-
MYSQL = CsvBuilder.new( Parser::MYSQL )
|
83
|
-
POSTGRES = POSTGRESQL = CsvBuilder.new( Parser::POSTGRESQL )
|
84
|
-
POSTGRES_TEXT = POSTGRESQL_TEXT = CsvBuilder.new( Parser::POSTGRESQL_TEXT )
|
85
|
-
|
86
|
-
TAB = CsvBuilder.new( Parser::TAB )
|
87
|
-
|
88
|
-
|
89
|
-
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
90
|
-
def self.strict() STRICT; end ## alternative alias for STRICT
|
91
|
-
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
92
|
-
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
93
|
-
def self.mysql() MYSQL; end
|
94
|
-
def self.postgresql() POSTGRESQL; end
|
95
|
-
def self.postgres() postgresql; end
|
96
|
-
def self.postgresql_text() POSTGRESQL_TEXT; end
|
97
|
-
def self.postgres_text() postgresql_text; end
|
98
|
-
def self.tab() TAB; end
|
99
|
-
end # class CsvReader
|
100
|
-
|
101
|
-
|
102
|
-
class CsvHashReader
|
103
|
-
### pre-define CsvReader (built-in) formats/dialect
|
104
|
-
DEFAULT = CsvHashBuilder.new( CsvReader::Parser::DEFAULT )
|
105
|
-
|
106
|
-
STRICT = CsvHashBuilder.new( CsvReader::Parser::STRICT )
|
107
|
-
RFC4180 = CsvHashBuilder.new( CsvReader::Parser::RFC4180 )
|
108
|
-
EXCEL = CsvHashBuilder.new( CsvReader::Parser::EXCEL )
|
109
|
-
|
110
|
-
MYSQL = CsvHashBuilder.new( CsvReader::Parser::MYSQL )
|
111
|
-
POSTGRES = POSTGRESQL = CsvHashBuilder.new( CsvReader::Parser::POSTGRESQL )
|
112
|
-
POSTGRES_TEXT = POSTGRESQL_TEXT = CsvHashBuilder.new( CsvReader::Parser::POSTGRESQL_TEXT )
|
113
|
-
|
114
|
-
TAB = CsvHashBuilder.new( CsvReader::Parser::TAB )
|
115
|
-
|
116
|
-
|
117
|
-
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
118
|
-
def self.strict() STRICT; end ## alternative alias for STRICT
|
119
|
-
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
120
|
-
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
121
|
-
def self.mysql() MYSQL; end
|
122
|
-
def self.postgresql() POSTGRESQL; end
|
123
|
-
def self.postgres() postgresql; end
|
124
|
-
def self.postgresql_text() POSTGRESQL_TEXT; end
|
125
|
-
def self.postgres_text() postgresql_text; end
|
126
|
-
def self.tab() TAB; end
|
127
|
-
end # class CsvHashReader
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
## add convenience / shortcut alias
|
133
|
-
Csv = CsvReader
|
134
|
-
CsvHash = CsvHashReader
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
puts CsvReader.banner # say hello
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
## our own code (without "top-level" shortcuts e.g. "modular version")
|
5
|
+
require 'csvreader/base'
|
6
|
+
|
7
|
+
|
8
|
+
###
|
9
|
+
# add convenience top-level shortcuts / aliases
|
10
|
+
|
11
|
+
Csv = CsvReader
|
12
|
+
CsvHash = CsvHashReader
|
data/lib/csvreader/base.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'pp'
|
5
|
+
require 'logger'
|
6
|
+
require 'forwardable'
|
7
|
+
require 'stringio'
|
8
|
+
require 'date' ## use for Date.parse and DateTime.parse
|
9
|
+
|
10
|
+
|
11
|
+
###
|
12
|
+
# our own code
|
13
|
+
require 'csvreader/version' # let version always go first
|
14
|
+
require 'csvreader/buffer'
|
15
|
+
require 'csvreader/parser_std' # best practices pre-configured out-of-the-box
|
16
|
+
require 'csvreader/parser_strict' # flexible (strict - no leading/trailing space triming, blanks, etc.), configure for different formats/dialects
|
17
|
+
require 'csvreader/parser_tab'
|
18
|
+
require 'csvreader/parser'
|
19
|
+
require 'csvreader/converter'
|
20
|
+
require 'csvreader/reader'
|
21
|
+
require 'csvreader/reader_hash'
|
22
|
+
require 'csvreader/builder'
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
class CsvReader
|
27
|
+
class Parser
|
28
|
+
|
29
|
+
## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
|
30
|
+
## parser must support parse method (with and without block)
|
31
|
+
## e.g. records = parse( data )
|
32
|
+
## -or-
|
33
|
+
## parse( data ) do |record|
|
34
|
+
## end
|
35
|
+
|
36
|
+
DEFAULT = ParserStd.new
|
37
|
+
|
38
|
+
RFC4180 = ParserStrict.new
|
39
|
+
STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
|
40
|
+
EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
|
41
|
+
|
42
|
+
MYSQL = ParserStrict.new( sep: "\t",
|
43
|
+
quote: false,
|
44
|
+
escape: true,
|
45
|
+
null: "\\N" )
|
46
|
+
|
47
|
+
POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
|
48
|
+
escape: true,
|
49
|
+
null: "" )
|
50
|
+
|
51
|
+
POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
|
52
|
+
quote: false,
|
53
|
+
escape: true,
|
54
|
+
null: "\\N" )
|
55
|
+
|
56
|
+
NUMERIC = ParserStrict.new( numeric: true,
|
57
|
+
nan: ['#NAN', 'NAN', 'NaN', 'nan' ],
|
58
|
+
null: "" )
|
59
|
+
|
60
|
+
|
61
|
+
TAB = ParserTab.new
|
62
|
+
|
63
|
+
|
64
|
+
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
65
|
+
def self.strict() STRICT; end ## alternative alias for STRICT
|
66
|
+
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
67
|
+
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
68
|
+
def self.mysql() MYSQL; end
|
69
|
+
def self.postgresql() POSTGRESQL; end
|
70
|
+
def self.postgres() postgresql; end
|
71
|
+
def self.postgresql_text() POSTGRESQL_TEXT; end
|
72
|
+
def self.postgres_text() postgresql_text; end
|
73
|
+
def self.numeric() NUMERIC; end
|
74
|
+
def self.tab() TAB; end
|
75
|
+
end # class Parser
|
76
|
+
end # class CsvReader
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
class CsvReader
|
81
|
+
### pre-define CsvReader (built-in) formats/dialect
|
82
|
+
DEFAULT = Builder.new( Parser::DEFAULT )
|
83
|
+
|
84
|
+
STRICT = Builder.new( Parser::STRICT )
|
85
|
+
RFC4180 = Builder.new( Parser::RFC4180 )
|
86
|
+
EXCEL = Builder.new( Parser::EXCEL )
|
87
|
+
|
88
|
+
MYSQL = Builder.new( Parser::MYSQL )
|
89
|
+
POSTGRES = POSTGRESQL = Builder.new( Parser::POSTGRESQL )
|
90
|
+
POSTGRES_TEXT = POSTGRESQL_TEXT = Builder.new( Parser::POSTGRESQL_TEXT )
|
91
|
+
|
92
|
+
NUMERIC = Builder.new( Parser::NUMERIC )
|
93
|
+
|
94
|
+
TAB = Builder.new( Parser::TAB )
|
95
|
+
|
96
|
+
|
97
|
+
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
98
|
+
def self.strict() STRICT; end ## alternative alias for STRICT
|
99
|
+
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
100
|
+
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
101
|
+
def self.mysql() MYSQL; end
|
102
|
+
def self.postgresql() POSTGRESQL; end
|
103
|
+
def self.postgres() postgresql; end
|
104
|
+
def self.postgresql_text() POSTGRESQL_TEXT; end
|
105
|
+
def self.postgres_text() postgresql_text; end
|
106
|
+
def self.numeric() NUMERIC; end
|
107
|
+
def self.tab() TAB; end
|
108
|
+
end # class CsvReader
|
109
|
+
|
110
|
+
|
111
|
+
class CsvHashReader
|
112
|
+
### pre-define CsvReader (built-in) formats/dialect
|
113
|
+
DEFAULT = Builder.new( Parser::DEFAULT )
|
114
|
+
|
115
|
+
STRICT = Builder.new( Parser::STRICT )
|
116
|
+
RFC4180 = Builder.new( Parser::RFC4180 )
|
117
|
+
EXCEL = Builder.new( Parser::EXCEL )
|
118
|
+
|
119
|
+
MYSQL = Builder.new( Parser::MYSQL )
|
120
|
+
POSTGRES = POSTGRESQL = Builder.new( Parser::POSTGRESQL )
|
121
|
+
POSTGRES_TEXT = POSTGRESQL_TEXT = Builder.new( Parser::POSTGRESQL_TEXT )
|
122
|
+
|
123
|
+
NUMERIC = Builder.new( Parser::NUMERIC )
|
124
|
+
|
125
|
+
TAB = Builder.new( Parser::TAB )
|
126
|
+
|
127
|
+
|
128
|
+
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
129
|
+
def self.strict() STRICT; end ## alternative alias for STRICT
|
130
|
+
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
131
|
+
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
132
|
+
def self.mysql() MYSQL; end
|
133
|
+
def self.postgresql() POSTGRESQL; end
|
134
|
+
def self.postgres() postgresql; end
|
135
|
+
def self.postgresql_text() POSTGRESQL_TEXT; end
|
136
|
+
def self.postgres_text() postgresql_text; end
|
137
|
+
def self.numeric() NUMERIC; end
|
138
|
+
def self.tab() TAB; end
|
139
|
+
end # class CsvHashReader
|
140
|
+
|
141
|
+
|
142
|
+
|
143
|
+
|
144
|
+
puts CsvReader.banner # say hello
|
data/lib/csvreader/builder.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
class CsvBuilder ## rename to CsvReaderBuilder - why? why not?
|
3
|
+
class CsvReader
|
4
|
+
class Builder ## rename to CsvReaderBuilder - why? why not?
|
5
5
|
|
6
6
|
|
7
7
|
def initialize( parser )
|
@@ -53,12 +53,13 @@ class CsvBuilder ## rename to CsvReaderBuilder - why? why not?
|
|
53
53
|
sep: sep, converters: converters,
|
54
54
|
parser: @parser, &block )
|
55
55
|
end
|
56
|
-
end # class
|
57
|
-
|
56
|
+
end # class Builder
|
57
|
+
end # class CsvReader
|
58
58
|
|
59
59
|
|
60
60
|
|
61
|
-
class CsvHashBuilder ## rename to CsvHashReaderBuilder - why? why not?
|
61
|
+
class CsvHashReader
|
62
|
+
class Builder ## rename to CsvHashReaderBuilder - why? why not?
|
62
63
|
def initialize( parser )
|
63
64
|
@parser = parser
|
64
65
|
end
|
@@ -117,4 +118,5 @@ class CsvHashBuilder ## rename to CsvHashReaderBuilder - why? why not?
|
|
117
118
|
header_converters: header_converters,
|
118
119
|
parser: @parser, &block )
|
119
120
|
end
|
120
|
-
end # class
|
121
|
+
end # class Builder
|
122
|
+
end # class CsvHashReader
|
data/lib/csvreader/parser.rb
CHANGED
data/lib/csvreader/parser_strict.rb
CHANGED
@@ -36,7 +36,9 @@ def initialize( sep: ',',
|
|
36
36
|
doublequote: true,
|
37
37
|
escape: false, ## true/false
|
38
38
|
null: nil, ## note: set to nil for no null vales / not availabe (na)
|
39
|
-
comment: false ## note: comment char e.g. # or false/nil
|
39
|
+
comment: false, ## note: comment char e.g. # or false/nil
|
40
|
+
numeric: false, ## (auto-)convert all non-quoted values to float
|
41
|
+
nan: nil ## note: only if numeric - set mappings for Float::NAN (not a number) values
|
40
42
|
)
|
41
43
|
@config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
|
42
44
|
@config[:sep] = sep
|
@@ -45,8 +47,11 @@ def initialize( sep: ',',
|
|
45
47
|
@config[:escape] = escape
|
46
48
|
@config[:null] = null
|
47
49
|
@config[:comment] = comment
|
50
|
+
@config[:numeric] = numeric
|
51
|
+
@config[:nan] = nan # not a number (NaN) e.g. Float::NAN
|
48
52
|
end
|
49
53
|
|
54
|
+
|
50
55
|
#########################################
|
51
56
|
## config convenience helpers
|
52
57
|
## e.g. use like Csv.mysql.sep = ',' etc. instead of
|
@@ -57,6 +62,8 @@ def doublequote=( value ) @config[:doublequote]=value; end
|
|
57
62
|
def escape=( value ) @config[:escape]=value; end
|
58
63
|
def null=( value ) @config[:null]=value; end
|
59
64
|
def comment=( value ) @config[:comment]=value; end
|
65
|
+
def numeric=( value ) @config[:numeric]=value; end
|
66
|
+
def nan=( value ) @config[:nan]=value; end
|
60
67
|
|
61
68
|
|
62
69
|
|
@@ -149,14 +156,22 @@ end
|
|
149
156
|
def parse_field( input, sep: )
|
150
157
|
value = ""
|
151
158
|
|
152
|
-
quote
|
153
|
-
escape
|
159
|
+
quote = config[:quote]
|
160
|
+
escape = config[:escape]
|
161
|
+
numeric = config[:numeric]
|
154
162
|
|
155
163
|
logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?
|
156
164
|
|
157
165
|
if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
|
158
|
-
|
159
|
-
##
|
166
|
+
## note: allows null = '' that is turn unquoted empty strings into null/nil
|
167
|
+
## or if using numeric into NotANumber (NaN)
|
168
|
+
if is_null?( value )
|
169
|
+
value = nil
|
170
|
+
elsif numeric & is_nan?( value )
|
171
|
+
value = Float::NAN
|
172
|
+
else
|
173
|
+
# do nothing - keep value as is :-) e.g. "".
|
174
|
+
end
|
160
175
|
elsif quote && input.peek == quote
|
161
176
|
logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
162
177
|
value << parse_quote( input, sep: sep )
|
@@ -174,7 +189,24 @@ def parse_field( input, sep: )
|
|
174
189
|
end
|
175
190
|
end
|
176
191
|
|
177
|
-
|
192
|
+
|
193
|
+
if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
|
194
|
+
value = nil
|
195
|
+
elsif numeric
|
196
|
+
if is_nan?( value )
|
197
|
+
value = Float::NAN
|
198
|
+
else
|
199
|
+
## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
|
200
|
+
if numeric.is_a?( Proc )
|
201
|
+
value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
|
202
|
+
else
|
203
|
+
value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
|
204
|
+
end
|
205
|
+
end
|
206
|
+
else
|
207
|
+
# do nothing - keep value as is :-).
|
208
|
+
end
|
209
|
+
|
178
210
|
logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
179
211
|
end
|
180
212
|
|
@@ -182,6 +214,7 @@ def parse_field( input, sep: )
|
|
182
214
|
end
|
183
215
|
|
184
216
|
|
217
|
+
|
185
218
|
def parse_record( input, sep: )
|
186
219
|
values = []
|
187
220
|
|
@@ -263,6 +296,24 @@ def parse_lines( input, sep:, &block )
|
|
263
296
|
end # method parse_lines
|
264
297
|
|
265
298
|
|
299
|
+
def convert_to_float( value ) Float( value ) rescue value; end
|
300
|
+
|
301
|
+
def is_nan?( value )
|
302
|
+
nan = @config[:nan]
|
303
|
+
if nan.nil?
|
304
|
+
false ## nothing set; return always false (not NaN)
|
305
|
+
elsif nan.is_a?( Proc )
|
306
|
+
nan.call( value )
|
307
|
+
elsif nan.is_a?( Array )
|
308
|
+
nan.include?( value )
|
309
|
+
elsif nan.is_a?( String )
|
310
|
+
value == nan
|
311
|
+
else ## unknown config style / setting
|
312
|
+
## todo: issue a warning or error - why? why not?
|
313
|
+
false ## nothing set; return always false (not nan)
|
314
|
+
end
|
315
|
+
end
|
316
|
+
|
266
317
|
def is_null?( value )
|
267
318
|
null = @config[:null]
|
268
319
|
if null.nil?
|
data/lib/csvreader/reader.rb
CHANGED
data/lib/csvreader/reader_hash.rb
CHANGED
@@ -3,6 +3,12 @@
|
|
3
3
|
class CsvHashReader
|
4
4
|
|
5
5
|
|
6
|
+
## add convenience shortcuts / aliases for CsvReader support classes
|
7
|
+
Parser = CsvReader::Parser
|
8
|
+
Converter = CsvReader::Converter
|
9
|
+
|
10
|
+
|
11
|
+
|
6
12
|
def self.open( path, mode=nil,
|
7
13
|
headers: nil,
|
8
14
|
sep: nil,
|
@@ -113,10 +119,10 @@ def initialize( data, headers: nil, sep: nil,
|
|
113
119
|
|
114
120
|
@sep = sep
|
115
121
|
|
116
|
-
@converters =
|
117
|
-
@header_converters =
|
122
|
+
@converters = Converter.create_converters( converters )
|
123
|
+
@header_converters = Converter.create_header_converters( header_converters )
|
118
124
|
|
119
|
-
@parser = parser.nil? ?
|
125
|
+
@parser = parser.nil? ? Parser::DEFAULT : parser
|
120
126
|
end
|
121
127
|
|
122
128
|
|
data/lib/csvreader/version.rb
CHANGED
data/test/helper.rb
CHANGED
data/test/test_parser_numeric.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_parser_numeric.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
|
11
|
+
class TestParserNumeric < MiniTest::Test
|
12
|
+
|
13
|
+
def parser
|
14
|
+
CsvReader::Parser::NUMERIC
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
def test_parser_numeric
|
19
|
+
pp CsvReader::Parser::NUMERIC
|
20
|
+
pp CsvReader::Parser.numeric
|
21
|
+
assert true
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_parse
|
25
|
+
assert_equal [[1.0,2.0,3.0],
|
26
|
+
[4.0,5.0,6.0]], parser.parse( "1,2,3\n4,5,6" )
|
27
|
+
assert_equal [[1.0,2.0,3.0],
|
28
|
+
["4","5","6"]], parser.parse( %Q{ 1,2 , 3\n"4","5","6"} )
|
29
|
+
assert_equal [["a","b","c"]], parser.parse( %Q{"a","b","c"} )
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def test_empty
|
34
|
+
assert_equal [[nil,nil,nil],
|
35
|
+
["","",""]], parser.parse( %Q{,,\n"","",""} )
|
36
|
+
end
|
37
|
+
|
38
|
+
end # class TestParserNumeric
|
data/test/test_reader.rb
CHANGED
@@ -84,9 +84,11 @@ def test_enum
|
|
84
84
|
assert_equal ["a","b","c"], enum.next
|
85
85
|
|
86
86
|
## test Csv == CsvReader class alias
|
87
|
-
|
88
|
-
|
89
|
-
|
87
|
+
if defined?( Csv )
|
88
|
+
csv = Csv.new( "a,b,c" )
|
89
|
+
enum = csv.to_enum
|
90
|
+
assert_equal ["a","b","c"], enum.next
|
91
|
+
end
|
90
92
|
end
|
91
93
|
|
92
94
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-10-
|
11
|
+
date: 2018-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdoc
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- README.md
|
56
56
|
- Rakefile
|
57
57
|
- lib/csvreader.rb
|
58
|
+
- lib/csvreader/base.rb
|
58
59
|
- lib/csvreader/buffer.rb
|
59
60
|
- lib/csvreader/builder.rb
|
60
61
|
- lib/csvreader/converter.rb
|
@@ -77,6 +78,7 @@ files:
|
|
77
78
|
- test/test_parser_formats.rb
|
78
79
|
- test/test_parser_java.rb
|
79
80
|
- test/test_parser_null.rb
|
81
|
+
- test/test_parser_numeric.rb
|
80
82
|
- test/test_parser_strict.rb
|
81
83
|
- test/test_parser_tab.rb
|
82
84
|
- test/test_reader.rb
|