csvreader 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +2 -0
- data/README.md +113 -7
- data/lib/csvreader.rb +12 -138
- data/lib/csvreader/base.rb +144 -0
- data/lib/csvreader/builder.rb +8 -6
- data/lib/csvreader/parser.rb +6 -0
- data/lib/csvreader/parser_strict.rb +57 -6
- data/lib/csvreader/reader.rb +0 -3
- data/lib/csvreader/reader_hash.rb +9 -3
- data/lib/csvreader/version.rb +1 -1
- data/test/helper.rb +2 -0
- data/test/test_parser_numeric.rb +38 -0
- data/test/test_reader.rb +5 -3
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47c630dbfe75b03e4f2d03710cca0f4b3c66ea84
|
4
|
+
data.tar.gz: 7b604c9b9144190b1b8b4a4dcd848ea5b7f88aa4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dad1ae27b4273b8e5a22cf6eae60f141eca381229ed03bfbe6f403e99aae07a16aa4fc84e1f22a953cd9db5da1b9fa2fb6266666fdc6a756872e8bd4ec8dfb9
|
7
|
+
data.tar.gz: 38b0002ea3bdfff0b7ce994064d4fb4993e75a3b1225128a50ad9e18180c12cdae9d2a40b6f694e70ebccf4d65005fd5a2145450c20a565de8d3f5af7b398a58
|
data/Manifest.txt
CHANGED
@@ -4,6 +4,7 @@ Manifest.txt
|
|
4
4
|
README.md
|
5
5
|
Rakefile
|
6
6
|
lib/csvreader.rb
|
7
|
+
lib/csvreader/base.rb
|
7
8
|
lib/csvreader/buffer.rb
|
8
9
|
lib/csvreader/builder.rb
|
9
10
|
lib/csvreader/converter.rb
|
@@ -26,6 +27,7 @@ test/test_parser.rb
|
|
26
27
|
test/test_parser_formats.rb
|
27
28
|
test/test_parser_java.rb
|
28
29
|
test/test_parser_null.rb
|
30
|
+
test/test_parser_numeric.rb
|
29
31
|
test/test_parser_strict.rb
|
30
32
|
test/test_parser_tab.rb
|
31
33
|
test/test_reader.rb
|
data/README.md
CHANGED
@@ -21,14 +21,14 @@ TXT
|
|
21
21
|
records = Csv.parse( txt ) ## or CsvReader.parse
|
22
22
|
pp records
|
23
23
|
# => [["1","2","3"],
|
24
|
-
# ["
|
24
|
+
# ["4","5","6"]]
|
25
25
|
|
26
26
|
# -or-
|
27
27
|
|
28
28
|
records = Csv.read( "values.csv" ) ## or CsvReader.read
|
29
29
|
pp records
|
30
30
|
# => [["1","2","3"],
|
31
|
-
# ["
|
31
|
+
# ["4","5","6"]]
|
32
32
|
|
33
33
|
# -or-
|
34
34
|
|
@@ -36,11 +36,11 @@ Csv.foreach( "values.csv" ) do |rec| ## or CsvReader.foreach
|
|
36
36
|
pp rec
|
37
37
|
end
|
38
38
|
# => ["1","2","3"]
|
39
|
-
# => ["
|
39
|
+
# => ["4","5","6"]
|
40
40
|
```
|
41
41
|
|
42
42
|
|
43
|
-
### What about converters?
|
43
|
+
### What about type inference and data converters?
|
44
44
|
|
45
45
|
Use the converters keyword option to (auto-)convert strings to nulls, booleans, integers, floats, dates, etc.
|
46
46
|
Example:
|
@@ -72,6 +72,18 @@ Built-in converters include:
|
|
72
72
|
| `:all` | shortcut for `[:null, :boolean, :date_time, :numeric]` |
|
73
73
|
|
74
74
|
|
75
|
+
Or add your own converters. Example:
|
76
|
+
|
77
|
+
``` ruby
|
78
|
+
Csv.parse( 'Ruby, 2020-03-01, 100', converters: [->(v) { Time.parse(v) rescue v }] )
|
79
|
+
#=> [["Ruby", 2020-03-01 00:00:00 +0200, "100"]]
|
80
|
+
```
|
81
|
+
|
82
|
+
A custom converter is a method that gets the value passed in
|
83
|
+
and if successful returns a non-string type (e.g. integer, float, date, etc.)
|
84
|
+
or a string (for further processing with all other converters in the "pipeline" configuration).
|
85
|
+
|
86
|
+
|
75
87
|
|
76
88
|
### What about Enumerable?
|
77
89
|
|
@@ -94,7 +106,7 @@ it = csv.to_enum
|
|
94
106
|
pp it.next
|
95
107
|
# => ["1","2","3"]
|
96
108
|
pp it.next
|
97
|
-
# => ["
|
109
|
+
# => ["4","5","6"]
|
98
110
|
```
|
99
111
|
|
100
112
|
|
@@ -150,7 +162,7 @@ end
|
|
150
162
|
|
151
163
|
### What about symbol keys for hashes?
|
152
164
|
|
153
|
-
Yes,
|
165
|
+
Yes, you can use the header_converters keyword option.
|
154
166
|
Use `:symbol` for (auto-)converting header (strings) to symbols.
|
155
167
|
Note: the symbol converter will also downcase all letters and
|
156
168
|
remove all non-alphanumeric (e.g. `!?$%`) chars
|
@@ -169,6 +181,15 @@ records = CsvHash.parse( txt, :converters => :all, :header_converters => :symbol
|
|
169
181
|
pp records
|
170
182
|
# => [{a: 1, b: 2, c: 3},
|
171
183
|
# {a: true, b: false, c: nil}]
|
184
|
+
|
185
|
+
# -or-
|
186
|
+
options = { :converters => :all,
|
187
|
+
:header_converters => :symbol }
|
188
|
+
|
189
|
+
records = CsvHash.parse( txt, options )
|
190
|
+
pp records
|
191
|
+
# => [{a: 1, b: 2, c: 3},
|
192
|
+
# {a: true, b: false, c: nil}]
|
172
193
|
```
|
173
194
|
|
174
195
|
Built-in header converters include:
|
@@ -180,6 +201,91 @@ Built-in header converters include:
|
|
180
201
|
|
181
202
|
|
182
203
|
|
204
|
+
### What about (typed) structs?
|
205
|
+
|
206
|
+
See the [csvrecord library »](https://github.com/csv11/csvrecord)
|
207
|
+
|
208
|
+
Example from the csvrecord documentation:
|
209
|
+
|
210
|
+
Step 1: Define a (typed) struct for the comma-separated values (csv) records. Example:
|
211
|
+
|
212
|
+
```ruby
|
213
|
+
require 'csvrecord'
|
214
|
+
|
215
|
+
Beer = CsvRecord.define do
|
216
|
+
field :brewery ## note: default type is :string
|
217
|
+
field :city
|
218
|
+
field :name
|
219
|
+
field :abv, Float ## allows type specified as class (or use :float)
|
220
|
+
end
|
221
|
+
```
|
222
|
+
|
223
|
+
or in "classic" style:
|
224
|
+
|
225
|
+
```ruby
|
226
|
+
class Beer < CsvRecord::Base
|
227
|
+
field :brewery
|
228
|
+
field :city
|
229
|
+
field :name
|
230
|
+
field :abv, Float
|
231
|
+
end
|
232
|
+
```
|
233
|
+
|
234
|
+
|
235
|
+
Step 2: Read in the comma-separated values (csv) datafile. Example:
|
236
|
+
|
237
|
+
```ruby
|
238
|
+
beers = Beer.read( 'beer.csv' )
|
239
|
+
|
240
|
+
puts "#{beers.size} beers:"
|
241
|
+
pp beers
|
242
|
+
```
|
243
|
+
|
244
|
+
pretty prints (pp):
|
245
|
+
|
246
|
+
```
|
247
|
+
6 beers:
|
248
|
+
[#<Beer:0x302c760 @values=
|
249
|
+
["Andechser Klosterbrauerei", "Andechs", "Doppelbock Dunkel", 7.0]>,
|
250
|
+
#<Beer:0x3026fe8 @values=
|
251
|
+
["Augustiner Br\u00E4u M\u00FCnchen", "M\u00FCnchen", "Edelstoff", 5.6]>,
|
252
|
+
#<Beer:0x30257a0 @values=
|
253
|
+
["Bayerische Staatsbrauerei Weihenstephan", "Freising", "Hefe Weissbier", 5.4]>,
|
254
|
+
...
|
255
|
+
]
|
256
|
+
```
|
257
|
+
|
258
|
+
Or loop over the records. Example:
|
259
|
+
|
260
|
+
``` ruby
|
261
|
+
Beer.read( 'beer.csv' ).each do |rec|
|
262
|
+
puts "#{rec.name} (#{rec.abv}%) by #{rec.brewery}, #{rec.city}"
|
263
|
+
end
|
264
|
+
|
265
|
+
# -or-
|
266
|
+
|
267
|
+
Beer.foreach( 'beer.csv' ) do |rec|
|
268
|
+
puts "#{rec.name} (#{rec.abv}%) by #{rec.brewery}, #{rec.city}"
|
269
|
+
end
|
270
|
+
```
|
271
|
+
|
272
|
+
|
273
|
+
printing:
|
274
|
+
|
275
|
+
```
|
276
|
+
Doppelbock Dunkel (7.0%) by Andechser Klosterbrauerei, Andechs
|
277
|
+
Edelstoff (5.6%) by Augustiner Bräu München, München
|
278
|
+
Hefe Weissbier (5.4%) by Bayerische Staatsbrauerei Weihenstephan, Freising
|
279
|
+
Rauchbier Märzen (5.1%) by Brauerei Spezial, Bamberg
|
280
|
+
Münchner Dunkel (5.0%) by Hacker-Pschorr Bräu, München
|
281
|
+
Hofbräu Oktoberfestbier (6.3%) by Staatliches Hofbräuhaus München, München
|
282
|
+
```
|
283
|
+
|
284
|
+
|
285
|
+
### What about tabular data packages with pre-defined types / schemas?
|
286
|
+
|
287
|
+
See the [csvpack library »](https://github.com/csv11/csvpack)
|
288
|
+
|
183
289
|
|
184
290
|
|
185
291
|
|
@@ -319,7 +425,7 @@ Csv.strict.read( ..., sep: "\t" )
|
|
319
425
|
|
320
426
|
Two major design bugs and many many minor.
|
321
427
|
|
322
|
-
(1) The CSV class uses [`line.split(',')`](https://github.com/ruby/csv/blob/master/lib/csv.rb#
|
428
|
+
(1) The CSV class uses [`line.split(',')`](https://github.com/ruby/csv/blob/master/lib/csv.rb#L1255) with some kludges (†) with the claim it's faster.
|
323
429
|
What?! The right way: CSV needs its own purpose-built parser. There's no other
|
324
430
|
way you can handle all the (edge) cases with double quotes and escaped doubled up
|
325
431
|
double quotes. Period.
|
data/lib/csvreader.rb
CHANGED
@@ -1,138 +1,12 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
require '
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
require 'csvreader/version' # let version always go first
|
14
|
-
require 'csvreader/buffer'
|
15
|
-
require 'csvreader/parser_std' # best practices pre-configured out-of-the-box
|
16
|
-
require 'csvreader/parser_strict' # flexible (strict - no leading/trailing space triming, blanks, etc.), configure for different formats/dialects
|
17
|
-
require 'csvreader/parser_tab'
|
18
|
-
require 'csvreader/parser'
|
19
|
-
require 'csvreader/builder'
|
20
|
-
require 'csvreader/reader'
|
21
|
-
require 'csvreader/reader_hash'
|
22
|
-
require 'csvreader/converter'
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
class CsvReader
|
27
|
-
class Parser
|
28
|
-
|
29
|
-
## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
|
30
|
-
## parser must support parse method (with and without block)
|
31
|
-
## e.g. records = parse( data )
|
32
|
-
## -or-
|
33
|
-
## parse( data ) do |record|
|
34
|
-
## end
|
35
|
-
|
36
|
-
DEFAULT = ParserStd.new
|
37
|
-
|
38
|
-
RFC4180 = ParserStrict.new
|
39
|
-
STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
|
40
|
-
EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
|
41
|
-
|
42
|
-
MYSQL = ParserStrict.new( sep: "\t",
|
43
|
-
quote: false,
|
44
|
-
escape: true,
|
45
|
-
null: "\\N" )
|
46
|
-
|
47
|
-
POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
|
48
|
-
escape: true,
|
49
|
-
null: "" )
|
50
|
-
|
51
|
-
POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
|
52
|
-
quote: false,
|
53
|
-
escape: true,
|
54
|
-
null: "\\N" )
|
55
|
-
|
56
|
-
TAB = ParserTab.new
|
57
|
-
|
58
|
-
|
59
|
-
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
60
|
-
def self.strict() STRICT; end ## alternative alias for STRICT
|
61
|
-
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
62
|
-
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
63
|
-
def self.mysql() MYSQL; end
|
64
|
-
def self.postgresql() POSTGRESQL; end
|
65
|
-
def self.postgres() postgresql; end
|
66
|
-
def self.postgresql_text() POSTGRESQL_TEXT; end
|
67
|
-
def self.postgres_text() postgresql_text; end
|
68
|
-
def self.tab() TAB; end
|
69
|
-
end # class Parser
|
70
|
-
end # class CsvReader
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
class CsvReader
|
75
|
-
### pre-define CsvReader (built-in) formats/dialect
|
76
|
-
DEFAULT = CsvBuilder.new( Parser::DEFAULT )
|
77
|
-
|
78
|
-
STRICT = CsvBuilder.new( Parser::STRICT )
|
79
|
-
RFC4180 = CsvBuilder.new( Parser::RFC4180 )
|
80
|
-
EXCEL = CsvBuilder.new( Parser::EXCEL )
|
81
|
-
|
82
|
-
MYSQL = CsvBuilder.new( Parser::MYSQL )
|
83
|
-
POSTGRES = POSTGRESQL = CsvBuilder.new( Parser::POSTGRESQL )
|
84
|
-
POSTGRES_TEXT = POSTGRESQL_TEXT = CsvBuilder.new( Parser::POSTGRESQL_TEXT )
|
85
|
-
|
86
|
-
TAB = CsvBuilder.new( Parser::TAB )
|
87
|
-
|
88
|
-
|
89
|
-
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
90
|
-
def self.strict() STRICT; end ## alternative alias for STRICT
|
91
|
-
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
92
|
-
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
93
|
-
def self.mysql() MYSQL; end
|
94
|
-
def self.postgresql() POSTGRESQL; end
|
95
|
-
def self.postgres() postgresql; end
|
96
|
-
def self.postgresql_text() POSTGRESQL_TEXT; end
|
97
|
-
def self.postgres_text() postgresql_text; end
|
98
|
-
def self.tab() TAB; end
|
99
|
-
end # class CsvReader
|
100
|
-
|
101
|
-
|
102
|
-
class CsvHashReader
|
103
|
-
### pre-define CsvReader (built-in) formats/dialect
|
104
|
-
DEFAULT = CsvHashBuilder.new( CsvReader::Parser::DEFAULT )
|
105
|
-
|
106
|
-
STRICT = CsvHashBuilder.new( CsvReader::Parser::STRICT )
|
107
|
-
RFC4180 = CsvHashBuilder.new( CsvReader::Parser::RFC4180 )
|
108
|
-
EXCEL = CsvHashBuilder.new( CsvReader::Parser::EXCEL )
|
109
|
-
|
110
|
-
MYSQL = CsvHashBuilder.new( CsvReader::Parser::MYSQL )
|
111
|
-
POSTGRES = POSTGRESQL = CsvHashBuilder.new( CsvReader::Parser::POSTGRESQL )
|
112
|
-
POSTGRES_TEXT = POSTGRESQL_TEXT = CsvHashBuilder.new( CsvReader::Parser::POSTGRESQL_TEXT )
|
113
|
-
|
114
|
-
TAB = CsvHashBuilder.new( CsvReader::Parser::TAB )
|
115
|
-
|
116
|
-
|
117
|
-
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
118
|
-
def self.strict() STRICT; end ## alternative alias for STRICT
|
119
|
-
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
120
|
-
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
121
|
-
def self.mysql() MYSQL; end
|
122
|
-
def self.postgresql() POSTGRESQL; end
|
123
|
-
def self.postgres() postgresql; end
|
124
|
-
def self.postgresql_text() POSTGRESQL_TEXT; end
|
125
|
-
def self.postgres_text() postgresql_text; end
|
126
|
-
def self.tab() TAB; end
|
127
|
-
end # class CsvHashReader
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
## add convenience / shortcut alias
|
133
|
-
Csv = CsvReader
|
134
|
-
CsvHash = CsvHashReader
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
puts CsvReader.banner # say hello
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
## our own code (without "top-level" shortcuts e.g. "modular version")
|
5
|
+
require 'csvreader/base'
|
6
|
+
|
7
|
+
|
8
|
+
###
|
9
|
+
# add convenience top-level shortcuts / aliases
|
10
|
+
|
11
|
+
Csv = CsvReader
|
12
|
+
CsvHash = CsvHashReader
|
data/lib/csvreader/base.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'pp'
|
5
|
+
require 'logger'
|
6
|
+
require 'forwardable'
|
7
|
+
require 'stringio'
|
8
|
+
require 'date' ## use for Date.parse and DateTime.parse
|
9
|
+
|
10
|
+
|
11
|
+
###
|
12
|
+
# our own code
|
13
|
+
require 'csvreader/version' # let version always go first
|
14
|
+
require 'csvreader/buffer'
|
15
|
+
require 'csvreader/parser_std' # best practices pre-configured out-of-the-box
|
16
|
+
require 'csvreader/parser_strict' # flexible (strict - no leading/trailing space triming, blanks, etc.), configure for different formats/dialects
|
17
|
+
require 'csvreader/parser_tab'
|
18
|
+
require 'csvreader/parser'
|
19
|
+
require 'csvreader/converter'
|
20
|
+
require 'csvreader/reader'
|
21
|
+
require 'csvreader/reader_hash'
|
22
|
+
require 'csvreader/builder'
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
class CsvReader
|
27
|
+
class Parser
|
28
|
+
|
29
|
+
## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
|
30
|
+
## parser must support parse method (with and without block)
|
31
|
+
## e.g. records = parse( data )
|
32
|
+
## -or-
|
33
|
+
## parse( data ) do |record|
|
34
|
+
## end
|
35
|
+
|
36
|
+
DEFAULT = ParserStd.new
|
37
|
+
|
38
|
+
RFC4180 = ParserStrict.new
|
39
|
+
STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
|
40
|
+
EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
|
41
|
+
|
42
|
+
MYSQL = ParserStrict.new( sep: "\t",
|
43
|
+
quote: false,
|
44
|
+
escape: true,
|
45
|
+
null: "\\N" )
|
46
|
+
|
47
|
+
POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
|
48
|
+
escape: true,
|
49
|
+
null: "" )
|
50
|
+
|
51
|
+
POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
|
52
|
+
quote: false,
|
53
|
+
escape: true,
|
54
|
+
null: "\\N" )
|
55
|
+
|
56
|
+
NUMERIC = ParserStrict.new( numeric: true,
|
57
|
+
nan: ['#NAN', 'NAN', 'NaN', 'nan' ],
|
58
|
+
null: "" )
|
59
|
+
|
60
|
+
|
61
|
+
TAB = ParserTab.new
|
62
|
+
|
63
|
+
|
64
|
+
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
65
|
+
def self.strict() STRICT; end ## alternative alias for STRICT
|
66
|
+
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
67
|
+
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
68
|
+
def self.mysql() MYSQL; end
|
69
|
+
def self.postgresql() POSTGRESQL; end
|
70
|
+
def self.postgres() postgresql; end
|
71
|
+
def self.postgresql_text() POSTGRESQL_TEXT; end
|
72
|
+
def self.postgres_text() postgresql_text; end
|
73
|
+
def self.numeric() NUMERIC; end
|
74
|
+
def self.tab() TAB; end
|
75
|
+
end # class Parser
|
76
|
+
end # class CsvReader
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
class CsvReader
|
81
|
+
### pre-define CsvReader (built-in) formats/dialect
|
82
|
+
DEFAULT = Builder.new( Parser::DEFAULT )
|
83
|
+
|
84
|
+
STRICT = Builder.new( Parser::STRICT )
|
85
|
+
RFC4180 = Builder.new( Parser::RFC4180 )
|
86
|
+
EXCEL = Builder.new( Parser::EXCEL )
|
87
|
+
|
88
|
+
MYSQL = Builder.new( Parser::MYSQL )
|
89
|
+
POSTGRES = POSTGRESQL = Builder.new( Parser::POSTGRESQL )
|
90
|
+
POSTGRES_TEXT = POSTGRESQL_TEXT = Builder.new( Parser::POSTGRESQL_TEXT )
|
91
|
+
|
92
|
+
NUMERIC = Builder.new( Parser::NUMERIC )
|
93
|
+
|
94
|
+
TAB = Builder.new( Parser::TAB )
|
95
|
+
|
96
|
+
|
97
|
+
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
98
|
+
def self.strict() STRICT; end ## alternative alias for STRICT
|
99
|
+
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
100
|
+
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
101
|
+
def self.mysql() MYSQL; end
|
102
|
+
def self.postgresql() POSTGRESQL; end
|
103
|
+
def self.postgres() postgresql; end
|
104
|
+
def self.postgresql_text() POSTGRESQL_TEXT; end
|
105
|
+
def self.postgres_text() postgresql_text; end
|
106
|
+
def self.numeric() NUMERIC; end
|
107
|
+
def self.tab() TAB; end
|
108
|
+
end # class CsvReader
|
109
|
+
|
110
|
+
|
111
|
+
class CsvHashReader
|
112
|
+
### pre-define CsvReader (built-in) formats/dialect
|
113
|
+
DEFAULT = Builder.new( Parser::DEFAULT )
|
114
|
+
|
115
|
+
STRICT = Builder.new( Parser::STRICT )
|
116
|
+
RFC4180 = Builder.new( Parser::RFC4180 )
|
117
|
+
EXCEL = Builder.new( Parser::EXCEL )
|
118
|
+
|
119
|
+
MYSQL = Builder.new( Parser::MYSQL )
|
120
|
+
POSTGRES = POSTGRESQL = Builder.new( Parser::POSTGRESQL )
|
121
|
+
POSTGRES_TEXT = POSTGRESQL_TEXT = Builder.new( Parser::POSTGRESQL_TEXT )
|
122
|
+
|
123
|
+
NUMERIC = Builder.new( Parser::NUMERIC )
|
124
|
+
|
125
|
+
TAB = Builder.new( Parser::TAB )
|
126
|
+
|
127
|
+
|
128
|
+
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
129
|
+
def self.strict() STRICT; end ## alternative alias for STRICT
|
130
|
+
def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
|
131
|
+
def self.excel() EXCEL; end ## alternative alias for EXCEL
|
132
|
+
def self.mysql() MYSQL; end
|
133
|
+
def self.postgresql() POSTGRESQL; end
|
134
|
+
def self.postgres() postgresql; end
|
135
|
+
def self.postgresql_text() POSTGRESQL_TEXT; end
|
136
|
+
def self.postgres_text() postgresql_text; end
|
137
|
+
def self.numeric() NUMERIC; end
|
138
|
+
def self.tab() TAB; end
|
139
|
+
end # class CsvHashReader
|
140
|
+
|
141
|
+
|
142
|
+
|
143
|
+
|
144
|
+
puts CsvReader.banner # say hello
|
data/lib/csvreader/builder.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
class CsvBuilder ## rename to CsvReaderBuilder - why? why not?
|
3
|
+
class CsvReader
|
4
|
+
class Builder ## rename to CsvReaderBuilder - why? why not?
|
5
5
|
|
6
6
|
|
7
7
|
def initialize( parser )
|
@@ -53,12 +53,13 @@ class CsvBuilder ## rename to CsvReaderBuilder - why? why not?
|
|
53
53
|
sep: sep, converters: converters,
|
54
54
|
parser: @parser, &block )
|
55
55
|
end
|
56
|
-
end # class
|
57
|
-
|
56
|
+
end # class Builder
|
57
|
+
end # class CsvReader
|
58
58
|
|
59
59
|
|
60
60
|
|
61
|
-
class CsvHashBuilder ## rename to CsvHashReaderBuilder - why? why not?
|
61
|
+
class CsvHashReader
|
62
|
+
class Builder ## rename to CsvHashReaderBuilder - why? why not?
|
62
63
|
def initialize( parser )
|
63
64
|
@parser = parser
|
64
65
|
end
|
@@ -117,4 +118,5 @@ class CsvHashBuilder ## rename to CsvHashReaderBuilder - why? why not?
|
|
117
118
|
header_converters: header_converters,
|
118
119
|
parser: @parser, &block )
|
119
120
|
end
|
120
|
-
end # class
|
121
|
+
end # class Builder
|
122
|
+
end # class CsvHashReader
|
data/lib/csvreader/parser.rb
CHANGED
data/lib/csvreader/parser_strict.rb
CHANGED
@@ -36,7 +36,9 @@ def initialize( sep: ',',
|
|
36
36
|
doublequote: true,
|
37
37
|
escape: false, ## true/false
|
38
38
|
null: nil, ## note: set to nil for no null vales / not availabe (na)
|
39
|
-
comment: false ## note: comment char e.g. # or false/nil
|
39
|
+
comment: false, ## note: comment char e.g. # or false/nil
|
40
|
+
numeric: false, ## (auto-)convert all non-quoted values to float
|
41
|
+
nan: nil ## note: only if numeric - set mappings for Float::NAN (not a number) values
|
40
42
|
)
|
41
43
|
@config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
|
42
44
|
@config[:sep] = sep
|
@@ -45,8 +47,11 @@ def initialize( sep: ',',
|
|
45
47
|
@config[:escape] = escape
|
46
48
|
@config[:null] = null
|
47
49
|
@config[:comment] = comment
|
50
|
+
@config[:numeric] = numeric
|
51
|
+
@config[:nan] = nan # not a number (NaN) e.g. Float::NAN
|
48
52
|
end
|
49
53
|
|
54
|
+
|
50
55
|
#########################################
|
51
56
|
## config convenience helpers
|
52
57
|
## e.g. use like Csv.mysql.sep = ',' etc. instead of
|
@@ -57,6 +62,8 @@ def doublequote=( value ) @config[:doublequote]=value; end
|
|
57
62
|
def escape=( value ) @config[:escape]=value; end
|
58
63
|
def null=( value ) @config[:null]=value; end
|
59
64
|
def comment=( value ) @config[:comment]=value; end
|
65
|
+
def numeric=( value ) @config[:numeric]=value; end
|
66
|
+
def nan=( value ) @config[:nan]=value; end
|
60
67
|
|
61
68
|
|
62
69
|
|
@@ -149,14 +156,22 @@ end
|
|
149
156
|
def parse_field( input, sep: )
|
150
157
|
value = ""
|
151
158
|
|
152
|
-
quote
|
153
|
-
escape
|
159
|
+
quote = config[:quote]
|
160
|
+
escape = config[:escape]
|
161
|
+
numeric = config[:numeric]
|
154
162
|
|
155
163
|
logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?
|
156
164
|
|
157
165
|
if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
|
158
|
-
|
159
|
-
##
|
166
|
+
## note: allows null = '' that is turn unquoted empty strings into null/nil
|
167
|
+
## or if using numeric into NotANumber (NaN)
|
168
|
+
if is_null?( value )
|
169
|
+
value = nil
|
170
|
+
elsif numeric & is_nan?( value )
|
171
|
+
value = Float::NAN
|
172
|
+
else
|
173
|
+
# do nothing - keep value as is :-) e.g. "".
|
174
|
+
end
|
160
175
|
elsif quote && input.peek == quote
|
161
176
|
logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
162
177
|
value << parse_quote( input, sep: sep )
|
@@ -174,7 +189,24 @@ def parse_field( input, sep: )
|
|
174
189
|
end
|
175
190
|
end
|
176
191
|
|
177
|
-
|
192
|
+
|
193
|
+
if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
|
194
|
+
value = nil
|
195
|
+
elsif numeric
|
196
|
+
if is_nan?( value )
|
197
|
+
value = Float::NAN
|
198
|
+
else
|
199
|
+
## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
|
200
|
+
if numeric.is_a?( Proc )
|
201
|
+
value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
|
202
|
+
else
|
203
|
+
value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
|
204
|
+
end
|
205
|
+
end
|
206
|
+
else
|
207
|
+
# do nothing - keep value as is :-).
|
208
|
+
end
|
209
|
+
|
178
210
|
logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
|
179
211
|
end
|
180
212
|
|
@@ -182,6 +214,7 @@ def parse_field( input, sep: )
|
|
182
214
|
end
|
183
215
|
|
184
216
|
|
217
|
+
|
185
218
|
def parse_record( input, sep: )
|
186
219
|
values = []
|
187
220
|
|
@@ -263,6 +296,24 @@ def parse_lines( input, sep:, &block )
|
|
263
296
|
end # method parse_lines
|
264
297
|
|
265
298
|
|
299
|
+
def convert_to_float( value ) Float( value ) rescue value; end
|
300
|
+
|
301
|
+
def is_nan?( value )
|
302
|
+
nan = @config[:nan]
|
303
|
+
if nan.nil?
|
304
|
+
false ## nothing set; return always false (not NaN)
|
305
|
+
elsif nan.is_a?( Proc )
|
306
|
+
nan.call( value )
|
307
|
+
elsif nan.is_a?( Array )
|
308
|
+
nan.include?( value )
|
309
|
+
elsif nan.is_a?( String )
|
310
|
+
value == nan
|
311
|
+
else ## unknown config style / setting
|
312
|
+
## todo: issue a warning or error - why? why not?
|
313
|
+
false ## nothing set; return always false (not nan)
|
314
|
+
end
|
315
|
+
end
|
316
|
+
|
266
317
|
def is_null?( value )
|
267
318
|
null = @config[:null]
|
268
319
|
if null.nil?
|
data/lib/csvreader/reader.rb
CHANGED
data/lib/csvreader/reader_hash.rb
CHANGED
@@ -3,6 +3,12 @@
|
|
3
3
|
class CsvHashReader
|
4
4
|
|
5
5
|
|
6
|
+
## add convenience shortcuts / aliases for CsvReader support classes
|
7
|
+
Parser = CsvReader::Parser
|
8
|
+
Converter = CsvReader::Converter
|
9
|
+
|
10
|
+
|
11
|
+
|
6
12
|
def self.open( path, mode=nil,
|
7
13
|
headers: nil,
|
8
14
|
sep: nil,
|
@@ -113,10 +119,10 @@ def initialize( data, headers: nil, sep: nil,
|
|
113
119
|
|
114
120
|
@sep = sep
|
115
121
|
|
116
|
-
@converters =
|
117
|
-
@header_converters =
|
122
|
+
@converters = Converter.create_converters( converters )
|
123
|
+
@header_converters = Converter.create_header_converters( header_converters )
|
118
124
|
|
119
|
-
@parser = parser.nil? ?
|
125
|
+
@parser = parser.nil? ? Parser::DEFAULT : parser
|
120
126
|
end
|
121
127
|
|
122
128
|
|
data/lib/csvreader/version.rb
CHANGED
data/test/helper.rb
CHANGED
data/test/test_parser_numeric.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_parser_numeric.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
|
11
|
+
class TestParserNumeric < MiniTest::Test
|
12
|
+
|
13
|
+
def parser
|
14
|
+
CsvReader::Parser::NUMERIC
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
def test_parser_numeric
|
19
|
+
pp CsvReader::Parser::NUMERIC
|
20
|
+
pp CsvReader::Parser.numeric
|
21
|
+
assert true
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_parse
|
25
|
+
assert_equal [[1.0,2.0,3.0],
|
26
|
+
[4.0,5.0,6.0]], parser.parse( "1,2,3\n4,5,6" )
|
27
|
+
assert_equal [[1.0,2.0,3.0],
|
28
|
+
["4","5","6"]], parser.parse( %Q{ 1,2 , 3\n"4","5","6"} )
|
29
|
+
assert_equal [["a","b","c"]], parser.parse( %Q{"a","b","c"} )
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def test_empty
|
34
|
+
assert_equal [[nil,nil,nil],
|
35
|
+
["","",""]], parser.parse( %Q{,,\n"","",""} )
|
36
|
+
end
|
37
|
+
|
38
|
+
end # class TestParserNumeric
|
data/test/test_reader.rb
CHANGED
@@ -84,9 +84,11 @@ def test_enum
|
|
84
84
|
assert_equal ["a","b","c"], enum.next
|
85
85
|
|
86
86
|
## test Csv == CsvReader class alias
|
87
|
-
|
88
|
-
|
89
|
-
|
87
|
+
if defined?( Csv )
|
88
|
+
csv = Csv.new( "a,b,c" )
|
89
|
+
enum = csv.to_enum
|
90
|
+
assert_equal ["a","b","c"], enum.next
|
91
|
+
end
|
90
92
|
end
|
91
93
|
|
92
94
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-10-
|
11
|
+
date: 2018-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdoc
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- README.md
|
56
56
|
- Rakefile
|
57
57
|
- lib/csvreader.rb
|
58
|
+
- lib/csvreader/base.rb
|
58
59
|
- lib/csvreader/buffer.rb
|
59
60
|
- lib/csvreader/builder.rb
|
60
61
|
- lib/csvreader/converter.rb
|
@@ -77,6 +78,7 @@ files:
|
|
77
78
|
- test/test_parser_formats.rb
|
78
79
|
- test/test_parser_java.rb
|
79
80
|
- test/test_parser_null.rb
|
81
|
+
- test/test_parser_numeric.rb
|
80
82
|
- test/test_parser_strict.rb
|
81
83
|
- test/test_parser_tab.rb
|
82
84
|
- test/test_reader.rb
|