csvreader 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/csvreader/reader.rb +67 -38
- data/lib/csvreader/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9bc6971bd638abc67e8e82e241dbb370602b0d5
|
4
|
+
data.tar.gz: 062f2727188a6f3705c21a5cc825194f84bea41c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 595f1c779e0457377fe5c09602cba1ce7754803b35b9280e06dfc752759da441c708db830cb159076ce3f445b3b6aaf1fef459ff9eaace4ae6a436988e52455f
|
7
|
+
data.tar.gz: f4dc02242912ba15bef498838093f85de22c3670b5bcb2b92139adbd9343cbddec753f755729f85b2962df26d724ff069cc73b8f5cccd3edb896dc0d3ac26969
|
data/README.md
CHANGED
@@ -164,7 +164,7 @@ see [`TabReader` »](https://github.com/datatxt/tabreader).
|
|
164
164
|
|
165
165
|
Two major design bugs and many many minor.
|
166
166
|
|
167
|
-
1) The CSV class uses `line.split(",")` with some kludges (†) with the claim its faster.
|
167
|
+
(1) The CSV class uses `line.split(',')` with some kludges (†) with the claim its faster.
|
168
168
|
What?! The right way: CSV needs its own purpose-built parser. There's no other
|
169
169
|
way you can handle all the (edge) cases with double quotes and escaped doubled up
|
170
170
|
double quotes. Period.
|
@@ -175,7 +175,7 @@ Or handling double quotes inside values and so on and on.
|
|
175
175
|
|
176
176
|
(†): kludge - a workaround or quick-and-dirty solution that is clumsy, inelegant, inefficient, difficult to extend and hard to maintain
|
177
177
|
|
178
|
-
2) The CSV class returns `nil` for `,,` but an empty string (`""`)
|
178
|
+
(2) The CSV class returns `nil` for `,,` but an empty string (`""`)
|
179
179
|
for `"","",""`. The right way: All values are always strings. Period.
|
180
180
|
|
181
181
|
If you want to use `nil` you MUST configure a string (or strings)
|
data/lib/csvreader/reader.rb
CHANGED
@@ -6,6 +6,34 @@ module Csv ## check: rename to CsvSettings / CsvPref / CsvGlobals or similar
|
|
6
6
|
## STD_CSV_ENGINE = CSV ## to avoid name confusion use longer name - why? why not? find a better name?
|
7
7
|
## use __CSV__ or similar? or just ::CSV ??
|
8
8
|
|
9
|
+
|
10
|
+
class Dialect ## todo: use a module - it's just a namespace/module now - why? why not?
|
11
|
+
###
|
12
|
+
# (auto-)add these flavors/dialects:
|
13
|
+
# :tab -> uses TabReader(!)
|
14
|
+
# :strict|:rfc4180
|
15
|
+
# :unix -> uses unix-style escapes e.g. \n \" etc.
|
16
|
+
# :windows|:excel
|
17
|
+
# :guess|:auto -> guess (auto-detect) separator - why? why not?
|
18
|
+
|
19
|
+
## e.g. use Dialect.registry[:unix] = { ... } etc.
|
20
|
+
## note use @@ - there is only one registry
|
21
|
+
def self.registry() @@registry ||={} end
|
22
|
+
|
23
|
+
## add built-in dialects:
|
24
|
+
## trim - use strip? why? why not? use alias?
|
25
|
+
registry[:tab] = {} ##{ class: TabReader }
|
26
|
+
registry[:strict] = { strict: true, trim: false } ## add no comments, blank lines, etc. ???
|
27
|
+
registry[:rfc4180] = :strict ## alternative name
|
28
|
+
registry[:windows] = {}
|
29
|
+
registry[:excel] = :windows
|
30
|
+
registry[:unix] = {}
|
31
|
+
|
32
|
+
## todo: add some more
|
33
|
+
end # class Dialect
|
34
|
+
|
35
|
+
|
36
|
+
|
9
37
|
class Configuration
|
10
38
|
|
11
39
|
puts "CSV::VERSION:"
|
@@ -23,6 +51,9 @@ module Csv ## check: rename to CsvSettings / CsvPref / CsvGlobals or similar
|
|
23
51
|
|
24
52
|
|
25
53
|
attr_accessor :sep ## col_sep (column separator)
|
54
|
+
attr_accessor :na ## not available (string or array of strings or nil) - rename to nas/nils/nulls - why? why not?
|
55
|
+
attr_accessor :trim ### allow ltrim/rtrim/trim - why? why not?
|
56
|
+
attr_accessor :dialect
|
26
57
|
|
27
58
|
def initialize
|
28
59
|
@sep = ','
|
@@ -32,6 +63,8 @@ module Csv ## check: rename to CsvSettings / CsvPref / CsvGlobals or similar
|
|
32
63
|
self ## return self for chaining
|
33
64
|
end
|
34
65
|
|
66
|
+
def trim?() @trim; end ## strip leading and trailing spaces
|
67
|
+
|
35
68
|
def blank?( line )
|
36
69
|
## note: blank line does NOT include "blank" with spaces only!!
|
37
70
|
## use BLANK_REGEX in skip_lines to clean-up/skip/remove/ignore
|
@@ -96,46 +129,53 @@ end # module Csvv
|
|
96
129
|
|
97
130
|
class CsvReader
|
98
131
|
|
99
|
-
def self.foreach( path, sep: Csv.config.sep, headers: false )
|
132
|
+
def self.parse_line( txt, sep: Csv.config.sep,
|
133
|
+
trim: Csv.config.trim?,
|
134
|
+
na: Csv.config.na,
|
135
|
+
dialect: Csv.config.dialect,
|
136
|
+
converters: nil)
|
137
|
+
## note: do NOT include headers option (otherwise single row gets skipped as first header row :-)
|
100
138
|
csv_options = Csv.config.default_options.merge(
|
101
|
-
|
102
|
-
|
103
|
-
external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding for now!!!
|
139
|
+
headers: false, ## note: always turn off headers!!!!!!
|
140
|
+
col_sep: sep
|
104
141
|
)
|
142
|
+
## pp csv_options
|
143
|
+
CSV.parse_line( txt, csv_options )
|
144
|
+
end
|
105
145
|
|
106
|
-
|
107
|
-
|
108
|
-
|
146
|
+
def self.parse( txt, sep: Csv.config.sep, headers: false )
|
147
|
+
csv_options = Csv.config.default_options.merge(
|
148
|
+
headers: headers,
|
149
|
+
col_sep: sep
|
150
|
+
)
|
151
|
+
## pp csv_options
|
152
|
+
CSV.parse( txt, csv_options )
|
109
153
|
end
|
110
154
|
|
111
155
|
def self.read( path, sep: Csv.config.sep, headers: false )
|
112
156
|
## note: use our own file.open
|
113
157
|
## always use utf-8 for now
|
114
158
|
## check/todo: add skip option bom too - why? why not?
|
115
|
-
txt = File.open( path, 'r:utf-8' )
|
159
|
+
txt = File.open( path, 'r:bom|utf-8' )
|
116
160
|
parse( txt, sep: sep, headers: headers )
|
117
161
|
end
|
118
162
|
|
119
|
-
def self.parse( txt, sep: Csv.config.sep, headers: false )
|
163
|
+
def self.foreach( path, sep: Csv.config.sep, headers: false )
|
120
164
|
csv_options = Csv.config.default_options.merge(
|
121
165
|
headers: headers,
|
122
|
-
col_sep: sep
|
166
|
+
col_sep: sep,
|
167
|
+
external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding for now!!!
|
123
168
|
)
|
124
|
-
## pp csv_options
|
125
|
-
CSV.parse( txt, csv_options )
|
126
|
-
end
|
127
169
|
|
170
|
+
## todo/check/fix:
|
171
|
+
## can use bom e.g. 'bom|utf-8' - how?
|
172
|
+
## raises ArgumentError: unknown encoding name - bom|utf-8
|
128
173
|
|
129
|
-
def self.parse_line( txt, sep: Csv.config.sep )
|
130
|
-
## note: do NOT include headers option (otherwise single row gets skipped as first header row :-)
|
131
|
-
csv_options = Csv.config.default_options.merge(
|
132
|
-
headers: false, ## note: always turn off headers!!!!!!
|
133
|
-
col_sep: sep
|
134
|
-
)
|
135
|
-
## pp csv_options
|
136
|
-
CSV.parse_line( txt, csv_options )
|
137
|
-
end
|
138
174
|
|
175
|
+
CSV.foreach( path, csv_options ) do |row|
|
176
|
+
yield( row ) ## check/todo: use block.call( row ) ## why? why not?
|
177
|
+
end
|
178
|
+
end
|
139
179
|
|
140
180
|
def self.header( path, sep: Csv.config.sep ) ## use header or headers - or use both (with alias)?
|
141
181
|
# read first lines (only)
|
@@ -148,7 +188,7 @@ class CsvReader
|
|
148
188
|
## - NOT a blank line
|
149
189
|
|
150
190
|
lines = ''
|
151
|
-
File.open( path, 'r:utf-8' ) do |f|
|
191
|
+
File.open( path, 'r:bom|utf-8' ) do |f|
|
152
192
|
|
153
193
|
## todo/fix: how to handle empty files or files without headers?!
|
154
194
|
|
@@ -171,31 +211,20 @@ class CsvReader
|
|
171
211
|
parse_line( lines, sep: sep )
|
172
212
|
end # method self.header
|
173
213
|
|
174
|
-
####################
|
175
|
-
# helper methods
|
176
|
-
def self.unwrap( row_or_array ) ## unwrap row - find a better name? why? why not?
|
177
|
-
## return row values as array of strings
|
178
|
-
if row_or_array.is_a?( CSV::Row )
|
179
|
-
row = row_or_array
|
180
|
-
row.fields ## gets array of string of field values
|
181
|
-
else ## assume "classic" array of strings
|
182
|
-
array = row_or_array
|
183
|
-
end
|
184
|
-
end
|
185
214
|
end # class CsvReader
|
186
215
|
|
187
216
|
|
188
217
|
|
189
218
|
class CsvHashReader
|
190
219
|
|
191
|
-
def self.read( path, sep: Csv.config.sep, headers: true )
|
192
|
-
CsvReader.read( path, sep: sep, headers: headers )
|
193
|
-
end
|
194
|
-
|
195
220
|
def self.parse( txt, sep: Csv.config.sep, headers: true )
|
196
221
|
CsvReader.parse( txt, sep: sep, headers: headers )
|
197
222
|
end
|
198
223
|
|
224
|
+
def self.read( path, sep: Csv.config.sep, headers: true )
|
225
|
+
CsvReader.read( path, sep: sep, headers: headers )
|
226
|
+
end
|
227
|
+
|
199
228
|
def self.foreach( path, sep: Csv.config.sep, headers: true, &block )
|
200
229
|
CsvReader.foreach( path, sep: sep, headers: headers, &block )
|
201
230
|
end
|
data/lib/csvreader/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csvreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-08-
|
11
|
+
date: 2018-08-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rdoc
|