csvreader 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: af0fcea1b598e6123786a05532a6f5b2e10a4095
4
- data.tar.gz: ba2dc18a6076e425847b440c05819e898f0a66b2
3
+ metadata.gz: a9bc6971bd638abc67e8e82e241dbb370602b0d5
4
+ data.tar.gz: 062f2727188a6f3705c21a5cc825194f84bea41c
5
5
  SHA512:
6
- metadata.gz: 28f60b98574e5331b53280f27017fae776c787ee1b7a56815c8a8f9c21a0926e6f561ca8a75f1464f1743849f989a52f122fbe4f20086de8159cf2df53b71bbe
7
- data.tar.gz: 6d5b80b11e4774bc227bffe62bc829ab19b70f22fd69ca35c54526b85f261fa5c4bf0a7d87c9ba715738f50a6710bdd843f3b6cc1581f0d88744332fdf062796
6
+ metadata.gz: 595f1c779e0457377fe5c09602cba1ce7754803b35b9280e06dfc752759da441c708db830cb159076ce3f445b3b6aaf1fef459ff9eaace4ae6a436988e52455f
7
+ data.tar.gz: f4dc02242912ba15bef498838093f85de22c3670b5bcb2b92139adbd9343cbddec753f755729f85b2962df26d724ff069cc73b8f5cccd3edb896dc0d3ac26969
data/README.md CHANGED
@@ -164,7 +164,7 @@ see [`TabReader` »](https://github.com/datatxt/tabreader).
164
164
 
165
165
  Two major design bugs and many many minor.
166
166
 
167
- 1) The CSV class uses `line.split(`,`)` with some kludges (†) with the claim its faster.
167
+ (1) The CSV class uses `line.split(',')` with some kludges (†) with the claim it's faster.
168
168
  What?! The right way: CSV needs its own purpose-built parser. There's no other
169
169
  way you can handle all the (edge) cases with double quotes and escaped doubled up
170
170
  double quotes. Period.
@@ -175,7 +175,7 @@ Or handling double quotes inside values and so on and on.
175
175
 
176
176
  (†): kludge - a workaround or quick-and-dirty solution that is clumsy, inelegant, inefficient, difficult to extend and hard to maintain
177
177
 
178
- 2) The CSV class returns `nil` for `,,` but an empty string (`""`)
178
+ (2) The CSV class returns `nil` for `,,` but an empty string (`""`)
179
179
  for `"","",""`. The right way: All values are always strings. Period.
180
180
 
181
181
  If you want to use `nil` you MUST configure a string (or strings)
@@ -6,6 +6,34 @@ module Csv ## check: rename to CsvSettings / CsvPref / CsvGlobals or similar
6
6
  ## STD_CSV_ENGINE = CSV ## to avoid name confusion use longer name - why? why not? find a better name?
7
7
  ## use __CSV__ or similar? or just ::CSV ??
8
8
 
9
+
10
+ class Dialect ## todo: use a module - it's just a namespace/module now - why? why not?
11
+ ###
12
+ # (auto-)add these flavors/dialects:
13
+ # :tab -> uses TabReader(!)
14
+ # :strict|:rfc4180
15
+ # :unix -> uses unix-style escapes e.g. \n \" etc.
16
+ # :windows|:excel
17
+ # :guess|:auto -> guess (auto-detect) separator - why? why not?
18
+
19
+ ## e.g. use Dialect.registry[:unix] = { ... } etc.
20
+ ## note use @@ - there is only one registry
21
+ def self.registry() @@registry ||={} end
22
+
23
+ ## add built-in dialects:
24
+ ## trim - use strip? why? why not? use alias?
25
+ registry[:tab] = {} ##{ class: TabReader }
26
+ registry[:strict] = { strict: true, trim: false } ## add no comments, blank lines, etc. ???
27
+ registry[:rfc4180] = :strict ## alternative name
28
+ registry[:windows] = {}
29
+ registry[:excel] = :windows
30
+ registry[:unix] = {}
31
+
32
+ ## todo: add some more
33
+ end # class Dialect
34
+
35
+
36
+
9
37
  class Configuration
10
38
 
11
39
  puts "CSV::VERSION:"
@@ -23,6 +51,9 @@ module Csv ## check: rename to CsvSettings / CsvPref / CsvGlobals or similar
23
51
 
24
52
 
25
53
  attr_accessor :sep ## col_sep (column separator)
54
+ attr_accessor :na ## not available (string or array of strings or nil) - rename to nas/nils/nulls - why? why not?
55
+ attr_accessor :trim ### allow ltrim/rtrim/trim - why? why not?
56
+ attr_accessor :dialect
26
57
 
27
58
  def initialize
28
59
  @sep = ','
@@ -32,6 +63,8 @@ module Csv ## check: rename to CsvSettings / CsvPref / CsvGlobals or similar
32
63
  self ## return self for chaining
33
64
  end
34
65
 
66
+ def trim?() @trim; end ## strip leading and trailing spaces
67
+
35
68
  def blank?( line )
36
69
  ## note: blank line does NOT include "blank" with spaces only!!
37
70
  ## use BLANK_REGEX in skip_lines to clean-up/skip/remove/ignore
@@ -96,46 +129,53 @@ end # module Csvv
96
129
 
97
130
  class CsvReader
98
131
 
99
- def self.foreach( path, sep: Csv.config.sep, headers: false )
132
+ def self.parse_line( txt, sep: Csv.config.sep,
133
+ trim: Csv.config.trim?,
134
+ na: Csv.config.na,
135
+ dialect: Csv.config.dialect,
136
+ converters: nil)
137
+ ## note: do NOT include headers option (otherwise single row gets skipped as first header row :-)
100
138
  csv_options = Csv.config.default_options.merge(
101
- headers: headers,
102
- col_sep: sep,
103
- external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding for now!!!
139
+ headers: false, ## note: always turn off headers!!!!!!
140
+ col_sep: sep
104
141
  )
142
+ ## pp csv_options
143
+ CSV.parse_line( txt, csv_options )
144
+ end
105
145
 
106
- CSV.foreach( path, csv_options ) do |row|
107
- yield( row ) ## check/todo: use block.call( row ) ## why? why not?
108
- end
146
+ def self.parse( txt, sep: Csv.config.sep, headers: false )
147
+ csv_options = Csv.config.default_options.merge(
148
+ headers: headers,
149
+ col_sep: sep
150
+ )
151
+ ## pp csv_options
152
+ CSV.parse( txt, csv_options )
109
153
  end
110
154
 
111
155
  def self.read( path, sep: Csv.config.sep, headers: false )
112
156
  ## note: use our own file.open
113
157
  ## always use utf-8 for now
114
158
  ## check/todo: add skip option bom too - why? why not?
115
- txt = File.open( path, 'r:utf-8' )
159
+ txt = File.open( path, 'r:bom|utf-8' )
116
160
  parse( txt, sep: sep, headers: headers )
117
161
  end
118
162
 
119
- def self.parse( txt, sep: Csv.config.sep, headers: false )
163
+ def self.foreach( path, sep: Csv.config.sep, headers: false )
120
164
  csv_options = Csv.config.default_options.merge(
121
165
  headers: headers,
122
- col_sep: sep
166
+ col_sep: sep,
167
+ external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding for now!!!
123
168
  )
124
- ## pp csv_options
125
- CSV.parse( txt, csv_options )
126
- end
127
169
 
170
+ ## todo/check/fix:
171
+ ## can use bom e.g. 'bom|utf-8' - how?
172
+ ## raises ArgumentError: unknown encoding name - bom|utf-8
128
173
 
129
- def self.parse_line( txt, sep: Csv.config.sep )
130
- ## note: do NOT include headers option (otherwise single row gets skipped as first header row :-)
131
- csv_options = Csv.config.default_options.merge(
132
- headers: false, ## note: always turn off headers!!!!!!
133
- col_sep: sep
134
- )
135
- ## pp csv_options
136
- CSV.parse_line( txt, csv_options )
137
- end
138
174
 
175
+ CSV.foreach( path, csv_options ) do |row|
176
+ yield( row ) ## check/todo: use block.call( row ) ## why? why not?
177
+ end
178
+ end
139
179
 
140
180
  def self.header( path, sep: Csv.config.sep ) ## use header or headers - or use both (with alias)?
141
181
  # read first lines (only)
@@ -148,7 +188,7 @@ class CsvReader
148
188
  ## - NOT a blank line
149
189
 
150
190
  lines = ''
151
- File.open( path, 'r:utf-8' ) do |f|
191
+ File.open( path, 'r:bom|utf-8' ) do |f|
152
192
 
153
193
  ## todo/fix: how to handle empty files or files without headers?!
154
194
 
@@ -171,31 +211,20 @@ class CsvReader
171
211
  parse_line( lines, sep: sep )
172
212
  end # method self.header
173
213
 
174
- ####################
175
- # helper methods
176
- def self.unwrap( row_or_array ) ## unwrap row - find a better name? why? why not?
177
- ## return row values as array of strings
178
- if row_or_array.is_a?( CSV::Row )
179
- row = row_or_array
180
- row.fields ## gets array of string of field values
181
- else ## assume "classic" array of strings
182
- array = row_or_array
183
- end
184
- end
185
214
  end # class CsvReader
186
215
 
187
216
 
188
217
 
189
218
  class CsvHashReader
190
219
 
191
- def self.read( path, sep: Csv.config.sep, headers: true )
192
- CsvReader.read( path, sep: sep, headers: headers )
193
- end
194
-
195
220
  def self.parse( txt, sep: Csv.config.sep, headers: true )
196
221
  CsvReader.parse( txt, sep: sep, headers: headers )
197
222
  end
198
223
 
224
+ def self.read( path, sep: Csv.config.sep, headers: true )
225
+ CsvReader.read( path, sep: sep, headers: headers )
226
+ end
227
+
199
228
  def self.foreach( path, sep: Csv.config.sep, headers: true, &block )
200
229
  CsvReader.foreach( path, sep: sep, headers: headers, &block )
201
230
  end
@@ -4,7 +4,7 @@
4
4
  class CsvReader ## note: uses a class for now - change to module - why? why not?
5
5
 
6
6
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
7
- MINOR = 2
7
+ MINOR = 3
8
8
  PATCH = 0
9
9
  VERSION = [MAJOR,MINOR,PATCH].join('.')
10
10
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvreader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-19 00:00:00.000000000 Z
11
+ date: 2018-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rdoc