censive 0.18 → 0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/censive.gemspec +1 -1
- data/diagram/NFA to Regex.pdf +0 -0
- data/diagram/censive@ce9d51d.png +0 -0
- data/diagram/csv-ragel.dot +24 -0
- data/diagram/csv.dot +57 -0
- data/diagram/csv.png +0 -0
- data/diagram/csv.rl +45 -0
- data/diagram/csv.svg +270 -0
- data/diagram/diagram.dot +26 -0
- data/diagram/diagram.rl +50 -0
- data/lib/censive.rb +127 -97
- data/lib/censive.rb-20230208182732 +266 -0
- data/lib/censive.rb-20230208195221 +276 -0
- data/lib/censive.rb-20230209050227 +282 -0
- data/lib/flay.rb +227 -0
- data/lib/test-censive.rb +12 -0
- data/lib/test-csv.rb +12 -0
- metadata +17 -2
data/lib/censive.rb
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# censive - A quick and lightweight CSV handling library for Ruby
|
|
5
5
|
#
|
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
|
7
|
-
# Date: Feb
|
|
7
|
+
# Date: Feb 10, 2023
|
|
8
8
|
#
|
|
9
9
|
# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
|
10
10
|
# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
|
@@ -14,14 +14,22 @@
|
|
|
14
14
|
# GOALS:
|
|
15
15
|
# 1. Faster than Ruby's default CSV library
|
|
16
16
|
# 2. Lightweight code with streamlined and optimized logic
|
|
17
|
-
# 3. Support most non-compliant CSV variations (
|
|
17
|
+
# 3. Support most non-compliant CSV variations (@excel, @relax, etc)
|
|
18
|
+
# 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
|
|
18
19
|
#
|
|
19
|
-
# TODO:
|
|
20
|
+
# TODO:
|
|
21
|
+
# 1. Support IO streaming
|
|
22
|
+
# 2. Review all encodings, we may be losing speed when mixing encodings
|
|
23
|
+
# 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
|
|
24
|
+
# 4. Will using String#freeze give us a speed up?
|
|
25
|
+
# 5. Implement support for scan_until(string) <= right now only regex is valid
|
|
20
26
|
# ============================================================================
|
|
21
27
|
|
|
22
28
|
require "strscan"
|
|
23
29
|
|
|
24
30
|
class Censive < StringScanner
|
|
31
|
+
attr :encoding
|
|
32
|
+
|
|
25
33
|
def self.parse(...)
|
|
26
34
|
new(...).parse
|
|
27
35
|
end
|
|
@@ -34,86 +42,84 @@ class Censive < StringScanner
|
|
|
34
42
|
end
|
|
35
43
|
end
|
|
36
44
|
|
|
37
|
-
def initialize(str=
|
|
38
|
-
drop: false , # drop trailing empty
|
|
39
|
-
encoding:
|
|
45
|
+
def initialize(str=nil,
|
|
46
|
+
drop: false , # drop trailing empty columns?
|
|
47
|
+
encoding: nil , # character encoding
|
|
40
48
|
excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
|
|
41
49
|
mode: :compact, # export mode: compact or full
|
|
42
|
-
out:
|
|
50
|
+
out: nil , # output stream, needs to respond to <<
|
|
43
51
|
quote: '"' , # quote character
|
|
44
52
|
relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
|
|
45
53
|
rowsep: "\n" , # row separator for export
|
|
46
54
|
sep: "," , # column separator character
|
|
47
|
-
strip: false , # strip
|
|
48
|
-
**opts
|
|
55
|
+
strip: false , # strip columns when reading
|
|
56
|
+
**opts # grab bag
|
|
49
57
|
)
|
|
50
|
-
#
|
|
58
|
+
# initialize data source
|
|
59
|
+
if str && str.size < 100 && File.readable?(str)
|
|
60
|
+
str = File.open(str, encoding ? "r:#{encoding}" : "r").read
|
|
61
|
+
else
|
|
62
|
+
str ||= ""
|
|
63
|
+
str = str.encode(encoding) if encoding
|
|
64
|
+
end
|
|
65
|
+
super(str)
|
|
66
|
+
reset
|
|
67
|
+
|
|
68
|
+
# config options
|
|
69
|
+
@cheat = true
|
|
51
70
|
@drop = drop
|
|
52
|
-
@encoding = encoding
|
|
71
|
+
@encoding = str.encoding
|
|
53
72
|
@excel = excel
|
|
54
73
|
@mode = mode
|
|
55
|
-
@out = out
|
|
56
|
-
@quote = quote
|
|
74
|
+
@out = out || $stdout
|
|
57
75
|
@relax = relax
|
|
76
|
+
@strip = strip
|
|
77
|
+
|
|
78
|
+
# config strings
|
|
79
|
+
@quote = quote
|
|
58
80
|
@rowsep = rowsep
|
|
59
81
|
@sep = sep
|
|
60
|
-
@strip = strip
|
|
61
82
|
|
|
62
|
-
#
|
|
63
|
-
@cr
|
|
64
|
-
@lf
|
|
65
|
-
@es
|
|
66
|
-
@eq
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
@
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
|
|
83
|
+
# static strings
|
|
84
|
+
@cr = "\r"
|
|
85
|
+
@lf = "\n"
|
|
86
|
+
@es = ""
|
|
87
|
+
@eq = "="
|
|
88
|
+
|
|
89
|
+
# combinations
|
|
90
|
+
@esc = (@quote * 2)
|
|
91
|
+
@seq = [@sep, @eq].join # used for parsing in excel mode
|
|
92
|
+
|
|
93
|
+
# regexes
|
|
94
|
+
@eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
|
|
95
|
+
@eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
|
|
96
|
+
@escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
|
|
97
|
+
@quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
|
|
98
|
+
@quotes = /#{@quote}/o
|
|
99
|
+
@seps = /#{@sep}+/o
|
|
100
|
+
@quoted = @excel ? /(?:=)?#{@quote}/o : @quote
|
|
101
|
+
@unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
|
|
102
|
+
@leadzero = /\A0\d*\z/
|
|
77
103
|
end
|
|
78
104
|
|
|
79
105
|
def reset(str=nil)
|
|
80
|
-
self.string = str if str
|
|
81
|
-
super()
|
|
82
106
|
@rows = nil
|
|
83
107
|
@cols = @cells = 0
|
|
84
|
-
end
|
|
85
108
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
excel = true if @excel && scan(@eq)
|
|
90
|
-
|
|
91
|
-
if scan(@quote) # consume quoted cell
|
|
92
|
-
token = ""
|
|
93
|
-
while true
|
|
94
|
-
token << (scan_until(/#{@quote}/o) or bomb "unclosed quote")[0..-2]
|
|
95
|
-
token << @quote and next if scan(@quote)
|
|
96
|
-
break if scan(@eoc)
|
|
97
|
-
@relax or bomb "invalid character after quote"
|
|
98
|
-
token << @quote + (scan_until(/#{@quote}/o) or bomb "bad inline quote")
|
|
99
|
-
end
|
|
100
|
-
elsif scan(@sep) then return @es
|
|
101
|
-
elsif scan(@eol) then return nil
|
|
102
|
-
else # consume unquoted cell
|
|
103
|
-
token = scan_until(@eoc) or bomb "unexpected character"
|
|
104
|
-
token.prepend(@eq) if excel
|
|
105
|
-
end
|
|
106
|
-
scan(@sep)
|
|
107
|
-
@strip ? token.strip : token
|
|
108
|
-
end
|
|
109
|
-
|
|
110
|
-
def bomb(msg)
|
|
111
|
-
abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
|
|
109
|
+
self.string = str if str
|
|
110
|
+
@encoding = string.encoding
|
|
111
|
+
super()
|
|
112
112
|
end
|
|
113
113
|
|
|
114
114
|
# ==[ Parser ]==
|
|
115
115
|
|
|
116
116
|
def parse
|
|
117
|
+
|
|
118
|
+
# TODO: crazy optimization if NO QUOTES in rest
|
|
119
|
+
# unless rest.include?(@quote)
|
|
120
|
+
# @rows = rest...
|
|
121
|
+
# end
|
|
122
|
+
|
|
117
123
|
@rows = []
|
|
118
124
|
while row = next_row
|
|
119
125
|
@rows << row
|
|
@@ -125,18 +131,71 @@ class Censive < StringScanner
|
|
|
125
131
|
end
|
|
126
132
|
|
|
127
133
|
def next_row
|
|
134
|
+
if @cheat and line = scan_until(@eol)
|
|
135
|
+
row = line.chomp!.split(@sep, -1)
|
|
136
|
+
row.each do |col|
|
|
137
|
+
next if (saw = col.count(@quote)).zero?
|
|
138
|
+
next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
|
|
139
|
+
@cheat = false
|
|
140
|
+
break
|
|
141
|
+
end if line.include?(@quote)
|
|
142
|
+
@cheat and return @strip ? row.each(&:strip!) : row
|
|
143
|
+
unscan
|
|
144
|
+
end
|
|
145
|
+
|
|
128
146
|
token = next_token or return
|
|
129
|
-
row = [
|
|
130
|
-
row
|
|
147
|
+
row = []
|
|
148
|
+
row.push(*token)
|
|
149
|
+
row.push(*token) while token = next_token
|
|
131
150
|
row
|
|
132
151
|
end
|
|
133
152
|
|
|
153
|
+
def next_token
|
|
154
|
+
if scan(@quoted) # quoted cell
|
|
155
|
+
token = ""
|
|
156
|
+
while true
|
|
157
|
+
token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
|
|
158
|
+
token << @quote and next if scan(@quote)
|
|
159
|
+
scan(@eoc) and break
|
|
160
|
+
@relax or bomb "invalid character after quote"
|
|
161
|
+
token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
|
|
162
|
+
end
|
|
163
|
+
scan(@sep)
|
|
164
|
+
@strip ? token.strip : token
|
|
165
|
+
elsif match = scan(@unquoted) # unquoted cell(s)
|
|
166
|
+
if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
|
|
167
|
+
unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
|
|
168
|
+
match << (scan_until(@eoc) or bomb "stray quote")
|
|
169
|
+
scan(@sep)
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
tokens = match.split(@sep, -1)
|
|
173
|
+
@strip ? tokens.map!(&:strip) : tokens
|
|
174
|
+
elsif scan(@sep)
|
|
175
|
+
match = scan(@seps)
|
|
176
|
+
match ? match.split(@sep, -1) : @es
|
|
177
|
+
else
|
|
178
|
+
scan(@eol)
|
|
179
|
+
nil
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
def each
|
|
184
|
+
@rows ||= parse
|
|
185
|
+
@rows.each {|row| yield row }
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
def export(**opts)
|
|
189
|
+
out = opts.empty? ? self : self.class.writer(**opts)
|
|
190
|
+
each {|row| out << row }
|
|
191
|
+
end
|
|
192
|
+
|
|
134
193
|
# ==[ Helpers ]==
|
|
135
194
|
|
|
136
195
|
# returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
|
137
196
|
def grok(str)
|
|
138
|
-
if idx = str.index(
|
|
139
|
-
$1 ? 2 : str.index(
|
|
197
|
+
if idx = str.index(@escapes)
|
|
198
|
+
$1 ? 2 : str.index(@quotes, idx) ? 2 : 1
|
|
140
199
|
else
|
|
141
200
|
0
|
|
142
201
|
end
|
|
@@ -156,11 +215,11 @@ class Censive < StringScanner
|
|
|
156
215
|
row
|
|
157
216
|
when 1
|
|
158
217
|
row.map do |col|
|
|
159
|
-
col.match?(
|
|
218
|
+
col.match?(@quotable) ? "#{q}#{col}#{q}" : col
|
|
160
219
|
end
|
|
161
220
|
else
|
|
162
221
|
row.map do |col|
|
|
163
|
-
@excel && col =~
|
|
222
|
+
@excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
|
|
164
223
|
case grok(col)
|
|
165
224
|
when 0 then col
|
|
166
225
|
when 1 then "#{q}#{col}#{q}"
|
|
@@ -171,7 +230,7 @@ class Censive < StringScanner
|
|
|
171
230
|
when :full
|
|
172
231
|
if @excel
|
|
173
232
|
row.map do |col|
|
|
174
|
-
col =~
|
|
233
|
+
col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
|
|
175
234
|
end
|
|
176
235
|
else
|
|
177
236
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
|
@@ -181,16 +240,6 @@ class Censive < StringScanner
|
|
|
181
240
|
@out << out + @rowsep
|
|
182
241
|
end
|
|
183
242
|
|
|
184
|
-
def each
|
|
185
|
-
@rows ||= parse
|
|
186
|
-
@rows.each {|row| yield row }
|
|
187
|
-
end
|
|
188
|
-
|
|
189
|
-
def export(**opts)
|
|
190
|
-
out = opts.empty? ? self : self.class.writer(**opts)
|
|
191
|
-
each {|row| out << row }
|
|
192
|
-
end
|
|
193
|
-
|
|
194
243
|
def stats
|
|
195
244
|
wide = string.size.to_s.size
|
|
196
245
|
puts "%#{wide}d rows" % @rows.size
|
|
@@ -198,27 +247,8 @@ class Censive < StringScanner
|
|
|
198
247
|
puts "%#{wide}d cells" % @cells
|
|
199
248
|
puts "%#{wide}d bytes" % string.size
|
|
200
249
|
end
|
|
201
|
-
end
|
|
202
250
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
csv = Censive.new(raw, excel: true, relax: true)
|
|
207
|
-
csv.export # (sep: ",", excel: true)
|
|
251
|
+
def bomb(msg)
|
|
252
|
+
abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
|
|
253
|
+
end
|
|
208
254
|
end
|
|
209
|
-
|
|
210
|
-
__END__
|
|
211
|
-
Name,Age,Shoe
|
|
212
|
-
Alice,27,5
|
|
213
|
-
Bob,33,10 1/2
|
|
214
|
-
Charlie or "Chuck",=B2 + B3,9
|
|
215
|
-
"Doug E Fresh",="007",10
|
|
216
|
-
Subtotal,=sum(B2:B5),="01234"
|
|
217
|
-
|
|
218
|
-
# first line works in "relax" mode, bottom line is compliant
|
|
219
|
-
123,"CHO, JOELLE "JOJO"",456
|
|
220
|
-
123,"CHO, JOELLE ""JOJO""",456
|
|
221
|
-
|
|
222
|
-
# Excel mode checking
|
|
223
|
-
=,=x,x=,="x",="","","=",123,0123,="123",="0123"
|
|
224
|
-
,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
# ============================================================================
|
|
4
|
+
# censive - A quick and lightweight CSV handling library for Ruby
|
|
5
|
+
#
|
|
6
|
+
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
|
7
|
+
# Date: Feb 8, 2023
|
|
8
|
+
#
|
|
9
|
+
# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
|
10
|
+
# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
|
11
|
+
#
|
|
12
|
+
# Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
|
|
13
|
+
# ============================================================================
|
|
14
|
+
# GOALS:
|
|
15
|
+
# 1. Faster than Ruby's default CSV library
|
|
16
|
+
# 2. Lightweight code with streamlined and optimized logic
|
|
17
|
+
# 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
|
|
18
|
+
#
|
|
19
|
+
# TODO:
|
|
20
|
+
# 1. Support IO streaming
|
|
21
|
+
# 2. Review all encodings, we may be losing speed when mixing encodings
|
|
22
|
+
# 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
|
|
23
|
+
# 4. Will using String#freeze give us a speed up?
|
|
24
|
+
# 5. Implement support for scan_until(string) <= right now only regex is valid
|
|
25
|
+
# ============================================================================
|
|
26
|
+
|
|
27
|
+
require "strscan"
|
|
28
|
+
|
|
29
|
+
class Censive < StringScanner
|
|
30
|
+
attr :encoding
|
|
31
|
+
|
|
32
|
+
def self.parse(...)
|
|
33
|
+
new(...).parse
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def self.writer(obj=nil, **opts, &code)
|
|
37
|
+
case obj
|
|
38
|
+
when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
|
|
39
|
+
when IO,nil then new(out: obj, **opts, &code)
|
|
40
|
+
else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def initialize(str=nil,
|
|
45
|
+
drop: false , # drop trailing empty fields?
|
|
46
|
+
encoding: nil , # character encoding
|
|
47
|
+
excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
|
|
48
|
+
mode: :compact, # export mode: compact or full
|
|
49
|
+
out: nil , # output stream, needs to respond to <<
|
|
50
|
+
quote: '"' , # quote character
|
|
51
|
+
relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
|
|
52
|
+
rowsep: "\n" , # row separator for export
|
|
53
|
+
sep: "," , # column separator character
|
|
54
|
+
strip: false , # strip fields when reading
|
|
55
|
+
**opts # grab bag
|
|
56
|
+
)
|
|
57
|
+
# initialize data source
|
|
58
|
+
if str && str.size < 100 && File.readable?(str)
|
|
59
|
+
str = File.open(str, encoding ? "r:#{encoding}" : "r").read
|
|
60
|
+
else
|
|
61
|
+
str ||= ""
|
|
62
|
+
str = str.encode(encoding) if encoding
|
|
63
|
+
end
|
|
64
|
+
super(str)
|
|
65
|
+
reset
|
|
66
|
+
|
|
67
|
+
# config options
|
|
68
|
+
@drop = drop
|
|
69
|
+
@encoding = str.encoding
|
|
70
|
+
@excel = excel
|
|
71
|
+
@mode = mode
|
|
72
|
+
@out = out || $stdout
|
|
73
|
+
@relax = relax
|
|
74
|
+
@strip = strip
|
|
75
|
+
|
|
76
|
+
# config strings
|
|
77
|
+
@quote = quote
|
|
78
|
+
@rowsep = rowsep
|
|
79
|
+
@sep = sep
|
|
80
|
+
|
|
81
|
+
# static strings
|
|
82
|
+
@cr = "\r"
|
|
83
|
+
@lf = "\n"
|
|
84
|
+
@es = ""
|
|
85
|
+
@eq = "="
|
|
86
|
+
|
|
87
|
+
# combinations
|
|
88
|
+
@esc = (@quote * 2)
|
|
89
|
+
@seq = [@sep, @eq].join # used for parsing in excel mode
|
|
90
|
+
|
|
91
|
+
#!# TODO: come up with a clean way to escape/encode all this
|
|
92
|
+
#!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
|
|
93
|
+
|
|
94
|
+
# regexes
|
|
95
|
+
@eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
|
|
96
|
+
@eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
|
|
97
|
+
@escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
|
|
98
|
+
@quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
|
|
99
|
+
@quotes = /#{@quote}/o
|
|
100
|
+
@seps = /#{@sep}+/o
|
|
101
|
+
@quoted = @excel ? /(?:=)?#{@quote}/o : @quote
|
|
102
|
+
@unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
|
|
103
|
+
@leadzero = /\A0\d*\z/
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def reset(str=nil)
|
|
107
|
+
@rows = nil
|
|
108
|
+
@cols = @cells = 0
|
|
109
|
+
|
|
110
|
+
#!# TODO: reset all encodings?
|
|
111
|
+
self.string = str if str
|
|
112
|
+
@encoding = string.encoding
|
|
113
|
+
super()
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# ==[ Parser ]==
|
|
117
|
+
|
|
118
|
+
def parse
|
|
119
|
+
@rows = []
|
|
120
|
+
while row = next_row
|
|
121
|
+
@rows << row
|
|
122
|
+
count = row.size
|
|
123
|
+
@cols = count if count > @cols
|
|
124
|
+
@cells += count
|
|
125
|
+
end
|
|
126
|
+
@rows
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
def next_row
|
|
130
|
+
token = next_token or return
|
|
131
|
+
row = []
|
|
132
|
+
row.push(*token)
|
|
133
|
+
row.push(*token) while token = next_token
|
|
134
|
+
row
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def next_token
|
|
138
|
+
if scan(@quoted) # quoted cell
|
|
139
|
+
token = ""
|
|
140
|
+
while true
|
|
141
|
+
token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
|
|
142
|
+
token << @quote and next if scan(@quote)
|
|
143
|
+
scan(@eoc) and break
|
|
144
|
+
@relax or bomb "invalid character after quote"
|
|
145
|
+
token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
|
|
146
|
+
end
|
|
147
|
+
scan(@sep)
|
|
148
|
+
@strip ? token.strip : token
|
|
149
|
+
elsif match = scan(@unquoted) # unquoted cell(s)
|
|
150
|
+
if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
|
|
151
|
+
unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
|
|
152
|
+
match << (scan_until(@eoc) or bomb "stray quote")
|
|
153
|
+
scan(@sep)
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
tokens = match.split(@sep, -1)
|
|
157
|
+
@strip ? tokens.map!(&:strip) : tokens
|
|
158
|
+
elsif scan(@sep)
|
|
159
|
+
match = scan(@seps)
|
|
160
|
+
match ? match.split(@sep, -1) : @es
|
|
161
|
+
else
|
|
162
|
+
scan(@eol)
|
|
163
|
+
nil
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def each
|
|
168
|
+
@rows ||= parse
|
|
169
|
+
@rows.each {|row| yield row }
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def export(**opts)
|
|
173
|
+
out = opts.empty? ? self : self.class.writer(**opts)
|
|
174
|
+
each {|row| out << row }
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
# ==[ Helpers ]==
|
|
178
|
+
|
|
179
|
+
# returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
|
180
|
+
def grok(str)
|
|
181
|
+
if idx = str.index(@escapes)
|
|
182
|
+
$1 ? 2 : str.index(@quotes, idx) ? 2 : 1
|
|
183
|
+
else
|
|
184
|
+
0
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# output a row
|
|
189
|
+
def <<(row)
|
|
190
|
+
|
|
191
|
+
# drop trailing empty columns
|
|
192
|
+
row.pop while row.last.empty? if @drop
|
|
193
|
+
|
|
194
|
+
s,q = @sep, @quote
|
|
195
|
+
out = case @mode
|
|
196
|
+
when :compact
|
|
197
|
+
case @excel ? 2 : grok(row.join)
|
|
198
|
+
when 0
|
|
199
|
+
row
|
|
200
|
+
when 1
|
|
201
|
+
row.map do |col|
|
|
202
|
+
col.match?(@quotable) ? "#{q}#{col}#{q}" : col
|
|
203
|
+
end
|
|
204
|
+
else
|
|
205
|
+
row.map do |col|
|
|
206
|
+
@excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
|
|
207
|
+
case grok(col)
|
|
208
|
+
when 0 then col
|
|
209
|
+
when 1 then "#{q}#{col}#{q}"
|
|
210
|
+
else "#{q}#{col.gsub(q, @esc)}#{q}"
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
end
|
|
214
|
+
when :full
|
|
215
|
+
if @excel
|
|
216
|
+
row.map do |col|
|
|
217
|
+
col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
|
|
218
|
+
end
|
|
219
|
+
else
|
|
220
|
+
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
|
221
|
+
end
|
|
222
|
+
end.join(s)
|
|
223
|
+
|
|
224
|
+
@out << out + @rowsep
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
def stats
|
|
228
|
+
wide = string.size.to_s.size
|
|
229
|
+
puts "%#{wide}d rows" % @rows.size
|
|
230
|
+
puts "%#{wide}d columns" % @cols
|
|
231
|
+
puts "%#{wide}d cells" % @cells
|
|
232
|
+
puts "%#{wide}d bytes" % string.size
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
def bomb(msg)
|
|
236
|
+
abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
if __FILE__ == $0
|
|
241
|
+
raw = DATA.gets("\n\n").chomp
|
|
242
|
+
# raw = File.read(ARGV.first || "lc-2023.csv")
|
|
243
|
+
csv = Censive.new(raw, excel: true, relax: true)
|
|
244
|
+
csv.export # (excel: true) # sep: "|")
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
__END__
|
|
248
|
+
"Don",="007",10,"Ed"
|
|
249
|
+
Name,Age,,,Shoe,,,
|
|
250
|
+
"Alice",27,5
|
|
251
|
+
Bob,33,10 1/2
|
|
252
|
+
Charlie or "Chuck",=B2 + B3,9
|
|
253
|
+
Subtotal,=sum(B2:B5),="01234"
|
|
254
|
+
|
|
255
|
+
A,B,C,D
|
|
256
|
+
A,B,"C",D
|
|
257
|
+
A,B,C",D
|
|
258
|
+
A,B,"C",D
|
|
259
|
+
|
|
260
|
+
# first line works in "relax" mode, bottom line is compliant
|
|
261
|
+
123,"CHO, JOELLE "JOJO"",456
|
|
262
|
+
123,"CHO, JOELLE ""JOJO""",456
|
|
263
|
+
|
|
264
|
+
# Excel mode checking
|
|
265
|
+
=,=x,x=,="x",="","","=",123,0123,="123",="0123"
|
|
266
|
+
,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
|