censive 0.19 → 0.20

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,276 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 8, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
+ #
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Review all encodings, we may be losing speed when mixing encodings
22
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 4. Will using String#freeze give us a speed up?
24
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
25
+ # ============================================================================
26
+
27
+ require "strscan"
28
+
29
# A quick and lightweight CSV reader/writer built on top of StringScanner.
#
# Reading: Censive.parse(str_or_path, **opts) returns an array of rows, each
#          row being an array of strings.
# Writing: Censive.new(out: target, **opts) << row formats one row and appends
#          it to +target+ (anything that responds to <<, default $stdout).
#
# NOTE: +sep+ and +quote+ are expected to be single characters; several of the
# patterns built in #initialize place them inside a character class.
class Censive < StringScanner
  # Encoding of the string currently being scanned.
  attr_reader :encoding

  # Build a parser from the given arguments (see #initialize) and parse it.
  def self.parse(...)
    new(...).parse
  end

  # Create a writer.
  #
  # obj - a String (path that is opened for writing; the new writer is yielded
  #       to the required block and the file is closed afterwards), an IO, or
  #       nil (the writer then falls back to $stdout).
  #
  # Returns the block's value for a path, otherwise the new Censive instance.
  # Raises ArgumentError when a path is given without a block; aborts the
  # program for any other kind of object.
  def self.writer(obj = nil, **opts, &code)
    case obj
    when String
      # check first, so that a missing block does not truncate the file
      block_given? or raise ArgumentError, "a block is required when writing to a file path"
      File.open(obj, "w") { |io| yield new(out: io, **opts) }
    when IO, nil
      new(out: obj, **opts, &code)
    else
      abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
    end
  end

  # str - CSV text, or (when shorter than 100 characters and naming a readable
  #       regular file) the path of a file to read the CSV text from.
  def initialize(str = nil,
    drop:     false,    # drop trailing empty fields?
    encoding: nil,      # character encoding
    excel:    false,    # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
    mode:     :compact, # export mode: compact or full
    out:      nil,      # output stream, needs to respond to <<
    quote:    '"',      # quote character
    relax:    false,    # relax quote parsing so ,"Fo"o, => ,"Fo""o",
    rowsep:   "\n",     # row separator for export
    sep:      ",",      # column separator character
    strip:    false,    # strip fields when reading
    **opts              # grab bag (not referenced anywhere in this class)
  )
    # initialize data source
    if str && str.size < 100 && File.file?(str) && File.readable?(str)
      # block form closes the file handle as soon as it has been read
      str = File.open(str, encoding ? "r:#{encoding}" : "r", &:read)
    else
      str ||= ""
      str = str.encode(encoding) if encoding
    end
    super(str)
    reset

    # config options
    @drop     = drop
    @encoding = str.encoding
    @excel    = excel
    @mode     = mode
    @out      = out || $stdout
    @relax    = relax
    @strip    = strip

    # config strings
    @quote    = quote
    @rowsep   = rowsep
    @sep      = sep

    # static strings
    @cr       = "\r"
    @lf       = "\n"
    @es       = ""
    @eq       = "="

    # combinations
    @esc      = (@quote * 2)
    @seq      = [@sep, @eq].join # used for parsing in excel mode

    # regexes
    #
    # These are deliberately built WITHOUT the /o flag: /o interpolates only
    # once per program, which would freeze the first instance's sep and quote
    # into every later instance. Regexp.escape keeps characters such as "|"
    # or "^" literal, both in alternations and inside character classes.
    sep_re    = Regexp.escape(@sep)
    quote_re  = Regexp.escape(@quote)

    @eoc      = /(?=#{sep_re}|\r|\n|\z)/ # end of cell
    @eol      = /\r\n?|\n/               # end of line
    @escapes  = /(#{quote_re})|#{sep_re}|\r|\n/
    @quotable = /#{sep_re}|\r|\n/
    @quotes   = /#{quote_re}/
    @seps     = /(?:#{sep_re})+/
    @quoted   = @excel ? /=?#{quote_re}/ : @quote
    @unquoted = /[^#{sep_re}\r\n][^#{quote_re}]*/
    @leadzero = /\A0\d*\z/
  end

  # Forget any parsed rows and counters, optionally switch to a new string,
  # and rewind the scanner.
  def reset(str = nil)
    @rows = nil
    @hold = [] # pending tokens; also lets next_row be called before parse
    @cols = @cells = 0

    #!# TODO: reset all encodings?
    self.string = str if str
    @encoding = string.encoding
    super()
  end

  # ==[ Parser ]==

  # Read rows from the current scan position until next_row returns nil.
  # Tracks the widest row in @cols and the total cell count in @cells.
  #
  # Returns the array of rows.
  def parse
    @rows = []
    @hold = []
    while row = next_row
      @rows << row
      count = row.size
      @cols = count if count > @cols
      @cells += count
    end
    @rows
  end

  # Returns the next row as an array of strings, or nil when the very first
  # token of the row is nil (end of input, or an empty line).
  def next_row
    token = next_token or return
    row = []
    row.push(*token)
    row.push(*token) while token = next_token
    row
  end

  # Returns the next token: a String (one cell), an Array of cells, or nil to
  # signal the end of the current row.
  def next_token
    # tokens queued by an earlier multi-line unquoted match go out first
    @hold.empty? or return @hold.shift

    if scan(@quoted) # quoted cell
      token = +""
      loop do
        chunk = scan_until(@quotes) or bomb "unclosed quote"
        token << chunk[0..-2] # everything up to, but excluding, the quote
        if scan(@quote) # doubled quote => one literal quote, keep reading
          token << @quote
          next
        end
        break if scan(@eoc) # closing quote followed by sep, eol, or eos
        @relax or bomb "invalid character after quote"
        inner = scan_until(@quotes) or bomb "bad inline quote"
        token << @quote << inner # keep the stray quote and what follows it
      end
      scan(@sep)
      @strip ? token.strip : token
    elsif match = scan(@unquoted) # unquoted cell(s), possibly spanning lines
      if check(@quote) && !match.chomp!(@sep) && !match.end_with?(@cr, @lf)
        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
          match << (scan_until(@eoc) or bomb "stray quote")
          scan(@sep)
        end
      end
      # queue one array of cells per line, with nil marking each row boundary
      match.split(@eol, -1).each_with_index do |line, i|
        if line.empty?
          @hold.push(nil)
        else
          @hold.push(nil) if i > 0
          cells = line.split(@sep, -1)
          @hold.push(@strip ? cells.map!(&:strip) : cells)
        end
      end
      @hold.shift
    elsif scan(@sep) # one or more leading separators => empty cells
      match = scan(@seps)
      match ? match.split(@sep, -1) : @es
    else
      scan(@eol)
      nil
    end
  end

  # Yield each row, parsing first if that has not happened yet.
  # Returns an Enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?
    @rows ||= parse
    @rows.each { |row| yield row }
  end

  # Write every row to this instance's output, or, when options are given,
  # to a new writer created from those options.
  def export(**opts)
    out = opts.empty? ? self : self.class.writer(**opts)
    each { |row| out << row }
  end

  # ==[ Helpers ]==

  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
  def grok(str)
    if idx = str.index(@escapes)
      # $1 is set only when the first special character found is a quote
      $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
    else
      0
    end
  end

  # Format one row according to @mode and @excel, then append it (followed by
  # the row separator) to the output.
  #
  # Raises ArgumentError when @mode is neither :compact nor :full.
  def <<(row)
    # drop trailing empty columns, working on a copy so the caller's row is
    # untouched; &. ends the loop once the row has been emptied
    if @drop
      row = row.dup
      row.pop while row.last&.empty?
    end

    s, q = @sep, @quote
    out = case @mode
    when :compact
      case @excel ? 2 : grok(row.join)
      when 0
        row
      when 1
        row.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
      else
        row.map do |col|
          if @excel && col.match?(@leadzero)
            "=#{q}#{col}#{q}"
          else
            case grok(col)
            when 0 then col
            when 1 then "#{q}#{col}#{q}"
            else        "#{q}#{col.gsub(q, @esc)}#{q}"
            end
          end
        end
      end
    when :full
      row.map do |col|
        if @excel && col.match?(@leadzero)
          "=#{q}#{col}#{q}"
        else
          "#{q}#{col.gsub(q, @esc)}#{q}"
        end
      end
    else
      raise ArgumentError, "unknown export mode: #{@mode.inspect}"
    end

    @out << out.join(s) + @rowsep
  end

  # Print row, column, cell, and byte counts (parsing first if needed).
  def stats
    @rows ||= parse
    bytes = string.bytesize
    wide = bytes.to_s.size
    puts "%#{wide}d rows"    % @rows.size
    puts "%#{wide}d columns" % @cols
    puts "%#{wide}d cells"   % @cells
    puts "%#{wide}d bytes"   % bytes
  end

  # Abort the program with +msg+, the current character position, and a short
  # excerpt of the input around it.
  def bomb(msg)
    at = charpos # character (not byte) offset, to match String#[] below
    from = [at - 4, 0].max # never wrap around to the end of the string
    abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{string[from, 7]}'"
  end
end
249
+
250
# Demo: when this file is run directly, take the first paragraph of the
# embedded DATA section, parse it in excel + relax mode, and export it.
if $PROGRAM_NAME == __FILE__
  sample = DATA.gets("\n\n").chomp
  # sample = File.read(ARGV.first || "lc-2023.csv")
  Censive.new(sample, excel: true, relax: true).export # (excel: true) # sep: "|")
end
256
+
257
+ __END__
258
+ "Don",="007",10,11,"Ed",20
259
+ Name,Age,,,Shoe,,,
260
+ "Alice",27,5
261
+ Bob,33,10 1/2
262
+ Charlie or "Chuck",=B2 + B3,9
263
+ Subtotal,=sum(B2:B5),="01234"
264
+
265
+ A,B,C,D
266
+ A,B,"C",D
267
+ A,B,C",D
268
+ A,B,"C",D
269
+
270
+ # first line works in "relax" mode, bottom line is compliant
271
+ 123,"CHO, JOELLE "JOJO"",456
272
+ 123,"CHO, JOELLE ""JOJO""",456
273
+
274
+ # Excel mode checking
275
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
276
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -0,0 +1,282 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 9, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
+ # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
19
+ #
20
+ # TODO:
21
+ # 1. Support IO streaming
22
+ # 2. Review all encodings, we may be losing speed when mixing encodings
23
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
+ # 4. Will using String#freeze give us a speed up?
25
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
26
+ # ============================================================================
27
+
28
+ require "strscan"
29
+
30
# A quick and lightweight CSV reader/writer built on top of StringScanner.
#
# Reading: Censive.parse(str_or_path, **opts) returns an array of rows, each
#          row being an array of strings.
# Writing: Censive.new(out: target, **opts) << row formats one row and appends
#          it to +target+ (anything that responds to <<, default $stdout).
#
# NOTE: +sep+ and +quote+ are expected to be single characters; several of the
# patterns built in #initialize place them inside a character class.
class Censive < StringScanner
  # Encoding of the string currently being scanned.
  attr_reader :encoding

  # Build a parser from the given arguments (see #initialize) and parse it.
  def self.parse(...)
    new(...).parse
  end

  # Create a writer.
  #
  # obj - a String (path that is opened for writing; the new writer is yielded
  #       to the required block and the file is closed afterwards), an IO, or
  #       nil (the writer then falls back to $stdout).
  #
  # Returns the block's value for a path, otherwise the new Censive instance.
  # Raises ArgumentError when a path is given without a block; aborts the
  # program for any other kind of object.
  def self.writer(obj = nil, **opts, &code)
    case obj
    when String
      # check first, so that a missing block does not truncate the file
      block_given? or raise ArgumentError, "a block is required when writing to a file path"
      File.open(obj, "w") { |io| yield new(out: io, **opts) }
    when IO, nil
      new(out: obj, **opts, &code)
    else
      abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
    end
  end

  # str - CSV text, or (when shorter than 100 characters and naming a readable
  #       regular file) the path of a file to read the CSV text from.
  def initialize(str = nil,
    drop:     false,    # drop trailing empty columns?
    encoding: nil,      # character encoding
    excel:    false,    # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
    mode:     :compact, # export mode: compact or full
    out:      nil,      # output stream, needs to respond to <<
    quote:    '"',      # quote character
    relax:    false,    # relax quote parsing so ,"Fo"o, => ,"Fo""o",
    rowsep:   "\n",     # row separator for export
    sep:      ",",      # column separator character
    strip:    false,    # strip columns when reading
    **opts              # grab bag (not referenced anywhere in this class)
  )
    # initialize data source
    if str && str.size < 100 && File.file?(str) && File.readable?(str)
      # block form closes the file handle as soon as it has been read
      str = File.open(str, encoding ? "r:#{encoding}" : "r", &:read)
    else
      str ||= ""
      str = str.encode(encoding) if encoding
    end
    super(str)
    reset # also arms the @cheat fast path

    # config options
    @drop     = drop
    @encoding = str.encoding
    @excel    = excel
    @mode     = mode
    @out      = out || $stdout
    @relax    = relax
    @strip    = strip

    # config strings
    @quote    = quote
    @rowsep   = rowsep
    @sep      = sep

    # static strings
    @cr       = "\r"
    @lf       = "\n"
    @es       = ""
    @eq       = "="

    # combinations
    @esc      = (@quote * 2)
    @seq      = [@sep, @eq].join # used for parsing in excel mode

    # regexes
    #
    # These are deliberately built WITHOUT the /o flag: /o interpolates only
    # once per program, which would freeze the first instance's sep and quote
    # into every later instance. Regexp.escape keeps characters such as "|"
    # or "^" literal, both in alternations and inside character classes.
    sep_re    = Regexp.escape(@sep)
    quote_re  = Regexp.escape(@quote)

    @eoc      = /(?=#{sep_re}|\r|\n|\z)/ # end of cell
    @eol      = /\r\n?|\n/               # end of line
    @escapes  = /(#{quote_re})|#{sep_re}|\r|\n/
    @quotable = /#{sep_re}|\r|\n/
    @quotes   = /#{quote_re}/
    @seps     = /(?:#{sep_re})+/
    @quoted   = @excel ? /=?#{quote_re}/ : @quote
    @unquoted = /[^#{sep_re}\r\n][^#{quote_re}\r\n]*/
    @leadzero = /\A0\d*\z/
  end

  # Forget any parsed rows and counters, re-enable the fast path, optionally
  # switch to a new string, and rewind the scanner.
  def reset(str = nil)
    @rows  = nil
    @cols  = @cells = 0
    @cheat = true # fast path stays on until a line needs real quote handling

    #!# TODO: reset all encodings?
    self.string = str if str
    @encoding = string.encoding
    super()
  end

  # ==[ Parser ]==

  # Read rows from the current scan position until next_row returns nil.
  # Tracks the widest row in @cols and the total cell count in @cells.
  #
  # Returns the array of rows.
  def parse
    @rows = []
    while row = next_row
      @rows << row
      count = row.size
      @cols = count if count > @cols
      @cells += count
    end
    @rows
  end

  # Returns the next row as an array of strings, or nil when the very first
  # token of the row is nil (end of input, or an empty line on the slow path).
  #
  # While @cheat is on, a whole line is grabbed and simply split on the
  # separator. Quotes are tolerated only when a column holds exactly two of
  # them, at its very start and very end. Anything else switches @cheat off,
  # rewinds the line, and hands over to the token-based parser.
  def next_row
    if @cheat && (line = scan_until(@eol))
      line = line.chomp # the match always ends in \r, \n, or \r\n
      row = line.split(@sep, -1)
      if line.include?(@quote)
        row.each do |col|
          next if (saw = col.count(@quote)).zero?
          next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
          @cheat = false
          break
        end
      end
      @cheat and return @strip ? row.each(&:strip!) : row
      unscan # give the line back to the token-based parser
    end

    token = next_token or return
    row = []
    row.push(*token)
    row.push(*token) while token = next_token
    row
  end

  # Returns the next token: a String (one cell), an Array of cells, or nil to
  # signal the end of the current row.
  def next_token
    if scan(@quoted) # quoted cell
      token = +""
      loop do
        chunk = scan_until(@quotes) or bomb "unclosed quote"
        token << chunk[0..-2] # everything up to, but excluding, the quote
        if scan(@quote) # doubled quote => one literal quote, keep reading
          token << @quote
          next
        end
        break if scan(@eoc) # closing quote followed by sep, eol, or eos
        @relax or bomb "invalid character after quote"
        inner = scan_until(@quotes) or bomb "bad inline quote"
        token << @quote << inner # keep the stray quote and what follows it
      end
      scan(@sep)
      @strip ? token.strip : token
    elsif match = scan(@unquoted) # unquoted cell(s)
      if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
        unless @excel && match.chomp!(@seq)   # unless an excel literal, fix it
          match << (scan_until(@eoc) or bomb "stray quote")
          scan(@sep)
        end
      end
      tokens = match.split(@sep, -1)
      @strip ? tokens.map!(&:strip) : tokens
    elsif scan(@sep) # one or more leading separators => empty cells
      match = scan(@seps)
      match ? match.split(@sep, -1) : @es
    else
      scan(@eol)
      nil
    end
  end

  # Yield each row, parsing first if that has not happened yet.
  # Returns an Enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?
    @rows ||= parse
    @rows.each { |row| yield row }
  end

  # Write every row to this instance's output, or, when options are given,
  # to a new writer created from those options.
  def export(**opts)
    out = opts.empty? ? self : self.class.writer(**opts)
    each { |row| out << row }
  end

  # ==[ Helpers ]==

  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
  def grok(str)
    if idx = str.index(@escapes)
      # $1 is set only when the first special character found is a quote
      $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
    else
      0
    end
  end

  # Format one row according to @mode and @excel, then append it (followed by
  # the row separator) to the output.
  #
  # Raises ArgumentError when @mode is neither :compact nor :full.
  def <<(row)
    # drop trailing empty columns, working on a copy so the caller's row is
    # untouched; &. ends the loop once the row has been emptied
    if @drop
      row = row.dup
      row.pop while row.last&.empty?
    end

    s, q = @sep, @quote
    out = case @mode
    when :compact
      case @excel ? 2 : grok(row.join)
      when 0
        row
      when 1
        row.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
      else
        row.map do |col|
          if @excel && col.match?(@leadzero)
            "=#{q}#{col}#{q}"
          else
            case grok(col)
            when 0 then col
            when 1 then "#{q}#{col}#{q}"
            else        "#{q}#{col.gsub(q, @esc)}#{q}"
            end
          end
        end
      end
    when :full
      row.map do |col|
        if @excel && col.match?(@leadzero)
          "=#{q}#{col}#{q}"
        else
          "#{q}#{col.gsub(q, @esc)}#{q}"
        end
      end
    else
      raise ArgumentError, "unknown export mode: #{@mode.inspect}"
    end

    @out << out.join(s) + @rowsep
  end

  # Print row, column, cell, and byte counts (parsing first if needed).
  def stats
    @rows ||= parse
    bytes = string.bytesize
    wide = bytes.to_s.size
    puts "%#{wide}d rows"    % @rows.size
    puts "%#{wide}d columns" % @cols
    puts "%#{wide}d cells"   % @cells
    puts "%#{wide}d bytes"   % bytes
  end

  # Abort the program with +msg+, the current character position, and a short
  # excerpt of the input around it.
  def bomb(msg)
    at = charpos # character (not byte) offset, to match String#[] below
    from = [at - 4, 0].max # never wrap around to the end of the string
    abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{string[from, 7]}'"
  end
end
253
+
254
# Demo: when this file is run directly, read a cp932-encoded CSV file (the
# first command-line argument, or KEN_ALL.CSV by default), parse it in
# excel + relax mode, and export it.
if __FILE__ == $0
  # raw = DATA.gets("\n\n").chomp
  # raw = File.read(ARGV.first || "lc-2023.csv")
  path = ARGV.first || "KEN_ALL.CSV"
  raw = File.read(path, encoding: "cp932") # File.read closes the handle for us

  csv = Censive.new(raw, excel: true, relax: true)
  csv.export # (excel: true) # sep: "|")
end
262
+
263
+ __END__
264
+ "Don",="007",10,"Ed"
265
+ Name,Age,,,Shoe,,,
266
+ "Alice",27,5
267
+ Bob,33,10 1/2
268
+ Charlie or "Chuck",=B2 + B3,9
269
+ Subtotal,=sum(B2:B5),="01234"
270
+
271
+ A,B,C,D
272
+ A,B,"C",D
273
+ A,B,C",D
274
+ A,B,"C",D
275
+
276
+ # first line works in "relax" mode, bottom line is compliant
277
+ 123,"CHO, JOELLE "JOJO"",456
278
+ 123,"CHO, JOELLE ""JOJO""",456
279
+
280
+ # Excel mode checking
281
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
282
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off