censive 0.19 → 0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,276 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 8, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
+ #
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Review all encodings, we may be losing speed when mixing encodings
22
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 4. Will using String#freeze give us a speed up?
24
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
25
+ # ============================================================================
26
+
27
+ require "strscan"
28
+
29
# Censive: a small CSV reader/writer built directly on StringScanner.
#
# Reading:  Censive.parse(text_or_path, **opts) returns an array of rows, each
#           row being an array of strings.
# Writing:  Censive.writer(target, **opts) returns an instance whose #<<
#           formats one row and appends it to the output stream.
class Censive < StringScanner
  attr_reader :encoding

  # One-shot parse; every argument is forwarded to Censive.new.
  def self.parse(...)
    new(...).parse
  end

  # Builds a writer for +obj+:
  #   String  - a path, opened for writing; the writer is yielded to the
  #             (required) block, the file is closed afterwards and the
  #             block's value is returned
  #   nil     - output goes to $stdout
  #   other   - any non-numeric object responding to << (IO, StringIO, ...)
  # For the last two the writer is returned (and also yielded when a block is
  # given). Anything else aborts the program.
  def self.writer(obj = nil, **opts, &code)
    case obj
    when String
      File.open(obj, "w") { |io| yield new(out: io, **opts) }
    else
      unless obj.nil? || (obj.respond_to?(:<<) && !obj.is_a?(Numeric))
        abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
      end
      writer = new(out: obj, **opts)
      yield writer if block_given?
      writer
    end
  end

  # +str+ is CSV text, or (when shorter than 100 characters and naming a
  # readable regular file) the path of a file to load.
  def initialize(str = nil,
    drop:     false,    # drop trailing empty fields?
    encoding: nil,      # character encoding
    excel:    false,    # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
    mode:     :compact, # export mode: compact or full
    out:      nil,      # output stream, needs to respond to <<
    quote:    '"',      # quote character
    relax:    false,    # relax quote parsing so ,"Fo"o, => ,"Fo""o",
    rowsep:   "\n",     # row separator for export
    sep:      ",",      # column separator character
    strip:    false,    # strip fields when reading
    **opts              # grab bag (accepted, currently unused)
  )
    # initialize data source
    if str && str.size < 100 && File.file?(str) && File.readable?(str)
      # block form so the file handle is closed as soon as it has been read
      str = File.open(str, encoding ? "r:#{encoding}" : "r", &:read)
    else
      str ||= ""
      str = str.encode(encoding) if encoding
    end
    super(str)

    # config options
    @drop   = drop
    @excel  = excel
    @mode   = mode
    @out    = out || $stdout
    @relax  = relax
    @strip  = strip

    # config strings
    @quote  = quote
    @rowsep = rowsep
    @sep    = sep

    # combinations
    @esc    = @quote * 2         # an escaped quote inside a quoted cell
    @seq    = "#{@sep}="         # used for parsing in excel mode

    # Regexes are compiled per instance (no /o flag, which would freeze the
    # first instance's separator and quote into every later instance) and the
    # configurable characters are escaped so that "|", "." etc. are literal.
    s = Regexp.escape(@sep)
    q = Regexp.escape(@quote)
    @eoc      = /(?=#{s}|\r|\n|\z)/          # end of cell
    @eol      = /\r\n?|\n/                   # end of line
    @escapes  = /(#{q})|#{s}|\r|\n/
    @quotable = /#{s}|\r|\n/
    @quotes   = /#{q}/
    @seps     = /(?:#{s})+/
    @quoted   = @excel ? /=?#{q}/ : @quote
    @unquoted = /[^#{s}\r\n][^#{q}]*/        # may span several lines
    @leadzero = /\A0\d*\z/

    reset
  end

  # Rewinds the scanner, optionally switching to a new source string, and
  # clears everything collected by previous parsing.
  def reset(str = nil)
    @rows = nil
    @cols = @cells = 0
    @hold = [] # tokens read ahead by the multi-line unquoted scan

    self.string = str if str
    @encoding = string.encoding
    super()
  end

  # ==[ Parser ]==

  # Reads rows from the current position to the end of the input and returns
  # them; also updates the counters reported by #stats.
  def parse
    @rows = []
    while (row = next_row)
      @rows << row
      count = row.size
      @cols = count if count > @cols
      @cells += count
    end
    @rows
  end

  # Returns the next row as an array of cells, [] for a blank line, or nil
  # once the input is exhausted.
  def next_row
    held  = !@hold.empty?
    token = next_token
    if token.nil?
      # A nil taken from @hold, or one produced by consuming a line break, is
      # a blank line; only a nil with nothing left to scan is end of input.
      return held || matched? ? [] : nil
    end

    row = []
    row.push(*token)
    while (token = next_token)
      row.push(*token)
    end
    row
  end

  # Returns the next cell (String), a run of cells (Array), or nil at the end
  # of a row / end of input.
  def next_token
    return @hold.shift unless @hold.empty?

    if scan(@quoted) # quoted cell
      token = +""
      loop do
        chunk = scan_until(@quotes) || bomb("unclosed quote")
        token << chunk.delete_suffix(@quote)
        if scan(@quote) # doubled quote => one literal quote
          token << @quote
          next
        end
        break if scan(@eoc)
        @relax || bomb("invalid character after quote")
        token << @quote << (scan_until(@quotes) || bomb("bad inline quote"))
      end
      scan(@sep)
      @strip ? token.strip : token
    elsif (match = scan(@unquoted)) # unquoted cell(s), possibly many lines
      if check(@quote) && !match.chomp!(@sep) && !match.end_with?("\r", "\n")
        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
          match << (scan_until(@eoc) || bomb("stray quote"))
          scan(@sep)
        end
      end
      # queue each line's cells, with nil marking the end of a line
      match.split(@eol, -1).each_with_index do |line, i|
        if line.empty?
          @hold.push(nil)
        else
          @hold.push(nil) if i > 0
          cells = line.split(@sep, -1)
          @hold.push(@strip ? cells.map!(&:strip) : cells)
        end
      end
      @hold.shift
    elsif scan(@sep) # empty cell(s)
      match = scan(@seps)
      # a fresh string each time, so rows never share one mutable empty cell
      match ? match.split(@sep, -1) : +""
    else
      scan(@eol)
      nil
    end
  end

  # Yields each parsed row (parsing on first use); without a block an
  # Enumerator is returned.
  def each
    return enum_for(:each) unless block_given?

    @rows ||= parse
    @rows.each { |row| yield row }
  end

  # Writes every row to this instance's output, or, when options are given,
  # to a new writer configured with them.
  def export(**opts)
    target = opts.empty? ? self : self.class.writer(**opts)
    each { |row| target << row }
  end

  # ==[ Helpers ]==

  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
  def grok(str)
    idx = str.index(@escapes)
    return 0 unless idx
    return 2 if Regexp.last_match(1) # the first special character is a quote

    str.index(@quotes, idx) ? 2 : 1
  end

  # Formats one row and appends it, followed by the row separator, to the
  # output stream. Cells are converted with to_s; the caller's array is never
  # modified. Returns self so calls can be chained.
  def <<(row)
    row = row.map(&:to_s)

    # drop trailing empty columns (safe when every column is empty)
    row.pop while @drop && row.last&.empty?

    q = @quote
    cells =
      case @mode
      when :compact
        case @excel ? 2 : grok(row.join)
        when 0
          row
        when 1
          row.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
        else
          row.map do |col|
            if @excel && col.match?(@leadzero)
              "=#{q}#{col}#{q}"
            else
              case grok(col)
              when 0 then col
              when 1 then "#{q}#{col}#{q}"
              else "#{q}#{col.gsub(q, @esc)}#{q}"
              end
            end
          end
        end
      when :full
        row.map do |col|
          if @excel && col.match?(@leadzero)
            "=#{q}#{col}#{q}"
          else
            "#{q}#{col.gsub(q, @esc)}#{q}"
          end
        end
      else
        raise ArgumentError, "unknown export mode #{@mode.inspect}"
      end

    @out << cells.join(@sep) + @rowsep
    self
  end

  # Prints row/column/cell/byte counts, parsing first if that has not
  # happened yet.
  def stats
    parse unless @rows
    wide = string.bytesize.to_s.size
    puts "%#{wide}d rows" % @rows.size
    puts "%#{wide}d columns" % @cols
    puts "%#{wide}d cells" % @cells
    puts "%#{wide}d bytes" % string.bytesize
  end

  # Aborts the program with a message showing the character position and a
  # few characters of context around it.
  def bomb(msg)
    at   = charpos            # character (not byte) offset, to match String#[]
    from = [at - 4, 0].max    # a negative start would index from the end
    abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{string[from...(at + 3)]}'"
  end
end
249
+
250
if $0 == __FILE__
  # Demo when run as a script: take the first paragraph of the embedded DATA
  # section, parse it in relaxed Excel mode and re-export it to $stdout.
  # (alternative input: File.read(ARGV.first || "lc-2023.csv"))
  sample = DATA.gets("\n\n").chomp
  Censive.new(sample, excel: true, relax: true).export # (excel: true) # sep: "|")
end
256
+
257
+ __END__
258
+ "Don",="007",10,11,"Ed",20
259
+ Name,Age,,,Shoe,,,
260
+ "Alice",27,5
261
+ Bob,33,10 1/2
262
+ Charlie or "Chuck",=B2 + B3,9
263
+ Subtotal,=sum(B2:B5),="01234"
264
+
265
+ A,B,C,D
266
+ A,B,"C",D
267
+ A,B,C",D
268
+ A,B,"C",D
269
+
270
+ # first line works in "relax" mode, bottom line is compliant
271
+ 123,"CHO, JOELLE "JOJO"",456
272
+ 123,"CHO, JOELLE ""JOJO""",456
273
+
274
+ # Excel mode checking
275
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
276
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -0,0 +1,282 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 9, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
+ # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
19
+ #
20
+ # TODO:
21
+ # 1. Support IO streaming
22
+ # 2. Review all encodings, we may be losing speed when mixing encodings
23
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
+ # 4. Will using String#freeze give us a speed up?
25
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
26
+ # ============================================================================
27
+
28
+ require "strscan"
29
+
30
# Censive: a small CSV reader/writer built directly on StringScanner.
#
# Reading:  Censive.parse(text_or_path, **opts) returns an array of rows, each
#           row being an array of strings.
# Writing:  Censive.writer(target, **opts) returns an instance whose #<<
#           formats one row and appends it to the output stream.
class Censive < StringScanner
  attr_reader :encoding

  # One-shot parse; every argument is forwarded to Censive.new.
  def self.parse(...)
    new(...).parse
  end

  # Builds a writer for +obj+:
  #   String  - a path, opened for writing; the writer is yielded to the
  #             (required) block, the file is closed afterwards and the
  #             block's value is returned
  #   nil     - output goes to $stdout
  #   other   - any non-numeric object responding to << (IO, StringIO, ...)
  # For the last two the writer is returned (and also yielded when a block is
  # given). Anything else aborts the program.
  def self.writer(obj = nil, **opts, &code)
    case obj
    when String
      File.open(obj, "w") { |io| yield new(out: io, **opts) }
    else
      unless obj.nil? || (obj.respond_to?(:<<) && !obj.is_a?(Numeric))
        abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
      end
      writer = new(out: obj, **opts)
      yield writer if block_given?
      writer
    end
  end

  # +str+ is CSV text, or (when shorter than 100 characters and naming a
  # readable regular file) the path of a file to load.
  def initialize(str = nil,
    drop:     false,    # drop trailing empty columns?
    encoding: nil,      # character encoding
    excel:    false,    # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
    mode:     :compact, # export mode: compact or full
    out:      nil,      # output stream, needs to respond to <<
    quote:    '"',      # quote character
    relax:    false,    # relax quote parsing so ,"Fo"o, => ,"Fo""o",
    rowsep:   "\n",     # row separator for export
    sep:      ",",      # column separator character
    strip:    false,    # strip columns when reading
    **opts              # grab bag (accepted, currently unused)
  )
    # initialize data source
    if str && str.size < 100 && File.file?(str) && File.readable?(str)
      # block form so the file handle is closed as soon as it has been read
      str = File.open(str, encoding ? "r:#{encoding}" : "r", &:read)
    else
      str ||= ""
      str = str.encode(encoding) if encoding
    end
    super(str)

    # config options
    @drop   = drop
    @excel  = excel
    @mode   = mode
    @out    = out || $stdout
    @relax  = relax
    @strip  = strip

    # config strings
    @quote  = quote
    @rowsep = rowsep
    @sep    = sep

    # combinations
    @esc    = @quote * 2         # an escaped quote inside a quoted cell
    @seq    = "#{@sep}="         # used for parsing in excel mode

    # Regexes are compiled per instance (no /o flag, which would freeze the
    # first instance's separator and quote into every later instance) and the
    # configurable characters are escaped so that "|", "." etc. are literal.
    s = Regexp.escape(@sep)
    q = Regexp.escape(@quote)
    @eoc      = /(?=#{s}|\r|\n|\z)/          # end of cell
    @eol      = /\r\n?|\n/                   # end of line
    @escapes  = /(#{q})|#{s}|\r|\n/
    @quotable = /#{s}|\r|\n/
    @quotes   = /#{q}/
    @seps     = /(?:#{s})+/
    @quoted   = @excel ? /=?#{q}/ : @quote
    @unquoted = /[^#{s}\r\n][^#{q}\r\n]*/    # never crosses a line break
    @leadzero = /\A0\d*\z/

    reset
  end

  # Rewinds the scanner, optionally switching to a new source string, and
  # clears everything collected by previous parsing.
  def reset(str = nil)
    @rows  = nil
    @cols  = @cells = 0
    @cheat = true # start each pass on the fast line-splitting path again

    self.string = str if str
    @encoding = string.encoding
    super()
  end

  # ==[ Parser ]==

  # Reads rows from the current position to the end of the input and returns
  # them; also updates the counters reported by #stats.
  def parse
    @rows = []
    while (row = next_row)
      @rows << row
      count = row.size
      @cols = count if count > @cols
      @cells += count
    end
    @rows
  end

  # Returns the next row as an array of cells, [] for a blank line, or nil
  # once the input is exhausted.
  def next_row
    # Fast path: while every line seen so far is either quote-free or has
    # only cells wrapped in exactly one pair of quotes, split whole lines.
    if @cheat && (line = scan_until(@eol))
      line.chomp!
      row = line.split(@sep, -1)
      if line.include?(@quote)
        row.each do |col|
          next unless col.include?(@quote)
          next if col.count(@quote) == 2 && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)

          @cheat = false # anything else needs the real tokenizer from now on
          break
        end
      end
      return @strip ? row.each(&:strip!) : row if @cheat

      unscan # re-read this line with the tokenizer
    end

    token = next_token
    if token.nil?
      # next_token finished on scan(@eol): a successful match means a blank
      # line was consumed (same [] the fast path yields), otherwise the input
      # is exhausted.
      return matched? ? [] : nil
    end

    row = []
    row.push(*token)
    while (token = next_token)
      row.push(*token)
    end
    row
  end

  # Returns the next cell (String), a run of cells (Array), or nil at the end
  # of a row / end of input.
  def next_token
    if scan(@quoted) # quoted cell
      token = +""
      loop do
        chunk = scan_until(@quotes) || bomb("unclosed quote")
        token << chunk.delete_suffix(@quote)
        if scan(@quote) # doubled quote => one literal quote
          token << @quote
          next
        end
        break if scan(@eoc)
        @relax || bomb("invalid character after quote")
        token << @quote << (scan_until(@quotes) || bomb("bad inline quote"))
      end
      scan(@sep)
      @strip ? token.strip : token
    elsif (match = scan(@unquoted)) # unquoted cell(s)
      if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
        unless @excel && match.chomp!(@seq)   # unless an excel literal, fix it
          match << (scan_until(@eoc) || bomb("stray quote"))
          scan(@sep)
        end
      end
      tokens = match.split(@sep, -1)
      @strip ? tokens.map!(&:strip) : tokens
    elsif scan(@sep) # empty cell(s)
      match = scan(@seps)
      # a fresh string each time, so rows never share one mutable empty cell
      match ? match.split(@sep, -1) : +""
    else
      scan(@eol)
      nil
    end
  end

  # Yields each parsed row (parsing on first use); without a block an
  # Enumerator is returned.
  def each
    return enum_for(:each) unless block_given?

    @rows ||= parse
    @rows.each { |row| yield row }
  end

  # Writes every row to this instance's output, or, when options are given,
  # to a new writer configured with them.
  def export(**opts)
    target = opts.empty? ? self : self.class.writer(**opts)
    each { |row| target << row }
  end

  # ==[ Helpers ]==

  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
  def grok(str)
    idx = str.index(@escapes)
    return 0 unless idx
    return 2 if Regexp.last_match(1) # the first special character is a quote

    str.index(@quotes, idx) ? 2 : 1
  end

  # Formats one row and appends it, followed by the row separator, to the
  # output stream. Cells are converted with to_s; the caller's array is never
  # modified. Returns self so calls can be chained.
  def <<(row)
    row = row.map(&:to_s)

    # drop trailing empty columns (safe when every column is empty)
    row.pop while @drop && row.last&.empty?

    q = @quote
    cells =
      case @mode
      when :compact
        case @excel ? 2 : grok(row.join)
        when 0
          row
        when 1
          row.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
        else
          row.map do |col|
            if @excel && col.match?(@leadzero)
              "=#{q}#{col}#{q}"
            else
              case grok(col)
              when 0 then col
              when 1 then "#{q}#{col}#{q}"
              else "#{q}#{col.gsub(q, @esc)}#{q}"
              end
            end
          end
        end
      when :full
        row.map do |col|
          if @excel && col.match?(@leadzero)
            "=#{q}#{col}#{q}"
          else
            "#{q}#{col.gsub(q, @esc)}#{q}"
          end
        end
      else
        raise ArgumentError, "unknown export mode #{@mode.inspect}"
      end

    @out << cells.join(@sep) + @rowsep
    self
  end

  # Prints row/column/cell/byte counts, parsing first if that has not
  # happened yet.
  def stats
    parse unless @rows
    wide = string.bytesize.to_s.size
    puts "%#{wide}d rows" % @rows.size
    puts "%#{wide}d columns" % @cols
    puts "%#{wide}d cells" % @cells
    puts "%#{wide}d bytes" % string.bytesize
  end

  # Aborts the program with a message showing the character position and a
  # few characters of context around it.
  def bomb(msg)
    at   = charpos            # character (not byte) offset, to match String#[]
    from = [at - 4, 0].max    # a negative start would index from the end
    abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{string[from...(at + 3)]}'"
  end
end
253
+
254
if __FILE__ == $0
  # Demo when run as a script: read a cp932-encoded CSV file, parse it in
  # relaxed Excel mode and re-export it to $stdout. The path defaults to
  # "KEN_ALL.CSV" and may be overridden by the first command-line argument.
  # raw = DATA.gets("\n\n").chomp
  # raw = File.read(ARGV.first || "lc-2023.csv")
  path = ARGV.first || "KEN_ALL.CSV"
  raw  = File.read(path, encoding: "cp932") # File.read closes the file itself

  csv = Censive.new(raw, excel: true, relax: true)
  csv.export # (excel: true) # sep: "|")
end
262
+
263
+ __END__
264
+ "Don",="007",10,"Ed"
265
+ Name,Age,,,Shoe,,,
266
+ "Alice",27,5
267
+ Bob,33,10 1/2
268
+ Charlie or "Chuck",=B2 + B3,9
269
+ Subtotal,=sum(B2:B5),="01234"
270
+
271
+ A,B,C,D
272
+ A,B,"C",D
273
+ A,B,C",D
274
+ A,B,"C",D
275
+
276
+ # first line works in "relax" mode, bottom line is compliant
277
+ 123,"CHO, JOELLE "JOJO"",456
278
+ 123,"CHO, JOELLE ""JOJO""",456
279
+
280
+ # Excel mode checking
281
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
282
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off