censive 0.19 → 0.20

Sign up to get free protection for your applications and to get access to all the features.
data/lib/censive.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 5, 2023
7
+ # Date: Feb 10, 2023
8
8
  #
9
9
  # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
10
  # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
@@ -14,14 +14,22 @@
14
14
  # GOALS:
15
15
  # 1. Faster than Ruby's default CSV library
16
16
  # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
17
+ # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
+ # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
18
19
  #
19
- # TODO: Support IO streaming
20
+ # TODO:
21
+ # 1. Support IO streaming
22
+ # 2. Review all encodings, we may be losing speed when mixing encodings
23
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
+ # 4. Will using String#freeze give us a speed up?
25
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
20
26
  # ============================================================================
21
27
 
22
28
  require "strscan"
23
29
 
24
30
  class Censive < StringScanner
31
+ attr :encoding
32
+
25
33
  def self.parse(...)
26
34
  new(...).parse
27
35
  end
@@ -34,83 +42,84 @@ class Censive < StringScanner
34
42
  end
35
43
  end
36
44
 
37
- def initialize(str="",
38
- drop: false , # drop trailing empty fields?
39
- encoding: "utf-8" , # character encoding
45
+ def initialize(str=nil,
46
+ drop: false , # drop trailing empty columns?
47
+ encoding: nil , # character encoding
40
48
  excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
41
49
  mode: :compact, # export mode: compact or full
42
- out: $stdout , # output stream, needs to respond to <<
50
+ out: nil , # output stream, needs to respond to <<
43
51
  quote: '"' , # quote character
44
52
  relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
45
53
  rowsep: "\n" , # row separator for export
46
54
  sep: "," , # column separator character
47
- strip: false , # strip fields when reading
48
- **opts # grab bag
55
+ strip: false , # strip columns when reading
56
+ **opts # grab bag
49
57
  )
50
- # data source
51
- str = File.open(str, "r:#{encoding}").read if !str[100] && File.readable?(str)
58
+ # initialize data source
59
+ if str && str.size < 100 && File.readable?(str)
60
+ str = File.open(str, encoding ? "r:#{encoding}" : "r").read
61
+ else
62
+ str ||= ""
63
+ str = str.encode(encoding) if encoding
64
+ end
52
65
  super(str)
53
66
  reset
54
67
 
55
- # options
68
+ # config options
69
+ @cheat = true
56
70
  @drop = drop
71
+ @encoding = str.encoding
57
72
  @excel = excel
58
73
  @mode = mode
59
- @out = out
60
- @quote = quote
74
+ @out = out || $stdout
61
75
  @relax = relax
76
+ @strip = strip
77
+
78
+ # config strings
79
+ @quote = quote
62
80
  @rowsep = rowsep
63
81
  @sep = sep
64
- @strip = strip
65
82
 
66
- # definitions
67
- @cr = "\r"
68
- @lf = "\n"
69
- @es = ""
70
- @eq = "="
71
- @esc = (@quote * 2)
72
- @eol = /#{@cr}#{@lf}?|#{@lf}|\z/o # end of line
73
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
83
+ # static strings
84
+ @cr = "\r"
85
+ @lf = "\n"
86
+ @es = ""
87
+ @eq = "="
88
+
89
+ # combinations
90
+ @esc = (@quote * 2)
91
+ @seq = [@sep, @eq].join # used for parsing in excel mode
92
+
93
+ # regexes
94
+ @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
95
+ @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
96
+ @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
97
+ @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
+ @quotes = /#{@quote}/o
99
+ @seps = /#{@sep}+/o
100
+ @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
101
+ @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
102
+ @leadzero = /\A0\d*\z/
74
103
  end
75
104
 
76
105
  def reset(str=nil)
77
- self.string = str if str
78
- super()
79
106
  @rows = nil
80
107
  @cols = @cells = 0
81
- end
82
-
83
- # ==[ Lexer ]==
84
-
85
- def next_token
86
- excel = true if @excel && scan(@eq)
87
-
88
- if scan(@quote) # consume quoted cell
89
- token = ""
90
- while true
91
- token << (scan_until(/#{@quote}/o) or bomb "unclosed quote")[0..-2]
92
- token << @quote and next if scan(@quote)
93
- break if scan(@eoc)
94
- @relax or bomb "invalid character after quote"
95
- token << @quote + (scan_until(/#{@quote}/o) or bomb "bad inline quote")
96
- end
97
- elsif scan(@sep) then return @es
98
- elsif scan(@eol) then return nil
99
- else # consume unquoted cell
100
- token = scan_until(@eoc) or bomb "unexpected character"
101
- token.prepend(@eq) if excel
102
- end
103
- scan(@sep)
104
- @strip ? token.strip : token
105
- end
106
108
 
107
- def bomb(msg)
108
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
109
+ self.string = str if str
110
+ @encoding = string.encoding
111
+ super()
109
112
  end
110
113
 
111
114
  # ==[ Parser ]==
112
115
 
113
116
  def parse
117
+
118
+ # TODO: crazy optimization if NO QUOTES in rest
119
+ # unless rest.include?(@quote)
120
+ # @rows = rest...
121
+ # end
122
+
114
123
  @rows = []
115
124
  while row = next_row
116
125
  @rows << row
@@ -122,18 +131,71 @@ class Censive < StringScanner
122
131
  end
123
132
 
124
133
  def next_row
134
+ if @cheat and line = scan_until(@eol)
135
+ row = line.chomp!.split(@sep, -1)
136
+ row.each do |col|
137
+ next if (saw = col.count(@quote)).zero?
138
+ next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
139
+ @cheat = false
140
+ break
141
+ end if line.include?(@quote)
142
+ @cheat and return @strip ? row.each(&:strip!) : row
143
+ unscan
144
+ end
145
+
125
146
  token = next_token or return
126
- row = [token]
127
- row << token while token = next_token
147
+ row = []
148
+ row.push(*token)
149
+ row.push(*token) while token = next_token
128
150
  row
129
151
  end
130
152
 
153
# Scan the next cell, or run of cells, at the current scanner position.
# Returns a String for a quoted cell or a single empty cell, an Array of
# Strings when the consumed text was split on @sep, and nil when none of the
# cell patterns match (after attempting to consume an end-of-line).
+ def next_token
154
+ if scan(@quoted) # quoted cell
155
+ token = ""
156
+ while true
157
# append text through the next quote; [0..-2] drops that quote character
+ token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
158
# a quote directly after the closing quote is an escaped quote: keep one, continue
+ token << @quote and next if scan(@quote)
159
# an end-of-cell boundary here finishes the quoted cell
+ scan(@eoc) and break
160
+ @relax or bomb "invalid character after quote"
161
# relax mode: keep the inline quote plus everything through the next quote
+ token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
162
+ end
163
# consume the separator that follows the cell, if there is one
+ scan(@sep)
164
+ @strip ? token.strip : token
165
+ elsif match = scan(@unquoted) # unquoted cell(s)
166
# chomp!(@sep) succeeding means the match ended on a separator right before a
# quote, so the quoted cell is left for the next call
+ if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
167
+ unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
168
+ match << (scan_until(@eoc) or bomb "stray quote")
169
+ scan(@sep)
170
+ end
171
+ end
172
# limit -1 keeps trailing empty cells
+ tokens = match.split(@sep, -1)
173
+ @strip ? tokens.map!(&:strip) : tokens
174
+ elsif scan(@sep)
175
+ match = scan(@seps)
176
# @es is the empty string: a lone separator yields one empty cell
+ match ? match.split(@sep, -1) : @es
177
+ else
178
+ scan(@eol)
179
+ nil
180
+ end
181
+ end
182
+
183
# Yield each parsed row in order, parsing on first use (the rows are cached in
# @rows). Without a block, return an Enumerator over the rows instead of
# raising LocalJumpError.
def each
  return to_enum(:each) unless block_given?

  @rows ||= parse
  @rows.each { |row| yield row }
end
187
+
188
# Send every row to a writer: this object when no options are given,
# otherwise a new writer built by the class from those options.
def export(**opts)
  sink =
    if opts.empty?
      self
    else
      self.class.writer(**opts)
    end
  each do |row|
    sink << row
  end
end
192
+
131
193
  # ==[ Helpers ]==
132
194
 
133
195
  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
134
196
  def grok(str)
135
- if idx = str.index(/(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o)
136
- $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
197
+ if idx = str.index(@escapes)
198
+ $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
137
199
  else
138
200
  0
139
201
  end
@@ -153,11 +215,11 @@ class Censive < StringScanner
153
215
  row
154
216
  when 1
155
217
  row.map do |col|
156
- col.match?(/#{"\\"+@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
218
+ col.match?(@quotable) ? "#{q}#{col}#{q}" : col
157
219
  end
158
220
  else
159
221
  row.map do |col|
160
- @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
222
+ @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
161
223
  case grok(col)
162
224
  when 0 then col
163
225
  when 1 then "#{q}#{col}#{q}"
@@ -168,7 +230,7 @@ class Censive < StringScanner
168
230
  when :full
169
231
  if @excel
170
232
  row.map do |col|
171
- col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
233
+ col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
172
234
  end
173
235
  else
174
236
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
@@ -178,16 +240,6 @@ class Censive < StringScanner
178
240
  @out << out + @rowsep
179
241
  end
180
242
 
181
- def each
182
- @rows ||= parse
183
- @rows.each {|row| yield row }
184
- end
185
-
186
- def export(**opts)
187
- out = opts.empty? ? self : self.class.writer(**opts)
188
- each {|row| out << row }
189
- end
190
-
191
243
  def stats
192
244
  wide = string.size.to_s.size
193
245
  puts "%#{wide}d rows" % @rows.size
@@ -195,27 +247,8 @@ class Censive < StringScanner
195
247
  puts "%#{wide}d cells" % @cells
196
248
  puts "%#{wide}d bytes" % string.size
197
249
  end
198
- end
199
250
 
200
- if __FILE__ == $0
201
- raw = DATA.gets("\n\n").chomp
202
- # raw = File.read(ARGV.first || "lc-2023.csv")
203
- csv = Censive.new(raw, excel: true, relax: true)
204
- csv.export # (sep: ",", excel: true)
251
# Abort the program with a parse error message that names the scanner's
# character position and quotes up to 7 characters of nearby input.
#
# The snippet start is clamped at 0: a negative start would index from the end
# of the input and show unrelated text when the error is within the first four
# characters. charpos is used because String#[] indexes by character.
def bomb(msg)
  at = charpos
  near = string[[at - 4, 0].max, 7]
  abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{near}'"
end
205
254
  end
206
-
207
- __END__
208
- Name,Age,Shoe
209
- Alice,27,5
210
- Bob,33,10 1/2
211
- Charlie or "Chuck",=B2 + B3,9
212
- "Doug E Fresh",="007",10
213
- Subtotal,=sum(B2:B5),="01234"
214
-
215
- # first line works in "relax" mode, bottom line is compliant
216
- 123,"CHO, JOELLE "JOJO"",456
217
- 123,"CHO, JOELLE ""JOJO""",456
218
-
219
- # Excel mode checking
220
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
221
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -0,0 +1,266 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 8, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
+ #
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Review all encodings, we may be losing speed when mixing encodings
22
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 4. Will using String#freeze give us a speed up?
24
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
25
+ # ============================================================================
26
+
27
+ require "strscan"
28
+
29
# Censive: a small CSV reader/writer layered directly on StringScanner.
#
#   Censive.parse("a,b\n1,2\n")              # => [["a", "b"], ["1", "2"]]
#   Censive.writer($stdout) << ["a", "b,c"]  # prints: a,"b,c"
class Censive < StringScanner
  attr_reader :encoding

  # Build an instance from the given arguments and return its parsed rows.
  def self.parse(...)
    new(...).parse
  end

  # Build a writer for +obj+.
  #
  # * String: treated as a file path; the file is opened for writing, the
  #   writer is yielded to the required block, and the file is closed after.
  # * IO or nil: returns a writer on that stream (nil means $stdout).
  # * anything else responding to << (e.g. StringIO): returns a writer on it.
  #
  # Aborts the program for any other object.
  def self.writer(obj = nil, **opts, &code)
    case obj
    when String
      block_given? or raise ArgumentError, "a block is required when writing to a file path"
      File.open(obj, "w") { |io| yield new(out: io, **opts) }
    when IO, nil
      new(out: obj, **opts)
    else
      obj.respond_to?(:<<) or abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
      new(out: obj, **opts)
    end
  end

  # str      - CSV text, or (when shorter than 100 characters and naming a
  #            readable regular file) a path whose contents are read
  # drop     - drop trailing empty columns when writing a row?
  # encoding - character encoding used to read the file / encode the text
  # excel    - literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
  # mode     - export mode: :compact or :full
  # out      - output stream, needs to respond to << (defaults to $stdout)
  # quote    - quote character
  # relax    - relax quote parsing so ,"Fo"o, => ,"Fo""o",
  # rowsep   - row separator for export
  # sep      - column separator character
  # strip    - strip columns when reading?
  # opts     - grab bag (accepted and ignored)
  def initialize(str = nil,
                 drop: false,
                 encoding: nil,
                 excel: false,
                 mode: :compact,
                 out: nil,
                 quote: '"',
                 relax: false,
                 rowsep: "\n",
                 sep: ",",
                 strip: false,
                 **opts)
    # initialize data source; File.file? keeps a directory name from being
    # opened, and the block form closes the handle as soon as it is read
    if str && str.size < 100 && File.file?(str) && File.readable?(str)
      str = File.open(str, encoding ? "r:#{encoding}" : "r", &:read)
    else
      str ||= ""
      str = str.encode(encoding) if encoding
    end
    super(str)
    reset # also records @encoding from the scanned string

    # config options
    @drop  = drop
    @excel = excel
    @mode  = mode
    @out   = out || $stdout
    @relax = relax
    @strip = strip

    # config strings
    @quote  = quote
    @rowsep = rowsep
    @sep    = sep

    # static strings
    @cr = "\r"
    @lf = "\n"
    @es = ""
    @eq = "="

    # combinations
    @esc = @quote * 2
    @seq = "#{@sep}#{@eq}" # used for parsing in excel mode

    # regexes: built per instance (no /o, which would freeze the first
    # instance's sep/quote into every later one) and with sep/quote escaped so
    # characters such as | . * ^ ] are matched literally
    q = Regexp.escape(@quote)
    s = Regexp.escape(@sep)
    @eoc      = /(?=#{s}|\r|\n|\z)/ # end of cell
    @eol      = /\r\n?|\n/          # end of line
    @escapes  = /(#{q})|#{s}|\r|\n/
    @quotable = /#{s}|\r|\n/
    @quotes   = /#{q}/
    @seps     = /(?:#{s})+/
    @quoted   = @excel ? /=?#{q}/ : @quote
    @unquoted = /[^#{s}\r\n][^#{q}\r\n]*/
    @leadzero = /\A0\d*\z/
  end

  # Forget parsed rows and counters, optionally swap in a new string, and
  # rewind the scanner.
  def reset(str = nil)
    @rows = nil
    @cols = @cells = 0

    self.string = str if str
    @encoding = string.encoding
    super()
  end

  # ==[ Parser ]==

  # Parse rows until #next_row returns nil; returns (and caches) the rows
  # while tracking the widest row (@cols) and the total cell count (@cells).
  def parse
    @rows = []
    while (row = next_row)
      @rows << row
      count = row.size
      @cols = count if count > @cols
      @cells += count
    end
    @rows
  end

  # Collect tokens into one row; nil when the first token is already nil.
  def next_row
    token = next_token or return
    row = []
    row.push(*token)
    row.push(*token) while (token = next_token)
    row
  end

  # Scan the next cell (String), run of cells (Array), or nil at end of row.
  def next_token
    if scan(@quoted) # quoted cell
      token = +""
      loop do
        chunk = scan_until(@quotes) || bomb("unclosed quote")
        token << chunk.delete_suffix(@quote)
        if scan(@quote) # doubled quote => one literal quote
          token << @quote
          next
        end
        break if scan(@eoc)
        @relax || bomb("invalid character after quote")
        token << @quote + (scan_until(@quotes) || bomb("bad inline quote"))
      end
      scan(@sep)
      @strip ? token.strip : token
    elsif (match = scan(@unquoted)) # unquoted cell(s)
      if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
          match << (scan_until(@eoc) || bomb("stray quote"))
          scan(@sep)
        end
      end
      tokens = match.split(@sep, -1)
      @strip ? tokens.map!(&:strip) : tokens
    elsif scan(@sep)
      match = scan(@seps)
      # dup so rows never share (and cannot corrupt) the @es instance string
      match ? match.split(@sep, -1) : @es.dup
    else
      scan(@eol)
      nil
    end
  end

  # Yield each parsed row (parsing on first use); without a block, return an
  # Enumerator over the rows.
  def each
    return to_enum(:each) unless block_given?

    @rows ||= parse
    @rows.each { |row| yield row }
  end

  # Write every row to this object, or to a new writer built from +opts+.
  def export(**opts)
    out = opts.empty? ? self : self.class.writer(**opts)
    each { |row| out << row }
  end

  # ==[ Helpers ]==

  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
  def grok(str)
    idx = str.index(@escapes) or return 0
    return 2 if Regexp.last_match(1) # the first special character is a quote

    str.index(@quotes, idx) ? 2 : 1
  end

  # output a row
  def <<(row)
    # drop trailing empty columns; &. stops cleanly once the row is empty
    # instead of calling empty? on nil
    row.pop while row.last&.empty? if @drop

    s, q = @sep, @quote
    cols =
      case @mode
      when :compact
        case @excel ? 2 : grok(row.join)
        when 0
          row
        when 1
          row.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
        else
          row.map do |col|
            next "=#{q}#{col}#{q}" if @excel && col.match?(@leadzero)

            case grok(col)
            when 0 then col
            when 1 then "#{q}#{col}#{q}"
            else "#{q}#{col.gsub(q, @esc)}#{q}"
            end
          end
        end
      when :full
        row.map do |col|
          if @excel && col.match?(@leadzero)
            "=#{q}#{col}#{q}"
          else
            "#{q}#{col.gsub(q, @esc)}#{q}"
          end
        end
      else
        raise ArgumentError, "unknown export mode #{@mode.inspect}"
      end

    @out << cols.join(s) + @rowsep
  end

  # Print row, column, cell and byte counts (parses first if needed).
  def stats
    @rows ||= parse
    bytes = string.bytesize # size would count characters, not bytes
    wide = bytes.to_s.size
    puts "%#{wide}d rows" % @rows.size
    puts "%#{wide}d columns" % @cols
    puts "%#{wide}d cells" % @cells
    puts "%#{wide}d bytes" % bytes
  end

  # Abort with a parse error naming the character position and nearby text.
  # The snippet start is clamped at 0 so errors near the beginning do not show
  # text taken from the end of the input.
  def bomb(msg)
    at = charpos
    near = string[[at - 4, 0].max, 7]
    abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{near}'"
  end
end
239
+
240
# Demo, run only when this file is executed directly: read the first
# blank-line-delimited sample after __END__, parse it in excel + relax mode,
# and export it to the default output.
if $PROGRAM_NAME == __FILE__
  sample = DATA.gets("\n\n").chomp
  # sample = File.read(ARGV.first || "lc-2023.csv")
  Censive.new(sample, excel: true, relax: true).export # (excel: true) # sep: "|")
end
246
+
247
+ __END__
248
+ "Don",="007",10,"Ed"
249
+ Name,Age,,,Shoe,,,
250
+ "Alice",27,5
251
+ Bob,33,10 1/2
252
+ Charlie or "Chuck",=B2 + B3,9
253
+ Subtotal,=sum(B2:B5),="01234"
254
+
255
+ A,B,C,D
256
+ A,B,"C",D
257
+ A,B,C",D
258
+ A,B,"C",D
259
+
260
+ # first line works in "relax" mode, bottom line is compliant
261
+ 123,"CHO, JOELLE "JOJO"",456
262
+ 123,"CHO, JOELLE ""JOJO""",456
263
+
264
+ # Excel mode checking
265
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
266
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off