censive 0.18 → 0.20

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
data/lib/censive.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 5, 2023
7
+ # Date: Feb 10, 2023
8
8
  #
9
9
  # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
10
  # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
@@ -14,14 +14,22 @@
14
14
  # GOALS:
15
15
  # 1. Faster than Ruby's default CSV library
16
16
  # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
17
+ # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
+ # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
18
19
  #
19
- # TODO: Support IO streaming
20
+ # TODO:
21
+ # 1. Support IO streaming
22
+ # 2. Review all encodings, we may be losing speed when mixing encodings
23
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
+ # 4. Will using String#freeze give us a speed up?
25
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
20
26
  # ============================================================================
21
27
 
22
28
  require "strscan"
23
29
 
24
30
  class Censive < StringScanner
31
+ attr :encoding
32
+
25
33
  def self.parse(...)
26
34
  new(...).parse
27
35
  end
@@ -34,86 +42,84 @@ class Censive < StringScanner
34
42
  end
35
43
  end
36
44
 
37
- def initialize(str="",
38
- drop: false , # drop trailing empty fields?
39
- encoding: "utf-8" , # character encoding
45
+ def initialize(str=nil,
46
+ drop: false , # drop trailing empty columns?
47
+ encoding: nil , # character encoding
40
48
  excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
41
49
  mode: :compact, # export mode: compact or full
42
- out: $stdout , # output stream, needs to respond to <<
50
+ out: nil , # output stream, needs to respond to <<
43
51
  quote: '"' , # quote character
44
52
  relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
45
53
  rowsep: "\n" , # row separator for export
46
54
  sep: "," , # column separator character
47
- strip: false , # strip fields when reading
48
- **opts # grab bag
55
+ strip: false , # strip columns when reading
56
+ **opts # grab bag
49
57
  )
50
- # options
58
+ # initialize data source
59
+ if str && str.size < 100 && File.readable?(str)
60
+ str = File.open(str, encoding ? "r:#{encoding}" : "r").read
61
+ else
62
+ str ||= ""
63
+ str = str.encode(encoding) if encoding
64
+ end
65
+ super(str)
66
+ reset
67
+
68
+ # config options
69
+ @cheat = true
51
70
  @drop = drop
52
- @encoding = encoding
71
+ @encoding = str.encoding
53
72
  @excel = excel
54
73
  @mode = mode
55
- @out = out
56
- @quote = quote
74
+ @out = out || $stdout
57
75
  @relax = relax
76
+ @strip = strip
77
+
78
+ # config strings
79
+ @quote = quote
58
80
  @rowsep = rowsep
59
81
  @sep = sep
60
- @strip = strip
61
82
 
62
- # definitions
63
- @cr = "\r"
64
- @lf = "\n"
65
- @es = ""
66
- @eq = "="
67
- @esc = (@quote * 2)
68
- @eol = /#{@cr}#{@lf}?|#{@lf}|\z/o # end of line
69
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
70
-
71
- # data source
72
- if str.size < 100 && File.readable?(str)
73
- str = File.open(str, "r:#{encoding}").read
74
- end
75
- super(str)
76
- reset
83
+ # static strings
84
+ @cr = "\r"
85
+ @lf = "\n"
86
+ @es = ""
87
+ @eq = "="
88
+
89
+ # combinations
90
+ @esc = (@quote * 2)
91
+ @seq = [@sep, @eq].join # used for parsing in excel mode
92
+
93
+ # regexes
94
+ @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
95
+ @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
96
+ @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
97
+ @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
+ @quotes = /#{@quote}/o
99
+ @seps = /#{@sep}+/o
100
+ @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
101
+ @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
102
+ @leadzero = /\A0\d*\z/
77
103
  end
78
104
 
79
105
  def reset(str=nil)
80
- self.string = str if str
81
- super()
82
106
  @rows = nil
83
107
  @cols = @cells = 0
84
- end
85
108
 
86
- # ==[ Lexer ]==
87
-
88
- def next_token
89
- excel = true if @excel && scan(@eq)
90
-
91
- if scan(@quote) # consume quoted cell
92
- token = ""
93
- while true
94
- token << (scan_until(/#{@quote}/o) or bomb "unclosed quote")[0..-2]
95
- token << @quote and next if scan(@quote)
96
- break if scan(@eoc)
97
- @relax or bomb "invalid character after quote"
98
- token << @quote + (scan_until(/#{@quote}/o) or bomb "bad inline quote")
99
- end
100
- elsif scan(@sep) then return @es
101
- elsif scan(@eol) then return nil
102
- else # consume unquoted cell
103
- token = scan_until(@eoc) or bomb "unexpected character"
104
- token.prepend(@eq) if excel
105
- end
106
- scan(@sep)
107
- @strip ? token.strip : token
108
- end
109
-
110
- def bomb(msg)
111
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
109
+ self.string = str if str
110
+ @encoding = string.encoding
111
+ super()
112
112
  end
113
113
 
114
114
  # ==[ Parser ]==
115
115
 
116
116
  def parse
117
+
118
+ # TODO: crazy optimization if NO QUOTES in rest
119
+ # unless rest.include?(@quote)
120
+ # @rows = rest...
121
+ # end
122
+
117
123
  @rows = []
118
124
  while row = next_row
119
125
  @rows << row
@@ -125,18 +131,71 @@ class Censive < StringScanner
125
131
  end
126
132
 
127
133
  def next_row
134
+ if @cheat and line = scan_until(@eol)
135
+ row = line.chomp!.split(@sep, -1)
136
+ row.each do |col|
137
+ next if (saw = col.count(@quote)).zero?
138
+ next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
139
+ @cheat = false
140
+ break
141
+ end if line.include?(@quote)
142
+ @cheat and return @strip ? row.each(&:strip!) : row
143
+ unscan
144
+ end
145
+
128
146
  token = next_token or return
129
- row = [token]
130
- row << token while token = next_token
147
+ row = []
148
+ row.push(*token)
149
+ row.push(*token) while token = next_token
131
150
  row
132
151
  end
133
152
 
153
+ def next_token
154
+ if scan(@quoted) # quoted cell
155
+ token = ""
156
+ while true
157
+ token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
158
+ token << @quote and next if scan(@quote)
159
+ scan(@eoc) and break
160
+ @relax or bomb "invalid character after quote"
161
+ token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
162
+ end
163
+ scan(@sep)
164
+ @strip ? token.strip : token
165
+ elsif match = scan(@unquoted) # unquoted cell(s)
166
+ if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
167
+ unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
168
+ match << (scan_until(@eoc) or bomb "stray quote")
169
+ scan(@sep)
170
+ end
171
+ end
172
+ tokens = match.split(@sep, -1)
173
+ @strip ? tokens.map!(&:strip) : tokens
174
+ elsif scan(@sep)
175
+ match = scan(@seps)
176
+ match ? match.split(@sep, -1) : @es
177
+ else
178
+ scan(@eol)
179
+ nil
180
+ end
181
+ end
182
+
183
+ def each
184
+ @rows ||= parse
185
+ @rows.each {|row| yield row }
186
+ end
187
+
188
+ def export(**opts)
189
+ out = opts.empty? ? self : self.class.writer(**opts)
190
+ each {|row| out << row }
191
+ end
192
+
134
193
  # ==[ Helpers ]==
135
194
 
136
195
  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
137
196
  def grok(str)
138
- if idx = str.index(/(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o)
139
- $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
197
+ if idx = str.index(@escapes)
198
+ $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
140
199
  else
141
200
  0
142
201
  end
@@ -156,11 +215,11 @@ class Censive < StringScanner
156
215
  row
157
216
  when 1
158
217
  row.map do |col|
159
- col.match?(/#{"\\"+@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
218
+ col.match?(@quotable) ? "#{q}#{col}#{q}" : col
160
219
  end
161
220
  else
162
221
  row.map do |col|
163
- @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
222
+ @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
164
223
  case grok(col)
165
224
  when 0 then col
166
225
  when 1 then "#{q}#{col}#{q}"
@@ -171,7 +230,7 @@ class Censive < StringScanner
171
230
  when :full
172
231
  if @excel
173
232
  row.map do |col|
174
- col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
233
+ col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
175
234
  end
176
235
  else
177
236
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
@@ -181,16 +240,6 @@ class Censive < StringScanner
181
240
  @out << out + @rowsep
182
241
  end
183
242
 
184
- def each
185
- @rows ||= parse
186
- @rows.each {|row| yield row }
187
- end
188
-
189
- def export(**opts)
190
- out = opts.empty? ? self : self.class.writer(**opts)
191
- each {|row| out << row }
192
- end
193
-
194
243
  def stats
195
244
  wide = string.size.to_s.size
196
245
  puts "%#{wide}d rows" % @rows.size
@@ -198,27 +247,8 @@ class Censive < StringScanner
198
247
  puts "%#{wide}d cells" % @cells
199
248
  puts "%#{wide}d bytes" % string.size
200
249
  end
201
- end
202
250
 
203
- if __FILE__ == $0
204
- raw = DATA.gets("\n\n").chomp
205
- # raw = File.read(ARGV.first || "lc-2023.csv")
206
- csv = Censive.new(raw, excel: true, relax: true)
207
- csv.export # (sep: ",", excel: true)
251
+ def bomb(msg)
252
+ abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
253
+ end
208
254
  end
209
-
210
- __END__
211
- Name,Age,Shoe
212
- Alice,27,5
213
- Bob,33,10 1/2
214
- Charlie or "Chuck",=B2 + B3,9
215
- "Doug E Fresh",="007",10
216
- Subtotal,=sum(B2:B5),="01234"
217
-
218
- # first line works in "relax" mode, bottom line is compliant
219
- 123,"CHO, JOELLE "JOJO"",456
220
- 123,"CHO, JOELLE ""JOJO""",456
221
-
222
- # Excel mode checking
223
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
224
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -0,0 +1,266 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 8, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
+ #
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Review all encodings, we may be losing speed when mixing encodings
22
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 4. Will using String#freeze give us a speed up?
24
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
25
+ # ============================================================================
26
+
27
+ require "strscan"
28
+
29
+ class Censive < StringScanner
30
+ attr :encoding
31
+
32
+ def self.parse(...)
33
+ new(...).parse
34
+ end
35
+
36
+ def self.writer(obj=nil, **opts, &code)
37
+ case obj
38
+ when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
39
+ when IO,nil then new(out: obj, **opts, &code)
40
+ else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
41
+ end
42
+ end
43
+
44
+ def initialize(str=nil,
45
+ drop: false , # drop trailing empty fields?
46
+ encoding: nil , # character encoding
47
+ excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
48
+ mode: :compact, # export mode: compact or full
49
+ out: nil , # output stream, needs to respond to <<
50
+ quote: '"' , # quote character
51
+ relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
52
+ rowsep: "\n" , # row separator for export
53
+ sep: "," , # column separator character
54
+ strip: false , # strip fields when reading
55
+ **opts # grab bag
56
+ )
57
+ # initialize data source
58
+ if str && str.size < 100 && File.readable?(str)
59
+ str = File.open(str, encoding ? "r:#{encoding}" : "r").read
60
+ else
61
+ str ||= ""
62
+ str = str.encode(encoding) if encoding
63
+ end
64
+ super(str)
65
+ reset
66
+
67
+ # config options
68
+ @drop = drop
69
+ @encoding = str.encoding
70
+ @excel = excel
71
+ @mode = mode
72
+ @out = out || $stdout
73
+ @relax = relax
74
+ @strip = strip
75
+
76
+ # config strings
77
+ @quote = quote
78
+ @rowsep = rowsep
79
+ @sep = sep
80
+
81
+ # static strings
82
+ @cr = "\r"
83
+ @lf = "\n"
84
+ @es = ""
85
+ @eq = "="
86
+
87
+ # combinations
88
+ @esc = (@quote * 2)
89
+ @seq = [@sep, @eq].join # used for parsing in excel mode
90
+
91
+ #!# TODO: come up with a clean way to escape/encode all this
92
+ #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
93
+
94
+ # regexes
95
+ @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
96
+ @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
97
+ @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
+ @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
99
+ @quotes = /#{@quote}/o
100
+ @seps = /#{@sep}+/o
101
+ @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
102
+ @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
103
+ @leadzero = /\A0\d*\z/
104
+ end
105
+
106
+ def reset(str=nil)
107
+ @rows = nil
108
+ @cols = @cells = 0
109
+
110
+ #!# TODO: reset all encodings?
111
+ self.string = str if str
112
+ @encoding = string.encoding
113
+ super()
114
+ end
115
+
116
+ # ==[ Parser ]==
117
+
118
+ def parse
119
+ @rows = []
120
+ while row = next_row
121
+ @rows << row
122
+ count = row.size
123
+ @cols = count if count > @cols
124
+ @cells += count
125
+ end
126
+ @rows
127
+ end
128
+
129
+ def next_row
130
+ token = next_token or return
131
+ row = []
132
+ row.push(*token)
133
+ row.push(*token) while token = next_token
134
+ row
135
+ end
136
+
137
+ def next_token
138
+ if scan(@quoted) # quoted cell
139
+ token = ""
140
+ while true
141
+ token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
142
+ token << @quote and next if scan(@quote)
143
+ scan(@eoc) and break
144
+ @relax or bomb "invalid character after quote"
145
+ token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
146
+ end
147
+ scan(@sep)
148
+ @strip ? token.strip : token
149
+ elsif match = scan(@unquoted) # unquoted cell(s)
150
+ if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
151
+ unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
152
+ match << (scan_until(@eoc) or bomb "stray quote")
153
+ scan(@sep)
154
+ end
155
+ end
156
+ tokens = match.split(@sep, -1)
157
+ @strip ? tokens.map!(&:strip) : tokens
158
+ elsif scan(@sep)
159
+ match = scan(@seps)
160
+ match ? match.split(@sep, -1) : @es
161
+ else
162
+ scan(@eol)
163
+ nil
164
+ end
165
+ end
166
+
167
+ def each
168
+ @rows ||= parse
169
+ @rows.each {|row| yield row }
170
+ end
171
+
172
+ def export(**opts)
173
+ out = opts.empty? ? self : self.class.writer(**opts)
174
+ each {|row| out << row }
175
+ end
176
+
177
+ # ==[ Helpers ]==
178
+
179
+ # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
180
+ def grok(str)
181
+ if idx = str.index(@escapes)
182
+ $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
183
+ else
184
+ 0
185
+ end
186
+ end
187
+
188
+ # output a row
189
+ def <<(row)
190
+
191
+ # drop trailing empty columns
192
+ row.pop while row.last.empty? if @drop
193
+
194
+ s,q = @sep, @quote
195
+ out = case @mode
196
+ when :compact
197
+ case @excel ? 2 : grok(row.join)
198
+ when 0
199
+ row
200
+ when 1
201
+ row.map do |col|
202
+ col.match?(@quotable) ? "#{q}#{col}#{q}" : col
203
+ end
204
+ else
205
+ row.map do |col|
206
+ @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
207
+ case grok(col)
208
+ when 0 then col
209
+ when 1 then "#{q}#{col}#{q}"
210
+ else "#{q}#{col.gsub(q, @esc)}#{q}"
211
+ end
212
+ end
213
+ end
214
+ when :full
215
+ if @excel
216
+ row.map do |col|
217
+ col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
218
+ end
219
+ else
220
+ row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
221
+ end
222
+ end.join(s)
223
+
224
+ @out << out + @rowsep
225
+ end
226
+
227
+ def stats
228
+ wide = string.size.to_s.size
229
+ puts "%#{wide}d rows" % @rows.size
230
+ puts "%#{wide}d columns" % @cols
231
+ puts "%#{wide}d cells" % @cells
232
+ puts "%#{wide}d bytes" % string.size
233
+ end
234
+
235
+ def bomb(msg)
236
+ abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
237
+ end
238
+ end
239
+
240
+ if __FILE__ == $0
241
+ raw = DATA.gets("\n\n").chomp
242
+ # raw = File.read(ARGV.first || "lc-2023.csv")
243
+ csv = Censive.new(raw, excel: true, relax: true)
244
+ csv.export # (excel: true) # sep: "|")
245
+ end
246
+
247
+ __END__
248
+ "Don",="007",10,"Ed"
249
+ Name,Age,,,Shoe,,,
250
+ "Alice",27,5
251
+ Bob,33,10 1/2
252
+ Charlie or "Chuck",=B2 + B3,9
253
+ Subtotal,=sum(B2:B5),="01234"
254
+
255
+ A,B,C,D
256
+ A,B,"C",D
257
+ A,B,C",D
258
+ A,B,"C",D
259
+
260
+ # first line works in "relax" mode, bottom line is compliant
261
+ 123,"CHO, JOELLE "JOJO"",456
262
+ 123,"CHO, JOELLE ""JOJO""",456
263
+
264
+ # Excel mode checking
265
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
266
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off