censive 0.19 → 0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/censive.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 5, 2023
7
+ # Date: Feb 10, 2023
8
8
  #
9
9
  # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
10
  # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
@@ -14,14 +14,22 @@
14
14
  # GOALS:
15
15
  # 1. Faster than Ruby's default CSV library
16
16
  # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
17
+ # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
+ # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
18
19
  #
19
- # TODO: Support IO streaming
20
+ # TODO:
21
+ # 1. Support IO streaming
22
+ # 2. Review all encodings, we may be losing speed when mixing encodings
23
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
+ # 4. Will using String#freeze give us a speed up?
25
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
20
26
  # ============================================================================
21
27
 
22
28
  require "strscan"
23
29
 
24
30
  class Censive < StringScanner
31
+ attr :encoding
32
+
25
33
  def self.parse(...)
26
34
  new(...).parse
27
35
  end
@@ -34,83 +42,84 @@ class Censive < StringScanner
34
42
  end
35
43
  end
36
44
 
37
# Build a scanner over CSV text, or over the contents of a file.
#
# str - CSV text, or (when shorter than 100 characters and naming a readable
#       regular file) a path whose contents are loaded. nil means "".
#
# All remaining options are keywords; unknown keywords are accepted and
# ignored via **opts.
def initialize(str=nil,
  drop:     false   , # drop trailing empty columns?
  encoding: nil     , # character encoding
  excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
  mode:     :compact, # export mode: compact or full
  out:      nil     , # output stream, needs to respond to <<
  quote:    '"'     , # quote character
  relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
  rowsep:   "\n"    , # row separator for export
  sep:      ","     , # column separator character
  strip:    false   , # strip columns when reading
  **opts              # grab bag
)
  # initialize data source; File.file? keeps directory names from being
  # opened, and File.read closes the handle (File.open(...).read leaked it)
  if str && str.size < 100 && !str.include?("\0") && File.file?(str) && File.readable?(str)
    str = File.read(str, mode: encoding ? "r:#{encoding}" : "r")
  else
    str ||= ""
    str = str.encode(encoding) if encoding
  end
  super(str)
  reset

  # config options
  @cheat    = true # enables the line-at-a-time fast path used by next_row
  @drop     = drop
  @encoding = str.encoding
  @excel    = excel
  @mode     = mode
  @out      = out || $stdout
  @relax    = relax
  @strip    = strip

  # config strings
  @quote    = quote
  @rowsep   = rowsep
  @sep      = sep

  # static strings
  @cr       = "\r"
  @lf       = "\n"
  @es       = ""
  @eq       = "="

  # combinations
  @esc      = (@quote * 2)
  @seq      = [@sep, @eq].join # used for parsing in excel mode

  # regexes: built per instance. The /o flag is deliberately NOT used, since
  # it interpolates only once per process and would pin the first instance's
  # separator and quote onto every later instance.
  q = Regexp.escape(@quote)
  s = Regexp.escape(@sep)
  @eoc      = /(?=#{s}|\r|\n|\z)/ # end of cell
  @eol      = /\r\n?|\n/          # end of line
  @escapes  = /(#{q})|#{s}|\r|\n/
  @quotable = /#{s}|\r|\n/
  @quotes   = /#{q}/
  @seps     = /(?:#{s})+/
  @quoted   = @excel ? /=?#{q}/ : @quote
  @unquoted = /[^#{s}\r\n][^#{q}\r\n]*/
  @leadzero = /\A0\d*\z/
end
75
104
 
76
105
  def reset(str=nil)
77
- self.string = str if str
78
- super()
79
106
  @rows = nil
80
107
  @cols = @cells = 0
81
- end
82
-
83
- # ==[ Lexer ]==
84
-
85
- def next_token
86
- excel = true if @excel && scan(@eq)
87
-
88
- if scan(@quote) # consume quoted cell
89
- token = ""
90
- while true
91
- token << (scan_until(/#{@quote}/o) or bomb "unclosed quote")[0..-2]
92
- token << @quote and next if scan(@quote)
93
- break if scan(@eoc)
94
- @relax or bomb "invalid character after quote"
95
- token << @quote + (scan_until(/#{@quote}/o) or bomb "bad inline quote")
96
- end
97
- elsif scan(@sep) then return @es
98
- elsif scan(@eol) then return nil
99
- else # consume unquoted cell
100
- token = scan_until(@eoc) or bomb "unexpected character"
101
- token.prepend(@eq) if excel
102
- end
103
- scan(@sep)
104
- @strip ? token.strip : token
105
- end
106
108
 
107
- def bomb(msg)
108
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
109
+ self.string = str if str
110
+ @encoding = string.encoding
111
+ super()
109
112
  end
110
113
 
111
114
  # ==[ Parser ]==
112
115
 
113
116
  def parse
117
+
118
+ # TODO: crazy optimization if NO QUOTES in rest
119
+ # unless rest.include?(@quote)
120
+ # @rows = rest...
121
+ # end
122
+
114
123
  @rows = []
115
124
  while row = next_row
116
125
  @rows << row
@@ -122,18 +131,71 @@ class Censive < StringScanner
122
131
  end
123
132
 
124
133
  def next_row
134
+ if @cheat and line = scan_until(@eol)
135
+ row = line.chomp!.split(@sep, -1)
136
+ row.each do |col|
137
+ next if (saw = col.count(@quote)).zero?
138
+ next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
139
+ @cheat = false
140
+ break
141
+ end if line.include?(@quote)
142
+ @cheat and return @strip ? row.each(&:strip!) : row
143
+ unscan
144
+ end
145
+
125
146
  token = next_token or return
126
- row = [token]
127
- row << token while token = next_token
147
+ row = []
148
+ row.push(*token)
149
+ row.push(*token) while token = next_token
128
150
  row
129
151
  end
130
152
 
153
# Consume the next token from the current row.
#
# Returns a String (one quoted cell, or one empty cell), an Array of Strings
# (a run of unquoted or empty cells), or nil at the end of a row, in which
# case any line terminator at the current position is consumed.
def next_token
  if scan(@quoted) # quoted cell
    cell = ""
    loop do
      cell << (scan_until(@quotes) || bomb("unclosed quote"))[0..-2]
      if scan(@quote) # doubled quote => literal quote, keep reading
        cell << @quote
        next
      end
      break if scan(@eoc)
      @relax || bomb("invalid character after quote")
      cell << @quote + (scan_until(@quotes) || bomb("bad inline quote"))
    end
    scan(@sep)
    return @strip ? cell.strip : cell
  end

  if (text = scan(@unquoted)) # unquoted cell(s)
    stray = check(@quote) && !text.chomp!(@sep) # a quote inside an unquoted cell
    if stray && !(@excel && text.chomp!(@seq)) # not an excel literal, so fix it
      text << (scan_until(@eoc) || bomb("stray quote"))
      scan(@sep)
    end
    cells = text.split(@sep, -1)
    return @strip ? cells.map!(&:strip) : cells
  end

  if scan(@sep)
    run = scan(@seps)
    return run ? run.split(@sep, -1) : @es
  end

  scan(@eol)
  nil
end
182
+
183
# Yield every row in turn, parsing on first use and caching the result in
# @rows. Returns the array of rows.
#
# Without a block an Enumerator is returned; previously the bare `yield`
# raised LocalJumpError in that case.
def each
  return enum_for(:each) unless block_given?

  @rows ||= parse
  @rows.each { |row| yield row }
end
187
+
188
# Write every row to a writer. With no options this instance is the writer;
# otherwise the options are handed to self.class.writer to build one.
def export(**opts)
  target =
    if opts.empty?
      self
    else
      self.class.writer(**opts)
    end
  each { |cells| target << cells }
end
192
+
131
193
  # ==[ Helpers ]==
132
194
 
133
195
  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
134
196
  def grok(str)
135
- if idx = str.index(/(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o)
136
- $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
197
+ if idx = str.index(@escapes)
198
+ $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
137
199
  else
138
200
  0
139
201
  end
@@ -153,11 +215,11 @@ class Censive < StringScanner
153
215
  row
154
216
  when 1
155
217
  row.map do |col|
156
- col.match?(/#{"\\"+@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
218
+ col.match?(@quotable) ? "#{q}#{col}#{q}" : col
157
219
  end
158
220
  else
159
221
  row.map do |col|
160
- @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
222
+ @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
161
223
  case grok(col)
162
224
  when 0 then col
163
225
  when 1 then "#{q}#{col}#{q}"
@@ -168,7 +230,7 @@ class Censive < StringScanner
168
230
  when :full
169
231
  if @excel
170
232
  row.map do |col|
171
- col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
233
+ col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
172
234
  end
173
235
  else
174
236
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
@@ -178,16 +240,6 @@ class Censive < StringScanner
178
240
  @out << out + @rowsep
179
241
  end
180
242
 
181
- def each
182
- @rows ||= parse
183
- @rows.each {|row| yield row }
184
- end
185
-
186
- def export(**opts)
187
- out = opts.empty? ? self : self.class.writer(**opts)
188
- each {|row| out << row }
189
- end
190
-
191
243
  def stats
192
244
  wide = string.size.to_s.size
193
245
  puts "%#{wide}d rows" % @rows.size
@@ -195,27 +247,8 @@ class Censive < StringScanner
195
247
  puts "%#{wide}d cells" % @cells
196
248
  puts "%#{wide}d bytes" % string.size
197
249
  end
198
- end
199
250
 
200
- if __FILE__ == $0
201
- raw = DATA.gets("\n\n").chomp
202
- # raw = File.read(ARGV.first || "lc-2023.csv")
203
- csv = Censive.new(raw, excel: true, relax: true)
204
- csv.export # (sep: ",", excel: true)
251
# Abort with a parse error that names the character position and shows a
# short excerpt of the input around it.
#
# msg - description of the problem.
#
# The position is taken in characters (charpos) because it is used to index
# the string. The excerpt start is clamped at 0: a negative start would
# count from the end of the string and show the wrong text, or none.
def bomb(msg)
  at   = charpos
  from = [at - 4, 0].max
  abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{string[from, 7]}'"
end
205
254
  end
206
-
207
- __END__
208
- Name,Age,Shoe
209
- Alice,27,5
210
- Bob,33,10 1/2
211
- Charlie or "Chuck",=B2 + B3,9
212
- "Doug E Fresh",="007",10
213
- Subtotal,=sum(B2:B5),="01234"
214
-
215
- # first line works in "relax" mode, bottom line is compliant
216
- 123,"CHO, JOELLE "JOJO"",456
217
- 123,"CHO, JOELLE ""JOJO""",456
218
-
219
- # Excel mode checking
220
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
221
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -0,0 +1,266 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # ============================================================================
4
+ # censive - A quick and lightweight CSV handling library for Ruby
5
+ #
6
+ # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
+ # Date: Feb 8, 2023
8
+ #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ #
12
+ # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
+ # ============================================================================
14
+ # GOALS:
15
+ # 1. Faster than Ruby's default CSV library
16
+ # 2. Lightweight code with streamlined and optimized logic
17
+ # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
+ #
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Review all encodings, we may be losing speed when mixing encodings
22
+ # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 4. Will using String#freeze give us a speed up?
24
+ # 5. Implement support for scan_until(string) <= right now only regex is valid
25
+ # ============================================================================
26
+
27
+ require "strscan"
28
+
29
# Censive - a small CSV reader/writer built on top of StringScanner.
#
# Reading: Censive.parse(text_or_path, **options) returns an array of rows,
#          each row being an array of strings.
# Writing: Censive.writer(stream_or_path, **options) returns an instance
#          whose #<< formats one row and appends it to the output stream.
class Censive < StringScanner
  attr_reader :encoding

  # Build an instance from the given arguments and parse it in one step.
  def self.parse(...)
    new(...).parse
  end

  # Create an instance configured for output.
  #
  # obj - a String path (the file is opened for writing, the new instance is
  #       yielded to the required block, and the file is closed afterwards),
  #       nil (output goes to $stdout), or any object responding to <<
  #       (an IO, a StringIO, ...).
  #
  # Aborts when obj is none of the above.
  def self.writer(obj=nil, **opts, &code)
    case obj
    when String
      File.open(obj, "w") {|io| yield new(out: io, **opts) }
    when nil
      new(out: obj, **opts)
    else
      obj.respond_to?(:<<) or abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
      new(out: obj, **opts)
    end
  end

  # str - CSV text, or (when shorter than 100 characters and naming a
  #       readable regular file) a path whose contents are loaded.
  #       nil means "".
  #
  # Unknown keywords are accepted and ignored via **opts.
  def initialize(str=nil,
    drop:     false   , # drop trailing empty fields?
    encoding: nil     , # character encoding
    excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
    mode:     :compact, # export mode: compact or full
    out:      nil     , # output stream, needs to respond to <<
    quote:    '"'     , # quote character
    relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
    rowsep:   "\n"    , # row separator for export
    sep:      ","     , # column separator character
    strip:    false   , # strip fields when reading
    **opts              # grab bag
  )
    # initialize data source; File.file? keeps directory names from being
    # opened, and File.read closes the handle (File.open(...).read leaked it)
    if str && str.size < 100 && !str.include?("\0") && File.file?(str) && File.readable?(str)
      str = File.read(str, mode: encoding ? "r:#{encoding}" : "r")
    else
      str ||= ""
      str = str.encode(encoding) if encoding
    end
    super(str)
    reset # also records @encoding from the scanner's string

    # config options
    @drop     = drop
    @excel    = excel
    @mode     = mode
    @out      = out || $stdout
    @relax    = relax
    @strip    = strip

    # config strings
    @quote    = quote
    @rowsep   = rowsep
    @sep      = sep

    # static strings
    @cr       = "\r"
    @lf       = "\n"
    @es       = ""
    @eq       = "="

    # combinations
    @esc      = (@quote * 2)
    @seq      = [@sep, @eq].join # used for parsing in excel mode

    # regexes: built per instance. The /o flag is deliberately NOT used,
    # since it interpolates only once per process and would pin the first
    # instance's separator and quote onto every later instance.
    q = Regexp.escape(@quote)
    s = Regexp.escape(@sep)
    @eoc      = /(?=#{s}|\r|\n|\z)/ # end of cell
    @eol      = /\r\n?|\n/          # end of line
    @escapes  = /(#{q})|#{s}|\r|\n/
    @quotable = /#{s}|\r|\n/
    @quotes   = /#{q}/
    @seps     = /(?:#{s})+/
    @quoted   = @excel ? /=?#{q}/ : @quote
    @unquoted = /[^#{s}\r\n][^#{q}\r\n]*/
    @leadzero = /\A0\d*\z/
  end

  # Forget parsed rows and counters, optionally replace the source string,
  # and rewind the scanner.
  def reset(str=nil)
    @rows = nil
    @cols = @cells = 0

    self.string = str if str
    @encoding = string.encoding
    super()
  end

  # ==[ Parser ]==

  # Parse from the current scan position. Returns the array of rows and
  # updates the widest-row (@cols) and total-cell (@cells) counters.
  def parse
    @rows = []
    while (row = next_row)
      @rows << row
      count = row.size
      @cols = count if count > @cols
      @cells += count
    end
    @rows
  end

  # Returns the next row as an array of strings, or nil when next_token
  # yields nothing at the current position.
  def next_row
    token = next_token or return
    row = []
    row.push(*token) # a token is one String or an Array of Strings
    row.push(*token) while (token = next_token)
    row
  end

  # Consume the next token of the current row: a String (one quoted or empty
  # cell), an Array of Strings (a run of unquoted or empty cells), or nil at
  # the end of the row (any line terminator there is consumed).
  def next_token
    if scan(@quoted) # quoted cell
      token = ""
      loop do
        token << (scan_until(@quotes) || bomb("unclosed quote"))[0..-2]
        if scan(@quote) # doubled quote => literal quote, keep reading
          token << @quote
          next
        end
        break if scan(@eoc)
        @relax || bomb("invalid character after quote")
        token << @quote + (scan_until(@quotes) || bomb("bad inline quote"))
      end
      scan(@sep)
      @strip ? token.strip : token
    elsif (match = scan(@unquoted)) # unquoted cell(s)
      if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
          match << (scan_until(@eoc) || bomb("stray quote"))
          scan(@sep)
        end
      end
      tokens = match.split(@sep, -1)
      @strip ? tokens.map!(&:strip) : tokens
    elsif scan(@sep)
      match = scan(@seps)
      # dup so that empty cells never share one mutable string
      match ? match.split(@sep, -1) : @es.dup
    else
      scan(@eol)
      nil
    end
  end

  # Yield each row, parsing on first use. Returns the rows, or an Enumerator
  # when no block is given.
  def each
    return enum_for(:each) unless block_given?

    @rows ||= parse
    @rows.each {|row| yield row }
  end

  # Write every row to this instance, or to a writer built from opts.
  def export(**opts)
    out = opts.empty? ? self : self.class.writer(**opts)
    each {|row| out << row }
  end

  # ==[ Helpers ]==

  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
  def grok(str)
    idx = str.index(@escapes)
    return 0 unless idx
    return 2 if Regexp.last_match(1) # the first special character is a quote

    str.index(@quotes, idx) ? 2 : 1
  end

  # output a row
  #
  # row - array of cells; each cell is converted with to_s, so numbers and
  #       nil are accepted. The caller's array is never modified.
  #
  # Returns the output stream.
  def <<(row)
    cols = row.map(&:to_s)

    # drop trailing empty columns (&. stops cleanly once the row is empty)
    cols.pop while cols.last&.empty? if @drop

    s,q = @sep, @quote
    out = case @mode
    when :compact
      case @excel ? 2 : grok(cols.join)
      when 0
        cols
      when 1
        cols.map do |col|
          col.match?(@quotable) ? "#{q}#{col}#{q}" : col
        end
      else
        cols.map do |col|
          if @excel && col.match?(@leadzero)
            "=#{q}#{col}#{q}"
          else
            case grok(col)
            when 0 then col
            when 1 then "#{q}#{col}#{q}"
            else        "#{q}#{col.gsub(q, @esc)}#{q}"
            end
          end
        end
      end
    when :full
      if @excel
        cols.map do |col|
          col.match?(@leadzero) ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
        end
      else
        cols.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
      end
    else
      raise ArgumentError, "unknown export mode #{@mode.inspect}"
    end.join(s)

    @out << out + @rowsep
  end

  # Print row, column, cell and byte counts, parsing first if needed.
  def stats
    @rows ||= parse
    bytes = string.bytesize
    wide = bytes.to_s.size
    puts "%#{wide}d rows"    % @rows.size
    puts "%#{wide}d columns" % @cols
    puts "%#{wide}d cells"   % @cells
    puts "%#{wide}d bytes"   % bytes
  end

  # Abort with a parse error naming the character position and showing a
  # short excerpt of the input; the excerpt start is clamped at 0 so that it
  # never wraps around to the end of the string.
  def bomb(msg)
    at   = charpos
    from = [at - 4, 0].max
    abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{string[from, 7]}'"
  end
end
239
+
240
# Demo: when run as a script, parse the first blank-line-delimited block of
# the embedded DATA section in excel + relax mode and print it back out.
if $PROGRAM_NAME == __FILE__
  sample = DATA.gets("\n\n").chomp
  # sample = File.read(ARGV.first || "lc-2023.csv")
  Censive.new(sample, excel: true, relax: true).export # (excel: true) # sep: "|")
end
246
+
247
+ __END__
248
+ "Don",="007",10,"Ed"
249
+ Name,Age,,,Shoe,,,
250
+ "Alice",27,5
251
+ Bob,33,10 1/2
252
+ Charlie or "Chuck",=B2 + B3,9
253
+ Subtotal,=sum(B2:B5),="01234"
254
+
255
+ A,B,C,D
256
+ A,B,"C",D
257
+ A,B,C",D
258
+ A,B,"C",D
259
+
260
+ # first line works in "relax" mode, bottom line is compliant
261
+ 123,"CHO, JOELLE "JOJO"",456
262
+ 123,"CHO, JOELLE ""JOJO""",456
263
+
264
+ # Excel mode checking
265
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
266
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off