censive 0.20 → 0.21

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c8daaabf3200a72964f44377e4a1a5723e7790a96cb00f76433666cccdc73809
4
- data.tar.gz: b34491a185ccdd3e79d0877107d0ad33e9c9487b974398c18a911313795009b8
3
+ metadata.gz: 5dffdaf597e038881e378eb30acb7c44cde08de1f9e40e2180076eaa11356c68
4
+ data.tar.gz: f9d7f77ac597a5d5a86fc1adcad430802ab20bd306bf5856f1191f57ff22f872
5
5
  SHA512:
6
- metadata.gz: cc739653c328fd1c49e6a17d1aebc4a1e14f0707f252847cd134f07b9636ee4e043fed5a10b0d57550357d3ecd016621adfa69f8b6162765634e3e5759923804
7
- data.tar.gz: 0a34a13b24778d300d3e0cfb274f60c94263a960398984fec7d742280d66439a6453561f1913c9f607bf20ff4d7fd52172c09faad76bf460a236264f0cca53bc
6
+ metadata.gz: a0187489ebac8a9011f0f77dc9d52ca821ab080271f3eca6a1a40409b587534a9f4608d1f3b65a0253e587c242d01465e3cd773377f8d00b2fbd1723db4b5650
7
+ data.tar.gz: 94f2e7a204d8b40e058f41d193add0002d169d5d244e81c6895e465de159c6a953f09e313689891f7d12c05bead3baa41ad6fd525a8e297143758553e39ef1ba
data/censive.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "censive"
3
- s.version = "0.20"
3
+ s.version = "0.21"
4
4
  s.author = "Steve Shreeve"
5
5
  s.email = "steve.shreeve@gmail.com"
6
6
  s.summary =
data/lib/censive.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 10, 2023
7
+ # Date: Feb 14, 2023
8
8
  #
9
9
  # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
10
  # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
@@ -20,7 +20,7 @@
20
20
  # TODO:
21
21
  # 1. Support IO streaming
22
22
  # 2. Review all encodings, we may be losing speed when mixing encodings
23
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 3. Speedup possible if our @unquoted regex reads beyond @eol's
24
24
  # 4. Will using String#freeze give us a speed up?
25
25
  # 5. Implement support for scan_until(string) <= right now only regex is valid
26
26
  # ============================================================================
@@ -28,7 +28,7 @@
28
28
  require "strscan"
29
29
 
30
30
  class Censive < StringScanner
31
- attr :encoding
31
+ attr :encoding, :out
32
32
 
33
33
  def self.parse(...)
34
34
  new(...).parse
@@ -114,12 +114,6 @@ class Censive < StringScanner
114
114
  # ==[ Parser ]==
115
115
 
116
116
  def parse
117
-
118
- # TODO: crazy optimization if NO QUOTES in rest
119
- # unless rest.include?(@quote)
120
- # @rows = rest...
121
- # end
122
-
123
117
  @rows = []
124
118
  while row = next_row
125
119
  @rows << row
@@ -188,6 +182,7 @@ class Censive < StringScanner
188
182
  def export(**opts)
189
183
  out = opts.empty? ? self : self.class.writer(**opts)
190
184
  each {|row| out << row }
185
+ out.out
191
186
  end
192
187
 
193
188
  # ==[ Helpers ]==
@@ -252,3 +247,33 @@ class Censive < StringScanner
252
247
  abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
253
248
  end
254
249
  end
250
+
251
+ if __FILE__ == $0
252
+ str = DATA.gets("\n\n").chomp
253
+ # str = File.read(ARGV.first || "lc-2023.csv")
254
+ # str = File.open("KEN_ALL.CSV", "r:cp932").read
255
+
256
+ # require "stringio"
257
+ # csv = Censive.new(str, excel: true, relax: true)
258
+ # out = "" # StringIO.new
259
+ # csv.export(out: out) # (excel: true) # sep: "|")
260
+ # puts out # .string
261
+
262
+ puts Censive.new(str, excel: true, relax: true, out: "").export
263
+ end
264
+
265
+ __END__
266
+ "Don",="007",10,"Ed"
267
+ Name,Age,,,Shoe,,,
268
+ "Alice",27,5
269
+ Bob,33,10 1/2
270
+ Charlie or "Chuck",=B2 + B3,9
271
+ Subtotal,=sum(B2:B5),="01234"
272
+ A,B,C,D
273
+ A,B,"C",D
274
+ A,B,C",D
275
+ A,B,"C",D
276
+ 123,"CHO, JOELLE "JOJO"",456
277
+ 123,"CHO, JOELLE ""JOJO""",456
278
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
279
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.20'
4
+ version: '0.21'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-10 00:00:00.000000000 Z
11
+ date: 2023-02-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A quick and lightweight CSV handling library for Ruby
14
14
  email: steve.shreeve@gmail.com
@@ -29,10 +29,6 @@ files:
29
29
  - diagram/diagram.dot
30
30
  - diagram/diagram.rl
31
31
  - lib/censive.rb
32
- - lib/censive.rb-20230208182732
33
- - lib/censive.rb-20230208195221
34
- - lib/censive.rb-20230209050227
35
- - lib/flay.rb
36
32
  - lib/test-censive.rb
37
33
  - lib/test-csv.rb
38
34
  - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
@@ -1,266 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # censive - A quick and lightweight CSV handling library for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 8, 2023
8
- #
9
- # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- #
12
- # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
- # ============================================================================
14
- # GOALS:
15
- # 1. Faster than Ruby's default CSV library
16
- # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
- #
19
- # TODO:
20
- # 1. Support IO streaming
21
- # 2. Review all encodings, we may be losing speed when mixing encodings
22
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
- # 4. Will using String#freeze give us a speed up?
24
- # 5. Implement support for scan_until(string) <= right now only regex is valid
25
- # ============================================================================
26
-
27
- require "strscan"
28
-
29
- class Censive < StringScanner
30
- attr :encoding
31
-
32
- def self.parse(...)
33
- new(...).parse
34
- end
35
-
36
- def self.writer(obj=nil, **opts, &code)
37
- case obj
38
- when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
39
- when IO,nil then new(out: obj, **opts, &code)
40
- else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
41
- end
42
- end
43
-
44
- def initialize(str=nil,
45
- drop: false , # drop trailing empty fields?
46
- encoding: nil , # character encoding
47
- excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
48
- mode: :compact, # export mode: compact or full
49
- out: nil , # output stream, needs to respond to <<
50
- quote: '"' , # quote character
51
- relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
52
- rowsep: "\n" , # row separator for export
53
- sep: "," , # column separator character
54
- strip: false , # strip fields when reading
55
- **opts # grab bag
56
- )
57
- # initialize data source
58
- if str && str.size < 100 && File.readable?(str)
59
- str = File.open(str, encoding ? "r:#{encoding}" : "r").read
60
- else
61
- str ||= ""
62
- str = str.encode(encoding) if encoding
63
- end
64
- super(str)
65
- reset
66
-
67
- # config options
68
- @drop = drop
69
- @encoding = str.encoding
70
- @excel = excel
71
- @mode = mode
72
- @out = out || $stdout
73
- @relax = relax
74
- @strip = strip
75
-
76
- # config strings
77
- @quote = quote
78
- @rowsep = rowsep
79
- @sep = sep
80
-
81
- # static strings
82
- @cr = "\r"
83
- @lf = "\n"
84
- @es = ""
85
- @eq = "="
86
-
87
- # combinations
88
- @esc = (@quote * 2)
89
- @seq = [@sep, @eq].join # used for parsing in excel mode
90
-
91
- #!# TODO: come up with a clean way to escape/encode all this
92
- #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
93
-
94
- # regexes
95
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
96
- @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
97
- @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
- @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
99
- @quotes = /#{@quote}/o
100
- @seps = /#{@sep}+/o
101
- @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
102
- @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
103
- @leadzero = /\A0\d*\z/
104
- end
105
-
106
- def reset(str=nil)
107
- @rows = nil
108
- @cols = @cells = 0
109
-
110
- #!# TODO: reset all encodings?
111
- self.string = str if str
112
- @encoding = string.encoding
113
- super()
114
- end
115
-
116
- # ==[ Parser ]==
117
-
118
- def parse
119
- @rows = []
120
- while row = next_row
121
- @rows << row
122
- count = row.size
123
- @cols = count if count > @cols
124
- @cells += count
125
- end
126
- @rows
127
- end
128
-
129
- def next_row
130
- token = next_token or return
131
- row = []
132
- row.push(*token)
133
- row.push(*token) while token = next_token
134
- row
135
- end
136
-
137
- def next_token
138
- if scan(@quoted) # quoted cell
139
- token = ""
140
- while true
141
- token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
142
- token << @quote and next if scan(@quote)
143
- scan(@eoc) and break
144
- @relax or bomb "invalid character after quote"
145
- token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
146
- end
147
- scan(@sep)
148
- @strip ? token.strip : token
149
- elsif match = scan(@unquoted) # unquoted cell(s)
150
- if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
151
- unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
152
- match << (scan_until(@eoc) or bomb "stray quote")
153
- scan(@sep)
154
- end
155
- end
156
- tokens = match.split(@sep, -1)
157
- @strip ? tokens.map!(&:strip) : tokens
158
- elsif scan(@sep)
159
- match = scan(@seps)
160
- match ? match.split(@sep, -1) : @es
161
- else
162
- scan(@eol)
163
- nil
164
- end
165
- end
166
-
167
- def each
168
- @rows ||= parse
169
- @rows.each {|row| yield row }
170
- end
171
-
172
- def export(**opts)
173
- out = opts.empty? ? self : self.class.writer(**opts)
174
- each {|row| out << row }
175
- end
176
-
177
- # ==[ Helpers ]==
178
-
179
- # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
180
- def grok(str)
181
- if idx = str.index(@escapes)
182
- $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
183
- else
184
- 0
185
- end
186
- end
187
-
188
- # output a row
189
- def <<(row)
190
-
191
- # drop trailing empty columns
192
- row.pop while row.last.empty? if @drop
193
-
194
- s,q = @sep, @quote
195
- out = case @mode
196
- when :compact
197
- case @excel ? 2 : grok(row.join)
198
- when 0
199
- row
200
- when 1
201
- row.map do |col|
202
- col.match?(@quotable) ? "#{q}#{col}#{q}" : col
203
- end
204
- else
205
- row.map do |col|
206
- @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
207
- case grok(col)
208
- when 0 then col
209
- when 1 then "#{q}#{col}#{q}"
210
- else "#{q}#{col.gsub(q, @esc)}#{q}"
211
- end
212
- end
213
- end
214
- when :full
215
- if @excel
216
- row.map do |col|
217
- col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
218
- end
219
- else
220
- row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
221
- end
222
- end.join(s)
223
-
224
- @out << out + @rowsep
225
- end
226
-
227
- def stats
228
- wide = string.size.to_s.size
229
- puts "%#{wide}d rows" % @rows.size
230
- puts "%#{wide}d columns" % @cols
231
- puts "%#{wide}d cells" % @cells
232
- puts "%#{wide}d bytes" % string.size
233
- end
234
-
235
- def bomb(msg)
236
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
237
- end
238
- end
239
-
240
- if __FILE__ == $0
241
- raw = DATA.gets("\n\n").chomp
242
- # raw = File.read(ARGV.first || "lc-2023.csv")
243
- csv = Censive.new(raw, excel: true, relax: true)
244
- csv.export # (excel: true) # sep: "|")
245
- end
246
-
247
- __END__
248
- "Don",="007",10,"Ed"
249
- Name,Age,,,Shoe,,,
250
- "Alice",27,5
251
- Bob,33,10 1/2
252
- Charlie or "Chuck",=B2 + B3,9
253
- Subtotal,=sum(B2:B5),="01234"
254
-
255
- A,B,C,D
256
- A,B,"C",D
257
- A,B,C",D
258
- A,B,"C",D
259
-
260
- # first line works in "relax" mode, bottom line is compliant
261
- 123,"CHO, JOELLE "JOJO"",456
262
- 123,"CHO, JOELLE ""JOJO""",456
263
-
264
- # Excel mode checking
265
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
266
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -1,276 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # censive - A quick and lightweight CSV handling library for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 8, 2023
8
- #
9
- # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- #
12
- # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
- # ============================================================================
14
- # GOALS:
15
- # 1. Faster than Ruby's default CSV library
16
- # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
- #
19
- # TODO:
20
- # 1. Support IO streaming
21
- # 2. Review all encodings, we may be losing speed when mixing encodings
22
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
- # 4. Will using String#freeze give us a speed up?
24
- # 5. Implement support for scan_until(string) <= right now only regex is valid
25
- # ============================================================================
26
-
27
- require "strscan"
28
-
29
- class Censive < StringScanner
30
- attr :encoding
31
-
32
- def self.parse(...)
33
- new(...).parse
34
- end
35
-
36
- def self.writer(obj=nil, **opts, &code)
37
- case obj
38
- when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
39
- when IO,nil then new(out: obj, **opts, &code)
40
- else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
41
- end
42
- end
43
-
44
- def initialize(str=nil,
45
- drop: false , # drop trailing empty fields?
46
- encoding: nil , # character encoding
47
- excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
48
- mode: :compact, # export mode: compact or full
49
- out: nil , # output stream, needs to respond to <<
50
- quote: '"' , # quote character
51
- relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
52
- rowsep: "\n" , # row separator for export
53
- sep: "," , # column separator character
54
- strip: false , # strip fields when reading
55
- **opts # grab bag
56
- )
57
- # initialize data source
58
- if str && str.size < 100 && File.readable?(str)
59
- str = File.open(str, encoding ? "r:#{encoding}" : "r").read
60
- else
61
- str ||= ""
62
- str = str.encode(encoding) if encoding
63
- end
64
- super(str)
65
- reset
66
-
67
- # config options
68
- @drop = drop
69
- @encoding = str.encoding
70
- @excel = excel
71
- @mode = mode
72
- @out = out || $stdout
73
- @relax = relax
74
- @strip = strip
75
-
76
- # config strings
77
- @quote = quote
78
- @rowsep = rowsep
79
- @sep = sep
80
-
81
- # static strings
82
- @cr = "\r"
83
- @lf = "\n"
84
- @es = ""
85
- @eq = "="
86
-
87
- # combinations
88
- @esc = (@quote * 2)
89
- @seq = [@sep, @eq].join # used for parsing in excel mode
90
-
91
- #!# TODO: come up with a clean way to escape/encode all this
92
- #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
93
-
94
- # regexes
95
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
96
- @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
97
- @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
- @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
99
- @quotes = /#{@quote}/o
100
- @seps = /#{@sep}+/o
101
- @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
102
- @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}]*/o
103
- @leadzero = /\A0\d*\z/
104
- end
105
-
106
- def reset(str=nil)
107
- @rows = nil
108
- @cols = @cells = 0
109
-
110
- #!# TODO: reset all encodings?
111
- self.string = str if str
112
- @encoding = string.encoding
113
- super()
114
- end
115
-
116
- # ==[ Parser ]==
117
-
118
- def parse
119
- @rows = []
120
- @hold = []
121
- while row = next_row
122
- @rows << row
123
- count = row.size
124
- @cols = count if count > @cols
125
- @cells += count
126
- end
127
- @rows
128
- end
129
-
130
- def next_row
131
- token = next_token or return
132
- row = []
133
- row.push(*token)
134
- row.push(*token) while token = next_token
135
- row
136
- end
137
-
138
- def next_token
139
- @hold.empty? or return @hold.shift
140
- if scan(@quoted) # quoted cell
141
- token = ""
142
- while true
143
- token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
144
- token << @quote and next if scan(@quote)
145
- scan(@eoc) and break
146
- @relax or bomb "invalid character after quote"
147
- token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
148
- end
149
- scan(@sep)
150
- @strip ? token.strip : token
151
- elsif match = scan(@unquoted) # unquoted cell(s)
152
- if check(@quote) && !match.chomp!(@sep) && !match.end_with?(@cr, @lf)
153
- unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
154
- match << (scan_until(@eoc) or bomb "stray quote")
155
- scan(@sep)
156
- end
157
- end
158
- match.split(@eol, -1).each_with_index do |line, i|
159
- if line.empty?
160
- @hold.push(nil)
161
- else
162
- @hold.push(nil) if i > 0
163
- cells = line.split(@sep, -1)
164
- @hold.push(@strip ? cells.map!(&:strip) : cells)
165
- end
166
- end
167
- @hold.shift
168
- elsif scan(@sep)
169
- match = scan(@seps)
170
- match ? match.split(@sep, -1) : @es
171
- else
172
- scan(@eol)
173
- nil
174
- end
175
- end
176
-
177
- def each
178
- @rows ||= parse
179
- @rows.each {|row| yield row }
180
- end
181
-
182
- def export(**opts)
183
- out = opts.empty? ? self : self.class.writer(**opts)
184
- each {|row| out << row }
185
- end
186
-
187
- # ==[ Helpers ]==
188
-
189
- # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
190
- def grok(str)
191
- if idx = str.index(@escapes)
192
- $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
193
- else
194
- 0
195
- end
196
- end
197
-
198
- # output a row
199
- def <<(row)
200
-
201
- # drop trailing empty columns
202
- row.pop while row.last.empty? if @drop
203
-
204
- s,q = @sep, @quote
205
- out = case @mode
206
- when :compact
207
- case @excel ? 2 : grok(row.join)
208
- when 0
209
- row
210
- when 1
211
- row.map do |col|
212
- col.match?(@quotable) ? "#{q}#{col}#{q}" : col
213
- end
214
- else
215
- row.map do |col|
216
- @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
217
- case grok(col)
218
- when 0 then col
219
- when 1 then "#{q}#{col}#{q}"
220
- else "#{q}#{col.gsub(q, @esc)}#{q}"
221
- end
222
- end
223
- end
224
- when :full
225
- if @excel
226
- row.map do |col|
227
- col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
228
- end
229
- else
230
- row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
231
- end
232
- end.join(s)
233
-
234
- @out << out + @rowsep
235
- end
236
-
237
- def stats
238
- wide = string.size.to_s.size
239
- puts "%#{wide}d rows" % @rows.size
240
- puts "%#{wide}d columns" % @cols
241
- puts "%#{wide}d cells" % @cells
242
- puts "%#{wide}d bytes" % string.size
243
- end
244
-
245
- def bomb(msg)
246
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
247
- end
248
- end
249
-
250
- if __FILE__ == $0
251
- raw = DATA.gets("\n\n").chomp
252
- # raw = File.read(ARGV.first || "lc-2023.csv")
253
- csv = Censive.new(raw, excel: true, relax: true)
254
- csv.export # (excel: true) # sep: "|")
255
- end
256
-
257
- __END__
258
- "Don",="007",10,11,"Ed",20
259
- Name,Age,,,Shoe,,,
260
- "Alice",27,5
261
- Bob,33,10 1/2
262
- Charlie or "Chuck",=B2 + B3,9
263
- Subtotal,=sum(B2:B5),="01234"
264
-
265
- A,B,C,D
266
- A,B,"C",D
267
- A,B,C",D
268
- A,B,"C",D
269
-
270
- # first line works in "relax" mode, bottom line is compliant
271
- 123,"CHO, JOELLE "JOJO"",456
272
- 123,"CHO, JOELLE ""JOJO""",456
273
-
274
- # Excel mode checking
275
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
276
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -1,282 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # censive - A quick and lightweight CSV handling library for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 9, 2023
8
- #
9
- # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- #
12
- # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
- # ============================================================================
14
- # GOALS:
15
- # 1. Faster than Ruby's default CSV library
16
- # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
- # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
19
- #
20
- # TODO:
21
- # 1. Support IO streaming
22
- # 2. Review all encodings, we may be losing speed when mixing encodings
23
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
- # 4. Will using String#freeze give us a speed up?
25
- # 5. Implement support for scan_until(string) <= right now only regex is valid
26
- # ============================================================================
27
-
28
- require "strscan"
29
-
30
- class Censive < StringScanner
31
- attr :encoding
32
-
33
- def self.parse(...)
34
- new(...).parse
35
- end
36
-
37
- def self.writer(obj=nil, **opts, &code)
38
- case obj
39
- when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
40
- when IO,nil then new(out: obj, **opts, &code)
41
- else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
42
- end
43
- end
44
-
45
- def initialize(str=nil,
46
- drop: false , # drop trailing empty columns?
47
- encoding: nil , # character encoding
48
- excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
49
- mode: :compact, # export mode: compact or full
50
- out: nil , # output stream, needs to respond to <<
51
- quote: '"' , # quote character
52
- relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
53
- rowsep: "\n" , # row separator for export
54
- sep: "," , # column separator character
55
- strip: false , # strip columns when reading
56
- **opts # grab bag
57
- )
58
- # initialize data source
59
- if str && str.size < 100 && File.readable?(str)
60
- str = File.open(str, encoding ? "r:#{encoding}" : "r").read
61
- else
62
- str ||= ""
63
- str = str.encode(encoding) if encoding
64
- end
65
- super(str)
66
- reset
67
-
68
- # config options
69
- @cheat = true
70
- @drop = drop
71
- @encoding = str.encoding
72
- @excel = excel
73
- @mode = mode
74
- @out = out || $stdout
75
- @relax = relax
76
- @strip = strip
77
-
78
- # config strings
79
- @quote = quote
80
- @rowsep = rowsep
81
- @sep = sep
82
-
83
- # static strings
84
- @cr = "\r"
85
- @lf = "\n"
86
- @es = ""
87
- @eq = "="
88
-
89
- # combinations
90
- @esc = (@quote * 2)
91
- @seq = [@sep, @eq].join # used for parsing in excel mode
92
-
93
- #!# TODO: come up with a clean way to escape/encode all this
94
- #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
95
-
96
- # regexes
97
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
98
- @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
99
- @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
100
- @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
101
- @quotes = /#{@quote}/o
102
- @seps = /#{@sep}+/o
103
- @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
104
- @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
105
- @leadzero = /\A0\d*\z/
106
- end
107
-
108
- def reset(str=nil)
109
- @rows = nil
110
- @cols = @cells = 0
111
-
112
- #!# TODO: reset all encodings?
113
- self.string = str if str
114
- @encoding = string.encoding
115
- super()
116
- end
117
-
118
- # ==[ Parser ]==
119
-
120
- def parse
121
- @rows = []
122
- while row = next_row
123
- @rows << row
124
- count = row.size
125
- @cols = count if count > @cols
126
- @cells += count
127
- end
128
- @rows
129
- end
130
-
131
- def next_row
132
- if @cheat and line = scan_until(@eol)
133
- row = line.chomp!.split(@sep, -1)
134
- row.each do |col|
135
- next if (saw = col.count(@quote)).zero?
136
- next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
137
- @cheat = false
138
- break
139
- end if line.include?(@quote)
140
- @cheat and return @strip ? row.each(&:strip!) : row
141
- unscan
142
- end
143
-
144
- token = next_token or return
145
- row = []
146
- row.push(*token)
147
- row.push(*token) while token = next_token
148
- row
149
- end
150
-
151
- def next_token
152
- if scan(@quoted) # quoted cell
153
- token = ""
154
- while true
155
- token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
156
- token << @quote and next if scan(@quote)
157
- scan(@eoc) and break
158
- @relax or bomb "invalid character after quote"
159
- token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
160
- end
161
- scan(@sep)
162
- @strip ? token.strip : token
163
- elsif match = scan(@unquoted) # unquoted cell(s)
164
- if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
165
- unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
166
- match << (scan_until(@eoc) or bomb "stray quote")
167
- scan(@sep)
168
- end
169
- end
170
- tokens = match.split(@sep, -1)
171
- @strip ? tokens.map!(&:strip) : tokens
172
- elsif scan(@sep)
173
- match = scan(@seps)
174
- match ? match.split(@sep, -1) : @es
175
- else
176
- scan(@eol)
177
- nil
178
- end
179
- end
180
-
181
- def each
182
- @rows ||= parse
183
- @rows.each {|row| yield row }
184
- end
185
-
186
- def export(**opts)
187
- out = opts.empty? ? self : self.class.writer(**opts)
188
- each {|row| out << row }
189
- end
190
-
191
- # ==[ Helpers ]==
192
-
193
- # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
194
- def grok(str)
195
- if idx = str.index(@escapes)
196
- $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
197
- else
198
- 0
199
- end
200
- end
201
-
202
- # output a row
203
- def <<(row)
204
-
205
- # drop trailing empty columns
206
- row.pop while row.last.empty? if @drop
207
-
208
- s,q = @sep, @quote
209
- out = case @mode
210
- when :compact
211
- case @excel ? 2 : grok(row.join)
212
- when 0
213
- row
214
- when 1
215
- row.map do |col|
216
- col.match?(@quotable) ? "#{q}#{col}#{q}" : col
217
- end
218
- else
219
- row.map do |col|
220
- @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
221
- case grok(col)
222
- when 0 then col
223
- when 1 then "#{q}#{col}#{q}"
224
- else "#{q}#{col.gsub(q, @esc)}#{q}"
225
- end
226
- end
227
- end
228
- when :full
229
- if @excel
230
- row.map do |col|
231
- col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
232
- end
233
- else
234
- row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
235
- end
236
- end.join(s)
237
-
238
- @out << out + @rowsep
239
- end
240
-
241
- def stats
242
- wide = string.size.to_s.size
243
- puts "%#{wide}d rows" % @rows.size
244
- puts "%#{wide}d columns" % @cols
245
- puts "%#{wide}d cells" % @cells
246
- puts "%#{wide}d bytes" % string.size
247
- end
248
-
249
- def bomb(msg)
250
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
251
- end
252
- end
253
-
254
- if __FILE__ == $0
255
- # raw = DATA.gets("\n\n").chomp
256
- # raw = File.read(ARGV.first || "lc-2023.csv")
257
- raw = File.open("KEN_ALL.CSV", "r:cp932").read
258
-
259
- csv = Censive.new(raw, excel: true, relax: true)
260
- csv.export # (excel: true) # sep: "|")
261
- end
262
-
263
- __END__
264
- "Don",="007",10,"Ed"
265
- Name,Age,,,Shoe,,,
266
- "Alice",27,5
267
- Bob,33,10 1/2
268
- Charlie or "Chuck",=B2 + B3,9
269
- Subtotal,=sum(B2:B5),="01234"
270
-
271
- A,B,C,D
272
- A,B,"C",D
273
- A,B,C",D
274
- A,B,"C",D
275
-
276
- # first line works in "relax" mode, bottom line is compliant
277
- 123,"CHO, JOELLE "JOJO"",456
278
- 123,"CHO, JOELLE ""JOJO""",456
279
-
280
- # Excel mode checking
281
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
282
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
data/lib/flay.rb DELETED
@@ -1,227 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # flay - A quick and lightweight benchmarking tool for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 9, 2023
8
- # ============================================================================
9
- # GOALS:
10
- # 1. Provide a simple way to benchmark various code
11
- # 2. Easy to configure and start comparing results
12
- #
13
- # TODO:
14
- # 1. Everything
15
- # ============================================================================
16
-
17
- class Hash
18
- alias_method :default_lookup, :[]
19
-
20
- def [](key, miss=nil)
21
- key?(sym = key.to_sym) and return default_lookup(sym) || miss
22
- ary = key.to_s.split(/(?:[.\/\[]|\][.\/]?)/)
23
- val = ary.inject(self) do |obj, sub|
24
- if obj == self then default_lookup(sub.to_sym)
25
- elsif obj == nil then break
26
- elsif sub =~ /\A-?\d*\z/ then obj[sub.to_i]
27
- else obj[sub.to_sym]
28
- end
29
- end or miss
30
- end
31
-
32
- def method_missing(name, *args)
33
- name !~ /=$/ ? self[name, *args] : self[$`.to_sym] = args.first
34
- end
35
- end
36
-
37
- config = {
38
- environments: [
39
- {
40
- name: "Environment 1",
41
- before: <<~"|",
42
- # Environment 1 before
43
- |
44
- after: <<~"|",
45
- # Environment 1 after
46
- |
47
- },
48
- {
49
- name: "Environment 2",
50
- before: <<~"|",
51
- # Environment 1 before
52
- |
53
- after: <<~"|",
54
- # Environment 1 after
55
- |
56
- },
57
- ],
58
-
59
- contexts: [
60
- {
61
- name: "Context 1",
62
- before: <<~"|",
63
- # context 1 before
64
- |
65
- script: <<~"|",
66
- a = [*1..1e5]
67
- a.sum
68
- |
69
- after: <<~"|",
70
- # context 1 after
71
- |
72
- },
73
- {
74
- name: "Context 2",
75
- before: <<~"|",
76
- # context 2 before
77
- |
78
- after: <<~"|",
79
- # context 2 after
80
- |
81
- },
82
- ],
83
-
84
- tasks: [
85
- {
86
- name: "Task 1",
87
- runs: 35,
88
- before: <<~"|",
89
- # Task 1 before
90
- |
91
- after: <<~"|",
92
- # Task 1 after
93
- |
94
- },
95
- {
96
- name: "Task 2",
97
- secs: 30,
98
- before: <<~"|",
99
- # Task 2 before
100
- |
101
- after: <<~"|",
102
- # Task 2 after
103
- |
104
- },
105
- ],
106
- }
107
-
108
- # ==[ Helpers ]==
109
-
110
- def wrapper(object, type=nil)
111
- puts case type
112
- when :environment then template_for_environment object
113
- when :context then template_for_context object
114
- when :task then template_for_task object
115
- else section object
116
- end
117
- end
118
-
119
- def wrap(list, type=nil, **opts)
120
- list.each do |item|
121
- wrapper(item, type)
122
- yield item
123
- end
124
- end
125
-
126
- def section(text, wide=78, left=0)
127
- [
128
- "# ".ljust(wide, "="),
129
- "# #{text}",
130
- "# ".ljust(wide, "="),
131
- ].join("\n")
132
- end
133
-
134
- def hr(text, wide=78, left=0)
135
- [ " " * left, "# ==[ ", text, " ]" ].join.ljust(wide, "=")
136
- end
137
-
138
- # ==[ Templates ]==
139
-
140
- def template_for_environment(environment)
141
- <<~"|"
142
- #{ section "Environment: #{environment.name} " }
143
-
144
- # ==[ Code before environment ]==
145
-
146
- #{ environment.before }
147
- |
148
- end
149
-
150
- def template_for_context(context)
151
- <<~"|"
152
- #{ section "Context: #{context.name} " }
153
-
154
- # ==[ Code before context ]==
155
-
156
- #{ context.before }
157
- |
158
- end
159
-
160
- def template_for_task(task)
161
- <<~"|"
162
- #{ section "Task: #{task.name} " }
163
-
164
- # ==[ Code before task ]==
165
-
166
- #{ task.before }
167
-
168
- # ==[ Calculate the duration of a loop of empty runs ]==
169
-
170
- if #{ task.runs } == 1
171
- __flay_before_empty = 0
172
- __flay_after_empty = 0
173
- else
174
- __flay_before_empty = Process.clock_gettime(Process::CLOCK_MONOTONIC)
175
- __flay_runs = 0
176
- while __flay_runs < #{ task.runs } # this empty loop improves accuracy
177
- __flay_runs += 1
178
- end
179
- __flay_after_empty = Process.clock_gettime(Process::CLOCK_MONOTONIC)
180
- end
181
-
182
- # ==[ Calculate the duration of a loop of script runs ]==
183
-
184
- if #{ task.runs } == 1
185
- __flay_before_script = 0
186
- __flay_after_script = 0
187
- else
188
- __flay_before_script = Process.clock_gettime(Process::CLOCK_MONOTONIC)
189
- __flay_runs = 0
190
- while __flay_runs < #{ task.runs }
191
-
192
- # ==[ Before script ]==
193
-
194
- #{ task.script }
195
-
196
- # ==[ After script ]==
197
-
198
- __flay_runs += 1
199
- end
200
- __flay_after_script = Process.clock_gettime(Process::CLOCK_MONOTONIC)
201
- end
202
-
203
- # ==[ Code after task ]==
204
-
205
- #{ task.after }
206
-
207
- # ==[ Write out timestamps ]==
208
-
209
- __flay_duration = (__flay_after_script - __flay_before_script) -
210
- (__flay_after_empty - __flay_before_empty )
211
-
212
- File.write("/dev/null", __flay_duration.inspect)
213
- |
214
- end
215
-
216
- # ==[ Workflow ]==
217
-
218
- environments = config.environments
219
- contexts = config.contexts
220
- tasks = config.tasks
221
-
222
- wrap(environments, :environment) do |environment|
223
- wrap(tasks, :task) do |task|
224
- wrap(contexts, :context) do |context|
225
- end
226
- end
227
- end