censive 0.20 → 0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c8daaabf3200a72964f44377e4a1a5723e7790a96cb00f76433666cccdc73809
4
- data.tar.gz: b34491a185ccdd3e79d0877107d0ad33e9c9487b974398c18a911313795009b8
3
+ metadata.gz: 5dffdaf597e038881e378eb30acb7c44cde08de1f9e40e2180076eaa11356c68
4
+ data.tar.gz: f9d7f77ac597a5d5a86fc1adcad430802ab20bd306bf5856f1191f57ff22f872
5
5
  SHA512:
6
- metadata.gz: cc739653c328fd1c49e6a17d1aebc4a1e14f0707f252847cd134f07b9636ee4e043fed5a10b0d57550357d3ecd016621adfa69f8b6162765634e3e5759923804
7
- data.tar.gz: 0a34a13b24778d300d3e0cfb274f60c94263a960398984fec7d742280d66439a6453561f1913c9f607bf20ff4d7fd52172c09faad76bf460a236264f0cca53bc
6
+ metadata.gz: a0187489ebac8a9011f0f77dc9d52ca821ab080271f3eca6a1a40409b587534a9f4608d1f3b65a0253e587c242d01465e3cd773377f8d00b2fbd1723db4b5650
7
+ data.tar.gz: 94f2e7a204d8b40e058f41d193add0002d169d5d244e81c6895e465de159c6a953f09e313689891f7d12c05bead3baa41ad6fd525a8e297143758553e39ef1ba
data/censive.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "censive"
3
- s.version = "0.20"
3
+ s.version = "0.21"
4
4
  s.author = "Steve Shreeve"
5
5
  s.email = "steve.shreeve@gmail.com"
6
6
  s.summary =
data/lib/censive.rb CHANGED
@@ -4,7 +4,7 @@
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 10, 2023
7
+ # Date: Feb 14, 2023
8
8
  #
9
9
  # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
10
  # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
@@ -20,7 +20,7 @@
20
20
  # TODO:
21
21
  # 1. Support IO streaming
22
22
  # 2. Review all encodings, we may be losing speed when mixing encodings
23
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
+ # 3. Speedup possible if our @unquoted regex reads beyond @eol's
24
24
  # 4. Will using String#freeze give us a speed up?
25
25
  # 5. Implement support for scan_until(string) <= right now only regex is valid
26
26
  # ============================================================================
@@ -28,7 +28,7 @@
28
28
  require "strscan"
29
29
 
30
30
  class Censive < StringScanner
31
- attr :encoding
31
+ attr :encoding, :out
32
32
 
33
33
  def self.parse(...)
34
34
  new(...).parse
@@ -114,12 +114,6 @@ class Censive < StringScanner
114
114
  # ==[ Parser ]==
115
115
 
116
116
  def parse
117
-
118
- # TODO: crazy optimization if NO QUOTES in rest
119
- # unless rest.include?(@quote)
120
- # @rows = rest...
121
- # end
122
-
123
117
  @rows = []
124
118
  while row = next_row
125
119
  @rows << row
@@ -188,6 +182,7 @@ class Censive < StringScanner
188
182
  def export(**opts)
189
183
  out = opts.empty? ? self : self.class.writer(**opts)
190
184
  each {|row| out << row }
185
+ out.out
191
186
  end
192
187
 
193
188
  # ==[ Helpers ]==
@@ -252,3 +247,33 @@ class Censive < StringScanner
252
247
  abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
253
248
  end
254
249
  end
250
+
251
+ if __FILE__ == $0
252
+ str = DATA.gets("\n\n").chomp
253
+ # str = File.read(ARGV.first || "lc-2023.csv")
254
+ # str = File.open("KEN_ALL.CSV", "r:cp932").read
255
+
256
+ # require "stringio"
257
+ # csv = Censive.new(str, excel: true, relax: true)
258
+ # out = "" # StringIO.new
259
+ # csv.export(out: out) # (excel: true) # sep: "|")
260
+ # puts out # .string
261
+
262
+ puts Censive.new(str, excel: true, relax: true, out: "").export
263
+ end
264
+
265
+ __END__
266
+ "Don",="007",10,"Ed"
267
+ Name,Age,,,Shoe,,,
268
+ "Alice",27,5
269
+ Bob,33,10 1/2
270
+ Charlie or "Chuck",=B2 + B3,9
271
+ Subtotal,=sum(B2:B5),="01234"
272
+ A,B,C,D
273
+ A,B,"C",D
274
+ A,B,C",D
275
+ A,B,"C",D
276
+ 123,"CHO, JOELLE "JOJO"",456
277
+ 123,"CHO, JOELLE ""JOJO""",456
278
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
279
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.20'
4
+ version: '0.21'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-10 00:00:00.000000000 Z
11
+ date: 2023-02-14 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A quick and lightweight CSV handling library for Ruby
14
14
  email: steve.shreeve@gmail.com
@@ -29,10 +29,6 @@ files:
29
29
  - diagram/diagram.dot
30
30
  - diagram/diagram.rl
31
31
  - lib/censive.rb
32
- - lib/censive.rb-20230208182732
33
- - lib/censive.rb-20230208195221
34
- - lib/censive.rb-20230209050227
35
- - lib/flay.rb
36
32
  - lib/test-censive.rb
37
33
  - lib/test-csv.rb
38
34
  - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
@@ -1,266 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # censive - A quick and lightweight CSV handling library for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 8, 2023
8
- #
9
- # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- #
12
- # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
- # ============================================================================
14
- # GOALS:
15
- # 1. Faster than Ruby's default CSV library
16
- # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
- #
19
- # TODO:
20
- # 1. Support IO streaming
21
- # 2. Review all encodings, we may be losing speed when mixing encodings
22
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
- # 4. Will using String#freeze give us a speed up?
24
- # 5. Implement support for scan_until(string) <= right now only regex is valid
25
- # ============================================================================
26
-
27
- require "strscan"
28
-
29
- class Censive < StringScanner
30
- attr :encoding
31
-
32
- def self.parse(...)
33
- new(...).parse
34
- end
35
-
36
- def self.writer(obj=nil, **opts, &code)
37
- case obj
38
- when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
39
- when IO,nil then new(out: obj, **opts, &code)
40
- else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
41
- end
42
- end
43
-
44
- def initialize(str=nil,
45
- drop: false , # drop trailing empty fields?
46
- encoding: nil , # character encoding
47
- excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
48
- mode: :compact, # export mode: compact or full
49
- out: nil , # output stream, needs to respond to <<
50
- quote: '"' , # quote character
51
- relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
52
- rowsep: "\n" , # row separator for export
53
- sep: "," , # column separator character
54
- strip: false , # strip fields when reading
55
- **opts # grab bag
56
- )
57
- # initialize data source
58
- if str && str.size < 100 && File.readable?(str)
59
- str = File.open(str, encoding ? "r:#{encoding}" : "r").read
60
- else
61
- str ||= ""
62
- str = str.encode(encoding) if encoding
63
- end
64
- super(str)
65
- reset
66
-
67
- # config options
68
- @drop = drop
69
- @encoding = str.encoding
70
- @excel = excel
71
- @mode = mode
72
- @out = out || $stdout
73
- @relax = relax
74
- @strip = strip
75
-
76
- # config strings
77
- @quote = quote
78
- @rowsep = rowsep
79
- @sep = sep
80
-
81
- # static strings
82
- @cr = "\r"
83
- @lf = "\n"
84
- @es = ""
85
- @eq = "="
86
-
87
- # combinations
88
- @esc = (@quote * 2)
89
- @seq = [@sep, @eq].join # used for parsing in excel mode
90
-
91
- #!# TODO: come up with a clean way to escape/encode all this
92
- #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
93
-
94
- # regexes
95
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
96
- @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
97
- @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
- @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
99
- @quotes = /#{@quote}/o
100
- @seps = /#{@sep}+/o
101
- @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
102
- @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
103
- @leadzero = /\A0\d*\z/
104
- end
105
-
106
- def reset(str=nil)
107
- @rows = nil
108
- @cols = @cells = 0
109
-
110
- #!# TODO: reset all encodings?
111
- self.string = str if str
112
- @encoding = string.encoding
113
- super()
114
- end
115
-
116
- # ==[ Parser ]==
117
-
118
- def parse
119
- @rows = []
120
- while row = next_row
121
- @rows << row
122
- count = row.size
123
- @cols = count if count > @cols
124
- @cells += count
125
- end
126
- @rows
127
- end
128
-
129
- def next_row
130
- token = next_token or return
131
- row = []
132
- row.push(*token)
133
- row.push(*token) while token = next_token
134
- row
135
- end
136
-
137
- def next_token
138
- if scan(@quoted) # quoted cell
139
- token = ""
140
- while true
141
- token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
142
- token << @quote and next if scan(@quote)
143
- scan(@eoc) and break
144
- @relax or bomb "invalid character after quote"
145
- token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
146
- end
147
- scan(@sep)
148
- @strip ? token.strip : token
149
- elsif match = scan(@unquoted) # unquoted cell(s)
150
- if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
151
- unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
152
- match << (scan_until(@eoc) or bomb "stray quote")
153
- scan(@sep)
154
- end
155
- end
156
- tokens = match.split(@sep, -1)
157
- @strip ? tokens.map!(&:strip) : tokens
158
- elsif scan(@sep)
159
- match = scan(@seps)
160
- match ? match.split(@sep, -1) : @es
161
- else
162
- scan(@eol)
163
- nil
164
- end
165
- end
166
-
167
- def each
168
- @rows ||= parse
169
- @rows.each {|row| yield row }
170
- end
171
-
172
- def export(**opts)
173
- out = opts.empty? ? self : self.class.writer(**opts)
174
- each {|row| out << row }
175
- end
176
-
177
- # ==[ Helpers ]==
178
-
179
- # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
180
- def grok(str)
181
- if idx = str.index(@escapes)
182
- $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
183
- else
184
- 0
185
- end
186
- end
187
-
188
- # output a row
189
- def <<(row)
190
-
191
- # drop trailing empty columns
192
- row.pop while row.last.empty? if @drop
193
-
194
- s,q = @sep, @quote
195
- out = case @mode
196
- when :compact
197
- case @excel ? 2 : grok(row.join)
198
- when 0
199
- row
200
- when 1
201
- row.map do |col|
202
- col.match?(@quotable) ? "#{q}#{col}#{q}" : col
203
- end
204
- else
205
- row.map do |col|
206
- @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
207
- case grok(col)
208
- when 0 then col
209
- when 1 then "#{q}#{col}#{q}"
210
- else "#{q}#{col.gsub(q, @esc)}#{q}"
211
- end
212
- end
213
- end
214
- when :full
215
- if @excel
216
- row.map do |col|
217
- col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
218
- end
219
- else
220
- row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
221
- end
222
- end.join(s)
223
-
224
- @out << out + @rowsep
225
- end
226
-
227
- def stats
228
- wide = string.size.to_s.size
229
- puts "%#{wide}d rows" % @rows.size
230
- puts "%#{wide}d columns" % @cols
231
- puts "%#{wide}d cells" % @cells
232
- puts "%#{wide}d bytes" % string.size
233
- end
234
-
235
- def bomb(msg)
236
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
237
- end
238
- end
239
-
240
- if __FILE__ == $0
241
- raw = DATA.gets("\n\n").chomp
242
- # raw = File.read(ARGV.first || "lc-2023.csv")
243
- csv = Censive.new(raw, excel: true, relax: true)
244
- csv.export # (excel: true) # sep: "|")
245
- end
246
-
247
- __END__
248
- "Don",="007",10,"Ed"
249
- Name,Age,,,Shoe,,,
250
- "Alice",27,5
251
- Bob,33,10 1/2
252
- Charlie or "Chuck",=B2 + B3,9
253
- Subtotal,=sum(B2:B5),="01234"
254
-
255
- A,B,C,D
256
- A,B,"C",D
257
- A,B,C",D
258
- A,B,"C",D
259
-
260
- # first line works in "relax" mode, bottom line is compliant
261
- 123,"CHO, JOELLE "JOJO"",456
262
- 123,"CHO, JOELLE ""JOJO""",456
263
-
264
- # Excel mode checking
265
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
266
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -1,276 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # censive - A quick and lightweight CSV handling library for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 8, 2023
8
- #
9
- # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- #
12
- # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
- # ============================================================================
14
- # GOALS:
15
- # 1. Faster than Ruby's default CSV library
16
- # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
18
- #
19
- # TODO:
20
- # 1. Support IO streaming
21
- # 2. Review all encodings, we may be losing speed when mixing encodings
22
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
23
- # 4. Will using String#freeze give us a speed up?
24
- # 5. Implement support for scan_until(string) <= right now only regex is valid
25
- # ============================================================================
26
-
27
- require "strscan"
28
-
29
- class Censive < StringScanner
30
- attr :encoding
31
-
32
- def self.parse(...)
33
- new(...).parse
34
- end
35
-
36
- def self.writer(obj=nil, **opts, &code)
37
- case obj
38
- when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
39
- when IO,nil then new(out: obj, **opts, &code)
40
- else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
41
- end
42
- end
43
-
44
- def initialize(str=nil,
45
- drop: false , # drop trailing empty fields?
46
- encoding: nil , # character encoding
47
- excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
48
- mode: :compact, # export mode: compact or full
49
- out: nil , # output stream, needs to respond to <<
50
- quote: '"' , # quote character
51
- relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
52
- rowsep: "\n" , # row separator for export
53
- sep: "," , # column separator character
54
- strip: false , # strip fields when reading
55
- **opts # grab bag
56
- )
57
- # initialize data source
58
- if str && str.size < 100 && File.readable?(str)
59
- str = File.open(str, encoding ? "r:#{encoding}" : "r").read
60
- else
61
- str ||= ""
62
- str = str.encode(encoding) if encoding
63
- end
64
- super(str)
65
- reset
66
-
67
- # config options
68
- @drop = drop
69
- @encoding = str.encoding
70
- @excel = excel
71
- @mode = mode
72
- @out = out || $stdout
73
- @relax = relax
74
- @strip = strip
75
-
76
- # config strings
77
- @quote = quote
78
- @rowsep = rowsep
79
- @sep = sep
80
-
81
- # static strings
82
- @cr = "\r"
83
- @lf = "\n"
84
- @es = ""
85
- @eq = "="
86
-
87
- # combinations
88
- @esc = (@quote * 2)
89
- @seq = [@sep, @eq].join # used for parsing in excel mode
90
-
91
- #!# TODO: come up with a clean way to escape/encode all this
92
- #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
93
-
94
- # regexes
95
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
96
- @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
97
- @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
98
- @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
99
- @quotes = /#{@quote}/o
100
- @seps = /#{@sep}+/o
101
- @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
102
- @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}]*/o
103
- @leadzero = /\A0\d*\z/
104
- end
105
-
106
- def reset(str=nil)
107
- @rows = nil
108
- @cols = @cells = 0
109
-
110
- #!# TODO: reset all encodings?
111
- self.string = str if str
112
- @encoding = string.encoding
113
- super()
114
- end
115
-
116
- # ==[ Parser ]==
117
-
118
- def parse
119
- @rows = []
120
- @hold = []
121
- while row = next_row
122
- @rows << row
123
- count = row.size
124
- @cols = count if count > @cols
125
- @cells += count
126
- end
127
- @rows
128
- end
129
-
130
- def next_row
131
- token = next_token or return
132
- row = []
133
- row.push(*token)
134
- row.push(*token) while token = next_token
135
- row
136
- end
137
-
138
- def next_token
139
- @hold.empty? or return @hold.shift
140
- if scan(@quoted) # quoted cell
141
- token = ""
142
- while true
143
- token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
144
- token << @quote and next if scan(@quote)
145
- scan(@eoc) and break
146
- @relax or bomb "invalid character after quote"
147
- token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
148
- end
149
- scan(@sep)
150
- @strip ? token.strip : token
151
- elsif match = scan(@unquoted) # unquoted cell(s)
152
- if check(@quote) && !match.chomp!(@sep) && !match.end_with?(@cr, @lf)
153
- unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
154
- match << (scan_until(@eoc) or bomb "stray quote")
155
- scan(@sep)
156
- end
157
- end
158
- match.split(@eol, -1).each_with_index do |line, i|
159
- if line.empty?
160
- @hold.push(nil)
161
- else
162
- @hold.push(nil) if i > 0
163
- cells = line.split(@sep, -1)
164
- @hold.push(@strip ? cells.map!(&:strip) : cells)
165
- end
166
- end
167
- @hold.shift
168
- elsif scan(@sep)
169
- match = scan(@seps)
170
- match ? match.split(@sep, -1) : @es
171
- else
172
- scan(@eol)
173
- nil
174
- end
175
- end
176
-
177
- def each
178
- @rows ||= parse
179
- @rows.each {|row| yield row }
180
- end
181
-
182
- def export(**opts)
183
- out = opts.empty? ? self : self.class.writer(**opts)
184
- each {|row| out << row }
185
- end
186
-
187
- # ==[ Helpers ]==
188
-
189
- # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
190
- def grok(str)
191
- if idx = str.index(@escapes)
192
- $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
193
- else
194
- 0
195
- end
196
- end
197
-
198
- # output a row
199
- def <<(row)
200
-
201
- # drop trailing empty columns
202
- row.pop while row.last.empty? if @drop
203
-
204
- s,q = @sep, @quote
205
- out = case @mode
206
- when :compact
207
- case @excel ? 2 : grok(row.join)
208
- when 0
209
- row
210
- when 1
211
- row.map do |col|
212
- col.match?(@quotable) ? "#{q}#{col}#{q}" : col
213
- end
214
- else
215
- row.map do |col|
216
- @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
217
- case grok(col)
218
- when 0 then col
219
- when 1 then "#{q}#{col}#{q}"
220
- else "#{q}#{col.gsub(q, @esc)}#{q}"
221
- end
222
- end
223
- end
224
- when :full
225
- if @excel
226
- row.map do |col|
227
- col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
228
- end
229
- else
230
- row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
231
- end
232
- end.join(s)
233
-
234
- @out << out + @rowsep
235
- end
236
-
237
- def stats
238
- wide = string.size.to_s.size
239
- puts "%#{wide}d rows" % @rows.size
240
- puts "%#{wide}d columns" % @cols
241
- puts "%#{wide}d cells" % @cells
242
- puts "%#{wide}d bytes" % string.size
243
- end
244
-
245
- def bomb(msg)
246
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
247
- end
248
- end
249
-
250
- if __FILE__ == $0
251
- raw = DATA.gets("\n\n").chomp
252
- # raw = File.read(ARGV.first || "lc-2023.csv")
253
- csv = Censive.new(raw, excel: true, relax: true)
254
- csv.export # (excel: true) # sep: "|")
255
- end
256
-
257
- __END__
258
- "Don",="007",10,11,"Ed",20
259
- Name,Age,,,Shoe,,,
260
- "Alice",27,5
261
- Bob,33,10 1/2
262
- Charlie or "Chuck",=B2 + B3,9
263
- Subtotal,=sum(B2:B5),="01234"
264
-
265
- A,B,C,D
266
- A,B,"C",D
267
- A,B,C",D
268
- A,B,"C",D
269
-
270
- # first line works in "relax" mode, bottom line is compliant
271
- 123,"CHO, JOELLE "JOJO"",456
272
- 123,"CHO, JOELLE ""JOJO""",456
273
-
274
- # Excel mode checking
275
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
276
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
@@ -1,282 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # censive - A quick and lightweight CSV handling library for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 9, 2023
8
- #
9
- # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- #
12
- # Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
13
- # ============================================================================
14
- # GOALS:
15
- # 1. Faster than Ruby's default CSV library
16
- # 2. Lightweight code with streamlined and optimized logic
17
- # 3. Support most non-compliant CSV variations (@excel, @relax, etc)
18
- # 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
19
- #
20
- # TODO:
21
- # 1. Support IO streaming
22
- # 2. Review all encodings, we may be losing speed when mixing encodings
23
- # 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
24
- # 4. Will using String#freeze give us a speed up?
25
- # 5. Implement support for scan_until(string) <= right now only regex is valid
26
- # ============================================================================
27
-
28
- require "strscan"
29
-
30
- class Censive < StringScanner
31
- attr :encoding
32
-
33
- def self.parse(...)
34
- new(...).parse
35
- end
36
-
37
- def self.writer(obj=nil, **opts, &code)
38
- case obj
39
- when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
40
- when IO,nil then new(out: obj, **opts, &code)
41
- else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
42
- end
43
- end
44
-
45
- def initialize(str=nil,
46
- drop: false , # drop trailing empty columns?
47
- encoding: nil , # character encoding
48
- excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
49
- mode: :compact, # export mode: compact or full
50
- out: nil , # output stream, needs to respond to <<
51
- quote: '"' , # quote character
52
- relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
53
- rowsep: "\n" , # row separator for export
54
- sep: "," , # column separator character
55
- strip: false , # strip columns when reading
56
- **opts # grab bag
57
- )
58
- # initialize data source
59
- if str && str.size < 100 && File.readable?(str)
60
- str = File.open(str, encoding ? "r:#{encoding}" : "r").read
61
- else
62
- str ||= ""
63
- str = str.encode(encoding) if encoding
64
- end
65
- super(str)
66
- reset
67
-
68
- # config options
69
- @cheat = true
70
- @drop = drop
71
- @encoding = str.encoding
72
- @excel = excel
73
- @mode = mode
74
- @out = out || $stdout
75
- @relax = relax
76
- @strip = strip
77
-
78
- # config strings
79
- @quote = quote
80
- @rowsep = rowsep
81
- @sep = sep
82
-
83
- # static strings
84
- @cr = "\r"
85
- @lf = "\n"
86
- @es = ""
87
- @eq = "="
88
-
89
- # combinations
90
- @esc = (@quote * 2)
91
- @seq = [@sep, @eq].join # used for parsing in excel mode
92
-
93
- #!# TODO: come up with a clean way to escape/encode all this
94
- #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
95
-
96
- # regexes
97
- @eoc = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
98
- @eol = /#{@cr}#{@lf}?|#{@lf}/o # end of line
99
- @escapes = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
100
- @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
101
- @quotes = /#{@quote}/o
102
- @seps = /#{@sep}+/o
103
- @quoted = @excel ? /(?:=)?#{@quote}/o : @quote
104
- @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
105
- @leadzero = /\A0\d*\z/
106
- end
107
-
108
- def reset(str=nil)
109
- @rows = nil
110
- @cols = @cells = 0
111
-
112
- #!# TODO: reset all encodings?
113
- self.string = str if str
114
- @encoding = string.encoding
115
- super()
116
- end
117
-
118
- # ==[ Parser ]==
119
-
120
- def parse
121
- @rows = []
122
- while row = next_row
123
- @rows << row
124
- count = row.size
125
- @cols = count if count > @cols
126
- @cells += count
127
- end
128
- @rows
129
- end
130
-
131
- def next_row
132
- if @cheat and line = scan_until(@eol)
133
- row = line.chomp!.split(@sep, -1)
134
- row.each do |col|
135
- next if (saw = col.count(@quote)).zero?
136
- next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
137
- @cheat = false
138
- break
139
- end if line.include?(@quote)
140
- @cheat and return @strip ? row.each(&:strip!) : row
141
- unscan
142
- end
143
-
144
- token = next_token or return
145
- row = []
146
- row.push(*token)
147
- row.push(*token) while token = next_token
148
- row
149
- end
150
-
151
- def next_token
152
- if scan(@quoted) # quoted cell
153
- token = ""
154
- while true
155
- token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
156
- token << @quote and next if scan(@quote)
157
- scan(@eoc) and break
158
- @relax or bomb "invalid character after quote"
159
- token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
160
- end
161
- scan(@sep)
162
- @strip ? token.strip : token
163
- elsif match = scan(@unquoted) # unquoted cell(s)
164
- if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
165
- unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
166
- match << (scan_until(@eoc) or bomb "stray quote")
167
- scan(@sep)
168
- end
169
- end
170
- tokens = match.split(@sep, -1)
171
- @strip ? tokens.map!(&:strip) : tokens
172
- elsif scan(@sep)
173
- match = scan(@seps)
174
- match ? match.split(@sep, -1) : @es
175
- else
176
- scan(@eol)
177
- nil
178
- end
179
- end
180
-
181
- def each
182
- @rows ||= parse
183
- @rows.each {|row| yield row }
184
- end
185
-
186
- def export(**opts)
187
- out = opts.empty? ? self : self.class.writer(**opts)
188
- each {|row| out << row }
189
- end
190
-
191
- # ==[ Helpers ]==
192
-
193
- # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
194
- def grok(str)
195
- if idx = str.index(@escapes)
196
- $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
197
- else
198
- 0
199
- end
200
- end
201
-
202
- # output a row
203
- def <<(row)
204
-
205
- # drop trailing empty columns
206
- row.pop while row.last.empty? if @drop
207
-
208
- s,q = @sep, @quote
209
- out = case @mode
210
- when :compact
211
- case @excel ? 2 : grok(row.join)
212
- when 0
213
- row
214
- when 1
215
- row.map do |col|
216
- col.match?(@quotable) ? "#{q}#{col}#{q}" : col
217
- end
218
- else
219
- row.map do |col|
220
- @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
221
- case grok(col)
222
- when 0 then col
223
- when 1 then "#{q}#{col}#{q}"
224
- else "#{q}#{col.gsub(q, @esc)}#{q}"
225
- end
226
- end
227
- end
228
- when :full
229
- if @excel
230
- row.map do |col|
231
- col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
232
- end
233
- else
234
- row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
235
- end
236
- end.join(s)
237
-
238
- @out << out + @rowsep
239
- end
240
-
241
- def stats
242
- wide = string.size.to_s.size
243
- puts "%#{wide}d rows" % @rows.size
244
- puts "%#{wide}d columns" % @cols
245
- puts "%#{wide}d cells" % @cells
246
- puts "%#{wide}d bytes" % string.size
247
- end
248
-
249
- def bomb(msg)
250
- abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
251
- end
252
- end
253
-
254
- if __FILE__ == $0
255
- # raw = DATA.gets("\n\n").chomp
256
- # raw = File.read(ARGV.first || "lc-2023.csv")
257
- raw = File.open("KEN_ALL.CSV", "r:cp932").read
258
-
259
- csv = Censive.new(raw, excel: true, relax: true)
260
- csv.export # (excel: true) # sep: "|")
261
- end
262
-
263
- __END__
264
- "Don",="007",10,"Ed"
265
- Name,Age,,,Shoe,,,
266
- "Alice",27,5
267
- Bob,33,10 1/2
268
- Charlie or "Chuck",=B2 + B3,9
269
- Subtotal,=sum(B2:B5),="01234"
270
-
271
- A,B,C,D
272
- A,B,"C",D
273
- A,B,C",D
274
- A,B,"C",D
275
-
276
- # first line works in "relax" mode, bottom line is compliant
277
- 123,"CHO, JOELLE "JOJO"",456
278
- 123,"CHO, JOELLE ""JOJO""",456
279
-
280
- # Excel mode checking
281
- =,=x,x=,="x",="","","=",123,0123,="123",="0123"
282
- ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
data/lib/flay.rb DELETED
@@ -1,227 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- # ============================================================================
4
- # flay - A quick and lightweight benchmarking tool for Ruby
5
- #
6
- # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Feb 9, 2023
8
- # ============================================================================
9
- # GOALS:
10
- # 1. Provide a simple way to benchmark various code
11
- # 2. Easy to configure and start comparing results
12
- #
13
- # TODO:
14
- # 1. Everything
15
- # ============================================================================
16
-
17
- class Hash
18
- alias_method :default_lookup, :[]
19
-
20
- def [](key, miss=nil)
21
- key?(sym = key.to_sym) and return default_lookup(sym) || miss
22
- ary = key.to_s.split(/(?:[.\/\[]|\][.\/]?)/)
23
- val = ary.inject(self) do |obj, sub|
24
- if obj == self then default_lookup(sub.to_sym)
25
- elsif obj == nil then break
26
- elsif sub =~ /\A-?\d*\z/ then obj[sub.to_i]
27
- else obj[sub.to_sym]
28
- end
29
- end or miss
30
- end
31
-
32
- def method_missing(name, *args)
33
- name !~ /=$/ ? self[name, *args] : self[$`.to_sym] = args.first
34
- end
35
- end
36
-
37
- config = {
38
- environments: [
39
- {
40
- name: "Environment 1",
41
- before: <<~"|",
42
- # Environment 1 before
43
- |
44
- after: <<~"|",
45
- # Environment 1 after
46
- |
47
- },
48
- {
49
- name: "Environment 2",
50
- before: <<~"|",
51
- # Environment 1 before
52
- |
53
- after: <<~"|",
54
- # Environment 1 after
55
- |
56
- },
57
- ],
58
-
59
- contexts: [
60
- {
61
- name: "Context 1",
62
- before: <<~"|",
63
- # context 1 before
64
- |
65
- script: <<~"|",
66
- a = [*1..1e5]
67
- a.sum
68
- |
69
- after: <<~"|",
70
- # context 1 after
71
- |
72
- },
73
- {
74
- name: "Context 2",
75
- before: <<~"|",
76
- # context 2 before
77
- |
78
- after: <<~"|",
79
- # context 2 after
80
- |
81
- },
82
- ],
83
-
84
- tasks: [
85
- {
86
- name: "Task 1",
87
- runs: 35,
88
- before: <<~"|",
89
- # Task 1 before
90
- |
91
- after: <<~"|",
92
- # Task 1 after
93
- |
94
- },
95
- {
96
- name: "Task 2",
97
- secs: 30,
98
- before: <<~"|",
99
- # Task 2 before
100
- |
101
- after: <<~"|",
102
- # Task 2 after
103
- |
104
- },
105
- ],
106
- }
107
-
108
- # ==[ Helpers ]==
109
-
110
- def wrapper(object, type=nil)
111
- puts case type
112
- when :environment then template_for_environment object
113
- when :context then template_for_context object
114
- when :task then template_for_task object
115
- else section object
116
- end
117
- end
118
-
119
- def wrap(list, type=nil, **opts)
120
- list.each do |item|
121
- wrapper(item, type)
122
- yield item
123
- end
124
- end
125
-
126
- def section(text, wide=78, left=0)
127
- [
128
- "# ".ljust(wide, "="),
129
- "# #{text}",
130
- "# ".ljust(wide, "="),
131
- ].join("\n")
132
- end
133
-
134
- def hr(text, wide=78, left=0)
135
- [ " " * left, "# ==[ ", text, " ]" ].join.ljust(wide, "=")
136
- end
137
-
138
- # ==[ Templates ]==
139
-
140
- def template_for_environment(environment)
141
- <<~"|"
142
- #{ section "Environment: #{environment.name} " }
143
-
144
- # ==[ Code before environment ]==
145
-
146
- #{ environment.before }
147
- |
148
- end
149
-
150
- def template_for_context(context)
151
- <<~"|"
152
- #{ section "Context: #{context.name} " }
153
-
154
- # ==[ Code before context ]==
155
-
156
- #{ context.before }
157
- |
158
- end
159
-
160
- def template_for_task(task)
161
- <<~"|"
162
- #{ section "Task: #{task.name} " }
163
-
164
- # ==[ Code before task ]==
165
-
166
- #{ task.before }
167
-
168
- # ==[ Calculate the duration of a loop of empty runs ]==
169
-
170
- if #{ task.runs } == 1
171
- __flay_before_empty = 0
172
- __flay_after_empty = 0
173
- else
174
- __flay_before_empty = Process.clock_gettime(Process::CLOCK_MONOTONIC)
175
- __flay_runs = 0
176
- while __flay_runs < #{ task.runs } # this empty loop improves accuracy
177
- __flay_runs += 1
178
- end
179
- __flay_after_empty = Process.clock_gettime(Process::CLOCK_MONOTONIC)
180
- end
181
-
182
- # ==[ Calculate the duration of a loop of script runs ]==
183
-
184
- if #{ task.runs } == 1
185
- __flay_before_script = 0
186
- __flay_after_script = 0
187
- else
188
- __flay_before_script = Process.clock_gettime(Process::CLOCK_MONOTONIC)
189
- __flay_runs = 0
190
- while __flay_runs < #{ task.runs }
191
-
192
- # ==[ Before script ]==
193
-
194
- #{ task.script }
195
-
196
- # ==[ After script ]==
197
-
198
- __flay_runs += 1
199
- end
200
- __flay_after_script = Process.clock_gettime(Process::CLOCK_MONOTONIC)
201
- end
202
-
203
- # ==[ Code after task ]==
204
-
205
- #{ task.after }
206
-
207
- # ==[ Write out timestamps ]==
208
-
209
- __flay_duration = (__flay_after_script - __flay_before_script) -
210
- (__flay_after_empty - __flay_before_empty )
211
-
212
- File.write("/dev/null", __flay_duration.inspect)
213
- |
214
- end
215
-
216
- # ==[ Workflow ]==
217
-
218
- environments = config.environments
219
- contexts = config.contexts
220
- tasks = config.tasks
221
-
222
- wrap(environments, :environment) do |environment|
223
- wrap(tasks, :task) do |task|
224
- wrap(contexts, :context) do |context|
225
- end
226
- end
227
- end