censive 0.10 → 0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/README.md +9 -0
  3. data/censive.gemspec +3 -2
  4. data/lib/censive.rb +71 -119
  5. metadata +17 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f4b06c1c42b5f813f8901c4e7240cdd43df1ccc22cf87327dc3ed7d850720eb4
4
- data.tar.gz: 97ab27b79eead81517fa28a4c51923fa02ec2fa95922f6f61dc509c7a4890b2e
3
+ metadata.gz: 91f3884b92c79f8cca37066a420c872b612c37ed736f61fa7d11c0f0fd861d8e
4
+ data.tar.gz: 71789d006309c1d87e681a7078e718342cb36e1f8a18690c59e51595c49d2e59
5
5
  SHA512:
6
- metadata.gz: a2f297ac516f5e01510a9ceb90cdb2cc1e782ff97a4f67515d73f6d56d8512cd4d9cbb5d04425bcdb8a7a5cdb63aeb2835e7bed2a76dcc149dae0bd63c4cc17b
7
- data.tar.gz: 85762c69bc669db5a48f0e3b58e4319afdc9f1765cc18cd2b6c9501aaaccf3e41dddb58d3654bc4c76242632eaf8988f348bb652c6c40105e0373b8afdf463d3
6
+ metadata.gz: d5fe33889abe08bc6aa57bb4dec2d65ef3e8935da5aa209a0bcb6060696de07cf44189ad9be6f43ffcd2897672668bd3aff7e8b713f5070199627e40edd3704f
7
+ data.tar.gz: 2c79d30b4682da07800c19eb93067c5c72d471f9abc7b824f7525b3d36b8fb1108d0e733706852f750baac57360af2d5b77527a1daab871e2313d558928ce240
data/README.md CHANGED
@@ -68,3 +68,12 @@ Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
68
68
  end
69
69
  end
70
70
  ```
71
+
72
+ Or, you can be more succinct with:
73
+
74
+ ```ruby
75
+ require 'censive'
76
+
77
+ csv = Censive.new(File.read('data.csv'))
78
+ csv.export(sep: "\t")
79
+ ```
data/censive.gemspec CHANGED
@@ -2,13 +2,14 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.10"
5
+ s.version = "0.11"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
- s.summary = "A quick and lightweight CSV handling library for Ruby"
8
+ s.summary =
9
9
  s.description = "A quick and lightweight CSV handling library for Ruby"
10
10
  s.homepage = "https://github.com/shreeve/censive"
11
11
  s.license = "MIT"
12
12
  s.files = `git ls-files`.split("\n") - %w[.gitignore]
13
13
  s.executables = `(cd bin 2>&1) > /dev/null && git ls-files .`.split("\n")
14
+ s.add_dependency "strscan", ">= 3.0.6"
14
15
  end
data/lib/censive.rb CHANGED
@@ -1,37 +1,36 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # ==============================================================================
3
+ # ============================================================================
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Jan 30, 2023
8
- #
9
- # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- # ==============================================================================
12
- # The goals are:
7
+ # Date: Feb 3, 2023
13
8
  #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ # https://github.com/ruby/strscan/issues/53 for details
12
+ # https://github.com/ruby/strscan/pull/54 for code
13
+ # ============================================================================
14
+ # GOALS:
14
15
  # 1. Faster than Ruby's default CSV library
15
- # 2. Lightweight code base with streamlined method calls
16
- #
17
- # To consider:
16
+ # 2. Lightweight code base with streamlined logic
17
+ # 3. Support for most non-compliant CSV variations (eg - @relax, @excel)
18
18
  #
19
- # 1. Option to support IO streaming
20
- # 2. Option to strip whitespace
21
- # 3. Option to support headers in readers and writers
22
- # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
23
- #
24
- # NOTE: Only getch and scan_until advance strscan's position
25
- # ==============================================================================
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Add option to strip whitespace
22
+ # 3. Support CSV headers in first row
23
+ # ============================================================================
26
24
 
25
+ require 'bundler/setup'
27
26
  require 'strscan'
28
27
 
29
28
  class Censive < StringScanner
30
29
 
31
- def self.writer(obj=$stdout, **opts, &code)
30
+ def self.writer(obj=nil, **opts, &code)
32
31
  case obj
33
32
  when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
34
- when IO then new(out: obj, **opts, &code)
33
+ when IO,nil then new(out: obj, **opts, &code)
35
34
  else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
36
35
  end
37
36
  end
@@ -39,7 +38,7 @@ class Censive < StringScanner
39
38
  def initialize(str=nil,
40
39
  drop: false , # drop trailing empty fields?
41
40
  eol: "\n" , # line endings for exports
42
- excel: false , # literals(="01") formulas(=A1 + B2); http://bit.ly/3Y7jIvc
41
+ excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
43
42
  mode: :compact, # export mode: compact or full
44
43
  out: nil , # output stream, needs to respond to <<
45
44
  quote: '"' , # quote character
@@ -51,81 +50,67 @@ class Censive < StringScanner
51
50
  reset
52
51
 
53
52
  @drop = drop
54
- @eol = eol .freeze #!# TODO: are the '.freeze' statements helpful?
53
+ @eol = eol
55
54
  @excel = excel
56
55
  @mode = mode
57
- @out = out
58
- @quote = quote.freeze
56
+ @out = out || $stdout
57
+ @quote = quote
59
58
  @relax = relax
60
- @sep = sep .freeze
61
-
62
- @es = "" .freeze
63
- @cr = "\r" .freeze
64
- @lf = "\n" .freeze
65
- @eq = "=" .freeze
66
- @esc = (@quote * 2).freeze
59
+ @sep = sep
67
60
 
68
- @tokens = [@sep,@quote,@cr,@lf,@es,nil]
61
+ @cr = "\r"
62
+ @lf = "\n"
63
+ @es = ""
64
+ @eq = "="
65
+ @esc = (@quote * 2)
69
66
  end
70
67
 
71
68
  def reset(str=nil)
72
69
  self.string = str if str
73
70
  super()
74
- @char = peek(1)
75
- @flag = nil
76
-
71
+ @char = curr_char
77
72
  @rows = nil
78
73
  @cols = @cells = 0
79
74
  end
80
75
 
81
76
  # ==[ Lexer ]==
82
77
 
83
- def next_char
84
- getch
85
- @char = peek(1)
86
- end
87
-
88
- def next_token
78
+ # pure ruby versions for debugging
79
+ # def curr_char; @char = string[pos]; end
80
+ # def next_char; scan(/./m); @char = string[pos]; end
89
81
 
90
- # process and clear @flag
91
- case @flag
92
- when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
93
- when @cr then @flag = nil; next_char == @lf and next_char
94
- when @lf then @flag = nil; next_char
95
- else @flag = nil
96
- end if @flag
82
+ def curr_char; @char = currchar; end
83
+ def next_char; @char = nextchar; end
97
84
 
98
- # Excel literals ="0123" and formulas =A1 + B2 (see http://bit.ly/3Y7jIvc)
85
+ def next_token
99
86
  if @excel && @char == @eq
100
- @flag = @eq
87
+ excel = true
101
88
  next_char
102
89
  end
103
90
 
104
- if @tokens.include?(@char)
91
+ if @char == @quote # consume quoted cell
92
+ match = ""
93
+ while true
94
+ next_char
95
+ match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
96
+ match << @quote and next if next_char == @quote
97
+ break if [@sep,@cr,@lf,@es,nil].include?(@char)
98
+ @relax or bomb "invalid character after quote"
99
+ match << @quote + scan_until(/(?=#{@quote})/o) + @quote
100
+ end
101
+ next_char if @char == @sep
102
+ match
103
+ elsif [@sep,@cr,@lf,@es,nil].include?(@char)
105
104
  case @char
106
- when @quote # consume quoted cell
107
- match = ""
108
- while true
109
- getch # move past the quote that got us here
110
- match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
111
- case next_char
112
- when @sep then @flag = @es; next_char; break
113
- when @quote then match << @quote
114
- when @cr,@lf,@es,nil then break
115
- else @relax ? match << (@quote + @char) : bomb("invalid character after quote")
116
- end
117
- end
118
- match
119
- when @sep then @flag = @es; next_char; @es
120
- when @cr then @flag = @cr; nil
121
- when @lf then @flag = @lf; nil
122
- when @es,nil then nil
105
+ when @sep then next_char; @es
106
+ when @cr then next_char == @lf and next_char; nil
107
+ when @lf then next_char; nil
108
+ else nil
123
109
  end
124
110
  else # consume unquoted cell
125
111
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
126
- match = @eq + match and @flag = nil if @flag == @eq
127
- @char = peek(1)
128
- @char == @sep and @flag = @es and next_char
112
+ match.prepend(@eq) if excel
113
+ next_char if curr_char == @sep
129
114
  match
130
115
  end
131
116
  end
@@ -156,10 +141,10 @@ class Censive < StringScanner
156
141
 
157
142
  # ==[ Helpers ]==
158
143
 
159
- # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
144
+ # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
160
145
  def grok(str)
161
- if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
162
- $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
146
+ if idx = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o) #!# FIXME: regex injection is possible
147
+ $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
163
148
  else
164
149
  0
165
150
  end
@@ -167,17 +152,14 @@ class Censive < StringScanner
167
152
 
168
153
  # output a row
169
154
  def <<(row)
170
- @out or return super
171
155
 
172
156
  # drop trailing empty columns
173
157
  row.pop while row.last.empty? if @drop
174
158
 
175
- #!# FIXME: Excel output needs to protect 0-leading numbers
176
-
177
159
  s,q = @sep, @quote
178
160
  out = case @mode
179
161
  when :compact
180
- case grok(row.join)
162
+ case @excel ? 2 : grok(row.join)
181
163
  when 0
182
164
  row
183
165
  when 1
@@ -186,6 +168,7 @@ class Censive < StringScanner
186
168
  end
187
169
  else
188
170
  row.map do |col|
171
+ @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
189
172
  case grok(col)
190
173
  when 0 then col
191
174
  when 1 then "#{q}#{col}#{q}"
@@ -197,7 +180,6 @@ class Censive < StringScanner
197
180
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
198
181
  end.join(s)
199
182
 
200
- # add line ending
201
183
  @out << out + @eol
202
184
  end
203
185
 
@@ -206,8 +188,8 @@ class Censive < StringScanner
206
188
  @rows.each {|row| yield row }
207
189
  end
208
190
 
209
- def export(...)
210
- out = self.class.writer(...)
191
+ def export(**opts)
192
+ out = opts.empty? ? self : self.class.writer(**opts)
211
193
  each {|row| out << row }
212
194
  end
213
195
 
@@ -220,12 +202,11 @@ class Censive < StringScanner
220
202
  end
221
203
  end
222
204
 
223
- # ==[ Command line ]==
224
-
225
205
  if __FILE__ == $0
226
206
  raw = DATA.gets("\n\n").chomp
227
- csv = Censive.new(raw, excel: true)
228
- csv.export # (sep: "\t", excel: true)
207
+ # raw = File.read(ARGV.first || "lc-2023.csv")
208
+ csv = Censive.new(raw, excel: true, relax: true)
209
+ csv.export # (sep: ",", excel: true)
229
210
  end
230
211
 
231
212
  __END__
@@ -236,39 +217,10 @@ Charlie or "Chuck",=B2 + B3,9
236
217
  "Doug E Fresh",="007",10
237
218
  Subtotal,=sum(B2:B5),="01234"
238
219
 
220
+ # first line works in "relax" mode, bottom line is compliant
221
+ 123,"CHO, JOELLE "JOJO"",456
222
+ 123,"CHO, JOELLE ""JOJO""",456
239
223
 
240
-
241
-
242
- path = '../test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv'
243
- data = File.read(path)
244
-
245
- out = Censive.writer
246
-
247
- Censive.new(data, sep: "\t", quote: "'").each do |row|
248
- p row
249
- end
250
-
251
- Censive.reader(path, sep: "\t", quote: "'").each {|r| p r}
252
-
253
- # STDOUT.sync = true
254
- #
255
- # data = File.read('1.csv')
256
- #
257
- # Censive.writer('out.csv') do |out|
258
- # Censive.new(data, relax: true, excel: true).each do |row|
259
- # out << row
260
- # end
261
- # end
262
- #
263
- # __END__
264
- #
265
- # ARGV << "z.csv" if ARGV.empty?
266
- #
267
- # path = ARGV.first
268
- # data = File.read(path)
269
- #
270
- # csv = Censive.new(data)
271
- #
272
- # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
273
- #
274
- # csv.stats
224
+ # Excel mode checking
225
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
226
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.10'
4
+ version: '0.11'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-02 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-02-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: strscan
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 3.0.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 3.0.6
13
27
  description: A quick and lightweight CSV handling library for Ruby
14
28
  email: steve.shreeve@gmail.com
15
29
  executables: []