censive 0.10 → 0.11

Sign up for free protection for your applications and access to all features.
Files changed (5) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +9 -0
  3. data/censive.gemspec +3 -2
  4. data/lib/censive.rb +71 -119
  5. metadata +17 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f4b06c1c42b5f813f8901c4e7240cdd43df1ccc22cf87327dc3ed7d850720eb4
4
- data.tar.gz: 97ab27b79eead81517fa28a4c51923fa02ec2fa95922f6f61dc509c7a4890b2e
3
+ metadata.gz: 91f3884b92c79f8cca37066a420c872b612c37ed736f61fa7d11c0f0fd861d8e
4
+ data.tar.gz: 71789d006309c1d87e681a7078e718342cb36e1f8a18690c59e51595c49d2e59
5
5
  SHA512:
6
- metadata.gz: a2f297ac516f5e01510a9ceb90cdb2cc1e782ff97a4f67515d73f6d56d8512cd4d9cbb5d04425bcdb8a7a5cdb63aeb2835e7bed2a76dcc149dae0bd63c4cc17b
7
- data.tar.gz: 85762c69bc669db5a48f0e3b58e4319afdc9f1765cc18cd2b6c9501aaaccf3e41dddb58d3654bc4c76242632eaf8988f348bb652c6c40105e0373b8afdf463d3
6
+ metadata.gz: d5fe33889abe08bc6aa57bb4dec2d65ef3e8935da5aa209a0bcb6060696de07cf44189ad9be6f43ffcd2897672668bd3aff7e8b713f5070199627e40edd3704f
7
+ data.tar.gz: 2c79d30b4682da07800c19eb93067c5c72d471f9abc7b824f7525b3d36b8fb1108d0e733706852f750baac57360af2d5b77527a1daab871e2313d558928ce240
data/README.md CHANGED
@@ -68,3 +68,12 @@ Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
68
68
  end
69
69
  end
70
70
  ```
71
+
72
+ Or, you can be more succinct with:
73
+
74
+ ```ruby
75
+ require 'censive'
76
+
77
+ csv = Censive.new(File.read('data.csv'))
78
+ csv.export(sep: "\t")
79
+ ```
data/censive.gemspec CHANGED
@@ -2,13 +2,14 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.10"
5
+ s.version = "0.11"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
- s.summary = "A quick and lightweight CSV handling library for Ruby"
8
+ s.summary =
9
9
  s.description = "A quick and lightweight CSV handling library for Ruby"
10
10
  s.homepage = "https://github.com/shreeve/censive"
11
11
  s.license = "MIT"
12
12
  s.files = `git ls-files`.split("\n") - %w[.gitignore]
13
13
  s.executables = `(cd bin 2>&1) > /dev/null && git ls-files .`.split("\n")
14
+ s.add_dependency "strscan", ">= 3.0.6"
14
15
  end
data/lib/censive.rb CHANGED
@@ -1,37 +1,36 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # ==============================================================================
3
+ # ============================================================================
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Jan 30, 2023
8
- #
9
- # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- # ==============================================================================
12
- # The goals are:
7
+ # Date: Feb 3, 2023
13
8
  #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ # https://github.com/ruby/strscan/issues/53 for details
12
+ # https://github.com/ruby/strscan/pull/54 for code
13
+ # ============================================================================
14
+ # GOALS:
14
15
  # 1. Faster than Ruby's default CSV library
15
- # 2. Lightweight code base with streamlined method calls
16
- #
17
- # To consider:
16
+ # 2. Lightweight code base with streamlined logic
17
+ # 3. Support for most non-compliant CSV variations (e.g., @relax, @excel)
18
18
  #
19
- # 1. Option to support IO streaming
20
- # 2. Option to strip whitespace
21
- # 3. Option to support headers in readers and writers
22
- # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
23
- #
24
- # NOTE: Only getch and scan_until advance strscan's position
25
- # ==============================================================================
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Add option to strip whitespace
22
+ # 3. Support CSV headers in first row
23
+ # ============================================================================
26
24
 
25
+ require 'bundler/setup'
27
26
  require 'strscan'
28
27
 
29
28
  class Censive < StringScanner
30
29
 
31
- def self.writer(obj=$stdout, **opts, &code)
30
+ def self.writer(obj=nil, **opts, &code)
32
31
  case obj
33
32
  when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
34
- when IO then new(out: obj, **opts, &code)
33
+ when IO,nil then new(out: obj, **opts, &code)
35
34
  else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
36
35
  end
37
36
  end
@@ -39,7 +38,7 @@ class Censive < StringScanner
39
38
  def initialize(str=nil,
40
39
  drop: false , # drop trailing empty fields?
41
40
  eol: "\n" , # line endings for exports
42
- excel: false , # literals(="01") formulas(=A1 + B2); http://bit.ly/3Y7jIvc
41
+ excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
43
42
  mode: :compact, # export mode: compact or full
44
43
  out: nil , # output stream, needs to respond to <<
45
44
  quote: '"' , # quote character
@@ -51,81 +50,67 @@ class Censive < StringScanner
51
50
  reset
52
51
 
53
52
  @drop = drop
54
- @eol = eol .freeze #!# TODO: are the '.freeze' statements helpful?
53
+ @eol = eol
55
54
  @excel = excel
56
55
  @mode = mode
57
- @out = out
58
- @quote = quote.freeze
56
+ @out = out || $stdout
57
+ @quote = quote
59
58
  @relax = relax
60
- @sep = sep .freeze
61
-
62
- @es = "" .freeze
63
- @cr = "\r" .freeze
64
- @lf = "\n" .freeze
65
- @eq = "=" .freeze
66
- @esc = (@quote * 2).freeze
59
+ @sep = sep
67
60
 
68
- @tokens = [@sep,@quote,@cr,@lf,@es,nil]
61
+ @cr = "\r"
62
+ @lf = "\n"
63
+ @es = ""
64
+ @eq = "="
65
+ @esc = (@quote * 2)
69
66
  end
70
67
 
71
68
  def reset(str=nil)
72
69
  self.string = str if str
73
70
  super()
74
- @char = peek(1)
75
- @flag = nil
76
-
71
+ @char = curr_char
77
72
  @rows = nil
78
73
  @cols = @cells = 0
79
74
  end
80
75
 
81
76
  # ==[ Lexer ]==
82
77
 
83
- def next_char
84
- getch
85
- @char = peek(1)
86
- end
87
-
88
- def next_token
78
+ # pure ruby versions for debugging
79
+ # def curr_char; @char = string[pos]; end
80
+ # def next_char; scan(/./m); @char = string[pos]; end
89
81
 
90
- # process and clear @flag
91
- case @flag
92
- when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
93
- when @cr then @flag = nil; next_char == @lf and next_char
94
- when @lf then @flag = nil; next_char
95
- else @flag = nil
96
- end if @flag
82
+ def curr_char; @char = currchar; end
83
+ def next_char; @char = nextchar; end
97
84
 
98
- # Excel literals ="0123" and formulas =A1 + B2 (see http://bit.ly/3Y7jIvc)
85
+ def next_token
99
86
  if @excel && @char == @eq
100
- @flag = @eq
87
+ excel = true
101
88
  next_char
102
89
  end
103
90
 
104
- if @tokens.include?(@char)
91
+ if @char == @quote # consume quoted cell
92
+ match = ""
93
+ while true
94
+ next_char
95
+ match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
96
+ match << @quote and next if next_char == @quote
97
+ break if [@sep,@cr,@lf,@es,nil].include?(@char)
98
+ @relax or bomb "invalid character after quote"
99
+ match << @quote + scan_until(/(?=#{@quote})/o) + @quote
100
+ end
101
+ next_char if @char == @sep
102
+ match
103
+ elsif [@sep,@cr,@lf,@es,nil].include?(@char)
105
104
  case @char
106
- when @quote # consume quoted cell
107
- match = ""
108
- while true
109
- getch # move past the quote that got us here
110
- match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
111
- case next_char
112
- when @sep then @flag = @es; next_char; break
113
- when @quote then match << @quote
114
- when @cr,@lf,@es,nil then break
115
- else @relax ? match << (@quote + @char) : bomb("invalid character after quote")
116
- end
117
- end
118
- match
119
- when @sep then @flag = @es; next_char; @es
120
- when @cr then @flag = @cr; nil
121
- when @lf then @flag = @lf; nil
122
- when @es,nil then nil
105
+ when @sep then next_char; @es
106
+ when @cr then next_char == @lf and next_char; nil
107
+ when @lf then next_char; nil
108
+ else nil
123
109
  end
124
110
  else # consume unquoted cell
125
111
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
126
- match = @eq + match and @flag = nil if @flag == @eq
127
- @char = peek(1)
128
- @char == @sep and @flag = @es and next_char
112
+ match.prepend(@eq) if excel
113
+ next_char if curr_char == @sep
129
114
  match
130
115
  end
131
116
  end
@@ -156,10 +141,10 @@ class Censive < StringScanner
156
141
 
157
142
  # ==[ Helpers ]==
158
143
 
159
- # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
144
+ # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
160
145
  def grok(str)
161
- if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
162
- $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
146
+ if idx = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o) #!# FIXME: regex injection is possible
147
+ $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
163
148
  else
164
149
  0
165
150
  end
@@ -167,17 +152,14 @@ class Censive < StringScanner
167
152
 
168
153
  # output a row
169
154
  def <<(row)
170
- @out or return super
171
155
 
172
156
  # drop trailing empty columns
173
157
  row.pop while row.last.empty? if @drop
174
158
 
175
- #!# FIXME: Excel output needs to protect 0-leading numbers
176
-
177
159
  s,q = @sep, @quote
178
160
  out = case @mode
179
161
  when :compact
180
- case grok(row.join)
162
+ case @excel ? 2 : grok(row.join)
181
163
  when 0
182
164
  row
183
165
  when 1
@@ -186,6 +168,7 @@ class Censive < StringScanner
186
168
  end
187
169
  else
188
170
  row.map do |col|
171
+ @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
189
172
  case grok(col)
190
173
  when 0 then col
191
174
  when 1 then "#{q}#{col}#{q}"
@@ -197,7 +180,6 @@ class Censive < StringScanner
197
180
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
198
181
  end.join(s)
199
182
 
200
- # add line ending
201
183
  @out << out + @eol
202
184
  end
203
185
 
@@ -206,8 +188,8 @@ class Censive < StringScanner
206
188
  @rows.each {|row| yield row }
207
189
  end
208
190
 
209
- def export(...)
210
- out = self.class.writer(...)
191
+ def export(**opts)
192
+ out = opts.empty? ? self : self.class.writer(**opts)
211
193
  each {|row| out << row }
212
194
  end
213
195
 
@@ -220,12 +202,11 @@ class Censive < StringScanner
220
202
  end
221
203
  end
222
204
 
223
- # ==[ Command line ]==
224
-
225
205
  if __FILE__ == $0
226
206
  raw = DATA.gets("\n\n").chomp
227
- csv = Censive.new(raw, excel: true)
228
- csv.export # (sep: "\t", excel: true)
207
+ # raw = File.read(ARGV.first || "lc-2023.csv")
208
+ csv = Censive.new(raw, excel: true, relax: true)
209
+ csv.export # (sep: ",", excel: true)
229
210
  end
230
211
 
231
212
  __END__
@@ -236,39 +217,10 @@ Charlie or "Chuck",=B2 + B3,9
236
217
  "Doug E Fresh",="007",10
237
218
  Subtotal,=sum(B2:B5),="01234"
238
219
 
220
+ # first line works in "relax" mode, bottom line is compliant
221
+ 123,"CHO, JOELLE "JOJO"",456
222
+ 123,"CHO, JOELLE ""JOJO""",456
239
223
 
240
-
241
-
242
- path = '../test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv'
243
- data = File.read(path)
244
-
245
- out = Censive.writer
246
-
247
- Censive.new(data, sep: "\t", quote: "'").each do |row|
248
- p row
249
- end
250
-
251
- Censive.reader(path, sep: "\t", quote: "'").each {|r| p r}
252
-
253
- # STDOUT.sync = true
254
- #
255
- # data = File.read('1.csv')
256
- #
257
- # Censive.writer('out.csv') do |out|
258
- # Censive.new(data, relax: true, excel: true).each do |row|
259
- # out << row
260
- # end
261
- # end
262
- #
263
- # __END__
264
- #
265
- # ARGV << "z.csv" if ARGV.empty?
266
- #
267
- # path = ARGV.first
268
- # data = File.read(path)
269
- #
270
- # csv = Censive.new(data)
271
- #
272
- # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
273
- #
274
- # csv.stats
224
+ # Excel mode checking
225
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
226
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.10'
4
+ version: '0.11'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-02 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-02-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: strscan
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 3.0.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 3.0.6
13
27
  description: A quick and lightweight CSV handling library for Ruby
14
28
  email: steve.shreeve@gmail.com
15
29
  executables: []