censive 0.8 → 0.10

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6419efcdc9274ea8bcf7b8527001e33f8bdfea348dfd911686cab36984d507da
4
- data.tar.gz: 3b59aead54517fd64d7ece3eaa6f459e301e1e48f1ae34772a7128c61fb739f2
3
+ metadata.gz: f4b06c1c42b5f813f8901c4e7240cdd43df1ccc22cf87327dc3ed7d850720eb4
4
+ data.tar.gz: 97ab27b79eead81517fa28a4c51923fa02ec2fa95922f6f61dc509c7a4890b2e
5
5
  SHA512:
6
- metadata.gz: 7910c09e76a81ed27870ea52fb6c8aea0316ed213c53a026d98adc64f93349477e6acab0a93b88c6f184ce1d317634ecdca9290d50bff9b117b98bedd3ac7b86
7
- data.tar.gz: 358ab985947d486b5f486b1f7e9c1f591e3b8e906b9eab59a4ed151e5f5d9652c211f2d2a4ee36f0543227e2ae5e33ba57f1e4c178f6f7e72e05c14d7b46895f
6
+ metadata.gz: a2f297ac516f5e01510a9ceb90cdb2cc1e782ff97a4f67515d73f6d56d8512cd4d9cbb5d04425bcdb8a7a5cdb63aeb2835e7bed2a76dcc149dae0bd63c4cc17b
7
+ data.tar.gz: 85762c69bc669db5a48f0e3b58e4319afdc9f1765cc18cd2b6c9501aaaccf3e41dddb58d3654bc4c76242632eaf8988f348bb652c6c40105e0373b8afdf463d3
data/README.md CHANGED
@@ -2,7 +2,58 @@
2
2
 
3
3
  A quick and lightweight CSV handling library for Ruby
4
4
 
5
- ## Writing CSV
5
+ ## Example
6
+
7
+ ```ruby
8
+ #!/usr/bin/env ruby
9
+
10
+ STDOUT.sync = true
11
+
12
+ require 'censive'
13
+ require 'fileutils'
14
+
15
+ abort "usage: #{File.basename($0)} <files>" if ARGV.empty?
16
+
17
+ rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
18
+
19
+ rows = []
20
+ cols = []
21
+ coun = 0
22
+ full = 0
23
+
24
+ ARGV.each do |path|
25
+ File.file?(path) or next
26
+
27
+ print "Processing #{path.inspect}"
28
+
29
+ rows.clear
30
+ cols.clear
31
+ seen = 0
32
+ coun += 1
33
+
34
+ dest = "#{path}-#{rand}"
35
+
36
+ begin
37
+ Censive.writer(dest) do |file|
38
+ Censive.reader(path, excel: true, relax: true).each do |cols|
39
+ file << cols
40
+ seen += 1
41
+ print "." if (seen % 1e5) == 0 # give a status update every so often
42
+ end
43
+ end
44
+ FileUtils.mv(dest, path)
45
+ full += (seen - 1)
46
+ puts " (#{seen - 1} rows of data)"
47
+ rescue
48
+ puts " - unable to process (#{$!})"
49
+ FileUtils.rm_f(dest)
50
+ end
51
+ end
52
+
53
+ puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
54
+ ```
55
+
56
+ ## Convert a CSV file to a TSV file
6
57
 
7
58
  ```ruby
8
59
  require 'censive'
@@ -12,7 +63,7 @@ data = File.read('data.csv')
12
63
 
13
64
  # write out a tab-separated tsv file
14
65
  Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
15
- Censive.new(data).each do |row|
66
+ Censive.new(data, excel: true, relax: true).each do |row|
16
67
  out << row
17
68
  end
18
69
  end
data/censive.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.8"
5
+ s.version = "0.10"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
8
  s.summary = "A quick and lightweight CSV handling library for Ruby"
data/lib/censive.rb CHANGED
@@ -6,7 +6,8 @@
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
7
  # Date: Jan 30, 2023
8
8
  #
9
- # Thanks: Crystal's CSV library, see https://crystal-lang.org/api/1.7.2/CSV.html
9
+ # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
10
11
  # ==============================================================================
11
12
  # The goals are:
12
13
  #
@@ -17,7 +18,8 @@
17
18
  #
18
19
  # 1. Option to support IO streaming
19
20
  # 2. Option to strip whitespace
20
- # 3. Confirm file encodings such as UTF-8, UTF-16, etc.
21
+ # 3. Option to support headers in readers and writers
22
+ # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
21
23
  #
22
24
  # NOTE: Only getch and scan_until advance strscan's position
23
25
  # ==============================================================================
@@ -26,36 +28,36 @@ require 'strscan'
26
28
 
27
29
  class Censive < StringScanner
28
30
 
29
- def self.writer(path, **opts)
30
- File.open(path, 'w') do |file|
31
- yield new(out: file, **opts)
31
+ def self.writer(obj=$stdout, **opts, &code)
32
+ case obj
33
+ when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
34
+ when IO then new(out: obj, **opts, &code)
35
+ else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
32
36
  end
33
37
  end
34
38
 
35
39
  def initialize(str=nil,
36
- sep: ',' , # column separator character
37
- quote: '"' , # quote character
38
-
39
- drop: false , # enable to drop trailing separators
40
- eol: "\n" , # desired line endings for exports
41
- excel: false , # allow ,="0123" style columns
40
+ drop: false , # drop trailing empty fields?
41
+ eol: "\n" , # line endings for exports
42
+ excel: false , # literals(="01") formulas(=A1 + B2); http://bit.ly/3Y7jIvc
42
43
  mode: :compact, # export mode: compact or full
43
- out: nil , # output IO/file
44
- relax: false , # relax parsing of quotes
45
-
44
+ out: nil , # output stream, needs to respond to <<
45
+ quote: '"' , # quote character
46
+ relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
47
+ sep: ',' , # column separator character
46
48
  **opts # grab bag
47
49
  )
48
50
  super(str || '')
49
51
  reset
50
52
 
51
- @sep = sep .freeze
52
- @quote = quote.freeze
53
-
54
53
  @drop = drop
55
- @eol = eol.freeze
54
+ @eol = eol .freeze #!# TODO: are the '.freeze' statements helpful?
55
+ @excel = excel
56
56
  @mode = mode
57
57
  @out = out
58
+ @quote = quote.freeze
58
59
  @relax = relax
60
+ @sep = sep .freeze
59
61
 
60
62
  @es = "" .freeze
61
63
  @cr = "\r" .freeze
@@ -64,7 +66,6 @@ class Censive < StringScanner
64
66
  @esc = (@quote * 2).freeze
65
67
 
66
68
  @tokens = [@sep,@quote,@cr,@lf,@es,nil]
67
- @tokens << @eq if excel # See http://bit.ly/3Y7jIvc
68
69
  end
69
70
 
70
71
  def reset(str=nil)
@@ -85,30 +86,33 @@ class Censive < StringScanner
85
86
  end
86
87
 
87
88
  def next_token
89
+
90
+ # process and clear @flag
88
91
  case @flag
89
92
  when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
90
93
  when @cr then @flag = nil; next_char == @lf and next_char
91
94
  when @lf then @flag = nil; next_char
95
+ else @flag = nil
92
96
  end if @flag
93
97
 
98
+ # Excel literals ="0123" and formulas =A1 + B2 (see http://bit.ly/3Y7jIvc)
99
+ if @excel && @char == @eq
100
+ @flag = @eq
101
+ next_char
102
+ end
103
+
94
104
  if @tokens.include?(@char)
95
105
  case @char
96
- when @quote, @eq # consume quoted cell
97
- @char == @eq and next_char # excel mode: allows ,="012",
106
+ when @quote # consume quoted cell
98
107
  match = ""
99
108
  while true
100
- getch # consume the quote (optimized by not calling next_char)
109
+ getch # move past the quote that got us here
101
110
  match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
102
111
  case next_char
103
112
  when @sep then @flag = @es; next_char; break
104
113
  when @quote then match << @quote
105
114
  when @cr,@lf,@es,nil then break
106
- else
107
- if @relax
108
- match << @quote + @char
109
- else
110
- bomb "invalid character after quote"
111
- end
115
+ else @relax ? match << (@quote + @char) : bomb("invalid character after quote")
112
116
  end
113
117
  end
114
118
  match
@@ -119,6 +123,7 @@ class Censive < StringScanner
119
123
  end
120
124
  else # consume unquoted cell
121
125
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
126
+ match = @eq + match and @flag = nil if @flag == @eq
122
127
  @char = peek(1)
123
128
  @char == @sep and @flag = @es and next_char
124
129
  match
@@ -132,12 +137,12 @@ class Censive < StringScanner
132
137
  # ==[ Parser ]==
133
138
 
134
139
  def parse
135
- @rows ||= []
140
+ @rows = []
136
141
  while row = next_row
137
142
  @rows << row
138
- size = row.size
139
- @cols = size if size > @cols
140
- @cells += size
143
+ count = row.size
144
+ @cols = count if count > @cols
145
+ @cells += count
141
146
  end
142
147
  @rows
143
148
  end
@@ -151,28 +156,34 @@ class Censive < StringScanner
151
156
 
152
157
  # ==[ Helpers ]==
153
158
 
154
- # grok returns: 2 for seps and quotes, 1 for seps only, and 0 for neither
159
+ # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
155
160
  def grok(str)
156
- if pos = str.index(/(#{@quote})|#{@sep}/o)
161
+ if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
157
162
  $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
158
163
  else
159
164
  0
160
165
  end
161
166
  end
162
167
 
168
+ # output a row
163
169
  def <<(row)
164
170
  @out or return super
165
171
 
166
- # drop trailing seps, if specified
172
+ # drop trailing empty columns
167
173
  row.pop while row.last.empty? if @drop
168
174
 
169
- # most compact export format
175
+ #!# FIXME: Excel output needs to protect 0-leading numbers
176
+
170
177
  s,q = @sep, @quote
171
178
  out = case @mode
172
179
  when :compact
173
180
  case grok(row.join)
174
- when 0 then row
175
- when 1 then row.map {|col| col.include?(s) ? "#{q}#{col}#{q}" : col }
181
+ when 0
182
+ row
183
+ when 1
184
+ row.map do |col|
185
+ col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
186
+ end
176
187
  else
177
188
  row.map do |col|
178
189
  case grok(col)
@@ -186,7 +197,7 @@ class Censive < StringScanner
186
197
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
187
198
  end.join(s)
188
199
 
189
- # write output, using desired line endings
200
+ # add line ending
190
201
  @out << out + @eol
191
202
  end
192
203
 
@@ -195,6 +206,11 @@ class Censive < StringScanner
195
206
  @rows.each {|row| yield row }
196
207
  end
197
208
 
209
+ def export(...)
210
+ out = self.class.writer(...)
211
+ each {|row| out << row }
212
+ end
213
+
198
214
  def stats
199
215
  wide = string.size.to_s.size
200
216
  puts "%#{wide}d rows" % @rows.size
@@ -204,9 +220,35 @@ class Censive < StringScanner
204
220
  end
205
221
  end
206
222
 
223
+ # ==[ Command line ]==
224
+
225
+ if __FILE__ == $0
226
+ raw = DATA.gets("\n\n").chomp
227
+ csv = Censive.new(raw, excel: true)
228
+ csv.export # (sep: "\t", excel: true)
229
+ end
230
+
207
231
  __END__
232
+ Name,Age,Shoe
233
+ Alice,27,5
234
+ Bob,33,10 1/2
235
+ Charlie or "Chuck",=B2 + B3,9
236
+ "Doug E Fresh",="007",10
237
+ Subtotal,=sum(B2:B5),="01234"
238
+
239
+
240
+
241
+
242
+ path = '../test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv'
243
+ data = File.read(path)
244
+
245
+ out = Censive.writer
208
246
 
209
- # ==[ Playground... ]==
247
+ Censive.new(data, sep: "\t", quote: "'").each do |row|
248
+ p row
249
+ end
250
+
251
+ Censive.reader(path, sep: "\t", quote: "'").each {|r| p r}
210
252
 
211
253
  # STDOUT.sync = true
212
254
  #
@@ -219,14 +261,14 @@ __END__
219
261
  # end
220
262
  #
221
263
  # __END__
222
-
223
- ARGV << "z.csv" if ARGV.empty?
224
-
225
- path = ARGV.first
226
- data = File.read(path)
227
-
228
- csv = Censive.new(data)
229
-
230
- data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
231
-
232
- csv.stats
264
+ #
265
+ # ARGV << "z.csv" if ARGV.empty?
266
+ #
267
+ # path = ARGV.first
268
+ # data = File.read(path)
269
+ #
270
+ # csv = Censive.new(data)
271
+ #
272
+ # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
273
+ #
274
+ # csv.stats
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.8'
4
+ version: '0.10'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-31 00:00:00.000000000 Z
11
+ date: 2023-02-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A quick and lightweight CSV handling library for Ruby
14
14
  email: steve.shreeve@gmail.com
@@ -20,7 +20,7 @@ files:
20
20
  - README.md
21
21
  - censive.gemspec
22
22
  - lib/censive.rb
23
- - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.csv
23
+ - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
24
24
  homepage: https://github.com/shreeve/censive
25
25
  licenses:
26
26
  - MIT