censive 0.8 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6419efcdc9274ea8bcf7b8527001e33f8bdfea348dfd911686cab36984d507da
4
- data.tar.gz: 3b59aead54517fd64d7ece3eaa6f459e301e1e48f1ae34772a7128c61fb739f2
3
+ metadata.gz: f4b06c1c42b5f813f8901c4e7240cdd43df1ccc22cf87327dc3ed7d850720eb4
4
+ data.tar.gz: 97ab27b79eead81517fa28a4c51923fa02ec2fa95922f6f61dc509c7a4890b2e
5
5
  SHA512:
6
- metadata.gz: 7910c09e76a81ed27870ea52fb6c8aea0316ed213c53a026d98adc64f93349477e6acab0a93b88c6f184ce1d317634ecdca9290d50bff9b117b98bedd3ac7b86
7
- data.tar.gz: 358ab985947d486b5f486b1f7e9c1f591e3b8e906b9eab59a4ed151e5f5d9652c211f2d2a4ee36f0543227e2ae5e33ba57f1e4c178f6f7e72e05c14d7b46895f
6
+ metadata.gz: a2f297ac516f5e01510a9ceb90cdb2cc1e782ff97a4f67515d73f6d56d8512cd4d9cbb5d04425bcdb8a7a5cdb63aeb2835e7bed2a76dcc149dae0bd63c4cc17b
7
+ data.tar.gz: 85762c69bc669db5a48f0e3b58e4319afdc9f1765cc18cd2b6c9501aaaccf3e41dddb58d3654bc4c76242632eaf8988f348bb652c6c40105e0373b8afdf463d3
data/README.md CHANGED
@@ -2,7 +2,58 @@
2
2
 
3
3
  A quick and lightweight CSV handling library for Ruby
4
4
 
5
- ## Writing CSV
5
+ ## Example
6
+
7
+ ```ruby
8
+ #!/usr/bin/env ruby
9
+
10
+ STDOUT.sync = true
11
+
12
+ require 'censive'
13
+ require 'fileutils'
14
+
15
+ abort "usage: #{File.basename($0)} <files>" if ARGV.empty?
16
+
17
+ rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
18
+
19
+ rows = []
20
+ cols = []
21
+ coun = 0
22
+ full = 0
23
+
24
+ ARGV.each do |path|
25
+ File.file?(path) or next
26
+
27
+ print "Processing #{path.inspect}"
28
+
29
+ rows.clear
30
+ cols.clear
31
+ seen = 0
32
+ coun += 1
33
+
34
+ dest = "#{path}-#{rand}"
35
+
36
+ begin
37
+ Censive.writer(dest) do |file|
38
+ Censive.reader(path, excel: true, relax: true).each do |cols|
39
+ file << cols
40
+ seen += 1
41
+ print "." if (seen % 1e5) == 0 # give a status update every so often
42
+ end
43
+ end
44
+ FileUtils.mv(dest, path)
45
+ full += (seen - 1)
46
+ puts " (#{seen - 1} rows of data)"
47
+ rescue
48
+ puts " - unable to process (#{$!})"
49
+ FileUtils.rm_f(dest)
50
+ end
51
+ end
52
+
53
+ puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
54
+ ```
55
+
56
+ ## Convert a CSV file to a TSV file
6
57
 
7
58
  ```ruby
8
59
  require 'censive'
@@ -12,7 +63,7 @@ data = File.read('data.csv')
12
63
 
13
64
  # write out a tab-separated tsv file
14
65
  Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
15
- Censive.new(data).each do |row|
66
+ Censive.new(data, excel: true, relax: true).each do |row|
16
67
  out << row
17
68
  end
18
69
  end
data/censive.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.8"
5
+ s.version = "0.10"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
8
  s.summary = "A quick and lightweight CSV handling library for Ruby"
data/lib/censive.rb CHANGED
@@ -6,7 +6,8 @@
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
7
  # Date: Jan 30, 2023
8
8
  #
9
- # Thanks: Crystal's CSV library, see https://crystal-lang.org/api/1.7.2/CSV.html
9
+ # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
10
11
  # ==============================================================================
11
12
  # The goals are:
12
13
  #
@@ -17,7 +18,8 @@
17
18
  #
18
19
  # 1. Option to support IO streaming
19
20
  # 2. Option to strip whitespace
20
- # 3. Confirm file encodings such as UTF-8, UTF-16, etc.
21
+ # 3. Option to support headers in readers and writers
22
+ # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
21
23
  #
22
24
  # NOTE: Only getch and scan_until advance strscan's position
23
25
  # ==============================================================================
@@ -26,36 +28,36 @@ require 'strscan'
26
28
 
27
29
  class Censive < StringScanner
28
30
 
29
- def self.writer(path, **opts)
30
- File.open(path, 'w') do |file|
31
- yield new(out: file, **opts)
31
+ def self.writer(obj=$stdout, **opts, &code)
32
+ case obj
33
+ when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
34
+ when IO then new(out: obj, **opts, &code)
35
+ else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
32
36
  end
33
37
  end
34
38
 
35
39
  def initialize(str=nil,
36
- sep: ',' , # column separator character
37
- quote: '"' , # quote character
38
-
39
- drop: false , # enable to drop trailing separators
40
- eol: "\n" , # desired line endings for exports
41
- excel: false , # allow ,="0123" style columns
40
+ drop: false , # drop trailing empty fields?
41
+ eol: "\n" , # line endings for exports
42
+ excel: false , # literals(="01") formulas(=A1 + B2); http://bit.ly/3Y7jIvc
42
43
  mode: :compact, # export mode: compact or full
43
- out: nil , # output IO/file
44
- relax: false , # relax parsing of quotes
45
-
44
+ out: nil , # output stream, needs to respond to <<
45
+ quote: '"' , # quote character
46
+ relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
47
+ sep: ',' , # column separator character
46
48
  **opts # grab bag
47
49
  )
48
50
  super(str || '')
49
51
  reset
50
52
 
51
- @sep = sep .freeze
52
- @quote = quote.freeze
53
-
54
53
  @drop = drop
55
- @eol = eol.freeze
54
+ @eol = eol .freeze #!# TODO: are the '.freeze' statements helpful?
55
+ @excel = excel
56
56
  @mode = mode
57
57
  @out = out
58
+ @quote = quote.freeze
58
59
  @relax = relax
60
+ @sep = sep .freeze
59
61
 
60
62
  @es = "" .freeze
61
63
  @cr = "\r" .freeze
@@ -64,7 +66,6 @@ class Censive < StringScanner
64
66
  @esc = (@quote * 2).freeze
65
67
 
66
68
  @tokens = [@sep,@quote,@cr,@lf,@es,nil]
67
- @tokens << @eq if excel # See http://bit.ly/3Y7jIvc
68
69
  end
69
70
 
70
71
  def reset(str=nil)
@@ -85,30 +86,33 @@ class Censive < StringScanner
85
86
  end
86
87
 
87
88
  def next_token
89
+
90
+ # process and clear @flag
88
91
  case @flag
89
92
  when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
90
93
  when @cr then @flag = nil; next_char == @lf and next_char
91
94
  when @lf then @flag = nil; next_char
95
+ else @flag = nil
92
96
  end if @flag
93
97
 
98
+ # Excel literals ="0123" and formulas =A1 + B2 (see http://bit.ly/3Y7jIvc)
99
+ if @excel && @char == @eq
100
+ @flag = @eq
101
+ next_char
102
+ end
103
+
94
104
  if @tokens.include?(@char)
95
105
  case @char
96
- when @quote, @eq # consume quoted cell
97
- @char == @eq and next_char # excel mode: allows ,="012",
106
+ when @quote # consume quoted cell
98
107
  match = ""
99
108
  while true
100
- getch # consume the quote (optimized by not calling next_char)
109
+ getch # move past the quote that got us here
101
110
  match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
102
111
  case next_char
103
112
  when @sep then @flag = @es; next_char; break
104
113
  when @quote then match << @quote
105
114
  when @cr,@lf,@es,nil then break
106
- else
107
- if @relax
108
- match << @quote + @char
109
- else
110
- bomb "invalid character after quote"
111
- end
115
+ else @relax ? match << (@quote + @char) : bomb("invalid character after quote")
112
116
  end
113
117
  end
114
118
  match
@@ -119,6 +123,7 @@ class Censive < StringScanner
119
123
  end
120
124
  else # consume unquoted cell
121
125
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
126
+ match = @eq + match and @flag = nil if @flag == @eq
122
127
  @char = peek(1)
123
128
  @char == @sep and @flag = @es and next_char
124
129
  match
@@ -132,12 +137,12 @@ class Censive < StringScanner
132
137
  # ==[ Parser ]==
133
138
 
134
139
  def parse
135
- @rows ||= []
140
+ @rows = []
136
141
  while row = next_row
137
142
  @rows << row
138
- size = row.size
139
- @cols = size if size > @cols
140
- @cells += size
143
+ count = row.size
144
+ @cols = count if count > @cols
145
+ @cells += count
141
146
  end
142
147
  @rows
143
148
  end
@@ -151,28 +156,34 @@ class Censive < StringScanner
151
156
 
152
157
  # ==[ Helpers ]==
153
158
 
154
- # grok returns: 2 for seps and quotes, 1 for seps only, and 0 for neither
159
+ # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
155
160
  def grok(str)
156
- if pos = str.index(/(#{@quote})|#{@sep}/o)
161
+ if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
157
162
  $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
158
163
  else
159
164
  0
160
165
  end
161
166
  end
162
167
 
168
+ # output a row
163
169
  def <<(row)
164
170
  @out or return super
165
171
 
166
- # drop trailing seps, if specified
172
+ # drop trailing empty columns
167
173
  row.pop while row.last.empty? if @drop
168
174
 
169
- # most compact export format
175
+ #!# FIXME: Excel output needs to protect 0-leading numbers
176
+
170
177
  s,q = @sep, @quote
171
178
  out = case @mode
172
179
  when :compact
173
180
  case grok(row.join)
174
- when 0 then row
175
- when 1 then row.map {|col| col.include?(s) ? "#{q}#{col}#{q}" : col }
181
+ when 0
182
+ row
183
+ when 1
184
+ row.map do |col|
185
+ col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
186
+ end
176
187
  else
177
188
  row.map do |col|
178
189
  case grok(col)
@@ -186,7 +197,7 @@ class Censive < StringScanner
186
197
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
187
198
  end.join(s)
188
199
 
189
- # write output, using desired line endings
200
+ # add line ending
190
201
  @out << out + @eol
191
202
  end
192
203
 
@@ -195,6 +206,11 @@ class Censive < StringScanner
195
206
  @rows.each {|row| yield row }
196
207
  end
197
208
 
209
+ def export(...)
210
+ out = self.class.writer(...)
211
+ each {|row| out << row }
212
+ end
213
+
198
214
  def stats
199
215
  wide = string.size.to_s.size
200
216
  puts "%#{wide}d rows" % @rows.size
@@ -204,9 +220,35 @@ class Censive < StringScanner
204
220
  end
205
221
  end
206
222
 
223
+ # ==[ Command line ]==
224
+
225
+ if __FILE__ == $0
226
+ raw = DATA.gets("\n\n").chomp
227
+ csv = Censive.new(raw, excel: true)
228
+ csv.export # (sep: "\t", excel: true)
229
+ end
230
+
207
231
  __END__
232
+ Name,Age,Shoe
233
+ Alice,27,5
234
+ Bob,33,10 1/2
235
+ Charlie or "Chuck",=B2 + B3,9
236
+ "Doug E Fresh",="007",10
237
+ Subtotal,=sum(B2:B5),="01234"
238
+
239
+
240
+
241
+
242
+ path = '../test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv'
243
+ data = File.read(path)
244
+
245
+ out = Censive.writer
208
246
 
209
- # ==[ Playground... ]==
247
+ Censive.new(data, sep: "\t", quote: "'").each do |row|
248
+ p row
249
+ end
250
+
251
+ Censive.reader(path, sep: "\t", quote: "'").each {|r| p r}
210
252
 
211
253
  # STDOUT.sync = true
212
254
  #
@@ -219,14 +261,14 @@ __END__
219
261
  # end
220
262
  #
221
263
  # __END__
222
-
223
- ARGV << "z.csv" if ARGV.empty?
224
-
225
- path = ARGV.first
226
- data = File.read(path)
227
-
228
- csv = Censive.new(data)
229
-
230
- data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
231
-
232
- csv.stats
264
+ #
265
+ # ARGV << "z.csv" if ARGV.empty?
266
+ #
267
+ # path = ARGV.first
268
+ # data = File.read(path)
269
+ #
270
+ # csv = Censive.new(data)
271
+ #
272
+ # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
273
+ #
274
+ # csv.stats
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.8'
4
+ version: '0.10'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-31 00:00:00.000000000 Z
11
+ date: 2023-02-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: A quick and lightweight CSV handling library for Ruby
14
14
  email: steve.shreeve@gmail.com
@@ -20,7 +20,7 @@ files:
20
20
  - README.md
21
21
  - censive.gemspec
22
22
  - lib/censive.rb
23
- - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.csv
23
+ - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
24
24
  homepage: https://github.com/shreeve/censive
25
25
  licenses:
26
26
  - MIT