censive 0.8 → 0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4b06c1c42b5f813f8901c4e7240cdd43df1ccc22cf87327dc3ed7d850720eb4
|
4
|
+
data.tar.gz: 97ab27b79eead81517fa28a4c51923fa02ec2fa95922f6f61dc509c7a4890b2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a2f297ac516f5e01510a9ceb90cdb2cc1e782ff97a4f67515d73f6d56d8512cd4d9cbb5d04425bcdb8a7a5cdb63aeb2835e7bed2a76dcc149dae0bd63c4cc17b
|
7
|
+
data.tar.gz: 85762c69bc669db5a48f0e3b58e4319afdc9f1765cc18cd2b6c9501aaaccf3e41dddb58d3654bc4c76242632eaf8988f348bb652c6c40105e0373b8afdf463d3
|
data/README.md
CHANGED
@@ -2,7 +2,58 @@
|
|
2
2
|
|
3
3
|
A quick and lightweight CSV handling library for Ruby
|
4
4
|
|
5
|
-
##
|
5
|
+
## Example
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
#!/usr/bin/env ruby
|
9
|
+
|
10
|
+
STDOUT.sync = true
|
11
|
+
|
12
|
+
require 'censive'
|
13
|
+
require 'fileutils'
|
14
|
+
|
15
|
+
abort "usage: #{File.basename($0)} <files>" if ARGV.empty?
|
16
|
+
|
17
|
+
rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
|
18
|
+
|
19
|
+
rows = []
|
20
|
+
cols = []
|
21
|
+
coun = 0
|
22
|
+
full = 0
|
23
|
+
|
24
|
+
ARGV.each do |path|
|
25
|
+
File.file?(path) or next
|
26
|
+
|
27
|
+
print "Processing #{path.inspect}"
|
28
|
+
|
29
|
+
rows.clear
|
30
|
+
cols.clear
|
31
|
+
seen = 0
|
32
|
+
coun += 1
|
33
|
+
|
34
|
+
dest = "#{path}-#{rand}"
|
35
|
+
|
36
|
+
begin
|
37
|
+
Censive.writer(dest) do |file|
|
38
|
+
Censive.reader(path, excel: true, relax: true).each do |cols|
|
39
|
+
file << cols
|
40
|
+
seen += 1
|
41
|
+
print "." if (seen % 1e5) == 0 # give a status update every so often
|
42
|
+
end
|
43
|
+
end
|
44
|
+
FileUtils.mv(dest, path)
|
45
|
+
full += (seen - 1)
|
46
|
+
puts " (#{seen - 1} rows of data)"
|
47
|
+
rescue
|
48
|
+
puts " - unable to process (#{$!})"
|
49
|
+
FileUtils.rm_f(dest)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
|
54
|
+
```
|
55
|
+
|
56
|
+
## Convert a CSV file to a TSV file
|
6
57
|
|
7
58
|
```ruby
|
8
59
|
require 'censive'
|
@@ -12,7 +63,7 @@ data = File.read('data.csv')
|
|
12
63
|
|
13
64
|
# write out a tab-separated tsv file
|
14
65
|
Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
|
15
|
-
Censive.new(data).each do |row|
|
66
|
+
Censive.new(data, excel: true, relax: true).each do |row|
|
16
67
|
out << row
|
17
68
|
end
|
18
69
|
end
|
data/censive.gemspec
CHANGED
data/lib/censive.rb
CHANGED
@@ -6,7 +6,8 @@
|
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
7
|
# Date: Jan 30, 2023
|
8
8
|
#
|
9
|
-
# Thanks
|
9
|
+
# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
+
# and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
10
11
|
# ==============================================================================
|
11
12
|
# The goals are:
|
12
13
|
#
|
@@ -17,7 +18,8 @@
|
|
17
18
|
#
|
18
19
|
# 1. Option to support IO streaming
|
19
20
|
# 2. Option to strip whitespace
|
20
|
-
# 3.
|
21
|
+
# 3. Option to support headers in readers and writers
|
22
|
+
# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
|
21
23
|
#
|
22
24
|
# NOTE: Only getch and scan_until advance strscan's position
|
23
25
|
# ==============================================================================
|
@@ -26,36 +28,36 @@ require 'strscan'
|
|
26
28
|
|
27
29
|
class Censive < StringScanner
|
28
30
|
|
29
|
-
def self.writer(
|
30
|
-
|
31
|
-
|
31
|
+
def self.writer(obj=$stdout, **opts, &code)
|
32
|
+
case obj
|
33
|
+
when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
|
34
|
+
when IO then new(out: obj, **opts, &code)
|
35
|
+
else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
|
32
36
|
end
|
33
37
|
end
|
34
38
|
|
35
39
|
def initialize(str=nil,
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
drop: false , # enable to drop trailing separators
|
40
|
-
eol: "\n" , # desired line endings for exports
|
41
|
-
excel: false , # allow ,="0123" style columns
|
40
|
+
drop: false , # drop trailing empty fields?
|
41
|
+
eol: "\n" , # line endings for exports
|
42
|
+
excel: false , # literals(="01") formulas(=A1 + B2); http://bit.ly/3Y7jIvc
|
42
43
|
mode: :compact, # export mode: compact or full
|
43
|
-
out: nil , # output
|
44
|
-
|
45
|
-
|
44
|
+
out: nil , # output stream, needs to respond to <<
|
45
|
+
quote: '"' , # quote character
|
46
|
+
relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
|
47
|
+
sep: ',' , # column separator character
|
46
48
|
**opts # grab bag
|
47
49
|
)
|
48
50
|
super(str || '')
|
49
51
|
reset
|
50
52
|
|
51
|
-
@sep = sep .freeze
|
52
|
-
@quote = quote.freeze
|
53
|
-
|
54
53
|
@drop = drop
|
55
|
-
@eol = eol.freeze
|
54
|
+
@eol = eol .freeze #!# TODO: are the '.freeze' statements helpful?
|
55
|
+
@excel = excel
|
56
56
|
@mode = mode
|
57
57
|
@out = out
|
58
|
+
@quote = quote.freeze
|
58
59
|
@relax = relax
|
60
|
+
@sep = sep .freeze
|
59
61
|
|
60
62
|
@es = "" .freeze
|
61
63
|
@cr = "\r" .freeze
|
@@ -64,7 +66,6 @@ class Censive < StringScanner
|
|
64
66
|
@esc = (@quote * 2).freeze
|
65
67
|
|
66
68
|
@tokens = [@sep,@quote,@cr,@lf,@es,nil]
|
67
|
-
@tokens << @eq if excel # See http://bit.ly/3Y7jIvc
|
68
69
|
end
|
69
70
|
|
70
71
|
def reset(str=nil)
|
@@ -85,30 +86,33 @@ class Censive < StringScanner
|
|
85
86
|
end
|
86
87
|
|
87
88
|
def next_token
|
89
|
+
|
90
|
+
# process and clear @flag
|
88
91
|
case @flag
|
89
92
|
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
90
93
|
when @cr then @flag = nil; next_char == @lf and next_char
|
91
94
|
when @lf then @flag = nil; next_char
|
95
|
+
else @flag = nil
|
92
96
|
end if @flag
|
93
97
|
|
98
|
+
# Excel literals ="0123" and formulas =A1 + B2 (see http://bit.ly/3Y7jIvc)
|
99
|
+
if @excel && @char == @eq
|
100
|
+
@flag = @eq
|
101
|
+
next_char
|
102
|
+
end
|
103
|
+
|
94
104
|
if @tokens.include?(@char)
|
95
105
|
case @char
|
96
|
-
when @quote
|
97
|
-
@char == @eq and next_char # excel mode: allows ,="012",
|
106
|
+
when @quote # consume quoted cell
|
98
107
|
match = ""
|
99
108
|
while true
|
100
|
-
getch #
|
109
|
+
getch # move past the quote that got us here
|
101
110
|
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
102
111
|
case next_char
|
103
112
|
when @sep then @flag = @es; next_char; break
|
104
113
|
when @quote then match << @quote
|
105
114
|
when @cr,@lf,@es,nil then break
|
106
|
-
else
|
107
|
-
if @relax
|
108
|
-
match << @quote + @char
|
109
|
-
else
|
110
|
-
bomb "invalid character after quote"
|
111
|
-
end
|
115
|
+
else @relax ? match << (@quote + @char) : bomb("invalid character after quote")
|
112
116
|
end
|
113
117
|
end
|
114
118
|
match
|
@@ -119,6 +123,7 @@ class Censive < StringScanner
|
|
119
123
|
end
|
120
124
|
else # consume unquoted cell
|
121
125
|
match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
|
126
|
+
match = @eq + match and @flag = nil if @flag == @eq
|
122
127
|
@char = peek(1)
|
123
128
|
@char == @sep and @flag = @es and next_char
|
124
129
|
match
|
@@ -132,12 +137,12 @@ class Censive < StringScanner
|
|
132
137
|
# ==[ Parser ]==
|
133
138
|
|
134
139
|
def parse
|
135
|
-
@rows
|
140
|
+
@rows = []
|
136
141
|
while row = next_row
|
137
142
|
@rows << row
|
138
|
-
|
139
|
-
@cols =
|
140
|
-
@cells +=
|
143
|
+
count = row.size
|
144
|
+
@cols = count if count > @cols
|
145
|
+
@cells += count
|
141
146
|
end
|
142
147
|
@rows
|
143
148
|
end
|
@@ -151,28 +156,34 @@ class Censive < StringScanner
|
|
151
156
|
|
152
157
|
# ==[ Helpers ]==
|
153
158
|
|
154
|
-
# grok returns: 2
|
159
|
+
# grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
155
160
|
def grok(str)
|
156
|
-
if pos = str.index(/(#{@quote})|#{@sep}/o)
|
161
|
+
if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
|
157
162
|
$1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
|
158
163
|
else
|
159
164
|
0
|
160
165
|
end
|
161
166
|
end
|
162
167
|
|
168
|
+
# output a row
|
163
169
|
def <<(row)
|
164
170
|
@out or return super
|
165
171
|
|
166
|
-
# drop trailing
|
172
|
+
# drop trailing empty columns
|
167
173
|
row.pop while row.last.empty? if @drop
|
168
174
|
|
169
|
-
|
175
|
+
#!# FIXME: Excel output needs to protect 0-leading numbers
|
176
|
+
|
170
177
|
s,q = @sep, @quote
|
171
178
|
out = case @mode
|
172
179
|
when :compact
|
173
180
|
case grok(row.join)
|
174
|
-
when 0
|
175
|
-
|
181
|
+
when 0
|
182
|
+
row
|
183
|
+
when 1
|
184
|
+
row.map do |col|
|
185
|
+
col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
|
186
|
+
end
|
176
187
|
else
|
177
188
|
row.map do |col|
|
178
189
|
case grok(col)
|
@@ -186,7 +197,7 @@ class Censive < StringScanner
|
|
186
197
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
187
198
|
end.join(s)
|
188
199
|
|
189
|
-
#
|
200
|
+
# add line ending
|
190
201
|
@out << out + @eol
|
191
202
|
end
|
192
203
|
|
@@ -195,6 +206,11 @@ class Censive < StringScanner
|
|
195
206
|
@rows.each {|row| yield row }
|
196
207
|
end
|
197
208
|
|
209
|
+
def export(...)
|
210
|
+
out = self.class.writer(...)
|
211
|
+
each {|row| out << row }
|
212
|
+
end
|
213
|
+
|
198
214
|
def stats
|
199
215
|
wide = string.size.to_s.size
|
200
216
|
puts "%#{wide}d rows" % @rows.size
|
@@ -204,9 +220,35 @@ class Censive < StringScanner
|
|
204
220
|
end
|
205
221
|
end
|
206
222
|
|
223
|
+
# ==[ Command line ]==
|
224
|
+
|
225
|
+
if __FILE__ == $0
|
226
|
+
raw = DATA.gets("\n\n").chomp
|
227
|
+
csv = Censive.new(raw, excel: true)
|
228
|
+
csv.export # (sep: "\t", excel: true)
|
229
|
+
end
|
230
|
+
|
207
231
|
__END__
|
232
|
+
Name,Age,Shoe
|
233
|
+
Alice,27,5
|
234
|
+
Bob,33,10 1/2
|
235
|
+
Charlie or "Chuck",=B2 + B3,9
|
236
|
+
"Doug E Fresh",="007",10
|
237
|
+
Subtotal,=sum(B2:B5),="01234"
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
path = '../test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv'
|
243
|
+
data = File.read(path)
|
244
|
+
|
245
|
+
out = Censive.writer
|
208
246
|
|
209
|
-
|
247
|
+
Censive.new(data, sep: "\t", quote: "'").each do |row|
|
248
|
+
p row
|
249
|
+
end
|
250
|
+
|
251
|
+
Censive.reader(path, sep: "\t", quote: "'").each {|r| p r}
|
210
252
|
|
211
253
|
# STDOUT.sync = true
|
212
254
|
#
|
@@ -219,14 +261,14 @@ __END__
|
|
219
261
|
# end
|
220
262
|
#
|
221
263
|
# __END__
|
222
|
-
|
223
|
-
ARGV << "z.csv" if ARGV.empty?
|
224
|
-
|
225
|
-
path = ARGV.first
|
226
|
-
data = File.read(path)
|
227
|
-
|
228
|
-
csv = Censive.new(data)
|
229
|
-
|
230
|
-
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
231
|
-
|
232
|
-
csv.stats
|
264
|
+
#
|
265
|
+
# ARGV << "z.csv" if ARGV.empty?
|
266
|
+
#
|
267
|
+
# path = ARGV.first
|
268
|
+
# data = File.read(path)
|
269
|
+
#
|
270
|
+
# csv = Censive.new(data)
|
271
|
+
#
|
272
|
+
# data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
273
|
+
#
|
274
|
+
# csv.stats
|
File without changes
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: censive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.10'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steve Shreeve
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A quick and lightweight CSV handling library for Ruby
|
14
14
|
email: steve.shreeve@gmail.com
|
@@ -20,7 +20,7 @@ files:
|
|
20
20
|
- README.md
|
21
21
|
- censive.gemspec
|
22
22
|
- lib/censive.rb
|
23
|
-
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.
|
23
|
+
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
|
24
24
|
homepage: https://github.com/shreeve/censive
|
25
25
|
licenses:
|
26
26
|
- MIT
|