censive 0.8 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4b06c1c42b5f813f8901c4e7240cdd43df1ccc22cf87327dc3ed7d850720eb4
|
4
|
+
data.tar.gz: 97ab27b79eead81517fa28a4c51923fa02ec2fa95922f6f61dc509c7a4890b2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a2f297ac516f5e01510a9ceb90cdb2cc1e782ff97a4f67515d73f6d56d8512cd4d9cbb5d04425bcdb8a7a5cdb63aeb2835e7bed2a76dcc149dae0bd63c4cc17b
|
7
|
+
data.tar.gz: 85762c69bc669db5a48f0e3b58e4319afdc9f1765cc18cd2b6c9501aaaccf3e41dddb58d3654bc4c76242632eaf8988f348bb652c6c40105e0373b8afdf463d3
|
data/README.md
CHANGED
@@ -2,7 +2,58 @@
|
|
2
2
|
|
3
3
|
A quick and lightweight CSV handling library for Ruby
|
4
4
|
|
5
|
-
##
|
5
|
+
## Example
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
#!/usr/bin/env ruby

# Rewrite each CSV file named on the command line in place, passing every row
# through Censive (Excel-style cells accepted, relaxed quote parsing).

STDOUT.sync = true

require 'censive'
require 'fileutils'
require 'securerandom'

abort "usage: #{File.basename($0)} <files>" if ARGV.empty?

# Random suffix for the temporary output files. SecureRandom is portable and
# avoids shelling out; the name also no longer shadows Kernel#rand.
suffix = SecureRandom.alphanumeric(12)

files = 0 # number of files attempted
total = 0 # rows of data across all files

ARGV.each do |path|
  File.file?(path) or next

  print "Processing #{path.inspect}"

  seen   = 0
  files += 1

  # write to a sibling temp file first, then move it over the original
  dest = "#{path}-#{suffix}"

  begin
    Censive.writer(dest) do |file|
      Censive.reader(path, excel: true, relax: true).each do |cols|
        file << cols
        seen += 1
        print "." if (seen % 100_000) == 0 # give a status update every so often
      end
    end
    FileUtils.mv(dest, path)

    # exclude the first (presumably header) row; never report a negative count
    data = [seen - 1, 0].max
    total += data
    puts " (#{data} rows of data)"
  rescue => e
    puts " - unable to process (#{e})"
    FileUtils.rm_f(dest) # leave the original file untouched on failure
  end
end

puts "Processed #{files} files with a total of #{total} rows of data" if files > 1
|
54
|
+
```
|
55
|
+
|
56
|
+
## Convert a CSV file to a TSV file
|
6
57
|
|
7
58
|
```ruby
|
8
59
|
require 'censive'
|
@@ -12,7 +63,7 @@ data = File.read('data.csv')
|
|
12
63
|
|
13
64
|
# write out a tab-separated tsv file
|
14
65
|
Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
|
15
|
-
Censive.new(data).each do |row|
|
66
|
+
Censive.new(data, excel: true, relax: true).each do |row|
|
16
67
|
out << row
|
17
68
|
end
|
18
69
|
end
|
data/censive.gemspec
CHANGED
data/lib/censive.rb
CHANGED
@@ -6,7 +6,8 @@
|
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
7
|
# Date: Jan 30, 2023
|
8
8
|
#
|
9
|
-
# Thanks
|
9
|
+
# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
+
# and also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
10
11
|
# ==============================================================================
|
11
12
|
# The goals are:
|
12
13
|
#
|
@@ -17,7 +18,8 @@
|
|
17
18
|
#
|
18
19
|
# 1. Option to support IO streaming
|
19
20
|
# 2. Option to strip whitespace
|
20
|
-
# 3.
|
21
|
+
# 3. Option to support headers in readers and writers
|
22
|
+
# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
|
21
23
|
#
|
22
24
|
# NOTE: Only getch and scan_until advance strscan's position
|
23
25
|
# ==============================================================================
|
@@ -26,36 +28,36 @@ require 'strscan'
|
|
26
28
|
|
27
29
|
class Censive < StringScanner
|
28
30
|
|
29
|
-
def self.writer(
|
30
|
-
|
31
|
-
|
31
|
+
# Build a writer that sends rows to +obj+.
#
# obj  - a String path (opened for writing, closed when the block returns)
#        or an IO used as-is; defaults to $stdout.
# opts - keyword options forwarded to Censive.new (sep:, quote:, mode:, ...).
# code - optional block that receives the writer.
#
# Returns the block's value for a String path, or the writer for an IO.
# Raises ArgumentError for a String path without a block, since the file is
# only open for the duration of the block. Aborts on any other object type.
def self.writer(obj=$stdout, **opts, &code)
  case obj
  when String
    raise ArgumentError, "a block is required when writing to a file path" unless code

    # open the path given in obj and write to the opened file, not the path
    File.open(obj, 'w') {|file| code.call(new(out: file, **opts)) }
  when IO
    # the block is called here; passing it on to new would silently drop it
    new(out: obj, **opts).tap {|csv| code&.call(csv) }
  else
    abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
  end
end
|
34
38
|
|
35
39
|
def initialize(str=nil,
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
drop: false , # enable to drop trailing separators
|
40
|
-
eol: "\n" , # desired line endings for exports
|
41
|
-
excel: false , # allow ,="0123" style columns
|
40
|
+
drop: false , # drop trailing empty fields?
|
41
|
+
eol: "\n" , # line endings for exports
|
42
|
+
excel: false , # literals(="01") formulas(=A1 + B2); http://bit.ly/3Y7jIvc
|
42
43
|
mode: :compact, # export mode: compact or full
|
43
|
-
out: nil , # output
|
44
|
-
|
45
|
-
|
44
|
+
out: nil , # output stream, needs to respond to <<
|
45
|
+
quote: '"' , # quote character
|
46
|
+
relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
|
47
|
+
sep: ',' , # column separator character
|
46
48
|
**opts # grab bag
|
47
49
|
)
|
48
50
|
super(str || '')
|
49
51
|
reset
|
50
52
|
|
51
|
-
@sep = sep .freeze
|
52
|
-
@quote = quote.freeze
|
53
|
-
|
54
53
|
@drop = drop
|
55
|
-
@eol = eol.freeze
|
54
|
+
@eol = eol .freeze #!# TODO: are the '.freeze' statements helpful?
|
55
|
+
@excel = excel
|
56
56
|
@mode = mode
|
57
57
|
@out = out
|
58
|
+
@quote = quote.freeze
|
58
59
|
@relax = relax
|
60
|
+
@sep = sep .freeze
|
59
61
|
|
60
62
|
@es = "" .freeze
|
61
63
|
@cr = "\r" .freeze
|
@@ -64,7 +66,6 @@ class Censive < StringScanner
|
|
64
66
|
@esc = (@quote * 2).freeze
|
65
67
|
|
66
68
|
@tokens = [@sep,@quote,@cr,@lf,@es,nil]
|
67
|
-
@tokens << @eq if excel # See http://bit.ly/3Y7jIvc
|
68
69
|
end
|
69
70
|
|
70
71
|
def reset(str=nil)
|
@@ -85,30 +86,33 @@ class Censive < StringScanner
|
|
85
86
|
end
|
86
87
|
|
87
88
|
def next_token
|
89
|
+
|
90
|
+
# process and clear @flag
|
88
91
|
case @flag
|
89
92
|
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
90
93
|
when @cr then @flag = nil; next_char == @lf and next_char
|
91
94
|
when @lf then @flag = nil; next_char
|
95
|
+
else @flag = nil
|
92
96
|
end if @flag
|
93
97
|
|
98
|
+
# Excel literals ="0123" and formulas =A1 + B2 (see http://bit.ly/3Y7jIvc)
|
99
|
+
if @excel && @char == @eq
|
100
|
+
@flag = @eq
|
101
|
+
next_char
|
102
|
+
end
|
103
|
+
|
94
104
|
if @tokens.include?(@char)
|
95
105
|
case @char
|
96
|
-
when @quote
|
97
|
-
@char == @eq and next_char # excel mode: allows ,="012",
|
106
|
+
when @quote # consume quoted cell
|
98
107
|
match = ""
|
99
108
|
while true
|
100
|
-
getch #
|
109
|
+
getch # move past the quote that got us here
|
101
110
|
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
102
111
|
case next_char
|
103
112
|
when @sep then @flag = @es; next_char; break
|
104
113
|
when @quote then match << @quote
|
105
114
|
when @cr,@lf,@es,nil then break
|
106
|
-
else
|
107
|
-
if @relax
|
108
|
-
match << @quote + @char
|
109
|
-
else
|
110
|
-
bomb "invalid character after quote"
|
111
|
-
end
|
115
|
+
else @relax ? match << (@quote + @char) : bomb("invalid character after quote")
|
112
116
|
end
|
113
117
|
end
|
114
118
|
match
|
@@ -119,6 +123,7 @@ class Censive < StringScanner
|
|
119
123
|
end
|
120
124
|
else # consume unquoted cell
|
121
125
|
match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
|
126
|
+
match = @eq + match and @flag = nil if @flag == @eq
|
122
127
|
@char = peek(1)
|
123
128
|
@char == @sep and @flag = @es and next_char
|
124
129
|
match
|
@@ -132,12 +137,12 @@ class Censive < StringScanner
|
|
132
137
|
# ==[ Parser ]==
|
133
138
|
|
134
139
|
# Collect rows from next_row until it returns a falsy value.
#
# Stores the rows in @rows, keeps @cols at the widest row seen so far, and
# adds each row's size to @cells. Returns the array of rows.
def parse
  @rows = []
  while (cells = next_row)
    width = cells.size
    @rows.push(cells)
    @cols = [@cols, width].max
    @cells += width
  end
  @rows
end
|
@@ -151,28 +156,34 @@ class Censive < StringScanner
|
|
151
156
|
|
152
157
|
# ==[ Helpers ]==
|
153
158
|
|
154
|
-
# grok returns: 2
|
159
|
+
# grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
155
160
|
# Classify how +str+ has to be written to remain a single valid cell.
#
# Returns 2 when it contains the quote character (quote it and double the
# embedded quotes), 1 when it contains the separator, a CR, or an LF but no
# quote (quote it), and 0 when it can be written bare.
#
# Plain substring checks replace the former interpolated /o regexps: /o
# interpolates only once per regexp literal for the whole process, so the
# first instance's separator and quote were reused by every later instance,
# and unescaped separators such as "|" or "." changed the pattern's meaning.
def grok(str)
  return 2 if str.include?(@quote)

  [@sep, @cr, @lf].any? {|token| str.include?(token) } ? 1 : 0
end
|
162
167
|
|
168
|
+
# output a row
|
163
169
|
def <<(row)
|
164
170
|
@out or return super
|
165
171
|
|
166
|
-
# drop trailing
|
172
|
+
# drop trailing empty columns
|
167
173
|
row.pop while row.last.empty? if @drop
|
168
174
|
|
169
|
-
|
175
|
+
#!# FIXME: Excel output needs to protect 0-leading numbers
|
176
|
+
|
170
177
|
s,q = @sep, @quote
|
171
178
|
out = case @mode
|
172
179
|
when :compact
|
173
180
|
case grok(row.join)
|
174
|
-
when 0
|
175
|
-
|
181
|
+
when 0
|
182
|
+
row
|
183
|
+
when 1
|
184
|
+
row.map do |col|
|
185
|
+
col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
|
186
|
+
end
|
176
187
|
else
|
177
188
|
row.map do |col|
|
178
189
|
case grok(col)
|
@@ -186,7 +197,7 @@ class Censive < StringScanner
|
|
186
197
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
187
198
|
end.join(s)
|
188
199
|
|
189
|
-
#
|
200
|
+
# add line ending
|
190
201
|
@out << out + @eol
|
191
202
|
end
|
192
203
|
|
@@ -195,6 +206,11 @@ class Censive < StringScanner
|
|
195
206
|
@rows.each {|row| yield row }
|
196
207
|
end
|
197
208
|
|
209
|
+
# Send every row yielded by each to a writer obtained from the class-level
# writer method; all arguments are forwarded to that method unchanged.
def export(...)
  writer = self.class.writer(...)
  each do |cells|
    writer << cells
  end
end
|
213
|
+
|
198
214
|
def stats
|
199
215
|
wide = string.size.to_s.size
|
200
216
|
puts "%#{wide}d rows" % @rows.size
|
@@ -204,9 +220,35 @@ class Censive < StringScanner
|
|
204
220
|
end
|
205
221
|
end
|
206
222
|
|
223
|
+
# ==[ Command line ]==
|
224
|
+
|
225
|
+
# When this file is run directly, parse the first paragraph of the embedded
# DATA section as Excel-flavored CSV and export it.
if $PROGRAM_NAME == __FILE__
  sample = DATA.gets("\n\n").chomp # read through the first blank line
  Censive.new(sample, excel: true).export # (sep: "\t", excel: true)
end
|
230
|
+
|
207
231
|
__END__
|
232
|
+
Name,Age,Shoe
|
233
|
+
Alice,27,5
|
234
|
+
Bob,33,10 1/2
|
235
|
+
Charlie or "Chuck",=B2 + B3,9
|
236
|
+
"Doug E Fresh",="007",10
|
237
|
+
Subtotal,=sum(B2:B5),="01234"
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
path = '../test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv'
|
243
|
+
data = File.read(path)
|
244
|
+
|
245
|
+
out = Censive.writer
|
208
246
|
|
209
|
-
|
247
|
+
Censive.new(data, sep: "\t", quote: "'").each do |row|
|
248
|
+
p row
|
249
|
+
end
|
250
|
+
|
251
|
+
Censive.reader(path, sep: "\t", quote: "'").each {|r| p r}
|
210
252
|
|
211
253
|
# STDOUT.sync = true
|
212
254
|
#
|
@@ -219,14 +261,14 @@ __END__
|
|
219
261
|
# end
|
220
262
|
#
|
221
263
|
# __END__
|
222
|
-
|
223
|
-
ARGV << "z.csv" if ARGV.empty?
|
224
|
-
|
225
|
-
path = ARGV.first
|
226
|
-
data = File.read(path)
|
227
|
-
|
228
|
-
csv = Censive.new(data)
|
229
|
-
|
230
|
-
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
231
|
-
|
232
|
-
csv.stats
|
264
|
+
#
|
265
|
+
# ARGV << "z.csv" if ARGV.empty?
|
266
|
+
#
|
267
|
+
# path = ARGV.first
|
268
|
+
# data = File.read(path)
|
269
|
+
#
|
270
|
+
# csv = Censive.new(data)
|
271
|
+
#
|
272
|
+
# data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
273
|
+
#
|
274
|
+
# csv.stats
|
File without changes
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: censive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.10'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steve Shreeve
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A quick and lightweight CSV handling library for Ruby
|
14
14
|
email: steve.shreeve@gmail.com
|
@@ -20,7 +20,7 @@ files:
|
|
20
20
|
- README.md
|
21
21
|
- censive.gemspec
|
22
22
|
- lib/censive.rb
|
23
|
-
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.
|
23
|
+
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
|
24
24
|
homepage: https://github.com/shreeve/censive
|
25
25
|
licenses:
|
26
26
|
- MIT
|