censive 0.9 → 0.11
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91f3884b92c79f8cca37066a420c872b612c37ed736f61fa7d11c0f0fd861d8e
|
4
|
+
data.tar.gz: 71789d006309c1d87e681a7078e718342cb36e1f8a18690c59e51595c49d2e59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5fe33889abe08bc6aa57bb4dec2d65ef3e8935da5aa209a0bcb6060696de07cf44189ad9be6f43ffcd2897672668bd3aff7e8b713f5070199627e40edd3704f
|
7
|
+
data.tar.gz: 2c79d30b4682da07800c19eb93067c5c72d471f9abc7b824f7525b3d36b8fb1108d0e733706852f750baac57360af2d5b77527a1daab871e2313d558928ce240
|
data/README.md
CHANGED
@@ -2,7 +2,58 @@
|
|
2
2
|
|
3
3
|
A quick and lightweight CSV handling library for Ruby
|
4
4
|
|
5
|
-
##
|
5
|
+
## Example
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
#!/usr/bin/env ruby

# Rewrites each CSV file named on the command line in place: every row is read
# with Censive (excel and relax parsing enabled) and written back out through a
# Censive writer. Output goes to a temporary sibling file first, which replaces
# the original only after the whole file was processed without error.

require 'censive'
require 'fileutils'
require 'securerandom'

STDOUT.sync = true

abort "usage: #{File.basename($0)} <files>" if ARGV.empty?

# random suffix for the temporary output files (no subshell or /dev/random needed)
suffix = SecureRandom.alphanumeric(12)

files = 0 # number of files processed
total = 0 # number of data rows across all files

ARGV.each do |path|
  next unless File.file?(path)

  print "Processing #{path.inspect}"

  files += 1
  seen = 0
  dest = "#{path}-#{suffix}"

  begin
    Censive.writer(dest) do |file|
      Censive.reader(path, excel: true, relax: true).each do |row|
        file << row
        seen += 1
        print "." if (seen % 100_000).zero? # give a status update every so often
      end
    end
    FileUtils.mv(dest, path)
    data = [seen - 1, 0].max # the first row is not counted as data; never report -1
    total += data
    puts " (#{data} rows of data)"
  rescue => e
    puts " - unable to process (#{e})"
    FileUtils.rm_f(dest) # leave the original file untouched on failure
  end
end

puts "Processed #{files} files with a total of #{total} rows of data" if files > 1
|
54
|
+
```
|
55
|
+
|
56
|
+
## Convert a CSV file to a TSV file
|
6
57
|
|
7
58
|
```ruby
|
8
59
|
require 'censive'
|
@@ -17,3 +68,12 @@ Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
|
|
17
68
|
end
|
18
69
|
end
|
19
70
|
```
|
71
|
+
|
72
|
+
Or, you can be more succinct with:
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
require 'censive'
|
76
|
+
|
77
|
+
csv = Censive.new(File.read('data.csv'))
|
78
|
+
csv.export(sep: "\t")
|
79
|
+
```
|
data/censive.gemspec
CHANGED
@@ -2,13 +2,14 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "censive"
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.11"
|
6
6
|
s.author = "Steve Shreeve"
|
7
7
|
s.email = "steve.shreeve@gmail.com"
|
8
|
-
s.summary =
|
8
|
+
s.summary =
|
9
9
|
s.description = "A quick and lightweight CSV handling library for Ruby"
|
10
10
|
s.homepage = "https://github.com/shreeve/censive"
|
11
11
|
s.license = "MIT"
|
12
12
|
s.files = `git ls-files`.split("\n") - %w[.gitignore]
|
13
13
|
s.executables = `(cd bin 2>&1) > /dev/null && git ls-files .`.split("\n")
|
14
|
+
s.add_dependency "strscan", ">= 3.0.6"
|
14
15
|
end
|
data/lib/censive.rb
CHANGED
@@ -1,137 +1,116 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
3
|
+
# ============================================================================
|
4
4
|
# censive - A quick and lightweight CSV handling library for Ruby
|
5
5
|
#
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
|
-
# Date:
|
8
|
-
#
|
9
|
-
# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
-
# and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
11
|
-
# ==============================================================================
|
12
|
-
# The goals are:
|
7
|
+
# Date: Feb 3, 2023
|
13
8
|
#
|
9
|
+
# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
+
# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
11
|
+
# https://github.com/ruby/strscan/issues/53 for details
|
12
|
+
# https://github.com/ruby/strscan/pull/54 for code
|
13
|
+
# ============================================================================
|
14
|
+
# GOALS:
|
14
15
|
# 1. Faster than Ruby's default CSV library
|
15
|
-
# 2. Lightweight code base with streamlined
|
16
|
-
#
|
17
|
-
# To consider:
|
18
|
-
#
|
19
|
-
# 1. Option to support IO streaming
|
20
|
-
# 2. Option to strip whitespace
|
21
|
-
# 3. Option to support headers in readers and writers
|
22
|
-
# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
|
16
|
+
# 2. Lightweight code base with streamlined logic
|
17
|
+
# 3. Support for most non-compliant CSV variations (eg - @relax, @excel)
|
23
18
|
#
|
24
|
-
#
|
25
|
-
#
|
19
|
+
# TODO:
|
20
|
+
# 1. Support IO streaming
|
21
|
+
# 2. Add option to strip whitespace
|
22
|
+
# 3. Support CSV headers in first row
|
23
|
+
# ============================================================================
|
26
24
|
|
25
|
+
require 'bundler/setup'
|
27
26
|
require 'strscan'
|
28
27
|
|
29
28
|
class Censive < StringScanner
|
30
29
|
|
31
|
-
def self.writer(
|
32
|
-
|
33
|
-
|
30
|
+
# Create a writer for the given destination.
#
# obj  - String: treated as a file path. The file is opened for writing, a
#        writer bound to that open file is yielded to the block, and the file
#        is closed when the block returns (a block is required in this case).
#        IO or nil: a writer built with out: obj is returned; any block is
#        only forwarded to new and is not yielded to here.
# opts - remaining keyword options, forwarded unchanged to new
#
# Aborts the program with a message for any other kind of obj.
def self.writer(obj=nil, **opts, &code)
  case obj
  # open the path itself and hand the opened file (not the path string) to the writer
  when String then File.open(obj, 'w') {|file| yield new(out: file, **opts, &code) }
  when IO,nil then new(out: obj, **opts, &code)
  else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
  end
end
|
36
37
|
|
37
38
|
def initialize(str=nil,
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
drop: false , # enable to drop trailing separators
|
42
|
-
eol: "\n" , # desired line endings for exports
|
43
|
-
excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
|
39
|
+
drop: false , # drop trailing empty fields?
|
40
|
+
eol: "\n" , # line endings for exports
|
41
|
+
excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
|
44
42
|
mode: :compact, # export mode: compact or full
|
45
|
-
out: nil , # output
|
46
|
-
|
47
|
-
|
43
|
+
out: nil , # output stream, needs to respond to <<
|
44
|
+
quote: '"' , # quote character
|
45
|
+
relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
|
46
|
+
sep: ',' , # column separator character
|
48
47
|
**opts # grab bag
|
49
48
|
)
|
50
49
|
super(str || '')
|
51
50
|
reset
|
52
51
|
|
53
|
-
@sep = sep .freeze
|
54
|
-
@quote = quote.freeze
|
55
|
-
|
56
52
|
@drop = drop
|
57
|
-
@eol = eol
|
53
|
+
@eol = eol
|
58
54
|
@excel = excel
|
59
55
|
@mode = mode
|
60
|
-
@out = out
|
56
|
+
@out = out || $stdout
|
57
|
+
@quote = quote
|
61
58
|
@relax = relax
|
59
|
+
@sep = sep
|
62
60
|
|
63
|
-
@
|
64
|
-
@
|
65
|
-
@
|
66
|
-
@eq = "="
|
67
|
-
@esc = (@quote * 2)
|
68
|
-
|
69
|
-
@tokens = [@sep,@quote,@cr,@lf,@es,nil]
|
61
|
+
@cr = "\r"
|
62
|
+
@lf = "\n"
|
63
|
+
@es = ""
|
64
|
+
@eq = "="
|
65
|
+
@esc = (@quote * 2)
|
70
66
|
end
|
71
67
|
|
72
68
|
def reset(str=nil)
|
73
69
|
self.string = str if str
|
74
70
|
super()
|
75
|
-
@char =
|
76
|
-
@flag = nil
|
77
|
-
|
71
|
+
@char = curr_char
|
78
72
|
@rows = nil
|
79
73
|
@cols = @cells = 0
|
80
74
|
end
|
81
75
|
|
82
76
|
# ==[ Lexer ]==
|
83
77
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
end
|
88
|
-
|
89
|
-
def next_token
|
78
|
+
# pure ruby versions for debugging
|
79
|
+
# def curr_char; @char = string[pos]; end
|
80
|
+
# def next_char; scan(/./m); @char = string[pos]; end
|
90
81
|
|
91
|
-
|
92
|
-
|
93
|
-
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
94
|
-
when @cr then @flag = nil; next_char == @lf and next_char
|
95
|
-
when @lf then @flag = nil; next_char
|
96
|
-
else @flag = nil
|
97
|
-
end if @flag
|
82
|
+
# Cache the result of currchar in @char and return it.
# NOTE(review): currchar is not defined in the visible source; presumably it is
# provided by the patched strscan referenced in the file header (ruby/strscan
# pull 54) and matches the pure-ruby version above (string[pos]) — confirm.
def curr_char; @char = currchar; end
|
83
|
+
# Cache the result of nextchar in @char and return it.
# NOTE(review): nextchar is not defined in the visible source; presumably it is
# provided by the patched strscan referenced in the file header (ruby/strscan
# pull 54) and, like the pure-ruby version above, advances one character
# before reading the character at the new position — confirm.
def next_char; @char = nextchar; end
|
98
84
|
|
99
|
-
|
85
|
+
def next_token
|
100
86
|
if @excel && @char == @eq
|
101
|
-
|
87
|
+
excel = true
|
102
88
|
next_char
|
103
89
|
end
|
104
90
|
|
105
|
-
if @
|
91
|
+
if @char == @quote # consume quoted cell
|
92
|
+
match = ""
|
93
|
+
while true
|
94
|
+
next_char
|
95
|
+
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
96
|
+
match << @quote and next if next_char == @quote
|
97
|
+
break if [@sep,@cr,@lf,@es,nil].include?(@char)
|
98
|
+
@relax or bomb "invalid character after quote"
|
99
|
+
match << @quote + scan_until(/(?=#{@quote})/o) + @quote
|
100
|
+
end
|
101
|
+
next_char if @char == @sep
|
102
|
+
match
|
103
|
+
elsif [@sep,@cr,@lf,@es,nil].include?(@char)
|
106
104
|
case @char
|
107
|
-
when @
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
112
|
-
case next_char
|
113
|
-
when @sep then @flag = @es; next_char; break
|
114
|
-
when @quote then match << @quote
|
115
|
-
when @cr,@lf,@es,nil then break
|
116
|
-
else
|
117
|
-
if @relax
|
118
|
-
match << @quote + @char
|
119
|
-
else
|
120
|
-
bomb "invalid character after quote"
|
121
|
-
end
|
122
|
-
end
|
123
|
-
end
|
124
|
-
match
|
125
|
-
when @sep then @flag = @es; next_char; @es
|
126
|
-
when @cr then @flag = @cr; nil
|
127
|
-
when @lf then @flag = @lf; nil
|
128
|
-
when @es,nil then nil
|
105
|
+
when @sep then next_char; @es
|
106
|
+
when @cr then next_char == @lf and next_char; nil
|
107
|
+
when @lf then next_char; nil
|
108
|
+
else nil
|
129
109
|
end
|
130
110
|
else # consume unquoted cell
|
131
111
|
match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
|
132
|
-
match
|
133
|
-
|
134
|
-
@char == @sep and @flag = @es and next_char
|
112
|
+
match.prepend(@eq) if excel
|
113
|
+
next_char if curr_char == @sep
|
135
114
|
match
|
136
115
|
end
|
137
116
|
end
|
@@ -143,12 +122,12 @@ class Censive < StringScanner
|
|
143
122
|
# ==[ Parser ]==
|
144
123
|
|
145
124
|
def parse
|
146
|
-
@rows
|
125
|
+
@rows = []
|
147
126
|
while row = next_row
|
148
127
|
@rows << row
|
149
|
-
|
150
|
-
@cols =
|
151
|
-
@cells +=
|
128
|
+
count = row.size
|
129
|
+
@cols = count if count > @cols
|
130
|
+
@cells += count
|
152
131
|
end
|
153
132
|
@rows
|
154
133
|
end
|
@@ -162,26 +141,25 @@ class Censive < StringScanner
|
|
162
141
|
|
163
142
|
# ==[ Helpers ]==
|
164
143
|
|
165
|
-
#
|
144
|
+
# returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
166
145
|
# Classify how a string must be written on export.
#
# str - the text of a cell (or of a whole joined row)
#
# Returns 2 when str contains the quote character (must be quoted and
# escaped), 1 when it contains only a separator, CR or LF (must be quoted),
# and 0 when it contains none of them.
#
# The delimiters are escaped before being placed in the pattern, so characters
# such as "|" or "." work as separators. The pattern is cached per object and
# rebuilt whenever the delimiters differ from the ones it was built with,
# instead of being frozen at first use for the whole program.
def grok(str)
  tokens = [@quote, @sep, @cr, @lf]
  unless @grok_tokens == tokens
    @grok_tokens = tokens
    others = tokens.drop(1).map {|token| Regexp.escape(token) }.join('|')
    @grok_regex = /(#{Regexp.escape(@quote)})|#{others}/
  end

  idx = str.index(@grok_regex) or return 0
  return 2 if $1 # the first special character found is a quote

  # a separator or line ending came first; a quote may still follow it
  str.index(@quote, idx) ? 2 : 1
end
|
173
152
|
|
153
|
+
# output a row
|
174
154
|
def <<(row)
|
175
|
-
@out or return super
|
176
155
|
|
177
|
-
# drop trailing
|
156
|
+
# drop trailing empty columns
|
178
157
|
row.pop while row.last.empty? if @drop
|
179
158
|
|
180
|
-
# most compact export format
|
181
159
|
s,q = @sep, @quote
|
182
160
|
out = case @mode
|
183
161
|
when :compact
|
184
|
-
case grok(row.join)
|
162
|
+
case @excel ? 2 : grok(row.join)
|
185
163
|
when 0
|
186
164
|
row
|
187
165
|
when 1
|
@@ -190,6 +168,7 @@ class Censive < StringScanner
|
|
190
168
|
end
|
191
169
|
else
|
192
170
|
row.map do |col|
|
171
|
+
@excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
|
193
172
|
case grok(col)
|
194
173
|
when 0 then col
|
195
174
|
when 1 then "#{q}#{col}#{q}"
|
@@ -201,7 +180,6 @@ class Censive < StringScanner
|
|
201
180
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
202
181
|
end.join(s)
|
203
182
|
|
204
|
-
# write output, using desired line endings
|
205
183
|
@out << out + @eol
|
206
184
|
end
|
207
185
|
|
@@ -210,6 +188,11 @@ class Censive < StringScanner
|
|
210
188
|
@rows.each {|row| yield row }
|
211
189
|
end
|
212
190
|
|
191
|
+
# Send every row yielded by each to an output, one row per << call.
# With no options the rows go through this object's own << method; otherwise
# a separate writer is built with self.class.writer(**opts) and receives them.
# Returns whatever each returns.
def export(**opts)
|
192
|
+
out = opts.empty? ? self : self.class.writer(**opts)
|
193
|
+
each {|row| out << row }
|
194
|
+
end
|
195
|
+
|
213
196
|
def stats
|
214
197
|
wide = string.size.to_s.size
|
215
198
|
puts "%#{wide}d rows" % @rows.size
|
@@ -219,29 +202,25 @@ class Censive < StringScanner
|
|
219
202
|
end
|
220
203
|
end
|
221
204
|
|
222
|
-
|
223
|
-
|
224
|
-
#
|
225
|
-
|
226
|
-
#
|
227
|
-
|
228
|
-
# data = File.read('1.csv')
|
229
|
-
#
|
230
|
-
# Censive.writer('out.csv') do |out|
|
231
|
-
# Censive.new(data, relax: true, excel: true).each do |row|
|
232
|
-
# out << row
|
233
|
-
# end
|
234
|
-
# end
|
235
|
-
#
|
236
|
-
# __END__
|
237
|
-
|
238
|
-
ARGV << "z.csv" if ARGV.empty?
|
239
|
-
|
240
|
-
path = ARGV.first
|
241
|
-
data = File.read(path)
|
242
|
-
|
243
|
-
csv = Censive.new(data)
|
244
|
-
|
245
|
-
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
205
|
+
if __FILE__ == $0
|
206
|
+
raw = DATA.gets("\n\n").chomp
|
207
|
+
# raw = File.read(ARGV.first || "lc-2023.csv")
|
208
|
+
csv = Censive.new(raw, excel: true, relax: true)
|
209
|
+
csv.export # (sep: ",", excel: true)
|
210
|
+
end
|
246
211
|
|
247
|
-
|
212
|
+
__END__
|
213
|
+
Name,Age,Shoe
|
214
|
+
Alice,27,5
|
215
|
+
Bob,33,10 1/2
|
216
|
+
Charlie or "Chuck",=B2 + B3,9
|
217
|
+
"Doug E Fresh",="007",10
|
218
|
+
Subtotal,=sum(B2:B5),="01234"
|
219
|
+
|
220
|
+
# first line works in "relax" mode, bottom line is compliant
|
221
|
+
123,"CHO, JOELLE "JOJO"",456
|
222
|
+
123,"CHO, JOELLE ""JOJO""",456
|
223
|
+
|
224
|
+
# Excel mode checking
|
225
|
+
=,=x,x=,="x",="","","=",123,0123,="123",="0123"
|
226
|
+
,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
|
File without changes
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: censive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.11'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steve Shreeve
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
12
|
-
dependencies:
|
11
|
+
date: 2023-02-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: strscan
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 3.0.6
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 3.0.6
|
13
27
|
description: A quick and lightweight CSV handling library for Ruby
|
14
28
|
email: steve.shreeve@gmail.com
|
15
29
|
executables: []
|
@@ -20,7 +34,7 @@ files:
|
|
20
34
|
- README.md
|
21
35
|
- censive.gemspec
|
22
36
|
- lib/censive.rb
|
23
|
-
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.
|
37
|
+
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
|
24
38
|
homepage: https://github.com/shreeve/censive
|
25
39
|
licenses:
|
26
40
|
- MIT
|