censive 0.9 → 0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91f3884b92c79f8cca37066a420c872b612c37ed736f61fa7d11c0f0fd861d8e
|
4
|
+
data.tar.gz: 71789d006309c1d87e681a7078e718342cb36e1f8a18690c59e51595c49d2e59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5fe33889abe08bc6aa57bb4dec2d65ef3e8935da5aa209a0bcb6060696de07cf44189ad9be6f43ffcd2897672668bd3aff7e8b713f5070199627e40edd3704f
|
7
|
+
data.tar.gz: 2c79d30b4682da07800c19eb93067c5c72d471f9abc7b824f7525b3d36b8fb1108d0e733706852f750baac57360af2d5b77527a1daab871e2313d558928ce240
|
data/README.md
CHANGED
@@ -2,7 +2,58 @@
|
|
2
2
|
|
3
3
|
A quick and lightweight CSV handling library for Ruby
|
4
4
|
|
5
|
-
##
|
5
|
+
## Example
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
#!/usr/bin/env ruby

# Rewrite each CSV file named on the command line in place: every row is
# read with Censive (excel and relax modes enabled) and written back out
# through a Censive writer, via a temporary file next to the original.

STDOUT.sync = true

require 'censive'
require 'fileutils'
require 'securerandom'

abort "usage: #{File.basename($0)} <files>" if ARGV.empty?

# random 12-character suffix for the temporary output files
suffix = SecureRandom.alphanumeric(12)

files = 0 # number of files attempted
total = 0 # rows of data across all files

ARGV.each do |path|
  File.file?(path) or next

  print "Processing #{path.inspect}"

  seen = 0
  files += 1

  dest = "#{path}-#{suffix}"

  begin
    Censive.writer(dest) do |file|
      Censive.reader(path, excel: true, relax: true).each do |cols|
        file << cols
        seen += 1
        print "." if (seen % 100_000).zero? # give a status update every so often
      end
    end
    FileUtils.mv(dest, path)

    # the first row is not counted as data; never report a negative count
    data = [seen - 1, 0].max
    total += data
    puts " (#{data} rows of data)"
  rescue => e
    puts " - unable to process (#{e})"
    FileUtils.rm_f(dest) # leave no partial temporary file behind
  end
end

puts "Processed #{files} files with a total of #{total} rows of data" if files > 1
|
54
|
+
```
|
55
|
+
|
56
|
+
## Convert a CSV file to a TSV file
|
6
57
|
|
7
58
|
```ruby
|
8
59
|
require 'censive'
|
@@ -17,3 +68,12 @@ Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
|
|
17
68
|
end
|
18
69
|
end
|
19
70
|
```
|
71
|
+
|
72
|
+
Or, you can be more succinct with:
|
73
|
+
|
74
|
+
```ruby
|
75
|
+
require 'censive'
|
76
|
+
|
77
|
+
csv = Censive.new(File.read('data.csv'))
|
78
|
+
csv.export(sep: "\t")
|
79
|
+
```
|
data/censive.gemspec
CHANGED
@@ -2,13 +2,14 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "censive"
|
5
|
-
s.version = "0.
|
5
|
+
s.version = "0.11"
|
6
6
|
s.author = "Steve Shreeve"
|
7
7
|
s.email = "steve.shreeve@gmail.com"
|
8
|
-
s.summary =
|
8
|
+
s.summary =
|
9
9
|
s.description = "A quick and lightweight CSV handling library for Ruby"
|
10
10
|
s.homepage = "https://github.com/shreeve/censive"
|
11
11
|
s.license = "MIT"
|
12
12
|
s.files = `git ls-files`.split("\n") - %w[.gitignore]
|
13
13
|
s.executables = `(cd bin 2>&1) > /dev/null && git ls-files .`.split("\n")
|
14
|
+
s.add_dependency "strscan", ">= 3.0.6"
|
14
15
|
end
|
data/lib/censive.rb
CHANGED
@@ -1,137 +1,116 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
3
|
+
# ============================================================================
|
4
4
|
# censive - A quick and lightweight CSV handling library for Ruby
|
5
5
|
#
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
|
-
# Date:
|
8
|
-
#
|
9
|
-
# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
-
# and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
11
|
-
# ==============================================================================
|
12
|
-
# The goals are:
|
7
|
+
# Date: Feb 3, 2023
|
13
8
|
#
|
9
|
+
# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
+
# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
11
|
+
# https://github.com/ruby/strscan/issues/53 for details
|
12
|
+
# https://github.com/ruby/strscan/pull/54 for code
|
13
|
+
# ============================================================================
|
14
|
+
# GOALS:
|
14
15
|
# 1. Faster than Ruby's default CSV library
|
15
|
-
# 2. Lightweight code base with streamlined
|
16
|
-
#
|
17
|
-
# To consider:
|
18
|
-
#
|
19
|
-
# 1. Option to support IO streaming
|
20
|
-
# 2. Option to strip whitespace
|
21
|
-
# 3. Option to support headers in readers and writers
|
22
|
-
# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
|
16
|
+
# 2. Lightweight code base with streamlined logic
|
17
|
+
# 3. Support for most non-compliant CSV variations (eg - @relax, @excel)
|
23
18
|
#
|
24
|
-
#
|
25
|
-
#
|
19
|
+
# TODO:
|
20
|
+
# 1. Support IO streaming
|
21
|
+
# 2. Add option to strip whitespace
|
22
|
+
# 3. Support CSV headers in first row
|
23
|
+
# ============================================================================
|
26
24
|
|
25
|
+
require 'bundler/setup'
|
27
26
|
require 'strscan'
|
28
27
|
|
29
28
|
class Censive < StringScanner
|
30
29
|
|
31
|
-
def self.writer(
|
32
|
-
|
33
|
-
|
30
|
+
# Build a writer, optionally bound to a file on disk.
#
# obj  - a String path (the file is opened for writing and closed when the
#        block returns), an IO to write to, or nil (the constructor then
#        falls back to $stdout)
# opts - keyword options forwarded to the constructor (sep:, mode:, ...)
# code - block; for a String path it is yielded the new writer
#
# Returns the block's value for a String path, otherwise the new writer.
# Aborts the program when obj is any other kind of object.
def self.writer(obj=nil, **opts, &code)
  case obj
  when String
    # obj is the path itself; hand the open file handle (not the path
    # string) to the writer so that rows actually land in the file
    File.open(obj, 'w') {|file| yield new(out: file, **opts, &code) }
  when IO, nil
    new(out: obj, **opts, &code)
  else
    abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
  end
end
|
36
37
|
|
37
38
|
# Create a CSV scanner/writer.
#
# str  - CSV text to scan (nil is treated as an empty string)
# opts - any extra keywords are accepted and ignored
def initialize(str=nil,
  drop:  false   , # strip empty fields from the end of each exported row?
  eol:   "\n"    , # line terminator appended to each exported row
  excel: false   , # Excel literals ="01" and formulas =A1 + B2, see http://bit.ly/3Y7jIvc
  mode:  :compact, # export mode, :compact or :full
  out:   nil     , # output stream (must respond to <<), $stdout when nil
  quote: '"'     , # quote character
  relax: false   , # tolerate stray quotes, so ,"Fo"o, => ,"Fo""o",
  sep:   ','     , # column separator character
  **opts           # grab bag
)
  super(str || '')
  reset

  # single-character tokens the lexer compares against
  @cr, @lf = "\r", "\n"
  @es, @eq = "", "="

  # caller-supplied settings
  @sep, @quote   = sep, quote
  @relax, @excel = relax, excel
  @drop, @mode   = drop, mode
  @eol           = eol
  @out           = out || $stdout

  # a quote inside a quoted field is written as two quotes
  @esc = @quote * 2
end
|
71
67
|
|
72
68
|
# Rewind the scanner and discard any previously parsed state.
#
# str - optional replacement input; when given it becomes the new string
def reset(str=nil)
  str and self.string = str
  super()

  @char  = curr_char # re-read the character at the rewound position
  @rows  = nil       # nothing parsed yet
  @cells = 0
  @cols  = 0
end
|
81
75
|
|
82
76
|
# ==[ Lexer ]==
|
83
77
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
end
|
88
|
-
|
89
|
-
def next_token
|
78
|
+
# pure ruby versions for debugging
|
79
|
+
# def curr_char; @char = string[pos]; end
|
80
|
+
# def next_char; scan(/./m); @char = string[pos]; end
|
90
81
|
|
91
|
-
|
92
|
-
|
93
|
-
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
94
|
-
when @cr then @flag = nil; next_char == @lf and next_char
|
95
|
-
when @lf then @flag = nil; next_char
|
96
|
-
else @flag = nil
|
97
|
-
end if @flag
|
82
|
+
# Cache in @char, and return, whatever currchar reports.
# NOTE(review): currchar is not defined in this file; judging by the
# commented-out pure-ruby variant it is the character at the current scan
# position (string[pos]) - confirm against the strscan build in use.
def curr_char
  @char = currchar
end
|
83
|
+
# Cache in @char, and return, whatever nextchar reports.
# NOTE(review): nextchar is not defined in this file; judging by the
# commented-out pure-ruby variant it advances one character and yields the
# character now under the scan position - confirm against the strscan build.
def next_char
  @char = nextchar
end
|
98
84
|
|
99
|
-
|
85
|
+
# Lex the next cell from the input.
#
# Returns the cell text as a String (the empty string for an empty cell that
# is followed by a separator), or nil at the end of a row or of the input.
# Malformed quoting is reported through bomb (defined elsewhere in the class).
def next_token
  # Lookahead patterns are built once per instance with the delimiters
  # escaped. A /o literal would be interpolated a single time for the whole
  # process, and an unescaped delimiter such as "|" would act as a regex
  # metacharacter.
  quote_ahead = (@quote_ahead ||= /(?=#{Regexp.escape(@quote)})/)
  cell_end    = (@cell_end    ||= /(?=#{Regexp.escape(@sep)}|#{Regexp.escape(@cr)}|#{Regexp.escape(@lf)}|\z)/)

  # excel mode: step over a leading "=" (as in ="01" or =A1 + B2); the flag
  # is only used to restore the "=" on unquoted cells below
  if @excel && @char == @eq
    excel = true
    next_char
  end

  if @char == @quote # consume quoted cell
    match = ""
    while true
      next_char # step past the quote we are sitting on
      match << (scan_until(quote_ahead) or bomb "unclosed quote")
      match << @quote and next if next_char == @quote # doubled quote => literal quote
      break if [@sep,@cr,@lf,@es,nil].include?(@char) # cell closed properly
      @relax or bomb "invalid character after quote"
      # relaxed: keep the stray quote plus the text up to the next quote; a
      # missing closing quote is reported rather than failing on nil
      match << @quote << (scan_until(quote_ahead) or bomb "unclosed quote") << @quote
    end
    next_char if @char == @sep
    match
  elsif [@sep,@cr,@lf,@es,nil].include?(@char)
    case @char
    when @sep then next_char; @es                      # empty cell
    when @cr  then next_char == @lf and next_char; nil # end of row, CR or CRLF
    when @lf  then next_char; nil                      # end of row, LF
    else nil                                           # end of input
    end
  else # consume unquoted cell
    match = scan_until(cell_end) or bomb "unexpected character"
    match.prepend(@eq) if excel
    next_char if curr_char == @sep
    match
  end
end
|
@@ -143,12 +122,12 @@ class Censive < StringScanner
|
|
143
122
|
# ==[ Parser ]==
|
144
123
|
|
145
124
|
# Read every remaining row into @rows, keeping @cols at the width of the
# widest row and adding each row's size to @cells.
#
# Returns the array of rows.
def parse
  @rows = []
  while (row = next_row)
    width = row.size
    @rows << row
    @cells += width
    @cols = width if @cols < width
  end
  @rows
end
|
@@ -162,26 +141,25 @@ class Censive < StringScanner
|
|
162
141
|
|
163
142
|
# ==[ Helpers ]==
|
164
143
|
|
165
|
-
#
|
144
|
+
# returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
166
145
|
# Classify how a string must be written on export.
#
# str - the text to inspect
#
# Returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither).
def grok(str)
  # Built once per instance with the delimiters escaped: a /o literal would be
  # interpolated a single time for the whole process, and an unescaped
  # delimiter such as "|" or "." would act as a regex metacharacter.
  @grok_rx ||= begin
    q, s, cr, lf = [@quote, @sep, @cr, @lf].map {|tok| Regexp.escape(tok) }
    /(#{q})|#{s}|#{cr}|#{lf}/
  end

  hit = @grok_rx.match(str) or return 0

  # group 1 is set when the first special character is a quote; otherwise a
  # quote may still appear later in the string
  (hit[1] || str.index(@quote, hit.begin(0))) ? 2 : 1
end
|
173
152
|
|
153
|
+
# output a row
|
174
154
|
def <<(row)
|
175
|
-
@out or return super
|
176
155
|
|
177
|
-
# drop trailing
|
156
|
+
# drop trailing empty columns
|
178
157
|
row.pop while row.last.empty? if @drop
|
179
158
|
|
180
|
-
# most compact export format
|
181
159
|
s,q = @sep, @quote
|
182
160
|
out = case @mode
|
183
161
|
when :compact
|
184
|
-
case grok(row.join)
|
162
|
+
case @excel ? 2 : grok(row.join)
|
185
163
|
when 0
|
186
164
|
row
|
187
165
|
when 1
|
@@ -190,6 +168,7 @@ class Censive < StringScanner
|
|
190
168
|
end
|
191
169
|
else
|
192
170
|
row.map do |col|
|
171
|
+
@excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
|
193
172
|
case grok(col)
|
194
173
|
when 0 then col
|
195
174
|
when 1 then "#{q}#{col}#{q}"
|
@@ -201,7 +180,6 @@ class Censive < StringScanner
|
|
201
180
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
202
181
|
end.join(s)
|
203
182
|
|
204
|
-
# write output, using desired line endings
|
205
183
|
@out << out + @eol
|
206
184
|
end
|
207
185
|
|
@@ -210,6 +188,11 @@ class Censive < StringScanner
|
|
210
188
|
@rows.each {|row| yield row }
|
211
189
|
end
|
212
190
|
|
191
|
+
# Send every row to an output.
#
# opts - when empty, rows are appended to this object itself; otherwise the
#        options are passed to the class-level writer and rows go to the
#        writer it returns
def export(**opts)
  target =
    if opts.empty?
      self
    else
      self.class.writer(**opts)
    end
  each {|row| target << row }
end
|
195
|
+
|
213
196
|
def stats
|
214
197
|
wide = string.size.to_s.size
|
215
198
|
puts "%#{wide}d rows" % @rows.size
|
@@ -219,29 +202,25 @@ class Censive < StringScanner
|
|
219
202
|
end
|
220
203
|
end
|
221
204
|
|
222
|
-
|
223
|
-
|
224
|
-
#
|
225
|
-
|
226
|
-
#
|
227
|
-
|
228
|
-
# data = File.read('1.csv')
|
229
|
-
#
|
230
|
-
# Censive.writer('out.csv') do |out|
|
231
|
-
# Censive.new(data, relax: true, excel: true).each do |row|
|
232
|
-
# out << row
|
233
|
-
# end
|
234
|
-
# end
|
235
|
-
#
|
236
|
-
# __END__
|
237
|
-
|
238
|
-
ARGV << "z.csv" if ARGV.empty?
|
239
|
-
|
240
|
-
path = ARGV.first
|
241
|
-
data = File.read(path)
|
242
|
-
|
243
|
-
csv = Censive.new(data)
|
244
|
-
|
245
|
-
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
205
|
+
# Self-test: when this file is run directly, take the first sample embedded
# after __END__ (everything up to the first blank line) and export it.
if $0 == __FILE__
  sample = DATA.gets("\n\n").chomp
  # sample = File.read(ARGV.first || "lc-2023.csv")
  Censive.new(sample, excel: true, relax: true).export # (sep: ",", excel: true)
end
|
246
211
|
|
247
|
-
|
212
|
+
__END__
|
213
|
+
Name,Age,Shoe
|
214
|
+
Alice,27,5
|
215
|
+
Bob,33,10 1/2
|
216
|
+
Charlie or "Chuck",=B2 + B3,9
|
217
|
+
"Doug E Fresh",="007",10
|
218
|
+
Subtotal,=sum(B2:B5),="01234"
|
219
|
+
|
220
|
+
# first line works in "relax" mode, bottom line is compliant
|
221
|
+
123,"CHO, JOELLE "JOJO"",456
|
222
|
+
123,"CHO, JOELLE ""JOJO""",456
|
223
|
+
|
224
|
+
# Excel mode checking
|
225
|
+
=,=x,x=,="x",="","","=",123,0123,="123",="0123"
|
226
|
+
,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
|
File without changes
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: censive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.11'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steve Shreeve
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
12
|
-
dependencies:
|
11
|
+
date: 2023-02-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: strscan
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 3.0.6
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 3.0.6
|
13
27
|
description: A quick and lightweight CSV handling library for Ruby
|
14
28
|
email: steve.shreeve@gmail.com
|
15
29
|
executables: []
|
@@ -20,7 +34,7 @@ files:
|
|
20
34
|
- README.md
|
21
35
|
- censive.gemspec
|
22
36
|
- lib/censive.rb
|
23
|
-
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.
|
37
|
+
- test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
|
24
38
|
homepage: https://github.com/shreeve/censive
|
25
39
|
licenses:
|
26
40
|
- MIT
|