censive 0.7 → 0.8
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/censive.gemspec +1 -1
- data/lib/censive.rb +57 -108
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6419efcdc9274ea8bcf7b8527001e33f8bdfea348dfd911686cab36984d507da
|
4
|
+
data.tar.gz: 3b59aead54517fd64d7ece3eaa6f459e301e1e48f1ae34772a7128c61fb739f2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7910c09e76a81ed27870ea52fb6c8aea0316ed213c53a026d98adc64f93349477e6acab0a93b88c6f184ce1d317634ecdca9290d50bff9b117b98bedd3ac7b86
|
7
|
+
data.tar.gz: 358ab985947d486b5f486b1f7e9c1f591e3b8e906b9eab59a4ed151e5f5d9652c211f2d2a4ee36f0543227e2ae5e33ba57f1e4c178f6f7e72e05c14d7b46895f
|
data/censive.gemspec
CHANGED
data/lib/censive.rb
CHANGED
@@ -5,6 +5,8 @@
|
|
5
5
|
#
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
7
|
# Date: Jan 30, 2023
|
8
|
+
#
|
9
|
+
# Thanks: Crystal's CSV library, see https://crystal-lang.org/api/1.7.2/CSV.html
|
8
10
|
# ==============================================================================
|
9
11
|
# The goals are:
|
10
12
|
#
|
@@ -15,10 +17,7 @@
|
|
15
17
|
#
|
16
18
|
# 1. Option to support IO streaming
|
17
19
|
# 2. Option to strip whitespace
|
18
|
-
# 3.
|
19
|
-
# 4. Option to force quotes in output
|
20
|
-
# 5. Option to allow reading excel CSV (="Text" for cells)
|
21
|
-
# 6. Confirm file encodings such as UTF-8, UTF-16, etc.
|
20
|
+
# 3. Confirm file encodings such as UTF-8, UTF-16, etc.
|
22
21
|
#
|
23
22
|
# NOTE: Only getch and scan_until advance strscan's position
|
24
23
|
# ==============================================================================
|
@@ -39,6 +38,7 @@ class Censive < StringScanner
|
|
39
38
|
|
40
39
|
drop: false , # enable to drop trailing separators
|
41
40
|
eol: "\n" , # desired line endings for exports
|
41
|
+
excel: false , # allow ,="0123" style columns
|
42
42
|
mode: :compact, # export mode: compact or full
|
43
43
|
out: nil , # output IO/file
|
44
44
|
relax: false , # relax parsing of quotes
|
@@ -48,56 +48,61 @@ class Censive < StringScanner
|
|
48
48
|
super(str || '')
|
49
49
|
reset
|
50
50
|
|
51
|
-
@sep
|
52
|
-
@quote
|
51
|
+
@sep = sep .freeze
|
52
|
+
@quote = quote.freeze
|
53
|
+
|
54
|
+
@drop = drop
|
55
|
+
@eol = eol.freeze
|
56
|
+
@mode = mode
|
57
|
+
@out = out
|
58
|
+
@relax = relax
|
53
59
|
|
54
|
-
@
|
55
|
-
@
|
56
|
-
@
|
57
|
-
@
|
58
|
-
@
|
60
|
+
@es = "" .freeze
|
61
|
+
@cr = "\r" .freeze
|
62
|
+
@lf = "\n" .freeze
|
63
|
+
@eq = "=" .freeze
|
64
|
+
@esc = (@quote * 2).freeze
|
59
65
|
|
60
|
-
@
|
61
|
-
@
|
62
|
-
@lf = "\n" .freeze
|
63
|
-
@esc = (@quote * 2).freeze
|
66
|
+
@tokens = [@sep,@quote,@cr,@lf,@es,nil]
|
67
|
+
@tokens << @eq if excel # See http://bit.ly/3Y7jIvc
|
64
68
|
end
|
65
69
|
|
66
70
|
def reset(str=nil)
|
67
71
|
self.string = str if str
|
68
72
|
super()
|
69
|
-
@char
|
70
|
-
@flag
|
73
|
+
@char = peek(1)
|
74
|
+
@flag = nil
|
71
75
|
|
72
|
-
@rows
|
73
|
-
@cols
|
76
|
+
@rows = nil
|
77
|
+
@cols = @cells = 0
|
74
78
|
end
|
75
79
|
|
76
80
|
# ==[ Lexer ]==
|
77
81
|
|
78
82
|
def next_char
|
79
83
|
getch
|
80
|
-
@char =
|
84
|
+
@char = peek(1)
|
81
85
|
end
|
82
86
|
|
83
87
|
def next_token
|
84
88
|
case @flag
|
85
|
-
when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
|
89
|
+
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
86
90
|
when @cr then @flag = nil; next_char == @lf and next_char
|
87
91
|
when @lf then @flag = nil; next_char
|
88
92
|
end if @flag
|
89
93
|
|
90
|
-
if
|
94
|
+
if @tokens.include?(@char)
|
91
95
|
case @char
|
92
|
-
when @quote #
|
96
|
+
when @quote, @eq # consume quoted cell
|
97
|
+
@char == @eq and next_char # excel mode: allows ,="012",
|
93
98
|
match = ""
|
94
99
|
while true
|
95
100
|
getch # consume the quote (optimized by not calling next_char)
|
96
101
|
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
97
102
|
case next_char
|
98
|
-
when @sep
|
99
|
-
when @quote
|
100
|
-
when @cr,@lf,nil then break
|
103
|
+
when @sep then @flag = @es; next_char; break
|
104
|
+
when @quote then match << @quote
|
105
|
+
when @cr,@lf,@es,nil then break
|
101
106
|
else
|
102
107
|
if @relax
|
103
108
|
match << @quote + @char
|
@@ -107,14 +112,14 @@ class Censive < StringScanner
|
|
107
112
|
end
|
108
113
|
end
|
109
114
|
match
|
110
|
-
when @sep
|
111
|
-
when @cr
|
112
|
-
when @lf
|
113
|
-
when nil
|
115
|
+
when @sep then @flag = @es; next_char; @es
|
116
|
+
when @cr then @flag = @cr; nil
|
117
|
+
when @lf then @flag = @lf; nil
|
118
|
+
when @es,nil then nil
|
114
119
|
end
|
115
|
-
else #
|
120
|
+
else # consume unquoted cell
|
116
121
|
match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
|
117
|
-
@char =
|
122
|
+
@char = peek(1)
|
118
123
|
@char == @sep and @flag = @es and next_char
|
119
124
|
match
|
120
125
|
end
|
@@ -158,6 +163,9 @@ class Censive < StringScanner
|
|
158
163
|
def <<(row)
|
159
164
|
@out or return super
|
160
165
|
|
166
|
+
# drop trailing seps, if specified
|
167
|
+
row.pop while row.last.empty? if @drop
|
168
|
+
|
161
169
|
# most compact export format
|
162
170
|
s,q = @sep, @quote
|
163
171
|
out = case @mode
|
@@ -178,9 +186,6 @@ class Censive < StringScanner
|
|
178
186
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
179
187
|
end.join(s)
|
180
188
|
|
181
|
-
# drop trailing seps, if specified
|
182
|
-
out.gsub!(/#{s}+\z/,'') if @drop
|
183
|
-
|
184
189
|
# write output, using desired line endings
|
185
190
|
@out << out + @eol
|
186
191
|
end
|
@@ -203,81 +208,25 @@ __END__
|
|
203
208
|
|
204
209
|
# ==[ Playground... ]==
|
205
210
|
|
206
|
-
STDOUT.sync = true
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
full = 0
|
218
|
-
|
219
|
-
ARGV.each do |path|
|
220
|
-
File.file?(path) or next
|
221
|
-
|
222
|
-
print "Processing #{path.inspect}"
|
223
|
-
|
224
|
-
rows.clear
|
225
|
-
cols.clear
|
226
|
-
seen = 0
|
227
|
-
coun += 1
|
228
|
-
|
229
|
-
dest = "#{path}-#{rand}"
|
230
|
-
|
231
|
-
begin
|
232
|
-
Censive.writer(dest) do |file|
|
233
|
-
Censive.new(File.read(path), relax: true).each do |cols|
|
234
|
-
cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
|
235
|
-
file << cols
|
236
|
-
seen += 1
|
237
|
-
print "." if (seen % 1e5) == 0
|
238
|
-
end
|
239
|
-
end
|
240
|
-
FileUtils.mv(dest, path)
|
241
|
-
full += (seen - 1)
|
242
|
-
puts " (#{seen - 1} rows of data)"
|
243
|
-
rescue
|
244
|
-
puts " - unable to process (#{$!})"
|
245
|
-
FileUtils.rm_f(dest)
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
|
250
|
-
|
251
|
-
__END__
|
252
|
-
,"CHUI, LOK HANG "BENNY", => ,"""CHUI, LOK HANG ""BENNY""",
|
211
|
+
# STDOUT.sync = true
|
212
|
+
#
|
213
|
+
# data = File.read('1.csv')
|
214
|
+
#
|
215
|
+
# Censive.writer('out.csv') do |out|
|
216
|
+
# Censive.new(data, relax: true, excel: true).each do |row|
|
217
|
+
# out << row
|
218
|
+
# end
|
219
|
+
# end
|
220
|
+
#
|
221
|
+
# __END__
|
253
222
|
|
254
|
-
|
223
|
+
ARGV << "z.csv" if ARGV.empty?
|
255
224
|
|
256
|
-
|
225
|
+
path = ARGV.first
|
226
|
+
data = File.read(path)
|
257
227
|
|
228
|
+
csv = Censive.new(data)
|
258
229
|
|
259
|
-
data
|
230
|
+
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
260
231
|
|
261
|
-
|
262
|
-
Censive.new(data).each do |row|
|
263
|
-
out << row
|
264
|
-
end
|
265
|
-
end
|
266
|
-
|
267
|
-
# ARGV << "z.csv" if ARGV.empty?
|
268
|
-
#
|
269
|
-
# case 1
|
270
|
-
# when 1
|
271
|
-
# path = ARGV.first
|
272
|
-
# data = File.read(path)
|
273
|
-
# when 2
|
274
|
-
# data = DATA.gets("\n\n").rstrip
|
275
|
-
# end
|
276
|
-
#
|
277
|
-
# STDOUT.sync = true
|
278
|
-
#
|
279
|
-
# csv = Censive.new(data)
|
280
|
-
#
|
281
|
-
# data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
282
|
-
#
|
283
|
-
# csv.stats
|
232
|
+
csv.stats
|