censive 0.7 → 0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/censive.gemspec +1 -1
- data/lib/censive.rb +77 -113
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
|
4
|
+
data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
|
7
|
+
data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe
|
data/README.md
CHANGED
data/censive.gemspec
CHANGED
data/lib/censive.rb
CHANGED
@@ -5,6 +5,9 @@
|
|
5
5
|
#
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
7
7
|
# Date: Jan 30, 2023
|
8
|
+
#
|
9
|
+
# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
10
|
+
# and also to https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
8
11
|
# ==============================================================================
|
9
12
|
# The goals are:
|
10
13
|
#
|
@@ -15,10 +18,8 @@
|
|
15
18
|
#
|
16
19
|
# 1. Option to support IO streaming
|
17
20
|
# 2. Option to strip whitespace
|
18
|
-
# 3. Option to
|
19
|
-
# 4.
|
20
|
-
# 5. Option to allow reading excel CSV (="Text" for cells)
|
21
|
-
# 6. Confirm file encodings such as UTF-8, UTF-16, etc.
|
21
|
+
# 3. Option to support headers in readers and writers
|
22
|
+
# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
|
22
23
|
#
|
23
24
|
# NOTE: Only getch and scan_until advance strscan's position
|
24
25
|
# ==============================================================================
|
@@ -39,6 +40,7 @@ class Censive < StringScanner
|
|
39
40
|
|
40
41
|
drop: false , # enable to drop trailing separators
|
41
42
|
eol: "\n" , # desired line endings for exports
|
43
|
+
excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
|
42
44
|
mode: :compact, # export mode: compact or full
|
43
45
|
out: nil , # output IO/file
|
44
46
|
relax: false , # relax parsing of quotes
|
@@ -48,56 +50,69 @@ class Censive < StringScanner
|
|
48
50
|
super(str || '')
|
49
51
|
reset
|
50
52
|
|
51
|
-
@sep
|
52
|
-
@quote
|
53
|
+
@sep = sep .freeze
|
54
|
+
@quote = quote.freeze
|
55
|
+
|
56
|
+
@drop = drop
|
57
|
+
@eol = eol.freeze
|
58
|
+
@excel = excel
|
59
|
+
@mode = mode
|
60
|
+
@out = out
|
61
|
+
@relax = relax
|
53
62
|
|
54
|
-
@
|
55
|
-
@
|
56
|
-
@
|
57
|
-
@
|
58
|
-
@
|
63
|
+
@es = "" .freeze
|
64
|
+
@cr = "\r" .freeze
|
65
|
+
@lf = "\n" .freeze
|
66
|
+
@eq = "=" .freeze
|
67
|
+
@esc = (@quote * 2).freeze
|
59
68
|
|
60
|
-
@
|
61
|
-
@cr = "\r" .freeze
|
62
|
-
@lf = "\n" .freeze
|
63
|
-
@esc = (@quote * 2).freeze
|
69
|
+
@tokens = [@sep,@quote,@cr,@lf,@es,nil]
|
64
70
|
end
|
65
71
|
|
66
72
|
def reset(str=nil)
|
67
73
|
self.string = str if str
|
68
74
|
super()
|
69
|
-
@char
|
70
|
-
@flag
|
75
|
+
@char = peek(1)
|
76
|
+
@flag = nil
|
71
77
|
|
72
|
-
@rows
|
73
|
-
@cols
|
78
|
+
@rows = nil
|
79
|
+
@cols = @cells = 0
|
74
80
|
end
|
75
81
|
|
76
82
|
# ==[ Lexer ]==
|
77
83
|
|
78
84
|
def next_char
|
79
85
|
getch
|
80
|
-
@char =
|
86
|
+
@char = peek(1)
|
81
87
|
end
|
82
88
|
|
83
89
|
def next_token
|
90
|
+
|
91
|
+
# process and clear @flag
|
84
92
|
case @flag
|
85
|
-
when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
|
93
|
+
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
86
94
|
when @cr then @flag = nil; next_char == @lf and next_char
|
87
95
|
when @lf then @flag = nil; next_char
|
96
|
+
else @flag = nil
|
88
97
|
end if @flag
|
89
98
|
|
90
|
-
|
99
|
+
# See http://bit.ly/3Y7jIvc
|
100
|
+
if @excel && @char == @eq
|
101
|
+
@flag = @eq
|
102
|
+
next_char
|
103
|
+
end
|
104
|
+
|
105
|
+
if @tokens.include?(@char)
|
91
106
|
case @char
|
92
|
-
when @quote #
|
107
|
+
when @quote # consume quoted cell
|
93
108
|
match = ""
|
94
109
|
while true
|
95
|
-
getch # consume the quote
|
110
|
+
getch # consume the quote that got us here
|
96
111
|
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
97
112
|
case next_char
|
98
|
-
when @sep
|
99
|
-
when @quote
|
100
|
-
when @cr,@lf,nil then break
|
113
|
+
when @sep then @flag = @es; next_char; break
|
114
|
+
when @quote then match << @quote
|
115
|
+
when @cr,@lf,@es,nil then break
|
101
116
|
else
|
102
117
|
if @relax
|
103
118
|
match << @quote + @char
|
@@ -107,14 +122,15 @@ class Censive < StringScanner
|
|
107
122
|
end
|
108
123
|
end
|
109
124
|
match
|
110
|
-
when @sep
|
111
|
-
when @cr
|
112
|
-
when @lf
|
113
|
-
when nil
|
125
|
+
when @sep then @flag = @es; next_char; @es
|
126
|
+
when @cr then @flag = @cr; nil
|
127
|
+
when @lf then @flag = @lf; nil
|
128
|
+
when @es,nil then nil
|
114
129
|
end
|
115
|
-
else #
|
130
|
+
else # consume unquoted cell
|
116
131
|
match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
|
117
|
-
|
132
|
+
match = @eq + match if @flag == @eq # preserve @eq for excel formulas
|
133
|
+
@char = peek(1)
|
118
134
|
@char == @sep and @flag = @es and next_char
|
119
135
|
match
|
120
136
|
end
|
@@ -146,9 +162,9 @@ class Censive < StringScanner
|
|
146
162
|
|
147
163
|
# ==[ Helpers ]==
|
148
164
|
|
149
|
-
# grok returns: 2
|
165
|
+
# grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
150
166
|
def grok(str)
|
151
|
-
if pos = str.index(/(#{@quote})|#{@sep}/o)
|
167
|
+
if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
|
152
168
|
$1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
|
153
169
|
else
|
154
170
|
0
|
@@ -158,13 +174,20 @@ class Censive < StringScanner
|
|
158
174
|
def <<(row)
|
159
175
|
@out or return super
|
160
176
|
|
177
|
+
# drop trailing seps, if specified
|
178
|
+
row.pop while row.last.empty? if @drop
|
179
|
+
|
161
180
|
# most compact export format
|
162
181
|
s,q = @sep, @quote
|
163
182
|
out = case @mode
|
164
183
|
when :compact
|
165
184
|
case grok(row.join)
|
166
|
-
when 0
|
167
|
-
|
185
|
+
when 0
|
186
|
+
row
|
187
|
+
when 1
|
188
|
+
row.map do |col|
|
189
|
+
col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
|
190
|
+
end
|
168
191
|
else
|
169
192
|
row.map do |col|
|
170
193
|
case grok(col)
|
@@ -178,9 +201,6 @@ class Censive < StringScanner
|
|
178
201
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
179
202
|
end.join(s)
|
180
203
|
|
181
|
-
# drop trailing seps, if specified
|
182
|
-
out.gsub!(/#{s}+\z/,'') if @drop
|
183
|
-
|
184
204
|
# write output, using desired line endings
|
185
205
|
@out << out + @eol
|
186
206
|
end
|
@@ -203,81 +223,25 @@ __END__
|
|
203
223
|
|
204
224
|
# ==[ Playground... ]==
|
205
225
|
|
206
|
-
STDOUT.sync = true
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
full = 0
|
218
|
-
|
219
|
-
ARGV.each do |path|
|
220
|
-
File.file?(path) or next
|
221
|
-
|
222
|
-
print "Processing #{path.inspect}"
|
223
|
-
|
224
|
-
rows.clear
|
225
|
-
cols.clear
|
226
|
-
seen = 0
|
227
|
-
coun += 1
|
228
|
-
|
229
|
-
dest = "#{path}-#{rand}"
|
230
|
-
|
231
|
-
begin
|
232
|
-
Censive.writer(dest) do |file|
|
233
|
-
Censive.new(File.read(path), relax: true).each do |cols|
|
234
|
-
cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
|
235
|
-
file << cols
|
236
|
-
seen += 1
|
237
|
-
print "." if (seen % 1e5) == 0
|
238
|
-
end
|
239
|
-
end
|
240
|
-
FileUtils.mv(dest, path)
|
241
|
-
full += (seen - 1)
|
242
|
-
puts " (#{seen - 1} rows of data)"
|
243
|
-
rescue
|
244
|
-
puts " - unable to process (#{$!})"
|
245
|
-
FileUtils.rm_f(dest)
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
|
250
|
-
|
251
|
-
__END__
|
252
|
-
,"CHUI, LOK HANG "BENNY", => ,"""CHUI, LOK HANG ""BENNY""",
|
226
|
+
# STDOUT.sync = true
|
227
|
+
#
|
228
|
+
# data = File.read('1.csv')
|
229
|
+
#
|
230
|
+
# Censive.writer('out.csv') do |out|
|
231
|
+
# Censive.new(data, relax: true, excel: true).each do |row|
|
232
|
+
# out << row
|
233
|
+
# end
|
234
|
+
# end
|
235
|
+
#
|
236
|
+
# __END__
|
253
237
|
|
254
|
-
|
238
|
+
ARGV << "z.csv" if ARGV.empty?
|
255
239
|
|
256
|
-
|
240
|
+
path = ARGV.first
|
241
|
+
data = File.read(path)
|
257
242
|
|
243
|
+
csv = Censive.new(data)
|
258
244
|
|
259
|
-
data
|
245
|
+
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
260
246
|
|
261
|
-
|
262
|
-
Censive.new(data).each do |row|
|
263
|
-
out << row
|
264
|
-
end
|
265
|
-
end
|
266
|
-
|
267
|
-
# ARGV << "z.csv" if ARGV.empty?
|
268
|
-
#
|
269
|
-
# case 1
|
270
|
-
# when 1
|
271
|
-
# path = ARGV.first
|
272
|
-
# data = File.read(path)
|
273
|
-
# when 2
|
274
|
-
# data = DATA.gets("\n\n").rstrip
|
275
|
-
# end
|
276
|
-
#
|
277
|
-
# STDOUT.sync = true
|
278
|
-
#
|
279
|
-
# csv = Censive.new(data)
|
280
|
-
#
|
281
|
-
# data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
282
|
-
#
|
283
|
-
# csv.stats
|
247
|
+
csv.stats
|