censive 0.7 → 0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/censive.gemspec +1 -1
- data/lib/censive.rb +77 -113
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
|
|
4
|
+
data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
|
|
7
|
+
data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe
|
data/README.md
CHANGED
data/censive.gemspec
CHANGED
data/lib/censive.rb
CHANGED
|
@@ -5,6 +5,9 @@
|
|
|
5
5
|
#
|
|
6
6
|
# Author: Steve Shreeve (steve.shreeve@gmail.com)
|
|
7
7
|
# Date: Jan 30, 2023
|
|
8
|
+
#
|
|
9
|
+
# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
|
|
10
|
+
# and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
|
|
8
11
|
# ==============================================================================
|
|
9
12
|
# The goals are:
|
|
10
13
|
#
|
|
@@ -15,10 +18,8 @@
|
|
|
15
18
|
#
|
|
16
19
|
# 1. Option to support IO streaming
|
|
17
20
|
# 2. Option to strip whitespace
|
|
18
|
-
# 3. Option to
|
|
19
|
-
# 4.
|
|
20
|
-
# 5. Option to allow reading excel CSV (="Text" for cells)
|
|
21
|
-
# 6. Confirm file encodings such as UTF-8, UTF-16, etc.
|
|
21
|
+
# 3. Option to support headers in readers and writers
|
|
22
|
+
# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
|
|
22
23
|
#
|
|
23
24
|
# NOTE: Only getch and scan_until advance strscan's position
|
|
24
25
|
# ==============================================================================
|
|
@@ -39,6 +40,7 @@ class Censive < StringScanner
|
|
|
39
40
|
|
|
40
41
|
drop: false , # enable to drop trailing separators
|
|
41
42
|
eol: "\n" , # desired line endings for exports
|
|
43
|
+
excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
|
|
42
44
|
mode: :compact, # export mode: compact or full
|
|
43
45
|
out: nil , # output IO/file
|
|
44
46
|
relax: false , # relax parsing of quotes
|
|
@@ -48,56 +50,69 @@ class Censive < StringScanner
|
|
|
48
50
|
super(str || '')
|
|
49
51
|
reset
|
|
50
52
|
|
|
51
|
-
@sep
|
|
52
|
-
@quote
|
|
53
|
+
@sep = sep .freeze
|
|
54
|
+
@quote = quote.freeze
|
|
55
|
+
|
|
56
|
+
@drop = drop
|
|
57
|
+
@eol = eol.freeze
|
|
58
|
+
@excel = excel
|
|
59
|
+
@mode = mode
|
|
60
|
+
@out = out
|
|
61
|
+
@relax = relax
|
|
53
62
|
|
|
54
|
-
@
|
|
55
|
-
@
|
|
56
|
-
@
|
|
57
|
-
@
|
|
58
|
-
@
|
|
63
|
+
@es = "" .freeze
|
|
64
|
+
@cr = "\r" .freeze
|
|
65
|
+
@lf = "\n" .freeze
|
|
66
|
+
@eq = "=" .freeze
|
|
67
|
+
@esc = (@quote * 2).freeze
|
|
59
68
|
|
|
60
|
-
@
|
|
61
|
-
@cr = "\r" .freeze
|
|
62
|
-
@lf = "\n" .freeze
|
|
63
|
-
@esc = (@quote * 2).freeze
|
|
69
|
+
@tokens = [@sep,@quote,@cr,@lf,@es,nil]
|
|
64
70
|
end
|
|
65
71
|
|
|
66
72
|
def reset(str=nil)
|
|
67
73
|
self.string = str if str
|
|
68
74
|
super()
|
|
69
|
-
@char
|
|
70
|
-
@flag
|
|
75
|
+
@char = peek(1)
|
|
76
|
+
@flag = nil
|
|
71
77
|
|
|
72
|
-
@rows
|
|
73
|
-
@cols
|
|
78
|
+
@rows = nil
|
|
79
|
+
@cols = @cells = 0
|
|
74
80
|
end
|
|
75
81
|
|
|
76
82
|
# ==[ Lexer ]==
|
|
77
83
|
|
|
78
84
|
def next_char
|
|
79
85
|
getch
|
|
80
|
-
@char =
|
|
86
|
+
@char = peek(1)
|
|
81
87
|
end
|
|
82
88
|
|
|
83
89
|
def next_token
|
|
90
|
+
|
|
91
|
+
# process and clear @flag
|
|
84
92
|
case @flag
|
|
85
|
-
when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
|
|
93
|
+
when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
|
|
86
94
|
when @cr then @flag = nil; next_char == @lf and next_char
|
|
87
95
|
when @lf then @flag = nil; next_char
|
|
96
|
+
else @flag = nil
|
|
88
97
|
end if @flag
|
|
89
98
|
|
|
90
|
-
|
|
99
|
+
# See http://bit.ly/3Y7jIvc
|
|
100
|
+
if @excel && @char == @eq
|
|
101
|
+
@flag = @eq
|
|
102
|
+
next_char
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
if @tokens.include?(@char)
|
|
91
106
|
case @char
|
|
92
|
-
when @quote #
|
|
107
|
+
when @quote # consume quoted cell
|
|
93
108
|
match = ""
|
|
94
109
|
while true
|
|
95
|
-
getch # consume the quote
|
|
110
|
+
getch # consume the quote that got us here
|
|
96
111
|
match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
|
|
97
112
|
case next_char
|
|
98
|
-
when @sep
|
|
99
|
-
when @quote
|
|
100
|
-
when @cr,@lf,nil then break
|
|
113
|
+
when @sep then @flag = @es; next_char; break
|
|
114
|
+
when @quote then match << @quote
|
|
115
|
+
when @cr,@lf,@es,nil then break
|
|
101
116
|
else
|
|
102
117
|
if @relax
|
|
103
118
|
match << @quote + @char
|
|
@@ -107,14 +122,15 @@ class Censive < StringScanner
|
|
|
107
122
|
end
|
|
108
123
|
end
|
|
109
124
|
match
|
|
110
|
-
when @sep
|
|
111
|
-
when @cr
|
|
112
|
-
when @lf
|
|
113
|
-
when nil
|
|
125
|
+
when @sep then @flag = @es; next_char; @es
|
|
126
|
+
when @cr then @flag = @cr; nil
|
|
127
|
+
when @lf then @flag = @lf; nil
|
|
128
|
+
when @es,nil then nil
|
|
114
129
|
end
|
|
115
|
-
else #
|
|
130
|
+
else # consume unquoted cell
|
|
116
131
|
match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
|
|
117
|
-
|
|
132
|
+
match = @eq + match if @flag == @eq # preserve @eq for excel formulas
|
|
133
|
+
@char = peek(1)
|
|
118
134
|
@char == @sep and @flag = @es and next_char
|
|
119
135
|
match
|
|
120
136
|
end
|
|
@@ -146,9 +162,9 @@ class Censive < StringScanner
|
|
|
146
162
|
|
|
147
163
|
# ==[ Helpers ]==
|
|
148
164
|
|
|
149
|
-
# grok returns: 2
|
|
165
|
+
# grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
|
|
150
166
|
def grok(str)
|
|
151
|
-
if pos = str.index(/(#{@quote})|#{@sep}/o)
|
|
167
|
+
if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
|
|
152
168
|
$1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
|
|
153
169
|
else
|
|
154
170
|
0
|
|
@@ -158,13 +174,20 @@ class Censive < StringScanner
|
|
|
158
174
|
def <<(row)
|
|
159
175
|
@out or return super
|
|
160
176
|
|
|
177
|
+
# drop trailing seps, if specified
|
|
178
|
+
row.pop while row.last.empty? if @drop
|
|
179
|
+
|
|
161
180
|
# most compact export format
|
|
162
181
|
s,q = @sep, @quote
|
|
163
182
|
out = case @mode
|
|
164
183
|
when :compact
|
|
165
184
|
case grok(row.join)
|
|
166
|
-
when 0
|
|
167
|
-
|
|
185
|
+
when 0
|
|
186
|
+
row
|
|
187
|
+
when 1
|
|
188
|
+
row.map do |col|
|
|
189
|
+
col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
|
|
190
|
+
end
|
|
168
191
|
else
|
|
169
192
|
row.map do |col|
|
|
170
193
|
case grok(col)
|
|
@@ -178,9 +201,6 @@ class Censive < StringScanner
|
|
|
178
201
|
row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
|
|
179
202
|
end.join(s)
|
|
180
203
|
|
|
181
|
-
# drop trailing seps, if specified
|
|
182
|
-
out.gsub!(/#{s}+\z/,'') if @drop
|
|
183
|
-
|
|
184
204
|
# write output, using desired line endings
|
|
185
205
|
@out << out + @eol
|
|
186
206
|
end
|
|
@@ -203,81 +223,25 @@ __END__
|
|
|
203
223
|
|
|
204
224
|
# ==[ Playground... ]==
|
|
205
225
|
|
|
206
|
-
STDOUT.sync = true
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
full = 0
|
|
218
|
-
|
|
219
|
-
ARGV.each do |path|
|
|
220
|
-
File.file?(path) or next
|
|
221
|
-
|
|
222
|
-
print "Processing #{path.inspect}"
|
|
223
|
-
|
|
224
|
-
rows.clear
|
|
225
|
-
cols.clear
|
|
226
|
-
seen = 0
|
|
227
|
-
coun += 1
|
|
228
|
-
|
|
229
|
-
dest = "#{path}-#{rand}"
|
|
230
|
-
|
|
231
|
-
begin
|
|
232
|
-
Censive.writer(dest) do |file|
|
|
233
|
-
Censive.new(File.read(path), relax: true).each do |cols|
|
|
234
|
-
cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
|
|
235
|
-
file << cols
|
|
236
|
-
seen += 1
|
|
237
|
-
print "." if (seen % 1e5) == 0
|
|
238
|
-
end
|
|
239
|
-
end
|
|
240
|
-
FileUtils.mv(dest, path)
|
|
241
|
-
full += (seen - 1)
|
|
242
|
-
puts " (#{seen - 1} rows of data)"
|
|
243
|
-
rescue
|
|
244
|
-
puts " - unable to process (#{$!})"
|
|
245
|
-
FileUtils.rm_f(dest)
|
|
246
|
-
end
|
|
247
|
-
end
|
|
248
|
-
|
|
249
|
-
puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
|
|
250
|
-
|
|
251
|
-
__END__
|
|
252
|
-
,"CHUI, LOK HANG "BENNY", => ,"""CHUI, LOK HANG ""BENNY""",
|
|
226
|
+
# STDOUT.sync = true
|
|
227
|
+
#
|
|
228
|
+
# data = File.read('1.csv')
|
|
229
|
+
#
|
|
230
|
+
# Censive.writer('out.csv') do |out|
|
|
231
|
+
# Censive.new(data, relax: true, excel: true).each do |row|
|
|
232
|
+
# out << row
|
|
233
|
+
# end
|
|
234
|
+
# end
|
|
235
|
+
#
|
|
236
|
+
# __END__
|
|
253
237
|
|
|
254
|
-
|
|
238
|
+
ARGV << "z.csv" if ARGV.empty?
|
|
255
239
|
|
|
256
|
-
|
|
240
|
+
path = ARGV.first
|
|
241
|
+
data = File.read(path)
|
|
257
242
|
|
|
243
|
+
csv = Censive.new(data)
|
|
258
244
|
|
|
259
|
-
data
|
|
245
|
+
data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
|
260
246
|
|
|
261
|
-
|
|
262
|
-
Censive.new(data).each do |row|
|
|
263
|
-
out << row
|
|
264
|
-
end
|
|
265
|
-
end
|
|
266
|
-
|
|
267
|
-
# ARGV << "z.csv" if ARGV.empty?
|
|
268
|
-
#
|
|
269
|
-
# case 1
|
|
270
|
-
# when 1
|
|
271
|
-
# path = ARGV.first
|
|
272
|
-
# data = File.read(path)
|
|
273
|
-
# when 2
|
|
274
|
-
# data = DATA.gets("\n\n").rstrip
|
|
275
|
-
# end
|
|
276
|
-
#
|
|
277
|
-
# STDOUT.sync = true
|
|
278
|
-
#
|
|
279
|
-
# csv = Censive.new(data)
|
|
280
|
-
#
|
|
281
|
-
# data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
|
|
282
|
-
#
|
|
283
|
-
# csv.stats
|
|
247
|
+
csv.stats
|