censive 0.7 → 0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/censive.gemspec +1 -1
  4. data/lib/censive.rb +77 -113
  5. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a757fa8bbc5ddf364889e4b7feca2001f3784e8d0b2ff70a1b0349691a34aae
4
- data.tar.gz: 68dced562eb0dc9b7ad300447091ceb74c04a55201e88cc9fffbe1ba3bbc534d
3
+ metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
4
+ data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
5
5
  SHA512:
6
- metadata.gz: c48d7e2bd3d1a7baa5fb2fae7b0553de665737849e9a50721f704a1a1f67c758c545dfe53d21f32ce386b20ea21f04c67ee8d765bf20653774b9475ebb60711f
7
- data.tar.gz: 411d59006ebcb6a07161186b56f73a8dcc73beeaecbe14e786ad237935c62fd6ef0631483c8f297399098b0dea2387863f7be8c878568e0558804e5bd20b55ee
6
+ metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
7
+ data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe
data/README.md CHANGED
@@ -12,7 +12,7 @@ data = File.read('data.csv')
12
12
 
13
13
  # write out a tab-separated tsv file
14
14
  Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
15
- Censive.new(data).each do |row|
15
+ Censive.new(data, excel: true, relax: true).each do |row|
16
16
  out << row
17
17
  end
18
18
  end
data/censive.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.7"
5
+ s.version = "0.9"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
8
  s.summary = "A quick and lightweight CSV handling library for Ruby"
data/lib/censive.rb CHANGED
@@ -5,6 +5,9 @@
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
7
  # Date: Jan 30, 2023
8
+ #
9
+ # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
8
11
  # ==============================================================================
9
12
  # The goals are:
10
13
  #
@@ -15,10 +18,8 @@
15
18
  #
16
19
  # 1. Option to support IO streaming
17
20
  # 2. Option to strip whitespace
18
- # 3. Option to change output line endings
19
- # 4. Option to force quotes in output
20
- # 5. Option to allow reading excel CSV (="Text" for cells)
21
- # 6. Confirm file encodings such as UTF-8, UTF-16, etc.
21
+ # 3. Option to support headers in readers and writers
22
+ # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
22
23
  #
23
24
  # NOTE: Only getch and scan_until advance strscan's position
24
25
  # ==============================================================================
@@ -39,6 +40,7 @@ class Censive < StringScanner
39
40
 
40
41
  drop: false , # enable to drop trailing separators
41
42
  eol: "\n" , # desired line endings for exports
43
+ excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
42
44
  mode: :compact, # export mode: compact or full
43
45
  out: nil , # output IO/file
44
46
  relax: false , # relax parsing of quotes
@@ -48,56 +50,69 @@ class Censive < StringScanner
48
50
  super(str || '')
49
51
  reset
50
52
 
51
- @sep = sep .freeze
52
- @quote = quote.freeze
53
+ @sep = sep .freeze
54
+ @quote = quote.freeze
55
+
56
+ @drop = drop
57
+ @eol = eol.freeze
58
+ @excel = excel
59
+ @mode = mode
60
+ @out = out
61
+ @relax = relax
53
62
 
54
- @drop = drop
55
- @eol = eol.freeze
56
- @mode = mode
57
- @out = out
58
- @relax = relax
63
+ @es = "" .freeze
64
+ @cr = "\r" .freeze
65
+ @lf = "\n" .freeze
66
+ @eq = "=" .freeze
67
+ @esc = (@quote * 2).freeze
59
68
 
60
- @es = "" .freeze
61
- @cr = "\r" .freeze
62
- @lf = "\n" .freeze
63
- @esc = (@quote * 2).freeze
69
+ @tokens = [@sep,@quote,@cr,@lf,@es,nil]
64
70
  end
65
71
 
66
72
  def reset(str=nil)
67
73
  self.string = str if str
68
74
  super()
69
- @char = string[pos]
70
- @flag = nil
75
+ @char = peek(1)
76
+ @flag = nil
71
77
 
72
- @rows = nil
73
- @cols = @cells = 0
78
+ @rows = nil
79
+ @cols = @cells = 0
74
80
  end
75
81
 
76
82
  # ==[ Lexer ]==
77
83
 
78
84
  def next_char
79
85
  getch
80
- @char = string[pos]
86
+ @char = peek(1)
81
87
  end
82
88
 
83
89
  def next_token
90
+
91
+ # process and clear @flag
84
92
  case @flag
85
- when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
93
+ when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
86
94
  when @cr then @flag = nil; next_char == @lf and next_char
87
95
  when @lf then @flag = nil; next_char
96
+ else @flag = nil
88
97
  end if @flag
89
98
 
90
- if [@sep,@quote,@cr,@lf,nil].include?(@char)
99
+ # See http://bit.ly/3Y7jIvc
100
+ if @excel && @char == @eq
101
+ @flag = @eq
102
+ next_char
103
+ end
104
+
105
+ if @tokens.include?(@char)
91
106
  case @char
92
- when @quote # consume_quoted_cell
107
+ when @quote # consume quoted cell
93
108
  match = ""
94
109
  while true
95
- getch # consume the quote (optimized by not calling next_char)
110
+ getch # consume the quote that got us here
96
111
  match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
97
112
  case next_char
98
- when @sep then @flag = @es; next_char; break
99
- when @quote then match << @quote
100
- when @cr,@lf,nil then break
113
+ when @sep then @flag = @es; next_char; break
114
+ when @quote then match << @quote
115
+ when @cr,@lf,@es,nil then break
101
116
  else
102
117
  if @relax
103
118
  match << @quote + @char
@@ -107,14 +122,15 @@ class Censive < StringScanner
107
122
  end
108
123
  end
109
124
  match
110
- when @sep then @flag = @es; next_char; @es
111
- when @cr then @flag = @cr; nil
112
- when @lf then @flag = @lf; nil
113
- when nil then nil
125
+ when @sep then @flag = @es; next_char; @es
126
+ when @cr then @flag = @cr; nil
127
+ when @lf then @flag = @lf; nil
128
+ when @es,nil then nil
114
129
  end
115
- else # consume_unquoted_cell
130
+ else # consume unquoted cell
116
131
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
117
- @char = string[pos]
132
+ match = @eq + match if @flag == @eq # preserve @eq for excel formulas
133
+ @char = peek(1)
118
134
  @char == @sep and @flag = @es and next_char
119
135
  match
120
136
  end
@@ -146,9 +162,9 @@ class Censive < StringScanner
146
162
 
147
163
  # ==[ Helpers ]==
148
164
 
149
- # grok returns: 2 for seps and quotes, 1 for seps only, and 0 for neither
165
+ # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
150
166
  def grok(str)
151
- if pos = str.index(/(#{@quote})|#{@sep}/o)
167
+ if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
152
168
  $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
153
169
  else
154
170
  0
@@ -158,13 +174,20 @@ class Censive < StringScanner
158
174
  def <<(row)
159
175
  @out or return super
160
176
 
177
+ # drop trailing seps, if specified
178
+ row.pop while row.last.empty? if @drop
179
+
161
180
  # most compact export format
162
181
  s,q = @sep, @quote
163
182
  out = case @mode
164
183
  when :compact
165
184
  case grok(row.join)
166
- when 0 then row
167
- when 1 then row.map {|col| col.include?(s) ? "#{q}#{col}#{q}" : col }
185
+ when 0
186
+ row
187
+ when 1
188
+ row.map do |col|
189
+ col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
190
+ end
168
191
  else
169
192
  row.map do |col|
170
193
  case grok(col)
@@ -178,9 +201,6 @@ class Censive < StringScanner
178
201
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
179
202
  end.join(s)
180
203
 
181
- # drop trailing seps, if specified
182
- out.gsub!(/#{s}+\z/,'') if @drop
183
-
184
204
  # write output, using desired line endings
185
205
  @out << out + @eol
186
206
  end
@@ -203,81 +223,25 @@ __END__
203
223
 
204
224
  # ==[ Playground... ]==
205
225
 
206
- STDOUT.sync = true
207
-
208
- require 'fileutils'
209
-
210
- ARGV << "101.csv"
211
-
212
- rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
213
-
214
- rows = []
215
- cols = []
216
- coun = 0
217
- full = 0
218
-
219
- ARGV.each do |path|
220
- File.file?(path) or next
221
-
222
- print "Processing #{path.inspect}"
223
-
224
- rows.clear
225
- cols.clear
226
- seen = 0
227
- coun += 1
228
-
229
- dest = "#{path}-#{rand}"
230
-
231
- begin
232
- Censive.writer(dest) do |file|
233
- Censive.new(File.read(path), relax: true).each do |cols|
234
- cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
235
- file << cols
236
- seen += 1
237
- print "." if (seen % 1e5) == 0
238
- end
239
- end
240
- FileUtils.mv(dest, path)
241
- full += (seen - 1)
242
- puts " (#{seen - 1} rows of data)"
243
- rescue
244
- puts " - unable to process (#{$!})"
245
- FileUtils.rm_f(dest)
246
- end
247
- end
248
-
249
- puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
250
-
251
- __END__
252
- ,"CHUI, LOK HANG "BENNY", => ,"""CHUI, LOK HANG ""BENNY""",
226
+ # STDOUT.sync = true
227
+ #
228
+ # data = File.read('1.csv')
229
+ #
230
+ # Censive.writer('out.csv') do |out|
231
+ # Censive.new(data, relax: true, excel: true).each do |row|
232
+ # out << row
233
+ # end
234
+ # end
235
+ #
236
+ # __END__
253
237
 
254
- ,"..............."B
238
+ ARGV << "z.csv" if ARGV.empty?
255
239
 
256
- __END__
240
+ path = ARGV.first
241
+ data = File.read(path)
257
242
 
243
+ csv = Censive.new(data)
258
244
 
259
- data = File.read('1.csv')
245
+ data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
260
246
 
261
- Censive.writer('out.csv', sep: ',', quote: "'") do |out|
262
- Censive.new(data).each do |row|
263
- out << row
264
- end
265
- end
266
-
267
- # ARGV << "z.csv" if ARGV.empty?
268
- #
269
- # case 1
270
- # when 1
271
- # path = ARGV.first
272
- # data = File.read(path)
273
- # when 2
274
- # data = DATA.gets("\n\n").rstrip
275
- # end
276
- #
277
- # STDOUT.sync = true
278
- #
279
- # csv = Censive.new(data)
280
- #
281
- # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
282
- #
283
- # csv.stats
247
+ csv.stats
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.7'
4
+ version: '0.9'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve