censive 0.7 → 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/censive.gemspec +1 -1
  4. data/lib/censive.rb +77 -113
  5. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a757fa8bbc5ddf364889e4b7feca2001f3784e8d0b2ff70a1b0349691a34aae
4
- data.tar.gz: 68dced562eb0dc9b7ad300447091ceb74c04a55201e88cc9fffbe1ba3bbc534d
3
+ metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
4
+ data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
5
5
  SHA512:
6
- metadata.gz: c48d7e2bd3d1a7baa5fb2fae7b0553de665737849e9a50721f704a1a1f67c758c545dfe53d21f32ce386b20ea21f04c67ee8d765bf20653774b9475ebb60711f
7
- data.tar.gz: 411d59006ebcb6a07161186b56f73a8dcc73beeaecbe14e786ad237935c62fd6ef0631483c8f297399098b0dea2387863f7be8c878568e0558804e5bd20b55ee
6
+ metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
7
+ data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe
data/README.md CHANGED
@@ -12,7 +12,7 @@ data = File.read('data.csv')
12
12
 
13
13
  # write out a tab-separated tsv file
14
14
  Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
15
- Censive.new(data).each do |row|
15
+ Censive.new(data, excel: true, relax: true).each do |row|
16
16
  out << row
17
17
  end
18
18
  end
data/censive.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.7"
5
+ s.version = "0.9"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
8
  s.summary = "A quick and lightweight CSV handling library for Ruby"
data/lib/censive.rb CHANGED
@@ -5,6 +5,9 @@
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
7
  # Date: Jan 30, 2023
8
+ #
9
+ # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
8
11
  # ==============================================================================
9
12
  # The goals are:
10
13
  #
@@ -15,10 +18,8 @@
15
18
  #
16
19
  # 1. Option to support IO streaming
17
20
  # 2. Option to strip whitespace
18
- # 3. Option to change output line endings
19
- # 4. Option to force quotes in output
20
- # 5. Option to allow reading excel CSV (="Text" for cells)
21
- # 6. Confirm file encodings such as UTF-8, UTF-16, etc.
21
+ # 3. Option to support headers in readers and writers
22
+ # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
22
23
  #
23
24
  # NOTE: Only getch and scan_until advance strscan's position
24
25
  # ==============================================================================
@@ -39,6 +40,7 @@ class Censive < StringScanner
39
40
 
40
41
  drop: false , # enable to drop trailing separators
41
42
  eol: "\n" , # desired line endings for exports
43
+ excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
42
44
  mode: :compact, # export mode: compact or full
43
45
  out: nil , # output IO/file
44
46
  relax: false , # relax parsing of quotes
@@ -48,56 +50,69 @@ class Censive < StringScanner
48
50
  super(str || '')
49
51
  reset
50
52
 
51
- @sep = sep .freeze
52
- @quote = quote.freeze
53
+ @sep = sep .freeze
54
+ @quote = quote.freeze
55
+
56
+ @drop = drop
57
+ @eol = eol.freeze
58
+ @excel = excel
59
+ @mode = mode
60
+ @out = out
61
+ @relax = relax
53
62
 
54
- @drop = drop
55
- @eol = eol.freeze
56
- @mode = mode
57
- @out = out
58
- @relax = relax
63
+ @es = "" .freeze
64
+ @cr = "\r" .freeze
65
+ @lf = "\n" .freeze
66
+ @eq = "=" .freeze
67
+ @esc = (@quote * 2).freeze
59
68
 
60
- @es = "" .freeze
61
- @cr = "\r" .freeze
62
- @lf = "\n" .freeze
63
- @esc = (@quote * 2).freeze
69
+ @tokens = [@sep,@quote,@cr,@lf,@es,nil]
64
70
  end
65
71
 
66
72
  def reset(str=nil)
67
73
  self.string = str if str
68
74
  super()
69
- @char = string[pos]
70
- @flag = nil
75
+ @char = peek(1)
76
+ @flag = nil
71
77
 
72
- @rows = nil
73
- @cols = @cells = 0
78
+ @rows = nil
79
+ @cols = @cells = 0
74
80
  end
75
81
 
76
82
  # ==[ Lexer ]==
77
83
 
78
84
  def next_char
79
85
  getch
80
- @char = string[pos]
86
+ @char = peek(1)
81
87
  end
82
88
 
83
89
  def next_token
90
+
91
+ # process and clear @flag
84
92
  case @flag
85
- when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
93
+ when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
86
94
  when @cr then @flag = nil; next_char == @lf and next_char
87
95
  when @lf then @flag = nil; next_char
96
+ else @flag = nil
88
97
  end if @flag
89
98
 
90
- if [@sep,@quote,@cr,@lf,nil].include?(@char)
99
+ # See http://bit.ly/3Y7jIvc
100
+ if @excel && @char == @eq
101
+ @flag = @eq
102
+ next_char
103
+ end
104
+
105
+ if @tokens.include?(@char)
91
106
  case @char
92
- when @quote # consume_quoted_cell
107
+ when @quote # consume quoted cell
93
108
  match = ""
94
109
  while true
95
- getch # consume the quote (optimized by not calling next_char)
110
+ getch # consume the quote that got us here
96
111
  match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
97
112
  case next_char
98
- when @sep then @flag = @es; next_char; break
99
- when @quote then match << @quote
100
- when @cr,@lf,nil then break
113
+ when @sep then @flag = @es; next_char; break
114
+ when @quote then match << @quote
115
+ when @cr,@lf,@es,nil then break
101
116
  else
102
117
  if @relax
103
118
  match << @quote + @char
@@ -107,14 +122,15 @@ class Censive < StringScanner
107
122
  end
108
123
  end
109
124
  match
110
- when @sep then @flag = @es; next_char; @es
111
- when @cr then @flag = @cr; nil
112
- when @lf then @flag = @lf; nil
113
- when nil then nil
125
+ when @sep then @flag = @es; next_char; @es
126
+ when @cr then @flag = @cr; nil
127
+ when @lf then @flag = @lf; nil
128
+ when @es,nil then nil
114
129
  end
115
- else # consume_unquoted_cell
130
+ else # consume unquoted cell
116
131
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
117
- @char = string[pos]
132
+ match = @eq + match if @flag == @eq # preserve @eq for excel formulas
133
+ @char = peek(1)
118
134
  @char == @sep and @flag = @es and next_char
119
135
  match
120
136
  end
@@ -146,9 +162,9 @@ class Censive < StringScanner
146
162
 
147
163
  # ==[ Helpers ]==
148
164
 
149
- # grok returns: 2 for seps and quotes, 1 for seps only, and 0 for neither
165
+ # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
150
166
  def grok(str)
151
- if pos = str.index(/(#{@quote})|#{@sep}/o)
167
+ if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
152
168
  $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
153
169
  else
154
170
  0
@@ -158,13 +174,20 @@ class Censive < StringScanner
158
174
  def <<(row)
159
175
  @out or return super
160
176
 
177
+ # drop trailing seps, if specified
178
+ row.pop while row.last.empty? if @drop
179
+
161
180
  # most compact export format
162
181
  s,q = @sep, @quote
163
182
  out = case @mode
164
183
  when :compact
165
184
  case grok(row.join)
166
- when 0 then row
167
- when 1 then row.map {|col| col.include?(s) ? "#{q}#{col}#{q}" : col }
185
+ when 0
186
+ row
187
+ when 1
188
+ row.map do |col|
189
+ col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
190
+ end
168
191
  else
169
192
  row.map do |col|
170
193
  case grok(col)
@@ -178,9 +201,6 @@ class Censive < StringScanner
178
201
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
179
202
  end.join(s)
180
203
 
181
- # drop trailing seps, if specified
182
- out.gsub!(/#{s}+\z/,'') if @drop
183
-
184
204
  # write output, using desired line endings
185
205
  @out << out + @eol
186
206
  end
@@ -203,81 +223,25 @@ __END__
203
223
 
204
224
  # ==[ Playground... ]==
205
225
 
206
- STDOUT.sync = true
207
-
208
- require 'fileutils'
209
-
210
- ARGV << "101.csv"
211
-
212
- rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
213
-
214
- rows = []
215
- cols = []
216
- coun = 0
217
- full = 0
218
-
219
- ARGV.each do |path|
220
- File.file?(path) or next
221
-
222
- print "Processing #{path.inspect}"
223
-
224
- rows.clear
225
- cols.clear
226
- seen = 0
227
- coun += 1
228
-
229
- dest = "#{path}-#{rand}"
230
-
231
- begin
232
- Censive.writer(dest) do |file|
233
- Censive.new(File.read(path), relax: true).each do |cols|
234
- cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
235
- file << cols
236
- seen += 1
237
- print "." if (seen % 1e5) == 0
238
- end
239
- end
240
- FileUtils.mv(dest, path)
241
- full += (seen - 1)
242
- puts " (#{seen - 1} rows of data)"
243
- rescue
244
- puts " - unable to process (#{$!})"
245
- FileUtils.rm_f(dest)
246
- end
247
- end
248
-
249
- puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
250
-
251
- __END__
252
- ,"CHUI, LOK HANG "BENNY", => ,"""CHUI, LOK HANG ""BENNY""",
226
+ # STDOUT.sync = true
227
+ #
228
+ # data = File.read('1.csv')
229
+ #
230
+ # Censive.writer('out.csv') do |out|
231
+ # Censive.new(data, relax: true, excel: true).each do |row|
232
+ # out << row
233
+ # end
234
+ # end
235
+ #
236
+ # __END__
253
237
 
254
- ,"..............."B
238
+ ARGV << "z.csv" if ARGV.empty?
255
239
 
256
- __END__
240
+ path = ARGV.first
241
+ data = File.read(path)
257
242
 
243
+ csv = Censive.new(data)
258
244
 
259
- data = File.read('1.csv')
245
+ data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
260
246
 
261
- Censive.writer('out.csv', sep: ',', quote: "'") do |out|
262
- Censive.new(data).each do |row|
263
- out << row
264
- end
265
- end
266
-
267
- # ARGV << "z.csv" if ARGV.empty?
268
- #
269
- # case 1
270
- # when 1
271
- # path = ARGV.first
272
- # data = File.read(path)
273
- # when 2
274
- # data = DATA.gets("\n\n").rstrip
275
- # end
276
- #
277
- # STDOUT.sync = true
278
- #
279
- # csv = Censive.new(data)
280
- #
281
- # data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
282
- #
283
- # csv.stats
247
+ csv.stats
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.7'
4
+ version: '0.9'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve