censive 0.9 → 0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
4
- data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
3
+ metadata.gz: 91f3884b92c79f8cca37066a420c872b612c37ed736f61fa7d11c0f0fd861d8e
4
+ data.tar.gz: 71789d006309c1d87e681a7078e718342cb36e1f8a18690c59e51595c49d2e59
5
5
  SHA512:
6
- metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
7
- data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe
6
+ metadata.gz: d5fe33889abe08bc6aa57bb4dec2d65ef3e8935da5aa209a0bcb6060696de07cf44189ad9be6f43ffcd2897672668bd3aff7e8b713f5070199627e40edd3704f
7
+ data.tar.gz: 2c79d30b4682da07800c19eb93067c5c72d471f9abc7b824f7525b3d36b8fb1108d0e733706852f750baac57360af2d5b77527a1daab871e2313d558928ce240
data/README.md CHANGED
@@ -2,7 +2,58 @@
2
2
 
3
3
  A quick and lightweight CSV handling library for Ruby
4
4
 
5
- ## Writing CSV
5
+ ## Example
6
+
7
+ ```ruby
8
+ #!/usr/bin/env ruby
9
+
10
+ STDOUT.sync = true
11
+
12
+ require 'censive'
13
+ require 'fileutils'
14
+
15
+ abort "usage: #{File.basename($0)} <files>" if ARGV.empty?
16
+
17
+ rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
18
+
19
+ rows = []
20
+ cols = []
21
+ coun = 0
22
+ full = 0
23
+
24
+ ARGV.each do |path|
25
+ File.file?(path) or next
26
+
27
+ print "Processing #{path.inspect}"
28
+
29
+ rows.clear
30
+ cols.clear
31
+ seen = 0
32
+ coun += 1
33
+
34
+ dest = "#{path}-#{rand}"
35
+
36
+ begin
37
+ Censive.writer(dest) do |file|
38
+ Censive.reader(path, excel: true, relax: true).each do |cols|
39
+ file << cols
40
+ seen += 1
41
+ print "." if (seen % 1e5) == 0 # give a status update every so often
42
+ end
43
+ end
44
+ FileUtils.mv(dest, path)
45
+ full += (seen - 1)
46
+ puts " (#{seen - 1} rows of data)"
47
+ rescue
48
+ puts " - unable to process (#{$!})"
49
+ FileUtils.rm_f(dest)
50
+ end
51
+ end
52
+
53
+ puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
54
+ ```
55
+
56
+ ## Convert a CSV file to a TSV file
6
57
 
7
58
  ```ruby
8
59
  require 'censive'
@@ -17,3 +68,12 @@ Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
17
68
  end
18
69
  end
19
70
  ```
71
+
72
+ Or, you can be more succinct with:
73
+
74
+ ```ruby
75
+ require 'censive'
76
+
77
+ csv = Censive.new(File.read('data.csv'))
78
+ csv.export(sep: "\t")
79
+ ```
data/censive.gemspec CHANGED
@@ -2,13 +2,14 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.9"
5
+ s.version = "0.11"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
- s.summary = "A quick and lightweight CSV handling library for Ruby"
8
+ s.summary =
9
9
  s.description = "A quick and lightweight CSV handling library for Ruby"
10
10
  s.homepage = "https://github.com/shreeve/censive"
11
11
  s.license = "MIT"
12
12
  s.files = `git ls-files`.split("\n") - %w[.gitignore]
13
13
  s.executables = `(cd bin 2>&1) > /dev/null && git ls-files .`.split("\n")
14
+ s.add_dependency "strscan", ">= 3.0.6"
14
15
  end
data/lib/censive.rb CHANGED
@@ -1,137 +1,116 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # ==============================================================================
3
+ # ============================================================================
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Jan 30, 2023
8
- #
9
- # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- # ==============================================================================
12
- # The goals are:
7
+ # Date: Feb 3, 2023
13
8
  #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ # https://github.com/ruby/strscan/issues/53 for details
12
+ # https://github.com/ruby/strscan/pull/54 for code
13
+ # ============================================================================
14
+ # GOALS:
14
15
  # 1. Faster than Ruby's default CSV library
15
- # 2. Lightweight code base with streamlined method calls
16
- #
17
- # To consider:
18
- #
19
- # 1. Option to support IO streaming
20
- # 2. Option to strip whitespace
21
- # 3. Option to support headers in readers and writers
22
- # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
16
+ # 2. Lightweight code base with streamlined logic
17
+ # 3. Support for most non-compliant CSV variations (eg - @relax, @excel)
23
18
  #
24
- # NOTE: Only getch and scan_until advance strscan's position
25
- # ==============================================================================
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Add option to strip whitespace
22
+ # 3. Support CSV headers in first row
23
+ # ============================================================================
26
24
 
25
+ require 'bundler/setup'
27
26
  require 'strscan'
28
27
 
29
28
  class Censive < StringScanner
30
29
 
31
- def self.writer(path, **opts)
32
- File.open(path, 'w') do |file|
33
- yield new(out: file, **opts)
30
+ def self.writer(obj=nil, **opts, &code)
31
+ case obj
32
+ when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
33
+ when IO,nil then new(out: obj, **opts, &code)
34
+ else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
34
35
  end
35
36
  end
36
37
 
37
38
  def initialize(str=nil,
38
- sep: ',' , # column separator character
39
- quote: '"' , # quote character
40
-
41
- drop: false , # enable to drop trailing separators
42
- eol: "\n" , # desired line endings for exports
43
- excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
39
+ drop: false , # drop trailing empty fields?
40
+ eol: "\n" , # line endings for exports
41
+ excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
44
42
  mode: :compact, # export mode: compact or full
45
- out: nil , # output IO/file
46
- relax: false , # relax parsing of quotes
47
-
43
+ out: nil , # output stream, needs to respond to <<
44
+ quote: '"' , # quote character
45
+ relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
46
+ sep: ',' , # column separator character
48
47
  **opts # grab bag
49
48
  )
50
49
  super(str || '')
51
50
  reset
52
51
 
53
- @sep = sep .freeze
54
- @quote = quote.freeze
55
-
56
52
  @drop = drop
57
- @eol = eol.freeze
53
+ @eol = eol
58
54
  @excel = excel
59
55
  @mode = mode
60
- @out = out
56
+ @out = out || $stdout
57
+ @quote = quote
61
58
  @relax = relax
59
+ @sep = sep
62
60
 
63
- @es = "" .freeze
64
- @cr = "\r" .freeze
65
- @lf = "\n" .freeze
66
- @eq = "=" .freeze
67
- @esc = (@quote * 2).freeze
68
-
69
- @tokens = [@sep,@quote,@cr,@lf,@es,nil]
61
+ @cr = "\r"
62
+ @lf = "\n"
63
+ @es = ""
64
+ @eq = "="
65
+ @esc = (@quote * 2)
70
66
  end
71
67
 
72
68
  def reset(str=nil)
73
69
  self.string = str if str
74
70
  super()
75
- @char = peek(1)
76
- @flag = nil
77
-
71
+ @char = curr_char
78
72
  @rows = nil
79
73
  @cols = @cells = 0
80
74
  end
81
75
 
82
76
  # ==[ Lexer ]==
83
77
 
84
- def next_char
85
- getch
86
- @char = peek(1)
87
- end
88
-
89
- def next_token
78
+ # pure ruby versions for debugging
79
+ # def curr_char; @char = string[pos]; end
80
+ # def next_char; scan(/./m); @char = string[pos]; end
90
81
 
91
- # process and clear @flag
92
- case @flag
93
- when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
94
- when @cr then @flag = nil; next_char == @lf and next_char
95
- when @lf then @flag = nil; next_char
96
- else @flag = nil
97
- end if @flag
82
+ def curr_char; @char = currchar; end
83
+ def next_char; @char = nextchar; end
98
84
 
99
- # See http://bit.ly/3Y7jIvc
85
+ def next_token
100
86
  if @excel && @char == @eq
101
- @flag = @eq
87
+ excel = true
102
88
  next_char
103
89
  end
104
90
 
105
- if @tokens.include?(@char)
91
+ if @char == @quote # consume quoted cell
92
+ match = ""
93
+ while true
94
+ next_char
95
+ match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
96
+ match << @quote and next if next_char == @quote
97
+ break if [@sep,@cr,@lf,@es,nil].include?(@char)
98
+ @relax or bomb "invalid character after quote"
99
+ match << @quote + scan_until(/(?=#{@quote})/o) + @quote
100
+ end
101
+ next_char if @char == @sep
102
+ match
103
+ elsif [@sep,@cr,@lf,@es,nil].include?(@char)
106
104
  case @char
107
- when @quote # consume quoted cell
108
- match = ""
109
- while true
110
- getch # consume the quote that got us here
111
- match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
112
- case next_char
113
- when @sep then @flag = @es; next_char; break
114
- when @quote then match << @quote
115
- when @cr,@lf,@es,nil then break
116
- else
117
- if @relax
118
- match << @quote + @char
119
- else
120
- bomb "invalid character after quote"
121
- end
122
- end
123
- end
124
- match
125
- when @sep then @flag = @es; next_char; @es
126
- when @cr then @flag = @cr; nil
127
- when @lf then @flag = @lf; nil
128
- when @es,nil then nil
105
+ when @sep then next_char; @es
106
+ when @cr then next_char == @lf and next_char; nil
107
+ when @lf then next_char; nil
108
+ else nil
129
109
  end
130
110
  else # consume unquoted cell
131
111
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
132
- match = @eq + match if @flag == @eq # preserve @eq for excel formulas
133
- @char = peek(1)
134
- @char == @sep and @flag = @es and next_char
112
+ match.prepend(@eq) if excel
113
+ next_char if curr_char == @sep
135
114
  match
136
115
  end
137
116
  end
@@ -143,12 +122,12 @@ class Censive < StringScanner
143
122
  # ==[ Parser ]==
144
123
 
145
124
  def parse
146
- @rows ||= []
125
+ @rows = []
147
126
  while row = next_row
148
127
  @rows << row
149
- size = row.size
150
- @cols = size if size > @cols
151
- @cells += size
128
+ count = row.size
129
+ @cols = count if count > @cols
130
+ @cells += count
152
131
  end
153
132
  @rows
154
133
  end
@@ -162,26 +141,25 @@ class Censive < StringScanner
162
141
 
163
142
  # ==[ Helpers ]==
164
143
 
165
- # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
144
+ # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
166
145
  def grok(str)
167
- if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
168
- $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
146
+ if idx = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o) #!# FIXME: regex injection is possible
147
+ $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
169
148
  else
170
149
  0
171
150
  end
172
151
  end
173
152
 
153
+ # output a row
174
154
  def <<(row)
175
- @out or return super
176
155
 
177
- # drop trailing seps, if specified
156
+ # drop trailing empty columns
178
157
  row.pop while row.last.empty? if @drop
179
158
 
180
- # most compact export format
181
159
  s,q = @sep, @quote
182
160
  out = case @mode
183
161
  when :compact
184
- case grok(row.join)
162
+ case @excel ? 2 : grok(row.join)
185
163
  when 0
186
164
  row
187
165
  when 1
@@ -190,6 +168,7 @@ class Censive < StringScanner
190
168
  end
191
169
  else
192
170
  row.map do |col|
171
+ @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
193
172
  case grok(col)
194
173
  when 0 then col
195
174
  when 1 then "#{q}#{col}#{q}"
@@ -201,7 +180,6 @@ class Censive < StringScanner
201
180
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
202
181
  end.join(s)
203
182
 
204
- # write output, using desired line endings
205
183
  @out << out + @eol
206
184
  end
207
185
 
@@ -210,6 +188,11 @@ class Censive < StringScanner
210
188
  @rows.each {|row| yield row }
211
189
  end
212
190
 
191
+ def export(**opts)
192
+ out = opts.empty? ? self : self.class.writer(**opts)
193
+ each {|row| out << row }
194
+ end
195
+
213
196
  def stats
214
197
  wide = string.size.to_s.size
215
198
  puts "%#{wide}d rows" % @rows.size
@@ -219,29 +202,25 @@ class Censive < StringScanner
219
202
  end
220
203
  end
221
204
 
222
- __END__
223
-
224
- # ==[ Playground... ]==
225
-
226
- # STDOUT.sync = true
227
- #
228
- # data = File.read('1.csv')
229
- #
230
- # Censive.writer('out.csv') do |out|
231
- # Censive.new(data, relax: true, excel: true).each do |row|
232
- # out << row
233
- # end
234
- # end
235
- #
236
- # __END__
237
-
238
- ARGV << "z.csv" if ARGV.empty?
239
-
240
- path = ARGV.first
241
- data = File.read(path)
242
-
243
- csv = Censive.new(data)
244
-
245
- data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
205
+ if __FILE__ == $0
206
+ raw = DATA.gets("\n\n").chomp
207
+ # raw = File.read(ARGV.first || "lc-2023.csv")
208
+ csv = Censive.new(raw, excel: true, relax: true)
209
+ csv.export # (sep: ",", excel: true)
210
+ end
246
211
 
247
- csv.stats
212
+ __END__
213
+ Name,Age,Shoe
214
+ Alice,27,5
215
+ Bob,33,10 1/2
216
+ Charlie or "Chuck",=B2 + B3,9
217
+ "Doug E Fresh",="007",10
218
+ Subtotal,=sum(B2:B5),="01234"
219
+
220
+ # first line works in "relax" mode, bottom line is compliant
221
+ 123,"CHO, JOELLE "JOJO"",456
222
+ 123,"CHO, JOELLE ""JOJO""",456
223
+
224
+ # Excel mode checking
225
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
226
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.9'
4
+ version: '0.11'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-31 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-02-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: strscan
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 3.0.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 3.0.6
13
27
  description: A quick and lightweight CSV handling library for Ruby
14
28
  email: steve.shreeve@gmail.com
15
29
  executables: []
@@ -20,7 +34,7 @@ files:
20
34
  - README.md
21
35
  - censive.gemspec
22
36
  - lib/censive.rb
23
- - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.csv
37
+ - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
24
38
  homepage: https://github.com/shreeve/censive
25
39
  licenses:
26
40
  - MIT