censive 0.9 → 0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
4
- data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
3
+ metadata.gz: 91f3884b92c79f8cca37066a420c872b612c37ed736f61fa7d11c0f0fd861d8e
4
+ data.tar.gz: 71789d006309c1d87e681a7078e718342cb36e1f8a18690c59e51595c49d2e59
5
5
  SHA512:
6
- metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
7
- data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe
6
+ metadata.gz: d5fe33889abe08bc6aa57bb4dec2d65ef3e8935da5aa209a0bcb6060696de07cf44189ad9be6f43ffcd2897672668bd3aff7e8b713f5070199627e40edd3704f
7
+ data.tar.gz: 2c79d30b4682da07800c19eb93067c5c72d471f9abc7b824f7525b3d36b8fb1108d0e733706852f750baac57360af2d5b77527a1daab871e2313d558928ce240
data/README.md CHANGED
@@ -2,7 +2,58 @@
2
2
 
3
3
  A quick and lightweight CSV handling library for Ruby
4
4
 
5
- ## Writing CSV
5
+ ## Example
6
+
7
+ ```ruby
8
+ #!/usr/bin/env ruby
9
+
10
+ STDOUT.sync = true
11
+
12
+ require 'censive'
13
+ require 'fileutils'
14
+
15
+ abort "usage: #{File.basename($0)} <files>" if ARGV.empty?
16
+
17
+ rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
18
+
19
+ rows = []
20
+ cols = []
21
+ coun = 0
22
+ full = 0
23
+
24
+ ARGV.each do |path|
25
+ File.file?(path) or next
26
+
27
+ print "Processing #{path.inspect}"
28
+
29
+ rows.clear
30
+ cols.clear
31
+ seen = 0
32
+ coun += 1
33
+
34
+ dest = "#{path}-#{rand}"
35
+
36
+ begin
37
+ Censive.writer(dest) do |file|
38
+ Censive.reader(path, excel: true, relax: true).each do |cols|
39
+ file << cols
40
+ seen += 1
41
+ print "." if (seen % 1e5) == 0 # give a status update every so often
42
+ end
43
+ end
44
+ FileUtils.mv(dest, path)
45
+ full += (seen - 1)
46
+ puts " (#{seen - 1} rows of data)"
47
+ rescue
48
+ puts " - unable to process (#{$!})"
49
+ FileUtils.rm_f(dest)
50
+ end
51
+ end
52
+
53
+ puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
54
+ ```
55
+
56
+ ## Convert a CSV file to a TSV file
6
57
 
7
58
  ```ruby
8
59
  require 'censive'
@@ -17,3 +68,12 @@ Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
17
68
  end
18
69
  end
19
70
  ```
71
+
72
+ Or, you can be more succinct with:
73
+
74
+ ```ruby
75
+ require 'censive'
76
+
77
+ csv = Censive.new(File.read('data.csv'))
78
+ csv.export(sep: "\t")
79
+ ```
data/censive.gemspec CHANGED
@@ -2,13 +2,14 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "censive"
5
- s.version = "0.9"
5
+ s.version = "0.11"
6
6
  s.author = "Steve Shreeve"
7
7
  s.email = "steve.shreeve@gmail.com"
8
- s.summary = "A quick and lightweight CSV handling library for Ruby"
8
+ s.summary =
9
9
  s.description = "A quick and lightweight CSV handling library for Ruby"
10
10
  s.homepage = "https://github.com/shreeve/censive"
11
11
  s.license = "MIT"
12
12
  s.files = `git ls-files`.split("\n") - %w[.gitignore]
13
13
  s.executables = `(cd bin 2>&1) > /dev/null && git ls-files .`.split("\n")
14
+ s.add_dependency "strscan", ">= 3.0.6"
14
15
  end
data/lib/censive.rb CHANGED
@@ -1,137 +1,116 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # ==============================================================================
3
+ # ============================================================================
4
4
  # censive - A quick and lightweight CSV handling library for Ruby
5
5
  #
6
6
  # Author: Steve Shreeve (steve.shreeve@gmail.com)
7
- # Date: Jan 30, 2023
8
- #
9
- # Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
- # and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
- # ==============================================================================
12
- # The goals are:
7
+ # Date: Feb 3, 2023
13
8
  #
9
+ # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
10
+ # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
11
+ # https://github.com/ruby/strscan/issues/53 for details
12
+ # https://github.com/ruby/strscan/pull/54 for code
13
+ # ============================================================================
14
+ # GOALS:
14
15
  # 1. Faster than Ruby's default CSV library
15
- # 2. Lightweight code base with streamlined method calls
16
- #
17
- # To consider:
18
- #
19
- # 1. Option to support IO streaming
20
- # 2. Option to strip whitespace
21
- # 3. Option to support headers in readers and writers
22
- # 4. Confirm file encodings such as UTF-8, UTF-16, etc.
16
+ # 2. Lightweight code base with streamlined logic
17
+ # 3. Support for most non-compliant CSV variations (e.g., @relax, @excel)
23
18
  #
24
- # NOTE: Only getch and scan_until advance strscan's position
25
- # ==============================================================================
19
+ # TODO:
20
+ # 1. Support IO streaming
21
+ # 2. Add option to strip whitespace
22
+ # 3. Support CSV headers in first row
23
+ # ============================================================================
26
24
 
25
+ require 'bundler/setup'
27
26
  require 'strscan'
28
27
 
29
28
  class Censive < StringScanner
30
29
 
31
- def self.writer(path, **opts)
32
- File.open(path, 'w') do |file|
33
- yield new(out: file, **opts)
30
+ def self.writer(obj=nil, **opts, &code)
31
+ case obj
32
+ when String then File.open(path, 'w') {|file| yield new(out: obj, **opts, &code) }
33
+ when IO,nil then new(out: obj, **opts, &code)
34
+ else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
34
35
  end
35
36
  end
36
37
 
37
38
  def initialize(str=nil,
38
- sep: ',' , # column separator character
39
- quote: '"' , # quote character
40
-
41
- drop: false , # enable to drop trailing separators
42
- eol: "\n" , # desired line endings for exports
43
- excel: false , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
39
+ drop: false , # drop trailing empty fields?
40
+ eol: "\n" , # line endings for exports
41
+ excel: false , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
44
42
  mode: :compact, # export mode: compact or full
45
- out: nil , # output IO/file
46
- relax: false , # relax parsing of quotes
47
-
43
+ out: nil , # output stream, needs to respond to <<
44
+ quote: '"' , # quote character
45
+ relax: false , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
46
+ sep: ',' , # column separator character
48
47
  **opts # grab bag
49
48
  )
50
49
  super(str || '')
51
50
  reset
52
51
 
53
- @sep = sep .freeze
54
- @quote = quote.freeze
55
-
56
52
  @drop = drop
57
- @eol = eol.freeze
53
+ @eol = eol
58
54
  @excel = excel
59
55
  @mode = mode
60
- @out = out
56
+ @out = out || $stdout
57
+ @quote = quote
61
58
  @relax = relax
59
+ @sep = sep
62
60
 
63
- @es = "" .freeze
64
- @cr = "\r" .freeze
65
- @lf = "\n" .freeze
66
- @eq = "=" .freeze
67
- @esc = (@quote * 2).freeze
68
-
69
- @tokens = [@sep,@quote,@cr,@lf,@es,nil]
61
+ @cr = "\r"
62
+ @lf = "\n"
63
+ @es = ""
64
+ @eq = "="
65
+ @esc = (@quote * 2)
70
66
  end
71
67
 
72
68
  def reset(str=nil)
73
69
  self.string = str if str
74
70
  super()
75
- @char = peek(1)
76
- @flag = nil
77
-
71
+ @char = curr_char
78
72
  @rows = nil
79
73
  @cols = @cells = 0
80
74
  end
81
75
 
82
76
  # ==[ Lexer ]==
83
77
 
84
- def next_char
85
- getch
86
- @char = peek(1)
87
- end
88
-
89
- def next_token
78
+ # pure ruby versions for debugging
79
+ # def curr_char; @char = string[pos]; end
80
+ # def next_char; scan(/./m); @char = string[pos]; end
90
81
 
91
- # process and clear @flag
92
- case @flag
93
- when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
94
- when @cr then @flag = nil; next_char == @lf and next_char
95
- when @lf then @flag = nil; next_char
96
- else @flag = nil
97
- end if @flag
82
+ def curr_char; @char = currchar; end
83
+ def next_char; @char = nextchar; end
98
84
 
99
- # See http://bit.ly/3Y7jIvc
85
+ def next_token
100
86
  if @excel && @char == @eq
101
- @flag = @eq
87
+ excel = true
102
88
  next_char
103
89
  end
104
90
 
105
- if @tokens.include?(@char)
91
+ if @char == @quote # consume quoted cell
92
+ match = ""
93
+ while true
94
+ next_char
95
+ match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
96
+ match << @quote and next if next_char == @quote
97
+ break if [@sep,@cr,@lf,@es,nil].include?(@char)
98
+ @relax or bomb "invalid character after quote"
99
+ match << @quote + scan_until(/(?=#{@quote})/o) + @quote
100
+ end
101
+ next_char if @char == @sep
102
+ match
103
+ elsif [@sep,@cr,@lf,@es,nil].include?(@char)
106
104
  case @char
107
- when @quote # consume quoted cell
108
- match = ""
109
- while true
110
- getch # consume the quote that got us here
111
- match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
112
- case next_char
113
- when @sep then @flag = @es; next_char; break
114
- when @quote then match << @quote
115
- when @cr,@lf,@es,nil then break
116
- else
117
- if @relax
118
- match << @quote + @char
119
- else
120
- bomb "invalid character after quote"
121
- end
122
- end
123
- end
124
- match
125
- when @sep then @flag = @es; next_char; @es
126
- when @cr then @flag = @cr; nil
127
- when @lf then @flag = @lf; nil
128
- when @es,nil then nil
105
+ when @sep then next_char; @es
106
+ when @cr then next_char == @lf and next_char; nil
107
+ when @lf then next_char; nil
108
+ else nil
129
109
  end
130
110
  else # consume unquoted cell
131
111
  match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
132
- match = @eq + match if @flag == @eq # preserve @eq for excel formulas
133
- @char = peek(1)
134
- @char == @sep and @flag = @es and next_char
112
+ match.prepend(@eq) if excel
113
+ next_char if curr_char == @sep
135
114
  match
136
115
  end
137
116
  end
@@ -143,12 +122,12 @@ class Censive < StringScanner
143
122
  # ==[ Parser ]==
144
123
 
145
124
  def parse
146
- @rows ||= []
125
+ @rows = []
147
126
  while row = next_row
148
127
  @rows << row
149
- size = row.size
150
- @cols = size if size > @cols
151
- @cells += size
128
+ count = row.size
129
+ @cols = count if count > @cols
130
+ @cells += count
152
131
  end
153
132
  @rows
154
133
  end
@@ -162,26 +141,25 @@ class Censive < StringScanner
162
141
 
163
142
  # ==[ Helpers ]==
164
143
 
165
- # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
144
+ # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
166
145
  def grok(str)
167
- if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
168
- $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
146
+ if idx = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o) #!# FIXME: regex injection is possible
147
+ $1 ? 2 : str.index(/#{@quote}/o, idx) ? 2 : 1
169
148
  else
170
149
  0
171
150
  end
172
151
  end
173
152
 
153
+ # output a row
174
154
  def <<(row)
175
- @out or return super
176
155
 
177
- # drop trailing seps, if specified
156
+ # drop trailing empty columns
178
157
  row.pop while row.last.empty? if @drop
179
158
 
180
- # most compact export format
181
159
  s,q = @sep, @quote
182
160
  out = case @mode
183
161
  when :compact
184
- case grok(row.join)
162
+ case @excel ? 2 : grok(row.join)
185
163
  when 0
186
164
  row
187
165
  when 1
@@ -190,6 +168,7 @@ class Censive < StringScanner
190
168
  end
191
169
  else
192
170
  row.map do |col|
171
+ @excel && col =~ /\A0\d*\z/ ? "=#{q}#{col}#{q}" :
193
172
  case grok(col)
194
173
  when 0 then col
195
174
  when 1 then "#{q}#{col}#{q}"
@@ -201,7 +180,6 @@ class Censive < StringScanner
201
180
  row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
202
181
  end.join(s)
203
182
 
204
- # write output, using desired line endings
205
183
  @out << out + @eol
206
184
  end
207
185
 
@@ -210,6 +188,11 @@ class Censive < StringScanner
210
188
  @rows.each {|row| yield row }
211
189
  end
212
190
 
191
+ def export(**opts)
192
+ out = opts.empty? ? self : self.class.writer(**opts)
193
+ each {|row| out << row }
194
+ end
195
+
213
196
  def stats
214
197
  wide = string.size.to_s.size
215
198
  puts "%#{wide}d rows" % @rows.size
@@ -219,29 +202,25 @@ class Censive < StringScanner
219
202
  end
220
203
  end
221
204
 
222
- __END__
223
-
224
- # ==[ Playground... ]==
225
-
226
- # STDOUT.sync = true
227
- #
228
- # data = File.read('1.csv')
229
- #
230
- # Censive.writer('out.csv') do |out|
231
- # Censive.new(data, relax: true, excel: true).each do |row|
232
- # out << row
233
- # end
234
- # end
235
- #
236
- # __END__
237
-
238
- ARGV << "z.csv" if ARGV.empty?
239
-
240
- path = ARGV.first
241
- data = File.read(path)
242
-
243
- csv = Censive.new(data)
244
-
245
- data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
205
+ if __FILE__ == $0
206
+ raw = DATA.gets("\n\n").chomp
207
+ # raw = File.read(ARGV.first || "lc-2023.csv")
208
+ csv = Censive.new(raw, excel: true, relax: true)
209
+ csv.export # (sep: ",", excel: true)
210
+ end
246
211
 
247
- csv.stats
212
+ __END__
213
+ Name,Age,Shoe
214
+ Alice,27,5
215
+ Bob,33,10 1/2
216
+ Charlie or "Chuck",=B2 + B3,9
217
+ "Doug E Fresh",="007",10
218
+ Subtotal,=sum(B2:B5),="01234"
219
+
220
+ # first line works in "relax" mode, bottom line is compliant
221
+ 123,"CHO, JOELLE "JOJO"",456
222
+ 123,"CHO, JOELLE ""JOJO""",456
223
+
224
+ # Excel mode checking
225
+ =,=x,x=,="x",="","","=",123,0123,="123",="0123"
226
+ ,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: censive
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.9'
4
+ version: '0.11'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steve Shreeve
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-31 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-02-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: strscan
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 3.0.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 3.0.6
13
27
  description: A quick and lightweight CSV handling library for Ruby
14
28
  email: steve.shreeve@gmail.com
15
29
  executables: []
@@ -20,7 +34,7 @@ files:
20
34
  - README.md
21
35
  - censive.gemspec
22
36
  - lib/censive.rb
23
- - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.csv
37
+ - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv
24
38
  homepage: https://github.com/shreeve/censive
25
39
  licenses:
26
40
  - MIT