csv_lazy 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -3,6 +3,8 @@ source "http://rubygems.org"
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
5
 
6
+ gem "string_utils"
7
+
6
8
  # Add dependencies to develop your gem here.
7
9
  # Include everything needed to run rake, tests, features, etc.
8
10
  group :development do
data/Gemfile.lock CHANGED
@@ -20,6 +20,7 @@ GEM
20
20
  rspec-expectations (2.8.0)
21
21
  diff-lcs (~> 1.1.2)
22
22
  rspec-mocks (2.8.0)
23
+ string_utils ()
23
24
 
24
25
  PLATFORMS
25
26
  ruby
@@ -29,3 +30,4 @@ DEPENDENCIES
29
30
  jeweler (~> 1.8.4)
30
31
  rdoc (~> 3.12)
31
32
  rspec (~> 2.8.0)
33
+ string_utils
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.4
1
+ 0.0.5
data/csv_lazy.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "csv_lazy"
8
- s.version = "0.0.4"
8
+ s.version = "0.0.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Kasper Johansen"]
12
- s.date = "2013-03-22"
12
+ s.date = "2013-03-25"
13
13
  s.description = "A small CSV lib that skips whitespace-format-bugs and more."
14
14
  s.email = "k@spernj.org"
15
15
  s.extra_rdoc_files = [
@@ -29,7 +29,8 @@ Gem::Specification.new do |s|
29
29
  "lib/csv_lazy.rb",
30
30
  "spec/csv_lazy_spec.rb",
31
31
  "spec/spec_helper.rb",
32
- "spec/test1.csv.gz"
32
+ "spec/test1.csv.gz",
33
+ "spec/test2.csv"
33
34
  ]
34
35
  s.homepage = "http://github.com/kaspernj/csv_lazy"
35
36
  s.licenses = ["MIT"]
@@ -41,17 +42,20 @@ Gem::Specification.new do |s|
41
42
  s.specification_version = 3
42
43
 
43
44
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
45
+ s.add_runtime_dependency(%q<string_utils>, [">= 0"])
44
46
  s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
45
47
  s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
46
48
  s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
47
49
  s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
48
50
  else
51
+ s.add_dependency(%q<string_utils>, [">= 0"])
49
52
  s.add_dependency(%q<rspec>, ["~> 2.8.0"])
50
53
  s.add_dependency(%q<rdoc>, ["~> 3.12"])
51
54
  s.add_dependency(%q<bundler>, [">= 1.0.0"])
52
55
  s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
53
56
  end
54
57
  else
58
+ s.add_dependency(%q<string_utils>, [">= 0"])
55
59
  s.add_dependency(%q<rspec>, ["~> 2.8.0"])
56
60
  s.add_dependency(%q<rdoc>, ["~> 3.12"])
57
61
  s.add_dependency(%q<bundler>, [">= 1.0.0"])
data/lib/csv_lazy.rb CHANGED
@@ -1,3 +1,6 @@
1
+ #encoding: utf-8
2
+ require "string_utils"
3
+
1
4
  #A simple library for parsing CSV files through IO objects. Automatically copes with corrupt file formats, such as files that contain several spaces after a column, and more.
2
5
  class Csv_lazy
3
6
  include Enumerable
@@ -13,7 +16,8 @@ class Csv_lazy
13
16
  :quote_char => '"',
14
17
  :row_sep => "\n",
15
18
  :col_sep => ";",
16
- :headers => false
19
+ :headers => false,
20
+ :buffer_length => 4096
17
21
  }.merge(args)
18
22
 
19
23
  @io = @args[:io]
@@ -22,9 +26,14 @@ class Csv_lazy
22
26
  @debug = @args[:debug]
23
27
  @encode = @args[:encode]
24
28
  @mutex = Mutex.new
29
+ @buffer_length = @args[:buffer_length]
30
+ @escape_char = "\\"
31
+ @escaped_quote = "#{@escape_char}#{@args[:quote_char]}"
32
+ @escaped_quote_double = "#{@escape_char}#{@escape_char}#{@args[:quote_char]}"
33
+
25
34
  #@debug = true
26
35
 
27
- accepted = [:encode, :quote_char, :row_sep, :col_sep, :io, :debug, :headers]
36
+ accepted = [:encode, :quote_char, :row_sep, :col_sep, :io, :debug, :headers, :buffer_length]
28
37
  @args.each do |key, val|
29
38
  if accepted.index(key) == nil
30
39
  raise "Unknown argument: '#{key}'."
@@ -60,24 +69,20 @@ class Csv_lazy
60
69
 
61
70
  #Yields each row as an array.
62
71
  def each
63
- @mutex.synchronize do
64
- while row = read_row
65
- yield(row)
72
+ if block_given?
73
+ @mutex.synchronize do
74
+ while row = read_row
75
+ yield(row)
76
+ end
66
77
  end
67
- end
68
- end
69
-
70
- private
71
-
72
- #Reads more content into the buffer.
73
- def read_buffer
74
- read = @io.gets
75
-
76
- if !read
77
- @eof = true
78
78
  else
79
- read = read.encode(@encode) if @encode
80
- @buffer << read
79
+ Enumerable.new do |yielder|
80
+ @mutex.synchronize do
81
+ while row = read_row
82
+ yielder << row
83
+ end
84
+ end
85
+ end
81
86
  end
82
87
  end
83
88
 
@@ -109,6 +114,20 @@ class Csv_lazy
109
114
  end
110
115
  end
111
116
 
117
+ private
118
+
119
+ #Reads more content into the buffer.
120
+ def read_buffer
121
+ read = @io.gets
122
+
123
+ if !read
124
+ @eof = true
125
+ else
126
+ read = read.encode(@encode) if @encode
127
+ @buffer << read
128
+ end
129
+ end
130
+
112
131
  #Runs a regex against the buffer. If matched it also removes it from the buffer.
113
132
  def read_remove_regex(regex)
114
133
  if match = @buffer.match(regex)
@@ -130,23 +149,50 @@ class Csv_lazy
130
149
  return false
131
150
  end
132
151
 
152
+ def unescape(str)
153
+ return StringUtils.strtr(str, {
154
+ "\\\\" => "\\",
155
+ "\\t" => "\t",
156
+ "\\n" => "\n",
157
+ "\\\"" => "\""
158
+ })
159
+ end
160
+
133
161
  #Adds the next column to the row. Returns true if more columns should be read or false if this was the end of the row.
134
162
  def read_next_col
135
- read_buffer if @buffer.length < 4096
163
+ read_buffer if @buffer.length < @buffer_length
136
164
  return false if @buffer.empty? and @eof
137
165
 
138
166
  if @buffer.empty? or read_remove_regex(@regex_row_end)
139
167
  return false
140
168
  elsif match = read_remove_regex(@regex_begin_quote_char)
141
169
  read = ""
170
+ col_content = ""
142
171
 
143
172
  loop do
144
173
  match_read = read_remove_regex(@regex_read_until_quote_char)
145
174
  if !match_read
146
- read_buffer
175
+ if @eof
176
+ add_col(@buffer) unless @buffer.empty?
177
+ @buffer = ""
178
+ break
179
+ else
180
+ read_buffer
181
+ end
147
182
  else
148
- add_col(match_read[1])
149
- break
183
+ all = match_read[0]
184
+ escaped_quote_char = all[-@escaped_quote.length, @escaped_quote.length]
185
+ double_escaped_quote_char = all[-@escaped_quote_double.length, @escaped_quote_double.length]
186
+ all_without_quote = match_read[1]
187
+
188
+ if escaped_quote_char == @escaped_quote and double_escaped_quote_char != @escaped_quote_double
189
+ #continue reading - the quote char is escaped.
190
+ col_content << all
191
+ else
192
+ col_content << match_read[1]
193
+ add_col(unescape(col_content))
194
+ break
195
+ end
150
196
  end
151
197
  end
152
198
 
@@ -182,9 +228,10 @@ class Csv_lazy
182
228
  read_buffer
183
229
  raise Errno::EAGAIN
184
230
  else
185
- raise "Dont know what to do with buffer: #{@buffer}"
231
+ raise "Dont know what to do with buffer: '#{@buffer}'."
186
232
  end
187
233
  rescue Errno::EAGAIN
234
+ puts "csv_lazy: Retry! Probably we ran out of buffer..." if @debug
188
235
  retry
189
236
  end
190
237
 
@@ -86,8 +86,8 @@ describe "CsvLazy" do
86
86
 
87
87
  it "should be able to use headers and return hashes instead" do
88
88
  cont = "\"name\",age\r\n"
89
- cont += "\"Kasper Johansen\",27\r\n"
90
- cont += "\"Christina Stoeckel\",\"25\"\r\n"
89
+ cont << "\"Kasper Johansen\",27\r\n"
90
+ cont << "\"Christina Stoeckel\",\"25\"\r\n"
91
91
 
92
92
  line = 0
93
93
  Csv_lazy.new(:col_sep => ",", :io => StringIO.new(cont), :headers => true, :row_sep => "\r\n") do |csv|
@@ -109,4 +109,39 @@ describe "CsvLazy" do
109
109
 
110
110
  line.should eql(2)
111
111
  end
112
+
113
+ it "should be able to encode incoming strings from weird files without crashing" do
114
+ File.open("#{File.dirname(__FILE__)}/test2.csv", "rb", :encoding => "UTF-16LE") do |fp|
115
+ #Remove invalid UTF content.
116
+ fp.read(2)
117
+
118
+ Csv_lazy.new(:col_sep => ",", :io => fp, :headers => true, :row_sep => "\r\n", :quote_char => '"', :encode => "US-ASCII", :debug => false) do |csv|
119
+ csv.keys[0].should eql(:legacy_user_id)
120
+ csv.keys[1].should eql(:savings_percentage)
121
+ csv.keys[2].should eql(:active)
122
+ csv.keys.length.should eql(3)
123
+ end
124
+ end
125
+ end
126
+
127
+ it "should do proper escaping" do
128
+ cont = "\"Test1\";\"Test2 \\\"Wee\\\"\"\r\n"
129
+ cont << "\"Test3\";\"Test4 \\\"Wee\\\"\";\"Test5 \\\"Wee\\\"\"\r\n"
130
+
131
+ csv = Csv_lazy.new(:col_sep => ";", :io => StringIO.new(cont), :row_sep => "\r\n")
132
+
133
+ row = csv.read_row
134
+ row[0].should eql("Test1")
135
+ row[1].should eql("Test2 \"Wee\"")
136
+ row.length.should eql(2)
137
+
138
+ row = csv.read_row
139
+ row[0].should eql("Test3")
140
+ row[1].should eql("Test4 \"Wee\"")
141
+ row[2].should eql("Test5 \"Wee\"")
142
+ row.length.should eql(3)
143
+
144
+ row = csv.read_row
145
+ row.should eql(false)
146
+ end
112
147
  end
data/spec/test2.csv ADDED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv_lazy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,8 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-22 00:00:00.000000000 Z
12
+ date: 2013-03-25 00:00:00.000000000 Z
13
13
  dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: string_utils
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
14
30
  - !ruby/object:Gem::Dependency
15
31
  name: rspec
16
32
  requirement: !ruby/object:Gem::Requirement
@@ -96,6 +112,7 @@ files:
96
112
  - spec/csv_lazy_spec.rb
97
113
  - spec/spec_helper.rb
98
114
  - spec/test1.csv.gz
115
+ - spec/test2.csv
99
116
  homepage: http://github.com/kaspernj/csv_lazy
100
117
  licenses:
101
118
  - MIT
@@ -111,7 +128,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
111
128
  version: '0'
112
129
  segments:
113
130
  - 0
114
- hash: 222799097840594694
131
+ hash: -1390404400491179462
115
132
  required_rubygems_version: !ruby/object:Gem::Requirement
116
133
  none: false
117
134
  requirements: