csv_lazy 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/Gemfile.lock +2 -0
- data/VERSION +1 -1
- data/csv_lazy.gemspec +7 -3
- data/lib/csv_lazy.rb +70 -23
- data/spec/csv_lazy_spec.rb +37 -2
- data/spec/test2.csv +0 -0
- metadata +20 -3
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.0.5
|
data/csv_lazy.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "csv_lazy"
|
8
|
-
s.version = "0.0.4"
|
8
|
+
s.version = "0.0.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Kasper Johansen"]
|
12
|
-
s.date = "2013-03-
|
12
|
+
s.date = "2013-03-25"
|
13
13
|
s.description = "A small CSV lib that skips whitespace-format-bugs and more."
|
14
14
|
s.email = "k@spernj.org"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -29,7 +29,8 @@ Gem::Specification.new do |s|
|
|
29
29
|
"lib/csv_lazy.rb",
|
30
30
|
"spec/csv_lazy_spec.rb",
|
31
31
|
"spec/spec_helper.rb",
|
32
|
-
"spec/test1.csv.gz"
|
32
|
+
"spec/test1.csv.gz",
|
33
|
+
"spec/test2.csv"
|
33
34
|
]
|
34
35
|
s.homepage = "http://github.com/kaspernj/csv_lazy"
|
35
36
|
s.licenses = ["MIT"]
|
@@ -41,17 +42,20 @@ Gem::Specification.new do |s|
|
|
41
42
|
s.specification_version = 3
|
42
43
|
|
43
44
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
45
|
+
s.add_runtime_dependency(%q<string_utils>, [">= 0"])
|
44
46
|
s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
|
45
47
|
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
46
48
|
s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
|
47
49
|
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
48
50
|
else
|
51
|
+
s.add_dependency(%q<string_utils>, [">= 0"])
|
49
52
|
s.add_dependency(%q<rspec>, ["~> 2.8.0"])
|
50
53
|
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
51
54
|
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
52
55
|
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
53
56
|
end
|
54
57
|
else
|
58
|
+
s.add_dependency(%q<string_utils>, [">= 0"])
|
55
59
|
s.add_dependency(%q<rspec>, ["~> 2.8.0"])
|
56
60
|
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
57
61
|
s.add_dependency(%q<bundler>, [">= 1.0.0"])
|
data/lib/csv_lazy.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require "string_utils"
|
3
|
+
|
1
4
|
#A simple library for parsing CSV-files through IO's. Solves corrupt file formats automatically like when files contains several spaces after a column and more.
|
2
5
|
class Csv_lazy
|
3
6
|
include Enumerable
|
@@ -13,7 +16,8 @@ class Csv_lazy
|
|
13
16
|
:quote_char => '"',
|
14
17
|
:row_sep => "\n",
|
15
18
|
:col_sep => ";",
|
16
|
-
:headers => false
|
19
|
+
:headers => false,
|
20
|
+
:buffer_length => 4096
|
17
21
|
}.merge(args)
|
18
22
|
|
19
23
|
@io = @args[:io]
|
@@ -22,9 +26,14 @@ class Csv_lazy
|
|
22
26
|
@debug = @args[:debug]
|
23
27
|
@encode = @args[:encode]
|
24
28
|
@mutex = Mutex.new
|
29
|
+
@buffer_length = @args[:buffer_length]
|
30
|
+
@escape_char = "\\"
|
31
|
+
@escaped_quote = "#{@escape_char}#{@args[:quote_char]}"
|
32
|
+
@escaped_quote_double = "#{@escape_char}#{@escape_char}#{@args[:quote_char]}"
|
33
|
+
|
25
34
|
#@debug = true
|
26
35
|
|
27
|
-
accepted = [:encode, :quote_char, :row_sep, :col_sep, :io, :debug, :headers]
|
36
|
+
accepted = [:encode, :quote_char, :row_sep, :col_sep, :io, :debug, :headers, :buffer_length]
|
28
37
|
@args.each do |key, val|
|
29
38
|
if accepted.index(key) == nil
|
30
39
|
raise "Unknown argument: '#{key}'."
|
@@ -60,24 +69,20 @@ class Csv_lazy
|
|
60
69
|
|
61
70
|
#Yields each row as an array.
|
62
71
|
def each
|
63
|
-
|
64
|
-
|
65
|
-
|
72
|
+
if block_given?
|
73
|
+
@mutex.synchronize do
|
74
|
+
while row = read_row
|
75
|
+
yield(row)
|
76
|
+
end
|
66
77
|
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
private
|
71
|
-
|
72
|
-
#Reads more content into the buffer.
|
73
|
-
def read_buffer
|
74
|
-
read = @io.gets
|
75
|
-
|
76
|
-
if !read
|
77
|
-
@eof = true
|
78
78
|
else
|
79
|
-
|
80
|
-
|
79
|
+
Enumerable.new do |yielder|
|
80
|
+
@mutex.synchronize do
|
81
|
+
while row = read_row
|
82
|
+
yielder << row
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
81
86
|
end
|
82
87
|
end
|
83
88
|
|
@@ -109,6 +114,20 @@ class Csv_lazy
|
|
109
114
|
end
|
110
115
|
end
|
111
116
|
|
117
|
+
private
|
118
|
+
|
119
|
+
#Reads more content into the buffer.
|
120
|
+
def read_buffer
|
121
|
+
read = @io.gets
|
122
|
+
|
123
|
+
if !read
|
124
|
+
@eof = true
|
125
|
+
else
|
126
|
+
read = read.encode(@encode) if @encode
|
127
|
+
@buffer << read
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
112
131
|
#Runs a regex against the buffer. If matched it also removes it from the buffer.
|
113
132
|
def read_remove_regex(regex)
|
114
133
|
if match = @buffer.match(regex)
|
@@ -130,23 +149,50 @@ class Csv_lazy
|
|
130
149
|
return false
|
131
150
|
end
|
132
151
|
|
152
|
+
def unescape(str)
|
153
|
+
return StringUtils.strtr(str, {
|
154
|
+
"\\\\" => "\\",
|
155
|
+
"\\t" => "\t",
|
156
|
+
"\\n" => "\n",
|
157
|
+
"\\\"" => "\""
|
158
|
+
})
|
159
|
+
end
|
160
|
+
|
133
161
|
#Adds the next column to the row. Returns true if more columns should be read or false if this was the end of the row.
|
134
162
|
def read_next_col
|
135
|
-
read_buffer if @buffer.length <
|
163
|
+
read_buffer if @buffer.length < @buffer_length
|
136
164
|
return false if @buffer.empty? and @eof
|
137
165
|
|
138
166
|
if @buffer.empty? or read_remove_regex(@regex_row_end)
|
139
167
|
return false
|
140
168
|
elsif match = read_remove_regex(@regex_begin_quote_char)
|
141
169
|
read = ""
|
170
|
+
col_content = ""
|
142
171
|
|
143
172
|
loop do
|
144
173
|
match_read = read_remove_regex(@regex_read_until_quote_char)
|
145
174
|
if !match_read
|
146
|
-
|
175
|
+
if @eof
|
176
|
+
add_col(@buffer) unless @buffer.empty?
|
177
|
+
@buffer = ""
|
178
|
+
break
|
179
|
+
else
|
180
|
+
read_buffer
|
181
|
+
end
|
147
182
|
else
|
148
|
-
|
149
|
-
|
183
|
+
all = match_read[0]
|
184
|
+
escaped_quote_char = all[-@escaped_quote.length, @escaped_quote.length]
|
185
|
+
double_escaped_quote_char = all[-@escaped_quote_double.length, @escaped_quote_double.length]
|
186
|
+
all_without_quote = match_read[1]
|
187
|
+
|
188
|
+
if escaped_quote_char == @escaped_quote and double_escaped_quote_char != @escaped_quote_double
|
189
|
+
#continue reading - the quote char is escaped.
|
190
|
+
col_content << all
|
191
|
+
else
|
192
|
+
col_content << match_read[1]
|
193
|
+
add_col(unescape(col_content))
|
194
|
+
break
|
195
|
+
end
|
150
196
|
end
|
151
197
|
end
|
152
198
|
|
@@ -182,9 +228,10 @@ class Csv_lazy
|
|
182
228
|
read_buffer
|
183
229
|
raise Errno::EAGAIN
|
184
230
|
else
|
185
|
-
raise "Dont know what to do with buffer: #{@buffer}"
|
231
|
+
raise "Dont know what to do with buffer: '#{@buffer}'."
|
186
232
|
end
|
187
233
|
rescue Errno::EAGAIN
|
234
|
+
puts "csv_lazy: Retry! Probably we ran out of buffer..." if @debug
|
188
235
|
retry
|
189
236
|
end
|
190
237
|
|
data/spec/csv_lazy_spec.rb
CHANGED
@@ -86,8 +86,8 @@ describe "CsvLazy" do
|
|
86
86
|
|
87
87
|
it "should be able to use headers and return hashes instead" do
|
88
88
|
cont = "\"name\",age\r\n"
|
89
|
-
cont
|
90
|
-
cont
|
89
|
+
cont << "\"Kasper Johansen\",27\r\n"
|
90
|
+
cont << "\"Christina Stoeckel\",\"25\"\r\n"
|
91
91
|
|
92
92
|
line = 0
|
93
93
|
Csv_lazy.new(:col_sep => ",", :io => StringIO.new(cont), :headers => true, :row_sep => "\r\n") do |csv|
|
@@ -109,4 +109,39 @@ describe "CsvLazy" do
|
|
109
109
|
|
110
110
|
line.should eql(2)
|
111
111
|
end
|
112
|
+
|
113
|
+
it "should be able to encode incoming strings from weird files without crashing" do
|
114
|
+
File.open("#{File.dirname(__FILE__)}/test2.csv", "rb", :encoding => "UTF-16LE") do |fp|
|
115
|
+
#Remove invalid UTF content.
|
116
|
+
fp.read(2)
|
117
|
+
|
118
|
+
Csv_lazy.new(:col_sep => ",", :io => fp, :headers => true, :row_sep => "\r\n", :quote_char => '"', :encode => "US-ASCII", :debug => false) do |csv|
|
119
|
+
csv.keys[0].should eql(:legacy_user_id)
|
120
|
+
csv.keys[1].should eql(:savings_percentage)
|
121
|
+
csv.keys[2].should eql(:active)
|
122
|
+
csv.keys.length.should eql(3)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
it "should do proper escaping" do
|
128
|
+
cont = "\"Test1\";\"Test2 \\\"Wee\\\"\"\r\n"
|
129
|
+
cont << "\"Test3\";\"Test4 \\\"Wee\\\"\";\"Test5 \\\"Wee\\\"\"\r\n"
|
130
|
+
|
131
|
+
csv = Csv_lazy.new(:col_sep => ";", :io => StringIO.new(cont), :row_sep => "\r\n")
|
132
|
+
|
133
|
+
row = csv.read_row
|
134
|
+
row[0].should eql("Test1")
|
135
|
+
row[1].should eql("Test2 \"Wee\"")
|
136
|
+
row.length.should eql(2)
|
137
|
+
|
138
|
+
row = csv.read_row
|
139
|
+
row[0].should eql("Test3")
|
140
|
+
row[1].should eql("Test4 \"Wee\"")
|
141
|
+
row[2].should eql("Test5 \"Wee\"")
|
142
|
+
row.length.should eql(3)
|
143
|
+
|
144
|
+
row = csv.read_row
|
145
|
+
row.should eql(false)
|
146
|
+
end
|
112
147
|
end
|
data/spec/test2.csv
ADDED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv_lazy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,8 +9,24 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: string_utils
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
14
30
|
- !ruby/object:Gem::Dependency
|
15
31
|
name: rspec
|
16
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -96,6 +112,7 @@ files:
|
|
96
112
|
- spec/csv_lazy_spec.rb
|
97
113
|
- spec/spec_helper.rb
|
98
114
|
- spec/test1.csv.gz
|
115
|
+
- spec/test2.csv
|
99
116
|
homepage: http://github.com/kaspernj/csv_lazy
|
100
117
|
licenses:
|
101
118
|
- MIT
|
@@ -111,7 +128,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
111
128
|
version: '0'
|
112
129
|
segments:
|
113
130
|
- 0
|
114
|
-
hash:
|
131
|
+
hash: -1390404400491179462
|
115
132
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
133
|
none: false
|
117
134
|
requirements:
|