hippie_csv 0.0.11 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -2
- data/lib/hippie_csv.rb +5 -2
- data/lib/hippie_csv/support.rb +10 -5
- data/lib/hippie_csv/version.rb +1 -1
- data/spec/fixtures/small_file.csv +2 -0
- data/spec/hippie_csv/support_spec.rb +17 -23
- data/spec/hippie_csv/version_spec.rb +1 -1
- data/spec/hippie_csv_spec.rb +130 -88
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aca3f93c2f4a521f946abc23a9bf258fa32e1e5f
|
4
|
+
data.tar.gz: e7bb585167d4a9cc7b351593650991e7383359a5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 37be4a8ccff26295f9af6109207b8337c9bb0b9bddc392beb4ad7a668ff1102525dfb36e7eacdc462d5da6b61ac50de2681b2e6c3e3b3d2c8dbd46231dcd5748
|
7
|
+
data.tar.gz: bd0a83a02adf59c29fb7ebc8644b500d728552e902d04b4d5cb8262e8b485fa0c92d42b569ee6be6cafeeec5e65af566b9b809543fc0a7f96bc0d899f4f374cc
|
data/README.md
CHANGED
@@ -31,14 +31,23 @@ Or install it yourself as:
|
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
|
34
|
-
Exposes
|
35
|
-
|
34
|
+
Exposes three public methods:
|
35
|
+
1. `.read` a file path to an array. Reads from the file all at once, building the whole CSV object in memory.
|
36
|
+
2. `.parse` an in memory string to an array.
|
37
|
+
3. `.stream` from a file path and parse line by line, calling a given block on each row.
|
38
|
+
|
39
|
+
**Note**: Processing large files using read or parse is a memory intensive operation. Use stream for parsing a CSV file line by line from the file to save memory. This method will use less memory but take longer, as we run each line through parse.
|
40
|
+
|
36
41
|
|
37
42
|
```ruby
|
38
43
|
require 'hippie_csv'
|
39
44
|
|
40
45
|
HippieCSV.read("path/to/data.csv")
|
41
46
|
|
47
|
+
HippieCSV.stream("path/to/data.csv") do |row|
|
48
|
+
# use row here...
|
49
|
+
end
|
50
|
+
|
42
51
|
HippieCSV.parse(csv_string)
|
43
52
|
```
|
44
53
|
|
data/lib/hippie_csv.rb
CHANGED
@@ -4,12 +4,15 @@ require "hippie_csv/errors"
|
|
4
4
|
|
5
5
|
module HippieCSV
|
6
6
|
def self.read(path)
|
7
|
-
string = Support.file_path_to_string(path)
|
7
|
+
string = File.read(path, encoding: ENCODING_WITH_BOM)
|
8
8
|
parse(string)
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.parse(string)
|
12
|
-
string = Support.encode(string)
|
13
12
|
Support.maybe_parse(string) || (raise UnableToParseError)
|
14
13
|
end
|
14
|
+
|
15
|
+
def self.stream(path, &block)
|
16
|
+
Support.maybe_stream(path, &block)
|
17
|
+
end
|
15
18
|
end
|
data/lib/hippie_csv/support.rb
CHANGED
@@ -4,10 +4,6 @@ require "rchardet"
|
|
4
4
|
module HippieCSV
|
5
5
|
module Support
|
6
6
|
class << self
|
7
|
-
def file_path_to_string(file_path)
|
8
|
-
File.read(file_path, encoding: ENCODING_WITH_BOM)
|
9
|
-
end
|
10
|
-
|
11
7
|
def encode(string)
|
12
8
|
string = ensure_valid_encoding(string)
|
13
9
|
|
@@ -19,8 +15,10 @@ module HippieCSV
|
|
19
15
|
end
|
20
16
|
|
21
17
|
def maybe_parse(string)
|
18
|
+
encoded_string = encode(string)
|
19
|
+
|
22
20
|
QUOTE_CHARACTERS.find do |quote_character|
|
23
|
-
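[string, tolerate_escaping(string, quote_character), dump_quotes(string, quote_character)].find do |string_to_parse|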
|
21
|
+
[encoded_string, tolerate_escaping(encoded_string, quote_character), dump_quotes(encoded_string, quote_character)].find do |string_to_parse|
|
24
22
|
rescuing_malformed do
|
25
23
|
return parse_csv(string_to_parse.squeeze("\n").strip, quote_character)
|
26
24
|
end
|
@@ -36,6 +34,13 @@ module HippieCSV
|
|
36
34
|
)
|
37
35
|
end
|
38
36
|
|
37
|
+
def maybe_stream(path, &block)
|
38
|
+
File.foreach(path, encoding: ENCODING_WITH_BOM) do |line|
|
39
|
+
row = maybe_parse(line)
|
40
|
+
block.call(row.first) if row.first
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
39
44
|
def dump_quotes(string, quote_character)
|
40
45
|
string.gsub(quote_character, "")
|
41
46
|
end
|
data/lib/hippie_csv/version.rb
CHANGED
@@ -2,27 +2,6 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe HippieCSV::Support do
|
4
4
|
|
5
|
-
describe ".file_to_string" do
|
6
|
-
let(:file_path) { fixture_path(:normal) }
|
7
|
-
let(:result) { HippieCSV::Support.file_path_to_string(file_path) }
|
8
|
-
|
9
|
-
it "provides a string" do
|
10
|
-
expect(result.class).to eq String
|
11
|
-
end
|
12
|
-
|
13
|
-
it "reads the file" do
|
14
|
-
expect(result.slice(0,8)).to eq 'id,email'
|
15
|
-
end
|
16
|
-
|
17
|
-
context "with a byte order mark" do
|
18
|
-
let(:file_path) { fixture_path(:with_byte_order_mark) }
|
19
|
-
|
20
|
-
it "works" do
|
21
|
-
expect(result).to eq '"Name","Email Address","Date Added"'
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
5
|
describe ".encode" do
|
27
6
|
context "with invalid byte sequence" do
|
28
7
|
let(:string) { "\u0014\xFE\u000E\u0000" }
|
@@ -68,8 +47,23 @@ describe HippieCSV::Support do
|
|
68
47
|
end
|
69
48
|
|
70
49
|
describe ".maybe_parse" do
|
71
|
-
|
72
|
-
|
50
|
+
let(:file_path) { fixture_path(:small_file) }
|
51
|
+
it "works" do
|
52
|
+
expect(subject.maybe_parse(File.read(file_path))).to eq(
|
53
|
+
[["name", "email"], ["stephen", "test@example.com"]]
|
54
|
+
)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe ".maybe_stream" do
|
59
|
+
let(:file_path) { fixture_path(:small_file) }
|
60
|
+
it "works" do
|
61
|
+
result = []
|
62
|
+
subject.maybe_stream(file_path) { |row| result << row }
|
63
|
+
|
64
|
+
expect(result).to eq(
|
65
|
+
[["name", "email"], ["stephen", "test@example.com"]]
|
66
|
+
)
|
73
67
|
end
|
74
68
|
end
|
75
69
|
|
data/spec/hippie_csv_spec.rb
CHANGED
@@ -2,27 +2,28 @@ require "spec_helper"
|
|
2
2
|
require "csv"
|
3
3
|
|
4
4
|
describe HippieCSV do
|
5
|
+
|
5
6
|
let(:string) { "test" }
|
6
7
|
|
7
8
|
describe ".read" do
|
8
|
-
let(:path) { double }
|
9
9
|
|
10
|
-
it "
|
11
|
-
|
12
|
-
expect(subject).to receive(:parse).with(string)
|
10
|
+
it "reads and parses the file" do
|
11
|
+
path = fixture_path(:normal)
|
13
12
|
|
14
|
-
subject.read(path)
|
13
|
+
result = subject.read(path)
|
14
|
+
expect(result.first[0..1]).to eq(["id", "email"])
|
15
15
|
end
|
16
|
-
end
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
expect(subject::Support).to receive(:encode).with(string)
|
21
|
-
allow(subject::Support).to receive(:maybe_parse).and_return(double)
|
17
|
+
it "reads and parses the file with a byte order mark" do
|
18
|
+
path = fixture_path(:with_byte_order_mark)
|
22
19
|
|
23
|
-
subject.
|
20
|
+
result = subject.read(path)
|
21
|
+
expect(result).to eq([["Name", "Email Address", "Date Added"]])
|
24
22
|
end
|
25
23
|
|
24
|
+
end
|
25
|
+
|
26
|
+
describe ".parse" do
|
26
27
|
it "defers to support parse method" do
|
27
28
|
result = double
|
28
29
|
expect(subject::Support).to receive(:maybe_parse).with(string).and_return(result)
|
@@ -46,115 +47,156 @@ describe HippieCSV do
|
|
46
47
|
end
|
47
48
|
end
|
48
49
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
|
50
|
+
describe ".stream" do
|
51
|
+
path = fixture_path(:normal)
|
52
|
+
let(:proc) { Proc.new {} }
|
53
53
|
|
54
|
-
|
55
|
-
|
56
|
-
end
|
54
|
+
it "encodes the string" do
|
55
|
+
allow(subject::Support).to receive(:maybe_stream).and_return(double)
|
57
56
|
|
58
|
-
|
59
|
-
|
60
|
-
expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
|
57
|
+
subject.stream(path, &proc)
|
58
|
+
end
|
61
59
|
|
62
|
-
|
63
|
-
|
60
|
+
it "defers to support stream method" do
|
61
|
+
result = double
|
62
|
+
expect(subject::Support).to receive(:maybe_stream).with(path, &proc).and_return(result)
|
63
|
+
expect(subject.stream(path, &proc)).to eq(result)
|
64
64
|
end
|
65
65
|
|
66
|
-
it "works
|
67
|
-
path = fixture_path(:
|
68
|
-
expect { CSV.read(path) }.to raise_error(ArgumentError)
|
66
|
+
it "works" do
|
67
|
+
path = fixture_path(:normal)
|
69
68
|
|
70
|
-
|
71
|
-
|
69
|
+
result = []
|
70
|
+
subject.stream(path) { |row| result << row }
|
71
|
+
expect(result[0]).to eq(["id", "email", "name", "country", "city", "created_at", "admin"])
|
72
72
|
end
|
73
|
+
end
|
73
74
|
|
74
|
-
|
75
|
-
path = fixture_path(:excel)
|
75
|
+
context "integration cases: hard/encountered problems" do
|
76
76
|
|
77
|
-
|
78
|
-
|
77
|
+
def read(path)
|
78
|
+
subject.read(path)
|
79
79
|
end
|
80
80
|
|
81
|
-
|
82
|
-
|
83
|
-
|
81
|
+
def stream(path)
|
82
|
+
[].tap do |rows|
|
83
|
+
subject.stream(path) do |row|
|
84
|
+
rows << row
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
84
88
|
|
85
|
-
|
86
|
-
|
87
|
-
expect(import[1][2]).to eq("James Jimmy Doe")
|
89
|
+
def subject_call_method(method, path)
|
90
|
+
send(method, path)
|
88
91
|
end
|
89
92
|
|
90
|
-
it "
|
91
|
-
|
93
|
+
it "::read deals with a long, challenging file (and quickly)" do
|
94
|
+
start_time = Time.now
|
95
|
+
path = fixture_path(:never_ordered)
|
92
96
|
|
93
97
|
import = subject.read(path)
|
94
|
-
|
95
|
-
expect(import[0]
|
98
|
+
|
99
|
+
expect(import[0].count).to eq(10)
|
100
|
+
expect(import.count).to eq(32803)
|
101
|
+
expect(Time.now).to be_within(5).of(start_time)
|
96
102
|
end
|
97
103
|
|
98
|
-
|
99
|
-
|
104
|
+
%w[read stream].each do |method|
|
105
|
+
it "::#{method} works when a BOM is present in the file" do
|
106
|
+
path = fixture_path(:with_byte_order_mark)
|
100
107
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
expect(import[0][2]).to eq("stephen@example.com")
|
105
|
-
end
|
108
|
+
import = subject_call_method(method, path)
|
109
|
+
expect(import[0]).to eq(["Name", "Email Address", "Date Added"])
|
110
|
+
end
|
106
111
|
|
107
|
-
|
108
|
-
|
112
|
+
it "::#{method} works with a malformed CSV" do
|
113
|
+
path = fixture_path(:malformed)
|
114
|
+
expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
|
109
115
|
|
110
|
-
|
111
|
-
|
112
|
-
|
116
|
+
import = subject_call_method(method, path)
|
117
|
+
expect(import[0]).to eq(%w(site lon lat max min precip snow snowdepth))
|
118
|
+
end
|
113
119
|
|
114
|
-
|
115
|
-
|
120
|
+
it "::#{method} works with odd encoding & emoji!" do
|
121
|
+
path = fixture_path(:encoding)
|
122
|
+
expect { CSV.read(path) }.to raise_error(ArgumentError)
|
116
123
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
end
|
124
|
+
import = subject_call_method(method, path)
|
125
|
+
expect(import[0].count).to eq(4)
|
126
|
+
end
|
121
127
|
|
122
|
-
|
123
|
-
|
124
|
-
path = fixture_path(:never_ordered)
|
128
|
+
it "::#{method} works with an excel export" do
|
129
|
+
path = fixture_path(:excel)
|
125
130
|
|
126
|
-
|
131
|
+
import = subject_call_method(method, path)
|
132
|
+
expect(import[0].count).to eq(24)
|
133
|
+
end
|
127
134
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
135
|
+
it "::#{method} works with unescaped internal quotes" do
|
136
|
+
path = fixture_path(:internal_quotes)
|
137
|
+
|
138
|
+
import = subject_call_method(method, path)
|
139
|
+
expect(import[1][1]).to eq("123")
|
140
|
+
expect(import[1][2]).to eq("James Jimmy Doe")
|
141
|
+
end
|
132
142
|
|
133
|
-
|
134
|
-
|
143
|
+
it "::#{method} works with escaped quotes" do
|
144
|
+
path = fixture_path(:escaped_quotes)
|
135
145
|
|
136
|
-
|
137
|
-
|
138
|
-
import
|
139
|
-
|
140
|
-
expect(import.count).to eq(8)
|
141
|
-
}.not_to raise_error
|
142
|
-
end
|
146
|
+
import = subject_call_method(method, path)
|
147
|
+
expect(import[0][1]).to eq("Lalo \"ElPapi\" Neymar")
|
148
|
+
expect(import[0][2]).to eq("lalo@example.com")
|
149
|
+
end
|
143
150
|
|
144
|
-
|
145
|
-
|
151
|
+
it "::#{method} works with an invalid escaped quotes case" do
|
152
|
+
path = fixture_path(:escaped_quotes_semicolons)
|
146
153
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
154
|
+
import = subject_call_method(method, path)
|
155
|
+
expect(import[0][0]).to eq("133")
|
156
|
+
expect(import[0][1]).to eq("z3268856")
|
157
|
+
expect(import[0][2]).to eq("stephen@example.com")
|
158
|
+
end
|
159
|
+
|
160
|
+
it "::#{method} works for a complicated case involving bad newlines and quote chars" do
|
161
|
+
path = fixture_path(:dos_line_ending)
|
162
|
+
|
163
|
+
import = subject_call_method(method, path)
|
164
|
+
expect(import[0].count).to eq(9)
|
165
|
+
end
|
166
|
+
|
167
|
+
it "::#{method} works for a hard case" do
|
168
|
+
path = fixture_path(:accents_semicolon_windows_1252)
|
169
|
+
|
170
|
+
import = subject_call_method(method, path)
|
171
|
+
expect(import[0][1]).to eq("Jérome")
|
172
|
+
expect(import[1][0]).to eq("Héloise")
|
173
|
+
end
|
174
|
+
|
175
|
+
it "::#{method} works when many invalid quote types contained" do
|
176
|
+
path = fixture_path(:bad_quoting)
|
177
|
+
|
178
|
+
expect {
|
179
|
+
import = subject_call_method(method, path)
|
180
|
+
expect(import.map(&:count).uniq).to eq([11])
|
181
|
+
expect(import.count).to eq(8)
|
182
|
+
}.not_to raise_error
|
183
|
+
end
|
184
|
+
|
185
|
+
it "::#{method} strips leading/trailing blank lines" do
|
186
|
+
path = fixture_path(:trailing_leading_blank_lines)
|
187
|
+
|
188
|
+
import = subject_call_method(method, path)
|
189
|
+
expect(import.first).not_to be_empty
|
190
|
+
expect(import.last).not_to be_empty
|
191
|
+
end
|
151
192
|
|
152
|
-
|
153
|
-
|
154
|
-
|
193
|
+
it "::#{method} maintains coherent column count when stripping blank lines" do
|
194
|
+
[:blank_lines_crlf, :trailing_leading_blank_lines].each do |fixture_name|
|
195
|
+
path = fixture_path(fixture_name)
|
155
196
|
|
156
|
-
|
157
|
-
|
197
|
+
import = subject_call_method(method, path)
|
198
|
+
expect(import.map(&:length).uniq.size).to eq(1)
|
199
|
+
end
|
158
200
|
end
|
159
201
|
end
|
160
202
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hippie_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stephen O'Brien
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -111,6 +111,7 @@ files:
|
|
111
111
|
- spec/fixtures/malformed.csv
|
112
112
|
- spec/fixtures/never_ordered.csv
|
113
113
|
- spec/fixtures/normal.csv
|
114
|
+
- spec/fixtures/small_file.csv
|
114
115
|
- spec/fixtures/trailing_leading_blank_lines.csv
|
115
116
|
- spec/fixtures/with_byte_order_mark.csv
|
116
117
|
- spec/hippie_csv/constants_spec.rb
|
@@ -138,7 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
138
139
|
version: '0'
|
139
140
|
requirements: []
|
140
141
|
rubyforge_project:
|
141
|
-
rubygems_version: 2.
|
142
|
+
rubygems_version: 2.6.13
|
142
143
|
signing_key:
|
143
144
|
specification_version: 4
|
144
145
|
summary: Tolerant, liberal CSV parsing
|
@@ -155,6 +156,7 @@ test_files:
|
|
155
156
|
- spec/fixtures/malformed.csv
|
156
157
|
- spec/fixtures/never_ordered.csv
|
157
158
|
- spec/fixtures/normal.csv
|
159
|
+
- spec/fixtures/small_file.csv
|
158
160
|
- spec/fixtures/trailing_leading_blank_lines.csv
|
159
161
|
- spec/fixtures/with_byte_order_mark.csv
|
160
162
|
- spec/hippie_csv/constants_spec.rb
|