hippie_csv 0.0.11 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -2
- data/lib/hippie_csv.rb +5 -2
- data/lib/hippie_csv/support.rb +10 -5
- data/lib/hippie_csv/version.rb +1 -1
- data/spec/fixtures/small_file.csv +2 -0
- data/spec/hippie_csv/support_spec.rb +17 -23
- data/spec/hippie_csv/version_spec.rb +1 -1
- data/spec/hippie_csv_spec.rb +130 -88
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aca3f93c2f4a521f946abc23a9bf258fa32e1e5f
|
4
|
+
data.tar.gz: e7bb585167d4a9cc7b351593650991e7383359a5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 37be4a8ccff26295f9af6109207b8337c9bb0b9bddc392beb4ad7a668ff1102525dfb36e7eacdc462d5da6b61ac50de2681b2e6c3e3b3d2c8dbd46231dcd5748
|
7
|
+
data.tar.gz: bd0a83a02adf59c29fb7ebc8644b500d728552e902d04b4d5cb8262e8b485fa0c92d42b569ee6be6cafeeec5e65af566b9b809543fc0a7f96bc0d899f4f374cc
|
data/README.md
CHANGED
@@ -31,14 +31,23 @@ Or install it yourself as:
|
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
|
34
|
-
Exposes
|
35
|
-
|
34
|
+
Exposes three public methods:
|
35
|
+
1. `.read` a file path to an array. Reads from the file all at once, building the whole CSV object in memory.
|
36
|
+
2. `.parse` an in memory string to an array.
|
37
|
+
3. `.stream` from a file path and parse line by line, calling a given block on each row.
|
38
|
+
|
39
|
+
**Note**: Processing large files using read or parse is a memory intensive operation. Use stream for parsing a CSV file line by line from the file to save memory. This method will use less memory but take longer, as we run each line through parse.
|
40
|
+
|
36
41
|
|
37
42
|
```ruby
|
38
43
|
require 'hippie_csv'
|
39
44
|
|
40
45
|
HippieCSV.read("path/to/data.csv")
|
41
46
|
|
47
|
+
HippieCSV.stream("path/to/data.csv") do |row|
|
48
|
+
# use row here...
|
49
|
+
end
|
50
|
+
|
42
51
|
HippieCSV.parse(csv_string)
|
43
52
|
```
|
44
53
|
|
data/lib/hippie_csv.rb
CHANGED
@@ -4,12 +4,15 @@ require "hippie_csv/errors"
|
|
4
4
|
|
5
5
|
module HippieCSV
|
6
6
|
def self.read(path)
|
7
|
-
string =
|
7
|
+
string = File.read(path, encoding: ENCODING_WITH_BOM)
|
8
8
|
parse(string)
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.parse(string)
|
12
|
-
string = Support.encode(string)
|
13
12
|
Support.maybe_parse(string) || (raise UnableToParseError)
|
14
13
|
end
|
14
|
+
|
15
|
+
def self.stream(path, &block)
|
16
|
+
Support.maybe_stream(path, &block)
|
17
|
+
end
|
15
18
|
end
|
data/lib/hippie_csv/support.rb
CHANGED
@@ -4,10 +4,6 @@ require "rchardet"
|
|
4
4
|
module HippieCSV
|
5
5
|
module Support
|
6
6
|
class << self
|
7
|
-
def file_path_to_string(file_path)
|
8
|
-
File.read(file_path, encoding: ENCODING_WITH_BOM)
|
9
|
-
end
|
10
|
-
|
11
7
|
def encode(string)
|
12
8
|
string = ensure_valid_encoding(string)
|
13
9
|
|
@@ -19,8 +15,10 @@ module HippieCSV
|
|
19
15
|
end
|
20
16
|
|
21
17
|
def maybe_parse(string)
|
18
|
+
encoded_string = encode(string)
|
19
|
+
|
22
20
|
QUOTE_CHARACTERS.find do |quote_character|
|
23
|
-
[
|
21
|
+
[encoded_string, tolerate_escaping(encoded_string, quote_character), dump_quotes(encoded_string, quote_character)].find do |string_to_parse|
|
24
22
|
rescuing_malformed do
|
25
23
|
return parse_csv(string_to_parse.squeeze("\n").strip, quote_character)
|
26
24
|
end
|
@@ -36,6 +34,13 @@ module HippieCSV
|
|
36
34
|
)
|
37
35
|
end
|
38
36
|
|
37
|
+
def maybe_stream(path, &block)
|
38
|
+
File.foreach(path, encoding: ENCODING_WITH_BOM) do |line|
|
39
|
+
row = maybe_parse(line)
|
40
|
+
block.call(row.first) if row.first
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
39
44
|
def dump_quotes(string, quote_character)
|
40
45
|
string.gsub(quote_character, "")
|
41
46
|
end
|
data/lib/hippie_csv/version.rb
CHANGED
data/spec/hippie_csv/support_spec.rb
CHANGED
@@ -2,27 +2,6 @@ require "spec_helper"
|
|
2
2
|
|
3
3
|
describe HippieCSV::Support do
|
4
4
|
|
5
|
-
describe ".file_to_string" do
|
6
|
-
let(:file_path) { fixture_path(:normal) }
|
7
|
-
let(:result) { HippieCSV::Support.file_path_to_string(file_path) }
|
8
|
-
|
9
|
-
it "provides a string" do
|
10
|
-
expect(result.class).to eq String
|
11
|
-
end
|
12
|
-
|
13
|
-
it "reads the file" do
|
14
|
-
expect(result.slice(0,8)).to eq 'id,email'
|
15
|
-
end
|
16
|
-
|
17
|
-
context "with a byte order mark" do
|
18
|
-
let(:file_path) { fixture_path(:with_byte_order_mark) }
|
19
|
-
|
20
|
-
it "works" do
|
21
|
-
expect(result).to eq '"Name","Email Address","Date Added"'
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
5
|
describe ".encode" do
|
27
6
|
context "with invalid byte sequence" do
|
28
7
|
let(:string) { "\u0014\xFE\u000E\u0000" }
|
@@ -68,8 +47,23 @@ describe HippieCSV::Support do
|
|
68
47
|
end
|
69
48
|
|
70
49
|
describe ".maybe_parse" do
|
71
|
-
|
72
|
-
|
50
|
+
let(:file_path) { fixture_path(:small_file) }
|
51
|
+
it "works" do
|
52
|
+
expect(subject.maybe_parse(File.read(file_path))).to eq(
|
53
|
+
[["name", "email"], ["stephen", "test@example.com"]]
|
54
|
+
)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe ".maybe_stream" do
|
59
|
+
let(:file_path) { fixture_path(:small_file) }
|
60
|
+
it "works" do
|
61
|
+
result = []
|
62
|
+
subject.maybe_stream(file_path) { |row| result << row }
|
63
|
+
|
64
|
+
expect(result).to eq(
|
65
|
+
[["name", "email"], ["stephen", "test@example.com"]]
|
66
|
+
)
|
73
67
|
end
|
74
68
|
end
|
75
69
|
|
data/spec/hippie_csv_spec.rb
CHANGED
@@ -2,27 +2,28 @@ require "spec_helper"
|
|
2
2
|
require "csv"
|
3
3
|
|
4
4
|
describe HippieCSV do
|
5
|
+
|
5
6
|
let(:string) { "test" }
|
6
7
|
|
7
8
|
describe ".read" do
|
8
|
-
let(:path) { double }
|
9
9
|
|
10
|
-
it "
|
11
|
-
|
12
|
-
expect(subject).to receive(:parse).with(string)
|
10
|
+
it "reads and parses the file" do
|
11
|
+
path = fixture_path(:normal)
|
13
12
|
|
14
|
-
subject.read(path)
|
13
|
+
result = subject.read(path)
|
14
|
+
expect(result.first[0..1]).to eq(["id", "email"])
|
15
15
|
end
|
16
|
-
end
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
expect(subject::Support).to receive(:encode).with(string)
|
21
|
-
allow(subject::Support).to receive(:maybe_parse).and_return(double)
|
17
|
+
it "reads and parses the file with a byte order mark" do
|
18
|
+
path = fixture_path(:with_byte_order_mark)
|
22
19
|
|
23
|
-
subject.
|
20
|
+
result = subject.read(path)
|
21
|
+
expect(result).to eq([["Name", "Email Address", "Date Added"]])
|
24
22
|
end
|
25
23
|
|
24
|
+
end
|
25
|
+
|
26
|
+
describe ".parse" do
|
26
27
|
it "defers to support parse method" do
|
27
28
|
result = double
|
28
29
|
expect(subject::Support).to receive(:maybe_parse).with(string).and_return(result)
|
@@ -46,115 +47,156 @@ describe HippieCSV do
|
|
46
47
|
end
|
47
48
|
end
|
48
49
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
|
50
|
+
describe ".stream" do
|
51
|
+
path = fixture_path(:normal)
|
52
|
+
let(:proc) { Proc.new {} }
|
53
53
|
|
54
|
-
|
55
|
-
|
56
|
-
end
|
54
|
+
it "encodes the string" do
|
55
|
+
allow(subject::Support).to receive(:maybe_stream).and_return(double)
|
57
56
|
|
58
|
-
|
59
|
-
|
60
|
-
expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
|
57
|
+
subject.stream(path, &proc)
|
58
|
+
end
|
61
59
|
|
62
|
-
|
63
|
-
|
60
|
+
it "defers to support stream method" do
|
61
|
+
result = double
|
62
|
+
expect(subject::Support).to receive(:maybe_stream).with(path, &proc).and_return(result)
|
63
|
+
expect(subject.stream(path, &proc)).to eq(result)
|
64
64
|
end
|
65
65
|
|
66
|
-
it "works
|
67
|
-
path = fixture_path(:
|
68
|
-
expect { CSV.read(path) }.to raise_error(ArgumentError)
|
66
|
+
it "works" do
|
67
|
+
path = fixture_path(:normal)
|
69
68
|
|
70
|
-
|
71
|
-
|
69
|
+
result = []
|
70
|
+
subject.stream(path) { |row| result << row }
|
71
|
+
expect(result[0]).to eq(["id", "email", "name", "country", "city", "created_at", "admin"])
|
72
72
|
end
|
73
|
+
end
|
73
74
|
|
74
|
-
|
75
|
-
path = fixture_path(:excel)
|
75
|
+
context "integration cases: hard/encountered problems" do
|
76
76
|
|
77
|
-
|
78
|
-
|
77
|
+
def read(path)
|
78
|
+
subject.read(path)
|
79
79
|
end
|
80
80
|
|
81
|
-
|
82
|
-
|
83
|
-
|
81
|
+
def stream(path)
|
82
|
+
[].tap do |rows|
|
83
|
+
subject.stream(path) do |row|
|
84
|
+
rows << row
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
84
88
|
|
85
|
-
|
86
|
-
|
87
|
-
expect(import[1][2]).to eq("James Jimmy Doe")
|
89
|
+
def subject_call_method(method, path)
|
90
|
+
send(method, path)
|
88
91
|
end
|
89
92
|
|
90
|
-
it "
|
91
|
-
|
93
|
+
it "::read deals with a long, challenging file (and quickly)" do
|
94
|
+
start_time = Time.now
|
95
|
+
path = fixture_path(:never_ordered)
|
92
96
|
|
93
97
|
import = subject.read(path)
|
94
|
-
|
95
|
-
expect(import[0]
|
98
|
+
|
99
|
+
expect(import[0].count).to eq(10)
|
100
|
+
expect(import.count).to eq(32803)
|
101
|
+
expect(Time.now).to be_within(5).of(start_time)
|
96
102
|
end
|
97
103
|
|
98
|
-
|
99
|
-
|
104
|
+
%w[read stream].each do |method|
|
105
|
+
it "::#{method} works when a BOM is present in the file" do
|
106
|
+
path = fixture_path(:with_byte_order_mark)
|
100
107
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
expect(import[0][2]).to eq("stephen@example.com")
|
105
|
-
end
|
108
|
+
import = subject_call_method(method, path)
|
109
|
+
expect(import[0]).to eq(["Name", "Email Address", "Date Added"])
|
110
|
+
end
|
106
111
|
|
107
|
-
|
108
|
-
|
112
|
+
it "::#{method} works with a malformed CSV" do
|
113
|
+
path = fixture_path(:malformed)
|
114
|
+
expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
|
109
115
|
|
110
|
-
|
111
|
-
|
112
|
-
|
116
|
+
import = subject_call_method(method, path)
|
117
|
+
expect(import[0]).to eq(%w(site lon lat max min precip snow snowdepth))
|
118
|
+
end
|
113
119
|
|
114
|
-
|
115
|
-
|
120
|
+
it "::#{method} works with odd encoding & emoji!" do
|
121
|
+
path = fixture_path(:encoding)
|
122
|
+
expect { CSV.read(path) }.to raise_error(ArgumentError)
|
116
123
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
end
|
124
|
+
import = subject_call_method(method, path)
|
125
|
+
expect(import[0].count).to eq(4)
|
126
|
+
end
|
121
127
|
|
122
|
-
|
123
|
-
|
124
|
-
path = fixture_path(:never_ordered)
|
128
|
+
it "::#{method} works with an excel export" do
|
129
|
+
path = fixture_path(:excel)
|
125
130
|
|
126
|
-
|
131
|
+
import = subject_call_method(method, path)
|
132
|
+
expect(import[0].count).to eq(24)
|
133
|
+
end
|
127
134
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
135
|
+
it "::#{method} works with unescaped internal quotes" do
|
136
|
+
path = fixture_path(:internal_quotes)
|
137
|
+
|
138
|
+
import = subject_call_method(method, path)
|
139
|
+
expect(import[1][1]).to eq("123")
|
140
|
+
expect(import[1][2]).to eq("James Jimmy Doe")
|
141
|
+
end
|
132
142
|
|
133
|
-
|
134
|
-
|
143
|
+
it "::#{method} works with escaped quotes" do
|
144
|
+
path = fixture_path(:escaped_quotes)
|
135
145
|
|
136
|
-
|
137
|
-
|
138
|
-
import
|
139
|
-
|
140
|
-
expect(import.count).to eq(8)
|
141
|
-
}.not_to raise_error
|
142
|
-
end
|
146
|
+
import = subject_call_method(method, path)
|
147
|
+
expect(import[0][1]).to eq("Lalo \"ElPapi\" Neymar")
|
148
|
+
expect(import[0][2]).to eq("lalo@example.com")
|
149
|
+
end
|
143
150
|
|
144
|
-
|
145
|
-
|
151
|
+
it "::#{method} works with an invalid escaped quotes case" do
|
152
|
+
path = fixture_path(:escaped_quotes_semicolons)
|
146
153
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
154
|
+
import = subject_call_method(method, path)
|
155
|
+
expect(import[0][0]).to eq("133")
|
156
|
+
expect(import[0][1]).to eq("z3268856")
|
157
|
+
expect(import[0][2]).to eq("stephen@example.com")
|
158
|
+
end
|
159
|
+
|
160
|
+
it "::#{method} works for a complicated case involving bad newlines and quote chars" do
|
161
|
+
path = fixture_path(:dos_line_ending)
|
162
|
+
|
163
|
+
import = subject_call_method(method, path)
|
164
|
+
expect(import[0].count).to eq(9)
|
165
|
+
end
|
166
|
+
|
167
|
+
it "::#{method} works for a hard case" do
|
168
|
+
path = fixture_path(:accents_semicolon_windows_1252)
|
169
|
+
|
170
|
+
import = subject_call_method(method, path)
|
171
|
+
expect(import[0][1]).to eq("Jérome")
|
172
|
+
expect(import[1][0]).to eq("Héloise")
|
173
|
+
end
|
174
|
+
|
175
|
+
it "::#{method} works when many invalid quote types contained" do
|
176
|
+
path = fixture_path(:bad_quoting)
|
177
|
+
|
178
|
+
expect {
|
179
|
+
import = subject_call_method(method, path)
|
180
|
+
expect(import.map(&:count).uniq).to eq([11])
|
181
|
+
expect(import.count).to eq(8)
|
182
|
+
}.not_to raise_error
|
183
|
+
end
|
184
|
+
|
185
|
+
it "::#{method} strips leading/trailing blank lines" do
|
186
|
+
path = fixture_path(:trailing_leading_blank_lines)
|
187
|
+
|
188
|
+
import = subject_call_method(method, path)
|
189
|
+
expect(import.first).not_to be_empty
|
190
|
+
expect(import.last).not_to be_empty
|
191
|
+
end
|
151
192
|
|
152
|
-
|
153
|
-
|
154
|
-
|
193
|
+
it "::#{method} maintains coherent column count when stripping blank lines" do
|
194
|
+
[:blank_lines_crlf, :trailing_leading_blank_lines].each do |fixture_name|
|
195
|
+
path = fixture_path(fixture_name)
|
155
196
|
|
156
|
-
|
157
|
-
|
197
|
+
import = subject_call_method(method, path)
|
198
|
+
expect(import.map(&:length).uniq.size).to eq(1)
|
199
|
+
end
|
158
200
|
end
|
159
201
|
end
|
160
202
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hippie_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stephen O'Brien
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -111,6 +111,7 @@ files:
|
|
111
111
|
- spec/fixtures/malformed.csv
|
112
112
|
- spec/fixtures/never_ordered.csv
|
113
113
|
- spec/fixtures/normal.csv
|
114
|
+
- spec/fixtures/small_file.csv
|
114
115
|
- spec/fixtures/trailing_leading_blank_lines.csv
|
115
116
|
- spec/fixtures/with_byte_order_mark.csv
|
116
117
|
- spec/hippie_csv/constants_spec.rb
|
@@ -138,7 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
138
139
|
version: '0'
|
139
140
|
requirements: []
|
140
141
|
rubyforge_project:
|
141
|
-
rubygems_version: 2.
|
142
|
+
rubygems_version: 2.6.13
|
142
143
|
signing_key:
|
143
144
|
specification_version: 4
|
144
145
|
summary: Tolerant, liberal CSV parsing
|
@@ -155,6 +156,7 @@ test_files:
|
|
155
156
|
- spec/fixtures/malformed.csv
|
156
157
|
- spec/fixtures/never_ordered.csv
|
157
158
|
- spec/fixtures/normal.csv
|
159
|
+
- spec/fixtures/small_file.csv
|
158
160
|
- spec/fixtures/trailing_leading_blank_lines.csv
|
159
161
|
- spec/fixtures/with_byte_order_mark.csv
|
160
162
|
- spec/hippie_csv/constants_spec.rb
|