hippie_csv 0.0.11 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 07464f9018551eea7e183ef2ffd0e855b8a46d6a
4
- data.tar.gz: 98b28c4757768af0fe785127914bdc964a067cff
3
+ metadata.gz: aca3f93c2f4a521f946abc23a9bf258fa32e1e5f
4
+ data.tar.gz: e7bb585167d4a9cc7b351593650991e7383359a5
5
5
  SHA512:
6
- metadata.gz: 346448b5193bc9beeddee5ec1dfe066a795d0a8820b2dbf28910925bf461feaec1cbbcd886394bf8323d47d931cbfda6e540febe5d5a51935b6fc127c0b61573
7
- data.tar.gz: ec3278642148bc8597af5d5128fe2c9ac2231fa7b87c19dc40d3a0933ba0a41c0407464bcbe938609e3ea55c72587d39c3c50d56b04c280e4300bd8f5f27e44e
6
+ metadata.gz: 37be4a8ccff26295f9af6109207b8337c9bb0b9bddc392beb4ad7a668ff1102525dfb36e7eacdc462d5da6b61ac50de2681b2e6c3e3b3d2c8dbd46231dcd5748
7
+ data.tar.gz: bd0a83a02adf59c29fb7ebc8644b500d728552e902d04b4d5cb8262e8b485fa0c92d42b569ee6be6cafeeec5e65af566b9b809543fc0a7f96bc0d899f4f374cc
data/README.md CHANGED
@@ -31,14 +31,23 @@ Or install it yourself as:
31
31
 
32
32
  ## Usage
33
33
 
34
- Exposes two public methods: `read` (for paths to files), and `parse` (for
35
- strings).
34
+ Exposes three public methods:
35
+ 1. `.read` parses the file at a given path into an array. It reads the file all at once, building the whole CSV object in memory.
36
+ 2. `.parse` parses an in-memory string into an array.
37
+ 3. `.stream` reads the file at a given path and parses it line by line, calling the given block with each row.
38
+
39
+ **Note**: Processing large files with `read` or `parse` is memory-intensive. Use `stream` to parse a CSV file line by line and save memory. `stream` uses less memory but takes longer, because each line is run through `parse` individually.
40
+
36
41
 
37
42
  ```ruby
38
43
  require 'hippie_csv'
39
44
 
40
45
  HippieCSV.read("path/to/data.csv")
41
46
 
47
+ HippieCSV.stream("path/to/data.csv") do |row|
48
+ # use row here...
49
+ end
50
+
42
51
  HippieCSV.parse(csv_string)
43
52
  ```
44
53
 
@@ -4,12 +4,15 @@ require "hippie_csv/errors"
4
4
 
5
5
  module HippieCSV
6
6
  def self.read(path)
7
- string = Support.file_path_to_string(path)
7
+ string = File.read(path, encoding: ENCODING_WITH_BOM)
8
8
  parse(string)
9
9
  end
10
10
 
11
11
  def self.parse(string)
12
- string = Support.encode(string)
13
12
  Support.maybe_parse(string) || (raise UnableToParseError)
14
13
  end
14
+
15
+ def self.stream(path, &block)
16
+ Support.maybe_stream(path, &block)
17
+ end
15
18
  end
@@ -4,10 +4,6 @@ require "rchardet"
4
4
  module HippieCSV
5
5
  module Support
6
6
  class << self
7
- def file_path_to_string(file_path)
8
- File.read(file_path, encoding: ENCODING_WITH_BOM)
9
- end
10
-
11
7
  def encode(string)
12
8
  string = ensure_valid_encoding(string)
13
9
 
@@ -19,8 +15,10 @@ module HippieCSV
19
15
  end
20
16
 
21
17
  def maybe_parse(string)
18
+ encoded_string = encode(string)
19
+
22
20
  QUOTE_CHARACTERS.find do |quote_character|
23
- [string, tolerate_escaping(string, quote_character), dump_quotes(string, quote_character)].find do |string_to_parse|
21
+ [encoded_string, tolerate_escaping(encoded_string, quote_character), dump_quotes(encoded_string, quote_character)].find do |string_to_parse|
24
22
  rescuing_malformed do
25
23
  return parse_csv(string_to_parse.squeeze("\n").strip, quote_character)
26
24
  end
@@ -36,6 +34,13 @@ module HippieCSV
36
34
  )
37
35
  end
38
36
 
37
+ def maybe_stream(path, &block)
38
+ File.foreach(path, encoding: ENCODING_WITH_BOM) do |line|
39
+ row = maybe_parse(line)
40
+ block.call(row.first) if row.first
41
+ end
42
+ end
43
+
39
44
  def dump_quotes(string, quote_character)
40
45
  string.gsub(quote_character, "")
41
46
  end
@@ -1,3 +1,3 @@
1
1
  module HippieCSV
2
- VERSION = "0.0.11"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -0,0 +1,2 @@
1
+ name,email
2
+ stephen,test@example.com
@@ -2,27 +2,6 @@ require "spec_helper"
2
2
 
3
3
  describe HippieCSV::Support do
4
4
 
5
- describe ".file_to_string" do
6
- let(:file_path) { fixture_path(:normal) }
7
- let(:result) { HippieCSV::Support.file_path_to_string(file_path) }
8
-
9
- it "provides a string" do
10
- expect(result.class).to eq String
11
- end
12
-
13
- it "reads the file" do
14
- expect(result.slice(0,8)).to eq 'id,email'
15
- end
16
-
17
- context "with a byte order mark" do
18
- let(:file_path) { fixture_path(:with_byte_order_mark) }
19
-
20
- it "works" do
21
- expect(result).to eq '"Name","Email Address","Date Added"'
22
- end
23
- end
24
- end
25
-
26
5
  describe ".encode" do
27
6
  context "with invalid byte sequence" do
28
7
  let(:string) { "\u0014\xFE\u000E\u0000" }
@@ -68,8 +47,23 @@ describe HippieCSV::Support do
68
47
  end
69
48
 
70
49
  describe ".maybe_parse" do
71
- it "needs to be written" do
72
- skip # TODO write this test
50
+ let(:file_path) { fixture_path(:small_file) }
51
+ it "works" do
52
+ expect(subject.maybe_parse(File.read(file_path))).to eq(
53
+ [["name", "email"], ["stephen", "test@example.com"]]
54
+ )
55
+ end
56
+ end
57
+
58
+ describe ".maybe_stream" do
59
+ let(:file_path) { fixture_path(:small_file) }
60
+ it "works" do
61
+ result = []
62
+ subject.maybe_stream(file_path) { |row| result << row }
63
+
64
+ expect(result).to eq(
65
+ [["name", "email"], ["stephen", "test@example.com"]]
66
+ )
73
67
  end
74
68
  end
75
69
 
@@ -3,7 +3,7 @@ require "spec_helper"
3
3
  describe HippieCSV do
4
4
 
5
5
  it "defines a version" do
6
- expect(HippieCSV::VERSION).to eq("0.0.11")
6
+ expect(HippieCSV::VERSION).to eq("0.1.0")
7
7
  end
8
8
 
9
9
  end
@@ -2,27 +2,28 @@ require "spec_helper"
2
2
  require "csv"
3
3
 
4
4
  describe HippieCSV do
5
+
5
6
  let(:string) { "test" }
6
7
 
7
8
  describe ".read" do
8
- let(:path) { double }
9
9
 
10
- it "converts to string and parses" do
11
- expect(subject::Support).to receive(:file_path_to_string).with(path).and_return(string)
12
- expect(subject).to receive(:parse).with(string)
10
+ it "reads and parses the file" do
11
+ path = fixture_path(:normal)
13
12
 
14
- subject.read(path)
13
+ result = subject.read(path)
14
+ expect(result.first[0..1]).to eq(["id", "email"])
15
15
  end
16
- end
17
16
 
18
- describe ".parse" do
19
- it "encodes the string" do
20
- expect(subject::Support).to receive(:encode).with(string)
21
- allow(subject::Support).to receive(:maybe_parse).and_return(double)
17
+ it "reads and parses the file with a byte order mark" do
18
+ path = fixture_path(:with_byte_order_mark)
22
19
 
23
- subject.parse(string)
20
+ result = subject.read(path)
21
+ expect(result).to eq([["Name", "Email Address", "Date Added"]])
24
22
  end
25
23
 
24
+ end
25
+
26
+ describe ".parse" do
26
27
  it "defers to support parse method" do
27
28
  result = double
28
29
  expect(subject::Support).to receive(:maybe_parse).with(string).and_return(result)
@@ -46,115 +47,156 @@ describe HippieCSV do
46
47
  end
47
48
  end
48
49
 
49
- context "integration cases: hard/encountered problems" do
50
- it "works when a BOM is present in the file" do
51
- path = fixture_path(:with_byte_order_mark)
52
- expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
50
+ describe ".stream" do
51
+ path = fixture_path(:normal)
52
+ let(:proc) { Proc.new {} }
53
53
 
54
- import = subject.read(path)
55
- expect(import[0]).to eq(["Name", "Email Address", "Date Added"])
56
- end
54
+ it "encodes the string" do
55
+ allow(subject::Support).to receive(:maybe_stream).and_return(double)
57
56
 
58
- it "works with a malformed CSV" do
59
- path = fixture_path(:malformed)
60
- expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
57
+ subject.stream(path, &proc)
58
+ end
61
59
 
62
- import = subject.read(path)
63
- expect(import[0]).to eq(%w(site lon lat max min precip snow snowdepth))
60
+ it "defers to support stream method" do
61
+ result = double
62
+ expect(subject::Support).to receive(:maybe_stream).with(path, &proc).and_return(result)
63
+ expect(subject.stream(path, &proc)).to eq(result)
64
64
  end
65
65
 
66
- it "works with odd encoding & emoji!" do
67
- path = fixture_path(:encoding)
68
- expect { CSV.read(path) }.to raise_error(ArgumentError)
66
+ it "works" do
67
+ path = fixture_path(:normal)
69
68
 
70
- import = subject.read(path)
71
- expect(import[0].count).to eq(4)
69
+ result = []
70
+ subject.stream(path) { |row| result << row }
71
+ expect(result[0]).to eq(["id", "email", "name", "country", "city", "created_at", "admin"])
72
72
  end
73
+ end
73
74
 
74
- it "works with an excel export" do
75
- path = fixture_path(:excel)
75
+ context "integration cases: hard/encountered problems" do
76
76
 
77
- import = subject.read(path)
78
- expect(import[0].count).to eq(24)
77
+ def read(path)
78
+ subject.read(path)
79
79
  end
80
80
 
81
- it "works with unescaped internal quotes" do
82
- path = fixture_path(:internal_quotes)
83
- expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
81
+ def stream(path)
82
+ [].tap do |rows|
83
+ subject.stream(path) do |row|
84
+ rows << row
85
+ end
86
+ end
87
+ end
84
88
 
85
- import = subject.read(path)
86
- expect(import[1][1]).to eq("123")
87
- expect(import[1][2]).to eq("James Jimmy Doe")
89
+ def subject_call_method(method, path)
90
+ send(method, path)
88
91
  end
89
92
 
90
- it "works with escaped quotes" do
91
- path = fixture_path(:escaped_quotes)
93
+ it "::read deals with a long, challenging file (and quickly)" do
94
+ start_time = Time.now
95
+ path = fixture_path(:never_ordered)
92
96
 
93
97
  import = subject.read(path)
94
- expect(import[0][1]).to eq("Lalo \"ElPapi\" Neymar")
95
- expect(import[0][2]).to eq("lalo@example.com")
98
+
99
+ expect(import[0].count).to eq(10)
100
+ expect(import.count).to eq(32803)
101
+ expect(Time.now).to be_within(5).of(start_time)
96
102
  end
97
103
 
98
- it "works with an invalid escaped quotes case" do
99
- path = fixture_path(:escaped_quotes_semicolons)
104
+ %w[read stream].each do |method|
105
+ it "::#{method} works when a BOM is present in the file" do
106
+ path = fixture_path(:with_byte_order_mark)
100
107
 
101
- import = subject.read(path)
102
- expect(import[0][0]).to eq("133")
103
- expect(import[0][1]).to eq("z3268856")
104
- expect(import[0][2]).to eq("stephen@example.com")
105
- end
108
+ import = subject_call_method(method, path)
109
+ expect(import[0]).to eq(["Name", "Email Address", "Date Added"])
110
+ end
106
111
 
107
- it "works for a complicated case involving bad newlines and quote chars" do
108
- path = fixture_path(:dos_line_ending)
112
+ it "::#{method} works with a malformed CSV" do
113
+ path = fixture_path(:malformed)
114
+ expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
109
115
 
110
- import = subject.read(path)
111
- expect(import[0].count).to eq(9)
112
- end
116
+ import = subject_call_method(method, path)
117
+ expect(import[0]).to eq(%w(site lon lat max min precip snow snowdepth))
118
+ end
113
119
 
114
- it "works for a hard case" do
115
- path = fixture_path(:accents_semicolon_windows_1252)
120
+ it "::#{method} works with odd encoding & emoji!" do
121
+ path = fixture_path(:encoding)
122
+ expect { CSV.read(path) }.to raise_error(ArgumentError)
116
123
 
117
- import = subject.read(path)
118
- expect(import[0][1]).to eq("Jérome")
119
- expect(import[1][0]).to eq("Héloise")
120
- end
124
+ import = subject_call_method(method, path)
125
+ expect(import[0].count).to eq(4)
126
+ end
121
127
 
122
- it "deals with a long, challenging file (and quickly)" do
123
- start_time = Time.now
124
- path = fixture_path(:never_ordered)
128
+ it "::#{method} works with an excel export" do
129
+ path = fixture_path(:excel)
125
130
 
126
- import = subject.read(path)
131
+ import = subject_call_method(method, path)
132
+ expect(import[0].count).to eq(24)
133
+ end
127
134
 
128
- expect(import[0].count).to eq(10)
129
- expect(import.count).to eq(32803)
130
- expect(Time.now).to be_within(5).of(start_time)
131
- end
135
+ it "::#{method} works with unescaped internal quotes" do
136
+ path = fixture_path(:internal_quotes)
137
+
138
+ import = subject_call_method(method, path)
139
+ expect(import[1][1]).to eq("123")
140
+ expect(import[1][2]).to eq("James Jimmy Doe")
141
+ end
132
142
 
133
- it "works when many invalid quote types contained" do
134
- path = fixture_path(:bad_quoting)
143
+ it "::#{method} works with escaped quotes" do
144
+ path = fixture_path(:escaped_quotes)
135
145
 
136
- expect { CSV.read(path) }.to raise_error(CSV::MalformedCSVError)
137
- expect {
138
- import = subject.read(path)
139
- expect(import.map(&:count).uniq).to eq([11])
140
- expect(import.count).to eq(8)
141
- }.not_to raise_error
142
- end
146
+ import = subject_call_method(method, path)
147
+ expect(import[0][1]).to eq("Lalo \"ElPapi\" Neymar")
148
+ expect(import[0][2]).to eq("lalo@example.com")
149
+ end
143
150
 
144
- it "strips leading/trailing blank lines" do
145
- path = fixture_path(:trailing_leading_blank_lines)
151
+ it "::#{method} works with an invalid escaped quotes case" do
152
+ path = fixture_path(:escaped_quotes_semicolons)
146
153
 
147
- import = subject.read(path)
148
- expect(import.first).not_to be_empty
149
- expect(import.last).not_to be_empty
150
- end
154
+ import = subject_call_method(method, path)
155
+ expect(import[0][0]).to eq("133")
156
+ expect(import[0][1]).to eq("z3268856")
157
+ expect(import[0][2]).to eq("stephen@example.com")
158
+ end
159
+
160
+ it "::#{method} works for a complicated case involving bad newlines and quote chars" do
161
+ path = fixture_path(:dos_line_ending)
162
+
163
+ import = subject_call_method(method, path)
164
+ expect(import[0].count).to eq(9)
165
+ end
166
+
167
+ it "::#{method} works for a hard case" do
168
+ path = fixture_path(:accents_semicolon_windows_1252)
169
+
170
+ import = subject_call_method(method, path)
171
+ expect(import[0][1]).to eq("Jérome")
172
+ expect(import[1][0]).to eq("Héloise")
173
+ end
174
+
175
+ it "::#{method} works when many invalid quote types contained" do
176
+ path = fixture_path(:bad_quoting)
177
+
178
+ expect {
179
+ import = subject_call_method(method, path)
180
+ expect(import.map(&:count).uniq).to eq([11])
181
+ expect(import.count).to eq(8)
182
+ }.not_to raise_error
183
+ end
184
+
185
+ it "::#{method} strips leading/trailing blank lines" do
186
+ path = fixture_path(:trailing_leading_blank_lines)
187
+
188
+ import = subject_call_method(method, path)
189
+ expect(import.first).not_to be_empty
190
+ expect(import.last).not_to be_empty
191
+ end
151
192
 
152
- it "maintains coherent column count when stripping blank lines" do
153
- [:blank_lines_crlf, :trailing_leading_blank_lines].each do |fixture_name|
154
- path = fixture_path(fixture_name)
193
+ it "::#{method} maintains coherent column count when stripping blank lines" do
194
+ [:blank_lines_crlf, :trailing_leading_blank_lines].each do |fixture_name|
195
+ path = fixture_path(fixture_name)
155
196
 
156
- import = subject.read(path)
157
- expect(import.map(&:length).uniq.size).to eq(1)
197
+ import = subject_call_method(method, path)
198
+ expect(import.map(&:length).uniq.size).to eq(1)
199
+ end
158
200
  end
159
201
  end
160
202
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hippie_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stephen O'Brien
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-01 00:00:00.000000000 Z
11
+ date: 2017-10-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -111,6 +111,7 @@ files:
111
111
  - spec/fixtures/malformed.csv
112
112
  - spec/fixtures/never_ordered.csv
113
113
  - spec/fixtures/normal.csv
114
+ - spec/fixtures/small_file.csv
114
115
  - spec/fixtures/trailing_leading_blank_lines.csv
115
116
  - spec/fixtures/with_byte_order_mark.csv
116
117
  - spec/hippie_csv/constants_spec.rb
@@ -138,7 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
138
139
  version: '0'
139
140
  requirements: []
140
141
  rubyforge_project:
141
- rubygems_version: 2.5.1
142
+ rubygems_version: 2.6.13
142
143
  signing_key:
143
144
  specification_version: 4
144
145
  summary: Tolerant, liberal CSV parsing
@@ -155,6 +156,7 @@ test_files:
155
156
  - spec/fixtures/malformed.csv
156
157
  - spec/fixtures/never_ordered.csv
157
158
  - spec/fixtures/normal.csv
159
+ - spec/fixtures/small_file.csv
158
160
  - spec/fixtures/trailing_leading_blank_lines.csv
159
161
  - spec/fixtures/with_byte_order_mark.csv
160
162
  - spec/hippie_csv/constants_spec.rb