datacatalog-importer 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +3 -1
- data/lib/utility.rb +49 -31
- data/spec/test.csv +1 -0
- data/spec/test.html +8 -0
- data/spec/utility_spec.rb +172 -36
- metadata +5 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.2.2"
|
8
|
+
s.version = "0.2.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
"natdat_is_hungry.md",
|
41
41
|
"spec/spec.opts",
|
42
42
|
"spec/spec_helper.rb",
|
43
|
+
"spec/test.csv",
|
44
|
+
"spec/test.html",
|
43
45
|
"spec/test.json",
|
44
46
|
"spec/test.xml",
|
45
47
|
"spec/utility_spec.rb"
|
data/lib/utility.rb
CHANGED
@@ -57,7 +57,7 @@ module DataCatalog
|
|
57
57
|
|
58
58
|
def self.headers
|
59
59
|
{
|
60
|
-
"UserAgent" => "National Data Catalog Importer/0.2.
|
60
|
+
"UserAgent" => "National Data Catalog Importer/0.2.3",
|
61
61
|
}
|
62
62
|
end
|
63
63
|
|
@@ -67,10 +67,10 @@ module DataCatalog
|
|
67
67
|
# * http://jerith.livejournal.com/40063.html
|
68
68
|
# * http://lindsaar.net/2007/12/9/rbuf_filltimeout-error
|
69
69
|
def self.fetch(uri, options={})
|
70
|
-
max_attempts = options
|
71
|
-
retry_delay = options
|
72
|
-
quiet = options
|
73
|
-
attempts
|
70
|
+
max_attempts = options.delete(:max_attempts) || 3
|
71
|
+
retry_delay = options.delete(:retry_delay) || 5
|
72
|
+
quiet = options.delete(:quiet) || false
|
73
|
+
attempts = 0
|
74
74
|
loop do
|
75
75
|
begin
|
76
76
|
puts "Fetching #{uri}..." unless quiet
|
@@ -85,6 +85,12 @@ module DataCatalog
|
|
85
85
|
end
|
86
86
|
end
|
87
87
|
end
|
88
|
+
|
89
|
+
def self.remove_fetch_options(options={})
|
90
|
+
[:max_attempts, :retry_delay, :quiet].each do |opt|
|
91
|
+
options.delete(opt)
|
92
|
+
end
|
93
|
+
end
|
88
94
|
|
89
95
|
def self.report_timing(label)
|
90
96
|
puts "Starting: [#{label}]"
|
@@ -99,9 +105,9 @@ module DataCatalog
|
|
99
105
|
# == CSV ==
|
100
106
|
|
101
107
|
# { :headers => true } is a common option
|
102
|
-
def self.parse_csv_from_file(
|
108
|
+
def self.parse_csv_from_file(file, options={})
|
103
109
|
extra_header_rows = options.delete(:extra_header_rows) || 0
|
104
|
-
File.open(
|
110
|
+
File.open(file) do |f|
|
105
111
|
extra_header_rows.times { f.gets } # ignore these rows
|
106
112
|
FasterCSV.parse(f, options)
|
107
113
|
end
|
@@ -112,19 +118,22 @@ module DataCatalog
|
|
112
118
|
FasterCSV.parse(data, options)
|
113
119
|
end
|
114
120
|
|
115
|
-
def self.parse_csv_from_file_or_uri(
|
121
|
+
def self.parse_csv_from_file_or_uri(file, uri, options={})
|
116
122
|
force_fetch = options.delete(:force_fetch) || false
|
117
123
|
if force_fetch || !File.exist?(file)
|
118
|
-
|
119
|
-
File.open(file, "w") { |f| f.write(
|
124
|
+
data = fetch(uri, options)
|
125
|
+
File.open(file, "w") { |f| f.write(data) }
|
126
|
+
else
|
127
|
+
remove_fetch_options(options)
|
120
128
|
end
|
129
|
+
# Why always parse the file? See Note 001, below.
|
121
130
|
parse_csv_from_file(file, options)
|
122
131
|
end
|
123
132
|
|
124
133
|
# == HTML ==
|
125
134
|
|
126
|
-
def self.parse_html_from_file(
|
127
|
-
File.open(
|
135
|
+
def self.parse_html_from_file(file)
|
136
|
+
File.open(file) do |f|
|
128
137
|
Nokogiri::HTML::Document.parse(f)
|
129
138
|
end
|
130
139
|
end
|
@@ -134,10 +143,13 @@ module DataCatalog
|
|
134
143
|
Nokogiri::HTML::Document.parse(data)
|
135
144
|
end
|
136
145
|
|
137
|
-
def self.parse_html_from_file_or_uri(
|
138
|
-
|
139
|
-
|
140
|
-
|
146
|
+
def self.parse_html_from_file_or_uri(file, uri, options={})
|
147
|
+
force_fetch = options.delete(:force_fetch) || false
|
148
|
+
if force_fetch || !File.exist?(file)
|
149
|
+
data = fetch(uri, options)
|
150
|
+
File.open(file, "w") { |f| f.write(data) }
|
151
|
+
else
|
152
|
+
remove_fetch_options(options)
|
141
153
|
end
|
142
154
|
# Why always parse the file? See Note 001, below.
|
143
155
|
parse_html_from_file(file)
|
@@ -145,8 +157,8 @@ module DataCatalog
|
|
145
157
|
|
146
158
|
# == JSON
|
147
159
|
|
148
|
-
def self.parse_json_from_file(
|
149
|
-
File.open(
|
160
|
+
def self.parse_json_from_file(file)
|
161
|
+
File.open(file) do |f|
|
150
162
|
JSON.parse(f.read)
|
151
163
|
end
|
152
164
|
end
|
@@ -156,10 +168,13 @@ module DataCatalog
|
|
156
168
|
JSON.parse(data)
|
157
169
|
end
|
158
170
|
|
159
|
-
def self.parse_json_from_file_or_uri(
|
160
|
-
|
161
|
-
|
162
|
-
|
171
|
+
def self.parse_json_from_file_or_uri(file, uri, options={})
|
172
|
+
force_fetch = options.delete(:force_fetch) || false
|
173
|
+
if force_fetch || !File.exist?(file)
|
174
|
+
data = fetch(uri, options)
|
175
|
+
File.open(file, "w") { |f| f.write(data) }
|
176
|
+
else
|
177
|
+
remove_fetch_options(options)
|
163
178
|
end
|
164
179
|
# Why always parse the file? See Note 001, below.
|
165
180
|
parse_json_from_file(file)
|
@@ -167,8 +182,8 @@ module DataCatalog
|
|
167
182
|
|
168
183
|
# == XML
|
169
184
|
|
170
|
-
def self.parse_xml_from_file(
|
171
|
-
File.open(
|
185
|
+
def self.parse_xml_from_file(file)
|
186
|
+
File.open(file) do |f|
|
172
187
|
Nokogiri::XML::Document.parse(f)
|
173
188
|
end
|
174
189
|
end
|
@@ -178,10 +193,13 @@ module DataCatalog
|
|
178
193
|
Nokogiri::XML::Document.parse(data)
|
179
194
|
end
|
180
195
|
|
181
|
-
def self.parse_xml_from_file_or_uri(
|
182
|
-
|
183
|
-
|
184
|
-
|
196
|
+
def self.parse_xml_from_file_or_uri(file, uri, options={})
|
197
|
+
force_fetch = options.delete(:force_fetch) || false
|
198
|
+
if force_fetch || !File.exist?(file)
|
199
|
+
data = fetch(uri, options)
|
200
|
+
File.open(file, "w") { |f| f.write(data) }
|
201
|
+
else
|
202
|
+
remove_fetch_options(options)
|
185
203
|
end
|
186
204
|
# Why always parse the file? See Note 001, below.
|
187
205
|
parse_xml_from_file(file)
|
@@ -189,10 +207,10 @@ module DataCatalog
|
|
189
207
|
|
190
208
|
# == YAML
|
191
209
|
|
192
|
-
# To load YAML use: YAML::load_file(
|
210
|
+
# To load YAML use: YAML::load_file(file)
|
193
211
|
|
194
|
-
def self.write_yaml(
|
195
|
-
File.open(
|
212
|
+
def self.write_yaml(file, contents)
|
213
|
+
File.open(file, "w") do |f|
|
196
214
|
YAML::dump(contents, f)
|
197
215
|
end
|
198
216
|
end
|
data/spec/test.csv
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Metro Center,Dupont Circle
|
data/spec/test.html
ADDED
data/spec/utility_spec.rb
CHANGED
@@ -105,57 +105,193 @@ describe "Utility" do
|
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
108
|
-
|
108
|
+
describe "csv" do
|
109
|
+
describe "parse_csv_from_file" do
|
110
|
+
it "should work" do
|
111
|
+
file = File.dirname(__FILE__) + '/test.csv'
|
112
|
+
result = U.parse_csv_from_file(file)
|
113
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
114
|
+
end
|
115
|
+
end
|
109
116
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
117
|
+
describe "parse_csv_from_uri" do
|
118
|
+
it "should work" do
|
119
|
+
readable = Object.new
|
120
|
+
readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
|
121
|
+
U.stub(:open).and_return(readable)
|
122
|
+
result = U.parse_csv_from_uri("fake", :quiet => true)
|
123
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
describe "parse_csv_from_file_or_uri" do
|
128
|
+
it "should work when file present" do
|
129
|
+
file = File.dirname(__FILE__) + '/test.csv'
|
130
|
+
result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
|
131
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
132
|
+
end
|
133
|
+
|
134
|
+
it "should work when file missing but url ok" do
|
135
|
+
readable = Object.new
|
136
|
+
readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
|
137
|
+
U.stub(:open).and_return(readable)
|
138
|
+
file = File.dirname(__FILE__) + "/missing.csv"
|
139
|
+
FileUtils.rm(file) if File.exists?(file)
|
140
|
+
result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
|
141
|
+
FileUtils.rm(file) if File.exists?(file)
|
142
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
143
|
+
end
|
116
144
|
end
|
117
145
|
end
|
118
146
|
|
119
|
-
describe "
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
))
|
147
|
+
describe "html" do
|
148
|
+
describe "parse_html_from_file" do
|
149
|
+
it "should work" do
|
150
|
+
file = File.dirname(__FILE__) + '/test.html'
|
151
|
+
parsed = U.parse_html_from_file(file)
|
152
|
+
result = parsed.css('li').map(&:content)
|
153
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
154
|
+
end
|
128
155
|
end
|
129
156
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
157
|
+
describe "parse_html_from_uri" do
|
158
|
+
it "should work" do
|
159
|
+
readable = Object.new
|
160
|
+
readable.stub(:read).and_return(%(
|
161
|
+
<html>
|
162
|
+
<body>
|
163
|
+
<ol>
|
164
|
+
<li>Metro Center</li>
|
165
|
+
<li>Dupont Circle</li>
|
166
|
+
</ol>
|
167
|
+
</body>
|
168
|
+
</html>
|
169
|
+
))
|
170
|
+
U.stub(:open).and_return(readable)
|
171
|
+
parsed = U.parse_html_from_uri("fake", :quiet => true)
|
172
|
+
result = parsed.css('li').map(&:content)
|
173
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
describe "parse_html_from_file_or_uri" do
|
178
|
+
it "should work when file present" do
|
179
|
+
file = File.dirname(__FILE__) + '/test.html'
|
180
|
+
parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
|
181
|
+
result = parsed.css('li').map(&:content)
|
182
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
183
|
+
end
|
184
|
+
|
185
|
+
it "should work when file missing but url ok" do
|
186
|
+
readable = Object.new
|
187
|
+
readable.stub(:read).and_return(%(
|
188
|
+
<stations>
|
189
|
+
<station>Metro Center</station>
|
190
|
+
<station>Dupont Circle</station>
|
191
|
+
</stations>
|
192
|
+
))
|
193
|
+
U.stub(:open).and_return(readable)
|
194
|
+
file = File.dirname(__FILE__) + "/missing.html"
|
195
|
+
FileUtils.rm(file) if File.exists?(file)
|
196
|
+
parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
|
197
|
+
FileUtils.rm(file) if File.exists?(file)
|
198
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
199
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
200
|
+
end
|
135
201
|
end
|
136
202
|
end
|
137
203
|
|
138
|
-
|
204
|
+
describe "json" do
|
205
|
+
describe "parse_json_from_file" do
|
206
|
+
it "should work" do
|
207
|
+
file = File.dirname(__FILE__) + '/test.json'
|
208
|
+
U.parse_json_from_file(file).should ==
|
209
|
+
{ "stations" => ["Metro Center", "Dupont Circle"] }
|
210
|
+
end
|
211
|
+
end
|
139
212
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
213
|
+
describe "parse_json_from_uri" do
|
214
|
+
it "should work" do
|
215
|
+
readable = Object.new
|
216
|
+
readable.stub(:read).and_return(
|
217
|
+
%({"stations":["Metro Center","Dupont Circle"]}))
|
218
|
+
U.stub(:open).and_return(readable)
|
219
|
+
result = U.parse_json_from_uri("fake", :quiet => true)
|
220
|
+
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
describe "parse_json_from_file_or_uri" do
|
225
|
+
it "should work when file present" do
|
226
|
+
file = File.dirname(__FILE__) + '/test.json'
|
227
|
+
result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
|
228
|
+
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
229
|
+
end
|
230
|
+
|
231
|
+
it "should work when file missing but url ok" do
|
232
|
+
readable = Object.new
|
233
|
+
readable.stub(:read).and_return(
|
234
|
+
%({"stations":["Metro Center","Dupont Circle"]}))
|
235
|
+
U.stub(:open).and_return(readable)
|
236
|
+
file = File.dirname(__FILE__) + "/missing.json"
|
237
|
+
FileUtils.rm(file) if File.exists?(file)
|
238
|
+
result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
|
239
|
+
FileUtils.rm(file) if File.exists?(file)
|
240
|
+
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
241
|
+
end
|
145
242
|
end
|
146
243
|
end
|
147
244
|
|
148
|
-
describe "
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
245
|
+
describe "xml" do
|
246
|
+
describe "parse_xml_from_file" do
|
247
|
+
it "should work" do
|
248
|
+
file = File.dirname(__FILE__) + '/test.xml'
|
249
|
+
parsed = U.parse_xml_from_file(file)
|
250
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
251
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
252
|
+
end
|
153
253
|
end
|
154
254
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
255
|
+
describe "parse_xml_from_uri" do
|
256
|
+
it "should work" do
|
257
|
+
readable = Object.new
|
258
|
+
readable.stub(:read).and_return(%(
|
259
|
+
<stations>
|
260
|
+
<station>Metro Center</station>
|
261
|
+
<station>Dupont Circle</station>
|
262
|
+
</stations>
|
263
|
+
))
|
264
|
+
U.stub(:open).and_return(readable)
|
265
|
+
parsed = U.parse_xml_from_uri("fake", :quiet => true)
|
266
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
267
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
describe "parse_xml_from_file_or_uri" do
|
272
|
+
it "should work when file present" do
|
273
|
+
file = File.dirname(__FILE__) + '/test.xml'
|
274
|
+
parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
|
275
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
276
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
277
|
+
end
|
278
|
+
|
279
|
+
it "should work when file missing but url ok" do
|
280
|
+
readable = Object.new
|
281
|
+
readable.stub(:read).and_return(%(
|
282
|
+
<stations>
|
283
|
+
<station>Metro Center</station>
|
284
|
+
<station>Dupont Circle</station>
|
285
|
+
</stations>
|
286
|
+
))
|
287
|
+
U.stub(:open).and_return(readable)
|
288
|
+
file = File.dirname(__FILE__) + "/missing.xml"
|
289
|
+
FileUtils.rm(file) if File.exists?(file)
|
290
|
+
parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
|
291
|
+
FileUtils.rm(file) if File.exists?(file)
|
292
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
293
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
294
|
+
end
|
159
295
|
end
|
160
296
|
end
|
161
297
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 2
|
10
|
-
version: 0.2.2
|
9
|
+
- 3
|
10
|
+
version: 0.2.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- David James
|
@@ -99,6 +99,8 @@ files:
|
|
99
99
|
- natdat_is_hungry.md
|
100
100
|
- spec/spec.opts
|
101
101
|
- spec/spec_helper.rb
|
102
|
+
- spec/test.csv
|
103
|
+
- spec/test.html
|
102
104
|
- spec/test.json
|
103
105
|
- spec/test.xml
|
104
106
|
- spec/utility_spec.rb
|