datacatalog-importer 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +3 -1
- data/lib/utility.rb +49 -31
- data/spec/test.csv +1 -0
- data/spec/test.html +8 -0
- data/spec/utility_spec.rb +172 -36
- metadata +5 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{datacatalog-importer}
|
8
|
-
s.version = "0.2.2"
|
8
|
+
s.version = "0.2.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David James"]
|
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
"natdat_is_hungry.md",
|
41
41
|
"spec/spec.opts",
|
42
42
|
"spec/spec_helper.rb",
|
43
|
+
"spec/test.csv",
|
44
|
+
"spec/test.html",
|
43
45
|
"spec/test.json",
|
44
46
|
"spec/test.xml",
|
45
47
|
"spec/utility_spec.rb"
|
data/lib/utility.rb
CHANGED
@@ -57,7 +57,7 @@ module DataCatalog
|
|
57
57
|
|
58
58
|
def self.headers
|
59
59
|
{
|
60
|
-
"UserAgent" => "National Data Catalog Importer/0.2.
|
60
|
+
"UserAgent" => "National Data Catalog Importer/0.2.3",
|
61
61
|
}
|
62
62
|
end
|
63
63
|
|
@@ -67,10 +67,10 @@ module DataCatalog
|
|
67
67
|
# * http://jerith.livejournal.com/40063.html
|
68
68
|
# * http://lindsaar.net/2007/12/9/rbuf_filltimeout-error
|
69
69
|
def self.fetch(uri, options={})
|
70
|
-
max_attempts = options
|
71
|
-
retry_delay = options
|
72
|
-
quiet = options
|
73
|
-
attempts
|
70
|
+
max_attempts = options.delete(:max_attempts) || 3
|
71
|
+
retry_delay = options.delete(:retry_delay) || 5
|
72
|
+
quiet = options.delete(:quiet) || false
|
73
|
+
attempts = 0
|
74
74
|
loop do
|
75
75
|
begin
|
76
76
|
puts "Fetching #{uri}..." unless quiet
|
@@ -85,6 +85,12 @@ module DataCatalog
|
|
85
85
|
end
|
86
86
|
end
|
87
87
|
end
|
88
|
+
|
89
|
+
def self.remove_fetch_options(options={})
|
90
|
+
[:max_attempts, :retry_delay, :quiet].each do |opt|
|
91
|
+
options.delete(opt)
|
92
|
+
end
|
93
|
+
end
|
88
94
|
|
89
95
|
def self.report_timing(label)
|
90
96
|
puts "Starting: [#{label}]"
|
@@ -99,9 +105,9 @@ module DataCatalog
|
|
99
105
|
# == CSV ==
|
100
106
|
|
101
107
|
# { :headers => true } is a common option
|
102
|
-
def self.parse_csv_from_file(
|
108
|
+
def self.parse_csv_from_file(file, options={})
|
103
109
|
extra_header_rows = options.delete(:extra_header_rows) || 0
|
104
|
-
File.open(
|
110
|
+
File.open(file) do |f|
|
105
111
|
extra_header_rows.times { f.gets } # ignore these rows
|
106
112
|
FasterCSV.parse(f, options)
|
107
113
|
end
|
@@ -112,19 +118,22 @@ module DataCatalog
|
|
112
118
|
FasterCSV.parse(data, options)
|
113
119
|
end
|
114
120
|
|
115
|
-
def self.parse_csv_from_file_or_uri(
|
121
|
+
def self.parse_csv_from_file_or_uri(file, uri, options={})
|
116
122
|
force_fetch = options.delete(:force_fetch) || false
|
117
123
|
if force_fetch || !File.exist?(file)
|
118
|
-
|
119
|
-
File.open(file, "w") { |f| f.write(
|
124
|
+
data = fetch(uri, options)
|
125
|
+
File.open(file, "w") { |f| f.write(data) }
|
126
|
+
else
|
127
|
+
remove_fetch_options(options)
|
120
128
|
end
|
129
|
+
# Why always parse the file? See Note 001, below.
|
121
130
|
parse_csv_from_file(file, options)
|
122
131
|
end
|
123
132
|
|
124
133
|
# == HTML ==
|
125
134
|
|
126
|
-
def self.parse_html_from_file(
|
127
|
-
File.open(
|
135
|
+
def self.parse_html_from_file(file)
|
136
|
+
File.open(file) do |f|
|
128
137
|
Nokogiri::HTML::Document.parse(f)
|
129
138
|
end
|
130
139
|
end
|
@@ -134,10 +143,13 @@ module DataCatalog
|
|
134
143
|
Nokogiri::HTML::Document.parse(data)
|
135
144
|
end
|
136
145
|
|
137
|
-
def self.parse_html_from_file_or_uri(
|
138
|
-
|
139
|
-
|
140
|
-
|
146
|
+
def self.parse_html_from_file_or_uri(file, uri, options={})
|
147
|
+
force_fetch = options.delete(:force_fetch) || false
|
148
|
+
if force_fetch || !File.exist?(file)
|
149
|
+
data = fetch(uri, options)
|
150
|
+
File.open(file, "w") { |f| f.write(data) }
|
151
|
+
else
|
152
|
+
remove_fetch_options(options)
|
141
153
|
end
|
142
154
|
# Why always parse the file? See Note 001, below.
|
143
155
|
parse_html_from_file(file)
|
@@ -145,8 +157,8 @@ module DataCatalog
|
|
145
157
|
|
146
158
|
# == JSON
|
147
159
|
|
148
|
-
def self.parse_json_from_file(
|
149
|
-
File.open(
|
160
|
+
def self.parse_json_from_file(file)
|
161
|
+
File.open(file) do |f|
|
150
162
|
JSON.parse(f.read)
|
151
163
|
end
|
152
164
|
end
|
@@ -156,10 +168,13 @@ module DataCatalog
|
|
156
168
|
JSON.parse(data)
|
157
169
|
end
|
158
170
|
|
159
|
-
def self.parse_json_from_file_or_uri(
|
160
|
-
|
161
|
-
|
162
|
-
|
171
|
+
def self.parse_json_from_file_or_uri(file, uri, options={})
|
172
|
+
force_fetch = options.delete(:force_fetch) || false
|
173
|
+
if force_fetch || !File.exist?(file)
|
174
|
+
data = fetch(uri, options)
|
175
|
+
File.open(file, "w") { |f| f.write(data) }
|
176
|
+
else
|
177
|
+
remove_fetch_options(options)
|
163
178
|
end
|
164
179
|
# Why always parse the file? See Note 001, below.
|
165
180
|
parse_json_from_file(file)
|
@@ -167,8 +182,8 @@ module DataCatalog
|
|
167
182
|
|
168
183
|
# == XML
|
169
184
|
|
170
|
-
def self.parse_xml_from_file(
|
171
|
-
File.open(
|
185
|
+
def self.parse_xml_from_file(file)
|
186
|
+
File.open(file) do |f|
|
172
187
|
Nokogiri::XML::Document.parse(f)
|
173
188
|
end
|
174
189
|
end
|
@@ -178,10 +193,13 @@ module DataCatalog
|
|
178
193
|
Nokogiri::XML::Document.parse(data)
|
179
194
|
end
|
180
195
|
|
181
|
-
def self.parse_xml_from_file_or_uri(
|
182
|
-
|
183
|
-
|
184
|
-
|
196
|
+
def self.parse_xml_from_file_or_uri(file, uri, options={})
|
197
|
+
force_fetch = options.delete(:force_fetch) || false
|
198
|
+
if force_fetch || !File.exist?(file)
|
199
|
+
data = fetch(uri, options)
|
200
|
+
File.open(file, "w") { |f| f.write(data) }
|
201
|
+
else
|
202
|
+
remove_fetch_options(options)
|
185
203
|
end
|
186
204
|
# Why always parse the file? See Note 001, below.
|
187
205
|
parse_xml_from_file(file)
|
@@ -189,10 +207,10 @@ module DataCatalog
|
|
189
207
|
|
190
208
|
# == YAML
|
191
209
|
|
192
|
-
# To load YAML use: YAML::load_file(
|
210
|
+
# To load YAML use: YAML::load_file(file)
|
193
211
|
|
194
|
-
def self.write_yaml(
|
195
|
-
File.open(
|
212
|
+
def self.write_yaml(file, contents)
|
213
|
+
File.open(file, "w") do |f|
|
196
214
|
YAML::dump(contents, f)
|
197
215
|
end
|
198
216
|
end
|
data/spec/test.csv
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Metro Center,Dupont Circle
|
data/spec/test.html
ADDED
data/spec/utility_spec.rb
CHANGED
@@ -105,57 +105,193 @@ describe "Utility" do
|
|
105
105
|
end
|
106
106
|
end
|
107
107
|
|
108
|
-
|
108
|
+
describe "csv" do
|
109
|
+
describe "parse_csv_from_file" do
|
110
|
+
it "should work" do
|
111
|
+
file = File.dirname(__FILE__) + '/test.csv'
|
112
|
+
result = U.parse_csv_from_file(file)
|
113
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
114
|
+
end
|
115
|
+
end
|
109
116
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
117
|
+
describe "parse_csv_from_uri" do
|
118
|
+
it "should work" do
|
119
|
+
readable = Object.new
|
120
|
+
readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
|
121
|
+
U.stub(:open).and_return(readable)
|
122
|
+
result = U.parse_csv_from_uri("fake", :quiet => true)
|
123
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
describe "parse_csv_from_file_or_uri" do
|
128
|
+
it "should work when file present" do
|
129
|
+
file = File.dirname(__FILE__) + '/test.csv'
|
130
|
+
result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
|
131
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
132
|
+
end
|
133
|
+
|
134
|
+
it "should work when file missing but url ok" do
|
135
|
+
readable = Object.new
|
136
|
+
readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
|
137
|
+
U.stub(:open).and_return(readable)
|
138
|
+
file = File.dirname(__FILE__) + "/missing.csv"
|
139
|
+
FileUtils.rm(file) if File.exists?(file)
|
140
|
+
result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
|
141
|
+
FileUtils.rm(file) if File.exists?(file)
|
142
|
+
result.should == [["Metro Center", "Dupont Circle"]]
|
143
|
+
end
|
116
144
|
end
|
117
145
|
end
|
118
146
|
|
119
|
-
describe "
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
))
|
147
|
+
describe "html" do
|
148
|
+
describe "parse_html_from_file" do
|
149
|
+
it "should work" do
|
150
|
+
file = File.dirname(__FILE__) + '/test.html'
|
151
|
+
parsed = U.parse_html_from_file(file)
|
152
|
+
result = parsed.css('li').map(&:content)
|
153
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
154
|
+
end
|
128
155
|
end
|
129
156
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
157
|
+
describe "parse_html_from_uri" do
|
158
|
+
it "should work" do
|
159
|
+
readable = Object.new
|
160
|
+
readable.stub(:read).and_return(%(
|
161
|
+
<html>
|
162
|
+
<body>
|
163
|
+
<ol>
|
164
|
+
<li>Metro Center</li>
|
165
|
+
<li>Dupont Circle</li>
|
166
|
+
</ol>
|
167
|
+
</body>
|
168
|
+
</html>
|
169
|
+
))
|
170
|
+
U.stub(:open).and_return(readable)
|
171
|
+
parsed = U.parse_html_from_uri("fake", :quiet => true)
|
172
|
+
result = parsed.css('li').map(&:content)
|
173
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
describe "parse_html_from_file_or_uri" do
|
178
|
+
it "should work when file present" do
|
179
|
+
file = File.dirname(__FILE__) + '/test.html'
|
180
|
+
parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
|
181
|
+
result = parsed.css('li').map(&:content)
|
182
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
183
|
+
end
|
184
|
+
|
185
|
+
it "should work when file missing but url ok" do
|
186
|
+
readable = Object.new
|
187
|
+
readable.stub(:read).and_return(%(
|
188
|
+
<stations>
|
189
|
+
<station>Metro Center</station>
|
190
|
+
<station>Dupont Circle</station>
|
191
|
+
</stations>
|
192
|
+
))
|
193
|
+
U.stub(:open).and_return(readable)
|
194
|
+
file = File.dirname(__FILE__) + "/missing.html"
|
195
|
+
FileUtils.rm(file) if File.exists?(file)
|
196
|
+
parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
|
197
|
+
FileUtils.rm(file) if File.exists?(file)
|
198
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
199
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
200
|
+
end
|
135
201
|
end
|
136
202
|
end
|
137
203
|
|
138
|
-
|
204
|
+
describe "json" do
|
205
|
+
describe "parse_json_from_file" do
|
206
|
+
it "should work" do
|
207
|
+
file = File.dirname(__FILE__) + '/test.json'
|
208
|
+
U.parse_json_from_file(file).should ==
|
209
|
+
{ "stations" => ["Metro Center", "Dupont Circle"] }
|
210
|
+
end
|
211
|
+
end
|
139
212
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
213
|
+
describe "parse_json_from_uri" do
|
214
|
+
it "should work" do
|
215
|
+
readable = Object.new
|
216
|
+
readable.stub(:read).and_return(
|
217
|
+
%({"stations":["Metro Center","Dupont Circle"]}))
|
218
|
+
U.stub(:open).and_return(readable)
|
219
|
+
result = U.parse_json_from_uri("fake", :quiet => true)
|
220
|
+
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
describe "parse_json_from_file_or_uri" do
|
225
|
+
it "should work when file present" do
|
226
|
+
file = File.dirname(__FILE__) + '/test.json'
|
227
|
+
result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
|
228
|
+
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
229
|
+
end
|
230
|
+
|
231
|
+
it "should work when file missing but url ok" do
|
232
|
+
readable = Object.new
|
233
|
+
readable.stub(:read).and_return(
|
234
|
+
%({"stations":["Metro Center","Dupont Circle"]}))
|
235
|
+
U.stub(:open).and_return(readable)
|
236
|
+
file = File.dirname(__FILE__) + "/missing.json"
|
237
|
+
FileUtils.rm(file) if File.exists?(file)
|
238
|
+
result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
|
239
|
+
FileUtils.rm(file) if File.exists?(file)
|
240
|
+
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
241
|
+
end
|
145
242
|
end
|
146
243
|
end
|
147
244
|
|
148
|
-
describe "
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
245
|
+
describe "xml" do
|
246
|
+
describe "parse_xml_from_file" do
|
247
|
+
it "should work" do
|
248
|
+
file = File.dirname(__FILE__) + '/test.xml'
|
249
|
+
parsed = U.parse_xml_from_file(file)
|
250
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
251
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
252
|
+
end
|
153
253
|
end
|
154
254
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
255
|
+
describe "parse_xml_from_uri" do
|
256
|
+
it "should work" do
|
257
|
+
readable = Object.new
|
258
|
+
readable.stub(:read).and_return(%(
|
259
|
+
<stations>
|
260
|
+
<station>Metro Center</station>
|
261
|
+
<station>Dupont Circle</station>
|
262
|
+
</stations>
|
263
|
+
))
|
264
|
+
U.stub(:open).and_return(readable)
|
265
|
+
parsed = U.parse_xml_from_uri("fake", :quiet => true)
|
266
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
267
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
describe "parse_xml_from_file_or_uri" do
|
272
|
+
it "should work when file present" do
|
273
|
+
file = File.dirname(__FILE__) + '/test.xml'
|
274
|
+
parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
|
275
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
276
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
277
|
+
end
|
278
|
+
|
279
|
+
it "should work when file missing but url ok" do
|
280
|
+
readable = Object.new
|
281
|
+
readable.stub(:read).and_return(%(
|
282
|
+
<stations>
|
283
|
+
<station>Metro Center</station>
|
284
|
+
<station>Dupont Circle</station>
|
285
|
+
</stations>
|
286
|
+
))
|
287
|
+
U.stub(:open).and_return(readable)
|
288
|
+
file = File.dirname(__FILE__) + "/missing.xml"
|
289
|
+
FileUtils.rm(file) if File.exists?(file)
|
290
|
+
parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
|
291
|
+
FileUtils.rm(file) if File.exists?(file)
|
292
|
+
result = parsed.xpath('.//stations/station').map(&:content)
|
293
|
+
result.should == ["Metro Center", "Dupont Circle"]
|
294
|
+
end
|
159
295
|
end
|
160
296
|
end
|
161
297
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 2
|
10
|
-
version: 0.2.2
|
9
|
+
- 3
|
10
|
+
version: 0.2.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- David James
|
@@ -99,6 +99,8 @@ files:
|
|
99
99
|
- natdat_is_hungry.md
|
100
100
|
- spec/spec.opts
|
101
101
|
- spec/spec_helper.rb
|
102
|
+
- spec/test.csv
|
103
|
+
- spec/test.html
|
102
104
|
- spec/test.json
|
103
105
|
- spec/test.xml
|
104
106
|
- spec/utility_spec.rb
|