datacatalog-importer 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +1 -1
- data/lib/utility.rb +35 -88
- data/spec/utility_spec.rb +28 -28
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.3
|
1
|
+
0.3.0
|
data/lib/utility.rb
CHANGED
@@ -57,7 +57,7 @@ module DataCatalog
|
|
57
57
|
|
58
58
|
def self.headers
|
59
59
|
{
|
60
|
-
"UserAgent" => "National Data Catalog Importer/0.2.3",
|
60
|
+
"UserAgent" => "National Data Catalog Importer/0.3.0",
|
61
61
|
}
|
62
62
|
end
|
63
63
|
|
@@ -101,99 +101,46 @@ module DataCatalog
|
|
101
101
|
puts "Elapsed time [#{label}] %.2f s" % diff
|
102
102
|
result
|
103
103
|
end
|
104
|
-
|
105
|
-
# ==
|
106
|
-
|
107
|
-
|
108
|
-
def self.parse_csv_from_file(file, options={})
|
109
|
-
extra_header_rows = options.delete(:extra_header_rows) || 0
|
110
|
-
File.open(file) do |f|
|
111
|
-
extra_header_rows.times { f.gets } # ignore these rows
|
112
|
-
FasterCSV.parse(f, options)
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def self.parse_csv_from_uri(uri, options={})
|
117
|
-
data = fetch(uri, options)
|
118
|
-
FasterCSV.parse(data, options)
|
119
|
-
end
|
120
|
-
|
121
|
-
def self.parse_csv_from_file_or_uri(file, uri, options={})
|
122
|
-
force_fetch = options.delete(:force_fetch) || false
|
123
|
-
if force_fetch || !File.exist?(file)
|
124
|
-
data = fetch(uri, options)
|
125
|
-
File.open(file, "w") { |f| f.write(data) }
|
126
|
-
else
|
127
|
-
remove_fetch_options(options)
|
128
|
-
end
|
129
|
-
# Why always parse the file? See Note 001, below.
|
130
|
-
parse_csv_from_file(file, options)
|
131
|
-
end
|
132
|
-
|
133
|
-
# == HTML ==
|
134
|
-
|
135
|
-
def self.parse_html_from_file(file)
|
136
|
-
File.open(file) do |f|
|
137
|
-
Nokogiri::HTML::Document.parse(f)
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def self.parse_html_from_uri(uri, options={})
|
142
|
-
data = fetch(uri, options)
|
143
|
-
Nokogiri::HTML::Document.parse(data)
|
144
|
-
end
|
145
|
-
|
146
|
-
def self.parse_html_from_file_or_uri(file, uri, options={})
|
147
|
-
force_fetch = options.delete(:force_fetch) || false
|
148
|
-
if force_fetch || !File.exist?(file)
|
149
|
-
data = fetch(uri, options)
|
150
|
-
File.open(file, "w") { |f| f.write(data) }
|
151
|
-
else
|
152
|
-
remove_fetch_options(options)
|
153
|
-
end
|
154
|
-
# Why always parse the file? See Note 001, below.
|
155
|
-
parse_html_from_file(file)
|
156
|
-
end
|
157
|
-
|
158
|
-
# == JSON
|
159
|
-
|
160
|
-
def self.parse_json_from_file(file)
|
104
|
+
|
105
|
+
# == Parsing ===
|
106
|
+
|
107
|
+
def self.parse_file(format, file, options={})
|
161
108
|
File.open(file) do |f|
|
162
|
-
|
109
|
+
case format
|
110
|
+
when :csv
|
111
|
+
extra_header_rows = options.delete(:extra_header_rows) || 0
|
112
|
+
extra_header_rows.times { f.gets } # ignore these rows
|
113
|
+
FasterCSV.parse(f, options)
|
114
|
+
when :xml
|
115
|
+
Nokogiri::XML::Document.parse(f)
|
116
|
+
when :json
|
117
|
+
JSON.parse(f.read)
|
118
|
+
when :html
|
119
|
+
Nokogiri::HTML::Document.parse(f)
|
120
|
+
else
|
121
|
+
raise "Unexpected format : #{format.inspect}"
|
122
|
+
end
|
163
123
|
end
|
164
124
|
end
|
165
|
-
|
166
|
-
def self.parse_json_from_uri(uri, options={})
|
125
|
+
|
126
|
+
def self.parse_uri(format, uri, options={})
|
167
127
|
data = fetch(uri, options)
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
128
|
+
case format
|
129
|
+
when :csv
|
130
|
+
# TODO: support extra_header_rows option
|
131
|
+
FasterCSV.parse(data, options)
|
132
|
+
when :xml
|
133
|
+
Nokogiri::XML::Document.parse(data)
|
134
|
+
when :json
|
135
|
+
JSON.parse(data)
|
136
|
+
when :html
|
137
|
+
Nokogiri::HTML::Document.parse(data)
|
176
138
|
else
|
177
|
-
|
139
|
+
raise "Unexpected format : #{format.inspect}"
|
178
140
|
end
|
179
|
-
# Why always parse the file? See Note 001, below.
|
180
|
-
parse_json_from_file(file)
|
181
141
|
end
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
def self.parse_xml_from_file(file)
|
186
|
-
File.open(file) do |f|
|
187
|
-
Nokogiri::XML::Document.parse(f)
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
def self.parse_xml_from_uri(uri, options={})
|
192
|
-
data = fetch(uri, options)
|
193
|
-
Nokogiri::XML::Document.parse(data)
|
194
|
-
end
|
195
|
-
|
196
|
-
def self.parse_xml_from_file_or_uri(file, uri, options={})
|
142
|
+
|
143
|
+
def self.parse_file_or_uri(format, file, uri, options={})
|
197
144
|
force_fetch = options.delete(:force_fetch) || false
|
198
145
|
if force_fetch || !File.exist?(file)
|
199
146
|
data = fetch(uri, options)
|
@@ -202,7 +149,7 @@ module DataCatalog
|
|
202
149
|
remove_fetch_options(options)
|
203
150
|
end
|
204
151
|
# Why always parse the file? See Note 001, below.
|
205
|
-
|
152
|
+
parse_file(format, file, options)
|
206
153
|
end
|
207
154
|
|
208
155
|
# == YAML
|
data/spec/utility_spec.rb
CHANGED
@@ -106,28 +106,28 @@ describe "Utility" do
|
|
106
106
|
end
|
107
107
|
|
108
108
|
describe "csv" do
|
109
|
-
describe "
|
109
|
+
describe "parse_file" do
|
110
110
|
it "should work" do
|
111
111
|
file = File.dirname(__FILE__) + '/test.csv'
|
112
|
-
result = U.parse_csv_from_file(file)
|
112
|
+
result = U.parse_file(:csv, file)
|
113
113
|
result.should == [["Metro Center", "Dupont Circle"]]
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
117
|
-
describe "
|
117
|
+
describe "parse_uri" do
|
118
118
|
it "should work" do
|
119
119
|
readable = Object.new
|
120
120
|
readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
|
121
121
|
U.stub(:open).and_return(readable)
|
122
|
-
result = U.parse_csv_from_uri("fake", :quiet => true)
|
122
|
+
result = U.parse_uri(:csv, "fake", :quiet => true)
|
123
123
|
result.should == [["Metro Center", "Dupont Circle"]]
|
124
124
|
end
|
125
125
|
end
|
126
126
|
|
127
|
-
describe "
|
127
|
+
describe "parse_file_or_uri" do
|
128
128
|
it "should work when file present" do
|
129
129
|
file = File.dirname(__FILE__) + '/test.csv'
|
130
|
-
result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
|
130
|
+
result = U.parse_file_or_uri(:csv, file, "fake", :quiet => true)
|
131
131
|
result.should == [["Metro Center", "Dupont Circle"]]
|
132
132
|
end
|
133
133
|
|
@@ -137,7 +137,7 @@ describe "Utility" do
|
|
137
137
|
U.stub(:open).and_return(readable)
|
138
138
|
file = File.dirname(__FILE__) + "/missing.csv"
|
139
139
|
FileUtils.rm(file) if File.exists?(file)
|
140
|
-
result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
|
140
|
+
result = U.parse_file_or_uri(:csv, file, "fake", :quiet => true)
|
141
141
|
FileUtils.rm(file) if File.exists?(file)
|
142
142
|
result.should == [["Metro Center", "Dupont Circle"]]
|
143
143
|
end
|
@@ -145,16 +145,16 @@ describe "Utility" do
|
|
145
145
|
end
|
146
146
|
|
147
147
|
describe "html" do
|
148
|
-
describe "
|
148
|
+
describe "parse_file" do
|
149
149
|
it "should work" do
|
150
150
|
file = File.dirname(__FILE__) + '/test.html'
|
151
|
-
parsed = U.parse_html_from_file(file)
|
151
|
+
parsed = U.parse_file(:html, file)
|
152
152
|
result = parsed.css('li').map(&:content)
|
153
153
|
result.should == ["Metro Center", "Dupont Circle"]
|
154
154
|
end
|
155
155
|
end
|
156
156
|
|
157
|
-
describe "
|
157
|
+
describe "parse_uri" do
|
158
158
|
it "should work" do
|
159
159
|
readable = Object.new
|
160
160
|
readable.stub(:read).and_return(%(
|
@@ -168,16 +168,16 @@ describe "Utility" do
|
|
168
168
|
</html>
|
169
169
|
))
|
170
170
|
U.stub(:open).and_return(readable)
|
171
|
-
parsed = U.parse_html_from_uri("fake", :quiet => true)
|
171
|
+
parsed = U.parse_uri(:html, "fake", :quiet => true)
|
172
172
|
result = parsed.css('li').map(&:content)
|
173
173
|
result.should == ["Metro Center", "Dupont Circle"]
|
174
174
|
end
|
175
175
|
end
|
176
176
|
|
177
|
-
describe "
|
177
|
+
describe "parse_file_or_uri" do
|
178
178
|
it "should work when file present" do
|
179
179
|
file = File.dirname(__FILE__) + '/test.html'
|
180
|
-
parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
|
180
|
+
parsed = U.parse_file_or_uri(:html, file, "fake", :quiet => true)
|
181
181
|
result = parsed.css('li').map(&:content)
|
182
182
|
result.should == ["Metro Center", "Dupont Circle"]
|
183
183
|
end
|
@@ -193,7 +193,7 @@ describe "Utility" do
|
|
193
193
|
U.stub(:open).and_return(readable)
|
194
194
|
file = File.dirname(__FILE__) + "/missing.html"
|
195
195
|
FileUtils.rm(file) if File.exists?(file)
|
196
|
-
parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
|
196
|
+
parsed = U.parse_file_or_uri(:html, file, "fake", :quiet => true)
|
197
197
|
FileUtils.rm(file) if File.exists?(file)
|
198
198
|
result = parsed.xpath('.//stations/station').map(&:content)
|
199
199
|
result.should == ["Metro Center", "Dupont Circle"]
|
@@ -202,29 +202,29 @@ describe "Utility" do
|
|
202
202
|
end
|
203
203
|
|
204
204
|
describe "json" do
|
205
|
-
describe "
|
205
|
+
describe "parse_file" do
|
206
206
|
it "should work" do
|
207
207
|
file = File.dirname(__FILE__) + '/test.json'
|
208
|
-
U.parse_json_from_file(file).should ==
|
208
|
+
U.parse_file(:json, file).should ==
|
209
209
|
{ "stations" => ["Metro Center", "Dupont Circle"] }
|
210
210
|
end
|
211
211
|
end
|
212
212
|
|
213
|
-
describe "
|
213
|
+
describe "parse_uri" do
|
214
214
|
it "should work" do
|
215
215
|
readable = Object.new
|
216
216
|
readable.stub(:read).and_return(
|
217
217
|
%({"stations":["Metro Center","Dupont Circle"]}))
|
218
218
|
U.stub(:open).and_return(readable)
|
219
|
-
result = U.parse_json_from_uri("fake", :quiet => true)
|
219
|
+
result = U.parse_uri(:json, "fake", :quiet => true)
|
220
220
|
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
221
221
|
end
|
222
222
|
end
|
223
223
|
|
224
|
-
describe "
|
224
|
+
describe "parse_file_or_uri" do
|
225
225
|
it "should work when file present" do
|
226
226
|
file = File.dirname(__FILE__) + '/test.json'
|
227
|
-
result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
|
227
|
+
result = U.parse_file_or_uri(:json, file, "fake", :quiet => true)
|
228
228
|
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
229
229
|
end
|
230
230
|
|
@@ -235,7 +235,7 @@ describe "Utility" do
|
|
235
235
|
U.stub(:open).and_return(readable)
|
236
236
|
file = File.dirname(__FILE__) + "/missing.json"
|
237
237
|
FileUtils.rm(file) if File.exists?(file)
|
238
|
-
result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
|
238
|
+
result = U.parse_file_or_uri(:json, file, "fake", :quiet => true)
|
239
239
|
FileUtils.rm(file) if File.exists?(file)
|
240
240
|
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
241
241
|
end
|
@@ -243,16 +243,16 @@ describe "Utility" do
|
|
243
243
|
end
|
244
244
|
|
245
245
|
describe "xml" do
|
246
|
-
describe "
|
246
|
+
describe "parse_file" do
|
247
247
|
it "should work" do
|
248
248
|
file = File.dirname(__FILE__) + '/test.xml'
|
249
|
-
parsed = U.parse_xml_from_file(file)
|
249
|
+
parsed = U.parse_file(:xml, file)
|
250
250
|
result = parsed.xpath('.//stations/station').map(&:content)
|
251
251
|
result.should == ["Metro Center", "Dupont Circle"]
|
252
252
|
end
|
253
253
|
end
|
254
254
|
|
255
|
-
describe "
|
255
|
+
describe "parse_uri" do
|
256
256
|
it "should work" do
|
257
257
|
readable = Object.new
|
258
258
|
readable.stub(:read).and_return(%(
|
@@ -262,16 +262,16 @@ describe "Utility" do
|
|
262
262
|
</stations>
|
263
263
|
))
|
264
264
|
U.stub(:open).and_return(readable)
|
265
|
-
parsed = U.parse_xml_from_uri("fake", :quiet => true)
|
265
|
+
parsed = U.parse_uri(:xml, "fake", :quiet => true)
|
266
266
|
result = parsed.xpath('.//stations/station').map(&:content)
|
267
267
|
result.should == ["Metro Center", "Dupont Circle"]
|
268
268
|
end
|
269
269
|
end
|
270
270
|
|
271
|
-
describe "
|
271
|
+
describe "parse_file_or_uri" do
|
272
272
|
it "should work when file present" do
|
273
273
|
file = File.dirname(__FILE__) + '/test.xml'
|
274
|
-
parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
|
274
|
+
parsed = U.parse_file_or_uri(:xml, file, "fake", :quiet => true)
|
275
275
|
result = parsed.xpath('.//stations/station').map(&:content)
|
276
276
|
result.should == ["Metro Center", "Dupont Circle"]
|
277
277
|
end
|
@@ -287,7 +287,7 @@ describe "Utility" do
|
|
287
287
|
U.stub(:open).and_return(readable)
|
288
288
|
file = File.dirname(__FILE__) + "/missing.xml"
|
289
289
|
FileUtils.rm(file) if File.exists?(file)
|
290
|
-
parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
|
290
|
+
parsed = U.parse_file_or_uri(:xml, file, "fake", :quiet => true)
|
291
291
|
FileUtils.rm(file) if File.exists?(file)
|
292
292
|
result = parsed.xpath('.//stations/station').map(&:content)
|
293
293
|
result.should == ["Metro Center", "Dupont Circle"]
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
8
|
- 3
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- David James
|