datacatalog-importer 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/datacatalog-importer.gemspec +1 -1
- data/lib/utility.rb +35 -88
- data/spec/utility_spec.rb +28 -28
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
data/lib/utility.rb
CHANGED
@@ -57,7 +57,7 @@ module DataCatalog
|
|
57
57
|
|
58
58
|
def self.headers
|
59
59
|
{
|
60
|
-
"UserAgent" => "National Data Catalog Importer/0.
|
60
|
+
"UserAgent" => "National Data Catalog Importer/0.3.0",
|
61
61
|
}
|
62
62
|
end
|
63
63
|
|
@@ -101,99 +101,46 @@ module DataCatalog
|
|
101
101
|
puts "Elapsed time [#{label}] %.2f s" % diff
|
102
102
|
result
|
103
103
|
end
|
104
|
-
|
105
|
-
# ==
|
106
|
-
|
107
|
-
|
108
|
-
def self.parse_csv_from_file(file, options={})
|
109
|
-
extra_header_rows = options.delete(:extra_header_rows) || 0
|
110
|
-
File.open(file) do |f|
|
111
|
-
extra_header_rows.times { f.gets } # ignore these rows
|
112
|
-
FasterCSV.parse(f, options)
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
def self.parse_csv_from_uri(uri, options={})
|
117
|
-
data = fetch(uri, options)
|
118
|
-
FasterCSV.parse(data, options)
|
119
|
-
end
|
120
|
-
|
121
|
-
def self.parse_csv_from_file_or_uri(file, uri, options={})
|
122
|
-
force_fetch = options.delete(:force_fetch) || false
|
123
|
-
if force_fetch || !File.exist?(file)
|
124
|
-
data = fetch(uri, options)
|
125
|
-
File.open(file, "w") { |f| f.write(data) }
|
126
|
-
else
|
127
|
-
remove_fetch_options(options)
|
128
|
-
end
|
129
|
-
# Why always parse the file? See Note 001, below.
|
130
|
-
parse_csv_from_file(file, options)
|
131
|
-
end
|
132
|
-
|
133
|
-
# == HTML ==
|
134
|
-
|
135
|
-
def self.parse_html_from_file(file)
|
136
|
-
File.open(file) do |f|
|
137
|
-
Nokogiri::HTML::Document.parse(f)
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def self.parse_html_from_uri(uri, options={})
|
142
|
-
data = fetch(uri, options)
|
143
|
-
Nokogiri::HTML::Document.parse(data)
|
144
|
-
end
|
145
|
-
|
146
|
-
def self.parse_html_from_file_or_uri(file, uri, options={})
|
147
|
-
force_fetch = options.delete(:force_fetch) || false
|
148
|
-
if force_fetch || !File.exist?(file)
|
149
|
-
data = fetch(uri, options)
|
150
|
-
File.open(file, "w") { |f| f.write(data) }
|
151
|
-
else
|
152
|
-
remove_fetch_options(options)
|
153
|
-
end
|
154
|
-
# Why always parse the file? See Note 001, below.
|
155
|
-
parse_html_from_file(file)
|
156
|
-
end
|
157
|
-
|
158
|
-
# == JSON
|
159
|
-
|
160
|
-
def self.parse_json_from_file(file)
|
104
|
+
|
105
|
+
# == Parsing ===
|
106
|
+
|
107
|
+
def self.parse_file(format, file, options={})
|
161
108
|
File.open(file) do |f|
|
162
|
-
|
109
|
+
case format
|
110
|
+
when :csv
|
111
|
+
extra_header_rows = options.delete(:extra_header_rows) || 0
|
112
|
+
extra_header_rows.times { f.gets } # ignore these rows
|
113
|
+
FasterCSV.parse(f, options)
|
114
|
+
when :xml
|
115
|
+
Nokogiri::XML::Document.parse(f)
|
116
|
+
when :json
|
117
|
+
JSON.parse(f.read)
|
118
|
+
when :html
|
119
|
+
Nokogiri::HTML::Document.parse(f)
|
120
|
+
else
|
121
|
+
raise "Unexpected format : #{format.inspect}"
|
122
|
+
end
|
163
123
|
end
|
164
124
|
end
|
165
|
-
|
166
|
-
def self.
|
125
|
+
|
126
|
+
def self.parse_uri(format, uri, options={})
|
167
127
|
data = fetch(uri, options)
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
128
|
+
case format
|
129
|
+
when :csv
|
130
|
+
# TODO: support extra_header_rows option
|
131
|
+
FasterCSV.parse(data, options)
|
132
|
+
when :xml
|
133
|
+
Nokogiri::XML::Document.parse(data)
|
134
|
+
when :json
|
135
|
+
JSON.parse(data)
|
136
|
+
when :html
|
137
|
+
Nokogiri::HTML::Document.parse(data)
|
176
138
|
else
|
177
|
-
|
139
|
+
raise "Unexpected format : #{format.inspect}"
|
178
140
|
end
|
179
|
-
# Why always parse the file? See Note 001, below.
|
180
|
-
parse_json_from_file(file)
|
181
141
|
end
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
def self.parse_xml_from_file(file)
|
186
|
-
File.open(file) do |f|
|
187
|
-
Nokogiri::XML::Document.parse(f)
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
def self.parse_xml_from_uri(uri, options={})
|
192
|
-
data = fetch(uri, options)
|
193
|
-
Nokogiri::XML::Document.parse(data)
|
194
|
-
end
|
195
|
-
|
196
|
-
def self.parse_xml_from_file_or_uri(file, uri, options={})
|
142
|
+
|
143
|
+
def self.parse_file_or_uri(format, file, uri, options={})
|
197
144
|
force_fetch = options.delete(:force_fetch) || false
|
198
145
|
if force_fetch || !File.exist?(file)
|
199
146
|
data = fetch(uri, options)
|
@@ -202,7 +149,7 @@ module DataCatalog
|
|
202
149
|
remove_fetch_options(options)
|
203
150
|
end
|
204
151
|
# Why always parse the file? See Note 001, below.
|
205
|
-
|
152
|
+
parse_file(format, file, options)
|
206
153
|
end
|
207
154
|
|
208
155
|
# == YAML
|
data/spec/utility_spec.rb
CHANGED
@@ -106,28 +106,28 @@ describe "Utility" do
|
|
106
106
|
end
|
107
107
|
|
108
108
|
describe "csv" do
|
109
|
-
describe "
|
109
|
+
describe "parse_file" do
|
110
110
|
it "should work" do
|
111
111
|
file = File.dirname(__FILE__) + '/test.csv'
|
112
|
-
result = U.
|
112
|
+
result = U.parse_file(:csv, file)
|
113
113
|
result.should == [["Metro Center", "Dupont Circle"]]
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
117
|
-
describe "
|
117
|
+
describe "parse_uri" do
|
118
118
|
it "should work" do
|
119
119
|
readable = Object.new
|
120
120
|
readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
|
121
121
|
U.stub(:open).and_return(readable)
|
122
|
-
result = U.
|
122
|
+
result = U.parse_uri(:csv, "fake", :quiet => true)
|
123
123
|
result.should == [["Metro Center", "Dupont Circle"]]
|
124
124
|
end
|
125
125
|
end
|
126
126
|
|
127
|
-
describe "
|
127
|
+
describe "parse_file_or_uri" do
|
128
128
|
it "should work when file present" do
|
129
129
|
file = File.dirname(__FILE__) + '/test.csv'
|
130
|
-
result = U.
|
130
|
+
result = U.parse_file_or_uri(:csv, file, "fake", :quiet => true)
|
131
131
|
result.should == [["Metro Center", "Dupont Circle"]]
|
132
132
|
end
|
133
133
|
|
@@ -137,7 +137,7 @@ describe "Utility" do
|
|
137
137
|
U.stub(:open).and_return(readable)
|
138
138
|
file = File.dirname(__FILE__) + "/missing.csv"
|
139
139
|
FileUtils.rm(file) if File.exists?(file)
|
140
|
-
result = U.
|
140
|
+
result = U.parse_file_or_uri(:csv, file, "fake", :quiet => true)
|
141
141
|
FileUtils.rm(file) if File.exists?(file)
|
142
142
|
result.should == [["Metro Center", "Dupont Circle"]]
|
143
143
|
end
|
@@ -145,16 +145,16 @@ describe "Utility" do
|
|
145
145
|
end
|
146
146
|
|
147
147
|
describe "html" do
|
148
|
-
describe "
|
148
|
+
describe "parse_file" do
|
149
149
|
it "should work" do
|
150
150
|
file = File.dirname(__FILE__) + '/test.html'
|
151
|
-
parsed = U.
|
151
|
+
parsed = U.parse_file(:html, file)
|
152
152
|
result = parsed.css('li').map(&:content)
|
153
153
|
result.should == ["Metro Center", "Dupont Circle"]
|
154
154
|
end
|
155
155
|
end
|
156
156
|
|
157
|
-
describe "
|
157
|
+
describe "parse_uri" do
|
158
158
|
it "should work" do
|
159
159
|
readable = Object.new
|
160
160
|
readable.stub(:read).and_return(%(
|
@@ -168,16 +168,16 @@ describe "Utility" do
|
|
168
168
|
</html>
|
169
169
|
))
|
170
170
|
U.stub(:open).and_return(readable)
|
171
|
-
parsed = U.
|
171
|
+
parsed = U.parse_uri(:html, "fake", :quiet => true)
|
172
172
|
result = parsed.css('li').map(&:content)
|
173
173
|
result.should == ["Metro Center", "Dupont Circle"]
|
174
174
|
end
|
175
175
|
end
|
176
176
|
|
177
|
-
describe "
|
177
|
+
describe "parse_file_or_uri" do
|
178
178
|
it "should work when file present" do
|
179
179
|
file = File.dirname(__FILE__) + '/test.html'
|
180
|
-
parsed = U.
|
180
|
+
parsed = U.parse_file_or_uri(:html, file, "fake", :quiet => true)
|
181
181
|
result = parsed.css('li').map(&:content)
|
182
182
|
result.should == ["Metro Center", "Dupont Circle"]
|
183
183
|
end
|
@@ -193,7 +193,7 @@ describe "Utility" do
|
|
193
193
|
U.stub(:open).and_return(readable)
|
194
194
|
file = File.dirname(__FILE__) + "/missing.html"
|
195
195
|
FileUtils.rm(file) if File.exists?(file)
|
196
|
-
parsed = U.
|
196
|
+
parsed = U.parse_file_or_uri(:html, file, "fake", :quiet => true)
|
197
197
|
FileUtils.rm(file) if File.exists?(file)
|
198
198
|
result = parsed.xpath('.//stations/station').map(&:content)
|
199
199
|
result.should == ["Metro Center", "Dupont Circle"]
|
@@ -202,29 +202,29 @@ describe "Utility" do
|
|
202
202
|
end
|
203
203
|
|
204
204
|
describe "json" do
|
205
|
-
describe "
|
205
|
+
describe "parse_file" do
|
206
206
|
it "should work" do
|
207
207
|
file = File.dirname(__FILE__) + '/test.json'
|
208
|
-
U.
|
208
|
+
U.parse_file(:json, file).should ==
|
209
209
|
{ "stations" => ["Metro Center", "Dupont Circle"] }
|
210
210
|
end
|
211
211
|
end
|
212
212
|
|
213
|
-
describe "
|
213
|
+
describe "parse_uri" do
|
214
214
|
it "should work" do
|
215
215
|
readable = Object.new
|
216
216
|
readable.stub(:read).and_return(
|
217
217
|
%({"stations":["Metro Center","Dupont Circle"]}))
|
218
218
|
U.stub(:open).and_return(readable)
|
219
|
-
result = U.
|
219
|
+
result = U.parse_uri(:json, "fake", :quiet => true)
|
220
220
|
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
221
221
|
end
|
222
222
|
end
|
223
223
|
|
224
|
-
describe "
|
224
|
+
describe "parse_file_or_uri" do
|
225
225
|
it "should work when file present" do
|
226
226
|
file = File.dirname(__FILE__) + '/test.json'
|
227
|
-
result = U.
|
227
|
+
result = U.parse_file_or_uri(:json, file, "fake", :quiet => true)
|
228
228
|
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
229
229
|
end
|
230
230
|
|
@@ -235,7 +235,7 @@ describe "Utility" do
|
|
235
235
|
U.stub(:open).and_return(readable)
|
236
236
|
file = File.dirname(__FILE__) + "/missing.json"
|
237
237
|
FileUtils.rm(file) if File.exists?(file)
|
238
|
-
result = U.
|
238
|
+
result = U.parse_file_or_uri(:json, file, "fake", :quiet => true)
|
239
239
|
FileUtils.rm(file) if File.exists?(file)
|
240
240
|
result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
|
241
241
|
end
|
@@ -243,16 +243,16 @@ describe "Utility" do
|
|
243
243
|
end
|
244
244
|
|
245
245
|
describe "xml" do
|
246
|
-
describe "
|
246
|
+
describe "parse_file" do
|
247
247
|
it "should work" do
|
248
248
|
file = File.dirname(__FILE__) + '/test.xml'
|
249
|
-
parsed = U.
|
249
|
+
parsed = U.parse_file(:xml, file)
|
250
250
|
result = parsed.xpath('.//stations/station').map(&:content)
|
251
251
|
result.should == ["Metro Center", "Dupont Circle"]
|
252
252
|
end
|
253
253
|
end
|
254
254
|
|
255
|
-
describe "
|
255
|
+
describe "parse_uri" do
|
256
256
|
it "should work" do
|
257
257
|
readable = Object.new
|
258
258
|
readable.stub(:read).and_return(%(
|
@@ -262,16 +262,16 @@ describe "Utility" do
|
|
262
262
|
</stations>
|
263
263
|
))
|
264
264
|
U.stub(:open).and_return(readable)
|
265
|
-
parsed = U.
|
265
|
+
parsed = U.parse_uri(:xml, "fake", :quiet => true)
|
266
266
|
result = parsed.xpath('.//stations/station').map(&:content)
|
267
267
|
result.should == ["Metro Center", "Dupont Circle"]
|
268
268
|
end
|
269
269
|
end
|
270
270
|
|
271
|
-
describe "
|
271
|
+
describe "parse_file_or_uri" do
|
272
272
|
it "should work when file present" do
|
273
273
|
file = File.dirname(__FILE__) + '/test.xml'
|
274
|
-
parsed = U.
|
274
|
+
parsed = U.parse_file_or_uri(:xml, file, "fake", :quiet => true)
|
275
275
|
result = parsed.xpath('.//stations/station').map(&:content)
|
276
276
|
result.should == ["Metro Center", "Dupont Circle"]
|
277
277
|
end
|
@@ -287,7 +287,7 @@ describe "Utility" do
|
|
287
287
|
U.stub(:open).and_return(readable)
|
288
288
|
file = File.dirname(__FILE__) + "/missing.xml"
|
289
289
|
FileUtils.rm(file) if File.exists?(file)
|
290
|
-
parsed = U.
|
290
|
+
parsed = U.parse_file_or_uri(:xml, file, "fake", :quiet => true)
|
291
291
|
FileUtils.rm(file) if File.exists?(file)
|
292
292
|
result = parsed.xpath('.//stations/station').map(&:content)
|
293
293
|
result.should == ["Metro Center", "Dupont Circle"]
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacatalog-importer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
8
|
- 3
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- David James
|