datacatalog-importer 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.2.3
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.2.2"
8
+ s.version = "0.2.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
40
40
  "natdat_is_hungry.md",
41
41
  "spec/spec.opts",
42
42
  "spec/spec_helper.rb",
43
+ "spec/test.csv",
44
+ "spec/test.html",
43
45
  "spec/test.json",
44
46
  "spec/test.xml",
45
47
  "spec/utility_spec.rb"
data/lib/utility.rb CHANGED
@@ -57,7 +57,7 @@ module DataCatalog
57
57
 
58
58
  def self.headers
59
59
  {
60
- "UserAgent" => "National Data Catalog Importer/0.2.2",
60
+ "UserAgent" => "National Data Catalog Importer/0.2.3",
61
61
  }
62
62
  end
63
63
 
@@ -67,10 +67,10 @@ module DataCatalog
67
67
  # * http://jerith.livejournal.com/40063.html
68
68
  # * http://lindsaar.net/2007/12/9/rbuf_filltimeout-error
69
69
  def self.fetch(uri, options={})
70
- max_attempts = options[:max_attempts] || 3
71
- retry_delay = options[:retry_delay] || 5
72
- quiet = options[:quiet] || false
73
- attempts = 0
70
+ max_attempts = options.delete(:max_attempts) || 3
71
+ retry_delay = options.delete(:retry_delay) || 5
72
+ quiet = options.delete(:quiet) || false
73
+ attempts = 0
74
74
  loop do
75
75
  begin
76
76
  puts "Fetching #{uri}..." unless quiet
@@ -85,6 +85,12 @@ module DataCatalog
85
85
  end
86
86
  end
87
87
  end
88
+
89
+ def self.remove_fetch_options(options={})
90
+ [:max_attempts, :retry_delay, :quiet].each do |opt|
91
+ options.delete(opt)
92
+ end
93
+ end
88
94
 
89
95
  def self.report_timing(label)
90
96
  puts "Starting: [#{label}]"
@@ -99,9 +105,9 @@ module DataCatalog
99
105
  # == CSV ==
100
106
 
101
107
  # { :headers => true } is a common option
102
- def self.parse_csv_from_file(filename, options={})
108
+ def self.parse_csv_from_file(file, options={})
103
109
  extra_header_rows = options.delete(:extra_header_rows) || 0
104
- File.open(filename) do |f|
110
+ File.open(file) do |f|
105
111
  extra_header_rows.times { f.gets } # ignore these rows
106
112
  FasterCSV.parse(f, options)
107
113
  end
@@ -112,19 +118,22 @@ module DataCatalog
112
118
  FasterCSV.parse(data, options)
113
119
  end
114
120
 
115
- def self.parse_csv_from_file_or_uri(uri, file, options={})
121
+ def self.parse_csv_from_file_or_uri(file, uri, options={})
116
122
  force_fetch = options.delete(:force_fetch) || false
117
123
  if force_fetch || !File.exist?(file)
118
- document = fetch(uri, options)
119
- File.open(file, "w") { |f| f.write(document) }
124
+ data = fetch(uri, options)
125
+ File.open(file, "w") { |f| f.write(data) }
126
+ else
127
+ remove_fetch_options(options)
120
128
  end
129
+ # Why always parse the file? See Note 001, below.
121
130
  parse_csv_from_file(file, options)
122
131
  end
123
132
 
124
133
  # == HTML ==
125
134
 
126
- def self.parse_html_from_file(filename)
127
- File.open(filename) do |f|
135
+ def self.parse_html_from_file(file)
136
+ File.open(file) do |f|
128
137
  Nokogiri::HTML::Document.parse(f)
129
138
  end
130
139
  end
@@ -134,10 +143,13 @@ module DataCatalog
134
143
  Nokogiri::HTML::Document.parse(data)
135
144
  end
136
145
 
137
- def self.parse_html_from_file_or_uri(uri, file, options={})
138
- if options[:force_fetch] || !File.exist?(file)
139
- document = parse_html_from_uri(uri)
140
- File.open(file, "w") { |f| f.write(document) }
146
+ def self.parse_html_from_file_or_uri(file, uri, options={})
147
+ force_fetch = options.delete(:force_fetch) || false
148
+ if force_fetch || !File.exist?(file)
149
+ data = fetch(uri, options)
150
+ File.open(file, "w") { |f| f.write(data) }
151
+ else
152
+ remove_fetch_options(options)
141
153
  end
142
154
  # Why always parse the file? See Note 001, below.
143
155
  parse_html_from_file(file)
@@ -145,8 +157,8 @@ module DataCatalog
145
157
 
146
158
  # == JSON
147
159
 
148
- def self.parse_json_from_file(filename)
149
- File.open(filename) do |f|
160
+ def self.parse_json_from_file(file)
161
+ File.open(file) do |f|
150
162
  JSON.parse(f.read)
151
163
  end
152
164
  end
@@ -156,10 +168,13 @@ module DataCatalog
156
168
  JSON.parse(data)
157
169
  end
158
170
 
159
- def self.parse_json_from_file_or_uri(uri, file, options={})
160
- if options[:force_fetch] || !File.exist?(file)
161
- document = parse_json_from_uri(uri)
162
- File.open(file, "w") { |f| f.write(document) }
171
+ def self.parse_json_from_file_or_uri(file, uri, options={})
172
+ force_fetch = options.delete(:force_fetch) || false
173
+ if force_fetch || !File.exist?(file)
174
+ data = fetch(uri, options)
175
+ File.open(file, "w") { |f| f.write(data) }
176
+ else
177
+ remove_fetch_options(options)
163
178
  end
164
179
  # Why always parse the file? See Note 001, below.
165
180
  parse_json_from_file(file)
@@ -167,8 +182,8 @@ module DataCatalog
167
182
 
168
183
  # == XML
169
184
 
170
- def self.parse_xml_from_file(filename)
171
- File.open(filename) do |f|
185
+ def self.parse_xml_from_file(file)
186
+ File.open(file) do |f|
172
187
  Nokogiri::XML::Document.parse(f)
173
188
  end
174
189
  end
@@ -178,10 +193,13 @@ module DataCatalog
178
193
  Nokogiri::XML::Document.parse(data)
179
194
  end
180
195
 
181
- def self.parse_xml_from_file_or_uri(uri, file, options={})
182
- if options[:force_fetch] || !File.exist?(file)
183
- document = parse_xml_from_uri(uri)
184
- File.open(file, "w") { |f| f.write(document) }
196
+ def self.parse_xml_from_file_or_uri(file, uri, options={})
197
+ force_fetch = options.delete(:force_fetch) || false
198
+ if force_fetch || !File.exist?(file)
199
+ data = fetch(uri, options)
200
+ File.open(file, "w") { |f| f.write(data) }
201
+ else
202
+ remove_fetch_options(options)
185
203
  end
186
204
  # Why always parse the file? See Note 001, below.
187
205
  parse_xml_from_file(file)
@@ -189,10 +207,10 @@ module DataCatalog
189
207
 
190
208
  # == YAML
191
209
 
192
- # To load YAML use: YAML::load_file(filename)
210
+ # To load YAML use: YAML::load_file(file)
193
211
 
194
- def self.write_yaml(filename, contents)
195
- File.open(filename, "w") do |f|
212
+ def self.write_yaml(file, contents)
213
+ File.open(file, "w") do |f|
196
214
  YAML::dump(contents, f)
197
215
  end
198
216
  end
data/spec/test.csv ADDED
@@ -0,0 +1 @@
1
+ Metro Center,Dupont Circle
data/spec/test.html ADDED
@@ -0,0 +1,8 @@
1
+ <html>
2
+ <body>
3
+ <ol>
4
+ <li>Metro Center</li>
5
+ <li>Dupont Circle</li>
6
+ </ol>
7
+ </body>
8
+ </html>
data/spec/utility_spec.rb CHANGED
@@ -105,57 +105,193 @@ describe "Utility" do
105
105
  end
106
106
  end
107
107
 
108
- # == XML
108
+ describe "csv" do
109
+ describe "parse_csv_from_file" do
110
+ it "should work" do
111
+ file = File.dirname(__FILE__) + '/test.csv'
112
+ result = U.parse_csv_from_file(file)
113
+ result.should == [["Metro Center", "Dupont Circle"]]
114
+ end
115
+ end
109
116
 
110
- describe "parse_xml_from_file" do
111
- it "should work" do
112
- file = File.dirname(__FILE__) + '/test.xml'
113
- parsed = U.parse_xml_from_file(file)
114
- result = parsed.xpath('.//stations/station').map(&:content)
115
- result.should == ["Metro Center", "Dupont Circle"]
117
+ describe "parse_csv_from_uri" do
118
+ it "should work" do
119
+ readable = Object.new
120
+ readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
121
+ U.stub(:open).and_return(readable)
122
+ result = U.parse_csv_from_uri("fake", :quiet => true)
123
+ result.should == [["Metro Center", "Dupont Circle"]]
124
+ end
125
+ end
126
+
127
+ describe "parse_csv_from_file_or_uri" do
128
+ it "should work when file present" do
129
+ file = File.dirname(__FILE__) + '/test.csv'
130
+ result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
131
+ result.should == [["Metro Center", "Dupont Circle"]]
132
+ end
133
+
134
+ it "should work when file missing but url ok" do
135
+ readable = Object.new
136
+ readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
137
+ U.stub(:open).and_return(readable)
138
+ file = File.dirname(__FILE__) + "/missing.csv"
139
+ FileUtils.rm(file) if File.exists?(file)
140
+ result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
141
+ FileUtils.rm(file) if File.exists?(file)
142
+ result.should == [["Metro Center", "Dupont Circle"]]
143
+ end
116
144
  end
117
145
  end
118
146
 
119
- describe "parse_xml_from_uri" do
120
- before do
121
- @readable = Object.new
122
- @readable.stub(:read).and_return(%(
123
- <stations>
124
- <station>Metro Center</station>
125
- <station>Dupont Circle</station>
126
- </stations>
127
- ))
147
+ describe "html" do
148
+ describe "parse_html_from_file" do
149
+ it "should work" do
150
+ file = File.dirname(__FILE__) + '/test.html'
151
+ parsed = U.parse_html_from_file(file)
152
+ result = parsed.css('li').map(&:content)
153
+ result.should == ["Metro Center", "Dupont Circle"]
154
+ end
128
155
  end
129
156
 
130
- it "should work" do
131
- U.stub(:open).and_return(@readable)
132
- parsed = U.parse_xml_from_uri("fake", :quiet => true)
133
- result = parsed.xpath('.//stations/station').map(&:content)
134
- result.should == ["Metro Center", "Dupont Circle"]
157
+ describe "parse_html_from_uri" do
158
+ it "should work" do
159
+ readable = Object.new
160
+ readable.stub(:read).and_return(%(
161
+ <html>
162
+ <body>
163
+ <ol>
164
+ <li>Metro Center</li>
165
+ <li>Dupont Circle</li>
166
+ </ol>
167
+ </body>
168
+ </html>
169
+ ))
170
+ U.stub(:open).and_return(readable)
171
+ parsed = U.parse_html_from_uri("fake", :quiet => true)
172
+ result = parsed.css('li').map(&:content)
173
+ result.should == ["Metro Center", "Dupont Circle"]
174
+ end
175
+ end
176
+
177
+ describe "parse_html_from_file_or_uri" do
178
+ it "should work when file present" do
179
+ file = File.dirname(__FILE__) + '/test.html'
180
+ parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
181
+ result = parsed.css('li').map(&:content)
182
+ result.should == ["Metro Center", "Dupont Circle"]
183
+ end
184
+
185
+ it "should work when file missing but url ok" do
186
+ readable = Object.new
187
+ readable.stub(:read).and_return(%(
188
+ <stations>
189
+ <station>Metro Center</station>
190
+ <station>Dupont Circle</station>
191
+ </stations>
192
+ ))
193
+ U.stub(:open).and_return(readable)
194
+ file = File.dirname(__FILE__) + "/missing.html"
195
+ FileUtils.rm(file) if File.exists?(file)
196
+ parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
197
+ FileUtils.rm(file) if File.exists?(file)
198
+ result = parsed.xpath('.//stations/station').map(&:content)
199
+ result.should == ["Metro Center", "Dupont Circle"]
200
+ end
135
201
  end
136
202
  end
137
203
 
138
- # == JSON
204
+ describe "json" do
205
+ describe "parse_json_from_file" do
206
+ it "should work" do
207
+ file = File.dirname(__FILE__) + '/test.json'
208
+ U.parse_json_from_file(file).should ==
209
+ { "stations" => ["Metro Center", "Dupont Circle"] }
210
+ end
211
+ end
139
212
 
140
- describe "parse_json_from_file" do
141
- it "should work" do
142
- file = File.dirname(__FILE__) + '/test.json'
143
- U.parse_json_from_file(file).should ==
144
- { "stations" => ["Metro Center","Dupont Circle"] }
213
+ describe "parse_json_from_uri" do
214
+ it "should work" do
215
+ readable = Object.new
216
+ readable.stub(:read).and_return(
217
+ %({"stations":["Metro Center","Dupont Circle"]}))
218
+ U.stub(:open).and_return(readable)
219
+ result = U.parse_json_from_uri("fake", :quiet => true)
220
+ result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
221
+ end
222
+ end
223
+
224
+ describe "parse_json_from_file_or_uri" do
225
+ it "should work when file present" do
226
+ file = File.dirname(__FILE__) + '/test.json'
227
+ result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
228
+ result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
229
+ end
230
+
231
+ it "should work when file missing but url ok" do
232
+ readable = Object.new
233
+ readable.stub(:read).and_return(
234
+ %({"stations":["Metro Center","Dupont Circle"]}))
235
+ U.stub(:open).and_return(readable)
236
+ file = File.dirname(__FILE__) + "/missing.json"
237
+ FileUtils.rm(file) if File.exists?(file)
238
+ result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
239
+ FileUtils.rm(file) if File.exists?(file)
240
+ result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
241
+ end
145
242
  end
146
243
  end
147
244
 
148
- describe "parse_json_from_uri" do
149
- before do
150
- @readable = Object.new
151
- @readable.stub(:read).and_return(
152
- %({"stations":["Metro Center","Dupont Circle"]}))
245
+ describe "xml" do
246
+ describe "parse_xml_from_file" do
247
+ it "should work" do
248
+ file = File.dirname(__FILE__) + '/test.xml'
249
+ parsed = U.parse_xml_from_file(file)
250
+ result = parsed.xpath('.//stations/station').map(&:content)
251
+ result.should == ["Metro Center", "Dupont Circle"]
252
+ end
153
253
  end
154
254
 
155
- it "should work" do
156
- U.stub(:open).and_return(@readable)
157
- result = U.parse_json_from_uri("fake", :quiet => true)
158
- result.should == { "stations" => ["Metro Center","Dupont Circle"] }
255
+ describe "parse_xml_from_uri" do
256
+ it "should work" do
257
+ readable = Object.new
258
+ readable.stub(:read).and_return(%(
259
+ <stations>
260
+ <station>Metro Center</station>
261
+ <station>Dupont Circle</station>
262
+ </stations>
263
+ ))
264
+ U.stub(:open).and_return(readable)
265
+ parsed = U.parse_xml_from_uri("fake", :quiet => true)
266
+ result = parsed.xpath('.//stations/station').map(&:content)
267
+ result.should == ["Metro Center", "Dupont Circle"]
268
+ end
269
+ end
270
+
271
+ describe "parse_xml_from_file_or_uri" do
272
+ it "should work when file present" do
273
+ file = File.dirname(__FILE__) + '/test.xml'
274
+ parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
275
+ result = parsed.xpath('.//stations/station').map(&:content)
276
+ result.should == ["Metro Center", "Dupont Circle"]
277
+ end
278
+
279
+ it "should work when file missing but url ok" do
280
+ readable = Object.new
281
+ readable.stub(:read).and_return(%(
282
+ <stations>
283
+ <station>Metro Center</station>
284
+ <station>Dupont Circle</station>
285
+ </stations>
286
+ ))
287
+ U.stub(:open).and_return(readable)
288
+ file = File.dirname(__FILE__) + "/missing.xml"
289
+ FileUtils.rm(file) if File.exists?(file)
290
+ parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
291
+ FileUtils.rm(file) if File.exists?(file)
292
+ result = parsed.xpath('.//stations/station').map(&:content)
293
+ result.should == ["Metro Center", "Dupont Circle"]
294
+ end
159
295
  end
160
296
  end
161
297
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datacatalog-importer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 2
10
- version: 0.2.2
9
+ - 3
10
+ version: 0.2.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - David James
@@ -99,6 +99,8 @@ files:
99
99
  - natdat_is_hungry.md
100
100
  - spec/spec.opts
101
101
  - spec/spec_helper.rb
102
+ - spec/test.csv
103
+ - spec/test.html
102
104
  - spec/test.json
103
105
  - spec/test.xml
104
106
  - spec/utility_spec.rb