datacatalog-importer 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.2.3
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{datacatalog-importer}
8
- s.version = "0.2.2"
8
+ s.version = "0.2.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David James"]
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
40
40
  "natdat_is_hungry.md",
41
41
  "spec/spec.opts",
42
42
  "spec/spec_helper.rb",
43
+ "spec/test.csv",
44
+ "spec/test.html",
43
45
  "spec/test.json",
44
46
  "spec/test.xml",
45
47
  "spec/utility_spec.rb"
data/lib/utility.rb CHANGED
@@ -57,7 +57,7 @@ module DataCatalog
57
57
 
58
58
  def self.headers
59
59
  {
60
- "UserAgent" => "National Data Catalog Importer/0.2.2",
60
+ "UserAgent" => "National Data Catalog Importer/0.2.3",
61
61
  }
62
62
  end
63
63
 
@@ -67,10 +67,10 @@ module DataCatalog
67
67
  # * http://jerith.livejournal.com/40063.html
68
68
  # * http://lindsaar.net/2007/12/9/rbuf_filltimeout-error
69
69
  def self.fetch(uri, options={})
70
- max_attempts = options[:max_attempts] || 3
71
- retry_delay = options[:retry_delay] || 5
72
- quiet = options[:quiet] || false
73
- attempts = 0
70
+ max_attempts = options.delete(:max_attempts) || 3
71
+ retry_delay = options.delete(:retry_delay) || 5
72
+ quiet = options.delete(:quiet) || false
73
+ attempts = 0
74
74
  loop do
75
75
  begin
76
76
  puts "Fetching #{uri}..." unless quiet
@@ -85,6 +85,12 @@ module DataCatalog
85
85
  end
86
86
  end
87
87
  end
88
+
89
+ def self.remove_fetch_options(options={})
90
+ [:max_attempts, :retry_delay, :quiet].each do |opt|
91
+ options.delete(opt)
92
+ end
93
+ end
88
94
 
89
95
  def self.report_timing(label)
90
96
  puts "Starting: [#{label}]"
@@ -99,9 +105,9 @@ module DataCatalog
99
105
  # == CSV ==
100
106
 
101
107
  # { :headers => true } is a common option
102
- def self.parse_csv_from_file(filename, options={})
108
+ def self.parse_csv_from_file(file, options={})
103
109
  extra_header_rows = options.delete(:extra_header_rows) || 0
104
- File.open(filename) do |f|
110
+ File.open(file) do |f|
105
111
  extra_header_rows.times { f.gets } # ignore these rows
106
112
  FasterCSV.parse(f, options)
107
113
  end
@@ -112,19 +118,22 @@ module DataCatalog
112
118
  FasterCSV.parse(data, options)
113
119
  end
114
120
 
115
- def self.parse_csv_from_file_or_uri(uri, file, options={})
121
+ def self.parse_csv_from_file_or_uri(file, uri, options={})
116
122
  force_fetch = options.delete(:force_fetch) || false
117
123
  if force_fetch || !File.exist?(file)
118
- document = fetch(uri, options)
119
- File.open(file, "w") { |f| f.write(document) }
124
+ data = fetch(uri, options)
125
+ File.open(file, "w") { |f| f.write(data) }
126
+ else
127
+ remove_fetch_options(options)
120
128
  end
129
+ # Why always parse the file? See Note 001, below.
121
130
  parse_csv_from_file(file, options)
122
131
  end
123
132
 
124
133
  # == HTML ==
125
134
 
126
- def self.parse_html_from_file(filename)
127
- File.open(filename) do |f|
135
+ def self.parse_html_from_file(file)
136
+ File.open(file) do |f|
128
137
  Nokogiri::HTML::Document.parse(f)
129
138
  end
130
139
  end
@@ -134,10 +143,13 @@ module DataCatalog
134
143
  Nokogiri::HTML::Document.parse(data)
135
144
  end
136
145
 
137
- def self.parse_html_from_file_or_uri(uri, file, options={})
138
- if options[:force_fetch] || !File.exist?(file)
139
- document = parse_html_from_uri(uri)
140
- File.open(file, "w") { |f| f.write(document) }
146
+ def self.parse_html_from_file_or_uri(file, uri, options={})
147
+ force_fetch = options.delete(:force_fetch) || false
148
+ if force_fetch || !File.exist?(file)
149
+ data = fetch(uri, options)
150
+ File.open(file, "w") { |f| f.write(data) }
151
+ else
152
+ remove_fetch_options(options)
141
153
  end
142
154
  # Why always parse the file? See Note 001, below.
143
155
  parse_html_from_file(file)
@@ -145,8 +157,8 @@ module DataCatalog
145
157
 
146
158
  # == JSON
147
159
 
148
- def self.parse_json_from_file(filename)
149
- File.open(filename) do |f|
160
+ def self.parse_json_from_file(file)
161
+ File.open(file) do |f|
150
162
  JSON.parse(f.read)
151
163
  end
152
164
  end
@@ -156,10 +168,13 @@ module DataCatalog
156
168
  JSON.parse(data)
157
169
  end
158
170
 
159
- def self.parse_json_from_file_or_uri(uri, file, options={})
160
- if options[:force_fetch] || !File.exist?(file)
161
- document = parse_json_from_uri(uri)
162
- File.open(file, "w") { |f| f.write(document) }
171
+ def self.parse_json_from_file_or_uri(file, uri, options={})
172
+ force_fetch = options.delete(:force_fetch) || false
173
+ if force_fetch || !File.exist?(file)
174
+ data = fetch(uri, options)
175
+ File.open(file, "w") { |f| f.write(data) }
176
+ else
177
+ remove_fetch_options(options)
163
178
  end
164
179
  # Why always parse the file? See Note 001, below.
165
180
  parse_json_from_file(file)
@@ -167,8 +182,8 @@ module DataCatalog
167
182
 
168
183
  # == XML
169
184
 
170
- def self.parse_xml_from_file(filename)
171
- File.open(filename) do |f|
185
+ def self.parse_xml_from_file(file)
186
+ File.open(file) do |f|
172
187
  Nokogiri::XML::Document.parse(f)
173
188
  end
174
189
  end
@@ -178,10 +193,13 @@ module DataCatalog
178
193
  Nokogiri::XML::Document.parse(data)
179
194
  end
180
195
 
181
- def self.parse_xml_from_file_or_uri(uri, file, options={})
182
- if options[:force_fetch] || !File.exist?(file)
183
- document = parse_xml_from_uri(uri)
184
- File.open(file, "w") { |f| f.write(document) }
196
+ def self.parse_xml_from_file_or_uri(file, uri, options={})
197
+ force_fetch = options.delete(:force_fetch) || false
198
+ if force_fetch || !File.exist?(file)
199
+ data = fetch(uri, options)
200
+ File.open(file, "w") { |f| f.write(data) }
201
+ else
202
+ remove_fetch_options(options)
185
203
  end
186
204
  # Why always parse the file? See Note 001, below.
187
205
  parse_xml_from_file(file)
@@ -189,10 +207,10 @@ module DataCatalog
189
207
 
190
208
  # == YAML
191
209
 
192
- # To load YAML use: YAML::load_file(filename)
210
+ # To load YAML use: YAML::load_file(file)
193
211
 
194
- def self.write_yaml(filename, contents)
195
- File.open(filename, "w") do |f|
212
+ def self.write_yaml(file, contents)
213
+ File.open(file, "w") do |f|
196
214
  YAML::dump(contents, f)
197
215
  end
198
216
  end
data/spec/test.csv ADDED
@@ -0,0 +1 @@
1
+ Metro Center,Dupont Circle
data/spec/test.html ADDED
@@ -0,0 +1,8 @@
1
+ <html>
2
+ <body>
3
+ <ol>
4
+ <li>Metro Center</li>
5
+ <li>Dupont Circle</li>
6
+ </ol>
7
+ </body>
8
+ </html>
data/spec/utility_spec.rb CHANGED
@@ -105,57 +105,193 @@ describe "Utility" do
105
105
  end
106
106
  end
107
107
 
108
- # == XML
108
+ describe "csv" do
109
+ describe "parse_csv_from_file" do
110
+ it "should work" do
111
+ file = File.dirname(__FILE__) + '/test.csv'
112
+ result = U.parse_csv_from_file(file)
113
+ result.should == [["Metro Center", "Dupont Circle"]]
114
+ end
115
+ end
109
116
 
110
- describe "parse_xml_from_file" do
111
- it "should work" do
112
- file = File.dirname(__FILE__) + '/test.xml'
113
- parsed = U.parse_xml_from_file(file)
114
- result = parsed.xpath('.//stations/station').map(&:content)
115
- result.should == ["Metro Center", "Dupont Circle"]
117
+ describe "parse_csv_from_uri" do
118
+ it "should work" do
119
+ readable = Object.new
120
+ readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
121
+ U.stub(:open).and_return(readable)
122
+ result = U.parse_csv_from_uri("fake", :quiet => true)
123
+ result.should == [["Metro Center", "Dupont Circle"]]
124
+ end
125
+ end
126
+
127
+ describe "parse_csv_from_file_or_uri" do
128
+ it "should work when file present" do
129
+ file = File.dirname(__FILE__) + '/test.csv'
130
+ result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
131
+ result.should == [["Metro Center", "Dupont Circle"]]
132
+ end
133
+
134
+ it "should work when file missing but url ok" do
135
+ readable = Object.new
136
+ readable.stub(:read).and_return(%(Metro Center,Dupont Circle))
137
+ U.stub(:open).and_return(readable)
138
+ file = File.dirname(__FILE__) + "/missing.csv"
139
+ FileUtils.rm(file) if File.exists?(file)
140
+ result = U.parse_csv_from_file_or_uri(file, "fake", :quiet => true)
141
+ FileUtils.rm(file) if File.exists?(file)
142
+ result.should == [["Metro Center", "Dupont Circle"]]
143
+ end
116
144
  end
117
145
  end
118
146
 
119
- describe "parse_xml_from_uri" do
120
- before do
121
- @readable = Object.new
122
- @readable.stub(:read).and_return(%(
123
- <stations>
124
- <station>Metro Center</station>
125
- <station>Dupont Circle</station>
126
- </stations>
127
- ))
147
+ describe "html" do
148
+ describe "parse_html_from_file" do
149
+ it "should work" do
150
+ file = File.dirname(__FILE__) + '/test.html'
151
+ parsed = U.parse_html_from_file(file)
152
+ result = parsed.css('li').map(&:content)
153
+ result.should == ["Metro Center", "Dupont Circle"]
154
+ end
128
155
  end
129
156
 
130
- it "should work" do
131
- U.stub(:open).and_return(@readable)
132
- parsed = U.parse_xml_from_uri("fake", :quiet => true)
133
- result = parsed.xpath('.//stations/station').map(&:content)
134
- result.should == ["Metro Center", "Dupont Circle"]
157
+ describe "parse_html_from_uri" do
158
+ it "should work" do
159
+ readable = Object.new
160
+ readable.stub(:read).and_return(%(
161
+ <html>
162
+ <body>
163
+ <ol>
164
+ <li>Metro Center</li>
165
+ <li>Dupont Circle</li>
166
+ </ol>
167
+ </body>
168
+ </html>
169
+ ))
170
+ U.stub(:open).and_return(readable)
171
+ parsed = U.parse_html_from_uri("fake", :quiet => true)
172
+ result = parsed.css('li').map(&:content)
173
+ result.should == ["Metro Center", "Dupont Circle"]
174
+ end
175
+ end
176
+
177
+ describe "parse_html_from_file_or_uri" do
178
+ it "should work when file present" do
179
+ file = File.dirname(__FILE__) + '/test.html'
180
+ parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
181
+ result = parsed.css('li').map(&:content)
182
+ result.should == ["Metro Center", "Dupont Circle"]
183
+ end
184
+
185
+ it "should work when file missing but url ok" do
186
+ readable = Object.new
187
+ readable.stub(:read).and_return(%(
188
+ <stations>
189
+ <station>Metro Center</station>
190
+ <station>Dupont Circle</station>
191
+ </stations>
192
+ ))
193
+ U.stub(:open).and_return(readable)
194
+ file = File.dirname(__FILE__) + "/missing.html"
195
+ FileUtils.rm(file) if File.exists?(file)
196
+ parsed = U.parse_html_from_file_or_uri(file, "fake", :quiet => true)
197
+ FileUtils.rm(file) if File.exists?(file)
198
+ result = parsed.xpath('.//stations/station').map(&:content)
199
+ result.should == ["Metro Center", "Dupont Circle"]
200
+ end
135
201
  end
136
202
  end
137
203
 
138
- # == JSON
204
+ describe "json" do
205
+ describe "parse_json_from_file" do
206
+ it "should work" do
207
+ file = File.dirname(__FILE__) + '/test.json'
208
+ U.parse_json_from_file(file).should ==
209
+ { "stations" => ["Metro Center", "Dupont Circle"] }
210
+ end
211
+ end
139
212
 
140
- describe "parse_json_from_file" do
141
- it "should work" do
142
- file = File.dirname(__FILE__) + '/test.json'
143
- U.parse_json_from_file(file).should ==
144
- { "stations" => ["Metro Center","Dupont Circle"] }
213
+ describe "parse_json_from_uri" do
214
+ it "should work" do
215
+ readable = Object.new
216
+ readable.stub(:read).and_return(
217
+ %({"stations":["Metro Center","Dupont Circle"]}))
218
+ U.stub(:open).and_return(readable)
219
+ result = U.parse_json_from_uri("fake", :quiet => true)
220
+ result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
221
+ end
222
+ end
223
+
224
+ describe "parse_json_from_file_or_uri" do
225
+ it "should work when file present" do
226
+ file = File.dirname(__FILE__) + '/test.json'
227
+ result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
228
+ result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
229
+ end
230
+
231
+ it "should work when file missing but url ok" do
232
+ readable = Object.new
233
+ readable.stub(:read).and_return(
234
+ %({"stations":["Metro Center","Dupont Circle"]}))
235
+ U.stub(:open).and_return(readable)
236
+ file = File.dirname(__FILE__) + "/missing.json"
237
+ FileUtils.rm(file) if File.exists?(file)
238
+ result = U.parse_json_from_file_or_uri(file, "fake", :quiet => true)
239
+ FileUtils.rm(file) if File.exists?(file)
240
+ result.should == { "stations" => ["Metro Center", "Dupont Circle"] }
241
+ end
145
242
  end
146
243
  end
147
244
 
148
- describe "parse_json_from_uri" do
149
- before do
150
- @readable = Object.new
151
- @readable.stub(:read).and_return(
152
- %({"stations":["Metro Center","Dupont Circle"]}))
245
+ describe "xml" do
246
+ describe "parse_xml_from_file" do
247
+ it "should work" do
248
+ file = File.dirname(__FILE__) + '/test.xml'
249
+ parsed = U.parse_xml_from_file(file)
250
+ result = parsed.xpath('.//stations/station').map(&:content)
251
+ result.should == ["Metro Center", "Dupont Circle"]
252
+ end
153
253
  end
154
254
 
155
- it "should work" do
156
- U.stub(:open).and_return(@readable)
157
- result = U.parse_json_from_uri("fake", :quiet => true)
158
- result.should == { "stations" => ["Metro Center","Dupont Circle"] }
255
+ describe "parse_xml_from_uri" do
256
+ it "should work" do
257
+ readable = Object.new
258
+ readable.stub(:read).and_return(%(
259
+ <stations>
260
+ <station>Metro Center</station>
261
+ <station>Dupont Circle</station>
262
+ </stations>
263
+ ))
264
+ U.stub(:open).and_return(readable)
265
+ parsed = U.parse_xml_from_uri("fake", :quiet => true)
266
+ result = parsed.xpath('.//stations/station').map(&:content)
267
+ result.should == ["Metro Center", "Dupont Circle"]
268
+ end
269
+ end
270
+
271
+ describe "parse_xml_from_file_or_uri" do
272
+ it "should work when file present" do
273
+ file = File.dirname(__FILE__) + '/test.xml'
274
+ parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
275
+ result = parsed.xpath('.//stations/station').map(&:content)
276
+ result.should == ["Metro Center", "Dupont Circle"]
277
+ end
278
+
279
+ it "should work when file missing but url ok" do
280
+ readable = Object.new
281
+ readable.stub(:read).and_return(%(
282
+ <stations>
283
+ <station>Metro Center</station>
284
+ <station>Dupont Circle</station>
285
+ </stations>
286
+ ))
287
+ U.stub(:open).and_return(readable)
288
+ file = File.dirname(__FILE__) + "/missing.xml"
289
+ FileUtils.rm(file) if File.exists?(file)
290
+ parsed = U.parse_xml_from_file_or_uri(file, "fake", :quiet => true)
291
+ FileUtils.rm(file) if File.exists?(file)
292
+ result = parsed.xpath('.//stations/station').map(&:content)
293
+ result.should == ["Metro Center", "Dupont Circle"]
294
+ end
159
295
  end
160
296
  end
161
297
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datacatalog-importer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 2
10
- version: 0.2.2
9
+ - 3
10
+ version: 0.2.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - David James
@@ -99,6 +99,8 @@ files:
99
99
  - natdat_is_hungry.md
100
100
  - spec/spec.opts
101
101
  - spec/spec_helper.rb
102
+ - spec/test.csv
103
+ - spec/test.html
102
104
  - spec/test.json
103
105
  - spec/test.xml
104
106
  - spec/utility_spec.rb