metainspector 1.13.0 → 1.13.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/meta_inspector/scraper.rb +9 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/malformed_href.response +27 -0
- data/spec/metainspector_spec.rb +36 -14
- data/spec/spec_helper.rb +1 -0
- metadata +5 -4
@@ -53,12 +53,12 @@ module MetaInspector
|
|
53
53
|
|
54
54
|
# Internal links found on the page, as absolute URLs
|
55
55
|
def internal_links
|
56
|
-
@internal_links ||= links.select {|link| URI.parse(link).host == host }
|
56
|
+
@internal_links ||= links.select {|link| host_from_url(link) == host }
|
57
57
|
end
|
58
58
|
|
59
59
|
# External links found on the page, as absolute URLs
|
60
60
|
def external_links
|
61
|
-
@external_links ||= links.select {|link| URI.parse(link).host != host }
|
61
|
+
@external_links ||= links.select {|link| host_from_url(link) != host }
|
62
62
|
end
|
63
63
|
|
64
64
|
# Images found on the page, as absolute URLs
|
@@ -245,6 +245,13 @@ module MetaInspector
|
|
245
245
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
246
246
|
end
|
247
247
|
|
248
|
+
# Extracts the host from a given URL
|
249
|
+
def host_from_url(url)
|
250
|
+
URI.parse(url).host
|
251
|
+
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
252
|
+
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
253
|
+
end
|
254
|
+
|
248
255
|
# Look for the first <p> block with 120 characters or more
|
249
256
|
def secondary_description
|
250
257
|
first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
|
@@ -0,0 +1,27 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Malformed hrefs</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<h1>Good links</h1>
|
18
|
+
<a href="/faqs">FAQs</a>
|
19
|
+
<a href="skype:joeuser?call">a skype link</a>
|
20
|
+
<a href="telnet://telnet.cdrom.com">a telnet link</a>
|
21
|
+
<a href="javascript:alert('ok');">ok</a>
|
22
|
+
|
23
|
+
<h1>Bad links due to malformed href</h1>
|
24
|
+
<a href="javascript://">oops</a>
|
25
|
+
<a href="mailto:email(at)example.com">
|
26
|
+
</body>
|
27
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -177,22 +177,44 @@ describe MetaInspector do
|
|
177
177
|
"http://example.com/search?q=espa%C3%B1a#top"]
|
178
178
|
end
|
179
179
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
180
|
+
describe "internal links" do
|
181
|
+
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
182
|
+
m = MetaInspector.new('http://international.com')
|
183
|
+
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
184
|
+
"http://international.com/roman%C3%A9e",
|
185
|
+
"http://international.com/faqs#cami%C3%B3n",
|
186
|
+
"http://international.com/search?q=cami%C3%B3n",
|
187
|
+
"http://international.com/search?q=espa%C3%B1a#top"]
|
188
|
+
end
|
189
|
+
|
190
|
+
it "should not crash when processing malformed hrefs" do
|
191
|
+
m = MetaInspector.new('http://example.com/malformed_href')
|
192
|
+
expect {
|
193
|
+
m.internal_links.should == [ "http://example.com/faqs" ]
|
194
|
+
m.should_not be_ok
|
195
|
+
}.to_not raise_error
|
196
|
+
end
|
187
197
|
end
|
188
198
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
199
|
+
describe "external links" do
|
200
|
+
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
201
|
+
m = MetaInspector.new('http://international.com')
|
202
|
+
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
203
|
+
"http://example.com/roman%C3%A9e",
|
204
|
+
"http://example.com/faqs#cami%C3%B3n",
|
205
|
+
"http://example.com/search?q=cami%C3%B3n",
|
206
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
207
|
+
end
|
208
|
+
|
209
|
+
it "should not crash when processing malformed hrefs" do
|
210
|
+
m = MetaInspector.new('http://example.com/malformed_href')
|
211
|
+
expect {
|
212
|
+
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
|
213
|
+
"javascript:alert('ok');", "javascript://",
|
214
|
+
"mailto:email(at)example.com"]
|
215
|
+
m.should_not be_ok
|
216
|
+
}.to_not raise_error
|
217
|
+
end
|
196
218
|
end
|
197
219
|
end
|
198
220
|
|
data/spec/spec_helper.rb
CHANGED
@@ -29,6 +29,7 @@ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_
|
|
29
29
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
30
30
|
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
31
31
|
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
|
32
|
+
FakeWeb.register_uri(:get, "http://example.com/malformed_href", :response => fixture_file("malformed_href.response"))
|
32
33
|
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
33
34
|
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
34
35
|
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 33
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 13
|
9
|
-
- 0
|
10
|
-
version: 1.13.0
|
9
|
+
- 1
|
10
|
+
version: 1.13.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-12-
|
18
|
+
date: 2012-12-13 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- spec/fixtures/international.response
|
165
165
|
- spec/fixtures/invalid_href.response
|
166
166
|
- spec/fixtures/iteh.at.response
|
167
|
+
- spec/fixtures/malformed_href.response
|
167
168
|
- spec/fixtures/markupvalidator_faqs.response
|
168
169
|
- spec/fixtures/nonhttp.response
|
169
170
|
- spec/fixtures/pagerankalert.com.response
|