metainspector 1.13.0 → 1.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/meta_inspector/scraper.rb +9 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/malformed_href.response +27 -0
- data/spec/metainspector_spec.rb +36 -14
- data/spec/spec_helper.rb +1 -0
- metadata +5 -4
@@ -53,12 +53,12 @@ module MetaInspector
|
|
53
53
|
|
54
54
|
# Internal links found on the page, as absolute URLs
|
55
55
|
def internal_links
|
56
|
-
@internal_links ||= links.select {|link|
|
56
|
+
@internal_links ||= links.select {|link| host_from_url(link) == host }
|
57
57
|
end
|
58
58
|
|
59
59
|
# External links found on the page, as absolute URLs
|
60
60
|
def external_links
|
61
|
-
@external_links ||= links.select {|link|
|
61
|
+
@external_links ||= links.select {|link| host_from_url(link) != host }
|
62
62
|
end
|
63
63
|
|
64
64
|
# Images found on the page, as absolute URLs
|
@@ -245,6 +245,13 @@ module MetaInspector
|
|
245
245
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
246
246
|
end
|
247
247
|
|
248
|
+
# Extracts the host from a given URL
|
249
|
+
def host_from_url(url)
|
250
|
+
URI.parse(url).host
|
251
|
+
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
252
|
+
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
253
|
+
end
|
254
|
+
|
248
255
|
# Look for the first <p> block with 120 characters or more
|
249
256
|
def secondary_description
|
250
257
|
first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
|
@@ -0,0 +1,27 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Malformed hrefs</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<h1>Good links</h1>
|
18
|
+
<a href="/faqs">FAQs</a>
|
19
|
+
<a href="skype:joeuser?call">a skype link</a>
|
20
|
+
<a href="telnet://telnet.cdrom.com">a telnet link</a>
|
21
|
+
<a href="javascript:alert('ok');">ok</a>
|
22
|
+
|
23
|
+
<h1>Bad links due to malformed href</h1>
|
24
|
+
<a href="javascript://">oops</a>
|
25
|
+
<a href="mailto:email(at)example.com">
|
26
|
+
</body>
|
27
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -177,22 +177,44 @@ describe MetaInspector do
|
|
177
177
|
"http://example.com/search?q=espa%C3%B1a#top"]
|
178
178
|
end
|
179
179
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
180
|
+
describe "internal links" do
|
181
|
+
it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
|
182
|
+
m = MetaInspector.new('http://international.com')
|
183
|
+
m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
|
184
|
+
"http://international.com/roman%C3%A9e",
|
185
|
+
"http://international.com/faqs#cami%C3%B3n",
|
186
|
+
"http://international.com/search?q=cami%C3%B3n",
|
187
|
+
"http://international.com/search?q=espa%C3%B1a#top"]
|
188
|
+
end
|
189
|
+
|
190
|
+
it "should not crash when processing malformed hrefs" do
|
191
|
+
m = MetaInspector.new('http://example.com/malformed_href')
|
192
|
+
expect {
|
193
|
+
m.internal_links.should == [ "http://example.com/faqs" ]
|
194
|
+
m.should_not be_ok
|
195
|
+
}.to_not raise_error
|
196
|
+
end
|
187
197
|
end
|
188
198
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
199
|
+
describe "external links" do
|
200
|
+
it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
|
201
|
+
m = MetaInspector.new('http://international.com')
|
202
|
+
m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
|
203
|
+
"http://example.com/roman%C3%A9e",
|
204
|
+
"http://example.com/faqs#cami%C3%B3n",
|
205
|
+
"http://example.com/search?q=cami%C3%B3n",
|
206
|
+
"http://example.com/search?q=espa%C3%B1a#top"]
|
207
|
+
end
|
208
|
+
|
209
|
+
it "should not crash when processing malformed hrefs" do
|
210
|
+
m = MetaInspector.new('http://example.com/malformed_href')
|
211
|
+
expect {
|
212
|
+
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
|
213
|
+
"javascript:alert('ok');", "javascript://",
|
214
|
+
"mailto:email(at)example.com"]
|
215
|
+
m.should_not be_ok
|
216
|
+
}.to_not raise_error
|
217
|
+
end
|
196
218
|
end
|
197
219
|
end
|
198
220
|
|
data/spec/spec_helper.rb
CHANGED
@@ -29,6 +29,7 @@ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_
|
|
29
29
|
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
30
30
|
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
31
31
|
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
|
32
|
+
FakeWeb.register_uri(:get, "http://example.com/malformed_href", :response => fixture_file("malformed_href.response"))
|
32
33
|
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
33
34
|
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
34
35
|
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 33
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 13
|
9
|
-
- 0
|
10
|
-
version: 1.13.0
|
9
|
+
- 1
|
10
|
+
version: 1.13.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-12-
|
18
|
+
date: 2012-12-13 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- spec/fixtures/international.response
|
165
165
|
- spec/fixtures/invalid_href.response
|
166
166
|
- spec/fixtures/iteh.at.response
|
167
|
+
- spec/fixtures/malformed_href.response
|
167
168
|
- spec/fixtures/markupvalidator_faqs.response
|
168
169
|
- spec/fixtures/nonhttp.response
|
169
170
|
- spec/fixtures/pagerankalert.com.response
|