metainspector 1.16.0 → 1.16.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/meta_inspector/scraper.rb +12 -1
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/relative_links_with_base.response +22 -0
- data/spec/metainspector_spec.rb +12 -0
- data/spec/spec_helper.rb +4 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bbf33936348d092cedb51b59a828bf90929c75ea
|
4
|
+
data.tar.gz: f8ba64edadd581c5d85c25d8139b99856cbe6a25
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 39370252996fc183c93eb1b49899628c1a7052250d2ef525957f984107de8b1cfcba9d6bcaf7f59ba611db342867e091b98dbaf2bab9b886f9eefaffceaf22c5
|
7
|
+
data.tar.gz: 2c119f82e2e1bb7de01e45c4e5ba83403c65f0fffde7fd739c19e179e177bb001ee652860a7389de1eb266c970ff28999fb88b909b3f184341383f96899fd2c6
|
@@ -237,12 +237,23 @@ module MetaInspector
|
|
237
237
|
if uri =~ /^\w*\:/i
|
238
238
|
normalize_url(uri)
|
239
239
|
else
|
240
|
-
Addressable::URI.join(@url, uri).normalize.to_s
|
240
|
+
Addressable::URI.join(base_url, uri).normalize.to_s
|
241
241
|
end
|
242
242
|
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
|
243
243
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
244
244
|
end
|
245
245
|
|
246
|
+
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
247
|
+
# or the url of the document if no <base> tag was found.
|
248
|
+
def base_url
|
249
|
+
base_href || @url
|
250
|
+
end
|
251
|
+
|
252
|
+
# Returns the value of the href attribute on the <base /> tag, if it exists
|
253
|
+
def base_href
|
254
|
+
parsed_document.search('base').first.attributes['href'].value rescue nil
|
255
|
+
end
|
256
|
+
|
246
257
|
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
247
258
|
def unrelativize_url(url)
|
248
259
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
@@ -0,0 +1,22 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/1.0.5
|
3
|
+
Date: Thu, 29 Dec 2011 23:10:13 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Content-Length: 15013
|
6
|
+
Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
|
7
|
+
Connection: keep-alive
|
8
|
+
Accept-Ranges: bytes
|
9
|
+
|
10
|
+
<!DOCTYPE html>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<base href="http://relativewithbase.com/" />
|
14
|
+
<meta charset="utf-8" />
|
15
|
+
<title>Relative links</title>
|
16
|
+
</head>
|
17
|
+
<body>
|
18
|
+
<p>Relative links</p>
|
19
|
+
<a href="about">About</a>
|
20
|
+
<a href="../sitemap">Sitemap</a>
|
21
|
+
</body>
|
22
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -272,6 +272,18 @@ describe MetaInspector do
|
|
272
272
|
end
|
273
273
|
end
|
274
274
|
|
275
|
+
describe 'Relative links with base' do
|
276
|
+
it 'should get the relative links from a document' do
|
277
|
+
m = MetaInspector.new('http://relativewithbase.com/company/page2')
|
278
|
+
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
279
|
+
end
|
280
|
+
|
281
|
+
it 'should get the relative links from a directory' do
|
282
|
+
m = MetaInspector.new('http://relativewithbase.com/company/page2/')
|
283
|
+
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
275
287
|
describe 'Non-HTTP links' do
|
276
288
|
before(:each) do
|
277
289
|
@m = MetaInspector.new('http://example.com/nonhttp')
|
data/spec/spec_helper.rb
CHANGED
@@ -47,6 +47,10 @@ FakeWeb.register_uri(:get, "http://relative.com/", :response => fixture_file("re
|
|
47
47
|
FakeWeb.register_uri(:get, "http://relative.com/company", :response => fixture_file("relative_links.response"))
|
48
48
|
FakeWeb.register_uri(:get, "http://relative.com/company/", :response => fixture_file("relative_links.response"))
|
49
49
|
|
50
|
+
FakeWeb.register_uri(:get, "http://relativewithbase.com/", :response => fixture_file("relative_links_with_base.response"))
|
51
|
+
FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2", :response => fixture_file("relative_links_with_base.response"))
|
52
|
+
FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2/", :response => fixture_file("relative_links_with_base.response"))
|
53
|
+
|
50
54
|
# These examples are used to test the redirections from HTTP to HTTPS and vice versa
|
51
55
|
# http://facebook.com => https://facebook.com
|
52
56
|
FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.0
|
4
|
+
version: 1.16.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -162,6 +162,7 @@ files:
|
|
162
162
|
- spec/fixtures/pagerankalert.com.response
|
163
163
|
- spec/fixtures/protocol_relative.response
|
164
164
|
- spec/fixtures/relative_links.response
|
165
|
+
- spec/fixtures/relative_links_with_base.response
|
165
166
|
- spec/fixtures/tea-tron.com.response
|
166
167
|
- spec/fixtures/theonion-no-description.com.response
|
167
168
|
- spec/fixtures/theonion.com.response
|
@@ -192,7 +193,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
192
193
|
version: '0'
|
193
194
|
requirements: []
|
194
195
|
rubyforge_project:
|
195
|
-
rubygems_version: 2.
|
196
|
+
rubygems_version: 2.1.3
|
196
197
|
signing_key:
|
197
198
|
specification_version: 4
|
198
199
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
|