metainspector 1.16.0 → 1.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/meta_inspector/scraper.rb +12 -1
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/relative_links_with_base.response +22 -0
- data/spec/metainspector_spec.rb +12 -0
- data/spec/spec_helper.rb +4 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bbf33936348d092cedb51b59a828bf90929c75ea
|
4
|
+
data.tar.gz: f8ba64edadd581c5d85c25d8139b99856cbe6a25
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 39370252996fc183c93eb1b49899628c1a7052250d2ef525957f984107de8b1cfcba9d6bcaf7f59ba611db342867e091b98dbaf2bab9b886f9eefaffceaf22c5
|
7
|
+
data.tar.gz: 2c119f82e2e1bb7de01e45c4e5ba83403c65f0fffde7fd739c19e179e177bb001ee652860a7389de1eb266c970ff28999fb88b909b3f184341383f96899fd2c6
|
@@ -237,12 +237,23 @@ module MetaInspector
|
|
237
237
|
if uri =~ /^\w*\:/i
|
238
238
|
normalize_url(uri)
|
239
239
|
else
|
240
|
-
Addressable::URI.join(@url, uri).normalize.to_s
|
240
|
+
Addressable::URI.join(base_url, uri).normalize.to_s
|
241
241
|
end
|
242
242
|
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
|
243
243
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
244
244
|
end
|
245
245
|
|
246
|
+
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
247
|
+
# or the url of the document if no <base> tag was found.
|
248
|
+
def base_url
|
249
|
+
base_href || @url
|
250
|
+
end
|
251
|
+
|
252
|
+
# Returns the value of the href attribute on the <base /> tag, if it exists
|
253
|
+
def base_href
|
254
|
+
parsed_document.search('base').first.attributes['href'].value rescue nil
|
255
|
+
end
|
256
|
+
|
246
257
|
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
247
258
|
def unrelativize_url(url)
|
248
259
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
@@ -0,0 +1,22 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/1.0.5
|
3
|
+
Date: Thu, 29 Dec 2011 23:10:13 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Content-Length: 15013
|
6
|
+
Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
|
7
|
+
Connection: keep-alive
|
8
|
+
Accept-Ranges: bytes
|
9
|
+
|
10
|
+
<!DOCTYPE html>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<base href="http://relativewithbase.com/" />
|
14
|
+
<meta charset="utf-8" />
|
15
|
+
<title>Relative links</title>
|
16
|
+
</head>
|
17
|
+
<body>
|
18
|
+
<p>Relative links</p>
|
19
|
+
<a href="about">About</a>
|
20
|
+
<a href="../sitemap">Sitemap</a>
|
21
|
+
</body>
|
22
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -272,6 +272,18 @@ describe MetaInspector do
|
|
272
272
|
end
|
273
273
|
end
|
274
274
|
|
275
|
+
describe 'Relative links with base' do
|
276
|
+
it 'should get the relative links from a document' do
|
277
|
+
m = MetaInspector.new('http://relativewithbase.com/company/page2')
|
278
|
+
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
279
|
+
end
|
280
|
+
|
281
|
+
it 'should get the relative links from a directory' do
|
282
|
+
m = MetaInspector.new('http://relativewithbase.com/company/page2/')
|
283
|
+
m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
|
284
|
+
end
|
285
|
+
end
|
286
|
+
|
275
287
|
describe 'Non-HTTP links' do
|
276
288
|
before(:each) do
|
277
289
|
@m = MetaInspector.new('http://example.com/nonhttp')
|
data/spec/spec_helper.rb
CHANGED
@@ -47,6 +47,10 @@ FakeWeb.register_uri(:get, "http://relative.com/", :response => fixture_file("re
|
|
47
47
|
FakeWeb.register_uri(:get, "http://relative.com/company", :response => fixture_file("relative_links.response"))
|
48
48
|
FakeWeb.register_uri(:get, "http://relative.com/company/", :response => fixture_file("relative_links.response"))
|
49
49
|
|
50
|
+
FakeWeb.register_uri(:get, "http://relativewithbase.com/", :response => fixture_file("relative_links_with_base.response"))
|
51
|
+
FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2", :response => fixture_file("relative_links_with_base.response"))
|
52
|
+
FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2/", :response => fixture_file("relative_links_with_base.response"))
|
53
|
+
|
50
54
|
# These examples are used to test the redirections from HTTP to HTTPS and vice versa
|
51
55
|
# http://facebook.com => https://facebook.com
|
52
56
|
FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.0
|
4
|
+
version: 1.16.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -162,6 +162,7 @@ files:
|
|
162
162
|
- spec/fixtures/pagerankalert.com.response
|
163
163
|
- spec/fixtures/protocol_relative.response
|
164
164
|
- spec/fixtures/relative_links.response
|
165
|
+
- spec/fixtures/relative_links_with_base.response
|
165
166
|
- spec/fixtures/tea-tron.com.response
|
166
167
|
- spec/fixtures/theonion-no-description.com.response
|
167
168
|
- spec/fixtures/theonion.com.response
|
@@ -192,7 +193,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
192
193
|
version: '0'
|
193
194
|
requirements: []
|
194
195
|
rubyforge_project:
|
195
|
-
rubygems_version: 2.
|
196
|
+
rubygems_version: 2.1.3
|
196
197
|
signing_key:
|
197
198
|
specification_version: 4
|
198
199
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
|