metainspector 1.16.0 → 1.16.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f52d3692c9305059affd0cecbe31856a9c67ef08
4
- data.tar.gz: 9936b40ceb9430f22c1a77aeb57004e90c7dd3c4
3
+ metadata.gz: bbf33936348d092cedb51b59a828bf90929c75ea
4
+ data.tar.gz: f8ba64edadd581c5d85c25d8139b99856cbe6a25
5
5
  SHA512:
6
- metadata.gz: 25dc0a75ee2c5a464c781de219cfc34744d5853bc96c4bc6b996ca7860bf093e990982e333650e9fc4cbc21d27747d4cb480bc8d506e0890c91a9d7d21f6d3bf
7
- data.tar.gz: c278db782849963eecb68b37195f50ea5661d363574182930926f300f2182bb99062ec1dda42faba8833b1465cef2b3f9371da2a51c146990424e39b0466c334
6
+ metadata.gz: 39370252996fc183c93eb1b49899628c1a7052250d2ef525957f984107de8b1cfcba9d6bcaf7f59ba611db342867e091b98dbaf2bab9b886f9eefaffceaf22c5
7
+ data.tar.gz: 2c119f82e2e1bb7de01e45c4e5ba83403c65f0fffde7fd739c19e179e177bb001ee652860a7389de1eb266c970ff28999fb88b909b3f184341383f96899fd2c6
@@ -237,12 +237,23 @@ module MetaInspector
237
237
  if uri =~ /^\w*\:/i
238
238
  normalize_url(uri)
239
239
  else
240
- Addressable::URI.join(@url, uri).normalize.to_s
240
+ Addressable::URI.join(base_url, uri).normalize.to_s
241
241
  end
242
242
  rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
243
243
  add_fatal_error "Link parsing exception: #{e.message}" and nil
244
244
  end
245
245
 
246
+ # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
247
+ # or the url of the document if no <base> tag was found.
248
+ def base_url
249
+ base_href || @url
250
+ end
251
+
252
+ # Returns the value of the href attribute on the <base /> tag, if it exists
253
+ def base_href
254
+ parsed_document.search('base').first.attributes['href'].value rescue nil
255
+ end
256
+
246
257
  # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
247
258
  def unrelativize_url(url)
248
259
  url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.16.0"
4
+ VERSION = "1.16.1"
5
5
  end
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/1.0.5
3
+ Date: Thu, 29 Dec 2011 23:10:13 GMT
4
+ Content-Type: text/html
5
+ Content-Length: 15013
6
+ Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
7
+ Connection: keep-alive
8
+ Accept-Ranges: bytes
9
+
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <base href="http://relativewithbase.com/" />
14
+ <meta charset="utf-8" />
15
+ <title>Relative links</title>
16
+ </head>
17
+ <body>
18
+ <p>Relative links</p>
19
+ <a href="about">About</a>
20
+ <a href="../sitemap">Sitemap</a>
21
+ </body>
22
+ </html>
@@ -272,6 +272,18 @@ describe MetaInspector do
272
272
  end
273
273
  end
274
274
 
275
+ describe 'Relative links with base' do
276
+ it 'should get the relative links from a document' do
277
+ m = MetaInspector.new('http://relativewithbase.com/company/page2')
278
+ m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
279
+ end
280
+
281
+ it 'should get the relative links from a directory' do
282
+ m = MetaInspector.new('http://relativewithbase.com/company/page2/')
283
+ m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
284
+ end
285
+ end
286
+
275
287
  describe 'Non-HTTP links' do
276
288
  before(:each) do
277
289
  @m = MetaInspector.new('http://example.com/nonhttp')
@@ -47,6 +47,10 @@ FakeWeb.register_uri(:get, "http://relative.com/", :response => fixture_file("re
47
47
  FakeWeb.register_uri(:get, "http://relative.com/company", :response => fixture_file("relative_links.response"))
48
48
  FakeWeb.register_uri(:get, "http://relative.com/company/", :response => fixture_file("relative_links.response"))
49
49
 
50
+ FakeWeb.register_uri(:get, "http://relativewithbase.com/", :response => fixture_file("relative_links_with_base.response"))
51
+ FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2", :response => fixture_file("relative_links_with_base.response"))
52
+ FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2/", :response => fixture_file("relative_links_with_base.response"))
53
+
50
54
  # These examples are used to test the redirections from HTTP to HTTPS and vice versa
51
55
  # http://facebook.com => https://facebook.com
52
56
  FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.0
4
+ version: 1.16.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-04 00:00:00.000000000 Z
11
+ date: 2013-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -162,6 +162,7 @@ files:
162
162
  - spec/fixtures/pagerankalert.com.response
163
163
  - spec/fixtures/protocol_relative.response
164
164
  - spec/fixtures/relative_links.response
165
+ - spec/fixtures/relative_links_with_base.response
165
166
  - spec/fixtures/tea-tron.com.response
166
167
  - spec/fixtures/theonion-no-description.com.response
167
168
  - spec/fixtures/theonion.com.response
@@ -192,7 +193,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
192
193
  version: '0'
193
194
  requirements: []
194
195
  rubyforge_project:
195
- rubygems_version: 2.0.3
196
+ rubygems_version: 2.1.3
196
197
  signing_key:
197
198
  specification_version: 4
198
199
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash