metainspector 1.16.0 → 1.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f52d3692c9305059affd0cecbe31856a9c67ef08
4
- data.tar.gz: 9936b40ceb9430f22c1a77aeb57004e90c7dd3c4
3
+ metadata.gz: bbf33936348d092cedb51b59a828bf90929c75ea
4
+ data.tar.gz: f8ba64edadd581c5d85c25d8139b99856cbe6a25
5
5
  SHA512:
6
- metadata.gz: 25dc0a75ee2c5a464c781de219cfc34744d5853bc96c4bc6b996ca7860bf093e990982e333650e9fc4cbc21d27747d4cb480bc8d506e0890c91a9d7d21f6d3bf
7
- data.tar.gz: c278db782849963eecb68b37195f50ea5661d363574182930926f300f2182bb99062ec1dda42faba8833b1465cef2b3f9371da2a51c146990424e39b0466c334
6
+ metadata.gz: 39370252996fc183c93eb1b49899628c1a7052250d2ef525957f984107de8b1cfcba9d6bcaf7f59ba611db342867e091b98dbaf2bab9b886f9eefaffceaf22c5
7
+ data.tar.gz: 2c119f82e2e1bb7de01e45c4e5ba83403c65f0fffde7fd739c19e179e177bb001ee652860a7389de1eb266c970ff28999fb88b909b3f184341383f96899fd2c6
@@ -237,12 +237,23 @@ module MetaInspector
237
237
  if uri =~ /^\w*\:/i
238
238
  normalize_url(uri)
239
239
  else
240
- Addressable::URI.join(@url, uri).normalize.to_s
240
+ Addressable::URI.join(base_url, uri).normalize.to_s
241
241
  end
242
242
  rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
243
243
  add_fatal_error "Link parsing exception: #{e.message}" and nil
244
244
  end
245
245
 
246
+ # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
247
+ # or the url of the document if no <base> tag was found.
248
+ def base_url
249
+ base_href || @url
250
+ end
251
+
252
+ # Returns the value of the href attribute on the <base /> tag, if it exists
253
+ def base_href
254
+ parsed_document.search('base').first.attributes['href'].value rescue nil
255
+ end
256
+
246
257
  # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
247
258
  def unrelativize_url(url)
248
259
  url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.16.0"
4
+ VERSION = "1.16.1"
5
5
  end
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/1.0.5
3
+ Date: Thu, 29 Dec 2011 23:10:13 GMT
4
+ Content-Type: text/html
5
+ Content-Length: 15013
6
+ Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
7
+ Connection: keep-alive
8
+ Accept-Ranges: bytes
9
+
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <base href="http://relativewithbase.com/" />
14
+ <meta charset="utf-8" />
15
+ <title>Relative links</title>
16
+ </head>
17
+ <body>
18
+ <p>Relative links</p>
19
+ <a href="about">About</a>
20
+ <a href="../sitemap">Sitemap</a>
21
+ </body>
22
+ </html>
@@ -272,6 +272,18 @@ describe MetaInspector do
272
272
  end
273
273
  end
274
274
 
275
+ describe 'Relative links with base' do
276
+ it 'should get the relative links from a document' do
277
+ m = MetaInspector.new('http://relativewithbase.com/company/page2')
278
+ m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
279
+ end
280
+
281
+ it 'should get the relative links from a directory' do
282
+ m = MetaInspector.new('http://relativewithbase.com/company/page2/')
283
+ m.internal_links.should == ['http://relativewithbase.com/about', 'http://relativewithbase.com/sitemap']
284
+ end
285
+ end
286
+
275
287
  describe 'Non-HTTP links' do
276
288
  before(:each) do
277
289
  @m = MetaInspector.new('http://example.com/nonhttp')
@@ -47,6 +47,10 @@ FakeWeb.register_uri(:get, "http://relative.com/", :response => fixture_file("re
47
47
  FakeWeb.register_uri(:get, "http://relative.com/company", :response => fixture_file("relative_links.response"))
48
48
  FakeWeb.register_uri(:get, "http://relative.com/company/", :response => fixture_file("relative_links.response"))
49
49
 
50
+ FakeWeb.register_uri(:get, "http://relativewithbase.com/", :response => fixture_file("relative_links_with_base.response"))
51
+ FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2", :response => fixture_file("relative_links_with_base.response"))
52
+ FakeWeb.register_uri(:get, "http://relativewithbase.com/company/page2/", :response => fixture_file("relative_links_with_base.response"))
53
+
50
54
  # These examples are used to test the redirections from HTTP to HTTPS and vice versa
51
55
  # http://facebook.com => https://facebook.com
52
56
  FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.0
4
+ version: 1.16.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-04 00:00:00.000000000 Z
11
+ date: 2013-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -162,6 +162,7 @@ files:
162
162
  - spec/fixtures/pagerankalert.com.response
163
163
  - spec/fixtures/protocol_relative.response
164
164
  - spec/fixtures/relative_links.response
165
+ - spec/fixtures/relative_links_with_base.response
165
166
  - spec/fixtures/tea-tron.com.response
166
167
  - spec/fixtures/theonion-no-description.com.response
167
168
  - spec/fixtures/theonion.com.response
@@ -192,7 +193,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
192
193
  version: '0'
193
194
  requirements: []
194
195
  rubyforge_project:
195
- rubygems_version: 2.0.3
196
+ rubygems_version: 2.1.3
196
197
  signing_key:
197
198
  specification_version: 4
198
199
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash