metainspector 1.8.7 → 1.8.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -28,6 +28,7 @@ by defaul:
28
28
  Then you can see the scraped data like this:
29
29
 
30
30
  page.url # URL of the page
31
+ page.scheme # Scheme of the page (http, https)
31
32
  page.title # title of the page, as string
32
33
  page.links # array of strings, with every link found on the page
33
34
  page.absolute_links # array of all the links converted to absolute urls
@@ -9,12 +9,13 @@ require 'hashie/rash'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :url
12
+ attr_reader :url, :scheme
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
14
  # If no scheme given, set it to http:// by default
15
15
  def initialize(url)
16
- @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
- @data = Hashie::Rash.new('url' => @url)
16
+ @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
+ @scheme = URI.parse(url).scheme || 'http'
18
+ @data = Hashie::Rash.new('url' => @url)
18
19
  end
19
20
 
20
21
  # Returns the parsed document title, from the content of the <title> tag.
@@ -44,7 +45,7 @@ module MetaInspector
44
45
 
45
46
  # Returns the links converted to absolute urls
46
47
  def absolute_links
47
- @data.absolute_links ||= links.map { |l| absolutify_url(l) }
48
+ @data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
48
49
  end
49
50
 
50
51
  def absolute_images
@@ -137,6 +138,11 @@ module MetaInspector
137
138
  url =~ /^http.*/ ? url : File.join(@url,url)
138
139
  end
139
140
 
141
+ # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
142
+ def unrelativize_url(url)
143
+ url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
144
+ end
145
+
140
146
  # Remove mailto links
141
147
  # TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
142
148
  def remove_mailto(links)
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.8.7"
4
+ VERSION = "1.8.8"
5
5
  end
@@ -0,0 +1,26 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/1.0.5
3
+ Date: Thu, 29 Dec 2011 23:10:13 GMT
4
+ Content-Type: text/html
5
+ Content-Length: 15013
6
+ Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
7
+ Connection: keep-alive
8
+ Accept-Ranges: bytes
9
+
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <meta charset="utf-8" />
14
+ <title>Protocol-relative URLs</title>
15
+ </head>
16
+ <body>
17
+ <p>Internal links</p>
18
+ <a href="/">Internal: home page</a>
19
+ <a href="/faqs">Internal: FAQs</a>
20
+ <a href="//protocol-relative.com/contact">Internal: protocol-relative</a>
21
+
22
+ <p>External links</p>
23
+ <a href="http://google.com">External: normal link</a>
24
+ <a href="//yahoo.com">External: protocol-relative link</a>
25
+ </body>
26
+ </html>
@@ -17,6 +17,11 @@ describe MetaInspector do
17
17
  @m = MetaInspector.new('pagerankalert.com')
18
18
  @m.url.should == 'http://pagerankalert.com'
19
19
  end
20
+
21
+ it "should store the scheme" do
22
+ MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
23
+ MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
24
+ end
20
25
  end
21
26
 
22
27
  context 'Doing a basic scrape' do
@@ -76,19 +81,19 @@ describe MetaInspector do
76
81
  @m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
77
82
  end
78
83
  end
79
-
84
+
80
85
  context 'Page with missing meta description' do
81
86
  FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
82
-
83
- it "should find secondary description" do
87
+
88
+ it "should find secondary description" do
84
89
  @m = MetaInspector.new('http://theonion-no-description.com')
85
90
  @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
86
91
  " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
87
92
  end
88
-
93
+
89
94
  end
90
-
91
-
95
+
96
+
92
97
  context 'Links' do
93
98
  before(:each) do
94
99
  @m = MetaInspector.new('http://pagerankalert.com')
@@ -119,6 +124,28 @@ describe MetaInspector do
119
124
  end
120
125
  end
121
126
 
127
+
128
+ context 'Protocol-relative URLs' do
129
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
130
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
131
+
132
+ before(:each) do
133
+ @m_http = MetaInspector.new('http://protocol-relative.com')
134
+ @m_https = MetaInspector.new('https://protocol-relative.com')
135
+ end
136
+
137
+ it "should convert protocol-relative links to http" do
138
+ @m_http.absolute_links.should include('http://protocol-relative.com/contact')
139
+ @m_http.absolute_links.should include('http://yahoo.com')
140
+ end
141
+
142
+ it "should convert protocol-relative links to https" do
143
+ @m_https.absolute_links.should include('https://protocol-relative.com/contact')
144
+ @m_https.absolute_links.should include('https://yahoo.com')
145
+ end
146
+ end
147
+
148
+
122
149
  context 'Getting meta tags by ghost methods' do
123
150
  before(:each) do
124
151
  @m = MetaInspector.new('http://pagerankalert.com')
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
4
+ hash: 39
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 8
9
- - 7
10
- version: 1.8.7
9
+ - 8
10
+ version: 1.8.8
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-12-13 00:00:00 Z
18
+ date: 2011-12-30 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -158,6 +158,7 @@ files:
158
158
  - spec/fixtures/guardian.co.uk.response
159
159
  - spec/fixtures/iteh.at.response
160
160
  - spec/fixtures/pagerankalert.com.response
161
+ - spec/fixtures/protocol_relative.response
161
162
  - spec/fixtures/tea-tron.com.response
162
163
  - spec/fixtures/theonion-no-description.com.response
163
164
  - spec/fixtures/theonion.com.response