metainspector 1.8.7 → 1.8.8

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -28,6 +28,7 @@ by default:
28
28
  Then you can see the scraped data like this:
29
29
 
30
30
  page.url # URL of the page
31
+ page.scheme # Scheme of the page (http, https)
31
32
  page.title # title of the page, as string
32
33
  page.links # array of strings, with every link found on the page
33
34
  page.absolute_links # array of all the links converted to absolute urls
@@ -9,12 +9,13 @@ require 'hashie/rash'
9
9
  # MetaInspector provides an easy way to scrape web pages and get their elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :url
12
+ attr_reader :url, :scheme
13
13
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
14
  # If no scheme given, set it to http:// by default
15
15
  def initialize(url)
16
- @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
- @data = Hashie::Rash.new('url' => @url)
16
+ @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
17
+ @scheme = URI.parse(url).scheme || 'http'
18
+ @data = Hashie::Rash.new('url' => @url)
18
19
  end
19
20
 
20
21
  # Returns the parsed document title, from the content of the <title> tag.
@@ -44,7 +45,7 @@ module MetaInspector
44
45
 
45
46
  # Returns the links converted to absolute urls
46
47
  def absolute_links
47
- @data.absolute_links ||= links.map { |l| absolutify_url(l) }
48
+ @data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
48
49
  end
49
50
 
50
51
  def absolute_images
@@ -137,6 +138,11 @@ module MetaInspector
137
138
  url =~ /^http.*/ ? url : File.join(@url,url)
138
139
  end
139
140
 
141
+ # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
142
+ def unrelativize_url(url)
143
+ url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
144
+ end
145
+
140
146
  # Remove mailto links
141
147
  # TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
142
148
  def remove_mailto(links)
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.8.7"
4
+ VERSION = "1.8.8"
5
5
  end
@@ -0,0 +1,26 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/1.0.5
3
+ Date: Thu, 29 Dec 2011 23:10:13 GMT
4
+ Content-Type: text/html
5
+ Content-Length: 15013
6
+ Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
7
+ Connection: keep-alive
8
+ Accept-Ranges: bytes
9
+
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <meta charset="utf-8" />
14
+ <title>Protocol-relative URLs</title>
15
+ </head>
16
+ <body>
17
+ <p>Internal links</p>
18
+ <a href="/">Internal: home page</a>
19
+ <a href="/faqs">Internal: FAQs</a>
20
+ <a href="//protocol-relative.com/contact">Internal: protocol-relative</a>
21
+
22
+ <p>External links</p>
23
+ <a href="http://google.com">External: normal link</a>
24
+ <a href="//yahoo.com">External: protocol-relative link</a>
25
+ </body>
26
+ </html>
@@ -17,6 +17,11 @@ describe MetaInspector do
17
17
  @m = MetaInspector.new('pagerankalert.com')
18
18
  @m.url.should == 'http://pagerankalert.com'
19
19
  end
20
+
21
+ it "should store the scheme" do
22
+ MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
23
+ MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
24
+ end
20
25
  end
21
26
 
22
27
  context 'Doing a basic scrape' do
@@ -76,19 +81,19 @@ describe MetaInspector do
76
81
  @m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
77
82
  end
78
83
  end
79
-
84
+
80
85
  context 'Page with missing meta description' do
81
86
  FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
82
-
83
- it "should find secondary description" do
87
+
88
+ it "should find secondary description" do
84
89
  @m = MetaInspector.new('http://theonion-no-description.com')
85
90
  @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
86
91
  " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
87
92
  end
88
-
93
+
89
94
  end
90
-
91
-
95
+
96
+
92
97
  context 'Links' do
93
98
  before(:each) do
94
99
  @m = MetaInspector.new('http://pagerankalert.com')
@@ -119,6 +124,28 @@ describe MetaInspector do
119
124
  end
120
125
  end
121
126
 
127
+
128
+ context 'Protocol-relative URLs' do
129
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
130
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
131
+
132
+ before(:each) do
133
+ @m_http = MetaInspector.new('http://protocol-relative.com')
134
+ @m_https = MetaInspector.new('https://protocol-relative.com')
135
+ end
136
+
137
+ it "should convert protocol-relative links to http" do
138
+ @m_http.absolute_links.should include('http://protocol-relative.com/contact')
139
+ @m_http.absolute_links.should include('http://yahoo.com')
140
+ end
141
+
142
+ it "should convert protocol-relative links to https" do
143
+ @m_https.absolute_links.should include('https://protocol-relative.com/contact')
144
+ @m_https.absolute_links.should include('https://yahoo.com')
145
+ end
146
+ end
147
+
148
+
122
149
  context 'Getting meta tags by ghost methods' do
123
150
  before(:each) do
124
151
  @m = MetaInspector.new('http://pagerankalert.com')
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
4
+ hash: 39
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 8
9
- - 7
10
- version: 1.8.7
9
+ - 8
10
+ version: 1.8.8
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-12-13 00:00:00 Z
18
+ date: 2011-12-30 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -158,6 +158,7 @@ files:
158
158
  - spec/fixtures/guardian.co.uk.response
159
159
  - spec/fixtures/iteh.at.response
160
160
  - spec/fixtures/pagerankalert.com.response
161
+ - spec/fixtures/protocol_relative.response
161
162
  - spec/fixtures/tea-tron.com.response
162
163
  - spec/fixtures/theonion-no-description.com.response
163
164
  - spec/fixtures/theonion.com.response