metainspector 1.8.7 → 1.8.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -0
- data/lib/meta_inspector/scraper.rb +10 -4
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/protocol_relative.response +26 -0
- data/spec/metainspector_spec.rb +33 -6
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -28,6 +28,7 @@ by defaul:
|
|
28
28
|
Then you can see the scraped data like this:
|
29
29
|
|
30
30
|
page.url # URL of the page
|
31
|
+
page.scheme # Scheme of the page (http, https)
|
31
32
|
page.title # title of the page, as string
|
32
33
|
page.links # array of strings, with every link found on the page
|
33
34
|
page.absolute_links # array of all the links converted to absolute urls
|
data/lib/meta_inspector/scraper.rb
CHANGED
@@ -9,12 +9,13 @@ require 'hashie/rash'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :url
|
12
|
+
attr_reader :url, :scheme
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
14
|
# If no scheme given, set it to http:// by default
|
15
15
|
def initialize(url)
|
16
|
-
@url
|
17
|
-
@
|
16
|
+
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
17
|
+
@scheme = URI.parse(url).scheme || 'http'
|
18
|
+
@data = Hashie::Rash.new('url' => @url)
|
18
19
|
end
|
19
20
|
|
20
21
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -44,7 +45,7 @@ module MetaInspector
|
|
44
45
|
|
45
46
|
# Returns the links converted to absolute urls
|
46
47
|
def absolute_links
|
47
|
-
@data.absolute_links ||= links.map { |l| absolutify_url(l) }
|
48
|
+
@data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
|
48
49
|
end
|
49
50
|
|
50
51
|
def absolute_images
|
@@ -137,6 +138,11 @@ module MetaInspector
|
|
137
138
|
url =~ /^http.*/ ? url : File.join(@url,url)
|
138
139
|
end
|
139
140
|
|
141
|
+
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
142
|
+
def unrelativize_url(url)
|
143
|
+
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
144
|
+
end
|
145
|
+
|
140
146
|
# Remove mailto links
|
141
147
|
# TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
|
142
148
|
def remove_mailto(links)
|
data/spec/fixtures/protocol_relative.response
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/1.0.5
|
3
|
+
Date: Thu, 29 Dec 2011 23:10:13 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Content-Length: 15013
|
6
|
+
Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
|
7
|
+
Connection: keep-alive
|
8
|
+
Accept-Ranges: bytes
|
9
|
+
|
10
|
+
<!DOCTYPE html>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<meta charset="utf-8" />
|
14
|
+
<title>Protocol-relative URLs</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<p>Internal links</p>
|
18
|
+
<a href="/">Internal: home page</a>
|
19
|
+
<a href="/faqs">Internal: FAQs</a>
|
20
|
+
<a href="//protocol-relative.com/contact">Internal: protocol-relative</a>
|
21
|
+
|
22
|
+
<p>External links</p>
|
23
|
+
<a href="http://google.com">External: normal link</a>
|
24
|
+
<a href="//yahoo.com">External: protocol-relative link</a>
|
25
|
+
</body>
|
26
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -17,6 +17,11 @@ describe MetaInspector do
|
|
17
17
|
@m = MetaInspector.new('pagerankalert.com')
|
18
18
|
@m.url.should == 'http://pagerankalert.com'
|
19
19
|
end
|
20
|
+
|
21
|
+
it "should store the scheme" do
|
22
|
+
MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
|
23
|
+
MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
|
24
|
+
end
|
20
25
|
end
|
21
26
|
|
22
27
|
context 'Doing a basic scrape' do
|
@@ -76,19 +81,19 @@ describe MetaInspector do
|
|
76
81
|
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
|
77
82
|
end
|
78
83
|
end
|
79
|
-
|
84
|
+
|
80
85
|
context 'Page with missing meta description' do
|
81
86
|
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
82
|
-
|
83
|
-
it "should find secondary description" do
|
87
|
+
|
88
|
+
it "should find secondary description" do
|
84
89
|
@m = MetaInspector.new('http://theonion-no-description.com')
|
85
90
|
@m.description == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
|
86
91
|
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
87
92
|
end
|
88
|
-
|
93
|
+
|
89
94
|
end
|
90
|
-
|
91
|
-
|
95
|
+
|
96
|
+
|
92
97
|
context 'Links' do
|
93
98
|
before(:each) do
|
94
99
|
@m = MetaInspector.new('http://pagerankalert.com')
|
@@ -119,6 +124,28 @@ describe MetaInspector do
|
|
119
124
|
end
|
120
125
|
end
|
121
126
|
|
127
|
+
|
128
|
+
context 'Protocol-relative URLs' do
|
129
|
+
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
130
|
+
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
131
|
+
|
132
|
+
before(:each) do
|
133
|
+
@m_http = MetaInspector.new('http://protocol-relative.com')
|
134
|
+
@m_https = MetaInspector.new('https://protocol-relative.com')
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should convert protocol-relative links to http" do
|
138
|
+
@m_http.absolute_links.should include('http://protocol-relative.com/contact')
|
139
|
+
@m_http.absolute_links.should include('http://yahoo.com')
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should convert protocol-relative links to https" do
|
143
|
+
@m_https.absolute_links.should include('https://protocol-relative.com/contact')
|
144
|
+
@m_https.absolute_links.should include('https://yahoo.com')
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
|
122
149
|
context 'Getting meta tags by ghost methods' do
|
123
150
|
before(:each) do
|
124
151
|
@m = MetaInspector.new('http://pagerankalert.com')
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 39
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 8
|
9
|
-
- 7
|
10
|
-
version: 1.8.7
|
9
|
+
- 8
|
10
|
+
version: 1.8.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-12-
|
18
|
+
date: 2011-12-30 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -158,6 +158,7 @@ files:
|
|
158
158
|
- spec/fixtures/guardian.co.uk.response
|
159
159
|
- spec/fixtures/iteh.at.response
|
160
160
|
- spec/fixtures/pagerankalert.com.response
|
161
|
+
- spec/fixtures/protocol_relative.response
|
161
162
|
- spec/fixtures/tea-tron.com.response
|
162
163
|
- spec/fixtures/theonion-no-description.com.response
|
163
164
|
- spec/fixtures/theonion.com.response
|