metainspector 1.8.7 → 1.8.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -0
- data/lib/meta_inspector/scraper.rb +10 -4
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/protocol_relative.response +26 -0
- data/spec/metainspector_spec.rb +33 -6
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -28,6 +28,7 @@ by default:
|
|
28
28
|
Then you can see the scraped data like this:
|
29
29
|
|
30
30
|
page.url # URL of the page
|
31
|
+
page.scheme # Scheme of the page (http, https)
|
31
32
|
page.title # title of the page, as string
|
32
33
|
page.links # array of strings, with every link found on the page
|
33
34
|
page.absolute_links # array of all the links converted to absolute urls
|
@@ -9,12 +9,13 @@ require 'hashie/rash'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :url
|
12
|
+
attr_reader :url, :scheme
|
13
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
14
|
# If no scheme given, set it to http:// by default
|
15
15
|
def initialize(url)
|
16
|
-
@url
|
17
|
-
@
|
16
|
+
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
17
|
+
@scheme = URI.parse(url).scheme || 'http'
|
18
|
+
@data = Hashie::Rash.new('url' => @url)
|
18
19
|
end
|
19
20
|
|
20
21
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -44,7 +45,7 @@ module MetaInspector
|
|
44
45
|
|
45
46
|
# Returns the links converted to absolute urls
|
46
47
|
def absolute_links
|
47
|
-
@data.absolute_links ||= links.map { |l| absolutify_url(l) }
|
48
|
+
@data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
|
48
49
|
end
|
49
50
|
|
50
51
|
def absolute_images
|
@@ -137,6 +138,11 @@ module MetaInspector
|
|
137
138
|
url =~ /^http.*/ ? url : File.join(@url,url)
|
138
139
|
end
|
139
140
|
|
141
|
+
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
142
|
+
def unrelativize_url(url)
|
143
|
+
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
144
|
+
end
|
145
|
+
|
140
146
|
# Remove mailto links
|
141
147
|
# TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
|
142
148
|
def remove_mailto(links)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/1.0.5
|
3
|
+
Date: Thu, 29 Dec 2011 23:10:13 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Content-Length: 15013
|
6
|
+
Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
|
7
|
+
Connection: keep-alive
|
8
|
+
Accept-Ranges: bytes
|
9
|
+
|
10
|
+
<!DOCTYPE html>
|
11
|
+
<html>
|
12
|
+
<head>
|
13
|
+
<meta charset="utf-8" />
|
14
|
+
<title>Protocol-relative URLs</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<p>Internal links</p>
|
18
|
+
<a href="/">Internal: home page</a>
|
19
|
+
<a href="/faqs">Internal: FAQs</a>
|
20
|
+
<a href="//protocol-relative.com/contact">Internal: protocol-relative</a>
|
21
|
+
|
22
|
+
<p>External links</p>
|
23
|
+
<a href="http://google.com">External: normal link</a>
|
24
|
+
<a href="//yahoo.com">External: protocol-relative link</a>
|
25
|
+
</body>
|
26
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -17,6 +17,11 @@ describe MetaInspector do
|
|
17
17
|
@m = MetaInspector.new('pagerankalert.com')
|
18
18
|
@m.url.should == 'http://pagerankalert.com'
|
19
19
|
end
|
20
|
+
|
21
|
+
it "should store the scheme" do
|
22
|
+
MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
|
23
|
+
MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
|
24
|
+
end
|
20
25
|
end
|
21
26
|
|
22
27
|
context 'Doing a basic scrape' do
|
@@ -76,19 +81,19 @@ describe MetaInspector do
|
|
76
81
|
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
|
77
82
|
end
|
78
83
|
end
|
79
|
-
|
84
|
+
|
80
85
|
context 'Page with missing meta description' do
|
81
86
|
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
82
|
-
|
83
|
-
it "should find secondary description" do
|
87
|
+
|
88
|
+
it "should find secondary description" do
|
84
89
|
@m = MetaInspector.new('http://theonion-no-description.com')
|
85
90
|
@m.description == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
|
86
91
|
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
87
92
|
end
|
88
|
-
|
93
|
+
|
89
94
|
end
|
90
|
-
|
91
|
-
|
95
|
+
|
96
|
+
|
92
97
|
context 'Links' do
|
93
98
|
before(:each) do
|
94
99
|
@m = MetaInspector.new('http://pagerankalert.com')
|
@@ -119,6 +124,28 @@ describe MetaInspector do
|
|
119
124
|
end
|
120
125
|
end
|
121
126
|
|
127
|
+
|
128
|
+
context 'Protocol-relative URLs' do
|
129
|
+
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
130
|
+
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
131
|
+
|
132
|
+
before(:each) do
|
133
|
+
@m_http = MetaInspector.new('http://protocol-relative.com')
|
134
|
+
@m_https = MetaInspector.new('https://protocol-relative.com')
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should convert protocol-relative links to http" do
|
138
|
+
@m_http.absolute_links.should include('http://protocol-relative.com/contact')
|
139
|
+
@m_http.absolute_links.should include('http://yahoo.com')
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should convert protocol-relative links to https" do
|
143
|
+
@m_https.absolute_links.should include('https://protocol-relative.com/contact')
|
144
|
+
@m_https.absolute_links.should include('https://yahoo.com')
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
|
122
149
|
context 'Getting meta tags by ghost methods' do
|
123
150
|
before(:each) do
|
124
151
|
@m = MetaInspector.new('http://pagerankalert.com')
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 39
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 8
|
9
|
-
- 7
|
10
|
-
version: 1.8.7
|
9
|
+
- 8
|
10
|
+
version: 1.8.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-12-
|
18
|
+
date: 2011-12-30 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -158,6 +158,7 @@ files:
|
|
158
158
|
- spec/fixtures/guardian.co.uk.response
|
159
159
|
- spec/fixtures/iteh.at.response
|
160
160
|
- spec/fixtures/pagerankalert.com.response
|
161
|
+
- spec/fixtures/protocol_relative.response
|
161
162
|
- spec/fixtures/tea-tron.com.response
|
162
163
|
- spec/fixtures/theonion-no-description.com.response
|
163
164
|
- spec/fixtures/theonion.com.response
|