metainspector 1.8.9 → 1.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/meta_inspector/scraper.rb +5 -10
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/nonhttp.response +24 -0
- data/spec/metainspector_spec.rb +43 -39
- metadata +4 -3
@@ -31,9 +31,9 @@ module MetaInspector
|
|
31
31
|
|
32
32
|
# Returns the parsed document links
|
33
33
|
def links
|
34
|
-
@data.links ||=
|
35
|
-
|
36
|
-
|
34
|
+
@data.links ||= parsed_document.search("//a") \
|
35
|
+
.map {|link| link.attributes["href"] \
|
36
|
+
.to_s.strip}.uniq rescue nil
|
37
37
|
end
|
38
38
|
|
39
39
|
def images
|
@@ -137,8 +137,9 @@ module MetaInspector
|
|
137
137
|
private
|
138
138
|
|
139
139
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
140
|
+
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
140
141
|
def absolutify_url(url)
|
141
|
-
url =~
|
142
|
+
url =~ /^\w*\:/i ? url : File.join(@url,url)
|
142
143
|
end
|
143
144
|
|
144
145
|
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
@@ -146,12 +147,6 @@ module MetaInspector
|
|
146
147
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
147
148
|
end
|
148
149
|
|
149
|
-
# Remove mailto links
|
150
|
-
# TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
|
151
|
-
def remove_mailto(links)
|
152
|
-
links.reject {|l| l.index('mailto')}
|
153
|
-
end
|
154
|
-
|
155
150
|
# Look for the first <p> block with 120 characters or more
|
156
151
|
def secondary_description
|
157
152
|
(p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
|
@@ -0,0 +1,24 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Sample file non-http links</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<a href="ftp://ftp.cdrom.com">an FTP link</a>
|
18
|
+
<a href="FTP://FTP.CDROM.COM">an uppercase FTP link</a>
|
19
|
+
<a href="javascript:alert('hey');">a javascript function</a>
|
20
|
+
<a href="mailto:user@example.com">an email</a>
|
21
|
+
<a href="skype:joeuser?call">a skype link</a>
|
22
|
+
<a href="telnet://telnet.cdrom.com">a telnet link</a>
|
23
|
+
</body>
|
24
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -3,11 +3,18 @@
|
|
3
3
|
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
4
|
|
5
5
|
describe MetaInspector do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
7
|
+
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
8
|
+
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
9
|
+
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
10
|
+
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
11
|
+
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
12
|
+
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
13
|
+
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
14
|
+
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
15
|
+
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
16
|
+
|
17
|
+
describe 'Initialization' do
|
11
18
|
it 'should accept an URL with a scheme' do
|
12
19
|
@m = MetaInspector.new('http://pagerankalert.com')
|
13
20
|
@m.url.should == 'http://pagerankalert.com'
|
@@ -24,14 +31,7 @@ describe MetaInspector do
|
|
24
31
|
end
|
25
32
|
end
|
26
33
|
|
27
|
-
|
28
|
-
|
29
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
30
|
-
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
31
|
-
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
32
|
-
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
33
|
-
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
34
|
-
|
34
|
+
describe 'Doing a basic scrape' do
|
35
35
|
EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
|
36
36
|
|
37
37
|
before(:each) do
|
@@ -82,19 +82,15 @@ describe MetaInspector do
|
|
82
82
|
end
|
83
83
|
end
|
84
84
|
|
85
|
-
|
86
|
-
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
87
|
-
|
85
|
+
describe 'Page with missing meta description' do
|
88
86
|
it "should find secondary description" do
|
89
87
|
@m = MetaInspector.new('http://theonion-no-description.com')
|
90
88
|
@m.description == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
|
91
89
|
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
92
90
|
end
|
93
|
-
|
94
91
|
end
|
95
92
|
|
96
|
-
|
97
|
-
context 'Links' do
|
93
|
+
describe 'Links' do
|
98
94
|
before(:each) do
|
99
95
|
@m = MetaInspector.new('http://pagerankalert.com')
|
100
96
|
end
|
@@ -105,6 +101,7 @@ describe MetaInspector do
|
|
105
101
|
"/es?language=es",
|
106
102
|
"/users/sign_up",
|
107
103
|
"/users/sign_in",
|
104
|
+
"mailto:pagerankalert@gmail.com",
|
108
105
|
"http://pagerankalert.posterous.com",
|
109
106
|
"http://twitter.com/pagerankalert",
|
110
107
|
"http://twitter.com/share"
|
@@ -117,6 +114,7 @@ describe MetaInspector do
|
|
117
114
|
"http://pagerankalert.com/es?language=es",
|
118
115
|
"http://pagerankalert.com/users/sign_up",
|
119
116
|
"http://pagerankalert.com/users/sign_in",
|
117
|
+
"mailto:pagerankalert@gmail.com",
|
120
118
|
"http://pagerankalert.posterous.com",
|
121
119
|
"http://twitter.com/pagerankalert",
|
122
120
|
"http://twitter.com/share"
|
@@ -124,11 +122,28 @@ describe MetaInspector do
|
|
124
122
|
end
|
125
123
|
end
|
126
124
|
|
125
|
+
describe 'Non-HTTP links' do
|
126
|
+
before(:each) do
|
127
|
+
@m = MetaInspector.new('http://example.com/nonhttp')
|
128
|
+
end
|
127
129
|
|
128
|
-
|
129
|
-
|
130
|
-
|
130
|
+
it "should get the links" do
|
131
|
+
@m.links.sort.should == [
|
132
|
+
"FTP://FTP.CDROM.COM",
|
133
|
+
"ftp://ftp.cdrom.com",
|
134
|
+
"javascript:alert('hey');",
|
135
|
+
"mailto:user@example.com",
|
136
|
+
"skype:joeuser?call",
|
137
|
+
"telnet://telnet.cdrom.com"
|
138
|
+
]
|
139
|
+
end
|
131
140
|
|
141
|
+
it "should return the same links as absolute links do" do
|
142
|
+
@m.absolute_links.should == @m.links
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
describe 'Protocol-relative URLs' do
|
132
147
|
before(:each) do
|
133
148
|
@m_http = MetaInspector.new('http://protocol-relative.com')
|
134
149
|
@m_https = MetaInspector.new('https://protocol-relative.com')
|
@@ -145,8 +160,7 @@ describe MetaInspector do
|
|
145
160
|
end
|
146
161
|
end
|
147
162
|
|
148
|
-
|
149
|
-
context 'Getting meta tags by ghost methods' do
|
163
|
+
describe 'Getting meta tags by ghost methods' do
|
150
164
|
before(:each) do
|
151
165
|
@m = MetaInspector.new('http://pagerankalert.com')
|
152
166
|
end
|
@@ -197,32 +211,22 @@ describe MetaInspector do
|
|
197
211
|
|
198
212
|
end
|
199
213
|
|
200
|
-
|
201
|
-
|
202
|
-
FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
203
|
-
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
204
|
-
|
214
|
+
describe 'Charset detection' do
|
205
215
|
it "should detect windows-1252 charset" do
|
206
216
|
@m = MetaInspector.new('http://www.alazan.com')
|
207
217
|
@m.charset.should == "windows-1252"
|
208
218
|
end
|
209
219
|
|
210
220
|
it "should detect utf-8 charset" do
|
211
|
-
@m = MetaInspector.new('http://
|
221
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
212
222
|
@m.charset.should == "utf-8"
|
213
223
|
end
|
214
224
|
end
|
215
225
|
|
216
|
-
|
217
|
-
|
218
|
-
FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
219
|
-
|
226
|
+
describe 'to_hash' do
|
220
227
|
it "should return a hash with all the values set" do
|
221
|
-
@m = MetaInspector.new('http://
|
222
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://
|
228
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
229
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
223
230
|
end
|
224
|
-
|
225
231
|
end
|
226
|
-
|
227
|
-
|
228
232
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 51
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 8
|
9
8
|
- 9
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 1.9.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -157,6 +157,7 @@ files:
|
|
157
157
|
- spec/fixtures/alazan.com.response
|
158
158
|
- spec/fixtures/guardian.co.uk.response
|
159
159
|
- spec/fixtures/iteh.at.response
|
160
|
+
- spec/fixtures/nonhttp.response
|
160
161
|
- spec/fixtures/pagerankalert.com.response
|
161
162
|
- spec/fixtures/protocol_relative.response
|
162
163
|
- spec/fixtures/tea-tron.com.response
|