metainspector 1.8.9 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/meta_inspector/scraper.rb +5 -10
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/nonhttp.response +24 -0
- data/spec/metainspector_spec.rb +43 -39
- metadata +4 -3
@@ -31,9 +31,9 @@ module MetaInspector
|
|
31
31
|
|
32
32
|
# Returns the parsed document links
|
33
33
|
def links
|
34
|
-
@data.links ||=
|
35
|
-
|
36
|
-
|
34
|
+
@data.links ||= parsed_document.search("//a") \
|
35
|
+
.map {|link| link.attributes["href"] \
|
36
|
+
.to_s.strip}.uniq rescue nil
|
37
37
|
end
|
38
38
|
|
39
39
|
def images
|
@@ -137,8 +137,9 @@ module MetaInspector
|
|
137
137
|
private
|
138
138
|
|
139
139
|
# Convert a relative url like "/users" to an absolute one like "http://example.com/users"
|
140
|
+
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
140
141
|
def absolutify_url(url)
|
141
|
-
url =~
|
142
|
+
url =~ /^\w*\:/i ? url : File.join(@url,url)
|
142
143
|
end
|
143
144
|
|
144
145
|
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
|
@@ -146,12 +147,6 @@ module MetaInspector
|
|
146
147
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
147
148
|
end
|
148
149
|
|
149
|
-
# Remove mailto links
|
150
|
-
# TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
|
151
|
-
def remove_mailto(links)
|
152
|
-
links.reject {|l| l.index('mailto')}
|
153
|
-
end
|
154
|
-
|
155
150
|
# Look for the first <p> block with 120 characters or more
|
156
151
|
def secondary_description
|
157
152
|
(p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
|
@@ -0,0 +1,24 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Sample file non-http links</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
<a href="ftp://ftp.cdrom.com">an FTP link</a>
|
18
|
+
<a href="FTP://FTP.CDROM.COM">an uppercase FTP link</a>
|
19
|
+
<a href="javascript:alert('hey');">a javascript function</a>
|
20
|
+
<a href="mailto:user@example.com">an email</a>
|
21
|
+
<a href="skype:joeuser?call">a skype link</a>
|
22
|
+
<a href="telnet://telnet.cdrom.com">a telnet link</a>
|
23
|
+
</body>
|
24
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -3,11 +3,18 @@
|
|
3
3
|
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
4
|
|
5
5
|
describe MetaInspector do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
7
|
+
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
8
|
+
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
9
|
+
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
10
|
+
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
11
|
+
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
12
|
+
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
13
|
+
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
14
|
+
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
15
|
+
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
16
|
+
|
17
|
+
describe 'Initialization' do
|
11
18
|
it 'should accept an URL with a scheme' do
|
12
19
|
@m = MetaInspector.new('http://pagerankalert.com')
|
13
20
|
@m.url.should == 'http://pagerankalert.com'
|
@@ -24,14 +31,7 @@ describe MetaInspector do
|
|
24
31
|
end
|
25
32
|
end
|
26
33
|
|
27
|
-
|
28
|
-
|
29
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
30
|
-
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
31
|
-
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
32
|
-
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
33
|
-
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
34
|
-
|
34
|
+
describe 'Doing a basic scrape' do
|
35
35
|
EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
|
36
36
|
|
37
37
|
before(:each) do
|
@@ -82,19 +82,15 @@ describe MetaInspector do
|
|
82
82
|
end
|
83
83
|
end
|
84
84
|
|
85
|
-
|
86
|
-
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
87
|
-
|
85
|
+
describe 'Page with missing meta description' do
|
88
86
|
it "should find secondary description" do
|
89
87
|
@m = MetaInspector.new('http://theonion-no-description.com')
|
90
88
|
@m.description == "SAN FRANCISCO—In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
|
91
89
|
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
|
92
90
|
end
|
93
|
-
|
94
91
|
end
|
95
92
|
|
96
|
-
|
97
|
-
context 'Links' do
|
93
|
+
describe 'Links' do
|
98
94
|
before(:each) do
|
99
95
|
@m = MetaInspector.new('http://pagerankalert.com')
|
100
96
|
end
|
@@ -105,6 +101,7 @@ describe MetaInspector do
|
|
105
101
|
"/es?language=es",
|
106
102
|
"/users/sign_up",
|
107
103
|
"/users/sign_in",
|
104
|
+
"mailto:pagerankalert@gmail.com",
|
108
105
|
"http://pagerankalert.posterous.com",
|
109
106
|
"http://twitter.com/pagerankalert",
|
110
107
|
"http://twitter.com/share"
|
@@ -117,6 +114,7 @@ describe MetaInspector do
|
|
117
114
|
"http://pagerankalert.com/es?language=es",
|
118
115
|
"http://pagerankalert.com/users/sign_up",
|
119
116
|
"http://pagerankalert.com/users/sign_in",
|
117
|
+
"mailto:pagerankalert@gmail.com",
|
120
118
|
"http://pagerankalert.posterous.com",
|
121
119
|
"http://twitter.com/pagerankalert",
|
122
120
|
"http://twitter.com/share"
|
@@ -124,11 +122,28 @@ describe MetaInspector do
|
|
124
122
|
end
|
125
123
|
end
|
126
124
|
|
125
|
+
describe 'Non-HTTP links' do
|
126
|
+
before(:each) do
|
127
|
+
@m = MetaInspector.new('http://example.com/nonhttp')
|
128
|
+
end
|
127
129
|
|
128
|
-
|
129
|
-
|
130
|
-
|
130
|
+
it "should get the links" do
|
131
|
+
@m.links.sort.should == [
|
132
|
+
"FTP://FTP.CDROM.COM",
|
133
|
+
"ftp://ftp.cdrom.com",
|
134
|
+
"javascript:alert('hey');",
|
135
|
+
"mailto:user@example.com",
|
136
|
+
"skype:joeuser?call",
|
137
|
+
"telnet://telnet.cdrom.com"
|
138
|
+
]
|
139
|
+
end
|
131
140
|
|
141
|
+
it "should return the same links as absolute links do" do
|
142
|
+
@m.absolute_links.should == @m.links
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
describe 'Protocol-relative URLs' do
|
132
147
|
before(:each) do
|
133
148
|
@m_http = MetaInspector.new('http://protocol-relative.com')
|
134
149
|
@m_https = MetaInspector.new('https://protocol-relative.com')
|
@@ -145,8 +160,7 @@ describe MetaInspector do
|
|
145
160
|
end
|
146
161
|
end
|
147
162
|
|
148
|
-
|
149
|
-
context 'Getting meta tags by ghost methods' do
|
163
|
+
describe 'Getting meta tags by ghost methods' do
|
150
164
|
before(:each) do
|
151
165
|
@m = MetaInspector.new('http://pagerankalert.com')
|
152
166
|
end
|
@@ -197,32 +211,22 @@ describe MetaInspector do
|
|
197
211
|
|
198
212
|
end
|
199
213
|
|
200
|
-
|
201
|
-
|
202
|
-
FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
203
|
-
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
204
|
-
|
214
|
+
describe 'Charset detection' do
|
205
215
|
it "should detect windows-1252 charset" do
|
206
216
|
@m = MetaInspector.new('http://www.alazan.com')
|
207
217
|
@m.charset.should == "windows-1252"
|
208
218
|
end
|
209
219
|
|
210
220
|
it "should detect utf-8 charset" do
|
211
|
-
@m = MetaInspector.new('http://
|
221
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
212
222
|
@m.charset.should == "utf-8"
|
213
223
|
end
|
214
224
|
end
|
215
225
|
|
216
|
-
|
217
|
-
|
218
|
-
FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
219
|
-
|
226
|
+
describe 'to_hash' do
|
220
227
|
it "should return a hash with all the values set" do
|
221
|
-
@m = MetaInspector.new('http://
|
222
|
-
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://
|
228
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
229
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
223
230
|
end
|
224
|
-
|
225
231
|
end
|
226
|
-
|
227
|
-
|
228
232
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 51
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 8
|
9
8
|
- 9
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 1.9.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -157,6 +157,7 @@ files:
|
|
157
157
|
- spec/fixtures/alazan.com.response
|
158
158
|
- spec/fixtures/guardian.co.uk.response
|
159
159
|
- spec/fixtures/iteh.at.response
|
160
|
+
- spec/fixtures/nonhttp.response
|
160
161
|
- spec/fixtures/pagerankalert.com.response
|
161
162
|
- spec/fixtures/protocol_relative.response
|
162
163
|
- spec/fixtures/tea-tron.com.response
|