metainspector 1.8.9 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -31,9 +31,9 @@ module MetaInspector
31
31
 
32
32
  # Returns the parsed document links
33
33
  def links
34
- @data.links ||= remove_mailto(parsed_document.search("//a") \
35
- .map {|link| link.attributes["href"] \
36
- .to_s.strip}.uniq) rescue nil
34
+ @data.links ||= parsed_document.search("//a") \
35
+ .map {|link| link.attributes["href"] \
36
+ .to_s.strip}.uniq rescue nil
37
37
  end
38
38
 
39
39
  def images
@@ -137,8 +137,9 @@ module MetaInspector
137
137
  private
138
138
 
139
139
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
140
+ # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
140
141
  def absolutify_url(url)
141
- url =~ /^http.*/ ? url : File.join(@url,url)
142
+ url =~ /^\w*\:/i ? url : File.join(@url,url)
142
143
  end
143
144
 
144
145
  # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
@@ -146,12 +147,6 @@ module MetaInspector
146
147
  url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
147
148
  end
148
149
 
149
- # Remove mailto links
150
- # TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
151
- def remove_mailto(links)
152
- links.reject {|l| l.index('mailto')}
153
- end
154
-
155
150
  # Look for the first <p> block with 120 characters or more
156
151
  def secondary_description
157
152
  (p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.8.9"
4
+ VERSION = "1.9.0"
5
5
  end
@@ -0,0 +1,24 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Sample file non-http links</title>
15
+ </head>
16
+ <body>
17
+ <a href="ftp://ftp.cdrom.com">an FTP link</a>
18
+ <a href="FTP://FTP.CDROM.COM">an uppercase FTP link</a>
19
+ <a href="javascript:alert('hey');">a javascript function</a>
20
+ <a href="mailto:user@example.com">an email</a>
21
+ <a href="skype:joeuser?call">a skype link</a>
22
+ <a href="telnet://telnet.cdrom.com">a telnet link</a>
23
+ </body>
24
+ </html>
@@ -3,11 +3,18 @@
3
3
  require File.join(File.dirname(__FILE__), "/spec_helper")
4
4
 
5
5
  describe MetaInspector do
6
-
7
- context 'Initialization' do
8
-
9
- FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
10
-
6
+ FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
7
+ FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
8
+ FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
9
+ FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
10
+ FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
11
+ FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
12
+ FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
13
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
14
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
15
+ FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
16
+
17
+ describe 'Initialization' do
11
18
  it 'should accept an URL with a scheme' do
12
19
  @m = MetaInspector.new('http://pagerankalert.com')
13
20
  @m.url.should == 'http://pagerankalert.com'
@@ -24,14 +31,7 @@ describe MetaInspector do
24
31
  end
25
32
  end
26
33
 
27
- context 'Doing a basic scrape' do
28
-
29
- FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
30
- FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
31
- FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
32
- FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
33
- FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
34
-
34
+ describe 'Doing a basic scrape' do
35
35
  EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
36
36
 
37
37
  before(:each) do
@@ -82,19 +82,15 @@ describe MetaInspector do
82
82
  end
83
83
  end
84
84
 
85
- context 'Page with missing meta description' do
86
- FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
87
-
85
+ describe 'Page with missing meta description' do
88
86
  it "should find secondary description" do
89
87
  @m = MetaInspector.new('http://theonion-no-description.com')
90
88
  @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
91
89
  " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
92
90
  end
93
-
94
91
  end
95
92
 
96
-
97
- context 'Links' do
93
+ describe 'Links' do
98
94
  before(:each) do
99
95
  @m = MetaInspector.new('http://pagerankalert.com')
100
96
  end
@@ -105,6 +101,7 @@ describe MetaInspector do
105
101
  "/es?language=es",
106
102
  "/users/sign_up",
107
103
  "/users/sign_in",
104
+ "mailto:pagerankalert@gmail.com",
108
105
  "http://pagerankalert.posterous.com",
109
106
  "http://twitter.com/pagerankalert",
110
107
  "http://twitter.com/share"
@@ -117,6 +114,7 @@ describe MetaInspector do
117
114
  "http://pagerankalert.com/es?language=es",
118
115
  "http://pagerankalert.com/users/sign_up",
119
116
  "http://pagerankalert.com/users/sign_in",
117
+ "mailto:pagerankalert@gmail.com",
120
118
  "http://pagerankalert.posterous.com",
121
119
  "http://twitter.com/pagerankalert",
122
120
  "http://twitter.com/share"
@@ -124,11 +122,28 @@ describe MetaInspector do
124
122
  end
125
123
  end
126
124
 
125
+ describe 'Non-HTTP links' do
126
+ before(:each) do
127
+ @m = MetaInspector.new('http://example.com/nonhttp')
128
+ end
127
129
 
128
- context 'Protocol-relative URLs' do
129
- FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
130
- FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
130
+ it "should get the links" do
131
+ @m.links.sort.should == [
132
+ "FTP://FTP.CDROM.COM",
133
+ "ftp://ftp.cdrom.com",
134
+ "javascript:alert('hey');",
135
+ "mailto:user@example.com",
136
+ "skype:joeuser?call",
137
+ "telnet://telnet.cdrom.com"
138
+ ]
139
+ end
131
140
 
141
+ it "should return the same links as absolute links do" do
142
+ @m.absolute_links.should == @m.links
143
+ end
144
+ end
145
+
146
+ describe 'Protocol-relative URLs' do
132
147
  before(:each) do
133
148
  @m_http = MetaInspector.new('http://protocol-relative.com')
134
149
  @m_https = MetaInspector.new('https://protocol-relative.com')
@@ -145,8 +160,7 @@ describe MetaInspector do
145
160
  end
146
161
  end
147
162
 
148
-
149
- context 'Getting meta tags by ghost methods' do
163
+ describe 'Getting meta tags by ghost methods' do
150
164
  before(:each) do
151
165
  @m = MetaInspector.new('http://pagerankalert.com')
152
166
  end
@@ -197,32 +211,22 @@ describe MetaInspector do
197
211
 
198
212
  end
199
213
 
200
- context 'Charset detection' do
201
-
202
- FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
203
- FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
204
-
214
+ describe 'Charset detection' do
205
215
  it "should detect windows-1252 charset" do
206
216
  @m = MetaInspector.new('http://www.alazan.com')
207
217
  @m.charset.should == "windows-1252"
208
218
  end
209
219
 
210
220
  it "should detect utf-8 charset" do
211
- @m = MetaInspector.new('http://www.pagerankalert.com')
221
+ @m = MetaInspector.new('http://pagerankalert.com')
212
222
  @m.charset.should == "utf-8"
213
223
  end
214
224
  end
215
225
 
216
- context 'to_hash' do
217
-
218
- FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
219
-
226
+ describe 'to_hash' do
220
227
  it "should return a hash with all the values set" do
221
- @m = MetaInspector.new('http://www.pagerankalert.com')
222
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://www.pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://www.pagerankalert.com/", "http://www.pagerankalert.com/es?language=es", "http://www.pagerankalert.com/users/sign_up", "http://www.pagerankalert.com/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
228
+ @m = MetaInspector.new('http://pagerankalert.com')
229
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
223
230
  end
224
-
225
231
  end
226
-
227
-
228
232
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 37
4
+ hash: 51
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 8
9
8
  - 9
10
- version: 1.8.9
9
+ - 0
10
+ version: 1.9.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -157,6 +157,7 @@ files:
157
157
  - spec/fixtures/alazan.com.response
158
158
  - spec/fixtures/guardian.co.uk.response
159
159
  - spec/fixtures/iteh.at.response
160
+ - spec/fixtures/nonhttp.response
160
161
  - spec/fixtures/pagerankalert.com.response
161
162
  - spec/fixtures/protocol_relative.response
162
163
  - spec/fixtures/tea-tron.com.response