metainspector 1.8.9 → 1.9.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -31,9 +31,9 @@ module MetaInspector
31
31
 
32
32
  # Returns the parsed document links
33
33
  def links
34
- @data.links ||= remove_mailto(parsed_document.search("//a") \
35
- .map {|link| link.attributes["href"] \
36
- .to_s.strip}.uniq) rescue nil
34
+ @data.links ||= parsed_document.search("//a") \
35
+ .map {|link| link.attributes["href"] \
36
+ .to_s.strip}.uniq rescue nil
37
37
  end
38
38
 
39
39
  def images
@@ -137,8 +137,9 @@ module MetaInspector
137
137
  private
138
138
 
139
139
  # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
140
+ # URLs that are already absolute (those starting with a scheme such as http:, ftp:, telnet:, mailto:, javascript:, ...) are returned unchanged.
140
141
  def absolutify_url(url)
141
- url =~ /^http.*/ ? url : File.join(@url,url)
142
+ url =~ /^\w*\:/i ? url : File.join(@url,url)
142
143
  end
143
144
 
144
145
  # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
@@ -146,12 +147,6 @@ module MetaInspector
146
147
  url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
147
148
  end
148
149
 
149
- # Remove mailto links
150
- # TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
151
- def remove_mailto(links)
152
- links.reject {|l| l.index('mailto')}
153
- end
154
-
155
150
  # Look for the first <p> block with more than 120 characters
156
151
  def secondary_description
157
152
  (p = parsed_document.search('//p').map(&:text).select{ |p| p.length > 120 }.first).nil? ? '' : p
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.8.9"
4
+ VERSION = "1.9.0"
5
5
  end
@@ -0,0 +1,24 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Sample file non-http links</title>
15
+ </head>
16
+ <body>
17
+ <a href="ftp://ftp.cdrom.com">an FTP link</a>
18
+ <a href="FTP://FTP.CDROM.COM">an uppercase FTP link</a>
19
+ <a href="javascript:alert('hey');">a javascript function</a>
20
+ <a href="mailto:user@example.com">an email</a>
21
+ <a href="skype:joeuser?call">a skype link</a>
22
+ <a href="telnet://telnet.cdrom.com">a telnet link</a>
23
+ </body>
24
+ </html>
@@ -3,11 +3,18 @@
3
3
  require File.join(File.dirname(__FILE__), "/spec_helper")
4
4
 
5
5
  describe MetaInspector do
6
-
7
- context 'Initialization' do
8
-
9
- FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
10
-
6
+ FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
7
+ FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
8
+ FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
9
+ FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
10
+ FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
11
+ FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
12
+ FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
13
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
14
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
15
+ FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
16
+
17
+ describe 'Initialization' do
11
18
  it 'should accept an URL with a scheme' do
12
19
  @m = MetaInspector.new('http://pagerankalert.com')
13
20
  @m.url.should == 'http://pagerankalert.com'
@@ -24,14 +31,7 @@ describe MetaInspector do
24
31
  end
25
32
  end
26
33
 
27
- context 'Doing a basic scrape' do
28
-
29
- FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
30
- FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
31
- FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
32
- FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
33
- FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
34
-
34
+ describe 'Doing a basic scrape' do
35
35
  EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
36
36
 
37
37
  before(:each) do
@@ -82,19 +82,15 @@ describe MetaInspector do
82
82
  end
83
83
  end
84
84
 
85
- context 'Page with missing meta description' do
86
- FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
87
-
85
+ describe 'Page with missing meta description' do
88
86
  it "should find secondary description" do
89
87
  @m = MetaInspector.new('http://theonion-no-description.com')
90
88
  @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
91
89
  " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
92
90
  end
93
-
94
91
  end
95
92
 
96
-
97
- context 'Links' do
93
+ describe 'Links' do
98
94
  before(:each) do
99
95
  @m = MetaInspector.new('http://pagerankalert.com')
100
96
  end
@@ -105,6 +101,7 @@ describe MetaInspector do
105
101
  "/es?language=es",
106
102
  "/users/sign_up",
107
103
  "/users/sign_in",
104
+ "mailto:pagerankalert@gmail.com",
108
105
  "http://pagerankalert.posterous.com",
109
106
  "http://twitter.com/pagerankalert",
110
107
  "http://twitter.com/share"
@@ -117,6 +114,7 @@ describe MetaInspector do
117
114
  "http://pagerankalert.com/es?language=es",
118
115
  "http://pagerankalert.com/users/sign_up",
119
116
  "http://pagerankalert.com/users/sign_in",
117
+ "mailto:pagerankalert@gmail.com",
120
118
  "http://pagerankalert.posterous.com",
121
119
  "http://twitter.com/pagerankalert",
122
120
  "http://twitter.com/share"
@@ -124,11 +122,28 @@ describe MetaInspector do
124
122
  end
125
123
  end
126
124
 
125
+ describe 'Non-HTTP links' do
126
+ before(:each) do
127
+ @m = MetaInspector.new('http://example.com/nonhttp')
128
+ end
127
129
 
128
- context 'Protocol-relative URLs' do
129
- FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
130
- FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
130
+ it "should get the links" do
131
+ @m.links.sort.should == [
132
+ "FTP://FTP.CDROM.COM",
133
+ "ftp://ftp.cdrom.com",
134
+ "javascript:alert('hey');",
135
+ "mailto:user@example.com",
136
+ "skype:joeuser?call",
137
+ "telnet://telnet.cdrom.com"
138
+ ]
139
+ end
131
140
 
141
+ it "should return the same links as absolute links do" do
142
+ @m.absolute_links.should == @m.links
143
+ end
144
+ end
145
+
146
+ describe 'Protocol-relative URLs' do
132
147
  before(:each) do
133
148
  @m_http = MetaInspector.new('http://protocol-relative.com')
134
149
  @m_https = MetaInspector.new('https://protocol-relative.com')
@@ -145,8 +160,7 @@ describe MetaInspector do
145
160
  end
146
161
  end
147
162
 
148
-
149
- context 'Getting meta tags by ghost methods' do
163
+ describe 'Getting meta tags by ghost methods' do
150
164
  before(:each) do
151
165
  @m = MetaInspector.new('http://pagerankalert.com')
152
166
  end
@@ -197,32 +211,22 @@ describe MetaInspector do
197
211
 
198
212
  end
199
213
 
200
- context 'Charset detection' do
201
-
202
- FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
203
- FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
204
-
214
+ describe 'Charset detection' do
205
215
  it "should detect windows-1252 charset" do
206
216
  @m = MetaInspector.new('http://www.alazan.com')
207
217
  @m.charset.should == "windows-1252"
208
218
  end
209
219
 
210
220
  it "should detect utf-8 charset" do
211
- @m = MetaInspector.new('http://www.pagerankalert.com')
221
+ @m = MetaInspector.new('http://pagerankalert.com')
212
222
  @m.charset.should == "utf-8"
213
223
  end
214
224
  end
215
225
 
216
- context 'to_hash' do
217
-
218
- FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
219
-
226
+ describe 'to_hash' do
220
227
  it "should return a hash with all the values set" do
221
- @m = MetaInspector.new('http://www.pagerankalert.com')
222
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://www.pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://www.pagerankalert.com/", "http://www.pagerankalert.com/es?language=es", "http://www.pagerankalert.com/users/sign_up", "http://www.pagerankalert.com/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
228
+ @m = MetaInspector.new('http://pagerankalert.com')
229
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
223
230
  end
224
-
225
231
  end
226
-
227
-
228
232
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 37
4
+ hash: 51
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 8
9
8
  - 9
10
- version: 1.8.9
9
+ - 0
10
+ version: 1.9.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -157,6 +157,7 @@ files:
157
157
  - spec/fixtures/alazan.com.response
158
158
  - spec/fixtures/guardian.co.uk.response
159
159
  - spec/fixtures/iteh.at.response
160
+ - spec/fixtures/nonhttp.response
160
161
  - spec/fixtures/pagerankalert.com.response
161
162
  - spec/fixtures/protocol_relative.response
162
163
  - spec/fixtures/tea-tron.com.response