metainspector 1.15.0 → 1.15.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'open_uri_redirections'
5
+ require 'addressable/uri'
5
6
  require 'nokogiri'
6
7
  require 'hashie/rash'
7
8
  require 'timeout'
@@ -22,7 +23,7 @@ module MetaInspector
22
23
  def initialize(url, options = {})
23
24
  options = defaults.merge(options)
24
25
 
25
- @url = with_default_scheme(encode_url(url))
26
+ @url = with_default_scheme(normalize_url(url))
26
27
  @scheme = URI.parse(@url).scheme
27
28
  @host = URI.parse(@url).host
28
29
  @root_url = "#{@scheme}://#{@host}/"
@@ -49,7 +50,7 @@ module MetaInspector
49
50
 
50
51
  # Links found on the page, as absolute URLs
51
52
  def links
52
- @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
53
+ @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
53
54
  end
54
55
 
55
56
  # Internal links found on the page, as absolute URLs
@@ -217,9 +218,9 @@ module MetaInspector
217
218
  @errors << error
218
219
  end
219
220
 
220
- # Encode url to deal with international characters
221
- def encode_url(url)
222
- URI.encode(url).to_s.gsub("%23", "#")
221
+ # Normalize url to deal with characters that should be encoded, add trailing slash, convert to downcase...
222
+ def normalize_url(url)
223
+ Addressable::URI.parse(url).normalize.to_s
223
224
  end
224
225
 
225
226
  # Adds 'http' as default scheme, if there is none
@@ -231,11 +232,11 @@ module MetaInspector
231
232
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
232
233
  def absolutify_url(url)
233
234
  if url =~ /^\w*\:/i
234
- encode_url(url)
235
+ normalize_url(url)
235
236
  else
236
- URI.parse(root_url).merge(encode_url(url)).to_s
237
+ URI.parse(root_url).merge(normalize_url(url)).to_s
237
238
  end
238
- rescue URI::InvalidURIError => e
239
+ rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
239
240
  add_fatal_error "Link parsing exception: #{e.message}" and nil
240
241
  end
241
242
 
@@ -247,7 +248,7 @@ module MetaInspector
247
248
  # Extracts the host from a given URL
248
249
  def host_from_url(url)
249
250
  URI.parse(url).host
250
- rescue URI::InvalidURIError, URI::InvalidComponentError => e
251
+ rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
251
252
  add_fatal_error "Link parsing exception: #{e.message}" and nil
252
253
  end
253
254
 
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.15.0"
4
+ VERSION = "1.15.1"
5
5
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.add_dependency 'nokogiri', '~> 1.5'
18
18
  gem.add_dependency 'rash', '0.3.2'
19
19
  gem.add_dependency 'open_uri_redirections', '~> 0.1.0'
20
+ gem.add_dependency 'addressable', '~> 2.3.2'
20
21
 
21
22
  gem.add_development_dependency 'rspec', '2.12.0'
22
23
  gem.add_development_dependency 'fakeweb', '1.3.0'
@@ -23,6 +23,7 @@ Cache-control: private
23
23
  <a href="/faqs#camión">FAQs camión</a>
24
24
  <a href="/search?q=camión">Search camión</a>
25
25
  <a href="/search?q=españa#top">Search España at top</a>
26
+ <a href="/index.php?q=españa&url=aHR0zZQ%3D%3D&amp;cntnt01pageid=21">A link with an encoded param, %3D should not be double-escaped</a>
26
27
 
27
28
  <h1>External links:</h1>
28
29
  <a href="http://example.com/españa.asp">España</a>
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
5
5
  describe MetaInspector do
6
6
  describe 'Initialization' do
7
7
  it 'should accept an URL with a scheme' do
8
- MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
8
+ MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
9
9
  end
10
10
 
11
11
  it "should use http:// as a default scheme" do
@@ -114,7 +114,7 @@ describe MetaInspector do
114
114
  end
115
115
 
116
116
  describe 'Doing a basic scrape from passed url html' do
117
-
117
+
118
118
  before(:each) do
119
119
  @m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
120
120
  end
@@ -147,7 +147,7 @@ describe MetaInspector do
147
147
  "http://pagerankalert.com/users/sign_up",
148
148
  "http://pagerankalert.com/users/sign_in",
149
149
  "mailto:pagerankalert@gmail.com",
150
- "http://pagerankalert.posterous.com",
150
+ "http://pagerankalert.posterous.com/",
151
151
  "http://twitter.com/pagerankalert",
152
152
  "http://twitter.com/share" ]
153
153
  end
@@ -161,7 +161,7 @@ describe MetaInspector do
161
161
 
162
162
  it "should get correct absolute links for external pages" do
163
163
  @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
164
- "http://pagerankalert.posterous.com",
164
+ "http://pagerankalert.posterous.com/",
165
165
  "http://twitter.com/pagerankalert",
166
166
  "http://twitter.com/share" ]
167
167
  end
@@ -178,13 +178,14 @@ describe MetaInspector do
178
178
  end
179
179
 
180
180
  describe "links with international characters" do
181
- it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
181
+ it "should get correct absolute links, encoding the URLs as needed" do
182
182
  m = MetaInspector.new('http://international.com')
183
183
  m.links.should == [ "http://international.com/espa%C3%B1a.asp",
184
184
  "http://international.com/roman%C3%A9e",
185
185
  "http://international.com/faqs#cami%C3%B3n",
186
186
  "http://international.com/search?q=cami%C3%B3n",
187
187
  "http://international.com/search?q=espa%C3%B1a#top",
188
+ "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
188
189
  "http://example.com/espa%C3%B1a.asp",
189
190
  "http://example.com/roman%C3%A9e",
190
191
  "http://example.com/faqs#cami%C3%B3n",
@@ -199,7 +200,8 @@ describe MetaInspector do
199
200
  "http://international.com/roman%C3%A9e",
200
201
  "http://international.com/faqs#cami%C3%B3n",
201
202
  "http://international.com/search?q=cami%C3%B3n",
202
- "http://international.com/search?q=espa%C3%B1a#top"]
203
+ "http://international.com/search?q=espa%C3%B1a#top",
204
+ "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
203
205
  end
204
206
 
205
207
  it "should not crash when processing malformed hrefs" do
@@ -225,8 +227,7 @@ describe MetaInspector do
225
227
  m = MetaInspector.new('http://example.com/malformed_href')
226
228
  expect {
227
229
  m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
228
- "javascript:alert('ok');", "javascript://",
229
- "mailto:email(at)example.com"]
230
+ "javascript:alert('ok');", "mailto:email(at)example.com"]
230
231
  m.should_not be_ok
231
232
  }.to_not raise_error
232
233
  end
@@ -257,8 +258,7 @@ describe MetaInspector do
257
258
 
258
259
  it "should get the links" do
259
260
  @m.links.sort.should == [
260
- "FTP://FTP.CDROM.COM",
261
- "ftp://ftp.cdrom.com",
261
+ "ftp://ftp.cdrom.com/",
262
262
  "javascript:alert('hey');",
263
263
  "mailto:user@example.com",
264
264
  "skype:joeuser?call",
@@ -275,12 +275,12 @@ describe MetaInspector do
275
275
 
276
276
  it "should convert protocol-relative links to http" do
277
277
  @m_http.links.should include('http://protocol-relative.com/contact')
278
- @m_http.links.should include('http://yahoo.com')
278
+ @m_http.links.should include('http://yahoo.com/')
279
279
  end
280
280
 
281
281
  it "should convert protocol-relative links to https" do
282
282
  @m_https.links.should include('https://protocol-relative.com/contact')
283
- @m_https.links.should include('https://yahoo.com')
283
+ @m_https.links.should include('https://yahoo.com/')
284
284
  end
285
285
  end
286
286
 
@@ -355,7 +355,40 @@ describe MetaInspector do
355
355
  describe 'to_hash' do
356
356
  it "should return a hash with all the values set" do
357
357
  @m = MetaInspector.new('http://pagerankalert.com')
358
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
358
+ @m.to_hash.should == {
359
+ "url" =>"http://pagerankalert.com/",
360
+ "title" =>"PageRankAlert.com :: Track your PageRank changes",
361
+ "links" => ["http://pagerankalert.com/",
362
+ "http://pagerankalert.com/es?language=es",
363
+ "http://pagerankalert.com/users/sign_up",
364
+ "http://pagerankalert.com/users/sign_in",
365
+ "mailto:pagerankalert@gmail.com",
366
+ "http://pagerankalert.posterous.com/",
367
+ "http://twitter.com/pagerankalert",
368
+ "http://twitter.com/share"],
369
+ "internal_links" => ["http://pagerankalert.com/",
370
+ "http://pagerankalert.com/es?language=es",
371
+ "http://pagerankalert.com/users/sign_up",
372
+ "http://pagerankalert.com/users/sign_in"],
373
+ "external_links" => ["mailto:pagerankalert@gmail.com",
374
+ "http://pagerankalert.posterous.com/",
375
+ "http://twitter.com/pagerankalert",
376
+ "http://twitter.com/share"],
377
+ "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
378
+ "charset" => "utf-8",
379
+ "feed" => "http://feeds.feedburner.com/PageRankAlert",
380
+ "content_type" =>"text/html",
381
+ "meta" => {
382
+ "name" => {
383
+ "description"=> "Track your PageRank(TM) changes and receive alerts by email",
384
+ "keywords" => "pagerank, seo, optimization, google",
385
+ "robots" => "all,follow",
386
+ "csrf_param" => "authenticity_token",
387
+ "csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
388
+ },
389
+ "property"=>{}
390
+ }
391
+ }
359
392
  end
360
393
  end
361
394
 
@@ -9,7 +9,7 @@ describe MetaInspector do
9
9
  m = MetaInspector.new("http://facebook.com")
10
10
  m.title.should be_nil
11
11
  m.should_not be_ok
12
- m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
12
+ m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
13
13
  end
14
14
 
15
15
  it "allows safe redirections when :allow_redirections => :safe" do
@@ -30,14 +30,14 @@ describe MetaInspector do
30
30
  m = MetaInspector.new("https://unsafe-facebook.com")
31
31
  m.title.should be_nil
32
32
  m.should_not be_ok
33
- m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
33
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
34
34
  end
35
35
 
36
36
  it "disallows unsafe redirections when :allow_redirections => :safe" do
37
37
  m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
38
38
  m.title.should be_nil
39
39
  m.should_not be_ok
40
- m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
40
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
41
41
  end
42
42
 
43
43
  it "allows unsafe redirections when :allow_redirections => :all" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 43
4
+ hash: 41
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 15
9
- - 0
10
- version: 1.15.0
9
+ - 1
10
+ version: 1.15.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2013-01-19 00:00:00 Z
18
+ date: 2013-02-06 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -65,9 +65,25 @@ dependencies:
65
65
  type: :runtime
66
66
  version_requirements: *id003
67
67
  - !ruby/object:Gem::Dependency
68
- name: rspec
68
+ name: addressable
69
69
  prerelease: false
70
70
  requirement: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ hash: 7
76
+ segments:
77
+ - 2
78
+ - 3
79
+ - 2
80
+ version: 2.3.2
81
+ type: :runtime
82
+ version_requirements: *id004
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ prerelease: false
86
+ requirement: &id005 !ruby/object:Gem::Requirement
71
87
  none: false
72
88
  requirements:
73
89
  - - "="
@@ -79,11 +95,11 @@ dependencies:
79
95
  - 0
80
96
  version: 2.12.0
81
97
  type: :development
82
- version_requirements: *id004
98
+ version_requirements: *id005
83
99
  - !ruby/object:Gem::Dependency
84
100
  name: fakeweb
85
101
  prerelease: false
86
- requirement: &id005 !ruby/object:Gem::Requirement
102
+ requirement: &id006 !ruby/object:Gem::Requirement
87
103
  none: false
88
104
  requirements:
89
105
  - - "="
@@ -95,11 +111,11 @@ dependencies:
95
111
  - 0
96
112
  version: 1.3.0
97
113
  type: :development
98
- version_requirements: *id005
114
+ version_requirements: *id006
99
115
  - !ruby/object:Gem::Dependency
100
116
  name: awesome_print
101
117
  prerelease: false
102
- requirement: &id006 !ruby/object:Gem::Requirement
118
+ requirement: &id007 !ruby/object:Gem::Requirement
103
119
  none: false
104
120
  requirements:
105
121
  - - "="
@@ -111,11 +127,11 @@ dependencies:
111
127
  - 0
112
128
  version: 1.1.0
113
129
  type: :development
114
- version_requirements: *id006
130
+ version_requirements: *id007
115
131
  - !ruby/object:Gem::Dependency
116
132
  name: rake
117
133
  prerelease: false
118
- requirement: &id007 !ruby/object:Gem::Requirement
134
+ requirement: &id008 !ruby/object:Gem::Requirement
119
135
  none: false
120
136
  requirements:
121
137
  - - ~>
@@ -127,7 +143,7 @@ dependencies:
127
143
  - 3
128
144
  version: 10.0.3
129
145
  type: :development
130
- version_requirements: *id007
146
+ version_requirements: *id008
131
147
  description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
132
148
  email:
133
149
  - jaimeiniesta@gmail.com