metainspector 1.15.0 → 1.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'open_uri_redirections'
5
+ require 'addressable/uri'
5
6
  require 'nokogiri'
6
7
  require 'hashie/rash'
7
8
  require 'timeout'
@@ -22,7 +23,7 @@ module MetaInspector
22
23
  def initialize(url, options = {})
23
24
  options = defaults.merge(options)
24
25
 
25
- @url = with_default_scheme(encode_url(url))
26
+ @url = with_default_scheme(normalize_url(url))
26
27
  @scheme = URI.parse(@url).scheme
27
28
  @host = URI.parse(@url).host
28
29
  @root_url = "#{@scheme}://#{@host}/"
@@ -49,7 +50,7 @@ module MetaInspector
49
50
 
50
51
  # Links found on the page, as absolute URLs
51
52
  def links
52
- @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
53
+ @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
53
54
  end
54
55
 
55
56
  # Internal links found on the page, as absolute URLs
@@ -217,9 +218,9 @@ module MetaInspector
217
218
  @errors << error
218
219
  end
219
220
 
220
- # Encode url to deal with international characters
221
- def encode_url(url)
222
- URI.encode(url).to_s.gsub("%23", "#")
221
+ # Normalize url to deal with characters that should be encoded, add a trailing slash, convert to lowercase...
222
+ def normalize_url(url)
223
+ Addressable::URI.parse(url).normalize.to_s
223
224
  end
224
225
 
225
226
  # Adds 'http' as default scheme, if there is none
@@ -231,11 +232,11 @@ module MetaInspector
231
232
  # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
232
233
  def absolutify_url(url)
233
234
  if url =~ /^\w*\:/i
234
- encode_url(url)
235
+ normalize_url(url)
235
236
  else
236
- URI.parse(root_url).merge(encode_url(url)).to_s
237
+ URI.parse(root_url).merge(normalize_url(url)).to_s
237
238
  end
238
- rescue URI::InvalidURIError => e
239
+ rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
239
240
  add_fatal_error "Link parsing exception: #{e.message}" and nil
240
241
  end
241
242
 
@@ -247,7 +248,7 @@ module MetaInspector
247
248
  # Extracts the host from a given URL
248
249
  def host_from_url(url)
249
250
  URI.parse(url).host
250
- rescue URI::InvalidURIError, URI::InvalidComponentError => e
251
+ rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
251
252
  add_fatal_error "Link parsing exception: #{e.message}" and nil
252
253
  end
253
254
 
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.15.0"
4
+ VERSION = "1.15.1"
5
5
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.add_dependency 'nokogiri', '~> 1.5'
18
18
  gem.add_dependency 'rash', '0.3.2'
19
19
  gem.add_dependency 'open_uri_redirections', '~> 0.1.0'
20
+ gem.add_dependency 'addressable', '~> 2.3.2'
20
21
 
21
22
  gem.add_development_dependency 'rspec', '2.12.0'
22
23
  gem.add_development_dependency 'fakeweb', '1.3.0'
@@ -23,6 +23,7 @@ Cache-control: private
23
23
  <a href="/faqs#camión">FAQs camión</a>
24
24
  <a href="/search?q=camión">Search camión</a>
25
25
  <a href="/search?q=españa#top">Search España at top</a>
26
+ <a href="/index.php?q=españa&url=aHR0zZQ%3D%3D&amp;cntnt01pageid=21">A link with an encoded param, %3D should not be double-escaped</a>
26
27
 
27
28
  <h1>External links:</h1>
28
29
  <a href="http://example.com/españa.asp">España</a>
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
5
5
  describe MetaInspector do
6
6
  describe 'Initialization' do
7
7
  it 'should accept an URL with a scheme' do
8
- MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
8
+ MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
9
9
  end
10
10
 
11
11
  it "should use http:// as a default scheme" do
@@ -114,7 +114,7 @@ describe MetaInspector do
114
114
  end
115
115
 
116
116
  describe 'Doing a basic scrape from passed url html' do
117
-
117
+
118
118
  before(:each) do
119
119
  @m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
120
120
  end
@@ -147,7 +147,7 @@ describe MetaInspector do
147
147
  "http://pagerankalert.com/users/sign_up",
148
148
  "http://pagerankalert.com/users/sign_in",
149
149
  "mailto:pagerankalert@gmail.com",
150
- "http://pagerankalert.posterous.com",
150
+ "http://pagerankalert.posterous.com/",
151
151
  "http://twitter.com/pagerankalert",
152
152
  "http://twitter.com/share" ]
153
153
  end
@@ -161,7 +161,7 @@ describe MetaInspector do
161
161
 
162
162
  it "should get correct absolute links for external pages" do
163
163
  @m.external_links.should == [ "mailto:pagerankalert@gmail.com",
164
- "http://pagerankalert.posterous.com",
164
+ "http://pagerankalert.posterous.com/",
165
165
  "http://twitter.com/pagerankalert",
166
166
  "http://twitter.com/share" ]
167
167
  end
@@ -178,13 +178,14 @@ describe MetaInspector do
178
178
  end
179
179
 
180
180
  describe "links with international characters" do
181
- it "should get correct absolute links, encoding the URLs as needed but respecting # and ?" do
181
+ it "should get correct absolute links, encoding the URLs as needed" do
182
182
  m = MetaInspector.new('http://international.com')
183
183
  m.links.should == [ "http://international.com/espa%C3%B1a.asp",
184
184
  "http://international.com/roman%C3%A9e",
185
185
  "http://international.com/faqs#cami%C3%B3n",
186
186
  "http://international.com/search?q=cami%C3%B3n",
187
187
  "http://international.com/search?q=espa%C3%B1a#top",
188
+ "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
188
189
  "http://example.com/espa%C3%B1a.asp",
189
190
  "http://example.com/roman%C3%A9e",
190
191
  "http://example.com/faqs#cami%C3%B3n",
@@ -199,7 +200,8 @@ describe MetaInspector do
199
200
  "http://international.com/roman%C3%A9e",
200
201
  "http://international.com/faqs#cami%C3%B3n",
201
202
  "http://international.com/search?q=cami%C3%B3n",
202
- "http://international.com/search?q=espa%C3%B1a#top"]
203
+ "http://international.com/search?q=espa%C3%B1a#top",
204
+ "http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
203
205
  end
204
206
 
205
207
  it "should not crash when processing malformed hrefs" do
@@ -225,8 +227,7 @@ describe MetaInspector do
225
227
  m = MetaInspector.new('http://example.com/malformed_href')
226
228
  expect {
227
229
  m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
228
- "javascript:alert('ok');", "javascript://",
229
- "mailto:email(at)example.com"]
230
+ "javascript:alert('ok');", "mailto:email(at)example.com"]
230
231
  m.should_not be_ok
231
232
  }.to_not raise_error
232
233
  end
@@ -257,8 +258,7 @@ describe MetaInspector do
257
258
 
258
259
  it "should get the links" do
259
260
  @m.links.sort.should == [
260
- "FTP://FTP.CDROM.COM",
261
- "ftp://ftp.cdrom.com",
261
+ "ftp://ftp.cdrom.com/",
262
262
  "javascript:alert('hey');",
263
263
  "mailto:user@example.com",
264
264
  "skype:joeuser?call",
@@ -275,12 +275,12 @@ describe MetaInspector do
275
275
 
276
276
  it "should convert protocol-relative links to http" do
277
277
  @m_http.links.should include('http://protocol-relative.com/contact')
278
- @m_http.links.should include('http://yahoo.com')
278
+ @m_http.links.should include('http://yahoo.com/')
279
279
  end
280
280
 
281
281
  it "should convert protocol-relative links to https" do
282
282
  @m_https.links.should include('https://protocol-relative.com/contact')
283
- @m_https.links.should include('https://yahoo.com')
283
+ @m_https.links.should include('https://yahoo.com/')
284
284
  end
285
285
  end
286
286
 
@@ -355,7 +355,40 @@ describe MetaInspector do
355
355
  describe 'to_hash' do
356
356
  it "should return a hash with all the values set" do
357
357
  @m = MetaInspector.new('http://pagerankalert.com')
358
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "internal_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], "external_links" => ["mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "content_type" => "text/html"}
358
+ @m.to_hash.should == {
359
+ "url" =>"http://pagerankalert.com/",
360
+ "title" =>"PageRankAlert.com :: Track your PageRank changes",
361
+ "links" => ["http://pagerankalert.com/",
362
+ "http://pagerankalert.com/es?language=es",
363
+ "http://pagerankalert.com/users/sign_up",
364
+ "http://pagerankalert.com/users/sign_in",
365
+ "mailto:pagerankalert@gmail.com",
366
+ "http://pagerankalert.posterous.com/",
367
+ "http://twitter.com/pagerankalert",
368
+ "http://twitter.com/share"],
369
+ "internal_links" => ["http://pagerankalert.com/",
370
+ "http://pagerankalert.com/es?language=es",
371
+ "http://pagerankalert.com/users/sign_up",
372
+ "http://pagerankalert.com/users/sign_in"],
373
+ "external_links" => ["mailto:pagerankalert@gmail.com",
374
+ "http://pagerankalert.posterous.com/",
375
+ "http://twitter.com/pagerankalert",
376
+ "http://twitter.com/share"],
377
+ "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
378
+ "charset" => "utf-8",
379
+ "feed" => "http://feeds.feedburner.com/PageRankAlert",
380
+ "content_type" =>"text/html",
381
+ "meta" => {
382
+ "name" => {
383
+ "description"=> "Track your PageRank(TM) changes and receive alerts by email",
384
+ "keywords" => "pagerank, seo, optimization, google",
385
+ "robots" => "all,follow",
386
+ "csrf_param" => "authenticity_token",
387
+ "csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
388
+ },
389
+ "property"=>{}
390
+ }
391
+ }
359
392
  end
360
393
  end
361
394
 
@@ -9,7 +9,7 @@ describe MetaInspector do
9
9
  m = MetaInspector.new("http://facebook.com")
10
10
  m.title.should be_nil
11
11
  m.should_not be_ok
12
- m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
12
+ m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
13
13
  end
14
14
 
15
15
  it "allows safe redirections when :allow_redirections => :safe" do
@@ -30,14 +30,14 @@ describe MetaInspector do
30
30
  m = MetaInspector.new("https://unsafe-facebook.com")
31
31
  m.title.should be_nil
32
32
  m.should_not be_ok
33
- m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
33
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
34
34
  end
35
35
 
36
36
  it "disallows unsafe redirections when :allow_redirections => :safe" do
37
37
  m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
38
38
  m.title.should be_nil
39
39
  m.should_not be_ok
40
- m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
40
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
41
41
  end
42
42
 
43
43
  it "allows unsafe redirections when :allow_redirections => :all" do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 43
4
+ hash: 41
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 15
9
- - 0
10
- version: 1.15.0
9
+ - 1
10
+ version: 1.15.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2013-01-19 00:00:00 Z
18
+ date: 2013-02-06 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: nokogiri
@@ -65,9 +65,25 @@ dependencies:
65
65
  type: :runtime
66
66
  version_requirements: *id003
67
67
  - !ruby/object:Gem::Dependency
68
- name: rspec
68
+ name: addressable
69
69
  prerelease: false
70
70
  requirement: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ hash: 7
76
+ segments:
77
+ - 2
78
+ - 3
79
+ - 2
80
+ version: 2.3.2
81
+ type: :runtime
82
+ version_requirements: *id004
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec
85
+ prerelease: false
86
+ requirement: &id005 !ruby/object:Gem::Requirement
71
87
  none: false
72
88
  requirements:
73
89
  - - "="
@@ -79,11 +95,11 @@ dependencies:
79
95
  - 0
80
96
  version: 2.12.0
81
97
  type: :development
82
- version_requirements: *id004
98
+ version_requirements: *id005
83
99
  - !ruby/object:Gem::Dependency
84
100
  name: fakeweb
85
101
  prerelease: false
86
- requirement: &id005 !ruby/object:Gem::Requirement
102
+ requirement: &id006 !ruby/object:Gem::Requirement
87
103
  none: false
88
104
  requirements:
89
105
  - - "="
@@ -95,11 +111,11 @@ dependencies:
95
111
  - 0
96
112
  version: 1.3.0
97
113
  type: :development
98
- version_requirements: *id005
114
+ version_requirements: *id006
99
115
  - !ruby/object:Gem::Dependency
100
116
  name: awesome_print
101
117
  prerelease: false
102
- requirement: &id006 !ruby/object:Gem::Requirement
118
+ requirement: &id007 !ruby/object:Gem::Requirement
103
119
  none: false
104
120
  requirements:
105
121
  - - "="
@@ -111,11 +127,11 @@ dependencies:
111
127
  - 0
112
128
  version: 1.1.0
113
129
  type: :development
114
- version_requirements: *id006
130
+ version_requirements: *id007
115
131
  - !ruby/object:Gem::Dependency
116
132
  name: rake
117
133
  prerelease: false
118
- requirement: &id007 !ruby/object:Gem::Requirement
134
+ requirement: &id008 !ruby/object:Gem::Requirement
119
135
  none: false
120
136
  requirements:
121
137
  - - ~>
@@ -127,7 +143,7 @@ dependencies:
127
143
  - 3
128
144
  version: 10.0.3
129
145
  type: :development
130
- version_requirements: *id007
146
+ version_requirements: *id008
131
147
  description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
132
148
  email:
133
149
  - jaimeiniesta@gmail.com