metainspector 1.15.0 → 1.15.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/meta_inspector/scraper.rb +10 -9
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- data/spec/fixtures/international.response +1 -0
- data/spec/metainspector_spec.rb +46 -13
- data/spec/redirections_spec.rb +3 -3
- metadata +28 -12
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'open_uri_redirections'
|
5
|
+
require 'addressable/uri'
|
5
6
|
require 'nokogiri'
|
6
7
|
require 'hashie/rash'
|
7
8
|
require 'timeout'
|
@@ -22,7 +23,7 @@ module MetaInspector
|
|
22
23
|
def initialize(url, options = {})
|
23
24
|
options = defaults.merge(options)
|
24
25
|
|
25
|
-
@url = with_default_scheme(
|
26
|
+
@url = with_default_scheme(normalize_url(url))
|
26
27
|
@scheme = URI.parse(@url).scheme
|
27
28
|
@host = URI.parse(@url).host
|
28
29
|
@root_url = "#{@scheme}://#{@host}/"
|
@@ -49,7 +50,7 @@ module MetaInspector
|
|
49
50
|
|
50
51
|
# Links found on the page, as absolute URLs
|
51
52
|
def links
|
52
|
-
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
|
53
|
+
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
|
53
54
|
end
|
54
55
|
|
55
56
|
# Internal links found on the page, as absolute URLs
|
@@ -217,9 +218,9 @@ module MetaInspector
|
|
217
218
|
@errors << error
|
218
219
|
end
|
219
220
|
|
220
|
-
#
|
221
|
-
def
|
222
|
-
URI.
|
221
|
+
# Normalize url to deal with characters that should be encoded, add trailing slash, convert to downcase...
|
222
|
+
def normalize_url(url)
|
223
|
+
Addressable::URI.parse(url).normalize.to_s
|
223
224
|
end
|
224
225
|
|
225
226
|
# Adds 'http' as default scheme, if there is none
|
@@ -231,11 +232,11 @@ module MetaInspector
|
|
231
232
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
232
233
|
def absolutify_url(url)
|
233
234
|
if url =~ /^\w*\:/i
|
234
|
-
|
235
|
+
normalize_url(url)
|
235
236
|
else
|
236
|
-
URI.parse(root_url).merge(
|
237
|
+
URI.parse(root_url).merge(normalize_url(url)).to_s
|
237
238
|
end
|
238
|
-
rescue URI::InvalidURIError => e
|
239
|
+
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
|
239
240
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
240
241
|
end
|
241
242
|
|
@@ -247,7 +248,7 @@ module MetaInspector
|
|
247
248
|
# Extracts the host from a given URL
|
248
249
|
def host_from_url(url)
|
249
250
|
URI.parse(url).host
|
250
|
-
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
251
|
+
rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
|
251
252
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
252
253
|
end
|
253
254
|
|
data/meta_inspector.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
18
|
gem.add_dependency 'rash', '0.3.2'
|
19
19
|
gem.add_dependency 'open_uri_redirections', '~> 0.1.0'
|
20
|
+
gem.add_dependency 'addressable', '~> 2.3.2'
|
20
21
|
|
21
22
|
gem.add_development_dependency 'rspec', '2.12.0'
|
22
23
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
@@ -23,6 +23,7 @@ Cache-control: private
|
|
23
23
|
<a href="/faqs#camión">FAQs camión</a>
|
24
24
|
<a href="/search?q=camión">Search camión</a>
|
25
25
|
<a href="/search?q=españa#top">Search España at top</a>
|
26
|
+
<a href="/index.php?q=españa&url=aHR0zZQ%3D%3D&cntnt01pageid=21">A link with an encoded param, %3D should not be double-escaped</a>
|
26
27
|
|
27
28
|
<h1>External links:</h1>
|
28
29
|
<a href="http://example.com/españa.asp">España</a>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
|
|
5
5
|
describe MetaInspector do
|
6
6
|
describe 'Initialization' do
|
7
7
|
it 'should accept an URL with a scheme' do
|
8
|
-
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
|
8
|
+
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
|
9
9
|
end
|
10
10
|
|
11
11
|
it "should use http:// as a default scheme" do
|
@@ -114,7 +114,7 @@ describe MetaInspector do
|
|
114
114
|
end
|
115
115
|
|
116
116
|
describe 'Doing a basic scrape from passed url html' do
|
117
|
-
|
117
|
+
|
118
118
|
before(:each) do
|
119
119
|
@m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
120
120
|
end
|
@@ -147,7 +147,7 @@ describe MetaInspector do
|
|
147
147
|
"http://pagerankalert.com/users/sign_up",
|
148
148
|
"http://pagerankalert.com/users/sign_in",
|
149
149
|
"mailto:pagerankalert@gmail.com",
|
150
|
-
"http://pagerankalert.posterous.com",
|
150
|
+
"http://pagerankalert.posterous.com/",
|
151
151
|
"http://twitter.com/pagerankalert",
|
152
152
|
"http://twitter.com/share" ]
|
153
153
|
end
|
@@ -161,7 +161,7 @@ describe MetaInspector do
|
|
161
161
|
|
162
162
|
it "should get correct absolute links for external pages" do
|
163
163
|
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
164
|
-
"http://pagerankalert.posterous.com",
|
164
|
+
"http://pagerankalert.posterous.com/",
|
165
165
|
"http://twitter.com/pagerankalert",
|
166
166
|
"http://twitter.com/share" ]
|
167
167
|
end
|
@@ -178,13 +178,14 @@ describe MetaInspector do
|
|
178
178
|
end
|
179
179
|
|
180
180
|
describe "links with international characters" do
|
181
|
-
it "should get correct absolute links, encoding the URLs as needed
|
181
|
+
it "should get correct absolute links, encoding the URLs as needed" do
|
182
182
|
m = MetaInspector.new('http://international.com')
|
183
183
|
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
184
184
|
"http://international.com/roman%C3%A9e",
|
185
185
|
"http://international.com/faqs#cami%C3%B3n",
|
186
186
|
"http://international.com/search?q=cami%C3%B3n",
|
187
187
|
"http://international.com/search?q=espa%C3%B1a#top",
|
188
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
|
188
189
|
"http://example.com/espa%C3%B1a.asp",
|
189
190
|
"http://example.com/roman%C3%A9e",
|
190
191
|
"http://example.com/faqs#cami%C3%B3n",
|
@@ -199,7 +200,8 @@ describe MetaInspector do
|
|
199
200
|
"http://international.com/roman%C3%A9e",
|
200
201
|
"http://international.com/faqs#cami%C3%B3n",
|
201
202
|
"http://international.com/search?q=cami%C3%B3n",
|
202
|
-
"http://international.com/search?q=espa%C3%B1a#top"
|
203
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
204
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
203
205
|
end
|
204
206
|
|
205
207
|
it "should not crash when processing malformed hrefs" do
|
@@ -225,8 +227,7 @@ describe MetaInspector do
|
|
225
227
|
m = MetaInspector.new('http://example.com/malformed_href')
|
226
228
|
expect {
|
227
229
|
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
|
228
|
-
"javascript:alert('ok');", "
|
229
|
-
"mailto:email(at)example.com"]
|
230
|
+
"javascript:alert('ok');", "mailto:email(at)example.com"]
|
230
231
|
m.should_not be_ok
|
231
232
|
}.to_not raise_error
|
232
233
|
end
|
@@ -257,8 +258,7 @@ describe MetaInspector do
|
|
257
258
|
|
258
259
|
it "should get the links" do
|
259
260
|
@m.links.sort.should == [
|
260
|
-
"
|
261
|
-
"ftp://ftp.cdrom.com",
|
261
|
+
"ftp://ftp.cdrom.com/",
|
262
262
|
"javascript:alert('hey');",
|
263
263
|
"mailto:user@example.com",
|
264
264
|
"skype:joeuser?call",
|
@@ -275,12 +275,12 @@ describe MetaInspector do
|
|
275
275
|
|
276
276
|
it "should convert protocol-relative links to http" do
|
277
277
|
@m_http.links.should include('http://protocol-relative.com/contact')
|
278
|
-
@m_http.links.should include('http://yahoo.com')
|
278
|
+
@m_http.links.should include('http://yahoo.com/')
|
279
279
|
end
|
280
280
|
|
281
281
|
it "should convert protocol-relative links to https" do
|
282
282
|
@m_https.links.should include('https://protocol-relative.com/contact')
|
283
|
-
@m_https.links.should include('https://yahoo.com')
|
283
|
+
@m_https.links.should include('https://yahoo.com/')
|
284
284
|
end
|
285
285
|
end
|
286
286
|
|
@@ -355,7 +355,40 @@ describe MetaInspector do
|
|
355
355
|
describe 'to_hash' do
|
356
356
|
it "should return a hash with all the values set" do
|
357
357
|
@m = MetaInspector.new('http://pagerankalert.com')
|
358
|
-
@m.to_hash.should == {
|
358
|
+
@m.to_hash.should == {
|
359
|
+
"url" =>"http://pagerankalert.com/",
|
360
|
+
"title" =>"PageRankAlert.com :: Track your PageRank changes",
|
361
|
+
"links" => ["http://pagerankalert.com/",
|
362
|
+
"http://pagerankalert.com/es?language=es",
|
363
|
+
"http://pagerankalert.com/users/sign_up",
|
364
|
+
"http://pagerankalert.com/users/sign_in",
|
365
|
+
"mailto:pagerankalert@gmail.com",
|
366
|
+
"http://pagerankalert.posterous.com/",
|
367
|
+
"http://twitter.com/pagerankalert",
|
368
|
+
"http://twitter.com/share"],
|
369
|
+
"internal_links" => ["http://pagerankalert.com/",
|
370
|
+
"http://pagerankalert.com/es?language=es",
|
371
|
+
"http://pagerankalert.com/users/sign_up",
|
372
|
+
"http://pagerankalert.com/users/sign_in"],
|
373
|
+
"external_links" => ["mailto:pagerankalert@gmail.com",
|
374
|
+
"http://pagerankalert.posterous.com/",
|
375
|
+
"http://twitter.com/pagerankalert",
|
376
|
+
"http://twitter.com/share"],
|
377
|
+
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
378
|
+
"charset" => "utf-8",
|
379
|
+
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
380
|
+
"content_type" =>"text/html",
|
381
|
+
"meta" => {
|
382
|
+
"name" => {
|
383
|
+
"description"=> "Track your PageRank(TM) changes and receive alerts by email",
|
384
|
+
"keywords" => "pagerank, seo, optimization, google",
|
385
|
+
"robots" => "all,follow",
|
386
|
+
"csrf_param" => "authenticity_token",
|
387
|
+
"csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
|
388
|
+
},
|
389
|
+
"property"=>{}
|
390
|
+
}
|
391
|
+
}
|
359
392
|
end
|
360
393
|
end
|
361
394
|
|
data/spec/redirections_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe MetaInspector do
|
|
9
9
|
m = MetaInspector.new("http://facebook.com")
|
10
10
|
m.title.should be_nil
|
11
11
|
m.should_not be_ok
|
12
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
|
12
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
|
13
13
|
end
|
14
14
|
|
15
15
|
it "allows safe redirections when :allow_redirections => :safe" do
|
@@ -30,14 +30,14 @@ describe MetaInspector do
|
|
30
30
|
m = MetaInspector.new("https://unsafe-facebook.com")
|
31
31
|
m.title.should be_nil
|
32
32
|
m.should_not be_ok
|
33
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
33
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
|
34
34
|
end
|
35
35
|
|
36
36
|
it "disallows unsafe redirections when :allow_redirections => :safe" do
|
37
37
|
m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
|
38
38
|
m.title.should be_nil
|
39
39
|
m.should_not be_ok
|
40
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
40
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
|
41
41
|
end
|
42
42
|
|
43
43
|
it "allows unsafe redirections when :allow_redirections => :all" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 41
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 15
|
9
|
-
-
|
10
|
-
version: 1.15.
|
9
|
+
- 1
|
10
|
+
version: 1.15.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2013-
|
18
|
+
date: 2013-02-06 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -65,9 +65,25 @@ dependencies:
|
|
65
65
|
type: :runtime
|
66
66
|
version_requirements: *id003
|
67
67
|
- !ruby/object:Gem::Dependency
|
68
|
-
name:
|
68
|
+
name: addressable
|
69
69
|
prerelease: false
|
70
70
|
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 7
|
76
|
+
segments:
|
77
|
+
- 2
|
78
|
+
- 3
|
79
|
+
- 2
|
80
|
+
version: 2.3.2
|
81
|
+
type: :runtime
|
82
|
+
version_requirements: *id004
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
prerelease: false
|
86
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
71
87
|
none: false
|
72
88
|
requirements:
|
73
89
|
- - "="
|
@@ -79,11 +95,11 @@ dependencies:
|
|
79
95
|
- 0
|
80
96
|
version: 2.12.0
|
81
97
|
type: :development
|
82
|
-
version_requirements: *
|
98
|
+
version_requirements: *id005
|
83
99
|
- !ruby/object:Gem::Dependency
|
84
100
|
name: fakeweb
|
85
101
|
prerelease: false
|
86
|
-
requirement: &
|
102
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
87
103
|
none: false
|
88
104
|
requirements:
|
89
105
|
- - "="
|
@@ -95,11 +111,11 @@ dependencies:
|
|
95
111
|
- 0
|
96
112
|
version: 1.3.0
|
97
113
|
type: :development
|
98
|
-
version_requirements: *
|
114
|
+
version_requirements: *id006
|
99
115
|
- !ruby/object:Gem::Dependency
|
100
116
|
name: awesome_print
|
101
117
|
prerelease: false
|
102
|
-
requirement: &
|
118
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
103
119
|
none: false
|
104
120
|
requirements:
|
105
121
|
- - "="
|
@@ -111,11 +127,11 @@ dependencies:
|
|
111
127
|
- 0
|
112
128
|
version: 1.1.0
|
113
129
|
type: :development
|
114
|
-
version_requirements: *
|
130
|
+
version_requirements: *id007
|
115
131
|
- !ruby/object:Gem::Dependency
|
116
132
|
name: rake
|
117
133
|
prerelease: false
|
118
|
-
requirement: &
|
134
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
119
135
|
none: false
|
120
136
|
requirements:
|
121
137
|
- - ~>
|
@@ -127,7 +143,7 @@ dependencies:
|
|
127
143
|
- 3
|
128
144
|
version: 10.0.3
|
129
145
|
type: :development
|
130
|
-
version_requirements: *
|
146
|
+
version_requirements: *id008
|
131
147
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|
132
148
|
email:
|
133
149
|
- jaimeiniesta@gmail.com
|