metainspector 1.15.0 → 1.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/meta_inspector/scraper.rb +10 -9
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- data/spec/fixtures/international.response +1 -0
- data/spec/metainspector_spec.rb +46 -13
- data/spec/redirections_spec.rb +3 -3
- metadata +28 -12
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'open_uri_redirections'
|
5
|
+
require 'addressable/uri'
|
5
6
|
require 'nokogiri'
|
6
7
|
require 'hashie/rash'
|
7
8
|
require 'timeout'
|
@@ -22,7 +23,7 @@ module MetaInspector
|
|
22
23
|
def initialize(url, options = {})
|
23
24
|
options = defaults.merge(options)
|
24
25
|
|
25
|
-
@url = with_default_scheme(
|
26
|
+
@url = with_default_scheme(normalize_url(url))
|
26
27
|
@scheme = URI.parse(@url).scheme
|
27
28
|
@host = URI.parse(@url).host
|
28
29
|
@root_url = "#{@scheme}://#{@host}/"
|
@@ -49,7 +50,7 @@ module MetaInspector
|
|
49
50
|
|
50
51
|
# Links found on the page, as absolute URLs
|
51
52
|
def links
|
52
|
-
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact
|
53
|
+
@links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
|
53
54
|
end
|
54
55
|
|
55
56
|
# Internal links found on the page, as absolute URLs
|
@@ -217,9 +218,9 @@ module MetaInspector
|
|
217
218
|
@errors << error
|
218
219
|
end
|
219
220
|
|
220
|
-
#
|
221
|
-
def
|
222
|
-
URI.
|
221
|
+
# Normalize url to deal with characters that should be encoded, add trailing slash, convert to downcase...
|
222
|
+
def normalize_url(url)
|
223
|
+
Addressable::URI.parse(url).normalize.to_s
|
223
224
|
end
|
224
225
|
|
225
226
|
# Adds 'http' as default scheme, if there is none
|
@@ -231,11 +232,11 @@ module MetaInspector
|
|
231
232
|
# Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
|
232
233
|
def absolutify_url(url)
|
233
234
|
if url =~ /^\w*\:/i
|
234
|
-
|
235
|
+
normalize_url(url)
|
235
236
|
else
|
236
|
-
URI.parse(root_url).merge(
|
237
|
+
URI.parse(root_url).merge(normalize_url(url)).to_s
|
237
238
|
end
|
238
|
-
rescue URI::InvalidURIError => e
|
239
|
+
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
|
239
240
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
240
241
|
end
|
241
242
|
|
@@ -247,7 +248,7 @@ module MetaInspector
|
|
247
248
|
# Extracts the host from a given URL
|
248
249
|
def host_from_url(url)
|
249
250
|
URI.parse(url).host
|
250
|
-
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
251
|
+
rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
|
251
252
|
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
252
253
|
end
|
253
254
|
|
data/meta_inspector.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
18
|
gem.add_dependency 'rash', '0.3.2'
|
19
19
|
gem.add_dependency 'open_uri_redirections', '~> 0.1.0'
|
20
|
+
gem.add_dependency 'addressable', '~> 2.3.2'
|
20
21
|
|
21
22
|
gem.add_development_dependency 'rspec', '2.12.0'
|
22
23
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
@@ -23,6 +23,7 @@ Cache-control: private
|
|
23
23
|
<a href="/faqs#camión">FAQs camión</a>
|
24
24
|
<a href="/search?q=camión">Search camión</a>
|
25
25
|
<a href="/search?q=españa#top">Search España at top</a>
|
26
|
+
<a href="/index.php?q=españa&url=aHR0zZQ%3D%3D&cntnt01pageid=21">A link with an encoded param, %3D should not be double-escaped</a>
|
26
27
|
|
27
28
|
<h1>External links:</h1>
|
28
29
|
<a href="http://example.com/españa.asp">España</a>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
|
|
5
5
|
describe MetaInspector do
|
6
6
|
describe 'Initialization' do
|
7
7
|
it 'should accept an URL with a scheme' do
|
8
|
-
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
|
8
|
+
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com/'
|
9
9
|
end
|
10
10
|
|
11
11
|
it "should use http:// as a default scheme" do
|
@@ -114,7 +114,7 @@ describe MetaInspector do
|
|
114
114
|
end
|
115
115
|
|
116
116
|
describe 'Doing a basic scrape from passed url html' do
|
117
|
-
|
117
|
+
|
118
118
|
before(:each) do
|
119
119
|
@m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
120
120
|
end
|
@@ -147,7 +147,7 @@ describe MetaInspector do
|
|
147
147
|
"http://pagerankalert.com/users/sign_up",
|
148
148
|
"http://pagerankalert.com/users/sign_in",
|
149
149
|
"mailto:pagerankalert@gmail.com",
|
150
|
-
"http://pagerankalert.posterous.com",
|
150
|
+
"http://pagerankalert.posterous.com/",
|
151
151
|
"http://twitter.com/pagerankalert",
|
152
152
|
"http://twitter.com/share" ]
|
153
153
|
end
|
@@ -161,7 +161,7 @@ describe MetaInspector do
|
|
161
161
|
|
162
162
|
it "should get correct absolute links for external pages" do
|
163
163
|
@m.external_links.should == [ "mailto:pagerankalert@gmail.com",
|
164
|
-
"http://pagerankalert.posterous.com",
|
164
|
+
"http://pagerankalert.posterous.com/",
|
165
165
|
"http://twitter.com/pagerankalert",
|
166
166
|
"http://twitter.com/share" ]
|
167
167
|
end
|
@@ -178,13 +178,14 @@ describe MetaInspector do
|
|
178
178
|
end
|
179
179
|
|
180
180
|
describe "links with international characters" do
|
181
|
-
it "should get correct absolute links, encoding the URLs as needed
|
181
|
+
it "should get correct absolute links, encoding the URLs as needed" do
|
182
182
|
m = MetaInspector.new('http://international.com')
|
183
183
|
m.links.should == [ "http://international.com/espa%C3%B1a.asp",
|
184
184
|
"http://international.com/roman%C3%A9e",
|
185
185
|
"http://international.com/faqs#cami%C3%B3n",
|
186
186
|
"http://international.com/search?q=cami%C3%B3n",
|
187
187
|
"http://international.com/search?q=espa%C3%B1a#top",
|
188
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21",
|
188
189
|
"http://example.com/espa%C3%B1a.asp",
|
189
190
|
"http://example.com/roman%C3%A9e",
|
190
191
|
"http://example.com/faqs#cami%C3%B3n",
|
@@ -199,7 +200,8 @@ describe MetaInspector do
|
|
199
200
|
"http://international.com/roman%C3%A9e",
|
200
201
|
"http://international.com/faqs#cami%C3%B3n",
|
201
202
|
"http://international.com/search?q=cami%C3%B3n",
|
202
|
-
"http://international.com/search?q=espa%C3%B1a#top"
|
203
|
+
"http://international.com/search?q=espa%C3%B1a#top",
|
204
|
+
"http://international.com/index.php?q=espa%C3%B1a&url=aHR0zZQ==&cntnt01pageid=21"]
|
203
205
|
end
|
204
206
|
|
205
207
|
it "should not crash when processing malformed hrefs" do
|
@@ -225,8 +227,7 @@ describe MetaInspector do
|
|
225
227
|
m = MetaInspector.new('http://example.com/malformed_href')
|
226
228
|
expect {
|
227
229
|
m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
|
228
|
-
"javascript:alert('ok');", "
|
229
|
-
"mailto:email(at)example.com"]
|
230
|
+
"javascript:alert('ok');", "mailto:email(at)example.com"]
|
230
231
|
m.should_not be_ok
|
231
232
|
}.to_not raise_error
|
232
233
|
end
|
@@ -257,8 +258,7 @@ describe MetaInspector do
|
|
257
258
|
|
258
259
|
it "should get the links" do
|
259
260
|
@m.links.sort.should == [
|
260
|
-
"
|
261
|
-
"ftp://ftp.cdrom.com",
|
261
|
+
"ftp://ftp.cdrom.com/",
|
262
262
|
"javascript:alert('hey');",
|
263
263
|
"mailto:user@example.com",
|
264
264
|
"skype:joeuser?call",
|
@@ -275,12 +275,12 @@ describe MetaInspector do
|
|
275
275
|
|
276
276
|
it "should convert protocol-relative links to http" do
|
277
277
|
@m_http.links.should include('http://protocol-relative.com/contact')
|
278
|
-
@m_http.links.should include('http://yahoo.com')
|
278
|
+
@m_http.links.should include('http://yahoo.com/')
|
279
279
|
end
|
280
280
|
|
281
281
|
it "should convert protocol-relative links to https" do
|
282
282
|
@m_https.links.should include('https://protocol-relative.com/contact')
|
283
|
-
@m_https.links.should include('https://yahoo.com')
|
283
|
+
@m_https.links.should include('https://yahoo.com/')
|
284
284
|
end
|
285
285
|
end
|
286
286
|
|
@@ -355,7 +355,40 @@ describe MetaInspector do
|
|
355
355
|
describe 'to_hash' do
|
356
356
|
it "should return a hash with all the values set" do
|
357
357
|
@m = MetaInspector.new('http://pagerankalert.com')
|
358
|
-
@m.to_hash.should == {
|
358
|
+
@m.to_hash.should == {
|
359
|
+
"url" =>"http://pagerankalert.com/",
|
360
|
+
"title" =>"PageRankAlert.com :: Track your PageRank changes",
|
361
|
+
"links" => ["http://pagerankalert.com/",
|
362
|
+
"http://pagerankalert.com/es?language=es",
|
363
|
+
"http://pagerankalert.com/users/sign_up",
|
364
|
+
"http://pagerankalert.com/users/sign_in",
|
365
|
+
"mailto:pagerankalert@gmail.com",
|
366
|
+
"http://pagerankalert.posterous.com/",
|
367
|
+
"http://twitter.com/pagerankalert",
|
368
|
+
"http://twitter.com/share"],
|
369
|
+
"internal_links" => ["http://pagerankalert.com/",
|
370
|
+
"http://pagerankalert.com/es?language=es",
|
371
|
+
"http://pagerankalert.com/users/sign_up",
|
372
|
+
"http://pagerankalert.com/users/sign_in"],
|
373
|
+
"external_links" => ["mailto:pagerankalert@gmail.com",
|
374
|
+
"http://pagerankalert.posterous.com/",
|
375
|
+
"http://twitter.com/pagerankalert",
|
376
|
+
"http://twitter.com/share"],
|
377
|
+
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
378
|
+
"charset" => "utf-8",
|
379
|
+
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
380
|
+
"content_type" =>"text/html",
|
381
|
+
"meta" => {
|
382
|
+
"name" => {
|
383
|
+
"description"=> "Track your PageRank(TM) changes and receive alerts by email",
|
384
|
+
"keywords" => "pagerank, seo, optimization, google",
|
385
|
+
"robots" => "all,follow",
|
386
|
+
"csrf_param" => "authenticity_token",
|
387
|
+
"csrf_token" => "iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="
|
388
|
+
},
|
389
|
+
"property"=>{}
|
390
|
+
}
|
391
|
+
}
|
359
392
|
end
|
360
393
|
end
|
361
394
|
|
data/spec/redirections_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe MetaInspector do
|
|
9
9
|
m = MetaInspector.new("http://facebook.com")
|
10
10
|
m.title.should be_nil
|
11
11
|
m.should_not be_ok
|
12
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
|
12
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
|
13
13
|
end
|
14
14
|
|
15
15
|
it "allows safe redirections when :allow_redirections => :safe" do
|
@@ -30,14 +30,14 @@ describe MetaInspector do
|
|
30
30
|
m = MetaInspector.new("https://unsafe-facebook.com")
|
31
31
|
m.title.should be_nil
|
32
32
|
m.should_not be_ok
|
33
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
33
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
|
34
34
|
end
|
35
35
|
|
36
36
|
it "disallows unsafe redirections when :allow_redirections => :safe" do
|
37
37
|
m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
|
38
38
|
m.title.should be_nil
|
39
39
|
m.should_not be_ok
|
40
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
40
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
|
41
41
|
end
|
42
42
|
|
43
43
|
it "allows unsafe redirections when :allow_redirections => :all" do
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 41
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 15
|
9
|
-
- 0
|
10
|
-
version: 1.15.0
|
9
|
+
- 1
|
10
|
+
version: 1.15.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2013-
|
18
|
+
date: 2013-02-06 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -65,9 +65,25 @@ dependencies:
|
|
65
65
|
type: :runtime
|
66
66
|
version_requirements: *id003
|
67
67
|
- !ruby/object:Gem::Dependency
|
68
|
-
name:
|
68
|
+
name: addressable
|
69
69
|
prerelease: false
|
70
70
|
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 7
|
76
|
+
segments:
|
77
|
+
- 2
|
78
|
+
- 3
|
79
|
+
- 2
|
80
|
+
version: 2.3.2
|
81
|
+
type: :runtime
|
82
|
+
version_requirements: *id004
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
prerelease: false
|
86
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
71
87
|
none: false
|
72
88
|
requirements:
|
73
89
|
- - "="
|
@@ -79,11 +95,11 @@ dependencies:
|
|
79
95
|
- 0
|
80
96
|
version: 2.12.0
|
81
97
|
type: :development
|
82
|
-
version_requirements: *
|
98
|
+
version_requirements: *id005
|
83
99
|
- !ruby/object:Gem::Dependency
|
84
100
|
name: fakeweb
|
85
101
|
prerelease: false
|
86
|
-
requirement: &
|
102
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
87
103
|
none: false
|
88
104
|
requirements:
|
89
105
|
- - "="
|
@@ -95,11 +111,11 @@ dependencies:
|
|
95
111
|
- 0
|
96
112
|
version: 1.3.0
|
97
113
|
type: :development
|
98
|
-
version_requirements: *
|
114
|
+
version_requirements: *id006
|
99
115
|
- !ruby/object:Gem::Dependency
|
100
116
|
name: awesome_print
|
101
117
|
prerelease: false
|
102
|
-
requirement: &
|
118
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
103
119
|
none: false
|
104
120
|
requirements:
|
105
121
|
- - "="
|
@@ -111,11 +127,11 @@ dependencies:
|
|
111
127
|
- 0
|
112
128
|
version: 1.1.0
|
113
129
|
type: :development
|
114
|
-
version_requirements: *
|
130
|
+
version_requirements: *id007
|
115
131
|
- !ruby/object:Gem::Dependency
|
116
132
|
name: rake
|
117
133
|
prerelease: false
|
118
|
-
requirement: &
|
134
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
119
135
|
none: false
|
120
136
|
requirements:
|
121
137
|
- - ~>
|
@@ -127,7 +143,7 @@ dependencies:
|
|
127
143
|
- 3
|
128
144
|
version: 10.0.3
|
129
145
|
type: :development
|
130
|
-
version_requirements: *
|
146
|
+
version_requirements: *id008
|
131
147
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|
132
148
|
email:
|
133
149
|
- jaimeiniesta@gmail.com
|