metainspector 1.11.0 → 1.12.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +12 -0
- data/lib/meta_inspector.rb +1 -0
- data/lib/meta_inspector/open_uri.rb +50 -0
- data/lib/meta_inspector/scraper.rb +22 -4
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/facebook.com.response +6 -0
- data/spec/fixtures/https.facebook.com.response +18 -0
- data/spec/fixtures/unsafe_facebook.com.response +18 -0
- data/spec/fixtures/unsafe_https.facebook.com.response +6 -0
- data/spec/metainspector_spec.rb +0 -25
- data/spec/redirections_spec.rb +50 -0
- data/spec/spec_helper.rb +39 -1
- metadata +10 -4
data/README.rdoc
CHANGED
@@ -38,6 +38,14 @@ MetaInspector will try to parse all URLs by default. If you want to parse only t
|
|
38
38
|
|
39
39
|
page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
|
40
40
|
|
41
|
+
MetaInspector allows safe redirects from http to https sites by default. Passing allow_safe_redirections as false will throw exceptions on such redirects.
|
42
|
+
|
43
|
+
page = MetaInspector.new('facebook.com', :allow_safe_redirections => false)
|
44
|
+
|
45
|
+
To enable unsafe redirects from https to http sites, you can pass allow_unsafe_redirections as true. If this option is not specified or is false, an exception is thrown on such redirects.
|
46
|
+
|
47
|
+
page = MetaInspector.new('facebook.com', :allow_unsafe_redirections => true)
|
48
|
+
|
41
49
|
Then you can see the scraped data like this:
|
42
50
|
|
43
51
|
page.url # URL of the page
|
@@ -91,6 +99,10 @@ In case there have been any errors, you can check them with:
|
|
91
99
|
|
92
100
|
page.errors # Will return an array with the error messages
|
93
101
|
|
102
|
+
If you also want to see the errors on the console, you can initialize MetaInspector with the verbose option like this:
|
103
|
+
|
104
|
+
page = MetaInspector.new('http://example.com', :verbose => true)
|
105
|
+
|
94
106
|
= Examples
|
95
107
|
|
96
108
|
You can find some sample scripts in the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
|
data/lib/meta_inspector.rb
CHANGED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Patch to allow open-uri to follow safe (http to https) and unsafe (https to http) redirects.
|
2
|
+
# Original gist URL:
|
3
|
+
# https://gist.github.com/1271420
|
4
|
+
#
|
5
|
+
# Relevant issue:
|
6
|
+
# http://redmine.ruby-lang.org/issues/3719
|
7
|
+
#
|
8
|
+
# Source here:
|
9
|
+
# https://github.com/ruby/ruby/blob/trunk/lib/open-uri.rb
|
10
|
+
|
11
|
+
module OpenURI
|
12
|
+
class <<self
|
13
|
+
alias_method :open_uri_original, :open_uri
|
14
|
+
alias_method :redirectable_cautious?, :redirectable?
|
15
|
+
|
16
|
+
def redirectable_safe?(uri1, uri2)
|
17
|
+
uri1.scheme.downcase == uri2.scheme.downcase || (uri1.scheme.downcase == "http" && uri2.scheme.downcase == "https")
|
18
|
+
end
|
19
|
+
|
20
|
+
def redirectable_unsafe?(uri1, uri2)
|
21
|
+
!redirectable_safe?(uri1, uri2)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# The original open_uri takes *args but then doesn't do anything with them.
|
26
|
+
# Assume we can only handle a hash.
|
27
|
+
def self.open_uri(name, options = {})
|
28
|
+
redirectable_unsafe = options.delete :allow_unsafe_redirections
|
29
|
+
redirectable_safe = options.delete :allow_safe_redirections
|
30
|
+
|
31
|
+
if redirectable_unsafe
|
32
|
+
class <<self
|
33
|
+
remove_method :redirectable?
|
34
|
+
alias_method :redirectable?, :redirectable_unsafe?
|
35
|
+
end
|
36
|
+
elsif redirectable_safe
|
37
|
+
class <<self
|
38
|
+
remove_method :redirectable?
|
39
|
+
alias_method :redirectable?, :redirectable_safe?
|
40
|
+
end
|
41
|
+
else
|
42
|
+
class <<self
|
43
|
+
remove_method :redirectable?
|
44
|
+
alias_method :redirectable?, :redirectable_cautious?
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
self.open_uri_original name, options
|
49
|
+
end
|
50
|
+
end
|
@@ -9,20 +9,28 @@ require 'timeout'
|
|
9
9
|
module MetaInspector
|
10
10
|
class Scraper
|
11
11
|
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
|
12
|
+
attr_reader :allow_safe_redirections, :allow_unsafe_redirections, :verbose
|
12
13
|
|
13
14
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
15
|
# Options:
|
15
16
|
# => timeout: defaults to 20 seconds
|
16
17
|
# => html_content_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
18
|
+
# => allow_safe_redirections: if redirects from http to https sites should be allowed or not. Defaults to true
|
19
|
+
# => allow_unsafe_redirections: if redirects from https to http sites should be allowed or not. Defaults to false
|
17
20
|
def initialize(url, options = {})
|
21
|
+
options = defaults.merge(options)
|
22
|
+
|
18
23
|
@url = with_default_scheme(encode_url(url))
|
19
24
|
@scheme = URI.parse(@url).scheme
|
20
25
|
@host = URI.parse(@url).host
|
21
26
|
@root_url = "#{@scheme}://#{@host}/"
|
22
|
-
@timeout = options[:timeout]
|
27
|
+
@timeout = options[:timeout]
|
23
28
|
@data = Hashie::Rash.new
|
24
29
|
@errors = []
|
25
|
-
@html_content_only
|
30
|
+
@html_content_only = options[:html_content_only]
|
31
|
+
@allow_safe_redirections = options[:allow_safe_redirections]
|
32
|
+
@allow_unsafe_redirections = options[:allow_unsafe_redirections]
|
33
|
+
@verbose = options[:verbose]
|
26
34
|
end
|
27
35
|
|
28
36
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -134,6 +142,16 @@ module MetaInspector
|
|
134
142
|
|
135
143
|
private
|
136
144
|
|
145
|
+
def defaults
|
146
|
+
{
|
147
|
+
:timeout => 20,
|
148
|
+
:html_content_only => false,
|
149
|
+
:allow_safe_redirections => true,
|
150
|
+
:allow_unsafe_redirections => false,
|
151
|
+
:verbose => false
|
152
|
+
}
|
153
|
+
end
|
154
|
+
|
137
155
|
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
138
156
|
# meta name: keywords, description, robots, generator
|
139
157
|
# meta http-equiv: content-language, Content-Type
|
@@ -156,7 +174,7 @@ module MetaInspector
|
|
156
174
|
|
157
175
|
# Makes the request to the server
|
158
176
|
def request
|
159
|
-
Timeout::timeout(timeout) { @request ||= open(url) }
|
177
|
+
Timeout::timeout(timeout) { @request ||= open(url, {:allow_safe_redirections => allow_safe_redirections, :allow_unsafe_redirections => allow_unsafe_redirections}) }
|
160
178
|
|
161
179
|
rescue TimeoutError
|
162
180
|
add_fatal_error 'Timeout!!!'
|
@@ -199,7 +217,7 @@ module MetaInspector
|
|
199
217
|
|
200
218
|
# Stores the error for later inspection
|
201
219
|
def add_fatal_error(error)
|
202
|
-
warn error
|
220
|
+
warn error if verbose
|
203
221
|
@errors << error
|
204
222
|
end
|
205
223
|
|
@@ -0,0 +1,18 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Hello From Facebook</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
</body>
|
18
|
+
</html>
|
@@ -0,0 +1,18 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Hello From Unsafe Facebook</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
</body>
|
18
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -3,31 +3,6 @@
|
|
3
3
|
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
4
|
|
5
5
|
describe MetaInspector do
|
6
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
7
|
-
FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
8
|
-
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
9
|
-
FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
|
10
|
-
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
11
|
-
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
12
|
-
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
13
|
-
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
14
|
-
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
15
|
-
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
16
|
-
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
17
|
-
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
18
|
-
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
|
19
|
-
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
20
|
-
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
21
|
-
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
22
|
-
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
23
|
-
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
24
|
-
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
25
|
-
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
26
|
-
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
27
|
-
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
28
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
|
29
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
30
|
-
|
31
6
|
describe 'Initialization' do
|
32
7
|
it 'should accept an URL with a scheme' do
|
33
8
|
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector do
|
6
|
+
describe "redirections" do
|
7
|
+
describe "safe redirections (HTTP to HTTPS)" do
|
8
|
+
it "allows safe redirections by default" do
|
9
|
+
m = MetaInspector.new("http://facebook.com")
|
10
|
+
m.title.should == "Hello From Facebook"
|
11
|
+
m.should be_ok
|
12
|
+
end
|
13
|
+
|
14
|
+
it "allows safe redirections when specifically set to true" do
|
15
|
+
m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => true)
|
16
|
+
m.title.should == "Hello From Facebook"
|
17
|
+
m.should be_ok
|
18
|
+
end
|
19
|
+
|
20
|
+
it "disallows safe redirections if set to false" do
|
21
|
+
m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => false)
|
22
|
+
m.title.should be_nil
|
23
|
+
m.should_not be_ok
|
24
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
describe "unsafe redirections (HTTPS to HTTP)" do
|
29
|
+
it "disallows unsafe redirections by default" do
|
30
|
+
m = MetaInspector.new("https://unsafe-facebook.com")
|
31
|
+
m.title.should be_nil
|
32
|
+
m.should_not be_ok
|
33
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
34
|
+
end
|
35
|
+
|
36
|
+
it "disallows unsafe redirections when specifically set to false" do
|
37
|
+
m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => false)
|
38
|
+
m.title.should be_nil
|
39
|
+
m.should_not be_ok
|
40
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
41
|
+
end
|
42
|
+
|
43
|
+
it "allows unsafe redirections if set to true" do
|
44
|
+
m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => true)
|
45
|
+
m.title.should == "Hello From Unsafe Facebook"
|
46
|
+
m.should be_ok
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -10,4 +10,42 @@ def fixture_file(filename)
|
|
10
10
|
return '' if filename == ''
|
11
11
|
file_path = File.expand_path(File.dirname(__FILE__) + '/fixtures/' + filename)
|
12
12
|
File.read(file_path)
|
13
|
-
end
|
13
|
+
end
|
14
|
+
|
15
|
+
#######################
|
16
|
+
# Faked web responses #
|
17
|
+
#######################
|
18
|
+
|
19
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
20
|
+
FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
21
|
+
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
22
|
+
FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
|
23
|
+
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
24
|
+
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
25
|
+
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
26
|
+
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
27
|
+
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
28
|
+
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
29
|
+
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
30
|
+
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
31
|
+
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
|
32
|
+
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
33
|
+
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
34
|
+
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
35
|
+
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
36
|
+
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
37
|
+
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
38
|
+
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
39
|
+
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
40
|
+
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
41
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
|
42
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
43
|
+
|
44
|
+
# These examples are used to test the redirections from HTTP to HTTPS and vice versa
|
45
|
+
# http://facebook.com => https://www.facebook.com
|
46
|
+
FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
|
47
|
+
FakeWeb.register_uri(:get, "https://www.facebook.com/", :response => fixture_file("https.facebook.com.response"))
|
48
|
+
|
49
|
+
# https://unsafe-facebook.com => http://unsafe-facebook.com
|
50
|
+
FakeWeb.register_uri(:get, "https://unsafe-facebook.com/", :response => fixture_file("unsafe_https.facebook.com.response"))
|
51
|
+
FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture_file("unsafe_facebook.com.response"))
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 39
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
8
|
+
- 12
|
9
9
|
- 0
|
10
|
-
version: 1.
|
10
|
+
version: 1.12.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-12-01 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -131,6 +131,7 @@ files:
|
|
131
131
|
- README.rdoc
|
132
132
|
- Rakefile
|
133
133
|
- lib/meta_inspector.rb
|
134
|
+
- lib/meta_inspector/open_uri.rb
|
134
135
|
- lib/meta_inspector/scraper.rb
|
135
136
|
- lib/meta_inspector/version.rb
|
136
137
|
- lib/metainspector.rb
|
@@ -143,7 +144,9 @@ files:
|
|
143
144
|
- spec/fixtures/charset_001.response
|
144
145
|
- spec/fixtures/charset_002.response
|
145
146
|
- spec/fixtures/empty_page.response
|
147
|
+
- spec/fixtures/facebook.com.response
|
146
148
|
- spec/fixtures/guardian.co.uk.response
|
149
|
+
- spec/fixtures/https.facebook.com.response
|
147
150
|
- spec/fixtures/international.response
|
148
151
|
- spec/fixtures/invalid_href.response
|
149
152
|
- spec/fixtures/iteh.at.response
|
@@ -155,9 +158,12 @@ files:
|
|
155
158
|
- spec/fixtures/theonion-no-description.com.response
|
156
159
|
- spec/fixtures/theonion.com.response
|
157
160
|
- spec/fixtures/twitter_markupvalidator.response
|
161
|
+
- spec/fixtures/unsafe_facebook.com.response
|
162
|
+
- spec/fixtures/unsafe_https.facebook.com.response
|
158
163
|
- spec/fixtures/wordpress_site.response
|
159
164
|
- spec/fixtures/youtube.response
|
160
165
|
- spec/metainspector_spec.rb
|
166
|
+
- spec/redirections_spec.rb
|
161
167
|
- spec/spec_helper.rb
|
162
168
|
homepage: https://github.com/jaimeiniesta/metainspector
|
163
169
|
licenses: []
|