metainspector 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +12 -0
- data/lib/meta_inspector.rb +1 -0
- data/lib/meta_inspector/open_uri.rb +50 -0
- data/lib/meta_inspector/scraper.rb +22 -4
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/fixtures/facebook.com.response +6 -0
- data/spec/fixtures/https.facebook.com.response +18 -0
- data/spec/fixtures/unsafe_facebook.com.response +18 -0
- data/spec/fixtures/unsafe_https.facebook.com.response +6 -0
- data/spec/metainspector_spec.rb +0 -25
- data/spec/redirections_spec.rb +50 -0
- data/spec/spec_helper.rb +39 -1
- metadata +10 -4
data/README.rdoc
CHANGED
@@ -38,6 +38,14 @@ MetaInspector will try to parse all URLs by default. If you want to parse only t
|
|
38
38
|
|
39
39
|
page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
|
40
40
|
|
41
|
+
MetaInspector follows safe redirects (from http to https) by default. Passing :allow_safe_redirections => false disallows them, so such a redirect fails with a "redirection forbidden" error (reported in page.errors).
|
42
|
+
|
43
|
+
page = MetaInspector.new('facebook.com', :allow_safe_redirections => false)
|
44
|
+
|
45
|
+
To enable unsafe redirects (from https to http), pass :allow_unsafe_redirections => true. If this option is false or not specified, such a redirect fails with a "redirection forbidden" error (reported in page.errors).
|
46
|
+
|
47
|
+
page = MetaInspector.new('facebook.com', :allow_unsafe_redirections => true)
|
48
|
+
|
41
49
|
Then you can see the scraped data like this:
|
42
50
|
|
43
51
|
page.url # URL of the page
|
@@ -91,6 +99,10 @@ In case there have been any errors, you can check them with:
|
|
91
99
|
|
92
100
|
page.errors # Will return an array with the error messages
|
93
101
|
|
102
|
+
If you also want to see the errors on the console, you can initialize MetaInspector with the verbose option like this:
|
103
|
+
|
104
|
+
page = MetaInspector.new('http://example.com', :verbose => true)
|
105
|
+
|
94
106
|
= Examples
|
95
107
|
|
96
108
|
You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
|
data/lib/meta_inspector/open_uri.rb
CHANGED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Patch to allow open-uri to follow safe (http to https) and unsafe (https to http) redirects.
|
2
|
+
# Original gist URL:
|
3
|
+
# https://gist.github.com/1271420
|
4
|
+
#
|
5
|
+
# Relevant issue:
|
6
|
+
# http://redmine.ruby-lang.org/issues/3719
|
7
|
+
#
|
8
|
+
# Source here:
|
9
|
+
# https://github.com/ruby/ruby/blob/trunk/lib/open-uri.rb
|
10
|
+
|
11
|
+
module OpenURI
|
12
|
+
class <<self
|
13
|
+
alias_method :open_uri_original, :open_uri
|
14
|
+
alias_method :redirectable_cautious?, :redirectable?
|
15
|
+
|
16
|
+
def redirectable_safe?(uri1, uri2)
|
17
|
+
uri1.scheme.downcase == uri2.scheme.downcase || (uri1.scheme.downcase == "http" && uri2.scheme.downcase == "https")
|
18
|
+
end
|
19
|
+
|
20
|
+
def redirectable_unsafe?(uri1, uri2)
|
21
|
+
!redirectable_safe?(uri1, uri2)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# The original open_uri takes *args but then doesn't do anything with them.
|
26
|
+
# Assume we can only handle a hash.
|
27
|
+
def self.open_uri(name, options = {})
|
28
|
+
redirectable_unsafe = options.delete :allow_unsafe_redirections
|
29
|
+
redirectable_safe = options.delete :allow_safe_redirections
|
30
|
+
|
31
|
+
if redirectable_unsafe
|
32
|
+
class <<self
|
33
|
+
remove_method :redirectable?
|
34
|
+
alias_method :redirectable?, :redirectable_unsafe?
|
35
|
+
end
|
36
|
+
elsif redirectable_safe
|
37
|
+
class <<self
|
38
|
+
remove_method :redirectable?
|
39
|
+
alias_method :redirectable?, :redirectable_safe?
|
40
|
+
end
|
41
|
+
else
|
42
|
+
class <<self
|
43
|
+
remove_method :redirectable?
|
44
|
+
alias_method :redirectable?, :redirectable_cautious?
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
self.open_uri_original name, options
|
49
|
+
end
|
50
|
+
end
|
@@ -9,20 +9,28 @@ require 'timeout'
|
|
9
9
|
module MetaInspector
|
10
10
|
class Scraper
|
11
11
|
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
|
12
|
+
attr_reader :allow_safe_redirections, :allow_unsafe_redirections, :verbose
|
12
13
|
|
13
14
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
14
15
|
# Options:
|
15
16
|
# => timeout: defaults to 20 seconds
|
16
17
|
# => html_content_only: whether an exception should be raised if the response content-type is not text/html. Defaults to false
|
18
|
+
# => allow_safe_redirections: whether redirects from http to https should be followed. Defaults to true
|
19
|
+
# => allow_unsafe_redirections: whether redirects from https to http should be followed. Defaults to false
|
17
20
|
def initialize(url, options = {})
|
21
|
+
options = defaults.merge(options)
|
22
|
+
|
18
23
|
@url = with_default_scheme(encode_url(url))
|
19
24
|
@scheme = URI.parse(@url).scheme
|
20
25
|
@host = URI.parse(@url).host
|
21
26
|
@root_url = "#{@scheme}://#{@host}/"
|
22
|
-
@timeout = options[:timeout]
|
27
|
+
@timeout = options[:timeout]
|
23
28
|
@data = Hashie::Rash.new
|
24
29
|
@errors = []
|
25
|
-
@html_content_only
|
30
|
+
@html_content_only = options[:html_content_only]
|
31
|
+
@allow_safe_redirections = options[:allow_safe_redirections]
|
32
|
+
@allow_unsafe_redirections = options[:allow_unsafe_redirections]
|
33
|
+
@verbose = options[:verbose]
|
26
34
|
end
|
27
35
|
|
28
36
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -134,6 +142,16 @@ module MetaInspector
|
|
134
142
|
|
135
143
|
private
|
136
144
|
|
145
|
+
def defaults
|
146
|
+
{
|
147
|
+
:timeout => 20,
|
148
|
+
:html_content_only => false,
|
149
|
+
:allow_safe_redirections => true,
|
150
|
+
:allow_unsafe_redirections => false,
|
151
|
+
:verbose => false
|
152
|
+
}
|
153
|
+
end
|
154
|
+
|
137
155
|
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
138
156
|
# meta name: keywords, description, robots, generator
|
139
157
|
# meta http-equiv: content-language, Content-Type
|
@@ -156,7 +174,7 @@ module MetaInspector
|
|
156
174
|
|
157
175
|
# Makes the request to the server
|
158
176
|
def request
|
159
|
-
Timeout::timeout(timeout) { @request ||= open(url) }
|
177
|
+
Timeout::timeout(timeout) { @request ||= open(url, {:allow_safe_redirections => allow_safe_redirections, :allow_unsafe_redirections => allow_unsafe_redirections}) }
|
160
178
|
|
161
179
|
rescue TimeoutError
|
162
180
|
add_fatal_error 'Timeout!!!'
|
@@ -199,7 +217,7 @@ module MetaInspector
|
|
199
217
|
|
200
218
|
# Stores the error for later inspection
|
201
219
|
def add_fatal_error(error)
|
202
|
-
warn error
|
220
|
+
warn error if verbose
|
203
221
|
@errors << error
|
204
222
|
end
|
205
223
|
|
@@ -0,0 +1,18 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Hello From Facebook</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
</body>
|
18
|
+
</html>
|
@@ -0,0 +1,18 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
<title>Hello From Unsafe Facebook</title>
|
15
|
+
</head>
|
16
|
+
<body>
|
17
|
+
</body>
|
18
|
+
</html>
|
data/spec/metainspector_spec.rb
CHANGED
@@ -3,31 +3,6 @@
|
|
3
3
|
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
4
|
|
5
5
|
describe MetaInspector do
|
6
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
7
|
-
FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
8
|
-
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
9
|
-
FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
|
10
|
-
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
11
|
-
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
12
|
-
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
13
|
-
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
14
|
-
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
15
|
-
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
16
|
-
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
17
|
-
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
18
|
-
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
|
19
|
-
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
20
|
-
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
21
|
-
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
22
|
-
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
23
|
-
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
24
|
-
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
25
|
-
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
26
|
-
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
27
|
-
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
28
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
|
29
|
-
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
30
|
-
|
31
6
|
describe 'Initialization' do
|
32
7
|
it 'should accept an URL with a scheme' do
|
33
8
|
MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), "/spec_helper")
|
4
|
+
|
5
|
+
describe MetaInspector do
|
6
|
+
describe "redirections" do
|
7
|
+
describe "safe redirections (HTTP to HTTPS)" do
|
8
|
+
it "allows safe redirections by default" do
|
9
|
+
m = MetaInspector.new("http://facebook.com")
|
10
|
+
m.title.should == "Hello From Facebook"
|
11
|
+
m.should be_ok
|
12
|
+
end
|
13
|
+
|
14
|
+
it "allows safe redirections when specifically set to true" do
|
15
|
+
m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => true)
|
16
|
+
m.title.should == "Hello From Facebook"
|
17
|
+
m.should be_ok
|
18
|
+
end
|
19
|
+
|
20
|
+
it "disallows safe redirections if set to false" do
|
21
|
+
m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => false)
|
22
|
+
m.title.should be_nil
|
23
|
+
m.should_not be_ok
|
24
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
describe "unsafe redirections (HTTPS to HTTP)" do
|
29
|
+
it "disallows unsafe redirections by default" do
|
30
|
+
m = MetaInspector.new("https://unsafe-facebook.com")
|
31
|
+
m.title.should be_nil
|
32
|
+
m.should_not be_ok
|
33
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
34
|
+
end
|
35
|
+
|
36
|
+
it "disallows unsafe redirections when specifically set to false" do
|
37
|
+
m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => false)
|
38
|
+
m.title.should be_nil
|
39
|
+
m.should_not be_ok
|
40
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
41
|
+
end
|
42
|
+
|
43
|
+
it "allows unsafe redirections if set to true" do
|
44
|
+
m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => true)
|
45
|
+
m.title.should == "Hello From Unsafe Facebook"
|
46
|
+
m.should be_ok
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -10,4 +10,42 @@ def fixture_file(filename)
|
|
10
10
|
return '' if filename == ''
|
11
11
|
file_path = File.expand_path(File.dirname(__FILE__) + '/fixtures/' + filename)
|
12
12
|
File.read(file_path)
|
13
|
-
end
|
13
|
+
end
|
14
|
+
|
15
|
+
#######################
|
16
|
+
# Faked web responses #
|
17
|
+
#######################
|
18
|
+
|
19
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
20
|
+
FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
21
|
+
FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
|
22
|
+
FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
|
23
|
+
FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
|
24
|
+
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
|
25
|
+
FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
|
26
|
+
FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
|
27
|
+
FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
|
28
|
+
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
29
|
+
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
|
30
|
+
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
|
31
|
+
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
|
32
|
+
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
|
33
|
+
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
|
34
|
+
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
|
35
|
+
FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
|
36
|
+
FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
|
37
|
+
FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
|
38
|
+
FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
|
39
|
+
FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
|
40
|
+
FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
|
41
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
|
42
|
+
FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
|
43
|
+
|
44
|
+
# These examples are used to test the redirections from HTTP to HTTPS and vice versa
|
45
|
+
# http://facebook.com => https://www.facebook.com
|
46
|
+
FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
|
47
|
+
FakeWeb.register_uri(:get, "https://www.facebook.com/", :response => fixture_file("https.facebook.com.response"))
|
48
|
+
|
49
|
+
# https://unsafe-facebook.com => http://unsafe-facebook.com
|
50
|
+
FakeWeb.register_uri(:get, "https://unsafe-facebook.com/", :response => fixture_file("unsafe_https.facebook.com.response"))
|
51
|
+
FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture_file("unsafe_facebook.com.response"))
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 39
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 11
|
8
|
+
- 12
|
9
9
|
- 0
|
10
|
-
version: 1.11.0
|
10
|
+
version: 1.12.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-12-01 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
version_requirements: &id001 !ruby/object:Gem::Requirement
|
@@ -131,6 +131,7 @@ files:
|
|
131
131
|
- README.rdoc
|
132
132
|
- Rakefile
|
133
133
|
- lib/meta_inspector.rb
|
134
|
+
- lib/meta_inspector/open_uri.rb
|
134
135
|
- lib/meta_inspector/scraper.rb
|
135
136
|
- lib/meta_inspector/version.rb
|
136
137
|
- lib/metainspector.rb
|
@@ -143,7 +144,9 @@ files:
|
|
143
144
|
- spec/fixtures/charset_001.response
|
144
145
|
- spec/fixtures/charset_002.response
|
145
146
|
- spec/fixtures/empty_page.response
|
147
|
+
- spec/fixtures/facebook.com.response
|
146
148
|
- spec/fixtures/guardian.co.uk.response
|
149
|
+
- spec/fixtures/https.facebook.com.response
|
147
150
|
- spec/fixtures/international.response
|
148
151
|
- spec/fixtures/invalid_href.response
|
149
152
|
- spec/fixtures/iteh.at.response
|
@@ -155,9 +158,12 @@ files:
|
|
155
158
|
- spec/fixtures/theonion-no-description.com.response
|
156
159
|
- spec/fixtures/theonion.com.response
|
157
160
|
- spec/fixtures/twitter_markupvalidator.response
|
161
|
+
- spec/fixtures/unsafe_facebook.com.response
|
162
|
+
- spec/fixtures/unsafe_https.facebook.com.response
|
158
163
|
- spec/fixtures/wordpress_site.response
|
159
164
|
- spec/fixtures/youtube.response
|
160
165
|
- spec/metainspector_spec.rb
|
166
|
+
- spec/redirections_spec.rb
|
161
167
|
- spec/spec_helper.rb
|
162
168
|
homepage: https://github.com/jaimeiniesta/metainspector
|
163
169
|
licenses: []
|