metainspector 1.14.0 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -5
- data/lib/meta_inspector/scraper.rb +7 -11
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/redirections_spec.rb +14 -14
- metadata +8 -8
data/README.md
CHANGED
@@ -93,13 +93,15 @@ You can set a different timeout with a second parameter, like this:
|
|
93
93
|
|
94
94
|
### Redirections
|
95
95
|
|
96
|
-
|
96
|
+
By default, redirections from HTTP to HTTPS and from HTTPS to HTTP are disallowed.
|
97
97
|
|
98
|
-
|
98
|
+
However, you can tell MetaInspector to allow these redirections with the option `:allow_redirections`, like this:
|
99
99
|
|
100
|
-
|
101
|
-
|
102
|
-
|
100
|
+
# This will allow HTTP => HTTPS redirections
|
101
|
+
page = MetaInspector.new('facebook.com', :allow_redirections => :safe)
|
102
|
+
|
103
|
+
# And this will allow HTTP => HTTPS ("safe") and HTTPS => HTTP ("unsafe") redirections
|
104
|
+
page = MetaInspector.new('facebook.com', :allow_redirections => :all)
|
103
105
|
|
104
106
|
### HTML Content Only
|
105
107
|
|
@@ -10,14 +10,13 @@ require 'timeout'
|
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
12
|
attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
|
13
|
-
attr_reader :
|
13
|
+
attr_reader :allow_redirections, :verbose
|
14
14
|
|
15
15
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
16
16
|
# Options:
|
17
17
|
# => timeout: defaults to 20 seconds
|
18
18
|
# => html_content_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
19
|
-
# =>
|
20
|
-
# => allow_unsafe_redirections: if redirects from https to http sites on the same domain should be allowed or not
|
19
|
+
# => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
|
21
20
|
# => document: the html of the url as a string
|
22
21
|
# => verbose: if the errors should be logged to the screen
|
23
22
|
def initialize(url, options = {})
|
@@ -30,11 +29,10 @@ module MetaInspector
|
|
30
29
|
@timeout = options[:timeout]
|
31
30
|
@data = Hashie::Rash.new
|
32
31
|
@errors = []
|
33
|
-
@html_content_only
|
34
|
-
@
|
35
|
-
@
|
36
|
-
@
|
37
|
-
@document = options[:document]
|
32
|
+
@html_content_only = options[:html_content_only]
|
33
|
+
@allow_redirections = options[:allow_redirections]
|
34
|
+
@verbose = options[:verbose]
|
35
|
+
@document = options[:document]
|
38
36
|
end
|
39
37
|
|
40
38
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -139,8 +137,6 @@ module MetaInspector
|
|
139
137
|
{
|
140
138
|
:timeout => 20,
|
141
139
|
:html_content_only => false,
|
142
|
-
:allow_safe_redirections => true,
|
143
|
-
:allow_unsafe_redirections => false,
|
144
140
|
:verbose => false
|
145
141
|
}
|
146
142
|
end
|
@@ -167,7 +163,7 @@ module MetaInspector
|
|
167
163
|
|
168
164
|
# Makes the request to the server
|
169
165
|
def request
|
170
|
-
Timeout::timeout(timeout) { @request ||= open(url, {:
|
166
|
+
Timeout::timeout(timeout) { @request ||= open(url, {:allow_redirections => allow_redirections}) }
|
171
167
|
|
172
168
|
rescue TimeoutError
|
173
169
|
add_fatal_error 'Timeout!!!'
|
data/meta_inspector.gemspec
CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |gem|
|
|
16
16
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
18
|
gem.add_dependency 'rash', '0.3.2'
|
19
|
-
gem.add_dependency 'open_uri_redirections', '0.0
|
19
|
+
gem.add_dependency 'open_uri_redirections', '~> 0.1.0'
|
20
20
|
|
21
21
|
gem.add_development_dependency 'rspec', '2.12.0'
|
22
22
|
gem.add_development_dependency 'fakeweb', '1.3.0'
|
data/spec/redirections_spec.rb
CHANGED
@@ -5,23 +5,23 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
|
|
5
5
|
describe MetaInspector do
|
6
6
|
describe "redirections" do
|
7
7
|
describe "safe redirections (HTTP to HTTPS)" do
|
8
|
-
it "
|
8
|
+
it "disallows safe redirections by default" do
|
9
9
|
m = MetaInspector.new("http://facebook.com")
|
10
|
-
m.title.should
|
11
|
-
m.
|
10
|
+
m.title.should be_nil
|
11
|
+
m.should_not be_ok
|
12
|
+
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
|
12
13
|
end
|
13
14
|
|
14
|
-
it "allows safe redirections when
|
15
|
-
m = MetaInspector.new("http://facebook.com", :
|
15
|
+
it "allows safe redirections when :allow_redirections => :safe" do
|
16
|
+
m = MetaInspector.new("http://facebook.com", :allow_redirections => :safe)
|
16
17
|
m.title.should == "Hello From Facebook"
|
17
18
|
m.should be_ok
|
18
19
|
end
|
19
20
|
|
20
|
-
it "
|
21
|
-
m = MetaInspector.new("http://facebook.com", :
|
22
|
-
m.title.should
|
23
|
-
m.
|
24
|
-
m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
|
21
|
+
it "allows safe redirections when :allow_redirections => :all" do
|
22
|
+
m = MetaInspector.new("http://facebook.com", :allow_redirections => :all)
|
23
|
+
m.title.should == "Hello From Facebook"
|
24
|
+
m.should be_ok
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
@@ -33,15 +33,15 @@ describe MetaInspector do
|
|
33
33
|
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
34
34
|
end
|
35
35
|
|
36
|
-
it "disallows unsafe redirections when
|
37
|
-
m = MetaInspector.new("https://unsafe-facebook.com", :
|
36
|
+
it "disallows unsafe redirections when :allow_redirections => :safe" do
|
37
|
+
m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
|
38
38
|
m.title.should be_nil
|
39
39
|
m.should_not be_ok
|
40
40
|
m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
|
41
41
|
end
|
42
42
|
|
43
|
-
it "allows unsafe redirections
|
44
|
-
m = MetaInspector.new("https://unsafe-facebook.com", :
|
43
|
+
it "allows unsafe redirections when :allow_redirections => :all" do
|
44
|
+
m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :all)
|
45
45
|
m.title.should == "Hello From Unsafe Facebook"
|
46
46
|
m.should be_ok
|
47
47
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 43
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
-
|
8
|
+
- 15
|
9
9
|
- 0
|
10
|
-
version: 1.
|
10
|
+
version: 1.15.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2013-01-
|
18
|
+
date: 2013-01-19 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -54,14 +54,14 @@ dependencies:
|
|
54
54
|
requirement: &id003 !ruby/object:Gem::Requirement
|
55
55
|
none: false
|
56
56
|
requirements:
|
57
|
-
- -
|
57
|
+
- - ~>
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
hash:
|
59
|
+
hash: 27
|
60
60
|
segments:
|
61
61
|
- 0
|
62
|
-
- 0
|
63
62
|
- 1
|
64
|
-
|
63
|
+
- 0
|
64
|
+
version: 0.1.0
|
65
65
|
type: :runtime
|
66
66
|
version_requirements: *id003
|
67
67
|
- !ruby/object:Gem::Dependency
|