metainspector 1.11.0 → 1.12.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -38,6 +38,14 @@ MetaInspector will try to parse all URLs by default. If you want to parse only t
38
38
 
39
39
  page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
40
40
 
41
+ MetaInspector allows safe redirects from http to https sites by default. Passing :allow_safe_redirections => false forbids such redirects; the request then fails with a "redirection forbidden" error, which you can find in page.errors.
42
+
43
+ page = MetaInspector.new('facebook.com', :allow_safe_redirections => false)
44
+
45
+ To enable unsafe redirects from https to http sites, you can pass :allow_unsafe_redirections => true. If this option is not specified, or is false, such redirects fail with a "redirection forbidden" error, which you can find in page.errors.
46
+
47
+ page = MetaInspector.new('facebook.com', :allow_unsafe_redirections => true)
48
+
41
49
  Then you can see the scraped data like this:
42
50
 
43
51
  page.url # URL of the page
@@ -91,6 +99,10 @@ In case there have been any errors, you can check them with:
91
99
 
92
100
  page.errors # Will return an array with the error messages
93
101
 
102
+ If you also want to see the errors on the console, you can initialize MetaInspector with the verbose option like this:
103
+
104
+ page = MetaInspector.new('http://example.com', :verbose => true)
105
+
94
106
  = Examples
95
107
 
96
108
  You can find some sample scripts in the samples folder, including a basic scraping script and a spider that will follow external links using a queue. What follows is an example of use from irb:
@@ -1,6 +1,7 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/scraper'))
4
+ require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/open_uri'))
4
5
 
5
6
  module MetaInspector
6
7
  extend self
@@ -0,0 +1,50 @@
1
+ # Patch to allow open-uri to follow safe (http to https) and unsafe redirects (https to http).
2
+ # Original gist URL:
3
+ # https://gist.github.com/1271420
4
+ #
5
+ # Relevant issue:
6
+ # http://redmine.ruby-lang.org/issues/3719
7
+ #
8
+ # Source here:
9
+ # https://github.com/ruby/ruby/blob/trunk/lib/open-uri.rb
10
+
11
+ module OpenURI
12
+ class <<self
13
+ alias_method :open_uri_original, :open_uri
14
+ alias_method :redirectable_cautious?, :redirectable?
15
+
16
+ def redirectable_safe?(uri1, uri2)
17
+ uri1.scheme.downcase == uri2.scheme.downcase || (uri1.scheme.downcase == "http" && uri2.scheme.downcase == "https")
18
+ end
19
+
20
+ def redirectable_unsafe?(uri1, uri2)
21
+ !redirectable_safe?(uri1, uri2)
22
+ end
23
+ end
24
+
25
+ # The original open_uri takes *args but then doesn't do anything with them.
26
+ # Assume we can only handle a hash.
27
+ def self.open_uri(name, options = {})
28
+ redirectable_unsafe = options.delete :allow_unsafe_redirections
29
+ redirectable_safe = options.delete :allow_safe_redirections
30
+
31
+ if redirectable_unsafe
32
+ class <<self
33
+ remove_method :redirectable?
34
+ alias_method :redirectable?, :redirectable_unsafe?
35
+ end
36
+ elsif redirectable_safe
37
+ class <<self
38
+ remove_method :redirectable?
39
+ alias_method :redirectable?, :redirectable_safe?
40
+ end
41
+ else
42
+ class <<self
43
+ remove_method :redirectable?
44
+ alias_method :redirectable?, :redirectable_cautious?
45
+ end
46
+ end
47
+
48
+ self.open_uri_original name, options
49
+ end
50
+ end
@@ -9,20 +9,28 @@ require 'timeout'
9
9
  module MetaInspector
10
10
  class Scraper
11
11
  attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
12
+ attr_reader :allow_safe_redirections, :allow_unsafe_redirections, :verbose
12
13
 
13
14
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
15
  # Options:
15
16
  # => timeout: defaults to 20 seconds
16
17
  # => html_content_only: whether an exception should be raised if the response content-type is not text/html. Defaults to false
18
+ # => allow_safe_redirections: whether redirects from http to https sites should be allowed (only the URI schemes are compared, not the hosts). Defaults to true
19
+ # => allow_unsafe_redirections: whether redirects from https to http sites should be allowed (only the URI schemes are compared, not the hosts). Defaults to false
17
20
  def initialize(url, options = {})
21
+ options = defaults.merge(options)
22
+
18
23
  @url = with_default_scheme(encode_url(url))
19
24
  @scheme = URI.parse(@url).scheme
20
25
  @host = URI.parse(@url).host
21
26
  @root_url = "#{@scheme}://#{@host}/"
22
- @timeout = options[:timeout] || 20
27
+ @timeout = options[:timeout]
23
28
  @data = Hashie::Rash.new
24
29
  @errors = []
25
- @html_content_only = options[:html_content_only] || false
30
+ @html_content_only = options[:html_content_only]
31
+ @allow_safe_redirections = options[:allow_safe_redirections]
32
+ @allow_unsafe_redirections = options[:allow_unsafe_redirections]
33
+ @verbose = options[:verbose]
26
34
  end
27
35
 
28
36
  # Returns the parsed document title, from the content of the <title> tag.
@@ -134,6 +142,16 @@ module MetaInspector
134
142
 
135
143
  private
136
144
 
145
+ def defaults
146
+ {
147
+ :timeout => 20,
148
+ :html_content_only => false,
149
+ :allow_safe_redirections => true,
150
+ :allow_unsafe_redirections => false,
151
+ :verbose => false
152
+ }
153
+ end
154
+
137
155
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
138
156
  # meta name: keywords, description, robots, generator
139
157
  # meta http-equiv: content-language, Content-Type
@@ -156,7 +174,7 @@ module MetaInspector
156
174
 
157
175
  # Makes the request to the server
158
176
  def request
159
- Timeout::timeout(timeout) { @request ||= open(url) }
177
+ Timeout::timeout(timeout) { @request ||= open(url, {:allow_safe_redirections => allow_safe_redirections, :allow_unsafe_redirections => allow_unsafe_redirections}) }
160
178
 
161
179
  rescue TimeoutError
162
180
  add_fatal_error 'Timeout!!!'
@@ -199,7 +217,7 @@ module MetaInspector
199
217
 
200
218
  # Stores the error for later inspection
201
219
  def add_fatal_error(error)
202
- warn error
220
+ warn error if verbose
203
221
  @errors << error
204
222
  end
205
223
 
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.11.0"
4
+ VERSION = "1.12.0"
5
5
  end
@@ -0,0 +1,6 @@
1
+ HTTP/1.1 302 Found
2
+ Location: https://www.facebook.com/
3
+ Connection:keep-alive
4
+ Content-Length:0
5
+ Content-Type:text/html; charset=utf-8
6
+ Date:Mon, 26 Nov 2012 23:02:17 GMT
@@ -0,0 +1,18 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Hello From Facebook</title>
15
+ </head>
16
+ <body>
17
+ </body>
18
+ </html>
@@ -0,0 +1,18 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Hello From Unsafe Facebook</title>
15
+ </head>
16
+ <body>
17
+ </body>
18
+ </html>
@@ -0,0 +1,6 @@
1
+ HTTP/1.1 302 Found
2
+ Location: http://unsafe-facebook.com/
3
+ Connection:keep-alive
4
+ Content-Length:0
5
+ Content-Type:text/html; charset=utf-8
6
+ Date:Mon, 26 Nov 2012 23:02:17 GMT
@@ -3,31 +3,6 @@
3
3
  require File.join(File.dirname(__FILE__), "/spec_helper")
4
4
 
5
5
  describe MetaInspector do
6
- FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
7
- FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
8
- FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
9
- FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
10
- FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
11
- FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
12
- FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
13
- FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
14
- FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
15
- FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
16
- FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
17
- FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
18
- FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
19
- FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
20
- FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
21
- FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
22
- FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
23
- FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
24
- FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
25
- FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
26
- FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
27
- FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
28
- FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
29
- FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
30
-
31
6
  describe 'Initialization' do
32
7
  it 'should accept an URL with a scheme' do
33
8
  MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
@@ -0,0 +1,50 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require File.join(File.dirname(__FILE__), "/spec_helper")
4
+
5
+ describe MetaInspector do
6
+ describe "redirections" do
7
+ describe "safe redirections (HTTP to HTTPS)" do
8
+ it "allows safe redirections by default" do
9
+ m = MetaInspector.new("http://facebook.com")
10
+ m.title.should == "Hello From Facebook"
11
+ m.should be_ok
12
+ end
13
+
14
+ it "allows safe redirections when specifically set to true" do
15
+ m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => true)
16
+ m.title.should == "Hello From Facebook"
17
+ m.should be_ok
18
+ end
19
+
20
+ it "disallows safe redirections if set to false" do
21
+ m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => false)
22
+ m.title.should be_nil
23
+ m.should_not be_ok
24
+ m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
25
+ end
26
+ end
27
+
28
+ describe "unsafe redirections (HTTPS to HTTP)" do
29
+ it "disallows unsafe redirections by default" do
30
+ m = MetaInspector.new("https://unsafe-facebook.com")
31
+ m.title.should be_nil
32
+ m.should_not be_ok
33
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
34
+ end
35
+
36
+ it "disallows unsafe redirections when specifically set to false" do
37
+ m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => false)
38
+ m.title.should be_nil
39
+ m.should_not be_ok
40
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
41
+ end
42
+
43
+ it "allows unsafe redirections if set to true" do
44
+ m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => true)
45
+ m.title.should == "Hello From Unsafe Facebook"
46
+ m.should be_ok
47
+ end
48
+ end
49
+ end
50
+ end
data/spec/spec_helper.rb CHANGED
@@ -10,4 +10,42 @@ def fixture_file(filename)
10
10
  return '' if filename == ''
11
11
  file_path = File.expand_path(File.dirname(__FILE__) + '/fixtures/' + filename)
12
12
  File.read(file_path)
13
- end
13
+ end
14
+
15
+ #######################
16
+ # Faked web responses #
17
+ #######################
18
+
19
+ FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
20
+ FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
21
+ FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
22
+ FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
23
+ FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
24
+ FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
25
+ FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
26
+ FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
27
+ FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
28
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
29
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
30
+ FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
31
+ FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
32
+ FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
33
+ FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
34
+ FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
35
+ FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
36
+ FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
37
+ FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
38
+ FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
39
+ FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
40
+ FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
41
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
42
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
43
+
44
+ # These examples are used to test the redirections from HTTP to HTTPS and vice versa
45
+ # http://facebook.com => https://www.facebook.com
46
+ FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
47
+ FakeWeb.register_uri(:get, "https://www.facebook.com/", :response => fixture_file("https.facebook.com.response"))
48
+
49
+ # https://unsafe-facebook.com => http://unsafe-facebook.com
50
+ FakeWeb.register_uri(:get, "https://unsafe-facebook.com/", :response => fixture_file("unsafe_https.facebook.com.response"))
51
+ FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture_file("unsafe_facebook.com.response"))
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 59
4
+ hash: 39
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 11
8
+ - 12
9
9
  - 0
10
- version: 1.11.0
10
+ version: 1.12.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-26 00:00:00 Z
18
+ date: 2012-12-01 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -131,6 +131,7 @@ files:
131
131
  - README.rdoc
132
132
  - Rakefile
133
133
  - lib/meta_inspector.rb
134
+ - lib/meta_inspector/open_uri.rb
134
135
  - lib/meta_inspector/scraper.rb
135
136
  - lib/meta_inspector/version.rb
136
137
  - lib/metainspector.rb
@@ -143,7 +144,9 @@ files:
143
144
  - spec/fixtures/charset_001.response
144
145
  - spec/fixtures/charset_002.response
145
146
  - spec/fixtures/empty_page.response
147
+ - spec/fixtures/facebook.com.response
146
148
  - spec/fixtures/guardian.co.uk.response
149
+ - spec/fixtures/https.facebook.com.response
147
150
  - spec/fixtures/international.response
148
151
  - spec/fixtures/invalid_href.response
149
152
  - spec/fixtures/iteh.at.response
@@ -155,9 +158,12 @@ files:
155
158
  - spec/fixtures/theonion-no-description.com.response
156
159
  - spec/fixtures/theonion.com.response
157
160
  - spec/fixtures/twitter_markupvalidator.response
161
+ - spec/fixtures/unsafe_facebook.com.response
162
+ - spec/fixtures/unsafe_https.facebook.com.response
158
163
  - spec/fixtures/wordpress_site.response
159
164
  - spec/fixtures/youtube.response
160
165
  - spec/metainspector_spec.rb
166
+ - spec/redirections_spec.rb
161
167
  - spec/spec_helper.rb
162
168
  homepage: https://github.com/jaimeiniesta/metainspector
163
169
  licenses: []