metainspector 1.11.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -38,6 +38,14 @@ MetaInspector will try to parse all URLs by default. If you want to parse only t
38
38
 
39
39
  page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
40
40
 
41
+ MetaInspector allows safe redirects from http to https sites by default. If you pass allow_safe_redirections as false, such redirects will fail with a "redirection forbidden" error, which you can check in page.errors.
42
+
43
+ page = MetaInspector.new('facebook.com', :allow_safe_redirections => false)
44
+
45
+ To enable unsafe redirects from https to http sites, you can pass allow_unsafe_redirections as true. If this option is not specified or is false, such redirects will fail with a "redirection forbidden" error, which you can check in page.errors.
46
+
47
+ page = MetaInspector.new('facebook.com', :allow_unsafe_redirections => true)
48
+
41
49
  Then you can see the scraped data like this:
42
50
 
43
51
  page.url # URL of the page
@@ -91,6 +99,10 @@ In case there have been any errors, you can check them with:
91
99
 
92
100
  page.errors # Will return an array with the error messages
93
101
 
102
+ If you also want to see the errors on the console, you can initialize MetaInspector with the verbose option like this:
103
+
104
+ page = MetaInspector.new('http://example.com', :verbose => true)
105
+
94
106
  = Examples
95
107
 
96
108
  You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
@@ -1,6 +1,7 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/scraper'))
4
+ require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/open_uri'))
4
5
 
5
6
  module MetaInspector
6
7
  extend self
@@ -0,0 +1,50 @@
1
+ # Patch to allow open-uri to follow safe (http to https) and unsafe redirects (https to http).
2
+ # Original gist URL:
3
+ # https://gist.github.com/1271420
4
+ #
5
+ # Relevant issue:
6
+ # http://redmine.ruby-lang.org/issues/3719
7
+ #
8
+ # Source here:
9
+ # https://github.com/ruby/ruby/blob/trunk/lib/open-uri.rb
10
+
11
+ module OpenURI
12
+ class <<self
13
+ alias_method :open_uri_original, :open_uri
14
+ alias_method :redirectable_cautious?, :redirectable?
15
+
16
+ def redirectable_safe?(uri1, uri2)
17
+ uri1.scheme.downcase == uri2.scheme.downcase || (uri1.scheme.downcase == "http" && uri2.scheme.downcase == "https")
18
+ end
19
+
20
+ def redirectable_unsafe?(uri1, uri2)
21
+ !redirectable_safe?(uri1, uri2)
22
+ end
23
+ end
24
+
25
+ # The original open_uri takes *args but then doesn't do anything with them.
26
+ # Assume we can only handle a hash.
27
+ def self.open_uri(name, options = {})
28
+ redirectable_unsafe = options.delete :allow_unsafe_redirections
29
+ redirectable_safe = options.delete :allow_safe_redirections
30
+
31
+ if redirectable_unsafe
32
+ class <<self
33
+ remove_method :redirectable?
34
+ alias_method :redirectable?, :redirectable_unsafe?
35
+ end
36
+ elsif redirectable_safe
37
+ class <<self
38
+ remove_method :redirectable?
39
+ alias_method :redirectable?, :redirectable_safe?
40
+ end
41
+ else
42
+ class <<self
43
+ remove_method :redirectable?
44
+ alias_method :redirectable?, :redirectable_cautious?
45
+ end
46
+ end
47
+
48
+ self.open_uri_original name, options
49
+ end
50
+ end
@@ -9,20 +9,28 @@ require 'timeout'
9
9
  module MetaInspector
10
10
  class Scraper
11
11
  attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
12
+ attr_reader :allow_safe_redirections, :allow_unsafe_redirections, :verbose
12
13
 
13
14
  # Initializes a new instance of MetaInspector, setting the URL to the one given
14
15
  # Options:
15
16
  # => timeout: defaults to 20 seconds
16
17
  # => html_content_only: if an exception should be raised if request content-type is not text/html. Defaults to false
18
+ # => allow_safe_redirections: if redirects from http to https should be followed. Defaults to true
19
+ # => allow_unsafe_redirections: if redirects from https to http should be followed. Defaults to false
17
20
  def initialize(url, options = {})
21
+ options = defaults.merge(options)
22
+
18
23
  @url = with_default_scheme(encode_url(url))
19
24
  @scheme = URI.parse(@url).scheme
20
25
  @host = URI.parse(@url).host
21
26
  @root_url = "#{@scheme}://#{@host}/"
22
- @timeout = options[:timeout] || 20
27
+ @timeout = options[:timeout]
23
28
  @data = Hashie::Rash.new
24
29
  @errors = []
25
- @html_content_only = options[:html_content_only] || false
30
+ @html_content_only = options[:html_content_only]
31
+ @allow_safe_redirections = options[:allow_safe_redirections]
32
+ @allow_unsafe_redirections = options[:allow_unsafe_redirections]
33
+ @verbose = options[:verbose]
26
34
  end
27
35
 
28
36
  # Returns the parsed document title, from the content of the <title> tag.
@@ -134,6 +142,16 @@ module MetaInspector
134
142
 
135
143
  private
136
144
 
145
+ def defaults
146
+ {
147
+ :timeout => 20,
148
+ :html_content_only => false,
149
+ :allow_safe_redirections => true,
150
+ :allow_unsafe_redirections => false,
151
+ :verbose => false
152
+ }
153
+ end
154
+
137
155
  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
138
156
  # meta name: keywords, description, robots, generator
139
157
  # meta http-equiv: content-language, Content-Type
@@ -156,7 +174,7 @@ module MetaInspector
156
174
 
157
175
  # Makes the request to the server
158
176
  def request
159
- Timeout::timeout(timeout) { @request ||= open(url) }
177
+ Timeout::timeout(timeout) { @request ||= open(url, {:allow_safe_redirections => allow_safe_redirections, :allow_unsafe_redirections => allow_unsafe_redirections}) }
160
178
 
161
179
  rescue TimeoutError
162
180
  add_fatal_error 'Timeout!!!'
@@ -199,7 +217,7 @@ module MetaInspector
199
217
 
200
218
  # Stores the error for later inspection
201
219
  def add_fatal_error(error)
202
- warn error
220
+ warn error if verbose
203
221
  @errors << error
204
222
  end
205
223
 
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.11.0"
4
+ VERSION = "1.12.0"
5
5
  end
@@ -0,0 +1,6 @@
1
+ HTTP/1.1 302 Found
2
+ Location: https://www.facebook.com/
3
+ Connection:keep-alive
4
+ Content-Length:0
5
+ Content-Type:text/html; charset=utf-8
6
+ Date:Mon, 26 Nov 2012 23:02:17 GMT
@@ -0,0 +1,18 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Hello From Facebook</title>
15
+ </head>
16
+ <body>
17
+ </body>
18
+ </html>
@@ -0,0 +1,18 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ <title>Hello From Unsafe Facebook</title>
15
+ </head>
16
+ <body>
17
+ </body>
18
+ </html>
@@ -0,0 +1,6 @@
1
+ HTTP/1.1 302 Found
2
+ Location: http://unsafe-facebook.com/
3
+ Connection:keep-alive
4
+ Content-Length:0
5
+ Content-Type:text/html; charset=utf-8
6
+ Date:Mon, 26 Nov 2012 23:02:17 GMT
@@ -3,31 +3,6 @@
3
3
  require File.join(File.dirname(__FILE__), "/spec_helper")
4
4
 
5
5
  describe MetaInspector do
6
- FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
7
- FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
8
- FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
9
- FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
10
- FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
11
- FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
12
- FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
13
- FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
14
- FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
15
- FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
16
- FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
17
- FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
18
- FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
19
- FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
20
- FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
21
- FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
22
- FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
23
- FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
24
- FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
25
- FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
26
- FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
27
- FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
28
- FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
29
- FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
30
-
31
6
  describe 'Initialization' do
32
7
  it 'should accept an URL with a scheme' do
33
8
  MetaInspector.new('http://pagerankalert.com').url.should == 'http://pagerankalert.com'
@@ -0,0 +1,50 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require File.join(File.dirname(__FILE__), "/spec_helper")
4
+
5
+ describe MetaInspector do
6
+ describe "redirections" do
7
+ describe "safe redirections (HTTP to HTTPS)" do
8
+ it "allows safe redirections by default" do
9
+ m = MetaInspector.new("http://facebook.com")
10
+ m.title.should == "Hello From Facebook"
11
+ m.should be_ok
12
+ end
13
+
14
+ it "allows safe redirections when specifically set to true" do
15
+ m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => true)
16
+ m.title.should == "Hello From Facebook"
17
+ m.should be_ok
18
+ end
19
+
20
+ it "disallows safe redirections if set to false" do
21
+ m = MetaInspector.new("http://facebook.com", :allow_safe_redirections => false)
22
+ m.title.should be_nil
23
+ m.should_not be_ok
24
+ m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com -> https://www.facebook.com/"
25
+ end
26
+ end
27
+
28
+ describe "unsafe redirections (HTTPS to HTTP)" do
29
+ it "disallows unsafe redirections by default" do
30
+ m = MetaInspector.new("https://unsafe-facebook.com")
31
+ m.title.should be_nil
32
+ m.should_not be_ok
33
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
34
+ end
35
+
36
+ it "disallows unsafe redirections when specifically set to false" do
37
+ m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => false)
38
+ m.title.should be_nil
39
+ m.should_not be_ok
40
+ m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com -> http://unsafe-facebook.com/"
41
+ end
42
+
43
+ it "allows unsafe redirections if set to true" do
44
+ m = MetaInspector.new("https://unsafe-facebook.com", :allow_unsafe_redirections => true)
45
+ m.title.should == "Hello From Unsafe Facebook"
46
+ m.should be_ok
47
+ end
48
+ end
49
+ end
50
+ end
data/spec/spec_helper.rb CHANGED
@@ -10,4 +10,42 @@ def fixture_file(filename)
10
10
  return '' if filename == ''
11
11
  file_path = File.expand_path(File.dirname(__FILE__) + '/fixtures/' + filename)
12
12
  File.read(file_path)
13
- end
13
+ end
14
+
15
+ #######################
16
+ # Faked web responses #
17
+ #######################
18
+
19
+ FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
20
+ FakeWeb.register_uri(:get, "pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
21
+ FakeWeb.register_uri(:get, "http://www.alazan.com", :response => fixture_file("alazan.com.response"))
22
+ FakeWeb.register_uri(:get, "http://alazan.com/websolution.asp", :response => fixture_file("alazan_websolution.response"))
23
+ FakeWeb.register_uri(:get, "http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/", :response => fixture_file("theonion.com.response"))
24
+ FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
25
+ FakeWeb.register_uri(:get, "http://www.iteh.at", :response => fixture_file("iteh.at.response"))
26
+ FakeWeb.register_uri(:get, "http://www.tea-tron.com/jbravo/blog/", :response => fixture_file("tea-tron.com.response"))
27
+ FakeWeb.register_uri(:get, "http://www.guardian.co.uk/media/pda/2011/sep/15/techcrunch-arrington-startups", :response => fixture_file("guardian.co.uk.response"))
28
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
29
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
30
+ FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
31
+ FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
32
+ FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
33
+ FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
34
+ FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
35
+ FakeWeb.register_uri(:get, "https://example.com/empty", :response => fixture_file("empty_page.response"))
36
+ FakeWeb.register_uri(:get, "http://international.com", :response => fixture_file("international.response"))
37
+ FakeWeb.register_uri(:get, "http://charset000.com", :response => fixture_file("charset_000.response"))
38
+ FakeWeb.register_uri(:get, "http://charset001.com", :response => fixture_file("charset_001.response"))
39
+ FakeWeb.register_uri(:get, "http://charset002.com", :response => fixture_file("charset_002.response"))
40
+ FakeWeb.register_uri(:get, "http://www.inkthemes.com/", :response => fixture_file("wordpress_site.response"))
41
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/image.png", :body => "Image", :content_type => "image/png")
42
+ FakeWeb.register_uri(:get, "http://pagerankalert.com/file.tar.gz", :body => "Image", :content_type => "application/x-gzip")
43
+
44
+ # These examples are used to test the redirections from HTTP to HTTPS and vice versa
45
+ # http://facebook.com => https://www.facebook.com
46
+ FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
47
+ FakeWeb.register_uri(:get, "https://www.facebook.com/", :response => fixture_file("https.facebook.com.response"))
48
+
49
+ # https://unsafe-facebook.com => http://unsafe-facebook.com
50
+ FakeWeb.register_uri(:get, "https://unsafe-facebook.com/", :response => fixture_file("unsafe_https.facebook.com.response"))
51
+ FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture_file("unsafe_facebook.com.response"))
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- hash: 59
4
+ hash: 39
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 11
8
+ - 12
9
9
  - 0
10
- version: 1.11.0
10
+ version: 1.12.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jaime Iniesta
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-11-26 00:00:00 Z
18
+ date: 2012-12-01 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  version_requirements: &id001 !ruby/object:Gem::Requirement
@@ -131,6 +131,7 @@ files:
131
131
  - README.rdoc
132
132
  - Rakefile
133
133
  - lib/meta_inspector.rb
134
+ - lib/meta_inspector/open_uri.rb
134
135
  - lib/meta_inspector/scraper.rb
135
136
  - lib/meta_inspector/version.rb
136
137
  - lib/metainspector.rb
@@ -143,7 +144,9 @@ files:
143
144
  - spec/fixtures/charset_001.response
144
145
  - spec/fixtures/charset_002.response
145
146
  - spec/fixtures/empty_page.response
147
+ - spec/fixtures/facebook.com.response
146
148
  - spec/fixtures/guardian.co.uk.response
149
+ - spec/fixtures/https.facebook.com.response
147
150
  - spec/fixtures/international.response
148
151
  - spec/fixtures/invalid_href.response
149
152
  - spec/fixtures/iteh.at.response
@@ -155,9 +158,12 @@ files:
155
158
  - spec/fixtures/theonion-no-description.com.response
156
159
  - spec/fixtures/theonion.com.response
157
160
  - spec/fixtures/twitter_markupvalidator.response
161
+ - spec/fixtures/unsafe_facebook.com.response
162
+ - spec/fixtures/unsafe_https.facebook.com.response
158
163
  - spec/fixtures/wordpress_site.response
159
164
  - spec/fixtures/youtube.response
160
165
  - spec/metainspector_spec.rb
166
+ - spec/redirections_spec.rb
161
167
  - spec/spec_helper.rb
162
168
  homepage: https://github.com/jaimeiniesta/metainspector
163
169
  licenses: []