anemone 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,19 @@
1
+ == 0.3.2 / 2010-02-04
2
+
3
+ * Bug fixes
4
+
5
+ * Fixed issue that allowed following redirects off the original domain
6
+
7
+ == 0.3.1 / 2010-01-22
8
+
9
+ * Minor enhancements
10
+
11
+ * Added an attr_accessor to Page for the HTTP response body
12
+
13
+ * Bug fixes
14
+
15
+ * Fixed incorrect method calls in CLI scripts
16
+
1
17
  == 0.3.0 / 2009-12-15
2
18
 
3
19
  * Major enhancements
data/lib/anemone/core.rb CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.1';
10
+ VERSION = '0.3.2';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
data/lib/anemone/http.rb CHANGED
@@ -62,7 +62,7 @@ module Anemone
62
62
  yield response, code, loc, redirect_to, response_time
63
63
 
64
64
  limit = redirect_limit
65
- while response.is_a?(Net::HTTPRedirection) and limit > 0
65
+ while redirect_to && allowed?(redirect_to, url) && limit > 0
66
66
  loc = redirect_to
67
67
  loc = url.merge(loc) if loc.relative?
68
68
  response, response_time = get_response(loc, referer)
@@ -128,5 +128,12 @@ module Anemone
128
128
  @opts[:verbose]
129
129
  end
130
130
 
131
+ #
132
+ # Allowed to connect to the requested url?
133
+ #
134
+ def allowed?(to_url, from_url)
135
+ to_url.host.nil? || (to_url.host == from_url.host)
136
+ end
137
+
131
138
  end
132
139
  end
data/spec/core_spec.rb CHANGED
@@ -19,7 +19,7 @@ module Anemone
19
19
  Anemone.crawl(pages[0].url, @opts).should have(4).pages
20
20
  end
21
21
 
22
- it "should not leave the original domain" do
22
+ it "should not follow links that leave the original domain" do
23
23
  pages = []
24
24
  pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
25
25
  pages << FakePage.new('1')
@@ -30,6 +30,17 @@ module Anemone
30
30
  core.pages.keys.should_not include('http://www.other.com/')
31
31
  end
32
32
 
33
+ it "should not follow redirects that leave the original domain" do
34
+ pages = []
35
+ pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
36
+ pages << FakePage.new('1')
37
+
38
+ core = Anemone.crawl(pages[0].url, @opts)
39
+
40
+ core.should have(2).pages
41
+ core.pages.keys.should_not include('http://www.other.com/')
42
+ end
43
+
33
44
  it "should follow http redirects" do
34
45
  pages = []
35
46
  pages << FakePage.new('0', :links => ['1'])
@@ -9,12 +9,12 @@ FakeWeb.allow_net_connect = false
9
9
 
10
10
  module Anemone
11
11
  SPEC_DOMAIN = "http://www.example.com/"
12
-
12
+
13
13
  class FakePage
14
14
  attr_accessor :links
15
15
  attr_accessor :hrefs
16
16
  attr_accessor :body
17
-
17
+
18
18
  def initialize(name = '', options = {})
19
19
  @name = name
20
20
  @links = [options[:links]].flatten if options.has_key?(:links)
@@ -22,30 +22,38 @@ module Anemone
22
22
  @redirect = options[:redirect] if options.has_key?(:redirect)
23
23
  @content_type = options[:content_type] || "text/html"
24
24
  @body = options[:body]
25
-
25
+
26
26
  create_body unless @body
27
27
  add_to_fakeweb
28
28
  end
29
-
29
+
30
30
  def url
31
31
  SPEC_DOMAIN + @name
32
32
  end
33
-
33
+
34
34
  private
35
-
35
+
36
36
  def create_body
37
37
  @body = "<html><body>"
38
38
  @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
39
39
  @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
40
40
  @body += "</body></html>"
41
41
  end
42
-
42
+
43
43
  def add_to_fakeweb
44
44
  options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
45
-
45
+
46
46
  if @redirect
47
- options[:status] = [301, "Permanently Moved"]
48
- options[:location] = SPEC_DOMAIN + @redirect
47
+ options[:status] = [301, "Permanently Moved"]
48
+
49
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
50
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
51
+ options[:location] = redirect_url
52
+
53
+ # register the page this one redirects to
54
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
55
+ :content_type => @content_type,
56
+ :status => [200, "OK"]})
49
57
  end
50
58
 
51
59
  FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-22 00:00:00 -06:00
12
+ date: 2010-02-04 00:00:00 -06:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency