anemone 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,19 @@
1
+ == 0.3.2 / 2010-02-04
2
+
3
+ * Bug fixes
4
+
5
+ * Fixed issue that allowed following redirects off the original domain
6
+
7
+ == 0.3.1 / 2010-01-22
8
+
9
+ * Minor enhancements
10
+
11
+ * Added an attr_accessor to Page for the HTTP response body
12
+
13
+ * Bug fixes
14
+
15
+ * Fixed incorrect method calls in CLI scripts
16
+
1
17
  == 0.3.0 / 2009-12-15
2
18
 
3
19
  * Major enhancements
data/lib/anemone/core.rb CHANGED
@@ -7,7 +7,7 @@ require 'anemone/storage'
7
7
 
8
8
  module Anemone
9
9
 
10
- VERSION = '0.3.1';
10
+ VERSION = '0.3.2';
11
11
 
12
12
  #
13
13
  # Convenience method to start a crawl
data/lib/anemone/http.rb CHANGED
@@ -62,7 +62,7 @@ module Anemone
62
62
  yield response, code, loc, redirect_to, response_time
63
63
 
64
64
  limit = redirect_limit
65
- while response.is_a?(Net::HTTPRedirection) and limit > 0
65
+ while redirect_to && allowed?(redirect_to, url) && limit > 0
66
66
  loc = redirect_to
67
67
  loc = url.merge(loc) if loc.relative?
68
68
  response, response_time = get_response(loc, referer)
@@ -128,5 +128,12 @@ module Anemone
128
128
  @opts[:verbose]
129
129
  end
130
130
 
131
+ #
132
+ # Allowed to connect to the requested url?
133
+ #
134
+ def allowed?(to_url, from_url)
135
+ to_url.host.nil? || (to_url.host == from_url.host)
136
+ end
137
+
131
138
  end
132
139
  end
data/spec/core_spec.rb CHANGED
@@ -19,7 +19,7 @@ module Anemone
19
19
  Anemone.crawl(pages[0].url, @opts).should have(4).pages
20
20
  end
21
21
 
22
- it "should not leave the original domain" do
22
+ it "should not follow links that leave the original domain" do
23
23
  pages = []
24
24
  pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
25
25
  pages << FakePage.new('1')
@@ -30,6 +30,17 @@ module Anemone
30
30
  core.pages.keys.should_not include('http://www.other.com/')
31
31
  end
32
32
 
33
+ it "should not follow redirects that leave the original domain" do
34
+ pages = []
35
+ pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
36
+ pages << FakePage.new('1')
37
+
38
+ core = Anemone.crawl(pages[0].url, @opts)
39
+
40
+ core.should have(2).pages
41
+ core.pages.keys.should_not include('http://www.other.com/')
42
+ end
43
+
33
44
  it "should follow http redirects" do
34
45
  pages = []
35
46
  pages << FakePage.new('0', :links => ['1'])
@@ -9,12 +9,12 @@ FakeWeb.allow_net_connect = false
9
9
 
10
10
  module Anemone
11
11
  SPEC_DOMAIN = "http://www.example.com/"
12
-
12
+
13
13
  class FakePage
14
14
  attr_accessor :links
15
15
  attr_accessor :hrefs
16
16
  attr_accessor :body
17
-
17
+
18
18
  def initialize(name = '', options = {})
19
19
  @name = name
20
20
  @links = [options[:links]].flatten if options.has_key?(:links)
@@ -22,30 +22,38 @@ module Anemone
22
22
  @redirect = options[:redirect] if options.has_key?(:redirect)
23
23
  @content_type = options[:content_type] || "text/html"
24
24
  @body = options[:body]
25
-
25
+
26
26
  create_body unless @body
27
27
  add_to_fakeweb
28
28
  end
29
-
29
+
30
30
  def url
31
31
  SPEC_DOMAIN + @name
32
32
  end
33
-
33
+
34
34
  private
35
-
35
+
36
36
  def create_body
37
37
  @body = "<html><body>"
38
38
  @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
39
39
  @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
40
40
  @body += "</body></html>"
41
41
  end
42
-
42
+
43
43
  def add_to_fakeweb
44
44
  options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
45
-
45
+
46
46
  if @redirect
47
- options[:status] = [301, "Permanently Moved"]
48
- options[:location] = SPEC_DOMAIN + @redirect
47
+ options[:status] = [301, "Permanently Moved"]
48
+
49
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
50
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
51
+ options[:location] = redirect_url
52
+
53
+ # register the page this one redirects to
54
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
55
+ :content_type => @content_type,
56
+ :status => [200, "OK"]})
49
57
  end
50
58
 
51
59
  FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-22 00:00:00 -06:00
12
+ date: 2010-02-04 00:00:00 -06:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency