anemone 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +16 -0
- data/lib/anemone/core.rb +1 -1
- data/lib/anemone/http.rb +8 -1
- data/spec/core_spec.rb +12 -1
- data/spec/fakeweb_helper.rb +18 -10
- metadata +2 -2
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
== 0.3.2 / 2010-02-04
|
2
|
+
|
3
|
+
* Bug fixes
|
4
|
+
|
5
|
+
* Fixed issue that allowed following redirects off the original domain
|
6
|
+
|
7
|
+
== 0.3.1 / 2010-01-22
|
8
|
+
|
9
|
+
* Minor enhancements
|
10
|
+
|
11
|
+
* Added an attr_accessor to Page for the HTTP response body
|
12
|
+
|
13
|
+
* Bug fixes
|
14
|
+
|
15
|
+
* Fixed incorrect method calls in CLI scripts
|
16
|
+
|
1
17
|
== 0.3.0 / 2009-12-15
|
2
18
|
|
3
19
|
* Major enhancements
|
data/lib/anemone/core.rb
CHANGED
data/lib/anemone/http.rb
CHANGED
@@ -62,7 +62,7 @@ module Anemone
|
|
62
62
|
yield response, code, loc, redirect_to, response_time
|
63
63
|
|
64
64
|
limit = redirect_limit
|
65
|
-
while redirect_to && limit > 0
|
65
|
+
while redirect_to && allowed?(redirect_to, url) && limit > 0
|
66
66
|
loc = redirect_to
|
67
67
|
loc = url.merge(loc) if loc.relative?
|
68
68
|
response, response_time = get_response(loc, referer)
|
@@ -128,5 +128,12 @@ module Anemone
|
|
128
128
|
@opts[:verbose]
|
129
129
|
end
|
130
130
|
|
131
|
+
#
|
132
|
+
# Allowed to connect to the requested url?
|
133
|
+
#
|
134
|
+
def allowed?(to_url, from_url)
|
135
|
+
to_url.host.nil? || (to_url.host == from_url.host)
|
136
|
+
end
|
137
|
+
|
131
138
|
end
|
132
139
|
end
|
data/spec/core_spec.rb
CHANGED
@@ -19,7 +19,7 @@ module Anemone
|
|
19
19
|
Anemone.crawl(pages[0].url, @opts).should have(4).pages
|
20
20
|
end
|
21
21
|
|
22
|
-
it "should not leave the original domain" do
|
22
|
+
it "should not follow links that leave the original domain" do
|
23
23
|
pages = []
|
24
24
|
pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
|
25
25
|
pages << FakePage.new('1')
|
@@ -30,6 +30,17 @@ module Anemone
|
|
30
30
|
core.pages.keys.should_not include('http://www.other.com/')
|
31
31
|
end
|
32
32
|
|
33
|
+
it "should not follow redirects that leave the original domain" do
|
34
|
+
pages = []
|
35
|
+
pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
|
36
|
+
pages << FakePage.new('1')
|
37
|
+
|
38
|
+
core = Anemone.crawl(pages[0].url, @opts)
|
39
|
+
|
40
|
+
core.should have(2).pages
|
41
|
+
core.pages.keys.should_not include('http://www.other.com/')
|
42
|
+
end
|
43
|
+
|
33
44
|
it "should follow http redirects" do
|
34
45
|
pages = []
|
35
46
|
pages << FakePage.new('0', :links => ['1'])
|
data/spec/fakeweb_helper.rb
CHANGED
@@ -9,12 +9,12 @@ FakeWeb.allow_net_connect = false
|
|
9
9
|
|
10
10
|
module Anemone
|
11
11
|
SPEC_DOMAIN = "http://www.example.com/"
|
12
|
-
|
12
|
+
|
13
13
|
class FakePage
|
14
14
|
attr_accessor :links
|
15
15
|
attr_accessor :hrefs
|
16
16
|
attr_accessor :body
|
17
|
-
|
17
|
+
|
18
18
|
def initialize(name = '', options = {})
|
19
19
|
@name = name
|
20
20
|
@links = [options[:links]].flatten if options.has_key?(:links)
|
@@ -22,30 +22,38 @@ module Anemone
|
|
22
22
|
@redirect = options[:redirect] if options.has_key?(:redirect)
|
23
23
|
@content_type = options[:content_type] || "text/html"
|
24
24
|
@body = options[:body]
|
25
|
-
|
25
|
+
|
26
26
|
create_body unless @body
|
27
27
|
add_to_fakeweb
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
def url
|
31
31
|
SPEC_DOMAIN + @name
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
34
|
private
|
35
|
-
|
35
|
+
|
36
36
|
def create_body
|
37
37
|
@body = "<html><body>"
|
38
38
|
@links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
|
39
39
|
@hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
|
40
40
|
@body += "</body></html>"
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
def add_to_fakeweb
|
44
44
|
options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
|
45
|
-
|
45
|
+
|
46
46
|
if @redirect
|
47
|
-
options[:status] = [301, "Permanently Moved"]
|
48
|
-
|
47
|
+
options[:status] = [301, "Permanently Moved"]
|
48
|
+
|
49
|
+
# only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
|
50
|
+
redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
|
51
|
+
options[:location] = redirect_url
|
52
|
+
|
53
|
+
# register the page this one redirects to
|
54
|
+
FakeWeb.register_uri(:get, redirect_url, {:body => '',
|
55
|
+
:content_type => @content_type,
|
56
|
+
:status => [200, "OK"]})
|
49
57
|
end
|
50
58
|
|
51
59
|
FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-22 00:00:00 -06:00
|
12
|
+
date: 2010-02-04 00:00:00 -06:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|