rdaneel 0.0.0 → 0.1.0

data/LICENSE CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2009 Edgar
+ Copyright (c) 2009 has_many :developers

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
@@ -18,3 +18,4 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
data/README.rdoc CHANGED
@@ -1,13 +1,74 @@
  = R.Daneel

- Obey robots.txt on top of em-http-request (http://github.com/igrigorik/em-http-request - Asynchronous HTTP Client)
+ An EventMachine+Ruby library to fetch URLs while honoring robots.txt rules.
+
+ RDaneel is built on top of @igrigorik's {em-http-request}[http://github.com/igrigorik/em-http-request].
+
+ == Features
+
+ - Follows redirects, honoring robots.txt for each host in the redirect chain.
+ - Supports an external cache to store robots.txt files.
+ - Compatible with all options defined in em-http-request.
+
+ == Install
+
+   $ gem install rdaneel
+
+ == Examples
+
+ === Following redirects
+
+   require 'rdaneel'
+
+   EM.run do
+     r = RDaneel.new("http://bit.ly/cbEnpa")
+     r.callback{
+       puts r.http_client.response_header.status
+       puts r.http_client.response[0,80]
+       puts r.redirects
+       puts r.uri
+       EM.stop
+     }
+     r.errback{
+       puts "should not happen"
+       EM.stop
+     }
+     r.get(:redirects => 3)
+   end
+
+   => 200
+   => <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+   => http://bit.ly:80/cbEnpa
+   => http://github.com:80/hasmanydevelopers/RDaneel
+
+
+ === Denied by robots.txt
+
+   require 'rdaneel'
+
+   EM.run do
+     r = RDaneel.new("http://github.com/hasmanydevelopers/RDaneel/tarball/v0.0.0")
+     r.callback{
+       puts "should not happen"
+       EM.stop
+     }
+     r.errback{
+       puts r.error
+       EM.stop
+     }
+     r.get(:redirects => 3)
+   end
+
+   => robots denied
+
+
+ == Why RDaneel?

  R Daneel Olivaw is a fictional robot created by Isaac Asimov - http://en.wikipedia.org/wiki/R._Daneel_Olivaw

- == Important
+ == Acknowledgements

- The same em-http-request options apply here.
- But when following redirects the method won't check the intermediate robots.txt, just the first one.
+ To Ilya Grigorik (@igrigorik) for the em-http-request library and for his support and advice.


  == Note on Patches/Pull Requests
@@ -22,5 +83,5 @@ But when following redirects the method won't check the intermediate robots.txt,

  == Copyright

- Copyright (c) 2010 Edgar. See LICENSE for details.
+ Copyright (c) 2010 has_many :developers. See LICENSE for details.

data/Rakefile CHANGED
@@ -7,9 +7,9 @@ begin
      gem.name = "rdaneel"
      gem.summary = %Q{Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)}
      gem.description = %Q{Add robots.txt support on top of em-http-request}
-     gem.email = "edgargonzalez@gmail.com"
+     gem.email = ["edgargonzalez@gmail.com", "anibalrojas@gmail.com"]
      gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
-     gem.authors = ["Edgar Gonzalez"]
+     gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
      gem.add_dependency("em-http-request", ">= 0.2.10")
      gem.add_dependency('robot_rules', '>= 0.9.1')
      gem.add_development_dependency "rspec", ">= 1.2.9"
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.0
+ 0.1.0
data/lib/rdaneel.rb CHANGED
@@ -1,16 +1,14 @@
  require 'em-http'
  require 'robot_rules'
- require 'net/http'
-
- module Net
-   class DisobeyingRobotsTxt < HTTPBadResponse ; end
- end

  class RDaneel
+   include EM::Deferrable
+
+   DEFAULT_OPTIONS = {:head => {'user-agent' => 'RDaneel'}}

    class << self
-     def robots_cache=(klass, options={})
-       @robots_cache = klass.new(options)
+     def robots_cache=(c)
+       @robots_cache = c
      end

      def robots_cache
@@ -18,76 +16,138 @@ class RDaneel
      end
    end

+   attr_accessor :uri
+   attr_reader :error, :redirects, :http_client
+
    def initialize(uri)
      @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
+     @redirects = []
    end

-   def robots_cache
-     self.class.robots_cache
-   end
+   def get(opts = {})
+     current_uri = @uri
+     options = DEFAULT_OPTIONS.merge(opts)
+     max_redirects = options.delete(:redirects).to_i
+     useragent = options[:head]['user-agent']

-   #
-   # The same em-http-request options apply here.
-   # But when following redirects the method won't check the intermediate robots.txt, just the first one.
-   #
-   def get(options = {}, &blk)
-     useragent = "RDaneel"
-     if options[:head]
-       options[:head].keys.each do |k|
-         useragent = options[:head][k] if k.to_s.downcase == "user-agent"
-       end
-     end
-     if robots_cache && robots_file = robots_cache.get(robots_txt_url)
-       if robots_allowed?(robots_file, useragent)
-         http = EventMachine::HttpRequest.new(@uri).get(options)
-         http.callback {blk.call(http)}
-         http.errback {blk.call(http)}
+     _get = lambda {}
+
+     _handle_uri_callback = lambda {|h|
+       if success?(h)
+         @uri = current_uri if current_uri != @uri
+         @http_client = h
+         succeed(self)
+       elsif redirected?(h)
+         if @redirects.size >= max_redirects
+           @http_client = h
+           @error = "Exceeded maximum number of redirects"
+           fail(self)
+           return
+         end
+         begin
+           @redirects << current_uri.to_s
+           current_uri = redirect_url(h, current_uri)
+           if @redirects.include?(current_uri.to_s)
+             @http_client = h
+             @error = "infinite redirect"
+             fail(self)
+             return
+           end
+           _get.call
+         rescue
+           @http_client = h
+           @error = "malformed redirected url"
+           fail(self)
+         end
        else
-         conn = EventMachine::HttpClient.new("")
-         conn.uri = @uri
-         conn.on_error("robots.txt")
-         blk.call(conn)
+         # other error
+         @http_client = h
+         @error = "not success and not redirect"
+         fail(self)
        end
-     else
-       robots = EventMachine::HttpRequest.new(robots_txt_url).get
-       robots.callback {
-         robots_file = robots.response
-         robots_cache.put(robots_txt_url, robots_file) if robots_cache
-         if robots_allowed?(robots_file, useragent)
-           http = EventMachine::HttpRequest.new(@uri).get(options)
-           http.callback {blk.call(http)}
-           http.errback {blk.call(http)}
+     }
+     _get = lambda {
+       if robots_cache && robots_file = robots_cache[robots_txt_url(current_uri)]
+         if robots_allowed?(robots_file, useragent, current_uri)
+           h = EM::HttpRequest.new(current_uri).get(options)
+           h.callback(&_handle_uri_callback)
+           h.errback {
+             @http_client = h
+             @error = h.error
+             fail(self)
+           }
          else
-           conn = EventMachine::HttpClient.new("")
-           conn.uri = @uri
-           conn.on_error("robots.txt")
-           blk.call(conn)
+           @http_client = EM::HttpClient.new("")
+           @error = "robots denied"
+           fail(self)
          end
-       }
-       robots.errback {
-         http = EventMachine::HttpRequest.new(@uri).get(options)
-         http.callback {blk.call(http)}
-         http.errback {blk.call(http)}
-       }
-     end
+       else
+         robots = EM::HttpRequest.new(robots_txt_url(current_uri)).get
+         robots.callback {
+           robots_file = robots.response
+           robots_cache[robots_txt_url(current_uri)] = robots_file if robots_cache
+           if robots_allowed?(robots_file, useragent, current_uri)
+             h = EM::HttpRequest.new(current_uri).get(options)
+             h.callback(&_handle_uri_callback)
+             h.errback {
+               @http_client = h
+               @error = h.error
+               fail(self)
+             }
+           else
+             @http_client = EM::HttpClient.new("")
+             @error = "robots denied"
+             fail(self)
+           end
+         }
+         robots.errback {
+           robots_cache[robots_txt_url(current_uri)] = "" if robots_cache
+           h = EM::HttpRequest.new(current_uri).get(options)
+           h.callback(&_handle_uri_callback)
+           h.errback {
+             @http_client = h
+             @error = h.error
+             fail(self)
+           }
+         }
+       end
+     }
+     _get.call
+   end
+
+   def robots_cache
+     self.class.robots_cache
    end

    protected

-   def robots_allowed?(robots_file, useragent)
+   def robots_allowed?(robots_file, useragent, u)
      rules = RobotRules.new(useragent)
-     rules.parse(@uri.to_s, robots_file)
-     rules.allowed? @uri.to_s
+     rules.parse(u.to_s, robots_file)
+     rules.allowed? u.to_s
    end

-   def robots_txt_url
-     location = if @uri.port == 80
-       @uri.host
+   def robots_txt_url(u)
+     location = if u.port == 80
+       u.host
      else
-       "#{@uri.host}:#{@uri.port}"
+       "#{u.host}:#{u.port}"
      end
      "http://#{location}/robots.txt"
    end

+   def success?(http_client)
+     http_client.response_header.status == 200
+   end
+
+   def redirected?(http_client)
+     http_client.response_header.status == 301 || http_client.response_header.status == 302
+   end
+
+   def redirect_url(http_client, u)
+     location = Addressable::URI.parse(http_client.response_header.location)
+     return u.join(location) if location.relative?
+     return location
+   end
  end

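For reference, the robots.txt decision above reduces to the three robot_rules calls inside robots_allowed?. A minimal standalone sketch of that check, using the robot_rules gem the same way lib/rdaneel.rb does (the example.com URLs and the rules body are made up for illustration):

  require 'robot_rules'

  robots_txt = "User-agent: *\nDisallow: /images"     # hypothetical robots.txt body
  rules = RobotRules.new("RDaneel")                    # user-agent, as in DEFAULT_OPTIONS

  # robots_allowed? parses the fetched robots.txt against the target url,
  # then asks whether that url may be requested.
  rules.parse("http://example.com/hello_world", robots_txt)
  rules.allowed?("http://example.com/hello_world")     # => true
  rules.allowed?("http://example.com/images/logo.png") # => false
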
data/spec/no_redirects_neither_robots_spec.rb ADDED
@@ -0,0 +1,136 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "RDaneel when there are no redirects" do
+
+   describe "when a successful status different than 200 is issued for robots.txt" do
+
+     it "should get the content ignoring the redirect"
+
+   end
+
+   describe "when a redirect other than 301 and 302 is issued for robots.txt" do
+
+     it "should get the content ignoring the redirect"
+
+   end
+
+   (301..302).each do |status|
+
+     describe "when robots.txt has been moved (http code #{status})" do
+       before(:each) do
+         burrito.mount( :path => '/hello_world', :status => 200,
+                        :body => 'Hello World!', :block => should_be_hit_once )
+         burrito.mount( :path => '/robots.txt', :status => status,
+                        :location => 'http://127.0.0.1:8080/golems.txt',
+                        :block => should_be_hit_once )
+         burrito.mount( :path => '/golems.txt', :status => 200,
+                        :block => should_not_be_hit )
+       end
+
+       after(:each) do
+         burrito.unmount('/hello_world')
+         burrito.unmount('/robots.txt')
+         burrito.unmount('/golems.txt')
+       end
+
+       it "should get the content ignoring the redirect" do
+
+         EM.run do
+           r = RDaneel.new("http://127.0.0.1:8080/hello_world")
+           r.callback do
+             r.http_client.response_header.status.should == 200
+             r.http_client.response.should == "Hello World!"
+             r.redirects.should be_empty
+             EM.stop
+           end
+           r.errback do
+             fail
+             EM.stop
+           end
+           r.get
+         end
+
+       end
+
+     end
+
+   end
+
+   (400..417).each do |status|
+
+     describe "when there is a CLIENT error #{status} associated to robots.txt" do
+       before(:each) do
+         burrito.mount( :path => '/hello_world', :status => 200,
+                        :body => 'Hello World!', :block => should_be_hit_once )
+         burrito.mount( :path => '/robots.txt', :status => status,
+                        :block => should_be_hit_once )
+       end
+
+       after(:each) do
+         burrito.unmount('/hello_world')
+         burrito.unmount('/robots.txt')
+       end
+
+       it "should get the content" do
+
+         EM.run do
+           r = RDaneel.new("http://127.0.0.1:8080/hello_world")
+           r.callback do
+             r.http_client.response_header.status.should == 200
+             r.http_client.response.should == "Hello World!"
+             r.redirects.should be_empty
+             EM.stop
+           end
+           r.errback do
+             fail
+             EM.stop
+           end
+           r.get
+         end
+
+       end
+
+     end
+
+   end
+
+   (500..505).each do |status|
+
+     describe "when there is a SERVER error #{status} associated to robots.txt" do
+       before(:each) do
+         burrito.mount( :path => '/hello_world', :status => 200,
+                        :body => 'Hello World!', :block => should_be_hit_once )
+         burrito.mount( :path => '/robots.txt', :status => status,
+                        :block => should_be_hit_once )
+       end
+
+       after(:each) do
+         burrito.unmount('/hello_world')
+         burrito.unmount('/robots.txt')
+       end
+
+       it "should get the content" do
+
+         EM.run do
+           r = RDaneel.new("http://127.0.0.1:8080/hello_world")
+           r.callback do
+             r.http_client.response_header.status.should == 200
+             r.http_client.response.should == "Hello World!"
+             r.redirects.should be_empty
+             EM.stop
+           end
+           r.errback do
+             fail
+             EM.stop
+           end
+           r.get
+         end
+
+       end
+
+     end
+
+   end
+
+ end
+
data/spec/redirects_without_robots_spec.rb ADDED
@@ -0,0 +1,175 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "RDaneel when there are redirects" do
+
+   describe "when there is no robots.txt in the host (ONLY one host)" do
+
+     describe "when no redirection limit has been set" do
+       before(:each) do
+         burrito.mount( :path => '/robots.txt', :status => 404,
+                        :block => should_be_hit_once )
+         burrito.mount( :path => '/redirect_me', :status => 301,
+                        :location => 'http://127.0.0.1:8080/hello_world',
+                        :block => should_be_hit_once )
+         burrito.mount( :path => '/hello_world', :status => 200,
+                        :body => 'Hello World!',
+                        :block => should_not_be_hit )
+       end
+
+       after(:each) do
+         burrito.unmount('/robots.txt')
+         burrito.unmount('/redirect_me')
+         burrito.unmount('/hello_world')
+       end
+
+       it "should not follow redirects" do
+         EM.run do
+           r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
+           r.callback do
+             fail
+             EM.stop
+           end
+           r.errback do
+             r.redirects.should be_empty
+             r.error.should == "Exceeded maximum number of redirects"
+             EM.stop
+           end
+           r.get
+         end
+
+       end
+
+     end
+
+     describe "when a maximum number of redirects is set" do
+
+       describe "when there are fewer redirects than the maximum specified" do
+         before(:each) do
+           burrito.mount( :path => '/robots.txt', :status => 404,
+                          :block => should_be_hit(3) )
+           burrito.mount( :path => '/redirect_me', :status => 301,
+                          :location => 'http://127.0.0.1:8080/redirect_me_again',
+                          :block => should_be_hit_once )
+           burrito.mount( :path => '/redirect_me_again', :status => 301,
+                          :location => 'http://127.0.0.1:8080/hello_world',
+                          :block => should_be_hit_once )
+           burrito.mount( :path => '/hello_world', :status => 200,
+                          :body => 'Hello World!',
+                          :block => should_be_hit_once )
+         end
+
+         after(:each) do
+           burrito.unmount('/robots.txt')
+           burrito.unmount('/redirect_me')
+           burrito.unmount('/redirect_me_again')
+           burrito.unmount('/hello_world')
+         end
+
+         it "should get the content following all the redirects" do
+           EM.run do
+             r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
+             r.callback do
+               r.http_client.response_header.status.should == 200
+               r.http_client.response.should == "Hello World!"
+               r.redirects.should == [ "http://127.0.0.1:8080/redirect_me",
+                                       "http://127.0.0.1:8080/redirect_me_again"]
+               r.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
+               EM.stop
+             end
+             r.errback do
+               fail
+               EM.stop
+             end
+             r.get(:redirects => 3)
+           end
+
+         end
+
+       end
+
+       describe "when there are as many redirects as the maximum" do
+         before(:each) do
+           burrito.mount( :path => '/robots.txt', :status => 404,
+                          :block => should_be_hit_twice )
+           burrito.mount( :path => '/redirect_me', :status => 301,
+                          :location => 'http://127.0.0.1:8080/hello_world',
+                          :block => should_be_hit_once )
+           burrito.mount( :path => '/hello_world', :status => 200,
+                          :body => 'Hello World!',
+                          :block => should_be_hit_once )
+         end
+
+         after(:each) do
+           burrito.unmount('/robots.txt')
+           burrito.unmount('/redirect_me')
+           burrito.unmount('/hello_world')
+         end
+
+         it "should get the content following all the redirects" do
+           EM.run do
+             r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
+             r.callback do
+               r.http_client.response_header.status.should == 200
+               r.http_client.response.should == "Hello World!"
+               r.redirects.should == ['http://127.0.0.1:8080/redirect_me']
+               r.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
+               EM.stop
+             end
+             r.errback do
+               fail
+               EM.stop
+             end
+             r.get(:redirects => 1)
+           end
+
+         end
+
+       end
+
+       describe "when the number of redirects exceeds the maximum specified" do
+         before(:each) do
+           burrito.mount( :path => '/robots.txt', :status => 404,
+                          :block => should_be_hit_twice )
+           burrito.mount( :path => '/redirect_me', :status => 301,
+                          :location => 'http://127.0.0.1:8080/redirect_me_again',
+                          :block => should_be_hit_once )
+           burrito.mount( :path => '/redirect_me_again', :status => 301,
+                          :location => 'http://127.0.0.1:8080/hello_world',
+                          :block => should_be_hit_once )
+           burrito.mount( :path => '/hello_world', :status => 200,
+                          :body => 'Hello World!',
+                          :block => should_not_be_hit )
+         end
+
+         after(:each) do
+           burrito.unmount('/robots.txt')
+           burrito.unmount('/redirect_me')
+           burrito.unmount('/redirect_me_again')
+           burrito.unmount('/hello_world')
+         end
+
+         it "should stop following redirects once the maximum specified is reached" do
+           EM.run do
+             r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
+             r.callback do
+               fail
+               EM.stop
+             end
+             r.errback do
+               r.redirects.should == ['http://127.0.0.1:8080/redirect_me']
+               r.error.should == "Exceeded maximum number of redirects"
+               EM.stop
+             end
+             r.get(:redirects => 1)
+           end
+
+         end
+
+       end
+
+     end
+
+   end
+
+ end
+
data/spec/spec_helper.rb CHANGED
@@ -4,8 +4,75 @@ require 'rubygems'
  require 'rdaneel'
  require 'spec'
  require 'spec/autorun'
+ require 'webrick'

  Spec::Runner.configure do |config|
+   config.before :suite do
+     burrito
+   end
+   config.after :suite do
+     burrito.stop
+   end
+ end
+
+ def burrito
+   Thread.current[:burrito] ||= Burrito.new
+ end
+
+ class Burrito
+
+   def initialize( options={}, &blk )
+     webrick_log_file = '/dev/null' # disable logging
+     webrick_logger = WEBrick::Log.new(webrick_log_file, WEBrick::Log::DEBUG)
+     access_log_stream = webrick_logger
+     access_log = [[ access_log_stream, WEBrick::AccessLog::COMBINED_LOG_FORMAT ]]
+     default_opts = {:Port => 8080, :Logger => webrick_logger, :AccessLog => access_log }
+     @server = WEBrick::HTTPServer.new( default_opts.merge(options) )
+     @server_thread = Thread.new {
+       blk.call(@server) if blk
+       @server.start
+     }
+     @server
+   end
+
+   def mount( opts )
+     raise ":path is required" if opts[:path].nil?
+     raise ":status is required" if opts[:status].nil?
+     @server.mount_proc( opts[:path],
+       lambda { |req, resp|
+         resp.status = opts[:status]
+         resp.body = opts[:body] unless opts[:body].nil?
+         resp['Location'] = opts[:location] unless opts[:location].nil?
+         opts[:block].call unless opts[:block].nil?
+       } )
+   end
+
+   def stop
+     @server.shutdown
+     @server_thread.join
+   end
+
+   def unmount(path)
+     @server.unmount(path)
+   end
+
+ end
+
+ def should_not_be_hit
+   should_be_hit( 0 )
+ end
+
+ def should_be_hit_once
+   should_be_hit( 1 )
+ end
+
+ def should_be_hit_twice
+   should_be_hit( 2 )
+ end

+ def should_be_hit( times = 1 )
+   l = lambda {}
+   m = l.should_receive(:call).exactly(times).times
+   return l
  end

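As a compact illustration of the helpers above, a hypothetical spec in the same style as the ones shipped with the gem: mount stubs a path on the embedded WEBrick server, and should_be_hit_once sets the expectation on how many times that path is requested.

  describe "an endpoint stubbed with Burrito" do
    before(:each) do
      burrito.mount( :path => '/robots.txt', :status => 404,
                     :block => should_be_hit_once )
      burrito.mount( :path => '/ping', :status => 200,
                     :body => 'pong', :block => should_be_hit_once )
    end

    after(:each) do
      burrito.unmount('/robots.txt')
      burrito.unmount('/ping')
    end

    it "requests /ping exactly once" do
      EM.run do
        r = RDaneel.new("http://127.0.0.1:8080/ping")
        r.callback { r.http_client.response.should == 'pong'; EM.stop }
        r.errback  { fail; EM.stop }
        r.get
      end
    end
  end
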
data/spec/using_cache_spec.rb ADDED
@@ -0,0 +1,44 @@
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+ describe "RDaneel when there is a cache" do
+
+   describe "when there is no robots.txt in the host" do
+
+     before(:each) do
+       RDaneel.robots_cache = {}
+       burrito.mount( :path => '/robots.txt', :status => 404,
+                      :block => should_be_hit_once )
+       burrito.mount( :path => '/redirect_me', :status => 301,
+                      :location => 'http://127.0.0.1:8080/hello_world',
+                      :block => should_be_hit_once )
+       burrito.mount( :path => '/hello_world', :status => 200,
+                      :body => 'Hello World!',
+                      :block => should_be_hit_once )
+     end
+
+     after(:each) do
+       burrito.unmount('/robots.txt')
+       burrito.unmount('/redirect_me')
+       burrito.unmount('/hello_world')
+     end
+
+     it "should try to get the robots.txt just once" do
+       EM.run do
+         r = RDaneel.new("http://127.0.0.1:8080/redirect_me")
+         r.callback do
+           r.http_client.response_header.status.should == 200
+           r.http_client.response.should == "Hello World!"
+           r.redirects.should == [ "http://127.0.0.1:8080/redirect_me"]
+           r.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
+           EM.stop
+         end
+         r.errback do
+           fail
+           EM.stop
+         end
+         r.get(:redirects => 3)
+       end
+     end
+   end
+ end
+
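The spec above drives the robots.txt cache with a plain Hash. Application code uses the same interface; a minimal sketch assuming only what lib/rdaneel.rb requires of the cache object (it must answer [] and []=, so a memcached-style client would work just as well):

  require 'rdaneel'

  # Keep robots.txt bodies in memory so each host is asked for its
  # robots.txt only once per process.
  RDaneel.robots_cache = {}

  EM.run do
    r = RDaneel.new("http://github.com/hasmanydevelopers/RDaneel")
    r.callback { puts r.http_client.response_header.status; EM.stop }
    r.errback  { puts r.error; EM.stop }
    r.get(:redirects => 3)
  end
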
metadata CHANGED
@@ -4,17 +4,18 @@ version: !ruby/object:Gem::Version
    prerelease: false
    segments:
    - 0
+   - 1
    - 0
-   - 0
-   version: 0.0.0
+   version: 0.1.0
  platform: ruby
  authors:
  - Edgar Gonzalez
+ - Anibal Rojas
  autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-07-14 00:00:00 -04:30
+ date: 2010-07-19 00:00:00 -04:30
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -60,7 +61,9 @@ dependencies:
    type: :development
    version_requirements: *id003
  description: Add robots.txt support on top of em-http-request
- email: edgargonzalez@gmail.com
+ email:
+ - edgargonzalez@gmail.com
+ - anibalrojas@gmail.com
  executables: []

  extensions: []
@@ -76,9 +79,11 @@ files:
  - Rakefile
  - VERSION
  - lib/rdaneel.rb
- - spec/rdaneel_spec.rb
+ - spec/no_redirects_neither_robots_spec.rb
+ - spec/redirects_without_robots_spec.rb
  - spec/spec.opts
  - spec/spec_helper.rb
+ - spec/using_cache_spec.rb
  has_rdoc: true
  homepage: http://github.com/hasmanydevelopers/RDaneel
  licenses: []
@@ -110,5 +115,7 @@ signing_key:
  specification_version: 3
  summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
  test_files:
+ - spec/using_cache_spec.rb
+ - spec/no_redirects_neither_robots_spec.rb
  - spec/spec_helper.rb
- - spec/rdaneel_spec.rb
+ - spec/redirects_without_robots_spec.rb
data/spec/rdaneel_spec.rb DELETED
@@ -1,96 +0,0 @@
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
- require 'webrick'
-
- describe "RDaneel" do
-
-   describe "when there is no robots.txt" do
-     before(:all) do
-       start_server do |s|
-         s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
-       end
-     end
-
-     after(:all) do
-       stop_server
-     end
-
-     it "should follow and get the uri" do
-       EM.run {
-         RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
-           http.response_header.status.should == 200
-           http.response.should == "hello world"
-           http.error.should == ''
-           EM.stop
-         end
-       }
-     end
-   end
-
-   describe "when there is a robots.txt that allow the uri requested" do
-     before(:all) do
-       start_server do |s|
-         s.mount_proc('/robots.txt', lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /images"})
-         s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
-       end
-     end
-
-     after(:all) do
-       stop_server
-     end
-
-     it "should follow and get the uri" do
-       EM.run {
-         RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
-           http.response_header.status.should == 200
-           http.response.should == "hello world"
-           http.error.should == ''
-           EM.stop
-         end
-       }
-     end
-   end
-
-   describe "when there is a robots.txt that disallow all content for all bots" do
-     before(:all) do
-       start_server do |s|
-         s.mount_proc('/robots.txt', lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /"})
-         s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
-       end
-     end
-
-     after(:all) do
-       stop_server
-     end
-
-     it "shouldn't get the uri" do
-       EM.run {
-         RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
-           http.error.should == 'robots.txt'
-           http.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
-           EM.stop
-         end
-       }
-     end
-   end
-
-
-
- end
-
- def start_server(options={}, &blk)
-   @server = WEBrick::HTTPServer.new({:Port => 8080}.merge(options))
-   @server_thread = Thread.new {
-     blk.call(@server) if blk
-     @server.start
-   }
- end
-
- def stop_server
-   @server.shutdown
-   @server_thread.join
- end
-
- def robots_txt
-
- end
-