rdaneel 0.1.0 → 0.1.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -1,6 +1,6 @@
1
1
  = R.Daneel
2
2
 
3
- An EventMachine+Ruby library to fetch urls following robots.txt rules.
3
+ An EventMachine+Ruby library to fetch urls obeying robots.txt rules.
4
4
 
5
5
  RDaneel is built it on top of @igrigorik's {em-http-request}[http://github.com/igrigorik/em-http-request]
6
6
 
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ begin
11
11
  gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
12
12
  gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
13
13
  gem.add_dependency("em-http-request", ">= 0.2.10")
14
- gem.add_dependency('robot_rules', '>= 0.9.1')
14
+ gem.add_dependency('robot_rules', '>= 0.9.3')
15
15
  gem.add_development_dependency "rspec", ">= 1.2.9"
16
16
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
17
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.2
data/lib/rdaneel.rb CHANGED
@@ -21,6 +21,7 @@ class RDaneel
21
21
 
22
22
  def initialize(uri)
23
23
  @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
24
+ @uri.path = "/" if @uri.path.nil? || @uri.path == ""
24
25
  @redirects = []
25
26
  end
26
27
 
@@ -61,39 +62,53 @@ class RDaneel
61
62
  end
62
63
  else
63
64
  # other error
64
- self.http_client = h
65
+ @http_client = h
65
66
  @error = "not success and not redirect"
66
67
  fail(self)
67
68
  end
68
69
  }
69
70
  _get = lambda {
70
- if robots_cache && robots_file = robots_cache[robots_txt_url(current_uri)]
71
- if robots_allowed?(robots_file, useragent, current_uri)
72
- h = EM::HttpRequest.new(current_uri).get(options)
73
- h.callback(&_handle_uri_callback)
74
- h.errback {
75
- @http_client = h
76
- @error = h.error
71
+ robots_url = robots_txt_url(current_uri)
72
+ if robots_cache && robots_file = robots_cache[robots_url.to_s]
73
+ if robots_allowed?(robots_file, useragent, robots_url, current_uri)
74
+ begin
75
+ h = EM::HttpRequest.new(current_uri).get(options)
76
+ h.callback(&_handle_uri_callback)
77
+ h.errback {
78
+ @http_client = h
79
+ @error = h.error
80
+ fail(self)
81
+ }
82
+ rescue StandardError => se
83
+ @http_client = EM::HttpClient.new("")
84
+ @error = "#{se.message}\n#{se.backtrace.inspect}"
77
85
  fail(self)
78
- }
86
+ end
79
87
  else
80
88
  @http_client = EM::HttpClient.new("")
81
89
  @error = "robots denied"
82
90
  fail(self)
83
91
  end
84
92
  else
85
- robots = EM::HttpRequest.new(robots_txt_url(current_uri)).get
93
+ robots_url = robots_txt_url(current_uri)
94
+ robots = EM::HttpRequest.new(robots_url).get
86
95
  robots.callback {
87
96
  robots_file = robots.response
88
- robots_cache[robots_txt_url(current_uri)] = robots_file if robots_cache
89
- if robots_allowed?(robots_file, useragent, current_uri)
90
- h = EM::HttpRequest.new(current_uri).get(options)
91
- h.callback(&_handle_uri_callback)
92
- h.errback {
93
- @http_client = h
94
- @error = h.error
97
+ robots_cache[robots_url.to_s] = robots_file if robots_cache
98
+ if robots_allowed?(robots_file, useragent, robots_url, current_uri)
99
+ begin
100
+ h = EM::HttpRequest.new(current_uri).get(options)
101
+ h.callback(&_handle_uri_callback)
102
+ h.errback {
103
+ @http_client = h
104
+ @error = h.error
105
+ fail(self)
106
+ }
107
+ rescue StandardError => se
108
+ @http_client = EM::HttpClient.new("")
109
+ @error = "#{se.message}\n#{se.backtrace.inspect}"
95
110
  fail(self)
96
- }
111
+ end
97
112
  else
98
113
  @http_client = EM::HttpClient.new("")
99
114
  @error = "robots denied"
@@ -101,7 +116,7 @@ class RDaneel
101
116
  end
102
117
  }
103
118
  robots.errback {
104
- robots_cache.put[robots_txt_url(current_uri)] = "" if robots_cache
119
+ robots_cache[robots_url.to_s] = "" if robots_cache
105
120
  h = EM::HttpRequest.new(current_uri).get(options)
106
121
  h.callback(&_handle_uri_callback)
107
122
  h.errback {
@@ -121,10 +136,14 @@ class RDaneel
121
136
 
122
137
  protected
123
138
 
124
- def robots_allowed?(robots_file, useragent, u)
125
- rules = RobotRules.new(useragent)
126
- rules.parse(u.to_s, robots_file)
127
- rules.allowed? u.to_s
139
+ def robots_allowed?(robots_file, useragent, robots_url, url)
140
+ begin
141
+ rules = RobotRules.new(useragent)
142
+ rules.parse(robots_url, robots_file)
143
+ return rules.allowed? url
144
+ rescue StandardError => err
145
+ return true
146
+ end
128
147
  end
129
148
 
130
149
  def robots_txt_url(u)
@@ -133,7 +152,7 @@ class RDaneel
133
152
  else
134
153
  "#{u.host}:#{u.port}"
135
154
  end
136
- "http://#{location}/robots.txt"
155
+ Addressable::URI.parse("http://#{location}/robots.txt")
137
156
  end
138
157
 
139
158
  def success?(http_client)
@@ -146,8 +165,9 @@ class RDaneel
146
165
 
147
166
  def redirect_url(http_client, u)
148
167
  location = Addressable::URI.parse(http_client.response_header.location)
149
- return u.join(location) if location.relative?
150
- return location
168
+ location = u.join(location) if location.relative?
169
+ location.path = "/" if location.path.nil? || location.path == ""
170
+ location
151
171
  end
152
172
  end
153
173
 
@@ -2,6 +2,41 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe "RDaneel when there are no redirects" do
4
4
 
5
+ describe "when not exist a robots.txt (404) and the url requested is /" do
6
+ before(:each) do
7
+ burrito.mount( :path => '/', :status => 200,
8
+ :body => 'Hello World!', :block => should_be_hit_once )
9
+ burrito.mount( :path => '/robots.txt', :status => 404,
10
+ :block => should_be_hit_once )
11
+ end
12
+
13
+ after(:each) do
14
+ burrito.unmount('/')
15
+ burrito.unmount('/robots.txt')
16
+ end
17
+
18
+ it "should get the content is the url not end with /" do
19
+
20
+ EM.run do
21
+ r = RDaneel.new("http://127.0.0.1:8080")
22
+ r.callback do
23
+ r.http_client.response_header.status.should == 200
24
+ r.http_client.response.should == "Hello World!"
25
+ r.redirects.should be_empty
26
+ EM.stop
27
+ end
28
+ r.errback do
29
+ fail
30
+ EM.stop
31
+ end
32
+ r.get
33
+ end
34
+
35
+ end
36
+ end
37
+
38
+
39
+
5
40
  describe "when a successfull status different than 200 is issued for robots.txt" do
6
41
 
7
42
  it "should get the content ignoring the redirect"
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,6 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  require 'rubygems'
4
4
  require 'rdaneel'
5
5
  require 'spec'
6
- require 'spec/autorun'
7
6
  require 'webrick'
8
7
 
9
8
  Spec::Runner.configure do |config|
@@ -33,6 +32,7 @@ class Burrito
33
32
  @server.start
34
33
  }
35
34
  @server
35
+ sleep 0.5
36
36
  end
37
37
 
38
38
  def mount( opts )
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 0
9
- version: 0.1.0
8
+ - 2
9
+ version: 0.1.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Edgar Gonzalez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-19 00:00:00 -04:30
18
+ date: 2010-07-21 00:00:00 -04:30
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -42,8 +42,8 @@ dependencies:
42
42
  segments:
43
43
  - 0
44
44
  - 9
45
- - 1
46
- version: 0.9.1
45
+ - 3
46
+ version: 0.9.3
47
47
  type: :runtime
48
48
  version_requirements: *id002
49
49
  - !ruby/object:Gem::Dependency