rdaneel 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -1,6 +1,6 @@
1
1
  = R.Daneel
2
2
 
3
- An EventMachine+Ruby library to fetch urls following robots.txt rules.
3
+ An EventMachine+Ruby library to fetch urls obeying robots.txt rules.
4
4
 
5
5
  RDaneel is built on top of @igrigorik's {em-http-request}[http://github.com/igrigorik/em-http-request]
6
6
 
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ begin
11
11
  gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
12
12
  gem.authors = ["Edgar Gonzalez", "Anibal Rojas"]
13
13
  gem.add_dependency("em-http-request", ">= 0.2.10")
14
- gem.add_dependency('robot_rules', '>= 0.9.1')
14
+ gem.add_dependency('robot_rules', '>= 0.9.3')
15
15
  gem.add_development_dependency "rspec", ">= 1.2.9"
16
16
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
17
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.2
data/lib/rdaneel.rb CHANGED
@@ -21,6 +21,7 @@ class RDaneel
21
21
 
22
22
  def initialize(uri)
23
23
  @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
24
+ @uri.path = "/" if @uri.path.nil? || @uri.path == ""
24
25
  @redirects = []
25
26
  end
26
27
 
@@ -61,39 +62,53 @@ class RDaneel
61
62
  end
62
63
  else
63
64
  # other error
64
- self.http_client = h
65
+ @http_client = h
65
66
  @error = "not success and not redirect"
66
67
  fail(self)
67
68
  end
68
69
  }
69
70
  _get = lambda {
70
- if robots_cache && robots_file = robots_cache[robots_txt_url(current_uri)]
71
- if robots_allowed?(robots_file, useragent, current_uri)
72
- h = EM::HttpRequest.new(current_uri).get(options)
73
- h.callback(&_handle_uri_callback)
74
- h.errback {
75
- @http_client = h
76
- @error = h.error
71
+ robots_url = robots_txt_url(current_uri)
72
+ if robots_cache && robots_file = robots_cache[robots_url.to_s]
73
+ if robots_allowed?(robots_file, useragent, robots_url, current_uri)
74
+ begin
75
+ h = EM::HttpRequest.new(current_uri).get(options)
76
+ h.callback(&_handle_uri_callback)
77
+ h.errback {
78
+ @http_client = h
79
+ @error = h.error
80
+ fail(self)
81
+ }
82
+ rescue StandardError => se
83
+ @http_client = EM::HttpClient.new("")
84
+ @error = "#{se.message}\n#{se.backtrace.inspect}"
77
85
  fail(self)
78
- }
86
+ end
79
87
  else
80
88
  @http_client = EM::HttpClient.new("")
81
89
  @error = "robots denied"
82
90
  fail(self)
83
91
  end
84
92
  else
85
- robots = EM::HttpRequest.new(robots_txt_url(current_uri)).get
93
+ robots_url = robots_txt_url(current_uri)
94
+ robots = EM::HttpRequest.new(robots_url).get
86
95
  robots.callback {
87
96
  robots_file = robots.response
88
- robots_cache[robots_txt_url(current_uri)] = robots_file if robots_cache
89
- if robots_allowed?(robots_file, useragent, current_uri)
90
- h = EM::HttpRequest.new(current_uri).get(options)
91
- h.callback(&_handle_uri_callback)
92
- h.errback {
93
- @http_client = h
94
- @error = h.error
97
+ robots_cache[robots_url.to_s] = robots_file if robots_cache
98
+ if robots_allowed?(robots_file, useragent, robots_url, current_uri)
99
+ begin
100
+ h = EM::HttpRequest.new(current_uri).get(options)
101
+ h.callback(&_handle_uri_callback)
102
+ h.errback {
103
+ @http_client = h
104
+ @error = h.error
105
+ fail(self)
106
+ }
107
+ rescue StandardError => se
108
+ @http_client = EM::HttpClient.new("")
109
+ @error = "#{se.message}\n#{se.backtrace.inspect}"
95
110
  fail(self)
96
- }
111
+ end
97
112
  else
98
113
  @http_client = EM::HttpClient.new("")
99
114
  @error = "robots denied"
@@ -101,7 +116,7 @@ class RDaneel
101
116
  end
102
117
  }
103
118
  robots.errback {
104
- robots_cache.put[robots_txt_url(current_uri)] = "" if robots_cache
119
+ robots_cache[robots_url.to_s] = "" if robots_cache
105
120
  h = EM::HttpRequest.new(current_uri).get(options)
106
121
  h.callback(&_handle_uri_callback)
107
122
  h.errback {
@@ -121,10 +136,14 @@ class RDaneel
121
136
 
122
137
  protected
123
138
 
124
- def robots_allowed?(robots_file, useragent, u)
125
- rules = RobotRules.new(useragent)
126
- rules.parse(u.to_s, robots_file)
127
- rules.allowed? u.to_s
139
+ def robots_allowed?(robots_file, useragent, robots_url, url)
140
+ begin
141
+ rules = RobotRules.new(useragent)
142
+ rules.parse(robots_url, robots_file)
143
+ return rules.allowed? url
144
+ rescue StandardError => err
145
+ return true
146
+ end
128
147
  end
129
148
 
130
149
  def robots_txt_url(u)
@@ -133,7 +152,7 @@ class RDaneel
133
152
  else
134
153
  "#{u.host}:#{u.port}"
135
154
  end
136
- "http://#{location}/robots.txt"
155
+ Addressable::URI.parse("http://#{location}/robots.txt")
137
156
  end
138
157
 
139
158
  def success?(http_client)
@@ -146,8 +165,9 @@ class RDaneel
146
165
 
147
166
  def redirect_url(http_client, u)
148
167
  location = Addressable::URI.parse(http_client.response_header.location)
149
- return u.join(location) if location.relative?
150
- return location
168
+ location = u.join(location) if location.relative?
169
+ location.path = "/" if location.path.nil? || location.path == ""
170
+ location
151
171
  end
152
172
  end
153
173
 
@@ -2,6 +2,41 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe "RDaneel when there are no redirects" do
4
4
 
5
+ describe "when not exist a robots.txt (404) and the url requested is /" do
6
+ before(:each) do
7
+ burrito.mount( :path => '/', :status => 200,
8
+ :body => 'Hello World!', :block => should_be_hit_once )
9
+ burrito.mount( :path => '/robots.txt', :status => 404,
10
+ :block => should_be_hit_once )
11
+ end
12
+
13
+ after(:each) do
14
+ burrito.unmount('/')
15
+ burrito.unmount('/robots.txt')
16
+ end
17
+
18
+ it "should get the content is the url not end with /" do
19
+
20
+ EM.run do
21
+ r = RDaneel.new("http://127.0.0.1:8080")
22
+ r.callback do
23
+ r.http_client.response_header.status.should == 200
24
+ r.http_client.response.should == "Hello World!"
25
+ r.redirects.should be_empty
26
+ EM.stop
27
+ end
28
+ r.errback do
29
+ fail
30
+ EM.stop
31
+ end
32
+ r.get
33
+ end
34
+
35
+ end
36
+ end
37
+
38
+
39
+
5
40
  describe "when a successfull status different than 200 is issued for robots.txt" do
6
41
 
7
42
  it "should get the content ignoring the redirect"
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,6 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
3
  require 'rubygems'
4
4
  require 'rdaneel'
5
5
  require 'spec'
6
- require 'spec/autorun'
7
6
  require 'webrick'
8
7
 
9
8
  Spec::Runner.configure do |config|
@@ -33,6 +32,7 @@ class Burrito
33
32
  @server.start
34
33
  }
35
34
  @server
35
+ sleep 0.5
36
36
  end
37
37
 
38
38
  def mount( opts )
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 0
9
- version: 0.1.0
8
+ - 2
9
+ version: 0.1.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Edgar Gonzalez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-19 00:00:00 -04:30
18
+ date: 2010-07-21 00:00:00 -04:30
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -42,8 +42,8 @@ dependencies:
42
42
  segments:
43
43
  - 0
44
44
  - 9
45
- - 1
46
- version: 0.9.1
45
+ - 3
46
+ version: 0.9.3
47
47
  type: :runtime
48
48
  version_requirements: *id002
49
49
  - !ruby/object:Gem::Dependency