rdaneel 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Edgar
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,26 @@
1
+ = R.Daneel
2
+
3
+ RDaneel obeys robots.txt on top of em-http-request, an asynchronous HTTP client (http://github.com/igrigorik/em-http-request).
4
+
5
+ R. Daneel Olivaw is a fictional robot created by Isaac Asimov - http://en.wikipedia.org/wiki/R._Daneel_Olivaw
6
+
7
+ == Important
8
+
9
+ The same em-http-request options apply here.
10
+ However, when following redirects, only the robots.txt of the originally requested URI is checked; the robots.txt files of intermediate locations are not consulted.
11
+
12
+
13
+ == Note on Patches/Pull Requests
14
+
15
+ * Fork the project.
16
+ * Make your feature addition or bug fix.
17
+ * Add tests for it. This is important so I don't break it in a
18
+ future version unintentionally.
19
+ * Commit, do not mess with rakefile, version, or history.
20
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
21
+ * Send me a pull request. Bonus points for topic branches.
22
+
23
+ == Copyright
24
+
25
+ Copyright (c) 2010 Edgar. See LICENSE for details.
26
+
data/Rakefile ADDED
@@ -0,0 +1,48 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks come from Jeweler. Jeweler is optional: when it cannot
# be loaded a hint is printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "rdaneel"
    gem.summary = %Q{Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)}
    gem.description = %Q{Add robots.txt support on top of em-http-request}
    gem.email = "edgargonzalez@gmail.com"
    gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
    gem.authors = ["Edgar Gonzalez"]
    gem.add_dependency("em-http-request", ">= 0.2.10")
    gem.add_dependency('robot_rules', '>= 0.9.1')
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Runs every spec under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, executed with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined anywhere in this Rakefile (it is expected
# to come from Jeweler::Tasks). Only depend on it when it exists, otherwise
# `rake spec` would abort with an unknown-task error whenever Jeweler is
# missing, defeating the LoadError rescue above.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: keep a trailing newline in VERSION out of the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "rdaneel #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
48
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/lib/rdaneel.rb ADDED
@@ -0,0 +1,93 @@
1
+ require 'em-http'
2
+ require 'robot_rules'
3
+ require 'net/http'
4
+
5
module Net
  # Exception class added to the Net namespace, deriving from
  # Net::HTTPBadResponse. Nothing in this file raises it; presumably it is
  # meant for requests that robots.txt forbids (confirm with callers).
  class DisobeyingRobotsTxt < HTTPBadResponse
  end
end
8
+
9
# Performs GET requests through em-http-request, consulting the target
# host's robots.txt before the requested URI is fetched.
class RDaneel

  # Port omitted from the robots.txt URL because the "http" scheme implies it.
  DEFAULT_HTTP_PORT = 80

  class << self
    # Instantiates +klass+ with +options+ and keeps the instance as the cache
    # shared by every RDaneel object. The cache is used through
    # <tt>get(url)</tt> and <tt>put(url, body)</tt>.
    #
    # Note: plain assignment syntax (<tt>RDaneel.robots_cache = Klass</tt>)
    # can only supply +klass+, so +options+ then defaults to an empty hash.
    def robots_cache=(klass, options={})
      @robots_cache = klass.new(options)
    end

    # The shared robots.txt cache, or nil when none was configured.
    def robots_cache
      @robots_cache
    end
  end

  # uri:: a String or an Addressable::URI pointing at the resource to fetch.
  def initialize(uri)
    @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI.parse(uri)
  end

  # Instance-level shortcut to the class-wide robots.txt cache.
  def robots_cache
    self.class.robots_cache
  end

  #
  # The same em-http-request options apply here.
  # But when following redirects the method won't check the intermediate
  # robots.txt, just the first one.
  #
  # The block receives the em-http client of the request, both on success and
  # on failure. When robots.txt forbids the URI no request is made and the
  # block receives a client whose error was set to "robots.txt". If the
  # robots.txt request itself fails, the URI is fetched anyway.
  #
  # The user agent matched against robots.txt is the "User-Agent" entry of
  # options[:head] (key compared case-insensitively), or "RDaneel" by default.
  #
  def get(options = {}, &blk)
    useragent = user_agent_from(options)
    cached_robots = robots_cache && robots_cache.get(robots_txt_url)
    if cached_robots
      fetch_if_allowed(cached_robots, useragent, options, blk)
    else
      robots = EventMachine::HttpRequest.new(robots_txt_url).get
      robots.callback {
        robots_file = robots.response
        robots_cache.put(robots_txt_url, robots_file) if robots_cache
        fetch_if_allowed(robots_file, useragent, options, blk)
      }
      # robots.txt could not be retrieved: go ahead with the request.
      robots.errback { fetch(options, blk) }
    end
  end

  protected

  # Picks the user agent out of the request headers, defaulting to "RDaneel".
  # When several keys spell "user-agent" differently the last one wins.
  def user_agent_from(options)
    useragent = "RDaneel"
    (options[:head] || {}).each do |name, value|
      useragent = value if name.to_s.downcase == "user-agent"
    end
    useragent
  end

  # Fetches the URI when +robots_file+ allows it, reports a denial otherwise.
  def fetch_if_allowed(robots_file, useragent, options, blk)
    if robots_allowed?(robots_file, useragent)
      fetch(options, blk)
    else
      deny(blk)
    end
  end

  # Issues the real GET and hands the client to +blk+ on success or failure.
  def fetch(options, blk)
    http = EventMachine::HttpRequest.new(@uri).get(options)
    http.callback { deliver(http, blk) }
    http.errback { deliver(http, blk) }
  end

  # Hands +blk+ an unconnected client for the URI, flagged with the
  # "robots.txt" error, without performing any request.
  def deny(blk)
    conn = EventMachine::HttpClient.new("")
    conn.uri = @uri
    conn.on_error("robots.txt")
    deliver(conn, blk)
  end

  # Calls the caller's block, if one was given, with +client+. Without the
  # guard a block-less #get would raise NoMethodError inside a reactor callback.
  def deliver(client, blk)
    blk.call(client) if blk
  end

  # True when the rules in +robots_file+ let +useragent+ request the URI.
  def robots_allowed?(robots_file, useragent)
    rules = RobotRules.new(useragent)
    rules.parse(@uri.to_s, robots_file)
    rules.allowed? @uri.to_s
  end

  # URL of the robots.txt that governs the URI.
  #
  # Addressable::URI#port is nil when the URI carries no explicit port, so a
  # nil port is treated like the default one; otherwise the result would be
  # the malformed "http://host:/robots.txt".
  #
  # NOTE(review): the scheme is always "http", even for https URIs.
  def robots_txt_url
    port = @uri.port
    location = if port.nil? || port == DEFAULT_HTTP_PORT
      @uri.host
    else
      "#{@uri.host}:#{port}"
    end
    "http://#{location}/robots.txt"
  end

end
93
+
@@ -0,0 +1,96 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'webrick'
3
+
4
# End-to-end specs: every group boots a local WEBrick server on port 8080,
# mounts the handlers for its scenario and drives RDaneel inside an
# EventMachine reactor.
describe "RDaneel" do

  describe "when there is no robots.txt" do
    before(:all) do
      hello_world = lambda { |req, resp| resp.status = 200; resp.body = "hello world"}
      start_server { |s| s.mount_proc('/hello_world', hello_world) }
    end

    after(:all) { stop_server }

    it "should follow and get the uri" do
      EM.run do
        RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
          http.response_header.status.should == 200
          http.response.should == "hello world"
          http.error.should == ''
          EM.stop
        end
      end
    end
  end

  describe "when there is a robots.txt that allow the uri requested" do
    before(:all) do
      # Only /images is off limits, so /hello_world remains reachable.
      robots = lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /images"}
      hello_world = lambda { |req, resp| resp.status = 200; resp.body = "hello world"}
      start_server do |s|
        s.mount_proc('/robots.txt', robots)
        s.mount_proc('/hello_world', hello_world)
      end
    end

    after(:all) { stop_server }

    it "should follow and get the uri" do
      EM.run do
        RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
          http.response_header.status.should == 200
          http.response.should == "hello world"
          http.error.should == ''
          EM.stop
        end
      end
    end
  end

  describe "when there is a robots.txt that disallow all content for all bots" do
    before(:all) do
      robots = lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /"}
      hello_world = lambda { |req, resp| resp.status = 200; resp.body = "hello world"}
      start_server do |s|
        s.mount_proc('/robots.txt', robots)
        s.mount_proc('/hello_world', hello_world)
      end
    end

    after(:all) { stop_server }

    it "shouldn't get the uri" do
      EM.run do
        RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
          # The denial is reported through the client's error, not a status.
          http.error.should == 'robots.txt'
          http.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
          EM.stop
        end
      end
    end
  end

end
79
+
80
# Creates a WEBrick server (port 8080 unless +options+ says otherwise) and
# runs it on a background thread. The optional block receives the server on
# that thread before it starts, e.g. to mount handlers. The server and the
# thread are kept in @server / @server_thread; the thread is returned.
def start_server(options={}, &blk)
  config = {:Port => 8080}.merge(options)
  @server = WEBrick::HTTPServer.new(config)
  @server_thread = Thread.new do
    blk.call(@server) unless blk.nil?
    @server.start
  end
end
87
+
88
# Shuts down the server held in @server, then waits for @server_thread to end.
def stop_server
  @server.shutdown
  @server_thread.join
end
92
+
93
# Empty placeholder: does nothing and returns nil. No spec in this file calls it.
def robots_txt; end
96
+
data/spec/spec.opts ADDED
@@ -0,0 +1,6 @@
1
+ --color
2
+ --format progress
3
+ --loadby mtime
4
+ --reverse
5
+ --backtrace
6
+
@@ -0,0 +1,11 @@
1
# Spec bootstrap: puts spec/ and then lib/ at the front of the load path
# (lib/ ends up first), then loads the library and the RSpec runner.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(spec_dir)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))

require 'rubygems'
require 'rdaneel'
require 'spec'
require 'spec/autorun'

# No global RSpec configuration yet.
Spec::Runner.configure do |config|
end
11
+
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rdaneel
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 0
9
+ version: 0.0.0
10
+ platform: ruby
11
+ authors:
12
+ - Edgar Gonzalez
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-14 00:00:00 -04:30
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: em-http-request
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 2
30
+ - 10
31
+ version: 0.2.10
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: robot_rules
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ - 9
44
+ - 1
45
+ version: 0.9.1
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: rspec
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ segments:
56
+ - 1
57
+ - 2
58
+ - 9
59
+ version: 1.2.9
60
+ type: :development
61
+ version_requirements: *id003
62
+ description: Add robots.txt support on top of em-http-request
63
+ email: edgargonzalez@gmail.com
64
+ executables: []
65
+
66
+ extensions: []
67
+
68
+ extra_rdoc_files:
69
+ - LICENSE
70
+ - README.rdoc
71
+ files:
72
+ - .document
73
+ - .gitignore
74
+ - LICENSE
75
+ - README.rdoc
76
+ - Rakefile
77
+ - VERSION
78
+ - lib/rdaneel.rb
79
+ - spec/rdaneel_spec.rb
80
+ - spec/spec.opts
81
+ - spec/spec_helper.rb
82
+ has_rdoc: true
83
+ homepage: http://github.com/hasmanydevelopers/RDaneel
84
+ licenses: []
85
+
86
+ post_install_message:
87
+ rdoc_options:
88
+ - --charset=UTF-8
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
97
+ version: "0"
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ segments:
103
+ - 0
104
+ version: "0"
105
+ requirements: []
106
+
107
+ rubyforge_project:
108
+ rubygems_version: 1.3.6
109
+ signing_key:
110
+ specification_version: 3
111
+ summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
112
+ test_files:
113
+ - spec/spec_helper.rb
114
+ - spec/rdaneel_spec.rb