rdaneel 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Edgar
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,26 @@
1
+ = R.Daneel
2
+
3
+ Obey robots.txt on top of em-http-request (http://github.com/igrigorik/em-http-request - Asynchronous HTTP Client)
4
+
5
+ R. Daneel Olivaw is a fictional robot created by Isaac Asimov - http://en.wikipedia.org/wiki/R._Daneel_Olivaw
6
+
7
+ == Important
8
+
9
+ The same em-http-request options apply here.
10
+ However, when following redirects, only the robots.txt of the originally requested URI is checked; the robots.txt files of intermediate locations are not.
11
+
12
+
13
+ == Note on Patches/Pull Requests
14
+
15
+ * Fork the project.
16
+ * Make your feature addition or bug fix.
17
+ * Add tests for it. This is important so I don't break it in a
18
+ future version unintentionally.
19
+ * Commit, do not mess with rakefile, version, or history.
20
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull.)
21
+ * Send me a pull request. Bonus points for topic branches.
22
+
23
+ == Copyright
24
+
25
+ Copyright (c) 2010 Edgar. See LICENSE for details.
26
+
data/Rakefile ADDED
@@ -0,0 +1,48 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "rdaneel"
8
+ gem.summary = %Q{Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)}
9
+ gem.description = %Q{Add robots.txt support on top of em-http-request}
10
+ gem.email = "edgargonzalez@gmail.com"
11
+ gem.homepage = "http://github.com/hasmanydevelopers/RDaneel"
12
+ gem.authors = ["Edgar Gonzalez"]
13
+ gem.add_dependency("em-http-request", ">= 0.2.10")
14
+ gem.add_dependency('robot_rules', '>= 0.9.1')
15
+ gem.add_development_dependency "rspec", ">= 1.2.9"
16
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
+ end
18
+ Jeweler::GemcutterTasks.new
19
+ rescue LoadError
20
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
+ end
22
+
23
+ require 'spec/rake/spectask'
24
+ Spec::Rake::SpecTask.new(:spec) do |spec|
25
+ spec.libs << 'lib' << 'spec'
26
+ spec.spec_files = FileList['spec/**/*_spec.rb']
27
+ end
28
+
29
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
30
+ spec.libs << 'lib' << 'spec'
31
+ spec.pattern = 'spec/**/*_spec.rb'
32
+ spec.rcov = true
33
+ end
34
+
35
+ task :spec => :check_dependencies
36
+
37
+ task :default => :spec
38
+
39
+ require 'rake/rdoctask'
40
+ Rake::RDocTask.new do |rdoc|
41
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
42
+
43
+ rdoc.rdoc_dir = 'rdoc'
44
+ rdoc.title = "rdaneel #{version}"
45
+ rdoc.rdoc_files.include('README*')
46
+ rdoc.rdoc_files.include('lib/**/*.rb')
47
+ end
48
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/lib/rdaneel.rb ADDED
@@ -0,0 +1,93 @@
1
+ require 'em-http'
2
+ require 'robot_rules'
3
+ require 'net/http'
4
+
5
+ module Net
6
+ class DisobeyingRobotsTxt < HTTPBadResponse ; end
7
+ end
8
+
9
+ class RDaneel
10
+
11
+ class << self
12
+ def robots_cache=(klass, options={})
13
+ @robots_cache = klass.new(options)
14
+ end
15
+
16
+ def robots_cache
17
+ @robots_cache
18
+ end
19
+ end
20
+
21
+ def initialize(uri)
22
+ @uri = uri.kind_of?(Addressable::URI) ? uri : Addressable::URI::parse(uri)
23
+ end
24
+
25
+ def robots_cache
26
+ self.class.robots_cache
27
+ end
28
+
29
+ #
30
+ # The same em-http-request options apply here.
31
+ # However, when following redirects, only the robots.txt of the originally requested URI is checked; the robots.txt files of intermediate locations are not.
32
+ #
33
+ def get(options = {}, &blk)
34
+ useragent = "RDaneel"
35
+ if options[:head]
36
+ options[:head].keys.each do |k|
37
+ useragent = options[:head][k] if k.to_s.downcase == "user-agent"
38
+ end
39
+ end
40
+ if robots_cache && robots_file = robots_cache.get(robots_txt_url)
41
+ if robots_allowed?(robots_file, useragent)
42
+ http = EventMachine::HttpRequest.new(@uri).get(options)
43
+ http.callback {blk.call(http)}
44
+ http.errback {blk.call(http)}
45
+ else
46
+ conn = EventMachine::HttpClient.new("")
47
+ conn.uri = @uri
48
+ conn.on_error("robots.txt")
49
+ blk.call(conn)
50
+ end
51
+ else
52
+ robots = EventMachine::HttpRequest.new(robots_txt_url).get
53
+ robots.callback {
54
+ robots_file = robots.response
55
+ robots_cache.put(robots_txt_url, robots_file) if robots_cache
56
+ if robots_allowed?(robots_file, useragent)
57
+ http = EventMachine::HttpRequest.new(@uri).get(options)
58
+ http.callback {blk.call(http)}
59
+ http.errback {blk.call(http)}
60
+ else
61
+ conn = EventMachine::HttpClient.new("")
62
+ conn.uri = @uri
63
+ conn.on_error("robots.txt")
64
+ blk.call(conn)
65
+ end
66
+ }
67
+ robots.errback {
68
+ http = EventMachine::HttpRequest.new(@uri).get(options)
69
+ http.callback {blk.call(http)}
70
+ http.errback {blk.call(http)}
71
+ }
72
+ end
73
+ end
74
+
75
+ protected
76
+
77
+ def robots_allowed?(robots_file, useragent)
78
+ rules = RobotRules.new(useragent)
79
+ rules.parse(@uri.to_s, robots_file)
80
+ rules.allowed? @uri.to_s
81
+ end
82
+
83
+ def robots_txt_url
84
+ location = if @uri.port == 80
85
+ @uri.host
86
+ else
87
+ "#{@uri.host}:#{@uri.port}"
88
+ end
89
+ "http://#{location}/robots.txt"
90
+ end
91
+
92
+ end
93
+
@@ -0,0 +1,96 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'webrick'
3
+
4
+ describe "RDaneel" do
5
+
6
+ describe "when there is no robots.txt" do
7
+ before(:all) do
8
+ start_server do |s|
9
+ s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
10
+ end
11
+ end
12
+
13
+ after(:all) do
14
+ stop_server
15
+ end
16
+
17
+ it "should follow and get the uri" do
18
+ EM.run {
19
+ RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
20
+ http.response_header.status.should == 200
21
+ http.response.should == "hello world"
22
+ http.error.should == ''
23
+ EM.stop
24
+ end
25
+ }
26
+ end
27
+ end
28
+
29
+ describe "when there is a robots.txt that allow the uri requested" do
30
+ before(:all) do
31
+ start_server do |s|
32
+ s.mount_proc('/robots.txt', lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /images"})
33
+ s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
34
+ end
35
+ end
36
+
37
+ after(:all) do
38
+ stop_server
39
+ end
40
+
41
+ it "should follow and get the uri" do
42
+ EM.run {
43
+ RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
44
+ http.response_header.status.should == 200
45
+ http.response.should == "hello world"
46
+ http.error.should == ''
47
+ EM.stop
48
+ end
49
+ }
50
+ end
51
+ end
52
+
53
+ describe "when there is a robots.txt that disallow all content for all bots" do
54
+ before(:all) do
55
+ start_server do |s|
56
+ s.mount_proc('/robots.txt', lambda { |req, resp| resp.status = 200; resp.body = "User-agent: *\nDisallow: /"})
57
+ s.mount_proc('/hello_world', lambda { |req, resp| resp.status = 200; resp.body = "hello world"})
58
+ end
59
+ end
60
+
61
+ after(:all) do
62
+ stop_server
63
+ end
64
+
65
+ it "shouldn't get the uri" do
66
+ EM.run {
67
+ RDaneel.new("http://127.0.0.1:8080/hello_world").get do |http|
68
+ http.error.should == 'robots.txt'
69
+ http.uri.to_s.should == "http://127.0.0.1:8080/hello_world"
70
+ EM.stop
71
+ end
72
+ }
73
+ end
74
+ end
75
+
76
+
77
+
78
+ end
79
+
80
+ def start_server(options={}, &blk)
81
+ @server = WEBrick::HTTPServer.new({:Port => 8080}.merge(options))
82
+ @server_thread = Thread.new {
83
+ blk.call(@server) if blk
84
+ @server.start
85
+ }
86
+ end
87
+
88
+ def stop_server
89
+ @server.shutdown
90
+ @server_thread.join
91
+ end
92
+
93
+ def robots_txt
94
+
95
+ end
96
+
data/spec/spec.opts ADDED
@@ -0,0 +1,6 @@
1
+ --color
2
+ --format progress
3
+ --loadby mtime
4
+ --reverse
5
+ --backtrace
6
+
@@ -0,0 +1,11 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'rubygems'
4
+ require 'rdaneel'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+
8
+ Spec::Runner.configure do |config|
9
+
10
+ end
11
+
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rdaneel
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 0
9
+ version: 0.0.0
10
+ platform: ruby
11
+ authors:
12
+ - Edgar Gonzalez
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-07-14 00:00:00 -04:30
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: em-http-request
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 2
30
+ - 10
31
+ version: 0.2.10
32
+ type: :runtime
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: robot_rules
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ - 9
44
+ - 1
45
+ version: 0.9.1
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ - !ruby/object:Gem::Dependency
49
+ name: rspec
50
+ prerelease: false
51
+ requirement: &id003 !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ segments:
56
+ - 1
57
+ - 2
58
+ - 9
59
+ version: 1.2.9
60
+ type: :development
61
+ version_requirements: *id003
62
+ description: Add robots.txt support on top of em-http-request
63
+ email: edgargonzalez@gmail.com
64
+ executables: []
65
+
66
+ extensions: []
67
+
68
+ extra_rdoc_files:
69
+ - LICENSE
70
+ - README.rdoc
71
+ files:
72
+ - .document
73
+ - .gitignore
74
+ - LICENSE
75
+ - README.rdoc
76
+ - Rakefile
77
+ - VERSION
78
+ - lib/rdaneel.rb
79
+ - spec/rdaneel_spec.rb
80
+ - spec/spec.opts
81
+ - spec/spec_helper.rb
82
+ has_rdoc: true
83
+ homepage: http://github.com/hasmanydevelopers/RDaneel
84
+ licenses: []
85
+
86
+ post_install_message:
87
+ rdoc_options:
88
+ - --charset=UTF-8
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ segments:
96
+ - 0
97
+ version: "0"
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ segments:
103
+ - 0
104
+ version: "0"
105
+ requirements: []
106
+
107
+ rubyforge_project:
108
+ rubygems_version: 1.3.6
109
+ signing_key:
110
+ specification_version: 3
111
+ summary: Obey robots.txt on top of em-http-request (Asynchronous HTTP Client)
112
+ test_files:
113
+ - spec/spec_helper.rb
114
+ - spec/rdaneel_spec.rb