jeremyf-robot_rules 0.9.1

data/.document ADDED
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2009 James Edward Gray II and Jeremy Friesen
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,62 @@
+ robot_rules
+ ===========
+
+ A tool to determine whether a site's robots.txt would prevent a given
+ user agent from making a request to a given URI.
+
+ Example
+ -------
+
+ Given the following:
+     #!/usr/local/bin/ruby -w
+
+     require "robot_rules"
+     require "open-uri"
+
+     rules = RobotRules.new("RubyQuizBrowser 1.0")
+     robots_url = "http://pragmaticprogrammer.com/robots.txt"
+
+     open(robots_url) do |url|
+       data = url.read
+
+       puts "/robots.txt:"
+       puts data
+       puts
+
+       rules.parse(robots_url, data)
+     end
+
+     puts "URL tests:"
+     %w{ http://pragmaticprogrammer.com/images/dave.jpg
+         http://pragmaticprogrammer.com/imagination }.each do |test|
+       puts "rules.allowed?( #{test.inspect} )"
+       puts rules.allowed?(test)
+     end
+
+     __END__
+
+ This script will print:
+
+     /robots.txt:
+     User-agent: *
+     Disallow: images
+
+     URL tests:
+     rules.allowed?( "http://pragmaticprogrammer.com/images/dave.jpg" )
+     false
+     rules.allowed?( "http://pragmaticprogrammer.com/imagination" )
+     true
+
+
+
+ History
+ -------
+
+ RobotRules was created by James Edward Gray II in response to Ruby
+ Quiz #64, "Port a Library". A few years later, Jeremy Friesen wrapped
+ the library up into a gem and added some tests.
+
+ Copyright
+ ---------
+
+ Copyright (c) 2009 James Edward Gray II and Jeremy Friesen. See LICENSE for details.
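Note: the README example above targets Ruby 1.8, where requiring open-uri lets Kernel#open fetch URLs. On Ruby 3.0 and later, Kernel#open no longer opens URLs, so the same walk-through needs URI.open. A minimal sketch of that variant (the URL and user-agent string are illustrative, carried over from the README):

    #!/usr/bin/env ruby
    # Same flow as the README example, updated for modern Ruby: fetch the
    # robots.txt, feed it to RobotRules#parse, then probe two URLs.
    require "robot_rules"
    require "open-uri"

    rules      = RobotRules.new("RubyQuizBrowser 1.0")
    robots_url = "http://pragmaticprogrammer.com/robots.txt"

    data = URI.open(robots_url, &:read)  # Kernel#open on Ruby <= 2.x
    rules.parse(robots_url, data)

    %w{ http://pragmaticprogrammer.com/images/dave.jpg
        http://pragmaticprogrammer.com/imagination }.each do |test|
      puts "rules.allowed?(#{test.inspect}) => #{rules.allowed?(test)}"
    end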
data/Rakefile ADDED
@@ -0,0 +1,56 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "robot_rules"
+     gem.summary = %Q{A tool to determine if the robots.txt would prevent a given user agent from making a request to a given URI.}
+     gem.email = "jeremy.n.friesen@gmail.com"
+     gem.homepage = "http://github.com/jeremyf/robot_rules"
+     gem.authors = ["James Edward Gray II", "Jeremy Friesen"]
+     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+   end
+
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "robot_rules #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
+ ---
+ :major: 0
+ :minor: 9
+ :patch: 1
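The Rakefile above derives its rdoc title from this file, and jeweler assembles the gem's version string from the same three parts. A minimal sketch of that assembly, mirroring the Rakefile's own code (assumes it is run from the gem root, next to VERSION.yml):

    require "yaml"

    # VERSION.yml stores symbol keys (:major, :minor, :patch), so the
    # loaded hash is indexed with symbols, giving "0.9.1" here.
    config  = YAML.load(File.read("VERSION.yml"))
    version = [config[:major], config[:minor], config[:patch]].join(".")
    puts version  # => 0.9.1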
data/lib/robot_rules.rb ADDED
@@ -0,0 +1,76 @@
+ #!/usr/local/bin/ruby -w
+
+ # robot_rules.rb
+ #
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+
+ require "uri"
+
+ # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+ class RobotRules
+   def initialize( user_agent )
+     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                    "").downcase
+     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+   end
+
+   def parse( text_uri, robots_data )
+     uri      = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     @rules.delete(location)
+
+     rules = robots_data.split(/[\015\012]+/).map { |rule| rule.sub(/\s*#.*$/, "") }
+     anon_rules = Array.new
+     my_rules   = Array.new
+     current    = anon_rules
+     rules.each do |rule|
+       case rule
+       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+         break unless my_rules.empty?
+
+         current = if $1 == "*"
+                     anon_rules
+                   elsif $1.downcase.index(@user_agent)
+                     my_rules
+                   else
+                     nil
+                   end
+       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+         next if current.nil?
+
+         if $1.empty?
+           current << nil
+         else
+           disallow = URI.parse($1)
+
+           next unless disallow.scheme.nil? or disallow.scheme == uri.scheme
+           next unless disallow.port.nil? or disallow.port == uri.port
+           next unless disallow.host.nil? or disallow.host.downcase == uri.host.downcase
+
+           disallow = disallow.path
+           disallow = "/" if disallow.empty?
+           disallow = "/#{disallow}" unless disallow[0] == ?/
+
+           current << disallow
+         end
+       end
+     end
+
+     @rules[location] = if my_rules.empty?
+                          anon_rules.compact
+                        else
+                          my_rules.compact
+                        end
+   end
+
+   def allowed?( text_uri )
+     uri      = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     path     = uri.path
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     not @rules[location].any? { |rule| path.index(rule) == 0 }
+   end
+ end
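Two details of the class above are easy to miss: rules are stored per "host:port", so a parse for one site never affects any other host, and allowed? is a simple prefix match on the path. A minimal sketch demonstrating both (hostnames and paths are illustrative only):

    require "robot_rules"

    rules = RobotRules.new("MyBot 1.0")
    rules.parse("http://www.example.com/robots.txt",
                "User-agent: *\nDisallow: /private")

    rules.allowed?("http://www.example.com/private/page")  # => false ("/private" prefix)
    rules.allowed?("http://www.example.com/public/page")   # => true
    rules.allowed?("http://other.example.org/private/")    # => true (rules scoped to host)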
data/robot_rules.gemspec ADDED
@@ -0,0 +1,46 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{robot_rules}
+   s.version = "0.9.1"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["James Edward Gray II", "Jeremy Friesen"]
+   s.date = %q{2009-07-28}
+   s.email = %q{jeremy.n.friesen@gmail.com}
+   s.extra_rdoc_files = [
+     "LICENSE",
+     "README.markdown"
+   ]
+   s.files = [
+     ".document",
+     ".gitignore",
+     "LICENSE",
+     "README.markdown",
+     "Rakefile",
+     "VERSION.yml",
+     "lib/robot_rules.rb",
+     "robot_rules.gemspec",
+     "test/robot_rules_test.rb",
+     "test/test_helper.rb"
+   ]
+   s.homepage = %q{http://github.com/jeremyf/robot_rules}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.4}
+   s.summary = %q{A tool to determine if the robots.txt would prevent a given user agent from making a request to a given URI.}
+   s.test_files = [
+     "test/robot_rules_test.rb",
+     "test/test_helper.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+     else
+     end
+   else
+   end
+ end
data/test/robot_rules_test.rb ADDED
@@ -0,0 +1,37 @@
+ require 'test_helper'
+
+ class RobotRulesTest < Test::Unit::TestCase
+   SITE_URL = "http://www.example.com"
+   def setup
+     @robot_rule = RobotRules.new('Ruby Spider 1.0')
+     @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), %(User-agent: *\nDisallow: images))
+   end
+   def test_should_allow_path_imagination
+     assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'imagination/me.jpg'))
+   end
+   def test_should_disallow_path_images
+     assert_equal false, @robot_rule.allowed?(File.join(SITE_URL, 'images/me.jpg'))
+   end
+   def test_should_allow_path_images_for_other_site
+     assert_equal true, @robot_rule.allowed?(File.join("http://google.com", 'images/me.jpg'))
+   end
+   def test_should_allow_non_http_scheme
+     assert_equal true, @robot_rule.allowed?("ftp://www.example.com/images/me.jpg")
+   end
+
+   def test_should_abide_by_disallowed_user_agent
+     @robot_rule = RobotRules.new('Microsoft')
+     robots_txt = %(User-agent: Microsoft\nDisallow: google\nUser-agent: *\nDisallow: images)
+     @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), robots_txt)
+
+     assert_equal false, @robot_rule.allowed?(File.join(SITE_URL, 'google/hello_world.txt'))
+   end
+
+   def test_should_allow_user_agent_to_specified_path
+     @robot_rule = RobotRules.new('Google')
+     robots_txt = %(User-agent: Microsoft\nDisallow: google\nUser-agent: *\nDisallow: images)
+     @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), robots_txt)
+
+     assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'google/hello_world.txt'))
+   end
+ end
data/test/test_helper.rb ADDED
@@ -0,0 +1,9 @@
+ require 'rubygems'
+ require 'test/unit'
+
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require 'robot_rules'
+
+ class Test::Unit::TestCase
+ end
metadata ADDED
@@ -0,0 +1,66 @@
+ --- !ruby/object:Gem::Specification
+ name: jeremyf-robot_rules
+ version: !ruby/object:Gem::Version
+   version: 0.9.1
+ platform: ruby
+ authors:
+ - James Edward Gray II
+ - Jeremy Friesen
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-07-28 00:00:00 -07:00
+ default_executable:
+ dependencies: []
+
+ description:
+ email: jeremy.n.friesen@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE
+ - README.markdown
+ files:
+ - .document
+ - .gitignore
+ - LICENSE
+ - README.markdown
+ - Rakefile
+ - VERSION.yml
+ - lib/robot_rules.rb
+ - robot_rules.gemspec
+ - test/robot_rules_test.rb
+ - test/test_helper.rb
+ has_rdoc: false
+ homepage: http://github.com/jeremyf/robot_rules
+ licenses:
+ post_install_message:
+ rdoc_options:
+ - --charset=UTF-8
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: A tool to determine if the robots.txt would prevent a given user agent from making a request to a given URI.
+ test_files:
+ - test/robot_rules_test.rb
+ - test/test_helper.rb