robot_rules 0.9.1

data/.document ADDED
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2009 James Edward Gray II and Jeremy Friesen
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,62 @@
+ robot_rules
+ ===========
+
+ A tool to determine whether a site's robots.txt file would prevent a
+ given user agent from making a request to a given URI.
+
+ Example
+ -------
+
+ Given the following:
+
+     #!/usr/local/bin/ruby -w
+
+     require "robot_rules"
+     require "open-uri"
+
+     rules      = RobotRules.new("RubyQuizBrowser 1.0")
+     robots_url = "http://pragmaticprogrammer.com/robots.txt"
+
+     open(robots_url) do |url|
+       data = url.read
+
+       puts "/robots.txt:"
+       puts data
+       puts
+
+       rules.parse(robots_url, data)
+     end
+
+     puts "URL tests:"
+     %w{ http://pragmaticprogrammer.com/images/dave.jpg
+         http://pragmaticprogrammer.com/imagination }.each do |test|
+       puts "rules.allowed?( #{test.inspect} )"
+       puts rules.allowed?(test)
+     end
+
+     __END__
+
+ This script will print:
+
+     /robots.txt:
+     User-agent: *
+     Disallow: images
+
+     URL tests:
+     rules.allowed?( "http://pragmaticprogrammer.com/images/dave.jpg" )
+     false
+     rules.allowed?( "http://pragmaticprogrammer.com/imagination" )
+     true
+
+ History
+ -------
+
+ RobotRules was created by James Edward Gray II as a solution to Ruby
+ Quiz #64, "Port a Library". A few years later, Jeremy Friesen wrapped
+ the library up into a gem and added some tests.
+
+ Copyright
+ ---------
+
+ Copyright (c) 2009 James Edward Gray II and Jeremy Friesen. See LICENSE for details.
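
A minimal sketch of the same flow used as a crawl guard (the host, paths, and agent name here are hypothetical; `open-uri` is used exactly as in the example above):

    require "robot_rules"
    require "open-uri"

    agent = "MyCrawler 1.0"
    rules = RobotRules.new(agent)

    # Fetch and parse robots.txt once per site...
    robots_url = "http://example.com/robots.txt"
    open(robots_url, "User-Agent" => agent) do |f|
      rules.parse(robots_url, f.read)
    end

    # ...then consult the parsed rules before every request.
    url  = "http://example.com/images/logo.png"
    page = open(url, "User-Agent" => agent).read if rules.allowed?(url)
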
data/Rakefile ADDED
@@ -0,0 +1,55 @@
+ require 'rubygems'
+ require 'rake'
+ require 'yaml'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name     = "robot_rules"
+     gem.summary  = %Q{A tool to determine whether a site's robots.txt file would prevent a given user agent from making a request to a given URI.}
+     gem.email    = "edgargonzalez@gmail.com"
+     gem.homepage = "http://github.com/hasmanydevelopers/robot_rules"
+     gem.authors  = ["James Edward Gray II", "Jeremy Friesen", "Edgar Gonzalez"]
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config  = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title    = "robot_rules #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
+ ---
+ :major: 0
+ :minor: 9
+ :patch: 1
data/lib/robot_rules.rb ADDED
@@ -0,0 +1,76 @@
+ #!/usr/local/bin/ruby -w
+
+ # robot_rules.rb
+ #
+ # Created by James Edward Gray II on 2006-01-31.
+ # Copyright 2006 Gray Productions. All rights reserved.
+
+ require "uri"
+
+ # Based on Perl's WWW::RobotRules module, by Gisle Aas.
+ class RobotRules
+   def initialize( user_agent )
+     # Normalize the agent name: first token only, version stripped, downcased.
+     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
+     # Maps "host:port" to an Array of disallowed path prefixes.
+     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+   end
+
+   # Parses +robots_data+, the body of the robots.txt file fetched from
+   # +text_uri+, and stores the rules that apply to this user agent.
+   def parse( text_uri, robots_data )
+     uri      = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     @rules.delete(location)
+
+     # Split into lines and strip comments.
+     rules      = robots_data.split(/[\015\012]+/).
+                              map { |rule| rule.sub(/\s*#.*$/, "") }
+     anon_rules = Array.new
+     my_rules   = Array.new
+     current    = anon_rules
+     rules.each do |rule|
+       case rule
+       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+         # Stop once a record matching this agent has been collected.
+         break unless my_rules.empty?
+
+         current = if $1 == "*"
+           anon_rules
+         elsif $1.downcase.index(@user_agent)
+           my_rules
+         else
+           nil
+         end
+       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+         next if current.nil?
+
+         if $1.empty?
+           current << nil  # an empty Disallow allows everything
+         else
+           disallow = URI.parse($1)
+
+           # Skip rules that cannot apply to the site we fetched from.
+           next unless disallow.scheme.nil? or disallow.scheme == uri.scheme
+           next unless disallow.port.nil?   or disallow.port   == uri.port
+           next unless disallow.host.nil?   or
+                       disallow.host.downcase == uri.host.downcase
+
+           disallow = disallow.path
+           disallow = "/"            if disallow.empty?
+           disallow = "/#{disallow}" unless disallow[0] == ?/
+
+           current << disallow
+         end
+       end
+     end
+
+     # Agent-specific rules win over the anonymous (*) record.
+     @rules[location] = if my_rules.empty?
+       anon_rules.compact
+     else
+       my_rules.compact
+     end
+   end
+
+   # Returns false only when a stored Disallow prefix matches the URI's path.
+   def allowed?( text_uri )
+     uri      = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     path     = uri.path
+
+     return true unless %w{http https}.include?(uri.scheme)
+
+     not @rules[location].any? { |rule| path.index(rule) == 0 }
+   end
+ end
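
To make the matching semantics above concrete — rules are stored per `host:port`, and each `Disallow` value is matched as a path prefix — here is a small sketch against a hypothetical host:

    require "robot_rules"

    rules = RobotRules.new("Ruby Spider 1.0")
    rules.parse("http://example.com/robots.txt",
                "User-agent: *\nDisallow: /private")

    rules.allowed?("http://example.com/private/notes.txt")  # => false (prefix match)
    rules.allowed?("http://example.com/public/notes.txt")   # => true
    rules.allowed?("http://example.com:8080/private/")      # => true  (different port)
    rules.allowed?("ftp://example.com/private/")            # => true  (non-HTTP scheme)
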
data/robot_rules.gemspec ADDED
@@ -0,0 +1,50 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE DIRECTLY
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{robot_rules}
+   s.version = "0.9.1"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["James Edward Gray II", "Jeremy Friesen", "Edgar Gonzalez"]
+   s.date = %q{2010-07-09}
+   s.email = %q{edgargonzalez@gmail.com}
+   s.extra_rdoc_files = [
+     "LICENSE",
+     "README.markdown"
+   ]
+   s.files = [
+     ".document",
+     ".gitignore",
+     "LICENSE",
+     "README.markdown",
+     "Rakefile",
+     "VERSION.yml",
+     "lib/robot_rules.rb",
+     "robot_rules.gemspec",
+     "test/robot_rules_test.rb",
+     "test/test_helper.rb"
+   ]
+   s.homepage = %q{http://github.com/hasmanydevelopers/robot_rules}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.6}
+   s.summary = %q{A tool to determine whether a site's robots.txt file would prevent a given user agent from making a request to a given URI.}
+   s.test_files = [
+     "test/test_helper.rb",
+     "test/robot_rules_test.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     # No dependencies are declared for either RubyGems branch, so both
+     # generated conditionals are intentionally empty.
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+     else
+     end
+   else
+   end
+ end
data/test/robot_rules_test.rb ADDED
@@ -0,0 +1,37 @@
+ require 'test_helper'
+
+ class RobotRulesTest < Test::Unit::TestCase
+   SITE_URL = "http://www.example.com"
+
+   def setup
+     @robot_rule = RobotRules.new('Ruby Spider 1.0')
+     @robot_rule.parse(File.join(SITE_URL, 'robots.txt'),
+                       %(User-agent: *\nDisallow: images))
+   end
+
+   def test_should_allow_path_imagination
+     assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'imagination/me.jpg'))
+   end
+
+   def test_should_disallow_path_images
+     assert_equal false, @robot_rule.allowed?(File.join(SITE_URL, 'images/me.jpg'))
+   end
+
+   def test_should_allow_path_images_for_other_site
+     # Rules were only parsed for SITE_URL, so other hosts are unrestricted.
+     assert_equal true, @robot_rule.allowed?(File.join("http://google.com", 'images/me.jpg'))
+   end
+
+   def test_should_abide_by_disallowed_user_agent
+     @robot_rule = RobotRules.new('Microsoft')
+     robots_txt = %(User-agent: Microsoft\nDisallow: google\nUser-agent: *\nDisallow: images)
+     @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), robots_txt)
+
+     assert_equal false, @robot_rule.allowed?(File.join(SITE_URL, 'google/hello_world.txt'))
+   end
+
+   def test_should_allow_user_agent_to_specified_path
+     @robot_rule = RobotRules.new('Google')
+     robots_txt = %(User-agent: Microsoft\nDisallow: google\nUser-agent: *\nDisallow: images)
+     @robot_rule.parse(File.join(SITE_URL, 'robots.txt'), robots_txt)
+
+     assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'google/hello_world.txt'))
+   end
+ end
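
One behavior the suite above does not cover is an empty `Disallow:` line, which the parser records as a `nil` placeholder and then compacts away, leaving everything allowed. A sketch of such a test, assuming the same `SITE_URL` constant:

    def test_should_allow_everything_on_empty_disallow
      @robot_rule = RobotRules.new('Ruby Spider 1.0')
      @robot_rule.parse(File.join(SITE_URL, 'robots.txt'),
                        %(User-agent: *\nDisallow:))

      assert_equal true, @robot_rule.allowed?(File.join(SITE_URL, 'images/me.jpg'))
    end
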
data/test/test_helper.rb ADDED
@@ -0,0 +1,9 @@
+ require 'rubygems'
+ require 'test/unit'
+
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+ require 'robot_rules'
+
+ class Test::Unit::TestCase
+ end
metadata ADDED
@@ -0,0 +1,75 @@
+ --- !ruby/object:Gem::Specification
+ name: robot_rules
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 9
+   - 1
+   version: 0.9.1
+ platform: ruby
+ authors:
+ - James Edward Gray II
+ - Jeremy Friesen
+ - Edgar Gonzalez
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-07-09 00:00:00 -04:30
+ default_executable:
+ dependencies: []
+
+ description:
+ email: edgargonzalez@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE
+ - README.markdown
+ files:
+ - .document
+ - .gitignore
+ - LICENSE
+ - README.markdown
+ - Rakefile
+ - VERSION.yml
+ - lib/robot_rules.rb
+ - robot_rules.gemspec
+ - test/robot_rules_test.rb
+ - test/test_helper.rb
+ has_rdoc: true
+ homepage: http://github.com/hasmanydevelopers/robot_rules
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --charset=UTF-8
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: A tool to determine whether a site's robots.txt file would prevent a given user agent from making a request to a given URI.
+ test_files:
+ - test/test_helper.rb
+ - test/robot_rules_test.rb