robots 0.0.0 → 0.7.3

data/.gitignore ADDED
@@ -0,0 +1 @@
+ *.gem
data/CHANGELOG ADDED
@@ -0,0 +1,19 @@
+ 0.7.3
+ - Move to jeweler
+ 0.7.2
+ - Add Ruby 1.9 compatibility
+ 0.5-0.7.1
+ - Lost the changelog information :/
+ 0.4.0
+ - Fixed other_values bug
+ - Added crawl-delay support
+ 0.3.2
+ - Fixed breaking on reddit.com
+ 0.3.1
+ - Fixed bug in disallows handling
+ - Partially mocked out open-uri
+ 0.3.0
+ - Added loggable dependency
+ 0.2.0
+ - If robots.txt 404s, assume allowed.
+ - Added CHANGELOG
data/README ADDED
@@ -0,0 +1,33 @@
+ A simple Ruby library to parse robots.txt.
+
+ Usage:
+
+ robots = Robots.new "Some User Agent"
+ assert robots.allowed?("http://www.yelp.com/foo")
+ assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
+ robots.other_values("http://foo.com") # gets misc. key/values (e.g. sitemaps)
+
+ If you want caching, you're on your own. I suggest marshalling an instance of the parser.
+
+ Copyright (c) 2008 Kyle Maxwell
+
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
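
The README leaves caching to the caller; here is a minimal sketch of the marshalling approach it suggests. The cache path and the flow are illustrative assumptions, not part of the gem:

  require "robots"

  CACHE_PATH = "/tmp/robots_cache.dump" # hypothetical location, not part of the gem

  # Reuse a previously marshalled parser when one exists
  robots =
    if File.exist?(CACHE_PATH)
      Marshal.load(File.binread(CACHE_PATH))
    else
      Robots.new("Some User Agent")
    end

  robots.allowed?("http://www.yelp.com/foo") # fetches and parses yelp.com's robots.txt

  # Persist the instance, including its per-host parsed rules
  File.open(CACHE_PATH, "wb") { |f| f.write(Marshal.dump(robots)) }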
data/Rakefile ADDED
@@ -0,0 +1,57 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "robots"
+     gem.summary = "Simple robots.txt parser"
+     gem.description = "It parses robots.txt files"
+     gem.email = "kyle@kylemaxwell.com"
+     gem.homepage = "http://github.com/fizx/robots"
+     gem.authors = ["Kyle Maxwell"]
+     gem.add_development_dependency "thoughtbot-shoulda"
+     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+ task :test => :check_dependencies
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION')
+     version = File.read('VERSION')
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "robots #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.7.3
data/lib/robots.rb ADDED
@@ -0,0 +1,133 @@
+ require "open-uri"
+ require "uri"
+ require "rubygems"
+ require "timeout"
+
+ class Robots
+
+   DEFAULT_TIMEOUT = 3
+
+   class ParsedRobots
+
+     def initialize(uri, user_agent)
+       @last_accessed = Time.at(1)
+
+       io = nil
+       begin
+         Timeout::timeout(Robots.timeout) do
+           io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+         end
+       rescue Timeout::Error
+         STDERR.puts "robots.txt request timed out"
+       end
+
+
+       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+         io = StringIO.new("User-agent: *\nAllow: /\n")
+       end
+
+       @other = {}
+       @disallows = {}
+       @allows = {}
+       @delays = {} # Crawl-delay values, keyed by agent regex
+       agent = /.*/
+       io.each do |line|
+         next if line =~ /^\s*(#.*|$)/
+         arr = line.split(":")
+         key = arr.shift
+         value = arr.join(":")
+         value.strip!
+         case key
+         when "User-agent"
+           agent = to_regex(value)
+         when "Allow"
+           @allows[agent] ||= []
+           @allows[agent] << to_regex(value)
+         when "Disallow"
+           @disallows[agent] ||= []
+           @disallows[agent] << to_regex(value)
+         when "Crawl-delay"
+           @delays[agent] = value.to_i
+         else
+           @other[key] = value
+         end
+       end
+
+       @parsed = true
+     end
+
+     def allowed?(uri, user_agent)
+       return true unless @parsed
+       allowed = true
+       path = uri.request_uri
+
+       @disallows.each do |key, value|
+         if user_agent =~ key
+           value.each do |rule|
+             if path =~ rule
+               allowed = false
+             end
+           end
+         end
+       end
+
+       @allows.each do |key, value|
+         unless allowed
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = true
+               end
+             end
+           end
+         end
+       end
+
+       if allowed && (delay = @delays.find { |agent, _| user_agent =~ agent })
+         sleep [delay.last - (Time.now - @last_accessed), 0].max
+         @last_accessed = Time.now
+       end
+
+       return allowed
+     end
+
+     def other_values
+       @other
+     end
+
+     protected
+
+     def to_regex(pattern)
+       pattern = Regexp.escape(pattern)
+       pattern.gsub!(Regexp.escape("*"), ".*")
+       Regexp.compile("^#{pattern}")
+     end
+   end
+
+   def self.timeout=(t)
+     @timeout = t
+   end
+
+   def self.timeout
+     @timeout || DEFAULT_TIMEOUT
+   end
+
+   def initialize(user_agent)
+     @user_agent = user_agent
+     @parsed = {}
+   end
+
+   def allowed?(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+     @parsed[host].allowed?(uri, @user_agent)
+   end
+
+   def other_values(uri)
+     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+     host = uri.host
+     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
+     @parsed[host].other_values
+   end
+ end
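
Tying the API above together: Robots.timeout bounds each robots.txt fetch, and each host's parsed file is cached on the Robots instance. A short usage sketch, with an illustrative host and agent name:

  require "robots"

  Robots.timeout = 5 # seconds allowed per robots.txt fetch

  robots = Robots.new("MyCrawler/1.0")
  robots.allowed?("http://example.com/some/page")  # fetches example.com/robots.txt once
  robots.allowed?("http://example.com/other/page") # reuses the cached ParsedRobots
  robots.other_values("http://example.com")        # leftover key/values, e.g. {"Sitemap" => "..."}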
data/robots.gemspec ADDED
@@ -0,0 +1,50 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{robots}
+   s.version = "0.7.3"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Kyle Maxwell"]
+   s.date = %q{2009-10-18}
+   s.description = %q{It parses robots.txt files}
+   s.email = %q{kyle@kylemaxwell.com}
+   s.extra_rdoc_files = [
+     "README"
+   ]
+   s.files = [
+     ".gitignore",
+     "CHANGELOG",
+     "README",
+     "Rakefile",
+     "VERSION",
+     "lib/robots.rb",
+     "robots.gemspec",
+     "test/fixtures/robots1.txt",
+     "test/test_robots.rb"
+   ]
+   s.homepage = %q{http://github.com/fizx/robots}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.4}
+   s.summary = %q{Simple robots.txt parser}
+   s.test_files = [
+     "test/test_robots.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+       s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+     else
+       s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+     end
+   else
+     s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+   end
+ end
data/test/fixtures/robots1.txt ADDED
File without changes
data/test/test_robots.rb ADDED
@@ -0,0 +1,45 @@
+ #!/usr/bin/env ruby
+ require "test/unit"
+ require File.dirname(__FILE__) + "/../lib/robots"
+
+ module Kernel
+   alias_method :open_old, :open
+
+   def set_open(key, value)
+     @fake_open_values ||= {}
+     @fake_open_values[key] = value
+   end
+
+   def open(*args)
+     @fake_open_values ||= {}
+     @fake_open_values[args.first] || open_old(*args)
+   end
+ end
+
+ class TestRobots < Test::Unit::TestCase
+   def setup
+     @robots = Robots.new "Ruby-Robot.txt Parser Test Script"
+   end
+
+   def test_allowed_if_no_robots
+     assert @robots.allowed?("http://www.yahoo.com")
+   end
+
+   def test_reddit
+     assert @robots.allowed?("http://reddit.com")
+   end
+
+   def test_other
+     assert @robots.allowed?("http://www.yelp.com/foo")
+     assert !@robots.allowed?("http://www.yelp.com/mail?foo=bar")
+   end
+
+   def test_site_with_disallowed
+     assert @robots.allowed?("http://www.google.com/")
+   end
+
+   def test_other_values
+     sitemap = {"Sitemap" => "http://www.eventbrite.com/sitemap_index.xml"}
+     assert_equal(sitemap, @robots.other_values("http://eventbrite.com"))
+   end
+ end
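
The Kernel#open alias above lets a test register canned responses keyed by URL. A hedged sketch of feeding the parser a fake robots.txt through set_open (the URL and rules are made up; ParsedRobots checks content_type and status, so the stub must answer both, and the interception only covers fetches routed through Kernel#open):

  require "stringio"

  fake = StringIO.new("User-agent: *\nDisallow: /private\n")
  def fake.content_type; "text/plain"; end # satisfy the text/plain check
  def fake.status; ["200", "OK"]; end      # satisfy the 200 OK check

  set_open("http://example.com/robots.txt", fake)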
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: robots
  version: !ruby/object:Gem::Version
- version: 0.0.0
+ version: 0.7.3
  platform: ruby
  authors:
  - Kyle Maxwell
@@ -9,27 +9,44 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-10-10 00:00:00 -07:00
+ date: 2009-10-18 00:00:00 -07:00
  default_executable:
- dependencies: []
-
- description:
- email:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: thoughtbot-shoulda
+   type: :development
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ description: It parses robots.txt files
+ email: kyle@kylemaxwell.com
  executables: []

  extensions: []

- extra_rdoc_files: []
-
- files: []
-
+ extra_rdoc_files:
+ - README
+ files:
+ - .gitignore
+ - CHANGELOG
+ - README
+ - Rakefile
+ - VERSION
+ - lib/robots.rb
+ - robots.gemspec
+ - test/fixtures/robots1.txt
+ - test/test_robots.rb
  has_rdoc: true
- homepage:
+ homepage: http://github.com/fizx/robots
  licenses: []

  post_install_message:
- rdoc_options: []
-
+ rdoc_options:
+ - --charset=UTF-8
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -50,6 +67,6 @@ rubyforge_project:
  rubygems_version: 1.3.4
  signing_key:
  specification_version: 3
- summary: Placeholder for a gem to be migrated later
- test_files: []
-
+ summary: Simple robots.txt parser
+ test_files:
+ - test/test_robots.rb