robots 0.0.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ *.gem
@@ -0,0 +1,19 @@
1
+ 0.7.3
2
+ - Move to jeweler
3
+ 0.7.2
4
+ - Add Ruby 1.9 compatibility
5
+ 0.5-0.7.1
6
+ - Lost the changelog information :/
7
+ 0.4.0
8
+ - Fixed other_values bug
9
+ - added crawl-delay support
10
+ 0.3.2
11
+ - fixed breaking on reddit.com
12
+ 0.3.1
13
+ - fixed bug in disallows handling
14
+ - partially mocked out open-uri
15
+ 0.3.0
16
+ - added loggable dependency
17
+ 0.2.0
18
+ If robots.txt 404s, assume allowed.
19
+ - Added CHANGELOG
data/README ADDED
@@ -0,0 +1,33 @@
1
+ A simple Ruby library to parse robots.txt.
2
+
3
+ Usage:
4
+
5
+ robots = Robots.new "Some User Agent"
6
+ assert robots.allowed?("http://www.yelp.com/foo")
7
+ assert !robots.allowed?("http://www.yelp.com/mail?foo=bar")
8
+ robots.other_values("http://foo.com") # gets misc. key/values (i.e. sitemaps)
9
+
10
+ If you want caching, you're on your own. I suggest marshalling an instance of the parser.
11
+
12
+ Copyright (c) 2008 Kyle Maxwell
13
+
14
+ Permission is hereby granted, free of charge, to any person
15
+ obtaining a copy of this software and associated documentation
16
+ files (the "Software"), to deal in the Software without
17
+ restriction, including without limitation the rights to use,
18
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the
20
+ Software is furnished to do so, subject to the following
21
+ conditions:
22
+
23
+ The above copyright notice and this permission notice shall be
24
+ included in all copies or substantial portions of the Software.
25
+
26
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
28
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
30
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
31
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
32
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
33
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,57 @@
1
require 'rubygems'
require 'rake'

# Gem packaging via Jeweler; degrades gracefully when it isn't installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |spec|
    spec.name        = "robots"
    spec.summary     = "Simple robots.txt parser"
    spec.description = "It parses robots.txt files"
    spec.email       = "kyle@kylemaxwell.com"
    spec.homepage    = "http://github.com/fizx/robots"
    spec.authors     = ["Kyle Maxwell"]
    spec.add_development_dependency "thoughtbot-shoulda"
    # spec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests under test/, loaded from lib/ and test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Coverage via RCov when available; otherwise the :rcov task aborts with help.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.pattern = 'test/**/*_test.rb'
    t.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :test => :check_dependencies

task :default => :test

# RDoc generation; the title embeds the gem VERSION when the file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rd|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rd.rdoc_dir = 'rdoc'
  rd.title = "robots #{version}"
  rd.rdoc_files.include('README*')
  rd.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.7.3
@@ -0,0 +1,133 @@
1
+ require "open-uri"
2
+ require "uri"
3
+ require "rubygems"
4
+ require "timeout"
5
+
6
class Robots

  # Default robots.txt fetch timeout, in seconds.
  DEFAULT_TIMEOUT = 3

  # Fetches and parses a single host's robots.txt and answers allow/deny
  # queries for a given user agent.
  class ParsedRobots

    # uri        - URI (or uri-like object) of any page on the target host.
    # user_agent - String sent as the User-Agent header when fetching.
    #
    # Any fetch failure (timeout, network error, non-200, non-text/plain
    # body) falls back to a permissive "allow everything" ruleset.
    def initialize(uri, user_agent)
      @last_accessed = Time.at(1)

      io = nil
      begin
        Timeout::timeout(Robots.timeout) do
          io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
        end
      rescue Timeout::Error
        STDERR.puts "robots.txt request timed out"
      end

      if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
        io = StringIO.new("User-agent: *\nAllow: /\n")
      end

      @other = {}
      @disallows = {} # agent regex => [path regexes]
      @allows = {}    # agent regex => [path regexes]
      @delays = {}    # agent regex => crawl delay in seconds
      agent = /.*/
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/ # skip blank lines and comments
        arr = line.split(":")
        key = arr.shift
        value = arr.join(":").strip # rejoin the rest: URLs contain ":"
        case key
        when "User-agent"
          agent = to_regex(value)
        when "Allow"
          @allows[agent] ||= []
          @allows[agent] << to_regex(value)
        when "Disallow"
          # Fix: a bare "Disallow:" means "allow everything". Recording it
          # would produce /^/, which matches every path and blocks the site.
          unless value.empty?
            @disallows[agent] ||= []
            @disallows[agent] << to_regex(value)
          end
        when "Crawl-delay"
          @delays[agent] = value.to_i
        else
          @other[key] = value
        end
      end

      @parsed = true
    end

    # Returns true when user_agent may fetch uri under the parsed rules.
    # Disallow rules are checked first; a matching Allow rule overrides.
    # Honors Crawl-delay by sleeping when called again too soon.
    def allowed?(uri, user_agent)
      return true unless @parsed
      allowed = true
      path = uri.request_uri

      @disallows.each do |key, value|
        if user_agent =~ key
          value.each do |rule|
            if path =~ rule
              allowed = false
            end
          end
        end
      end

      @allows.each do |key, value|
        unless allowed
          if user_agent =~ key
            value.each do |rule|
              if path =~ rule
                allowed = true
              end
            end
          end
        end
      end

      if allowed
        # Fix: @delays is keyed by User-agent *regex*, so match the agent
        # string against each key; the old String hash lookup never hit.
        @delays.each do |key, delay|
          next unless user_agent =~ key
          pause = delay - (Time.now - @last_accessed)
          sleep(pause) if pause > 0 # Fix: sleep raises on negative durations
          @last_accessed = Time.now
        end
      end

      return allowed
    end

    # Non-directive key/value pairs seen in robots.txt (e.g. "Sitemap").
    def other_values
      @other
    end

    protected

    # Converts a robots.txt path pattern into an anchored Regexp;
    # "*" is the only wildcard, everything else is matched literally.
    def to_regex(pattern)
      pattern = Regexp.escape(pattern)
      pattern.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{pattern}")
    end
  end

  # Process-wide robots.txt fetch timeout setter (seconds).
  def self.timeout=(t)
    @timeout = t
  end

  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end

  # user_agent - String identifying this crawler, used both as the HTTP
  # User-Agent header and for matching User-agent rules.
  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {} # host => ParsedRobots cache
  end

  # Returns true when @user_agent may fetch uri (String or URI).
  # robots.txt is fetched and cached per host on first use.
  def allowed?(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].allowed?(uri, @user_agent)
  end

  # Returns the hash of non-directive values (e.g. sitemaps) for uri's host.
  def other_values(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].other_values
  end
end
@@ -0,0 +1,50 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Generated by jeweler
# DO NOT EDIT THIS FILE
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
# -*- encoding: utf-8 -*-

Gem::Specification.new do |spec|
  spec.name    = "robots"
  spec.version = "0.7.3"

  spec.required_rubygems_version = Gem::Requirement.new(">= 0") if spec.respond_to? :required_rubygems_version=
  spec.authors     = ["Kyle Maxwell"]
  spec.date        = "2009-10-18"
  spec.description = "It parses robots.txt files"
  spec.email       = "kyle@kylemaxwell.com"
  spec.extra_rdoc_files = ["README"]
  spec.files = [
    ".gitignore",
    "CHANGELOG",
    "README",
    "Rakefile",
    "VERSION",
    "lib/robots.rb",
    "robots.gemspec",
    "test/fixtures/robots1.txt",
    "test/test_robots.rb"
  ]
  spec.homepage        = "http://github.com/fizx/robots"
  spec.rdoc_options    = ["--charset=UTF-8"]
  spec.require_paths   = ["lib"]
  spec.rubygems_version = "1.3.4"
  spec.summary         = "Simple robots.txt parser"
  spec.test_files      = ["test/test_robots.rb"]

  # Dependency declaration varies with the installed RubyGems generation.
  if spec.respond_to?(:specification_version)
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    spec.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0')
      spec.add_development_dependency("thoughtbot-shoulda", [">= 0"])
    else
      spec.add_dependency("thoughtbot-shoulda", [">= 0"])
    end
  else
    spec.add_dependency("thoughtbot-shoulda", [">= 0"])
  end
end
File without changes
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env ruby
2
+ require "test/unit"
3
+ require File.dirname(__FILE__) + "/../lib/robots"
4
+
5
# Test-only monkey patch: lets tests register canned return values for
# Kernel#open, so stubbed keys never hit the network or filesystem.
module Kernel
  alias_method :open_old, :open

  # Register a canned value to be returned by open(key).
  def set_open(key, value)
    @fake_open_values ||= {}
    @fake_open_values[key] = value
  end

  # Return the canned value when one was registered for args.first,
  # otherwise defer to the real open.
  # Fix: forward &block to open_old — the original dropped it, so
  # open(path) { |f| ... } silently ignored the block on passthrough.
  def open(*args, &block)
    @fake_open_values ||= {}
    @fake_open_values[args.first] || open_old(*args, &block)
  end
end
18
+
19
# Integration tests for Robots.
# NOTE: these exercise live robots.txt files, so they need network access
# (except where Kernel#open has been stubbed).
class TestRobots < Test::Unit::TestCase
  def setup
    @parser = Robots.new "Ruby-Robot.txt Parser Test Script"
  end

  # A host without a robots.txt defaults to fully allowed.
  def test_allowed_if_no_robots
    assert @parser.allowed?("http://www.yahoo.com")
  end

  # Regression: reddit.com's robots.txt used to break the parser.
  def test_reddit
    assert @parser.allowed?("http://reddit.com")
  end

  # Mixed allow/disallow rules on the same host.
  def test_other
    assert @parser.allowed?("http://www.yelp.com/foo")
    assert !@parser.allowed?("http://www.yelp.com/mail?foo=bar")
  end

  # A site with Disallow entries still permits the root path.
  def test_site_with_disallowed
    assert @parser.allowed?("http://www.google.com/")
  end

  # Non-directive keys (e.g. Sitemap) are surfaced via other_values.
  def test_other_values
    expected = {"Sitemap" => "http://www.eventbrite.com/sitemap_index.xml"}
    assert_equal(expected, @parser.other_values("http://eventbrite.com"))
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: robots
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.7.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kyle Maxwell
@@ -9,27 +9,44 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-10 00:00:00 -07:00
12
+ date: 2009-10-18 00:00:00 -07:00
13
13
  default_executable:
14
- dependencies: []
15
-
16
- description:
17
- email:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: thoughtbot-shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ description: It parses robots.txt files
26
+ email: kyle@kylemaxwell.com
18
27
  executables: []
19
28
 
20
29
  extensions: []
21
30
 
22
- extra_rdoc_files: []
23
-
24
- files: []
25
-
31
+ extra_rdoc_files:
32
+ - README
33
+ files:
34
+ - .gitignore
35
+ - CHANGELOG
36
+ - README
37
+ - Rakefile
38
+ - VERSION
39
+ - lib/robots.rb
40
+ - robots.gemspec
41
+ - test/fixtures/robots1.txt
42
+ - test/test_robots.rb
26
43
  has_rdoc: true
27
- homepage:
44
+ homepage: http://github.com/fizx/robots
28
45
  licenses: []
29
46
 
30
47
  post_install_message:
31
- rdoc_options: []
32
-
48
+ rdoc_options:
49
+ - --charset=UTF-8
33
50
  require_paths:
34
51
  - lib
35
52
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -50,6 +67,6 @@ rubyforge_project:
50
67
  rubygems_version: 1.3.4
51
68
  signing_key:
52
69
  specification_version: 3
53
- summary: Placeholder for a gem to be migrated later
54
- test_files: []
55
-
70
+ summary: Simple robots.txt parser
71
+ test_files:
72
+ - test/test_robots.rb