simply-robotstxt 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +22 -0
- data/lib/robotstxtparser.rb +48 -0
- metadata +63 -0
data/README
ADDED
@@ -0,0 +1,22 @@
robots.txt is an extremely simple format, but it still needs some parser loving. This class will normalise robots.txt entries for you.

USAGE
-----

With the following robots.txt:

User-agent: *
Disallow: /logs

User-agent: Google
Disallow: /admin

Use it like this:

require 'robotstxtparser'

# Also accepts a local file
rp = RobotsTxtParser.new
rp.read("http://something.com/robots.txt")

rp.user_agents['Google'] # returns ["/admin", "/logs"]
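The user_agents hash only exposes the raw Disallow values for each agent; deciding whether a particular URL path is actually blocked is left to the caller. Below is a minimal sketch of such a check. It is illustrative only: blocked? is a hypothetical helper, not part of simply-robotstxt, and it treats each Disallow entry as a plain path prefix.

require 'robotstxtparser'

# Hypothetical helper, not part of the gem. An empty Disallow value
# (which means "allow everything") is skipped rather than matched.
def blocked?(parser, agent, path)
  rules = parser.user_agents[agent] || parser.user_agents['*'] || []
  rules.any? { |prefix| !prefix.empty? && path.start_with?(prefix) }
end

rp = RobotsTxtParser.new
rp.read("http://something.com/robots.txt")

# Assuming the example robots.txt shown above:
blocked?(rp, 'Google', '/admin/users') # => true
blocked?(rp, 'Google', '/posts')       # => false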
data/lib/robotstxtparser.rb
ADDED
@@ -0,0 +1,48 @@
require 'open-uri'

class RobotsTxtParser

  attr_reader :user_agents

  def read(path)
    begin
      if path.include?("://")
        raw_data = open(path)
      else
        raw_data = File.open(path)
      end
    rescue
      raw_data = nil
    end

    @user_agents = Hash.new

    return unless raw_data

    parse(raw_data)
  end

  def parse(raw_data)
    current_agent = nil

    raw_data.each_line do |line|

      if line.match(/^User-agent:/i)
        current_agent = line.sub(/^User-agent:/i, "").strip
      elsif line.match(/^Disallow:/i)
        @user_agents[current_agent] = Array.new unless @user_agents[current_agent]
        @user_agents[current_agent].push line.sub(/^Disallow:/i, "").strip
      end
    end

    add_wildcard_records
  end

  def add_wildcard_records
    if @user_agents.has_key?('*')
      @user_agents.each do |agent, records|
        @user_agents[agent] = records + @user_agents['*'] unless agent == '*'
      end
    end
  end
end
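To make the wildcard handling concrete, here is a small self-contained script (illustrative only, not shipped with the gem) that feeds the README's example robots.txt through the parser via a temporary file; read treats any path without "://" as a local file:

require 'tempfile'
require 'robotstxtparser'

robots = Tempfile.new('robots')
robots.write(<<~TXT)
  User-agent: *
  Disallow: /logs

  User-agent: Google
  Disallow: /admin
TXT
robots.close

rp = RobotsTxtParser.new
rp.read(robots.path)   # no "://" in the path, so File.open is used

rp.user_agents
# => {"*"=>["/logs"], "Google"=>["/admin", "/logs"]}
# add_wildcard_records appends the "*" rules to every named agent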
metadata
ADDED
@@ -0,0 +1,63 @@
--- !ruby/object:Gem::Specification
name: simply-robotstxt
version: !ruby/object:Gem::Version
  prerelease: false
  segments:
  - 1
  - 0
  - 4
  version: 1.0.4
platform: ruby
authors:
- Colin Ramsay
autorequire:
bindir: bin
cert_chain: []

date: 2009-06-30 00:00:00 +01:00
default_executable:
dependencies: []

description: Parse robots.txt files from Ruby.
email: colin@gotripod.com
executables: []

extensions: []

extra_rdoc_files: []

files:
- README
- lib/robotstxtparser.rb
has_rdoc: true
homepage: http://github.com/colinramsay/simply-robotstxt
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      segments:
      - 0
      version: "0"
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      segments:
      - 0
      version: "0"
requirements: []

rubyforge_project:
rubygems_version: 1.3.6
signing_key:
specification_version: 3
summary: Parse robots.txt files from Ruby.
test_files: []
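For reference, the YAML above is the serialized specification that RubyGems generated when the gem was built; the gem's own .gemspec source file is not part of this diff. A hand-written gemspec producing roughly the same metadata might look like the following sketch (the file name and layout are assumptions):

# simply-robotstxt.gemspec -- hypothetical reconstruction, not shipped with the gem
Gem::Specification.new do |s|
  s.name          = 'simply-robotstxt'
  s.version       = '1.0.4'
  s.authors       = ['Colin Ramsay']
  s.email         = 'colin@gotripod.com'
  s.homepage      = 'http://github.com/colinramsay/simply-robotstxt'
  s.summary       = 'Parse robots.txt files from Ruby.'
  s.description   = 'Parse robots.txt files from Ruby.'
  s.files         = ['README', 'lib/robotstxtparser.rb']
  s.require_paths = ['lib']
end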