colinramsay-robotstxt 1.0

Files changed (3)
  1. data/README +17 -0
  2. data/lib/robotstxtparser.rb +44 -0
  3. metadata +54 -0
data/README ADDED
@@ -0,0 +1,17 @@
+ robots.txt is an extremely simple format, but it still needs some parser loving. This class will normalise robots.txt entries for you.
+
+ USAGE
+ -----
+
+ With the following robots.txt:
+
+ User-agent: *
+ Disallow: /logs
+
+ User-agent: Google
+ Disallow: /admin
+
+ # Also accepts a local file
+ rp = RobotsTxtParser.new("http://something.com/robots.txt")
+
+ rp.user_agents['Google'] # returns ["/admin", "/logs"]
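Given the robots.txt above, the wildcard (*) rules are merged into every named agent, so the parsed hash would look roughly like this (a sketch based on the class below; the local filename is illustrative):

    rp = RobotsTxtParser.new("robots.txt")  # the example robots.txt saved locally
    rp.user_agents
    # => { "*" => ["/logs"], "Google" => ["/admin", "/logs"] }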
data/lib/robotstxtparser.rb ADDED
@@ -0,0 +1,44 @@
+ require 'open-uri'
+
+ class RobotsTxtParser
+
+   attr_reader :user_agents
+
+   def initialize(path)
+     # Accept either a URL (fetched via open-uri) or a local file path
+     if path.include?("://")
+       raw_data = open(path)
+     else
+       raw_data = File.open(path)
+     end
+
+     return unless raw_data
+
+     @user_agents = Hash.new
+
+     parse(raw_data)
+   end
+
+   def parse(raw_data)
+     current_agent = nil
+
+     raw_data.each_line do |line|
+       # robots.txt records use "User-agent:" / "Disallow:" field names
+       if line.match(/^User-agent:/i)
+         current_agent = line.sub(/^User-agent:/i, "").strip
+       elsif line.match(/^Disallow:/i)
+         @user_agents[current_agent] = Array.new unless @user_agents[current_agent]
+         @user_agents[current_agent].push line.sub(/^Disallow:/i, "").strip
+       end
+     end
+
+     add_wildcard_records
+   end
+
+   # Merge the wildcard (*) rules into every named agent's list of records
+   def add_wildcard_records
+     if @user_agents.has_key?('*')
+       @user_agents.each do |agent, records|
+         next if agent == '*'
+         @user_agents[agent] = records + @user_agents['*']
+       end
+     end
+   end
+ end
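For reference, a minimal usage sketch once the class is on the load path; the URL is illustrative, not part of the gem:

    require 'robotstxtparser'

    rp = RobotsTxtParser.new("http://example.com/robots.txt")

    # Print each agent's normalised disallow list
    rp.user_agents.each do |agent, disallowed|
      puts "#{agent}: #{disallowed.join(', ')}"
    end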
metadata ADDED
@@ -0,0 +1,54 @@
+ --- !ruby/object:Gem::Specification
+ name: colinramsay-robotstxt
+ version: !ruby/object:Gem::Version
+   version: "1.0"
+ platform: ruby
+ authors:
+ - Colin Ramsay
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-06-23 00:00:00 -07:00
+ default_executable:
+ dependencies: []
+
+ description: Parse robots.txt files from Ruby.
+ email: colin@gotripod.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - lib/robotstxtparser.rb
+ has_rdoc: false
+ homepage: http://github.com/colinramsay/robotstxt
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Parse robots.txt files from Ruby.
+ test_files: []
+
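The name and homepage above follow the GitHub-built gem convention of the time, so installation would presumably have been from the GitHub gem source, for example:

    gem install colinramsay-robotstxt --source http://gems.github.com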