colinramsay-robotstxt 1.0

Files changed (3)
  1. data/README +17 -0
  2. data/lib/robotstxtparser.rb +49 -0
  3. metadata +54 -0
data/README ADDED
@@ -0,0 +1,17 @@
+ robots.txt is an extremely simple format, but it still needs some parser love. This class will normalise robots.txt entries for you.
+
+ USAGE
+ -----
+
+ With the following robots.txt:
+
+ User-agent: *
+ Disallow: /logs
+
+ User-agent: Google
+ Disallow: /admin
+
+ # Also accepts a local file path
+ rp = RobotsTxtParser.new("http://something.com/robots.txt")
+
+ rp.user_agents['Google'] # returns ["/admin", "/logs"]
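
A minimal sketch of the local-file usage the README mentions, assuming the example robots.txt above has been saved as robots.txt in the working directory (the file name is illustrative) and the RobotsTxtParser class from lib/robotstxtparser.rb below is loaded:

    rp = RobotsTxtParser.new("robots.txt")  # local path instead of a URL
    rp.user_agents['*']                     # => ["/logs"]
    rp.user_agents['Google']                # => ["/admin", "/logs"] (wildcard rules merged in)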
data/lib/robotstxtparser.rb ADDED
@@ -0,0 +1,49 @@
+ require 'open-uri'
+
+ # Parses a robots.txt file and exposes the Disallow rules for each user agent.
+ class RobotsTxtParser
+
+   # Hash mapping each user agent string to an array of disallowed paths.
+   attr_reader :user_agents
+
+   # Accepts either a URL (fetched via open-uri) or a local file path.
+   def initialize(path)
+     if path.include?("://")
+       raw_data = open(path)
+     else
+       raw_data = File.open(path)
+     end
+
+     return unless raw_data
+
+     @user_agents = Hash.new
+
+     parse(raw_data)
+   end
+
+   def parse(raw_data)
+     current_agent = nil
+
+     raw_data.each_line do |line|
+       if line.match(/^User-agent:/i)
+         current_agent = line.sub(/^User-agent:/i, "").strip
+       elsif line.match(/^Disallow:/i)
+         @user_agents[current_agent] ||= Array.new
+         @user_agents[current_agent].push line.sub(/^Disallow:/i, "").strip
+       end
+     end
+
+     add_wildcard_records
+   end
+
+   # Wildcard ("*") rules apply to every agent, so merge them into each
+   # named agent's record list.
+   def add_wildcard_records
+     if @user_agents.has_key?('*')
+       @user_agents.each do |agent, records|
+         next if agent == '*' # don't merge the wildcard's rules into itself
+         @user_agents[agent] = records + @user_agents['*']
+       end
+     end
+   end
+ end
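
A quick way to sanity-check the parser end to end without touching the network is to write the README's example to a temporary file and parse it. This is a sketch, not part of the gem; it assumes the class file above is loadable via require_relative:

    require 'tempfile'
    require_relative 'lib/robotstxtparser' # path within the gem; adjust as needed

    Tempfile.create('robots') do |f|
      f.write("User-agent: *\nDisallow: /logs\n\nUser-agent: Google\nDisallow: /admin\n")
      f.flush # make the contents visible before re-reading by path

      rp = RobotsTxtParser.new(f.path)
      p rp.user_agents # => {"*"=>["/logs"], "Google"=>["/admin", "/logs"]}
    end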
metadata ADDED
@@ -0,0 +1,54 @@
+ --- !ruby/object:Gem::Specification
+ name: colinramsay-robotstxt
+ version: !ruby/object:Gem::Version
+   version: "1.0"
+ platform: ruby
+ authors:
+ - Colin Ramsay
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-06-23 00:00:00 -07:00
+ default_executable:
+ dependencies: []
+
+ description: Parse robots.txt files from Ruby.
+ email: colin@gotripod.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - lib/robotstxtparser.rb
+ has_rdoc: false
+ homepage: http://github.com/colinramsay/robotstxt
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Parse robots.txt files from Ruby.
+ test_files: []
+
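
For reference, a hand-written .gemspec that would produce metadata along these lines might look like the following sketch. Every value is taken from the YAML above, but the file name robotstxt.gemspec and the exact attribute set are illustrative:

    Gem::Specification.new do |s|
      s.name          = 'colinramsay-robotstxt'
      s.version       = '1.0'
      s.authors       = ['Colin Ramsay']
      s.email         = 'colin@gotripod.com'
      s.homepage      = 'http://github.com/colinramsay/robotstxt'
      s.summary       = 'Parse robots.txt files from Ruby.'
      s.description   = 'Parse robots.txt files from Ruby.'
      s.files         = ['README', 'lib/robotstxtparser.rb']
      s.require_paths = ['lib']
    end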