simply-robotstxt 1.0.4

Files changed (3)
  1. data/README +22 -0
  2. data/lib/robotstxtparser.rb +57 -0
  3. metadata +63 -0
data/README ADDED
@@ -0,0 +1,22 @@
+ robots.txt is an extremely simple format, but it still needs some parser loving. This class will normalise robots.txt entries for you.
+
+ USAGE
+ -----
+
+ With the following robots.txt:
+
+ User-agent: *
+ Disallow: /logs
+
+ User-agent: Google
+ Disallow: /admin
+
+ Use it like this:
+
+ require 'robotstxtparser'
+
+ # Also accepts a local file path
+ rp = RobotsTxtParser.new
+ rp.read("http://something.com/robots.txt")
+
+ rp.user_agents['Google'] # returns ["/admin", "/logs"]
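The usage comment notes that read also accepts a local file. A minimal sketch of that, assuming a robots.txt sits in the working directory (the filename here is only an example, not something the gem provides):

  require 'robotstxtparser'

  # Hypothetical local path; read falls back to File.open when the
  # argument contains no "://" scheme.
  rp = RobotsTxtParser.new
  rp.read("robots.txt")

  rp.user_agents  # hash of user agent => array of disallowed paths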
data/lib/robotstxtparser.rb ADDED
@@ -0,0 +1,57 @@
+ require 'open-uri'
+
+ class RobotsTxtParser
+
+   # Maps each user-agent string to the array of paths disallowed for it.
+   attr_reader :user_agents
+
+   def initialize
+     @user_agents = Hash.new
+   end
+
+   # Reads robots.txt from a URL or a local file path, then parses it.
+   def read(path)
+     begin
+       if path.include?("://")
+         raw_data = open(path)
+       else
+         raw_data = File.open(path)
+       end
+     rescue
+       raw_data = nil
+     end
+
+     @user_agents = Hash.new
+
+     return unless raw_data
+
+     parse(raw_data)
+   end
+
+   # Accepts anything that responds to each_line (an IO or a String).
+   def parse(raw_data)
+     current_agent = nil
+
+     raw_data.each_line do |line|
+       if line.match(/^user-agent:/i)
+         current_agent = line.sub(/^user-agent:/i, "").strip
+       elsif line.match(/^disallow:/i)
+         @user_agents[current_agent] = Array.new unless @user_agents[current_agent]
+         @user_agents[current_agent].push line.sub(/^disallow:/i, "").strip
+       end
+     end
+
+     add_wildcard_records
+   end
+
+   # Appends the wildcard (*) rules to every named agent, so each agent's
+   # array holds the full set of disallowed paths that apply to it.
+   def add_wildcard_records
+     if @user_agents.has_key?('*')
+       @user_agents.each do |agent, records|
+         next if agent == '*'
+         @user_agents[agent] = records + @user_agents['*']
+       end
+     end
+   end
+ end
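A quick sanity check of the normalisation behaviour, not part of the gem: since parse only calls each_line on its argument, a plain String can stand in for a downloaded robots.txt. The expected values follow from the wildcard merge in add_wildcard_records.

  require 'robotstxtparser'

  # The README's sample robots.txt as a plain String; no download needed.
  sample = "User-agent: *\nDisallow: /logs\n\nUser-agent: Google\nDisallow: /admin\n"

  rp = RobotsTxtParser.new
  rp.parse(sample)

  rp.user_agents['Google']  # => ["/admin", "/logs"]  (own rule plus the wildcard rule)
  rp.user_agents['*']       # => ["/logs"]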
metadata ADDED
@@ -0,0 +1,63 @@
+ --- !ruby/object:Gem::Specification
+ name: simply-robotstxt
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 1
+   - 0
+   - 4
+   version: 1.0.4
+ platform: ruby
+ authors:
+ - Colin Ramsay
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-06-30 00:00:00 +01:00
+ default_executable:
+ dependencies: []
+
+ description: Parse robots.txt files from Ruby.
+ email: colin@gotripod.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - lib/robotstxtparser.rb
+ has_rdoc: true
+ homepage: http://github.com/colinramsay/simply-robotstxt
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: Parse robots.txt files from Ruby.
+ test_files: []
+
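For readers more used to gemspec files than to the generated YAML above, here is a hypothetical simply-robotstxt.gemspec that would produce metadata along these lines; it is reconstructed from the fields shown and is an assumption, not a file shipped with the gem.

  Gem::Specification.new do |s|
    s.name          = "simply-robotstxt"
    s.version       = "1.0.4"
    s.authors       = ["Colin Ramsay"]
    s.email         = "colin@gotripod.com"
    s.homepage      = "http://github.com/colinramsay/simply-robotstxt"
    s.summary       = "Parse robots.txt files from Ruby."
    s.description   = "Parse robots.txt files from Ruby."
    s.files         = ["README", "lib/robotstxtparser.rb"]
    s.require_paths = ["lib"]
  end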