simply-robotstxt 1.0.4

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Files changed (3)
  1. data/README +22 -0
  2. data/lib/robotstxtparser.rb +48 -0
  3. metadata +63 -0
data/README ADDED
@@ -0,0 +1,22 @@
+ robots.txt is an extremely simple format, but it still needs some parser loving. This class will normalise robots.txt entries for you.
+
+ USAGE
+ -----
+
+ With the following robots.txt:
+
+ User-agent: *
+ Disallow: /logs
+
+ User-agent: Google
+ Disallow: /admin
+
+ Use it like this:
+
+ require 'robotstxtparser'
+
+ rp = RobotsTxtParser.new
+ # read also accepts the path to a local file
+ rp.read("http://something.com/robots.txt")
+
+ rp.user_agents['Google'] # => ["/admin", "/logs"]
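
For reference, a minimal sketch (not part of the gem) of what the parsed structure looks like for the example above, assuming the wildcard rules are merged into each named agent as the parser does; the "robots.txt" path is just an illustrative local copy of the file shown:

    require 'robotstxtparser'

    rp = RobotsTxtParser.new
    rp.read("robots.txt")  # hypothetical local copy of the example above

    rp.user_agents
    # => { "*" => ["/logs"], "Google" => ["/admin", "/logs"] }
    rp.user_agents.has_key?("Bing")
    # => false -- only agents named in the file get an entry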
data/lib/robotstxtparser.rb ADDED
@@ -0,0 +1,48 @@
+ require 'open-uri'
+
+ class RobotsTxtParser
+
+   # Hash mapping each user agent string to its array of Disallow paths
+   attr_reader :user_agents
+
+   # Read robots.txt from a URL or a local file path, then parse it.
+   def read(path)
+     begin
+       if path.include?("://")
+         raw_data = open(path)  # open-uri lets open() fetch URLs
+       else
+         raw_data = File.open(path)
+       end
+     rescue
+       raw_data = nil
+     end
+
+     @user_agents = Hash.new
+
+     return unless raw_data
+
+     parse(raw_data)
+   end
+
+   def parse(raw_data)
+     current_agent = nil
+
+     raw_data.each_line do |line|
+       if line.match(/^User-agent:/)
+         current_agent = line.gsub("User-agent:", "").strip
+       elsif line.match(/^Disallow:/)
+         @user_agents[current_agent] = Array.new unless @user_agents[current_agent]
+         @user_agents[current_agent].push line.gsub("Disallow:", "").strip
+       end
+     end
+
+     add_wildcard_records
+   end
+
+   # Append the wildcard (*) rules to every other agent's rule list.
+   def add_wildcard_records
+     if @user_agents.has_key?('*')
+       @user_agents.each do |agent, records|
+         next if agent == '*'
+         @user_agents[agent] = records + @user_agents['*']
+       end
+     end
+   end
+ end
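
As a quick check of the parse and wildcard-merge behaviour, a minimal sketch (not part of the gem, assuming the parser above) that writes the sample robots.txt to a Tempfile so read takes its local-file branch:

    require 'tempfile'
    require 'robotstxtparser'

    # Write the sample robots.txt to a temporary file.
    file = Tempfile.new('robots')
    file.write "User-agent: *\nDisallow: /logs\n\nUser-agent: Google\nDisallow: /admin\n"
    file.flush

    rp = RobotsTxtParser.new
    rp.read(file.path)

    rp.user_agents['*']       # => ["/logs"]
    rp.user_agents['Google']  # => ["/admin", "/logs"]

The wildcard rules are appended to every named agent's list, which is why Google picks up "/logs" as well as its own "/admin" entry.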
metadata ADDED
@@ -0,0 +1,63 @@
+ --- !ruby/object:Gem::Specification
+ name: simply-robotstxt
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 1
+   - 0
+   - 4
+   version: 1.0.4
+ platform: ruby
+ authors:
+ - Colin Ramsay
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-06-30 00:00:00 +01:00
+ default_executable:
+ dependencies: []
+
+ description: Parse robots.txt files from Ruby.
+ email: colin@gotripod.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - lib/robotstxtparser.rb
+ has_rdoc: true
+ homepage: http://github.com/colinramsay/simply-robotstxt
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: Parse robots.txt files from Ruby.
+ test_files: []
+
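
Since the gemspec puts lib on the load path and ships lib/robotstxtparser.rb with no runtime dependencies, using the released gem should amount to roughly the following; the install command assumes the default RubyGems source:

    # gem install simply-robotstxt
    require 'robotstxtparser'

    rp = RobotsTxtParser.new
    rp.read("http://something.com/robots.txt")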