robotx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +34 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +17 -0
- data/LICENSE +21 -0
- data/README.md +72 -0
- data/Rakefile +2 -0
- data/lib/robotx.rb +114 -0
- data/robotx.gemspec +22 -0
- metadata +80 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 161a4310d0e1b28e499ce5dd6226125c6e345dd6
+  data.tar.gz: 2c33050af6edcdc516611e7eb8e1efc5a497ecf5
+SHA512:
+  metadata.gz: 6dc47d5c31e4629bb462ed353e31ec5e2b5b98fbf2a56363d87c8e9c9a8ed5a611341d88268f29b66b5b268acf4dce8e7766b7be0d6f189f1696544602d86d89
+  data.tar.gz: b939d2cf78e12054a92f8693ad35e5cc55efe75098df8b2e0c76c4347e3ad3763a15812a51968eba40c72b849c51b5373abddc9f0dfd92d49c3c2b1b85d59e84
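The digests above cover the two archives packed inside the published `.gem` file (a `.gem` is a plain tar archive holding `metadata.gz`, `data.tar.gz`, and `checksums.yaml.gz`). A minimal verification sketch, assuming the gem has already been downloaded, unpacked into the working directory (e.g. with `tar -xf robotx-0.1.0.gem`), and `checksums.yaml.gz` gunzipped:

~~~ruby
require 'digest'
require 'yaml'

# Assumption: metadata.gz, data.tar.gz and checksums.yaml sit in the
# current directory after unpacking the downloaded gem.
checksums = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |archive|
  actual   = Digest::SHA512.file(archive).hexdigest
  expected = checksums['SHA512'][archive]
  puts "#{archive}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
~~~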
data/.gitignore
ADDED
@@ -0,0 +1,34 @@
+*.gem
+*.rbc
+/.config
+/coverage/
+/InstalledFiles
+/pkg/
+/spec/reports/
+/test/tmp/
+/test/version_tmp/
+/tmp/
+
+## Specific to RubyMotion:
+.dat*
+.repl_history
+build/
+
+## Documentation cache and generated files:
+/.yardoc/
+/_yardoc/
+/doc/
+/rdoc/
+
+## Environment normalisation:
+/.bundle/
+/lib/bundler/man/
+
+# for a library or gem, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# Gemfile.lock
+# .ruby-version
+# .ruby-gemset
+
+# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+.rvmrc
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,72 @@
+# Robotx
+Robotx _(pronounced "robotex")_ is a simple but powerful parser for robots.txt files.
+It offers a bunch of features which allows you to check whether an URL is allowed/disallowed to be visited by a crawler.
+
+
+## Features
+
+- Maintains lists for allowed/disallowed URLs
+- Simple method to check whether an URL or just a path is allowed to be visited
+- Show all user agents covered by the robots.txt
+- Get the 'Crawl-Delay' for a website
+- Support for sitemap(s)
+
+## Installation
+### With Bundler
+Just add to your Gemfile
+~~~ruby
+gem 'robotx'
+~~~
+
+### Without Bundler
+If you're not using Bundler just execute on your commandline
+~~~bash
+$ gem install robotx
+~~~
+
+## Usage
+### Support for different user agents
+Robotx can be initialized with a special user agent. The default user agent is `*`.
+**Please note:** All method results depends on the user agent Robotx was initialized with.
+~~~ruby
+require 'robotx'
+
+# Initialize with the default user agent '*'
+robots_txt = Robotx.new('https://github.com')
+robots_txt.allowed # => ["/humans.txt"]
+
+# Initialize with 'googlebot' as user agent
+robots_txt = Robotx.new('https://github.com', 'googlebot')
+robots_txt.allowed # => ["/*/*/tree/master", "/*/*/blob/master"]
+~~~
+
+### Check whether an URL is allowed to be indexed
+~~~ruby
+require 'robotx'
+
+robots_txt = Robotx.new('https://github.com')
+robots_txt.allowed?('/humans.txt') # => true
+robots_txt.allowed?('/') # => false
+~~~
+
+### Get all allowed/disallowed URLs
+~~~ruby
+require 'robotx'
+
+robots_txt = Robotx.new('https://github.com')
+robots_txt.allowed # => ["/humans.txt"]
+robots_txt.disallowed # => ["/"]
+~~~
+
+### Get additional information
+~~~ruby
+require 'robotx'
+
+robots_txt = Robotx.new('https://github.com')
+robots_txt.sitemap # => []
+robots_txt.crawl_delay # => 0
+robots_txt.user_agents # => ["googlebot", "baiduspider", ...]
+~~~
+
+## Todo
+- Add tests
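Putting the documented methods together, a typical crawl loop might look like the following sketch. Only methods shown in the README above are used; the host, user agent, and URL list are illustrative, and actual page fetching is left out:

~~~ruby
require 'uri'
require 'robotx'

# Illustrative crawler loop: skip disallowed paths, honour Crawl-Delay.
robots_txt = Robotx.new('https://example.com', 'mycrawler')

['https://example.com/humans.txt', 'https://example.com/admin'].each do |url|
  next unless robots_txt.allowed?(URI.parse(url).path) # skip disallowed paths
  # ... fetch and process the page here ...
  sleep(robots_txt.crawl_delay)                        # 0 if no Crawl-Delay is set
end
~~~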
data/Rakefile
ADDED
data/lib/robotx.rb
ADDED
@@ -0,0 +1,114 @@
+require 'timeout'
+require 'stringio'
+require 'open-uri'
+require 'uri'
+require 'set'
+
+class Robotx
+
+  TIMEOUT = 30 # seconds
+
+  def initialize(uri, user_agent='*')
+    @uri = URI.parse(URI.encode(uri))
+    raise URI::InvalidURIError.new('scheme or host missing') unless @uri.scheme and @uri.host
+
+    @user_agent = user_agent.downcase
+    @robots_data = parse_robots_txt
+  end
+
+  def allowed
+    return disallowed.empty? ? ['/'] : @robots_data.fetch(@user_agent, {}).fetch('allow', ['/'])
+  end
+
+  def disallowed
+    return @robots_data.fetch(@user_agent, {}).fetch('disallow', [])
+  end
+
+  def allowed?(data)
+    if data.is_a?(Array) or data.is_a?(Set)
+      return {}.tap do |hash|
+        data.each do |uri|
+          hash[uri] = check_permission(uri)
+        end
+      end
+    end
+
+    return check_permission(data)
+  end
+
+  def sitemap
+    return @robots_data.fetch('sitemap', [])
+  end
+
+  def crawl_delay
+    return [@robots_data.fetch(@user_agent, {}).fetch('crawl-delay', 0), 0].max
+  end
+
+  def user_agents
+    return @robots_data.keys.delete_if { |agent| agent == 'sitemap' }
+  end
+
+  private
+
+  def load_robots_txt
+    Timeout::timeout(Robotx::TIMEOUT) do
+      if robots_txt_io = URI.join(@uri, 'robots.txt').open('User-Agent' => @user_agent) and robots_txt_io.content_type.downcase == 'text/plain' and robots_txt_io.status == ['200', 'OK']
+        return robots_txt_io
+      end
+      raise OpenURI::HTTPError
+    end
+  rescue
+    return StringIO.new("User-agent: *\nAllow: /\n")
+  end
+
+  def parse_robots_txt
+    agent = '*'
+    {}.tap do |hash|
+      load_robots_txt.each do |line|
+        next if line =~ /^\s*(#.*|$)/
+
+        data = line.split(/:/).map(&:strip)
+        key = data.shift
+        value = data.join
+
+        case key.downcase
+        when 'user-agent'
+          agent = value.downcase
+          hash[agent] ||= {}
+        when 'allow'
+          hash[agent]['allow'] ||= []
+          hash[agent]['allow'] << value.sub(/(\/){2,}$/, '')
+        when 'disallow'
+          # Disallow: '' means Allow: '/'
+          if value.empty?
+            hash[agent]['allow'] ||= []
+            hash[agent]['allow'] << '/'
+          else
+            hash[agent]['disallow'] ||= []
+            hash[agent]['disallow'] << value.sub(/(\/){2,}$/, '')
+          end
+        when 'crawl-delay'
+          hash[agent]['crawl-delay'] = value.to_i
+        when 'sitemap'
+          hash['sitemap'] ||= []
+          hash['sitemap'] << value.sub(/(\/){2,}$/, '')
+        else
+          hash[key] ||= []
+          hash[key] << value.sub(/(\/){2,}$/, '')
+        end
+      end
+    end
+  rescue
+    {}
+  end
+
+  def check_permission(uri)
+    uri = URI.parse(URI.encode(uri))
+    return true unless (@robots_data or @robots_data.any?) or (uri.scheme and uri.host)
+
+    uri_path = uri.path.sub(/(\/){2,}$/, '')
+    pattern = Regexp.compile("(^#{Regexp.escape(uri_path)}[\/]*$)|(^/$)")
+    return (@robots_data.fetch(@user_agent, {}).fetch('disallow', []).grep(pattern).empty? or @robots_data.fetch(@user_agent, {}).fetch('allow', []).grep(pattern).any?)
+  end
+
+end
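As the implementation above shows, the parsed robots.txt is kept as a nested hash keyed by user agent, and `Robotx#allowed?` also accepts an Array or Set, returning a Hash of URI to boolean. A short sketch of that batch form (host and output are illustrative, not taken from a real site):

~~~ruby
require 'robotx'

# Batch check: passing an Array (or Set) returns a Hash of uri => boolean,
# as implemented in Robotx#allowed? above.
robots_txt = Robotx.new('https://example.com', 'googlebot')
robots_txt.allowed?(['/public/page.html', '/private/data'])
# => {"/public/page.html"=>true, "/private/data"=>false}
~~~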
data/robotx.gemspec
ADDED
@@ -0,0 +1,22 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+Gem::Specification.new do |spec|
+  spec.name          = "robotx"
+  spec.version       = "0.1.0"
+  spec.authors       = ["Matthias Kalb"]
+  spec.email         = ["matthias.kalb@railsmechanic.de"]
+  spec.summary       = %q{A parser for the robots.txt file}
+  spec.description   = %q{A simple to use parser for the robots.txt file.}
+  spec.homepage      = "https://github.com/railsmechanic/robotx"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency "bundler", "~> 1.6"
+  spec.add_development_dependency "rake"
+end
metadata
ADDED
@@ -0,0 +1,80 @@
+--- !ruby/object:Gem::Specification
+name: robotx
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Matthias Kalb
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-07-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A simple to use parser for the robots.txt file.
+email:
+- matthias.kalb@railsmechanic.de
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Gemfile.lock
+- LICENSE
+- README.md
+- Rakefile
+- lib/robotx.rb
+- robotx.gemspec
+homepage: https://github.com/railsmechanic/robotx
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: A parser for the robots.txt file
+test_files: []