robotx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (10)
  1. checksums.yaml +7 -0
  2. data/.gitignore +34 -0
  3. data/Gemfile +4 -0
  4. data/Gemfile.lock +17 -0
  5. data/LICENSE +21 -0
  6. data/README.md +72 -0
  7. data/Rakefile +2 -0
  8. data/lib/robotx.rb +114 -0
  9. data/robotx.gemspec +22 -0
  10. metadata +80 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 161a4310d0e1b28e499ce5dd6226125c6e345dd6
+   data.tar.gz: 2c33050af6edcdc516611e7eb8e1efc5a497ecf5
+ SHA512:
+   metadata.gz: 6dc47d5c31e4629bb462ed353e31ec5e2b5b98fbf2a56363d87c8e9c9a8ed5a611341d88268f29b66b5b268acf4dce8e7766b7be0d6f189f1696544602d86d89
+   data.tar.gz: b939d2cf78e12054a92f8693ad35e5cc55efe75098df8b2e0c76c4347e3ad3763a15812a51968eba40c72b849c51b5373abddc9f0dfd92d49c3c2b1b85d59e84
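These checksums cover the two archives packed inside the published `.gem` file (a `.gem` is a plain tar archive containing `metadata.gz` and `data.tar.gz`). A minimal sketch of how the SHA512 values could be re-computed locally, assuming `robotx-0.1.0.gem` has already been downloaded to the current directory (the filename and location are assumptions, not part of this diff):

~~~ruby
require 'digest'
require 'rubygems/package'

# Read the downloaded .gem as a tar archive and digest the two entries
# that checksums.yaml describes.
File.open('robotx-0.1.0.gem', 'rb') do |gem_file|
  Gem::Package::TarReader.new(gem_file).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
  end
end
~~~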
data/.gitignore ADDED
@@ -0,0 +1,34 @@
+ *.gem
+ *.rbc
+ /.config
+ /coverage/
+ /InstalledFiles
+ /pkg/
+ /spec/reports/
+ /test/tmp/
+ /test/version_tmp/
+ /tmp/
+
+ ## Specific to RubyMotion:
+ .dat*
+ .repl_history
+ build/
+
+ ## Documentation cache and generated files:
+ /.yardoc/
+ /_yardoc/
+ /doc/
+ /rdoc/
+
+ ## Environment normalisation:
+ /.bundle/
+ /lib/bundler/man/
+
+ # for a library or gem, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # Gemfile.lock
+ # .ruby-version
+ # .ruby-gemset
+
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+ .rvmrc
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in robotx.gemspec
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,17 @@
+ PATH
+   remote: .
+   specs:
+     robotx (0.1.0)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     rake (10.3.2)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bundler (~> 1.6)
+   rake
+   robotx!
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2014
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,72 @@
+ # Robotx
+ Robotx _(pronounced "robotex")_ is a simple but powerful parser for robots.txt files.
+ It offers a set of features that let you check whether a URL is allowed or disallowed to be visited by a crawler.
+
+
+ ## Features
+
+ - Maintains lists of allowed/disallowed URLs
+ - Simple method to check whether a URL or just a path is allowed to be visited
+ - Show all user agents covered by the robots.txt
+ - Get the 'Crawl-Delay' for a website
+ - Support for sitemap(s)
+
+ ## Installation
+ ### With Bundler
+ Just add it to your Gemfile:
+ ~~~ruby
+ gem 'robotx'
+ ~~~
+
+ ### Without Bundler
+ If you're not using Bundler, just run on your command line:
+ ~~~bash
+ $ gem install robotx
+ ~~~
+
+ ## Usage
+ ### Support for different user agents
+ Robotx can be initialized with a specific user agent. The default user agent is `*`.
+ **Please note:** All method results depend on the user agent Robotx was initialized with.
+ ~~~ruby
+ require 'robotx'
+
+ # Initialize with the default user agent '*'
+ robots_txt = Robotx.new('https://github.com')
+ robots_txt.allowed # => ["/humans.txt"]
+
+ # Initialize with 'googlebot' as user agent
+ robots_txt = Robotx.new('https://github.com', 'googlebot')
+ robots_txt.allowed # => ["/*/*/tree/master", "/*/*/blob/master"]
+ ~~~
+
+ ### Check whether a URL is allowed to be indexed
+ ~~~ruby
+ require 'robotx'
+
+ robots_txt = Robotx.new('https://github.com')
+ robots_txt.allowed?('/humans.txt') # => true
+ robots_txt.allowed?('/') # => false
+ ~~~
+
+ ### Get all allowed/disallowed URLs
+ ~~~ruby
+ require 'robotx'
+
+ robots_txt = Robotx.new('https://github.com')
+ robots_txt.allowed # => ["/humans.txt"]
+ robots_txt.disallowed # => ["/"]
+ ~~~
+
+ ### Get additional information
+ ~~~ruby
+ require 'robotx'
+
+ robots_txt = Robotx.new('https://github.com')
+ robots_txt.sitemap # => []
+ robots_txt.crawl_delay # => 0
+ robots_txt.user_agents # => ["googlebot", "baiduspider", ...]
+ ~~~
+
+ ## Todo
+ - Add tests
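The README above only documents the single-path form of `allowed?`, but the implementation in `data/lib/robotx.rb` (added further below) also accepts an Array or Set of paths and returns a Hash mapping each path to its permission. A minimal sketch, reusing the github.com example from the README; the result values simply mirror the single-path examples above and depend on the live robots.txt:

~~~ruby
require 'robotx'

robots_txt = Robotx.new('https://github.com')

# Passing an Array (or Set) returns a Hash of path => allowed?
robots_txt.allowed?(['/humans.txt', '/']) # => {"/humans.txt"=>true, "/"=>false}
~~~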
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/lib/robotx.rb ADDED
@@ -0,0 +1,114 @@
+ require 'timeout'
+ require 'stringio'
+ require 'open-uri'
+ require 'uri'
+ require 'set'
+
+ class Robotx
+
+   TIMEOUT = 30 # seconds
+
+   def initialize(uri, user_agent='*')
+     @uri = URI.parse(URI.encode(uri))
+     raise URI::InvalidURIError.new('scheme or host missing') unless @uri.scheme and @uri.host
+
+     @user_agent = user_agent.downcase
+     @robots_data = parse_robots_txt
+   end
+
+   def allowed
+     return disallowed.empty? ? ['/'] : @robots_data.fetch(@user_agent, {}).fetch('allow', ['/'])
+   end
+
+   def disallowed
+     return @robots_data.fetch(@user_agent, {}).fetch('disallow', [])
+   end
+
+   def allowed?(data)
+     if data.is_a?(Array) or data.is_a?(Set)
+       return {}.tap do |hash|
+         data.each do |uri|
+           hash[uri] = check_permission(uri)
+         end
+       end
+     end
+
+     return check_permission(data)
+   end
+
+   def sitemap
+     return @robots_data.fetch('sitemap', [])
+   end
+
+   def crawl_delay
+     return [@robots_data.fetch(@user_agent, {}).fetch('crawl-delay', 0), 0].max
+   end
+
+   def user_agents
+     return @robots_data.keys.delete_if { |agent| agent == 'sitemap' }
+   end
+
+   private
+
+   def load_robots_txt
+     Timeout::timeout(Robotx::TIMEOUT) do
+       if robots_txt_io = URI.join(@uri, 'robots.txt').open('User-Agent' => @user_agent) and robots_txt_io.content_type.downcase == 'text/plain' and robots_txt_io.status == ['200', 'OK']
+         return robots_txt_io
+       end
+       raise OpenURI::HTTPError
+     end
+   rescue
+     return StringIO.new("User-agent: *\nAllow: /\n")
+   end
+
+   def parse_robots_txt
+     agent = '*'
+     {}.tap do |hash|
+       load_robots_txt.each do |line|
+         next if line =~ /^\s*(#.*|$)/
+
+         data = line.split(/:/).map(&:strip)
+         key = data.shift
+         value = data.join
+
+         case key.downcase
+         when 'user-agent'
+           agent = value.downcase
+           hash[agent] ||= {}
+         when 'allow'
+           hash[agent]['allow'] ||= []
+           hash[agent]['allow'] << value.sub(/(\/){2,}$/, '')
+         when 'disallow'
+           # Disallow: '' means Allow: '/'
+           if value.empty?
+             hash[agent]['allow'] ||= []
+             hash[agent]['allow'] << '/'
+           else
+             hash[agent]['disallow'] ||= []
+             hash[agent]['disallow'] << value.sub(/(\/){2,}$/, '')
+           end
+         when 'crawl-delay'
+           hash[agent]['crawl-delay'] = value.to_i
+         when 'sitemap'
+           hash['sitemap'] ||= []
+           hash['sitemap'] << value.sub(/(\/){2,}$/, '')
+         else
+           hash[key] ||= []
+           hash[key] << value.sub(/(\/){2,}$/, '')
+         end
+       end
+     end
+   rescue
+     {}
+   end
+
+   def check_permission(uri)
+     uri = URI.parse(URI.encode(uri))
+     return true unless (@robots_data or @robots_data.any?) or (uri.scheme and uri.host)
+
+     uri_path = uri.path.sub(/(\/){2,}$/, '')
+     pattern = Regexp.compile("(^#{Regexp.escape(uri_path)}[\/]*$)|(^/$)")
+     return (@robots_data.fetch(@user_agent, {}).fetch('disallow', []).grep(pattern).empty? or @robots_data.fetch(@user_agent, {}).fetch('allow', []).grep(pattern).any?)
+   end
+
+ end
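For reference, `parse_robots_txt` above builds a nested Hash keyed by the lowercased user agent (sitemaps go into a top-level `'sitemap'` list). A rough sketch of the structure it would produce for a hypothetical robots.txt; the input and values below are illustrative only, not taken from any real site:

~~~ruby
# Hypothetical robots.txt:
#
#   User-agent: googlebot
#   Allow: /public
#   Disallow: /private
#   Crawl-delay: 2
#
# parse_robots_txt would yield roughly:
{
  'googlebot' => {
    'allow'       => ['/public'],
    'disallow'    => ['/private'],
    'crawl-delay' => 2
  }
}
~~~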
data/robotx.gemspec ADDED
@@ -0,0 +1,22 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+ Gem::Specification.new do |spec|
+   spec.name = "robotx"
+   spec.version = "0.1.0"
+   spec.authors = ["Matthias Kalb"]
+   spec.email = ["matthias.kalb@railsmechanic.de"]
+   spec.summary = %q{A parser for the robots.txt file}
+   spec.description = %q{A simple to use parser for the robots.txt file.}
+   spec.homepage = "https://github.com/railsmechanic/robotx"
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0")
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rake"
+ end
metadata ADDED
@@ -0,0 +1,80 @@
+ --- !ruby/object:Gem::Specification
+ name: robotx
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Matthias Kalb
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-07-04 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A simple to use parser for the robots.txt file.
+ email:
+ - matthias.kalb@railsmechanic.de
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE
+ - README.md
+ - Rakefile
+ - lib/robotx.rb
+ - robotx.gemspec
+ homepage: https://github.com/railsmechanic/robotx
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: A parser for the robots.txt file
+ test_files: []