spider 0.4.4 → 0.5.0

This diff reflects the changes between two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (74)
  1. checksums.yaml +7 -0
  2. data/AUTHORS +12 -0
  3. data/CHANGES +6 -0
  4. data/LICENSE +21 -0
  5. data/{README → README.md} +50 -43
  6. data/lib/spider.rb +12 -29
  7. data/lib/spider/included_in_memcached.rb +1 -24
  8. data/lib/spider/next_urls_in_sqs.rb +6 -29
  9. data/lib/spider/robot_rules.rb +61 -57
  10. data/lib/spider/spider_instance.rb +8 -31
  11. data/spider.gemspec +4 -2
  12. metadata +33 -124
  13. data/doc/classes/BeStaticServerPages.html +0 -197
  14. data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
  15. data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
  16. data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
  17. data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
  18. data/doc/classes/IncludedInMemcached.html +0 -199
  19. data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
  20. data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
  21. data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
  22. data/doc/classes/LoopingServlet.html +0 -137
  23. data/doc/classes/LoopingServlet.src/M000037.html +0 -23
  24. data/doc/classes/NextUrlsInSQS.html +0 -204
  25. data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
  26. data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
  27. data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
  28. data/doc/classes/QueryServlet.html +0 -137
  29. data/doc/classes/QueryServlet.src/M000038.html +0 -19
  30. data/doc/classes/RobotRules.html +0 -175
  31. data/doc/classes/RobotRules.src/M000034.html +0 -19
  32. data/doc/classes/RobotRules.src/M000035.html +0 -67
  33. data/doc/classes/RobotRules.src/M000036.html +0 -24
  34. data/doc/classes/Spider.html +0 -170
  35. data/doc/classes/Spider.src/M000029.html +0 -21
  36. data/doc/classes/SpiderInstance.html +0 -345
  37. data/doc/classes/SpiderInstance.src/M000021.html +0 -18
  38. data/doc/classes/SpiderInstance.src/M000022.html +0 -22
  39. data/doc/classes/SpiderInstance.src/M000023.html +0 -22
  40. data/doc/classes/SpiderInstance.src/M000024.html +0 -24
  41. data/doc/classes/SpiderInstance.src/M000025.html +0 -18
  42. data/doc/classes/SpiderInstance.src/M000026.html +0 -18
  43. data/doc/classes/SpiderInstance.src/M000027.html +0 -18
  44. data/doc/classes/SpiderInstance.src/M000028.html +0 -18
  45. data/doc/created.rid +0 -1
  46. data/doc/files/README.html +0 -223
  47. data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
  48. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
  49. data/doc/files/lib/spider/robot_rules_rb.html +0 -114
  50. data/doc/files/lib/spider/spider_instance_rb.html +0 -117
  51. data/doc/files/lib/spider_rb.html +0 -254
  52. data/doc/files/spec/spec_helper_rb.html +0 -196
  53. data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
  54. data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
  55. data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
  56. data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
  57. data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
  58. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
  59. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
  60. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
  61. data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
  62. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
  63. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
  64. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
  65. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
  66. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
  67. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
  68. data/doc/files/spec/spider_spec_rb.html +0 -127
  69. data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
  70. data/doc/fr_class_index.html +0 -34
  71. data/doc/fr_file_index.html +0 -35
  72. data/doc/fr_method_index.html +0 -64
  73. data/doc/index.html +0 -24
  74. data/doc/rdoc-style.css +0 -208
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 1d6465ee9f80195a1002053f826f1b80187020a3
+   data.tar.gz: 1218142b1d76482cf5baccd1f288934cd7a6b003
+ SHA512:
+   metadata.gz: 2725ca0197ec2801836d94615e4ece0196c131a9ff500ed5837c22e320e06b33a8f609add7d41eabb8fa19114a60af71057b5bdebaf8f94e2be116148d6ad123
+   data.tar.gz: 5497c85e9759542ecb0cbb612484de0b185f7428c5a2c5222e1fbc7e1e3f69bac727bfddd883967c5eeb6c5bfaca0b9dfbe130eaaed35cc9e8cb96fb87abddc5
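These digests cover the two members of the `.gem` package, which is a plain tar archive. As a hedged sketch of local verification with Ruby's standard `digest` library, assuming the members have been extracted from `spider-0.5.0.gem`:

```ruby
require 'digest'

# checksums.yaml records SHA1 and SHA512 digests of metadata.gz and
# data.tar.gz, the two members of the gem package. After extracting them
# (e.g. `tar -xf spider-0.5.0.gem`), the computed digests should match
# the values above.
puts Digest::SHA1.file('metadata.gz').hexdigest
puts Digest::SHA512.file('data.tar.gz').hexdigest
```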
data/AUTHORS ADDED
@@ -0,0 +1,12 @@
+ The Ruby Spider Gem would not be what it is today without the help of
+ the following kind souls:
+
+ Brian Campbell
+ Henri Cook
+ James Edward Gray II
+ Joao Eriberto Mota Filho
+ John Buckley
+ John Nagro
+ Mike Burns
+ Matt Horan
+ Sander van der Vliet
data/CHANGES CHANGED
@@ -1,3 +1,9 @@
+ 2016-05-13
+ * fixed #1 thanks to @eribertomota
+ * got it running on more recent versions of ruby
+ * cleaned up the docs a bit
+ * cleaned up the licensing and attribution
+
  2009-05-21
  * fixed an issue with robots.txt on ssl hosts
  * fixed an issue with pulling robots.txt from disallowed hosts
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2007-2016 Spider Team Authors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/{README → README.md} RENAMED
@@ -1,66 +1,80 @@
 
- Spider, a Web spidering library for Ruby. It handles the robots.txt,
- scraping, collecting, and looping so that you can just handle the data.
+ # Spider
+ _a Web spidering library for Ruby. It handles the robots.txt,
+ scraping, collecting, and looping so that you can just handle the data._

- == Examples
+ ## Examples

- === Crawl the Web, loading each page in turn, until you run out of memory
+ ### Crawl the Web, loading each page in turn, until you run out of memory

+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') {}
+ Spider.start_at('http://cashcats.biz/') {}
+ ```

- === To handle erroneous responses
+ ### To handle erroneous responses

+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.on :failure do |a_url, resp, prior_url|
      puts "URL failed: #{a_url}"
      puts " linked from #{prior_url}"
    end
  end
+ ```

- === Or handle successful responses
+ ### Or handle successful responses

+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.on :success do |a_url, resp, prior_url|
      puts "#{a_url}: #{resp.code}"
      puts resp.body
      puts
    end
  end
+ ```

- === Limit to just one domain
+ ### Limit to just one domain

+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.add_url_check do |a_url|
-     a_url =~ %r{^http://mike-burns.com.*}
+     a_url =~ %r{^http://cashcats.biz.*}
    end
  end
+ ```

- === Pass headers to some requests
+ ### Pass headers to some requests

+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.setup do |a_url|
      if a_url =~ %r{^http://.*wikipedia.*}
        headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      end
    end
  end
+ ```

- === Use memcached to track cycles
+ ### Use memcached to track cycles

+ ```ruby
  require 'spider'
  require 'spider/included_in_memcached'
  SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.check_already_seen_with IncludedInMemcached.new(SERVERS)
  end
+ ```

- === Track cycles with a custom object
+ ### Track cycles with a custom object

+ ```ruby
  require 'spider'
  class ExpireLinks < Hash
    def <<(v)
@@ -71,50 +85,58 @@ scraping, collecting, and looping so that you can just handle the data.
    end
  end

- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.check_already_seen_with ExpireLinks.new
  end
+ ```

- === Store nodes to visit with Amazon SQS
+ ### Store nodes to visit with Amazon SQS

+ ```ruby
  require 'spider'
  require 'spider/next_urls_in_sqs'
- Spider.start_at('http://mike-burns.com') do |s|
+ Spider.start_at('http://cashcats.biz') do |s|
    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
  end
+ ```

- ==== Store nodes to visit with a custom object
+ ### Store nodes to visit with a custom object

+ ```ruby
  require 'spider'
  class MyArray < Array
    def pop
      super
    end
-
+
    def push(a_msg)
      super(a_msg)
    end
  end

- Spider.start_at('http://mike-burns.com') do |s|
+ Spider.start_at('http://cashcats.biz') do |s|
    s.store_next_urls_with MyArray.new
  end
+ ```

- === Create a URL graph
+ ### Create a URL graph

+ ```ruby
  require 'spider'
  nodes = {}
- Spider.start_at('http://mike-burns.com/') do |s|
-   s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+ Spider.start_at('http://cashcats.biz/') do |s|
+   s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }

    s.on(:every) do |a_url, resp, prior_url|
      nodes[prior_url] ||= []
      nodes[prior_url] << a_url
    end
  end
+ ```

- === Use a proxy
+ ### Use a proxy

+ ```ruby
  require 'net/http_configuration'
  require 'spider'
  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
@@ -128,19 +150,4 @@ scraping, collecting, and looping so that you can just handle the data.
      end
    end
  end
-
- == Author
-
- John Nagro john.nagro@gmail.com
-
- Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
-
- Many thanks to:
-   Matt Horan
-   Henri Cook
-   Sander van der Vliet
-   John Buckley
-   Brian Campbell
-
- With `robot_rules' from James Edward Gray II via
- http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
+ ```
data/lib/spider.rb CHANGED
@@ -1,41 +1,24 @@
- # Copyright 2007-2008 Mike Burns & John Nagro
- # :include: README
-
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #     * Redistributions of source code must retain the above copyright
- #       notice, this list of conditions and the following disclaimer.
- #     * Redistributions in binary form must reproduce the above copyright
- #       notice, this list of conditions and the following disclaimer in the
- #       documentation and/or other materials provided with the distribution.
- #     * Neither the name Mike Burns nor the
- #       names of his contributors may be used to endorse or promote products
- #       derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
  require File.dirname(__FILE__)+'/spider/spider_instance'

  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
  # links, and doing it all over again.
  class Spider
+
+   VERSION_INFO = [0, 5, 0] unless defined?(self::VERSION_INFO)
+   VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+
+   def self.version
+     VERSION
+   end
+
    # Runs the spider starting at the given URL. Also takes a block that is given
    # the SpiderInstance. Use the block to define the rules and handlers for
    # the discovered Web pages. See SpiderInstance for the possible rules and
    # handlers.
    #
-   # Spider.start_at('http://mike-burns.com/') do |s|
+   # Spider.start_at('http://cashcats.biz/') do |s|
    #   s.add_url_check do |a_url|
-   #     a_url =~ %r{^http://mike-burns.com.*}
+   #     a_url =~ %r{^http://cashcats.biz.*}
    #   end
    #
    #   s.on 404 do |a_url, resp, prior_url|
@@ -52,8 +35,8 @@ class Spider
    #   end

    def self.start_at(a_url, &block)
-     rules = RobotRules.new('Ruby Spider 1.0')
-     a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
+     rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
+     a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
      block.call(a_spider)
      a_spider.start!
    end
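Two behavioral changes are visible in `start_at` above: the robots.txt user agent now reports the real gem version, and the seed URL is wrapped in an array. A minimal sketch of how the new constants surface (the URL is a placeholder):

```ruby
require 'spider'

Spider.version   # => "0.5.0"
Spider::VERSION  # => "0.5.0"

# start_at now seeds SpiderInstance with {nil => [a_url]} (a list of next
# URLs keyed by prior URL) and identifies itself to robots.txt as
# "Ruby Spider 0.5.0".
Spider.start_at('http://example.com/') do |s|
  s.on(:success) { |a_url, resp, prior_url| puts "#{a_url}: #{resp.code}" }
end
```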
data/lib/spider/included_in_memcached.rb CHANGED
@@ -1,32 +1,9 @@
  # Use memcached to track cycles.
- #
- # Copyright 2007 Mike Burns
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #     * Redistributions of source code must retain the above copyright
- #       notice, this list of conditions and the following disclaimer.
- #     * Redistributions in binary form must reproduce the above copyright
- #       notice, this list of conditions and the following disclaimer in the
- #       documentation and/or other materials provided with the distribution.
- #     * Neither the name Mike Burns nor the
- #       names of his contributors may be used to endorse or promote products
- #       derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  require 'memcache'

  # A specialized class using memcached to track items stored. It supports
- # three operations: new, <<, and include? . Together these can be used to
+ # three operations: new, <<, and include? . Together these can be used to
  # add items to the memcache, then determine whether the item has been added.
  #
  # To use it with Spider use the check_already_seen_with method:
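The hunk ends at that usage pointer, but the contract it describes is small: anything answering to `<<` and `include?` works with `check_already_seen_with`. A minimal in-memory stand-in (hypothetical, backed by a Set rather than memcached):

```ruby
require 'set'

# Hypothetical in-memory equivalent of IncludedInMemcached. Spider only
# needs the two instance operations: << to record an item and include?
# to test whether it was recorded.
class IncludedInSet
  def initialize
    @seen = Set.new
  end

  def <<(v)
    @seen << v.to_s
  end

  def include?(v)
    @seen.include?(v.to_s)
  end
end

# Spider.start_at('http://example.com/') do |s|
#   s.check_already_seen_with IncludedInSet.new
# end
```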
data/lib/spider/next_urls_in_sqs.rb CHANGED
@@ -1,34 +1,11 @@
  # Use AmazonSQS to track nodes to visit.
- #
- # Copyright 2008 John Nagro
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #     * Redistributions of source code must retain the above copyright
- #       notice, this list of conditions and the following disclaimer.
- #     * Redistributions in binary form must reproduce the above copyright
- #       notice, this list of conditions and the following disclaimer in the
- #       documentation and/or other materials provided with the distribution.
- #     * Neither the name Mike Burns nor the
- #       names of his contributors may be used to endorse or promote products
- #       derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  require 'rubygems'
  require 'right_aws'
  require 'yaml'

  # A specialized class using AmazonSQS to track nodes to walk. It supports
- # two operations: push and pop . Together these can be used to
+ # two operations: push and pop . Together these can be used to
  # add items to the queue, then pull items off the queue.
  #
  # This is useful if you want multiple Spider processes crawling the same
@@ -47,8 +24,8 @@ class NextUrlsInSQS
      @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
      @queue = @sqs.queue(queue_name)
    end
-
-   # Pull an item off the queue, loop until data is found. Data is
+
+   # Pull an item off the queue, loop until data is found. Data is
    # encoded with YAML.
    def pop
      while true
@@ -57,10 +34,10 @@
        sleep 5
      end
    end
-
+
    # Put data on the queue. Data is encoded with YAML.
    def push(a_msg)
      encoded_message = YAML::dump(a_msg)
      @queue.push(a_msg)
-   end
- end
+   end
+ end
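The same two-method contract applies to `store_next_urls_with`. A minimal in-memory stand-in (hypothetical) that mirrors the YAML round-tripping NextUrlsInSQS performs on its SQS messages:

```ruby
require 'yaml'

# Hypothetical local queue with the push/pop surface Spider expects.
# Messages are YAML-encoded on push and decoded on pop, as in
# NextUrlsInSQS; unlike the SQS version, pop returns nil when the queue
# is empty instead of polling.
class NextUrlsInArray
  def initialize
    @queue = []
  end

  def push(a_msg)
    @queue.push(YAML.dump(a_msg))
  end

  def pop
    encoded = @queue.shift
    encoded && YAML.load(encoded)
  end
end

# Spider.start_at('http://example.com/') do |s|
#   s.store_next_urls_with NextUrlsInArray.new
# end
```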
data/lib/spider/robot_rules.rb CHANGED
@@ -1,77 +1,81 @@
- # Understand robots.txt.
+ #!/usr/local/bin/ruby -w

+ # robot_rules.rb
+ #
  # Created by James Edward Gray II on 2006-01-31.
  # Copyright 2006 Gray Productions. All rights reserved.
+ # https://github.com/eribertomota/robot_rules.rb
+ # https://github.com/johnnagro/spider/issues/1

  require "uri"

  # Based on Perl's WWW::RobotRules module, by Gisle Aas.
  class RobotRules
-   def initialize( user_agent )
-     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
-     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
-   end
+   def initialize( user_agent )
+     @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                    "").downcase
+     @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+   end

-   def parse( text_uri, robots_data )
-     uri = URI.parse(text_uri)
-     location = "#{uri.host}:#{uri.port}"
-     @rules.delete(location)
+   def parse( text_uri, robots_data )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     @rules.delete(location)

-     rules = robots_data.split(/[\015\012]+/).map do |rule|
-       rule.sub(/\s*#.*$/, "")
-     end
-     anon_rules = Array.new
-     my_rules = Array.new
-     current = anon_rules
-     rules.each do |rule|
-       case rule
-       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
-         break unless my_rules.empty?
+     rules = robots_data.split(/[\015\012]+/).
+             map { |rule| rule.sub(/\s*#.*$/, "") }
+     anon_rules = Array.new
+     my_rules = Array.new
+     current = anon_rules
+     rules.each do |rule|
+       case rule
+       when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+         break unless my_rules.empty?

-         current = if $1 == "*"
-           anon_rules
-         elsif $1.downcase.index(@user_agent)
-           my_rules
-         else
-           nil
-         end
-       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
-         next if current.nil?
+         current = if $1 == "*"
+                     anon_rules
+                   elsif $1.downcase.index(@user_agent)
+                     my_rules
+                   else
+                     nil
+                   end
+       when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+         next if current.nil?

-         if $1.empty?
-           current << nil
-         else
-           disallow = URI.parse($1)
+         if $1.empty?
+           current << nil
+         else
+           disallow = URI.parse($1)

-           next unless disallow.scheme.nil? or disallow.scheme ==
-             uri.scheme
-           next unless disallow.port.nil? or disallow.port == uri.port
-           next unless disallow.host.nil? or
-             disallow.host.downcase == uri.host.downcase
+           next unless disallow.scheme.nil? or disallow.scheme ==
+                       uri.scheme
+           next unless disallow.port.nil? or disallow.port == uri.port
+           next unless disallow.host.nil? or
+                       disallow.host.downcase == uri.host.downcase

-           disallow = disallow.path
-           disallow = "/" if disallow.empty?
-           disallow = "/#{disallow}" unless disallow[0] == ?/
+           disallow = disallow.path
+           disallow = "/" if disallow.empty?
+           disallow = "/#{disallow}" unless disallow[0] == ?/

-           current << disallow
-         end
-       end
-     end
+           current << disallow
+         end
+       end
+     end

-     @rules[location] = if my_rules.empty?
-       anon_rules.compact
-     else
-       my_rules.compact
-     end
-   end
+     @rules[location] = if my_rules.empty?
+                          anon_rules.compact
+                        else
+                          my_rules.compact
+                        end
+   end

-   def allowed?( text_uri )
-     uri = URI.parse(text_uri)
-     location = "#{uri.host}:#{uri.port}"
-     path = uri.path
+   def allowed?( text_uri )
+     uri = URI.parse(text_uri)
+     location = "#{uri.host}:#{uri.port}"
+     path = uri.path

-     return true unless %w{http https}.include?(uri.scheme)
+     return true unless %w{http https}.include?(uri.scheme)

-     not @rules[location].any? { |rule| path.index(rule) == 0 }
-   end
+     not @rules[location].any? { |rule| path.index(rule) == 0 }
+   end
  end
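The re-indentation leaves RobotRules' public surface untouched: construct it with a user-agent string, feed it a robots.txt body via `parse`, then query `allowed?`. A brief sketch (the URL and robots.txt content are illustrative):

```ruby
require 'spider/robot_rules'

rules = RobotRules.new('Ruby Spider 0.5.0')

# Parse a robots.txt body as fetched for a given site.
robots_txt = <<~ROBOTS
  User-Agent: *
  Disallow: /private
ROBOTS
rules.parse('http://example.com/robots.txt', robots_txt)

rules.allowed?('http://example.com/index.html')   # => true
rules.allowed?('http://example.com/private/faq')  # => false
```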