spider 0.4.4 → 0.5.0

Files changed (74)
  1. checksums.yaml +7 -0
  2. data/AUTHORS +12 -0
  3. data/CHANGES +6 -0
  4. data/LICENSE +21 -0
  5. data/{README → README.md} +50 -43
  6. data/lib/spider.rb +12 -29
  7. data/lib/spider/included_in_memcached.rb +1 -24
  8. data/lib/spider/next_urls_in_sqs.rb +6 -29
  9. data/lib/spider/robot_rules.rb +61 -57
  10. data/lib/spider/spider_instance.rb +8 -31
  11. data/spider.gemspec +4 -2
  12. metadata +33 -124
  13. data/doc/classes/BeStaticServerPages.html +0 -197
  14. data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
  15. data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
  16. data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
  17. data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
  18. data/doc/classes/IncludedInMemcached.html +0 -199
  19. data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
  20. data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
  21. data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
  22. data/doc/classes/LoopingServlet.html +0 -137
  23. data/doc/classes/LoopingServlet.src/M000037.html +0 -23
  24. data/doc/classes/NextUrlsInSQS.html +0 -204
  25. data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
  26. data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
  27. data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
  28. data/doc/classes/QueryServlet.html +0 -137
  29. data/doc/classes/QueryServlet.src/M000038.html +0 -19
  30. data/doc/classes/RobotRules.html +0 -175
  31. data/doc/classes/RobotRules.src/M000034.html +0 -19
  32. data/doc/classes/RobotRules.src/M000035.html +0 -67
  33. data/doc/classes/RobotRules.src/M000036.html +0 -24
  34. data/doc/classes/Spider.html +0 -170
  35. data/doc/classes/Spider.src/M000029.html +0 -21
  36. data/doc/classes/SpiderInstance.html +0 -345
  37. data/doc/classes/SpiderInstance.src/M000021.html +0 -18
  38. data/doc/classes/SpiderInstance.src/M000022.html +0 -22
  39. data/doc/classes/SpiderInstance.src/M000023.html +0 -22
  40. data/doc/classes/SpiderInstance.src/M000024.html +0 -24
  41. data/doc/classes/SpiderInstance.src/M000025.html +0 -18
  42. data/doc/classes/SpiderInstance.src/M000026.html +0 -18
  43. data/doc/classes/SpiderInstance.src/M000027.html +0 -18
  44. data/doc/classes/SpiderInstance.src/M000028.html +0 -18
  45. data/doc/created.rid +0 -1
  46. data/doc/files/README.html +0 -223
  47. data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
  48. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
  49. data/doc/files/lib/spider/robot_rules_rb.html +0 -114
  50. data/doc/files/lib/spider/spider_instance_rb.html +0 -117
  51. data/doc/files/lib/spider_rb.html +0 -254
  52. data/doc/files/spec/spec_helper_rb.html +0 -196
  53. data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
  54. data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
  55. data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
  56. data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
  57. data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
  58. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
  59. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
  60. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
  61. data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
  62. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
  63. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
  64. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
  65. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
  66. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
  67. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
  68. data/doc/files/spec/spider_spec_rb.html +0 -127
  69. data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
  70. data/doc/fr_class_index.html +0 -34
  71. data/doc/fr_file_index.html +0 -35
  72. data/doc/fr_method_index.html +0 -64
  73. data/doc/index.html +0 -24
  74. data/doc/rdoc-style.css +0 -208
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 1d6465ee9f80195a1002053f826f1b80187020a3
+   data.tar.gz: 1218142b1d76482cf5baccd1f288934cd7a6b003
+ SHA512:
+   metadata.gz: 2725ca0197ec2801836d94615e4ece0196c131a9ff500ed5837c22e320e06b33a8f609add7d41eabb8fa19114a60af71057b5bdebaf8f94e2be116148d6ad123
+   data.tar.gz: 5497c85e9759542ecb0cbb612484de0b185f7428c5a2c5222e1fbc7e1e3f69bac727bfddd883967c5eeb6c5bfaca0b9dfbe130eaaed35cc9e8cb96fb87abddc5
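The new `checksums.yaml` is the standard RubyGems integrity file: SHA1 and SHA512 digests of the `metadata.gz` and `data.tar.gz` archives packed inside the `.gem`. A minimal sketch of how those entries could be re-checked, assuming the two archives have already been extracted from the gem into the current directory (file names as above, nothing spider-specific):

```ruby
require 'digest'
require 'yaml'

# Load the digests recorded in checksums.yaml (structure shown in the diff above).
checksums = YAML.safe_load(File.read('checksums.yaml'))

# Recompute and compare the SHA512 of each packaged archive.
%w[metadata.gz data.tar.gz].each do |name|
  actual   = Digest::SHA512.file(name).hexdigest
  expected = checksums['SHA512'][name]
  puts "#{name}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```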
data/AUTHORS ADDED
@@ -0,0 +1,12 @@
+ The Ruby Spider Gem would not be what it is today without the help of
+ the following kind souls:
+
+ Brian Campbell
+ Henri Cook
+ James Edward Gray II
+ Joao Eriberto Mota Filho
+ John Buckley
+ John Nagro
+ Mike Burns
+ Matt Horan
+ Sander van der Vliet
data/CHANGES CHANGED
@@ -1,3 +1,9 @@
+ 2016-05-13
+ * fixed #1 thanks to @eribertomota
+ * got it running on more recent versions of ruby
+ * cleaned up the docs a bit
+ * cleaned up the licensing and attribution
+
  2009-05-21
  * fixed an issue with robots.txt on ssl hosts
  * fixed an issue with pulling robots.txt from disallowed hosts
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2007-2016 Spider Team Authors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/{README → README.md} CHANGED
@@ -1,66 +1,80 @@
 
- Spider, a Web spidering library for Ruby. It handles the robots.txt,
- scraping, collecting, and looping so that you can just handle the data.
+ # Spider
+ _a Web spidering library for Ruby. It handles the robots.txt,
+ scraping, collecting, and looping so that you can just handle the data._
 
- == Examples
+ ## Examples
 
- === Crawl the Web, loading each page in turn, until you run out of memory
+ ### Crawl the Web, loading each page in turn, until you run out of memory
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') {}
+ Spider.start_at('http://cashcats.biz/') {}
+ ```
 
- === To handle erroneous responses
+ ### To handle erroneous responses
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
  s.on :failure do |a_url, resp, prior_url|
  puts "URL failed: #{a_url}"
  puts " linked from #{prior_url}"
  end
  end
+ ```
 
- === Or handle successful responses
+ ### Or handle successful responses
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
  s.on :success do |a_url, resp, prior_url|
  puts "#{a_url}: #{resp.code}"
  puts resp.body
  puts
  end
  end
+ ```
 
- === Limit to just one domain
+ ### Limit to just one domain
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
  s.add_url_check do |a_url|
- a_url =~ %r{^http://mike-burns.com.*}
+ a_url =~ %r{^http://cashcats.biz.*}
  end
  end
+ ```
 
- === Pass headers to some requests
+ ### Pass headers to some requests
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
  s.setup do |a_url|
  if a_url =~ %r{^http://.*wikipedia.*}
  headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
  end
  end
  end
+ ```
 
- === Use memcached to track cycles
+ ### Use memcached to track cycles
 
+ ```ruby
  require 'spider'
  require 'spider/included_in_memcached'
  SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
  s.check_already_seen_with IncludedInMemcached.new(SERVERS)
  end
+ ```
 
- === Track cycles with a custom object
+ ### Track cycles with a custom object
 
+ ```ruby
  require 'spider'
  class ExpireLinks < Hash
  def <<(v)
@@ -71,50 +85,58 @@ scraping, collecting, and looping so that you can just handle the data.
  end
  end
 
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
  s.check_already_seen_with ExpireLinks.new
  end
+ ```
 
- === Store nodes to visit with Amazon SQS
+ ### Store nodes to visit with Amazon SQS
 
+ ```ruby
  require 'spider'
  require 'spider/next_urls_in_sqs'
- Spider.start_at('http://mike-burns.com') do |s|
+ Spider.start_at('http://cashcats.biz') do |s|
  s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
  end
+ ```
 
- ==== Store nodes to visit with a custom object
+ ### Store nodes to visit with a custom object
 
+ ```ruby
  require 'spider'
  class MyArray < Array
  def pop
  super
  end
-
+
  def push(a_msg)
  super(a_msg)
  end
  end
 
- Spider.start_at('http://mike-burns.com') do |s|
+ Spider.start_at('http://cashcats.biz') do |s|
  s.store_next_urls_with MyArray.new
  end
+ ```
 
- === Create a URL graph
+ ### Create a URL graph
 
+ ```ruby
  require 'spider'
  nodes = {}
- Spider.start_at('http://mike-burns.com/') do |s|
- s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+ Spider.start_at('http://cashcats.biz/') do |s|
+ s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
 
  s.on(:every) do |a_url, resp, prior_url|
  nodes[prior_url] ||= []
  nodes[prior_url] << a_url
  end
  end
+ ```
 
- === Use a proxy
+ ### Use a proxy
 
+ ```ruby
  require 'net/http_configuration'
  require 'spider'
  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
@@ -128,19 +150,4 @@ scraping, collecting, and looping so that you can just handle the data.
  end
  end
  end
-
- == Author
-
- John Nagro john.nagro@gmail.com
-
- Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
-
- Many thanks to:
- Matt Horan
- Henri Cook
- Sander van der Vliet
- John Buckley
- Brian Campbell
-
- With `robot_rules' from James Edward Gray II via
- http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
+ ```
data/lib/spider.rb CHANGED
@@ -1,41 +1,24 @@
- # Copyright 2007-2008 Mike Burns & John Nagro
- # :include: README
-
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name Mike Burns nor the
- # names of his contributors may be used to endorse or promote products
- # derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
  require File.dirname(__FILE__)+'/spider/spider_instance'
 
  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
  # links, and doing it all over again.
  class Spider
+
+ VERSION_INFO = [0, 5, 0] unless defined?(self::VERSION_INFO)
+ VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+
+ def self.version
+ VERSION
+ end
+
  # Runs the spider starting at the given URL. Also takes a block that is given
  # the SpiderInstance. Use the block to define the rules and handlers for
  # the discovered Web pages. See SpiderInstance for the possible rules and
  # handlers.
  #
- # Spider.start_at('http://mike-burns.com/') do |s|
+ # Spider.start_at('http://cashcats.biz/') do |s|
  # s.add_url_check do |a_url|
- # a_url =~ %r{^http://mike-burns.com.*}
+ # a_url =~ %r{^http://cashcats.biz.*}
  # end
  #
  # s.on 404 do |a_url, resp, prior_url|
@@ -52,8 +35,8 @@ class Spider
  # end
 
  def self.start_at(a_url, &block)
- rules = RobotRules.new('Ruby Spider 1.0')
- a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
+ rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
+ a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
  block.call(a_spider)
  a_spider.start!
  end
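Two behavioural changes are visible in this hunk: `Spider` now exposes its version (and uses it to build the robots.txt user agent), and `start_at` seeds `SpiderInstance` with `{nil => [a_url]}` rather than a bare URL. A small sketch of the unchanged public API against 0.5.0; the URL and checks here are placeholders, not from the gem's docs:

```ruby
require 'spider'

puts Spider.version   # => "0.5.0", from the new VERSION constant

# The block-based interface is the same as before; only the internals changed.
Spider.start_at('http://example.com/') do |s|
  s.add_url_check { |a_url| a_url =~ %r{^http://example\.com} }
  s.on(:success)  { |a_url, resp, _prior_url| puts "#{a_url}: #{resp.code}" }
end
```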
data/lib/spider/included_in_memcached.rb CHANGED
@@ -1,32 +1,9 @@
  # Use memcached to track cycles.
- #
- # Copyright 2007 Mike Burns
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name Mike Burns nor the
- # names of his contributors may be used to endorse or promote products
- # derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  require 'memcache'
 
  # A specialized class using memcached to track items stored. It supports
- # three operations: new, <<, and include? . Together these can be used to
+ # three operations: new, <<, and include? . Together these can be used to
  # add items to the memcache, then determine whether the item has been added.
  #
  # To use it with Spider use the check_already_seen_with method:
data/lib/spider/next_urls_in_sqs.rb CHANGED
@@ -1,34 +1,11 @@
  # Use AmazonSQS to track nodes to visit.
- #
- # Copyright 2008 John Nagro
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name Mike Burns nor the
- # names of his contributors may be used to endorse or promote products
- # derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  require 'rubygems'
  require 'right_aws'
  require 'yaml'
 
  # A specialized class using AmazonSQS to track nodes to walk. It supports
- # two operations: push and pop . Together these can be used to
+ # two operations: push and pop . Together these can be used to
  # add items to the queue, then pull items off the queue.
  #
  # This is useful if you want multiple Spider processes crawling the same
@@ -47,8 +24,8 @@ class NextUrlsInSQS
  @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
  @queue = @sqs.queue(queue_name)
  end
-
- # Pull an item off the queue, loop until data is found. Data is
+
+ # Pull an item off the queue, loop until data is found. Data is
  # encoded with YAML.
  def pop
  while true
@@ -57,10 +34,10 @@
  sleep 5
  end
  end
-
+
  # Put data on the queue. Data is encoded with YAML.
  def push(a_msg)
  encoded_message = YAML::dump(a_msg)
  @queue.push(a_msg)
- end
- end
+ end
+ end
data/lib/spider/robot_rules.rb CHANGED
@@ -1,77 +1,81 @@
- # Understand robots.txt.
+ #!/usr/local/bin/ruby -w
 
+ # robot_rules.rb
+ #
  # Created by James Edward Gray II on 2006-01-31.
  # Copyright 2006 Gray Productions. All rights reserved.
+ # https://github.com/eribertomota/robot_rules.rb
+ # https://github.com/johnnagro/spider/issues/1
 
  require "uri"
 
  # Based on Perl's WWW::RobotRules module, by Gisle Aas.
  class RobotRules
- def initialize( user_agent )
- @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*}, "").downcase
- @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
- end
+ def initialize( user_agent )
+ @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+ "").downcase
+ @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+ end
 
- def parse( text_uri, robots_data )
- uri = URI.parse(text_uri)
- location = "#{uri.host}:#{uri.port}"
- @rules.delete(location)
+ def parse( text_uri, robots_data )
+ uri = URI.parse(text_uri)
+ location = "#{uri.host}:#{uri.port}"
+ @rules.delete(location)
 
- rules = robots_data.split(/[\015\012]+/).map do |rule|
- rule.sub(/\s*#.*$/, "")
- end
- anon_rules = Array.new
- my_rules = Array.new
- current = anon_rules
- rules.each do |rule|
- case rule
- when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
- break unless my_rules.empty?
+ rules = robots_data.split(/[\015\012]+/).
+ map { |rule| rule.sub(/\s*#.*$/, "") }
+ anon_rules = Array.new
+ my_rules = Array.new
+ current = anon_rules
+ rules.each do |rule|
+ case rule
+ when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+ break unless my_rules.empty?
 
- current = if $1 == "*"
- anon_rules
- elsif $1.downcase.index(@user_agent)
- my_rules
- else
- nil
- end
- when /^\s*Disallow\s*:\s*(.*?)\s*$/i
- next if current.nil?
+ current = if $1 == "*"
+ anon_rules
+ elsif $1.downcase.index(@user_agent)
+ my_rules
+ else
+ nil
+ end
+ when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+ next if current.nil?
 
- if $1.empty?
- current << nil
- else
- disallow = URI.parse($1)
+ if $1.empty?
+ current << nil
+ else
+ disallow = URI.parse($1)
 
- next unless disallow.scheme.nil? or disallow.scheme ==
- uri.scheme
- next unless disallow.port.nil? or disallow.port == uri.port
- next unless disallow.host.nil? or
- disallow.host.downcase == uri.host.downcase
+ next unless disallow.scheme.nil? or disallow.scheme ==
+ uri.scheme
+ next unless disallow.port.nil? or disallow.port == uri.port
+ next unless disallow.host.nil? or
+ disallow.host.downcase == uri.host.downcase
 
- disallow = disallow.path
- disallow = "/" if disallow.empty?
- disallow = "/#{disallow}" unless disallow[0] == ?/
+ disallow = disallow.path
+ disallow = "/" if disallow.empty?
+ disallow = "/#{disallow}" unless disallow[0] == ?/
 
- current << disallow
- end
- end
- end
+ current << disallow
+ end
+ end
+ end
 
- @rules[location] = if my_rules.empty?
- anon_rules.compact
- else
- my_rules.compact
- end
- end
+ @rules[location] = if my_rules.empty?
+ anon_rules.compact
+ else
+ my_rules.compact
+ end
+ end
 
- def allowed?( text_uri )
- uri = URI.parse(text_uri)
- location = "#{uri.host}:#{uri.port}"
- path = uri.path
+ def allowed?( text_uri )
+ uri = URI.parse(text_uri)
+ location = "#{uri.host}:#{uri.port}"
+ path = uri.path
 
- return true unless %w{http https}.include?(uri.scheme)
+ return true unless %w{http https}.include?(uri.scheme)
 
- not @rules[location].any? { |rule| path.index(rule) == 0 }
- end
+ not @rules[location].any? { |rule| path.index(rule) == 0 }
+ end
  end
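The rewritten `robot_rules.rb` keeps the original three-method interface (`new`, `parse`, `allowed?`), which Spider drives internally when it fetches each host's robots.txt. A standalone sketch of that interface, assuming `require 'spider'` makes `RobotRules` available and using a made-up host and robots.txt body:

```ruby
require 'spider'   # assumed to load RobotRules alongside the spider classes

rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")

# Hypothetical robots.txt body for example.com.
robots_txt = "User-Agent: *\nDisallow: /private\n"
rules.parse('http://example.com/robots.txt', robots_txt)

rules.allowed?('http://example.com/index.html')   # => true
rules.allowed?('http://example.com/private/faq')  # => false
```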