spider 0.4.4 → 0.5.0
- checksums.yaml +7 -0
- data/AUTHORS +12 -0
- data/CHANGES +6 -0
- data/LICENSE +21 -0
- data/{README → README.md} +50 -43
- data/lib/spider.rb +12 -29
- data/lib/spider/included_in_memcached.rb +1 -24
- data/lib/spider/next_urls_in_sqs.rb +6 -29
- data/lib/spider/robot_rules.rb +61 -57
- data/lib/spider/spider_instance.rb +8 -31
- data/spider.gemspec +4 -2
- metadata +33 -124
- data/doc/classes/BeStaticServerPages.html +0 -197
- data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
- data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
- data/doc/classes/IncludedInMemcached.html +0 -199
- data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
- data/doc/classes/LoopingServlet.html +0 -137
- data/doc/classes/LoopingServlet.src/M000037.html +0 -23
- data/doc/classes/NextUrlsInSQS.html +0 -204
- data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
- data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
- data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
- data/doc/classes/QueryServlet.html +0 -137
- data/doc/classes/QueryServlet.src/M000038.html +0 -19
- data/doc/classes/RobotRules.html +0 -175
- data/doc/classes/RobotRules.src/M000034.html +0 -19
- data/doc/classes/RobotRules.src/M000035.html +0 -67
- data/doc/classes/RobotRules.src/M000036.html +0 -24
- data/doc/classes/Spider.html +0 -170
- data/doc/classes/Spider.src/M000029.html +0 -21
- data/doc/classes/SpiderInstance.html +0 -345
- data/doc/classes/SpiderInstance.src/M000021.html +0 -18
- data/doc/classes/SpiderInstance.src/M000022.html +0 -22
- data/doc/classes/SpiderInstance.src/M000023.html +0 -22
- data/doc/classes/SpiderInstance.src/M000024.html +0 -24
- data/doc/classes/SpiderInstance.src/M000025.html +0 -18
- data/doc/classes/SpiderInstance.src/M000026.html +0 -18
- data/doc/classes/SpiderInstance.src/M000027.html +0 -18
- data/doc/classes/SpiderInstance.src/M000028.html +0 -18
- data/doc/created.rid +0 -1
- data/doc/files/README.html +0 -223
- data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
- data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
- data/doc/files/lib/spider/robot_rules_rb.html +0 -114
- data/doc/files/lib/spider/spider_instance_rb.html +0 -117
- data/doc/files/lib/spider_rb.html +0 -254
- data/doc/files/spec/spec_helper_rb.html +0 -196
- data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
- data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
- data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
- data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
- data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
- data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
- data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
- data/doc/files/spec/spider_spec_rb.html +0 -127
- data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
- data/doc/fr_class_index.html +0 -34
- data/doc/fr_file_index.html +0 -35
- data/doc/fr_method_index.html +0 -64
- data/doc/index.html +0 -24
- data/doc/rdoc-style.css +0 -208
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 1d6465ee9f80195a1002053f826f1b80187020a3
+  data.tar.gz: 1218142b1d76482cf5baccd1f288934cd7a6b003
+SHA512:
+  metadata.gz: 2725ca0197ec2801836d94615e4ece0196c131a9ff500ed5837c22e320e06b33a8f609add7d41eabb8fa19114a60af71057b5bdebaf8f94e2be116148d6ad123
+  data.tar.gz: 5497c85e9759542ecb0cbb612484de0b185f7428c5a2c5222e1fbc7e1e3f69bac727bfddd883967c5eeb6c5bfaca0b9dfbe130eaaed35cc9e8cb96fb87abddc5
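RubyGems checks these digests when the gem is installed. If you want to verify a downloaded copy by hand, a minimal sketch follows; it assumes you have unpacked spider-0.5.0.gem with `tar -xf` (a .gem is a plain tar archive), leaving metadata.gz and data.tar.gz in the current directory:

```ruby
# Minimal verification sketch: the printed digests should match the
# SHA512 values recorded in checksums.yaml above.
require 'digest'

%w[metadata.gz data.tar.gz].each do |entry|
  puts "#{entry}: #{Digest::SHA512.file(entry).hexdigest}"
end
```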
data/AUTHORS
ADDED
@@ -0,0 +1,12 @@
+The Ruby Spider Gem would not be what it is today without the help of
+the following kind souls:
+
+Brian Campbell
+Henri Cook
+James Edward Gray II
+Joao Eriberto Mota Filho
+John Buckley
+John Nagro
+Mike Burns
+Matt Horan
+Sander van der Vliet
data/CHANGES
CHANGED
@@ -1,3 +1,9 @@
+2016-05-13
+ * fixed #1 thanks to @eribertomota
+ * got it running on more recent versions of ruby
+ * cleaned up the docs a bit
+ * cleaned up the licensing and attribution
+
 2009-05-21
  * fixed an issue with robots.txt on ssl hosts
  * fixed an issue with pulling robots.txt from disallowed hosts
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2007-2016 Spider Team Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/{README → README.md}
RENAMED
@@ -1,66 +1,80 @@
 
-Spider
-
+# Spider
+_a Web spidering library for Ruby. It handles the robots.txt,
+scraping, collecting, and looping so that you can just handle the data._
 
-
+## Examples
 
-
+### Crawl the Web, loading each page in turn, until you run out of memory
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') {}
+```
 
-
+### To handle erroneous responses
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.on :failure do |a_url, resp, prior_url|
     puts "URL failed: #{a_url}"
     puts " linked from #{prior_url}"
   end
 end
+```
 
-
+### Or handle successful responses
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.on :success do |a_url, resp, prior_url|
     puts "#{a_url}: #{resp.code}"
     puts resp.body
     puts
   end
 end
+```
 
-
+### Limit to just one domain
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.add_url_check do |a_url|
-    a_url =~ %r{^http://
+    a_url =~ %r{^http://cashcats.biz.*}
   end
 end
+```
 
-
+### Pass headers to some requests
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.setup do |a_url|
     if a_url =~ %r{^http://.*wikipedia.*}
      headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    end
  end
 end
+```
 
-
+### Use memcached to track cycles
 
+```ruby
 require 'spider'
 require 'spider/included_in_memcached'
 SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
 end
+```
 
-
+### Track cycles with a custom object
 
+```ruby
 require 'spider'
 class ExpireLinks < Hash
   def <<(v)
@@ -71,50 +85,58 @@ scraping, collecting, and looping so that you can just handle the data.
   end
 end
 
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.check_already_seen_with ExpireLinks.new
 end
+```
 
-
+### Store nodes to visit with Amazon SQS
 
+```ruby
 require 'spider'
 require 'spider/next_urls_in_sqs'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz') do |s|
   s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
 end
+```
 
-
+### Store nodes to visit with a custom object
 
+```ruby
 require 'spider'
 class MyArray < Array
   def pop
     super
   end
-
+
   def push(a_msg)
     super(a_msg)
   end
 end
 
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz') do |s|
   s.store_next_urls_with MyArray.new
 end
+```
 
-
+### Create a URL graph
 
+```ruby
 require 'spider'
 nodes = {}
-Spider.start_at('http://
-s.add_url_check {|a_url| a_url =~ %r{^http://
+Spider.start_at('http://cashcats.biz/') do |s|
+  s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
 
   s.on(:every) do |a_url, resp, prior_url|
    nodes[prior_url] ||= []
    nodes[prior_url] << a_url
  end
 end
+```
 
-
+### Use a proxy
 
+```ruby
 require 'net/http_configuration'
 require 'spider'
 http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
@@ -128,19 +150,4 @@ scraping, collecting, and looping so that you can just handle the data.
     end
   end
 end
-
-== Author
-
-John Nagro john.nagro@gmail.com
-
-Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
-
-Many thanks to:
-Matt Horan
-Henri Cook
-Sander van der Vliet
-John Buckley
-Brian Campbell
-
-With `robot_rules' from James Edward Gray II via
-http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
+```
data/lib/spider.rb
CHANGED
@@ -1,41 +1,24 @@
-# Copyright 2007-2008 Mike Burns & John Nagro
-# :include: README
-
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
 require File.dirname(__FILE__)+'/spider/spider_instance'
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
 class Spider
+
+  VERSION_INFO = [0, 5, 0] unless defined?(self::VERSION_INFO)
+  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+
+  def self.version
+    VERSION
+  end
+
   # Runs the spider starting at the given URL. Also takes a block that is given
   # the SpiderInstance. Use the block to define the rules and handlers for
   # the discovered Web pages. See SpiderInstance for the possible rules and
   # handlers.
   #
-  #  Spider.start_at('http://
+  #  Spider.start_at('http://cashcats.biz/') do |s|
  #    s.add_url_check do |a_url|
-  #      a_url =~ %r{^http://
+  #      a_url =~ %r{^http://cashcats.biz.*}
  #    end
  #
  #    s.on 404 do |a_url, resp, prior_url|
@@ -52,8 +35,8 @@ class Spider
  #  end
 
  def self.start_at(a_url, &block)
-    rules = RobotRules.new(
-    a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
+    rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
+    a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
    block.call(a_spider)
    a_spider.start!
  end
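The new constants give the gem a single source of truth for its version, and start_at now passes a descriptive user agent to RobotRules and seeds the instance with an array of URLs rather than a bare string. Based only on the code above, version introspection works like this:

```ruby
require 'spider'

Spider::VERSION_INFO  # => [0, 5, 0]
Spider.version        # => "0.5.0"
```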
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,32 +1,9 @@
 # Use memcached to track cycles.
-#
-# Copyright 2007 Mike Burns
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'memcache'
 
 # A specialized class using memcached to track items stored. It supports
-# three operations: new, <<, and include? . Together these can be used to
+# three operations: new, <<, and include? . Together these can be used to
 # add items to the memcache, then determine whether the item has been added.
 #
 # To use it with Spider use the check_already_seen_with method:
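As the comment says, check_already_seen_with relies only on << and include?, so any duck-typed object works in place of IncludedInMemcached. A hypothetical in-process stand-in (not part of the gem) built on Ruby's Set:

```ruby
require 'set'

# Hypothetical tracker satisfying the same contract as IncludedInMemcached:
# << records a URL, include? answers whether it was recorded before.
class IncludedInSet
  def initialize
    @seen = Set.new
  end

  def <<(v)
    @seen << v.to_s
  end

  def include?(v)
    @seen.include?(v.to_s)
  end
end
```

Wire it in exactly as the README shows: `s.check_already_seen_with IncludedInSet.new`.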
data/lib/spider/next_urls_in_sqs.rb
CHANGED
@@ -1,34 +1,11 @@
 # Use AmazonSQS to track nodes to visit.
-#
-# Copyright 2008 John Nagro
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'rubygems'
 require 'right_aws'
 require 'yaml'
 
 # A specialized class using AmazonSQS to track nodes to walk. It supports
-# two operations: push and pop . Together these can be used to
+# two operations: push and pop . Together these can be used to
 # add items to the queue, then pull items off the queue.
 #
 # This is useful if you want multiple Spider processes crawling the same
@@ -47,8 +24,8 @@ class NextUrlsInSQS
     @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
     @queue = @sqs.queue(queue_name)
   end
-
-  # Pull an item off the queue, loop until data is found. Data is
+
+  # Pull an item off the queue, loop until data is found. Data is
   # encoded with YAML.
   def pop
     while true
@@ -57,10 +34,10 @@ class NextUrlsInSQS
       sleep 5
     end
   end
-
+
   # Put data on the queue. Data is encoded with YAML.
   def push(a_msg)
     encoded_message = YAML::dump(a_msg)
     @queue.push(a_msg)
-  end
-end
+  end
+end
data/lib/spider/robot_rules.rb
CHANGED
@@ -1,77 +1,81 @@
-
+#!/usr/local/bin/ruby -w
 
+# robot_rules.rb
+#
 # Created by James Edward Gray II on 2006-01-31.
 # Copyright 2006 Gray Productions. All rights reserved.
+# https://github.com/eribertomota/robot_rules.rb
+# https://github.com/johnnagro/spider/issues/1
 
 require "uri"
 
 # Based on Perl's WWW::RobotRules module, by Gisle Aas.
 class RobotRules
-
-
-
-
+  def initialize( user_agent )
+    @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                   "").downcase
+    @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+  end
 
-
-
-
-
+  def parse( text_uri, robots_data )
+    uri      = URI.parse(text_uri)
+    location = "#{uri.host}:#{uri.port}"
+    @rules.delete(location)
 
-
-
-
-
-
-
-
-
-    break unless my_rules.empty?
+    rules = robots_data.split(/[\015\012]+/).
+            map { |rule| rule.sub(/\s*#.*$/, "") }
+    anon_rules = Array.new
+    my_rules = Array.new
+    current = anon_rules
+    rules.each do |rule|
+      case rule
+      when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+        break unless my_rules.empty?
 
-
-
-
-
-
-
-
-
-
+        current = if $1 == "*"
+                    anon_rules
+                  elsif $1.downcase.index(@user_agent)
+                    my_rules
+                  else
+                    nil
+                  end
+      when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+        next if current.nil?
 
-
-
-
-
+        if $1.empty?
+          current << nil
+        else
+          disallow = URI.parse($1)
 
-
-
-
-
-
+          next unless disallow.scheme.nil? or disallow.scheme ==
+                                              uri.scheme
+          next unless disallow.port.nil? or disallow.port == uri.port
+          next unless disallow.host.nil? or
+                      disallow.host.downcase == uri.host.downcase
 
-
-
-
+          disallow = disallow.path
+          disallow = "/" if disallow.empty?
+          disallow = "/#{disallow}" unless disallow[0] == ?/
 
-
-
-
-
+          current << disallow
+        end
+      end
+    end
 
-
-
-
-
-
-
+    @rules[location] = if my_rules.empty?
+                         anon_rules.compact
+                       else
+                         my_rules.compact
+                       end
+  end
 
-
-
-
-
+  def allowed?( text_uri )
+    uri      = URI.parse(text_uri)
+    location = "#{uri.host}:#{uri.port}"
+    path     = uri.path
 
-
+    return true unless %w{http https}.include?(uri.scheme)
 
-
-
+    not @rules[location].any? { |rule| path.index(rule) == 0 }
+  end
 end