spider 0.4.4 → 0.5.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +7 -0
- data/AUTHORS +12 -0
- data/CHANGES +6 -0
- data/LICENSE +21 -0
- data/{README → README.md} +50 -43
- data/lib/spider.rb +12 -29
- data/lib/spider/included_in_memcached.rb +1 -24
- data/lib/spider/next_urls_in_sqs.rb +6 -29
- data/lib/spider/robot_rules.rb +61 -57
- data/lib/spider/spider_instance.rb +8 -31
- data/spider.gemspec +4 -2
- metadata +33 -124
- data/doc/classes/BeStaticServerPages.html +0 -197
- data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
- data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
- data/doc/classes/IncludedInMemcached.html +0 -199
- data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
- data/doc/classes/LoopingServlet.html +0 -137
- data/doc/classes/LoopingServlet.src/M000037.html +0 -23
- data/doc/classes/NextUrlsInSQS.html +0 -204
- data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
- data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
- data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
- data/doc/classes/QueryServlet.html +0 -137
- data/doc/classes/QueryServlet.src/M000038.html +0 -19
- data/doc/classes/RobotRules.html +0 -175
- data/doc/classes/RobotRules.src/M000034.html +0 -19
- data/doc/classes/RobotRules.src/M000035.html +0 -67
- data/doc/classes/RobotRules.src/M000036.html +0 -24
- data/doc/classes/Spider.html +0 -170
- data/doc/classes/Spider.src/M000029.html +0 -21
- data/doc/classes/SpiderInstance.html +0 -345
- data/doc/classes/SpiderInstance.src/M000021.html +0 -18
- data/doc/classes/SpiderInstance.src/M000022.html +0 -22
- data/doc/classes/SpiderInstance.src/M000023.html +0 -22
- data/doc/classes/SpiderInstance.src/M000024.html +0 -24
- data/doc/classes/SpiderInstance.src/M000025.html +0 -18
- data/doc/classes/SpiderInstance.src/M000026.html +0 -18
- data/doc/classes/SpiderInstance.src/M000027.html +0 -18
- data/doc/classes/SpiderInstance.src/M000028.html +0 -18
- data/doc/created.rid +0 -1
- data/doc/files/README.html +0 -223
- data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
- data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
- data/doc/files/lib/spider/robot_rules_rb.html +0 -114
- data/doc/files/lib/spider/spider_instance_rb.html +0 -117
- data/doc/files/lib/spider_rb.html +0 -254
- data/doc/files/spec/spec_helper_rb.html +0 -196
- data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
- data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
- data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
- data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
- data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
- data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
- data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
- data/doc/files/spec/spider_spec_rb.html +0 -127
- data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
- data/doc/fr_class_index.html +0 -34
- data/doc/fr_file_index.html +0 -35
- data/doc/fr_method_index.html +0 -64
- data/doc/index.html +0 -24
- data/doc/rdoc-style.css +0 -208
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 1d6465ee9f80195a1002053f826f1b80187020a3
+  data.tar.gz: 1218142b1d76482cf5baccd1f288934cd7a6b003
+SHA512:
+  metadata.gz: 2725ca0197ec2801836d94615e4ece0196c131a9ff500ed5837c22e320e06b33a8f609add7d41eabb8fa19114a60af71057b5bdebaf8f94e2be116148d6ad123
+  data.tar.gz: 5497c85e9759542ecb0cbb612484de0b185f7428c5a2c5222e1fbc7e1e3f69bac727bfddd883967c5eeb6c5bfaca0b9dfbe130eaaed35cc9e8cb96fb87abddc5
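The two archives named here, metadata.gz and data.tar.gz, are the members of the .gem tar package that RubyGems checksums. A minimal verification sketch, assuming a local copy named spider-0.5.0.gem (the filename is the assumption here):

```ruby
# Recompute the SHA512 digests above from a local copy of the package.
# A .gem file is a plain tar archive containing metadata.gz and data.tar.gz.
require 'digest'
require 'rubygems/package'

File.open('spider-0.5.0.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
  end
end
```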
data/AUTHORS
ADDED
@@ -0,0 +1,12 @@
+The Ruby Spider Gem would not be what it is today without the help of
+the following kind souls:
+
+Brian Campbell
+Henri Cook
+James Edward Gray II
+Joao Eriberto Mota Filho
+John Buckley
+John Nagro
+Mike Burns
+Matt Horan
+Sander van der Vliet
data/CHANGES
CHANGED
@@ -1,3 +1,9 @@
+2016-05-13
+ * fixed #1 thanks to @eribertomota
+ * got it running on more recent versions of ruby
+ * cleaned up the docs a bit
+ * cleaned up the licensing and attribution
+
 2009-05-21
  * fixed an issue with robots.txt on ssl hosts
  * fixed an issue with pulling robots.txt from disallowed hosts
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2007-2016 Spider Team Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/{README → README.md}
RENAMED
@@ -1,66 +1,80 @@
 
-Spider
-
+# Spider
+_a Web spidering library for Ruby. It handles the robots.txt,
+scraping, collecting, and looping so that you can just handle the data._
 
-
+## Examples
 
-
+### Crawl the Web, loading each page in turn, until you run out of memory
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') {}
+```
 
-
+### To handle erroneous responses
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.on :failure do |a_url, resp, prior_url|
     puts "URL failed: #{a_url}"
     puts " linked from #{prior_url}"
   end
 end
+```
 
-
+### Or handle successful responses
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.on :success do |a_url, resp, prior_url|
     puts "#{a_url}: #{resp.code}"
     puts resp.body
     puts
   end
 end
+```
 
-
+### Limit to just one domain
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.add_url_check do |a_url|
-    a_url =~ %r{^http://
+    a_url =~ %r{^http://cashcats.biz.*}
   end
 end
+```
 
-
+### Pass headers to some requests
 
+```ruby
 require 'spider'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.setup do |a_url|
     if a_url =~ %r{^http://.*wikipedia.*}
       headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
     end
   end
 end
+```
 
-
+### Use memcached to track cycles
 
+```ruby
 require 'spider'
 require 'spider/included_in_memcached'
 SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
 end
+```
 
-
+### Track cycles with a custom object
 
+```ruby
 require 'spider'
 class ExpireLinks < Hash
   def <<(v)
@@ -71,50 +85,58 @@ scraping, collecting, and looping so that you can just handle the data.
   end
 end
 
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz/') do |s|
   s.check_already_seen_with ExpireLinks.new
 end
+```
 
-
+### Store nodes to visit with Amazon SQS
 
+```ruby
 require 'spider'
 require 'spider/next_urls_in_sqs'
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz') do |s|
   s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
 end
+```
 
-
+### Store nodes to visit with a custom object
 
+```ruby
 require 'spider'
 class MyArray < Array
   def pop
     super
   end
-
+
   def push(a_msg)
     super(a_msg)
   end
 end
 
-Spider.start_at('http://
+Spider.start_at('http://cashcats.biz') do |s|
   s.store_next_urls_with MyArray.new
 end
+```
 
-
+### Create a URL graph
 
+```ruby
 require 'spider'
 nodes = {}
-Spider.start_at('http://
-  s.add_url_check {|a_url| a_url =~ %r{^http://
+Spider.start_at('http://cashcats.biz/') do |s|
+  s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
 
   s.on(:every) do |a_url, resp, prior_url|
     nodes[prior_url] ||= []
     nodes[prior_url] << a_url
   end
 end
+```
 
-
+### Use a proxy
 
+```ruby
 require 'net/http_configuration'
 require 'spider'
 http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
@@ -128,19 +150,4 @@ scraping, collecting, and looping so that you can just handle the data.
     end
   end
 end
-
-== Author
-
-John Nagro john.nagro@gmail.com
-
-Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
-
-Many thanks to:
-Matt Horan
-Henri Cook
-Sander van der Vliet
-John Buckley
-Brian Campbell
-
-With `robot_rules' from James Edward Gray II via
-http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
+```
data/lib/spider.rb
CHANGED
@@ -1,41 +1,24 @@
-# Copyright 2007-2008 Mike Burns & John Nagro
-# :include: README
-
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
 require File.dirname(__FILE__)+'/spider/spider_instance'
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
 class Spider
+
+  VERSION_INFO = [0, 5, 0] unless defined?(self::VERSION_INFO)
+  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+
+  def self.version
+    VERSION
+  end
+
   # Runs the spider starting at the given URL. Also takes a block that is given
   # the SpiderInstance. Use the block to define the rules and handlers for
   # the discovered Web pages. See SpiderInstance for the possible rules and
   # handlers.
   #
-  #  Spider.start_at('http://
+  #  Spider.start_at('http://cashcats.biz/') do |s|
   #    s.add_url_check do |a_url|
-  #      a_url =~ %r{^http://
+  #      a_url =~ %r{^http://cashcats.biz.*}
   #    end
   #
   #    s.on 404 do |a_url, resp, prior_url|
@@ -52,8 +35,8 @@
   #  end
 
   def self.start_at(a_url, &block)
-    rules = RobotRules.new(
-    a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
+    rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
+    a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
    block.call(a_spider)
     a_spider.start!
   end
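The additions above give the gem an introspectable version constant and seed the crawl with a one-element URL list ({nil => [a_url]}). A short usage sketch of the resulting API, with the URL and handler borrowed from the README examples:

```ruby
require 'spider'

puts Spider.version  # => "0.5.0"

# start_at builds the RobotRules and SpiderInstance shown above, hands
# the instance to the block, then crawls.
Spider.start_at('http://cashcats.biz/') do |s|
  s.on :success do |a_url, resp, prior_url|
    puts "#{a_url}: #{resp.code}"
  end
end
```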
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,32 +1,9 @@
 # Use memcached to track cycles.
-#
-# Copyright 2007 Mike Burns
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'memcache'
 
 # A specialized class using memcached to track items stored. It supports
-# three operations: new, <<, and include? . Together these can be used to
+# three operations: new, <<, and include? . Together these can be used to
 # add items to the memcache, then determine whether the item has been added.
 #
 # To use it with Spider use the check_already_seen_with method:
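As the surviving comment notes, the class only needs to support new, <<, and include?; any object with that interface can be passed to check_already_seen_with. A minimal sketch substituting Ruby's built-in Set (an assumption for illustration, not code from the gem):

```ruby
require 'set'
require 'spider'

# Set responds to << and include?, so it satisfies the same contract as
# IncludedInMemcached (though it will not survive a process restart).
Spider.start_at('http://cashcats.biz/') do |s|
  s.check_already_seen_with Set.new
end
```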
data/lib/spider/next_urls_in_sqs.rb
CHANGED
@@ -1,34 +1,11 @@
 # Use AmazonSQS to track nodes to visit.
-#
-# Copyright 2008 John Nagro
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name Mike Burns nor the
-# names of his contributors may be used to endorse or promote products
-# derived from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'rubygems'
 require 'right_aws'
 require 'yaml'
 
 # A specialized class using AmazonSQS to track nodes to walk. It supports
-# two operations: push and pop . Together these can be used to
+# two operations: push and pop . Together these can be used to
 # add items to the queue, then pull items off the queue.
 #
 # This is useful if you want multiple Spider processes crawling the same
@@ -47,8 +24,8 @@ class NextUrlsInSQS
     @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
     @queue = @sqs.queue(queue_name)
   end
-
-  # Pull an item off the queue, loop until data is found. Data is
+
+  # Pull an item off the queue, loop until data is found. Data is
   # encoded with YAML.
   def pop
     while true
@@ -57,10 +34,10 @@ class NextUrlsInSQS
       sleep 5
     end
   end
-
+
   # Put data on the queue. Data is encoded with YAML.
   def push(a_msg)
     encoded_message = YAML::dump(a_msg)
     @queue.push(a_msg)
-  end
-end
+  end
+end
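The comments above pin down the queue contract: push puts a YAML-encoded message on the queue, and pop loops until it can pull one off. A hypothetical in-memory stand-in with the same surface, for illustration only (NextUrlsInMemory is not part of the gem):

```ruby
require 'yaml'

# Hypothetical drop-in for NextUrlsInSQS: same push/pop surface, backed
# by an in-process Array instead of an SQS queue.
class NextUrlsInMemory
  def initialize
    @queue = []
  end

  # Put data on the queue, encoded with YAML.
  def push(a_msg)
    @queue.push(YAML.dump(a_msg))
  end

  # Pull the next item off the queue and decode it; nil when empty.
  def pop
    encoded_message = @queue.shift
    encoded_message && YAML.load(encoded_message)
  end
end
```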
data/lib/spider/robot_rules.rb
CHANGED
@@ -1,77 +1,81 @@
-
+#!/usr/local/bin/ruby -w
 
+# robot_rules.rb
+#
 # Created by James Edward Gray II on 2006-01-31.
 # Copyright 2006 Gray Productions. All rights reserved.
+# https://github.com/eribertomota/robot_rules.rb
+# https://github.com/johnnagro/spider/issues/1
 
 require "uri"
 
 # Based on Perl's WWW::RobotRules module, by Gisle Aas.
 class RobotRules
-
-
-
-
+  def initialize( user_agent )
+    @user_agent = user_agent.scan(/\S+/).first.sub(%r{/.*},
+                                                   "").downcase
+    @rules = Hash.new { |rules, rule| rules[rule] = Array.new }
+  end
 
-
-
-
-
+  def parse( text_uri, robots_data )
+    uri = URI.parse(text_uri)
+    location = "#{uri.host}:#{uri.port}"
+    @rules.delete(location)
 
-
-
-
-
-
-
-
-
-
-break unless my_rules.empty?
+    rules = robots_data.split(/[\015\012]+/).
+            map { |rule| rule.sub(/\s*#.*$/, "") }
+    anon_rules = Array.new
+    my_rules = Array.new
+    current = anon_rules
+    rules.each do |rule|
+      case rule
+      when /^\s*User-Agent\s*:\s*(.+?)\s*$/i
+        break unless my_rules.empty?
 
-
-
-
-
-
-
-
-
-
+        current = if $1 == "*"
+                    anon_rules
+                  elsif $1.downcase.index(@user_agent)
+                    my_rules
+                  else
+                    nil
+                  end
+      when /^\s*Disallow\s*:\s*(.*?)\s*$/i
+        next if current.nil?
 
-
-
-
-
+        if $1.empty?
+          current << nil
+        else
+          disallow = URI.parse($1)
 
-
-
-
-
-
+          next unless disallow.scheme.nil? or disallow.scheme ==
+                      uri.scheme
+          next unless disallow.port.nil? or disallow.port == uri.port
+          next unless disallow.host.nil? or
+                      disallow.host.downcase == uri.host.downcase
 
-
-
-
+          disallow = disallow.path
+          disallow = "/" if disallow.empty?
+          disallow = "/#{disallow}" unless disallow[0] == ?/
 
-
-
-
-
+          current << disallow
+        end
+      end
+    end
 
-
-
-
-
-
-
+    @rules[location] = if my_rules.empty?
+                         anon_rules.compact
+                       else
+                         my_rules.compact
+                       end
+  end
 
-
-
-
-
+  def allowed?( text_uri )
+    uri = URI.parse(text_uri)
+    location = "#{uri.host}:#{uri.port}"
+    path = uri.path
 
-
+    return true unless %w{http https}.include?(uri.scheme)
 
-
-
+    not @rules[location].any? { |rule| path.index(rule) == 0 }
+  end
 end
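A standalone sketch of the rewritten class, constructed the same way Spider.start_at does above; the robots.txt body and URLs are illustrative:

```ruby
require 'spider/robot_rules'

# Same construction Spider.start_at uses above.
rules = RobotRules.new("Ruby Spider 0.5.0")

# Feed it a robots.txt body fetched for the host (this body is made up).
rules.parse("http://cashcats.biz/robots.txt",
            "User-Agent: *\nDisallow: /admin\n")

rules.allowed?("http://cashcats.biz/")      # => true
rules.allowed?("http://cashcats.biz/admin") # => false
```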
|