spider 0.4.4 → 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/AUTHORS +17 -0
- data/CHANGES +16 -0
- data/LICENSE +21 -0
- data/{README → README.md} +73 -44
- data/lib/spider.rb +12 -29
- data/lib/spider/included_in_file.rb +32 -0
- data/lib/spider/included_in_memcached.rb +1 -24
- data/lib/spider/included_in_redis.rb +31 -0
- data/lib/spider/next_urls_in_sqs.rb +6 -29
- data/lib/spider/robot_rules.rb +61 -57
- data/lib/spider/spider_instance.rb +16 -35
- data/spec/spider/included_in_redis_spec.rb +43 -0
- data/spider.gemspec +5 -3
- metadata +38 -125
- data/doc/classes/BeStaticServerPages.html +0 -197
- data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
- data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
- data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
- data/doc/classes/IncludedInMemcached.html +0 -199
- data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
- data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
- data/doc/classes/LoopingServlet.html +0 -137
- data/doc/classes/LoopingServlet.src/M000037.html +0 -23
- data/doc/classes/NextUrlsInSQS.html +0 -204
- data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
- data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
- data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
- data/doc/classes/QueryServlet.html +0 -137
- data/doc/classes/QueryServlet.src/M000038.html +0 -19
- data/doc/classes/RobotRules.html +0 -175
- data/doc/classes/RobotRules.src/M000034.html +0 -19
- data/doc/classes/RobotRules.src/M000035.html +0 -67
- data/doc/classes/RobotRules.src/M000036.html +0 -24
- data/doc/classes/Spider.html +0 -170
- data/doc/classes/Spider.src/M000029.html +0 -21
- data/doc/classes/SpiderInstance.html +0 -345
- data/doc/classes/SpiderInstance.src/M000021.html +0 -18
- data/doc/classes/SpiderInstance.src/M000022.html +0 -22
- data/doc/classes/SpiderInstance.src/M000023.html +0 -22
- data/doc/classes/SpiderInstance.src/M000024.html +0 -24
- data/doc/classes/SpiderInstance.src/M000025.html +0 -18
- data/doc/classes/SpiderInstance.src/M000026.html +0 -18
- data/doc/classes/SpiderInstance.src/M000027.html +0 -18
- data/doc/classes/SpiderInstance.src/M000028.html +0 -18
- data/doc/created.rid +0 -1
- data/doc/files/README.html +0 -223
- data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
- data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
- data/doc/files/lib/spider/robot_rules_rb.html +0 -114
- data/doc/files/lib/spider/spider_instance_rb.html +0 -117
- data/doc/files/lib/spider_rb.html +0 -254
- data/doc/files/spec/spec_helper_rb.html +0 -196
- data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
- data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
- data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
- data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
- data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
- data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
- data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
- data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
- data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
- data/doc/files/spec/spider_spec_rb.html +0 -127
- data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
- data/doc/fr_class_index.html +0 -34
- data/doc/fr_file_index.html +0 -35
- data/doc/fr_method_index.html +0 -64
- data/doc/index.html +0 -24
- data/doc/rdoc-style.css +0 -208
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b87ed979c115546fa802f888fea0baf322f458be6e50e40f8ea8fd9ee392c8ac
|
4
|
+
data.tar.gz: eabe506949614a5622afa2def1da954d352d737dccadd0622d060be13c061115
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ab52efe227f19067dd52efb0333d5901d3725f4e88fa0b86942c12bd702efa6b3bf4ed72b03d5ce58b32a22c574b0ee31764fa5e37df07a57132c250bf6b0658
|
7
|
+
data.tar.gz: ede92b88eb09867c41c1f7fcca58c99a55e63f02bb54fa094ece40e8875240734aa9abdcaf043db003120a44b239198dc6d063779543927febb493116732b7f3
|
data/AUTHORS
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
The Ruby Spider Gem would not be what it is today without the help of
|
2
|
+
the following kind souls:
|
3
|
+
|
4
|
+
Alexandre Rousseau
|
5
|
+
Brian Campbell
|
6
|
+
Henri Cook
|
7
|
+
James Edward Gray II
|
8
|
+
Jeremy Evans
|
9
|
+
Joao Eriberto Mota Filho
|
10
|
+
John Buckley
|
11
|
+
John Nagro
|
12
|
+
Matt Horan
|
13
|
+
Marc (@brigriffin)
|
14
|
+
Mike Burns (original author)
|
15
|
+
Olle Jonsson
|
16
|
+
Sander van der Vliet
|
17
|
+
Stuart Yamartino
|
data/CHANGES
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
2018-04-23 v0.5.3
|
2
|
+
* release simply to add missing CHANGES notes
|
3
|
+
|
4
|
+
2018-04-23 v0.5.2
|
5
|
+
* fixed #2 thanks to @jeremyevans
|
6
|
+
* added Redis as cache wrapper thanks to @brigriffin
|
7
|
+
|
8
|
+
2016-09-04 v0.5.1
|
9
|
+
* added the ability to stop a crawl
|
10
|
+
|
11
|
+
2016-05-13 v0.5.0
|
12
|
+
* fixed #1 thanks to @eribertomota
|
13
|
+
* got it running on more recent versions of ruby
|
14
|
+
* cleaned up the docs a bit
|
15
|
+
* cleaned up the licensing and attribution
|
16
|
+
|
1
17
|
2009-05-21
|
2
18
|
* fixed an issue with robots.txt on ssl hosts
|
3
19
|
* fixed an issue with pulling robots.txt from disallowed hosts
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2007-2016 Spider Team Authors
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/{README → README.md}
RENAMED
@@ -1,66 +1,100 @@
|
|
1
1
|
|
2
|
-
Spider
|
3
|
-
|
2
|
+
# Spider
|
3
|
+
_a Web spidering library for Ruby. It handles the robots.txt,
|
4
|
+
scraping, collecting, and looping so that you can just handle the data._
|
4
5
|
|
5
|
-
|
6
|
+
## Examples
|
6
7
|
|
7
|
-
|
8
|
+
### Crawl the Web, loading each page in turn, until you run out of memory
|
8
9
|
|
10
|
+
```ruby
|
9
11
|
require 'spider'
|
10
|
-
Spider.start_at('http://
|
12
|
+
Spider.start_at('http://cashcats.biz/') {}
|
13
|
+
```
|
11
14
|
|
12
|
-
|
15
|
+
### To handle erroneous responses
|
13
16
|
|
17
|
+
```ruby
|
14
18
|
require 'spider'
|
15
|
-
Spider.start_at('http://
|
19
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
16
20
|
s.on :failure do |a_url, resp, prior_url|
|
17
21
|
puts "URL failed: #{a_url}"
|
18
22
|
puts " linked from #{prior_url}"
|
19
23
|
end
|
20
24
|
end
|
25
|
+
```
|
21
26
|
|
22
|
-
|
27
|
+
### Or handle successful responses
|
23
28
|
|
29
|
+
```ruby
|
24
30
|
require 'spider'
|
25
|
-
Spider.start_at('http://
|
31
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
26
32
|
s.on :success do |a_url, resp, prior_url|
|
27
33
|
puts "#{a_url}: #{resp.code}"
|
28
34
|
puts resp.body
|
29
35
|
puts
|
30
36
|
end
|
31
37
|
end
|
38
|
+
```
|
32
39
|
|
33
|
-
|
40
|
+
### Limit to just one domain
|
34
41
|
|
42
|
+
```ruby
|
35
43
|
require 'spider'
|
36
|
-
Spider.start_at('http://
|
44
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
37
45
|
s.add_url_check do |a_url|
|
38
|
-
a_url =~ %r{^http://
|
46
|
+
a_url =~ %r{^http://cashcats.biz.*}
|
39
47
|
end
|
40
48
|
end
|
49
|
+
```
|
41
50
|
|
42
|
-
|
51
|
+
### Pass headers to some requests
|
43
52
|
|
53
|
+
```ruby
|
44
54
|
require 'spider'
|
45
|
-
Spider.start_at('http://
|
55
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
46
56
|
s.setup do |a_url|
|
47
57
|
if a_url =~ %r{^http://.*wikipedia.*}
|
48
58
|
headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
49
59
|
end
|
50
60
|
end
|
51
61
|
end
|
62
|
+
```
|
52
63
|
|
53
|
-
|
64
|
+
### Use memcached to track cycles
|
54
65
|
|
66
|
+
```ruby
|
55
67
|
require 'spider'
|
56
68
|
require 'spider/included_in_memcached'
|
57
69
|
SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
|
58
|
-
Spider.start_at('http://
|
70
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
59
71
|
s.check_already_seen_with IncludedInMemcached.new(SERVERS)
|
60
72
|
end
|
73
|
+
```
|
61
74
|
|
62
|
-
|
75
|
+
### Use Redis to track cycles
|
63
76
|
|
77
|
+
```ruby
|
78
|
+
require 'spider'
|
79
|
+
require 'spider/included_in_redis'
|
80
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
81
|
+
s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
|
82
|
+
end
|
83
|
+
```
|
84
|
+
|
85
|
+
### Use a plain text file to track cycles
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
require 'spider'
|
89
|
+
require 'spider/included_in_file'
|
90
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
91
|
+
s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
|
92
|
+
end
|
93
|
+
```
|
94
|
+
|
95
|
+
### Track cycles with a custom object
|
96
|
+
|
97
|
+
```ruby
|
64
98
|
require 'spider'
|
65
99
|
class ExpireLinks < Hash
|
66
100
|
def <<(v)
|
@@ -71,54 +105,62 @@ scraping, collecting, and looping so that you can just handle the data.
|
|
71
105
|
end
|
72
106
|
end
|
73
107
|
|
74
|
-
Spider.start_at('http://
|
108
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
75
109
|
s.check_already_seen_with ExpireLinks.new
|
76
110
|
end
|
111
|
+
```
|
77
112
|
|
78
|
-
|
113
|
+
### Store nodes to visit with Amazon SQS
|
79
114
|
|
115
|
+
```ruby
|
80
116
|
require 'spider'
|
81
117
|
require 'spider/next_urls_in_sqs'
|
82
|
-
Spider.start_at('http://
|
118
|
+
Spider.start_at('http://cashcats.biz') do |s|
|
83
119
|
s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
|
84
120
|
end
|
121
|
+
```
|
85
122
|
|
86
|
-
|
123
|
+
### Store nodes to visit with a custom object
|
87
124
|
|
125
|
+
```ruby
|
88
126
|
require 'spider'
|
89
127
|
class MyArray < Array
|
90
128
|
def pop
|
91
|
-
|
129
|
+
super
|
92
130
|
end
|
93
|
-
|
131
|
+
|
94
132
|
def push(a_msg)
|
95
133
|
super(a_msg)
|
96
134
|
end
|
97
135
|
end
|
98
136
|
|
99
|
-
Spider.start_at('http://
|
137
|
+
Spider.start_at('http://cashcats.biz') do |s|
|
100
138
|
s.store_next_urls_with MyArray.new
|
101
139
|
end
|
140
|
+
```
|
102
141
|
|
103
|
-
|
142
|
+
### Create a URL graph
|
104
143
|
|
144
|
+
```ruby
|
105
145
|
require 'spider'
|
106
146
|
nodes = {}
|
107
|
-
Spider.start_at('http://
|
108
|
-
s.add_url_check {|a_url| a_url =~ %r{^http://
|
147
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
148
|
+
s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
|
109
149
|
|
110
150
|
s.on(:every) do |a_url, resp, prior_url|
|
111
151
|
nodes[prior_url] ||= []
|
112
152
|
nodes[prior_url] << a_url
|
113
153
|
end
|
114
154
|
end
|
155
|
+
```
|
115
156
|
|
116
|
-
|
157
|
+
### Use a proxy
|
117
158
|
|
159
|
+
```ruby
|
118
160
|
require 'net/http_configuration'
|
119
161
|
require 'spider'
|
120
162
|
http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
|
121
|
-
:proxy_port => 8881)
|
163
|
+
:proxy_port => 8881)
|
122
164
|
http_conf.apply do
|
123
165
|
Spider.start_at('http://img.4chan.org/b/') do |s|
|
124
166
|
s.on(:success) do |a_url, resp, prior_url|
|
@@ -128,19 +170,6 @@ scraping, collecting, and looping so that you can just handle the data.
|
|
128
170
|
end
|
129
171
|
end
|
130
172
|
end
|
173
|
+
```
|
131
174
|
|
132
|
-
|
133
|
-
|
134
|
-
John Nagro john.nagro@gmail.com
|
135
|
-
|
136
|
-
Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
|
137
|
-
|
138
|
-
Many thanks to:
|
139
|
-
Matt Horan
|
140
|
-
Henri Cook
|
141
|
-
Sander van der Vliet
|
142
|
-
John Buckley
|
143
|
-
Brian Campbell
|
144
|
-
|
145
|
-
With `robot_rules' from James Edward Gray II via
|
146
|
-
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
175
|
+
_Copyright (c) 2007-2016 Spider Team Authors_
|
data/lib/spider.rb
CHANGED
@@ -1,41 +1,24 @@
|
|
1
|
-
# Copyright 2007-2008 Mike Burns & John Nagro
|
2
|
-
# :include: README
|
3
|
-
|
4
|
-
# Redistribution and use in source and binary forms, with or without
|
5
|
-
# modification, are permitted provided that the following conditions are met:
|
6
|
-
# * Redistributions of source code must retain the above copyright
|
7
|
-
# notice, this list of conditions and the following disclaimer.
|
8
|
-
# * Redistributions in binary form must reproduce the above copyright
|
9
|
-
# notice, this list of conditions and the following disclaimer in the
|
10
|
-
# documentation and/or other materials provided with the distribution.
|
11
|
-
# * Neither the name Mike Burns nor the
|
12
|
-
# names of his contributors may be used to endorse or promote products
|
13
|
-
# derived from this software without specific prior written permission.
|
14
|
-
#
|
15
|
-
# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
|
16
|
-
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
-
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
-
# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
|
19
|
-
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
-
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
-
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
-
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
-
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
-
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
|
-
|
26
1
|
require File.dirname(__FILE__)+'/spider/spider_instance'
|
27
2
|
|
28
3
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
29
4
|
# links, and doing it all over again.
|
30
5
|
class Spider
|
6
|
+
|
7
|
+
VERSION_INFO = [0, 5, 4] unless defined?(self::VERSION_INFO)
|
8
|
+
VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
|
9
|
+
|
10
|
+
def self.version
|
11
|
+
VERSION
|
12
|
+
end
|
13
|
+
|
31
14
|
# Runs the spider starting at the given URL. Also takes a block that is given
|
32
15
|
# the SpiderInstance. Use the block to define the rules and handlers for
|
33
16
|
# the discovered Web pages. See SpiderInstance for the possible rules and
|
34
17
|
# handlers.
|
35
18
|
#
|
36
|
-
# Spider.start_at('http://
|
19
|
+
# Spider.start_at('http://cashcats.biz/') do |s|
|
37
20
|
# s.add_url_check do |a_url|
|
38
|
-
# a_url =~ %r{^http://
|
21
|
+
# a_url =~ %r{^http://cashcats.biz.*}
|
39
22
|
# end
|
40
23
|
#
|
41
24
|
# s.on 404 do |a_url, resp, prior_url|
|
@@ -52,8 +35,8 @@ class Spider
|
|
52
35
|
# end
|
53
36
|
|
54
37
|
def self.start_at(a_url, &block)
|
55
|
-
rules = RobotRules.new(
|
56
|
-
a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
|
38
|
+
rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
|
39
|
+
a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
|
57
40
|
block.call(a_spider)
|
58
41
|
a_spider.start!
|
59
42
|
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Use plain text file to track cycles.
|
2
|
+
|
3
|
+
# A specialized class using a plain text file to track items stored. It supports
|
4
|
+
# three operations: new, <<, and include? . Together these can be used to
|
5
|
+
# add items to the text file, then determine whether the item has been added.
|
6
|
+
#
|
7
|
+
# To use it with Spider use the check_already_seen_with method:
|
8
|
+
#
|
9
|
+
# Spider.start_at('http://example.com/') do |s|
|
10
|
+
# s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
|
11
|
+
# end
|
12
|
+
class IncludedInFile
|
13
|
+
# Construct a new IncludedInFile instance.
|
14
|
+
# @param filepath [String] path of the file in which crawled URLs are stored
|
15
|
+
def initialize(filepath)
|
16
|
+
@filepath = filepath
|
17
|
+
# create file if not exists
|
18
|
+
File.write(@filepath, '') unless File.file?(@filepath)
|
19
|
+
@urls = File.readlines(@filepath).map(&:chomp)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Add an item to the file and to the in-memory array of URLs.
|
23
|
+
def <<(v)
|
24
|
+
@urls << v.to_s
|
25
|
+
File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
|
26
|
+
end
|
27
|
+
|
28
|
+
# True if the item is in the file.
|
29
|
+
def include?(v)
|
30
|
+
@urls.include? v.to_s
|
31
|
+
end
|
32
|
+
end
|
@@ -1,32 +1,9 @@
|
|
1
1
|
# Use memcached to track cycles.
|
2
|
-
#
|
3
|
-
# Copyright 2007 Mike Burns
|
4
|
-
# Redistribution and use in source and binary forms, with or without
|
5
|
-
# modification, are permitted provided that the following conditions are met:
|
6
|
-
# * Redistributions of source code must retain the above copyright
|
7
|
-
# notice, this list of conditions and the following disclaimer.
|
8
|
-
# * Redistributions in binary form must reproduce the above copyright
|
9
|
-
# notice, this list of conditions and the following disclaimer in the
|
10
|
-
# documentation and/or other materials provided with the distribution.
|
11
|
-
# * Neither the name Mike Burns nor the
|
12
|
-
# names of his contributors may be used to endorse or promote products
|
13
|
-
# derived from this software without specific prior written permission.
|
14
|
-
#
|
15
|
-
# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
|
16
|
-
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
-
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
-
# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
|
19
|
-
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
-
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
-
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
-
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
-
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
-
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
2
|
|
26
3
|
require 'memcache'
|
27
4
|
|
28
5
|
# A specialized class using memcached to track items stored. It supports
|
29
|
-
# three operations: new, <<, and include? . Together these can be used to
|
6
|
+
# three operations: new, <<, and include? . Together these can be used to
|
30
7
|
# add items to the memcache, then determine whether the item has been added.
|
31
8
|
#
|
32
9
|
# To use it with Spider use the check_already_seen_with method:
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Use Redis to track cycles.
|
2
|
+
|
3
|
+
require 'redis'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# A specialized class using Redis to track items stored. It supports
|
7
|
+
# three operations: new, <<, and include? . Together these can be used to
|
8
|
+
# add items to Redis, then determine whether the item has been added.
|
9
|
+
#
|
10
|
+
# To use it with Spider use the check_already_seen_with method:
|
11
|
+
#
|
12
|
+
# Spider.start_at('http://example.com/') do |s|
|
13
|
+
# s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
|
14
|
+
# end
|
15
|
+
class IncludedInRedis
|
16
|
+
# Construct a new IncludedInRedis instance. All arguments here are
|
17
|
+
# passed to Redis (part of the redis gem).
|
18
|
+
def initialize(*a)
|
19
|
+
@c = Redis.new(*a)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Add an item to Redis
|
23
|
+
def <<(v)
|
24
|
+
@c.set(v.to_s, v.to_json)
|
25
|
+
end
|
26
|
+
|
27
|
+
# True if the item is in Redis
|
28
|
+
def include?(v)
|
29
|
+
@c.get(v.to_s) == v.to_json
|
30
|
+
end
|
31
|
+
end
|
@@ -1,34 +1,11 @@
|
|
1
1
|
# Use AmazonSQS to track nodes to visit.
|
2
|
-
#
|
3
|
-
# Copyright 2008 John Nagro
|
4
|
-
# Redistribution and use in source and binary forms, with or without
|
5
|
-
# modification, are permitted provided that the following conditions are met:
|
6
|
-
# * Redistributions of source code must retain the above copyright
|
7
|
-
# notice, this list of conditions and the following disclaimer.
|
8
|
-
# * Redistributions in binary form must reproduce the above copyright
|
9
|
-
# notice, this list of conditions and the following disclaimer in the
|
10
|
-
# documentation and/or other materials provided with the distribution.
|
11
|
-
# * Neither the name Mike Burns nor the
|
12
|
-
# names of his contributors may be used to endorse or promote products
|
13
|
-
# derived from this software without specific prior written permission.
|
14
|
-
#
|
15
|
-
# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
|
16
|
-
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
-
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
-
# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
|
19
|
-
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
-
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
-
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
-
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
-
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
-
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
2
|
|
26
3
|
require 'rubygems'
|
27
4
|
require 'right_aws'
|
28
5
|
require 'yaml'
|
29
6
|
|
30
7
|
# A specialized class using AmazonSQS to track nodes to walk. It supports
|
31
|
-
# two operations: push and pop . Together these can be used to
|
8
|
+
# two operations: push and pop . Together these can be used to
|
32
9
|
# add items to the queue, then pull items off the queue.
|
33
10
|
#
|
34
11
|
# This is useful if you want multiple Spider processes crawling the same
|
@@ -47,8 +24,8 @@ class NextUrlsInSQS
|
|
47
24
|
@sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
|
48
25
|
@queue = @sqs.queue(queue_name)
|
49
26
|
end
|
50
|
-
|
51
|
-
# Pull an item off the queue, loop until data is found. Data is
|
27
|
+
|
28
|
+
# Pull an item off the queue, loop until data is found. Data is
|
52
29
|
# encoded with YAML.
|
53
30
|
def pop
|
54
31
|
while true
|
@@ -57,10 +34,10 @@ class NextUrlsInSQS
|
|
57
34
|
sleep 5
|
58
35
|
end
|
59
36
|
end
|
60
|
-
|
37
|
+
|
61
38
|
# Put data on the queue. Data is encoded with YAML.
|
62
39
|
def push(a_msg)
|
63
40
|
encoded_message = YAML::dump(a_msg)
|
64
41
|
@queue.push(a_msg)
|
65
|
-
end
|
66
|
-
end
|
42
|
+
end
|
43
|
+
end
|