spider 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +7 -0
- data/README +1 -1
- data/doc/classes/IncludedInMemcached.html +14 -5
- data/doc/classes/SpiderInstance.html +9 -9
- data/doc/created.rid +1 -1
- data/doc/files/README.html +2 -2
- data/doc/files/lib/{included_in_memcached_rb.html → spider/included_in_memcached_rb.html} +9 -3
- data/doc/files/lib/{spider_instance_rb.html → spider/spider_instance_rb.html} +4 -4
- data/doc/files/lib/spider_rb.html +2 -2
- data/doc/fr_file_index.html +2 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_memcached.rb +52 -0
- data/lib/{robot_rules.rb → spider/robot_rules.rb} +2 -0
- data/lib/{spider_instance.rb → spider/spider_instance.rb} +22 -18
- data/spec/spec_helper.rb +90 -0
- data/spec/{included_in_memcached_spec.rb → spider/included_in_memcached_spec.rb} +2 -3
- data/spec/{spider_instance_spec.rb → spider/spider_instance_spec.rb} +35 -57
- data/spec/spider_spec.rb +29 -6
- data/spider.gemspec +1 -1
- metadata +13 -18
- data/lib/included_in_memcached.rb +0 -22
- data/test_server/client.rb +0 -26
- data/test_server/server1/page1.html +0 -1
- data/test_server/server1/page2.html +0 -3
- data/test_server/server2/page1.html +0 -1
- data/test_server/server2/page2.html +0 -2
- data/test_server/servers.rb +0 -24
data/CHANGES
CHANGED
data/README
CHANGED
@@ -108,7 +108,7 @@ scraping, collecting, and looping so that you can just handle the data.
|
|
108
108
|
|
109
109
|
Mike Burns http://mike-burns.com mike@mike-burns.com
|
110
110
|
|
111
|
-
Help from Matt Horan and
|
111
|
+
Help from Matt Horan, John Nagro, and Henri Cook.
|
112
112
|
|
113
113
|
With `robot_rules' from James Edward Gray II via
|
114
114
|
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
@@ -55,8 +55,8 @@
|
|
55
55
|
<tr class="top-aligned-row">
|
56
56
|
<td><strong>In:</strong></td>
|
57
57
|
<td>
|
58
|
-
<a href="../files/lib/included_in_memcached_rb.html">
|
59
|
-
lib/included_in_memcached.rb
|
58
|
+
<a href="../files/lib/spider/included_in_memcached_rb.html">
|
59
|
+
lib/spider/included_in_memcached.rb
|
60
60
|
</a>
|
61
61
|
<br />
|
62
62
|
</td>
|
@@ -86,6 +86,15 @@ three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
|
|
86
86
|
Together these can be used to add items to the memcache, then determine
|
87
87
|
whether the item has been added.
|
88
88
|
</p>
|
89
|
+
<p>
|
90
|
+
To use it with <a href="Spider.html">Spider</a> use the
|
91
|
+
check_already_seen_with method:
|
92
|
+
</p>
|
93
|
+
<pre>
|
94
|
+
Spider.start_at('http://example.com/') do |s|
|
95
|
+
s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
96
|
+
end
|
97
|
+
</pre>
|
89
98
|
|
90
99
|
</div>
|
91
100
|
|
@@ -139,7 +148,7 @@ arguments here are passed to MemCache (part of the memcache-client gem).
|
|
139
148
|
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
140
149
|
<div class="method-source-code" id="M000001-source">
|
141
150
|
<pre>
|
142
|
-
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line
|
151
|
+
<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 39</span>
|
143
152
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
144
153
|
<span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
145
154
|
<span class="ruby-keyword kw">end</span>
|
@@ -167,7 +176,7 @@ Add an item to the memcache.
|
|
167
176
|
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
168
177
|
<div class="method-source-code" id="M000002-source">
|
169
178
|
<pre>
|
170
|
-
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line
|
179
|
+
<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 44</span>
|
171
180
|
<span class="ruby-keyword kw">def</span> <span class="ruby-operator"><<</span>(<span class="ruby-identifier">v</span>)
|
172
181
|
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
|
173
182
|
<span class="ruby-keyword kw">end</span>
|
@@ -193,7 +202,7 @@ True if the item is in the memcache.
|
|
193
202
|
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
194
203
|
<div class="method-source-code" id="M000003-source">
|
195
204
|
<pre>
|
196
|
-
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line
|
205
|
+
<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 49</span>
|
197
206
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
|
198
207
|
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
|
199
208
|
<span class="ruby-keyword kw">end</span>
|
@@ -55,8 +55,8 @@
|
|
55
55
|
<tr class="top-aligned-row">
|
56
56
|
<td><strong>In:</strong></td>
|
57
57
|
<td>
|
58
|
-
<a href="../files/lib/spider_instance_rb.html">
|
59
|
-
lib/spider_instance.rb
|
58
|
+
<a href="../files/lib/spider/spider_instance_rb.html">
|
59
|
+
lib/spider/spider_instance.rb
|
60
60
|
</a>
|
61
61
|
<br />
|
62
62
|
</td>
|
@@ -140,7 +140,7 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
140
140
|
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
141
141
|
<div class="method-source-code" id="M000004-source">
|
142
142
|
<pre>
|
143
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
143
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 70</span>
|
144
144
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
145
145
|
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
146
146
|
<span class="ruby-keyword kw">end</span>
|
@@ -186,7 +186,7 @@ understand just << and included? .
|
|
186
186
|
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
187
187
|
<div class="method-source-code" id="M000005-source">
|
188
188
|
<pre>
|
189
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
189
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 91</span>
|
190
190
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
|
191
191
|
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:<<</span>) <span class="ruby-operator">&&</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
|
192
192
|
<span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
|
@@ -216,7 +216,7 @@ Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
|
|
216
216
|
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
217
217
|
<div class="method-source-code" id="M000010-source">
|
218
218
|
<pre>
|
219
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
219
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 158</span>
|
220
220
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
|
221
221
|
<span class="ruby-ivar">@headers</span> = {}
|
222
222
|
<span class="ruby-keyword kw">end</span>
|
@@ -245,7 +245,7 @@ Use like a hash:
|
|
245
245
|
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
246
246
|
<div class="method-source-code" id="M000009-source">
|
247
247
|
<pre>
|
248
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
248
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 146</span>
|
249
249
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
|
250
250
|
<span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
|
251
251
|
<span class="ruby-keyword kw">end</span>
|
@@ -294,7 +294,7 @@ For example:
|
|
294
294
|
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
295
295
|
<div class="method-source-code" id="M000006-source">
|
296
296
|
<pre>
|
297
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
297
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 121</span>
|
298
298
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
299
299
|
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
300
300
|
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
@@ -331,7 +331,7 @@ Run before the HTTP request. Given the URL as a string.
|
|
331
331
|
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
332
332
|
<div class="method-source-code" id="M000007-source">
|
333
333
|
<pre>
|
334
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
334
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 135</span>
|
335
335
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
336
336
|
<span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
337
337
|
<span class="ruby-keyword kw">end</span>
|
@@ -357,7 +357,7 @@ Run last, once for each page. Given the URL as a string.
|
|
357
357
|
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
358
358
|
<div class="method-source-code" id="M000008-source">
|
359
359
|
<pre>
|
360
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
360
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 140</span>
|
361
361
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
362
362
|
<span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
363
363
|
<span class="ruby-keyword kw">end</span>
|
data/doc/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Sat, 10 Nov 2007 00:25:19 -0500
|
data/doc/files/README.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Thu Nov 08 17:51:17 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
|
|
182
182
|
mike@mike-burns.com
|
183
183
|
</p>
|
184
184
|
<p>
|
185
|
-
Help from Matt Horan and
|
185
|
+
Help from Matt Horan, John Nagro, and Henri Cook.
|
186
186
|
</p>
|
187
187
|
<p>
|
188
188
|
With `robot_rules’ from James Edward Gray II via <a
|
@@ -8,7 +8,7 @@
|
|
8
8
|
<title>File: included_in_memcached.rb</title>
|
9
9
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
10
|
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="
|
11
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
12
12
|
<script type="text/javascript">
|
13
13
|
// <![CDATA[
|
14
14
|
|
@@ -51,12 +51,12 @@
|
|
51
51
|
<table class="header-table">
|
52
52
|
<tr class="top-aligned-row">
|
53
53
|
<td><strong>Path:</strong></td>
|
54
|
-
<td>lib/included_in_memcached.rb
|
54
|
+
<td>lib/spider/included_in_memcached.rb
|
55
55
|
</td>
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Sat Nov 10 00:24:11 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -68,6 +68,12 @@
|
|
68
68
|
|
69
69
|
<div id="contextContent">
|
70
70
|
|
71
|
+
<div id="description">
|
72
|
+
<p>
|
73
|
+
Use memcached to track cycles.
|
74
|
+
</p>
|
75
|
+
|
76
|
+
</div>
|
71
77
|
|
72
78
|
<div id="requires-list">
|
73
79
|
<h3 class="section-bar">Required files</h3>
|
@@ -8,7 +8,7 @@
|
|
8
8
|
<title>File: spider_instance.rb</title>
|
9
9
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
10
|
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="
|
11
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
12
12
|
<script type="text/javascript">
|
13
13
|
// <![CDATA[
|
14
14
|
|
@@ -51,12 +51,12 @@
|
|
51
51
|
<table class="header-table">
|
52
52
|
<tr class="top-aligned-row">
|
53
53
|
<td><strong>Path:</strong></td>
|
54
|
-
<td>lib/spider_instance.rb
|
54
|
+
<td>lib/spider/spider_instance.rb
|
55
55
|
</td>
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Sat Nov 10 00:25:04 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -70,7 +70,7 @@
|
|
70
70
|
|
71
71
|
<div id="description">
|
72
72
|
<p>
|
73
|
-
|
73
|
+
Specialized spidering rules.
|
74
74
|
</p>
|
75
75
|
|
76
76
|
</div>
|
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Thu Nov 08 17:29:01 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
|
|
182
182
|
mike@mike-burns.com
|
183
183
|
</p>
|
184
184
|
<p>
|
185
|
-
Help from Matt Horan and
|
185
|
+
Help from Matt Horan, John Nagro, and Henri Cook.
|
186
186
|
</p>
|
187
187
|
<p>
|
188
188
|
With `robot_rules’ from James Edward Gray II via <a
|
data/doc/fr_file_index.html
CHANGED
@@ -21,9 +21,9 @@
|
|
21
21
|
<h1 class="section-bar">Files</h1>
|
22
22
|
<div id="index-entries">
|
23
23
|
<a href="files/README.html">README</a><br />
|
24
|
-
<a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
|
25
24
|
<a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
|
26
|
-
<a href="files/lib/
|
25
|
+
<a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
|
26
|
+
<a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
|
27
27
|
</div>
|
28
28
|
</div>
|
29
29
|
</body>
|
data/lib/spider.rb
CHANGED
@@ -23,7 +23,7 @@
|
|
23
23
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
24
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
25
|
|
26
|
-
require File.dirname(__FILE__)+'/spider_instance'
|
26
|
+
require File.dirname(__FILE__)+'/spider/spider_instance'
|
27
27
|
|
28
28
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
29
29
|
# links, and doing it all over again.
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Use memcached to track cycles.
|
2
|
+
|
3
|
+
# Redistribution and use in source and binary forms, with or without
|
4
|
+
# modification, are permitted provided that the following conditions are met:
|
5
|
+
# * Redistributions of source code must retain the above copyright
|
6
|
+
# notice, this list of conditions and the following disclaimer.
|
7
|
+
# * Redistributions in binary form must reproduce the above copyright
|
8
|
+
# notice, this list of conditions and the following disclaimer in the
|
9
|
+
# documentation and/or other materials provided with the distribution.
|
10
|
+
# * Neither the name Mike Burns nor the
|
11
|
+
# names of his contributors may be used to endorse or promote products
|
12
|
+
# derived from this software without specific prior written permission.
|
13
|
+
#
|
14
|
+
# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
|
15
|
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
16
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
17
|
+
# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
|
18
|
+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
19
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
20
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
21
|
+
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
22
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
23
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
24
|
+
|
25
|
+
require 'memcache'
|
26
|
+
|
27
|
+
# A specialized class using memcached to track items stored. It supports
|
28
|
+
# three operations: new, <<, and include? . Together these can be used to
|
29
|
+
# add items to the memcache, then determine whether the item has been added.
|
30
|
+
#
|
31
|
+
# To use it with Spider use the check_already_seen_with method:
|
32
|
+
#
|
33
|
+
# Spider.start_at('http://example.com/') do |s|
|
34
|
+
# s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
35
|
+
# end
|
36
|
+
class IncludedInMemcached
|
37
|
+
# Construct a new IncludedInMemcached instance. All arguments here are
|
38
|
+
# passed to MemCache (part of the memcache-client gem).
|
39
|
+
def initialize(*a)
|
40
|
+
@c = MemCache.new(*a)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Add an item to the memcache.
|
44
|
+
def <<(v)
|
45
|
+
@c.add(v.to_s, v)
|
46
|
+
end
|
47
|
+
|
48
|
+
# True if the item is in the memcache.
|
49
|
+
def include?(v)
|
50
|
+
@c.get(v.to_s) == v
|
51
|
+
end
|
52
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
|
-
#
|
1
|
+
# Specialized spidering rules.
|
2
2
|
|
3
|
+
# Copyright 2007 Mike Burns
|
3
4
|
# Redistribution and use in source and binary forms, with or without
|
4
5
|
# modification, are permitted provided that the following conditions are met:
|
5
6
|
# * Redistributions of source code must retain the above copyright
|
@@ -22,7 +23,7 @@
|
|
22
23
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
23
24
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
24
25
|
|
25
|
-
require 'robot_rules'
|
26
|
+
require File.dirname(__FILE__)+'/robot_rules.rb'
|
26
27
|
require 'open-uri'
|
27
28
|
require 'uri'
|
28
29
|
require 'net/http'
|
@@ -221,7 +222,7 @@ class SpiderInstance
|
|
221
222
|
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
|
222
223
|
@headers))}
|
223
224
|
if r.redirect?
|
224
|
-
get_page(URI.parse(r['Location']), &block)
|
225
|
+
get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
|
225
226
|
else
|
226
227
|
block.call(r)
|
227
228
|
end
|
@@ -252,21 +253,7 @@ class SpiderInstance
|
|
252
253
|
if parsed_link.fragment == '#'
|
253
254
|
nil
|
254
255
|
else
|
255
|
-
|
256
|
-
when 'http'
|
257
|
-
link
|
258
|
-
when nil
|
259
|
-
u = URI.parse(base_url)
|
260
|
-
if link[0].chr == '/'
|
261
|
-
"#{u.scheme}://#{u.host}:#{u.port}#{link}"
|
262
|
-
elsif u.path.nil? || u.path == ''
|
263
|
-
"#{u.scheme}://#{u.host}:#{u.port}/#{link}"
|
264
|
-
else
|
265
|
-
"#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
|
266
|
-
end
|
267
|
-
else
|
268
|
-
nil
|
269
|
-
end
|
256
|
+
construct_complete_url(base_url, link, parsed_link)
|
270
257
|
end
|
271
258
|
rescue
|
272
259
|
nil
|
@@ -274,6 +261,23 @@ class SpiderInstance
|
|
274
261
|
end.compact
|
275
262
|
end
|
276
263
|
|
264
|
+
def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
|
265
|
+
parsed_additional_url ||= URI.parse(additional_url)
|
266
|
+
case parsed_additional_url.scheme
|
267
|
+
when nil
|
268
|
+
u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
|
269
|
+
if additional_url[0].chr == '/'
|
270
|
+
"#{u.scheme}://#{u.host}:#{u.port}#{additional_url}"
|
271
|
+
elsif u.path.nil? || u.path == ''
|
272
|
+
"#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
|
273
|
+
else
|
274
|
+
"#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
|
275
|
+
end
|
276
|
+
else
|
277
|
+
additional_url
|
278
|
+
end
|
279
|
+
end
|
280
|
+
|
277
281
|
def remove_trailing_slash(s) #:nodoc:
|
278
282
|
s.sub(%r{/*$},'')
|
279
283
|
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'webrick'
|
3
|
+
require 'spec'
|
4
|
+
|
5
|
+
Spec::Runner.configure { |c| c.mock_with :mocha }
|
6
|
+
|
7
|
+
def local_require(*files)
|
8
|
+
files.each do |file|
|
9
|
+
require File.dirname(__FILE__)+'/../lib/'+file
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class BeStaticServerPages
|
14
|
+
def initialize
|
15
|
+
@pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
|
16
|
+
@actual = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
attr :actual, true
|
20
|
+
|
21
|
+
def matches?(actual)
|
22
|
+
@actual = actual
|
23
|
+
actual == @pages
|
24
|
+
end
|
25
|
+
|
26
|
+
def failure_message
|
27
|
+
"expected #{@pages.inspect}, got #{@actual.inspect}"
|
28
|
+
end
|
29
|
+
|
30
|
+
def description
|
31
|
+
"be the pages returned by the static server (#{@pages.inspect})"
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def with_web_server(svlt)
|
36
|
+
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
|
37
|
+
:AccessLog => [])
|
38
|
+
server.mount('/', svlt)
|
39
|
+
Thread.new {server.start}
|
40
|
+
begin
|
41
|
+
yield
|
42
|
+
ensure
|
43
|
+
server.shutdown
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def with_memcached
|
48
|
+
system('memcached -d -P /tmp/spider-memcached.pid')
|
49
|
+
cacher = IncludedInMemcached.new('localhost:11211')
|
50
|
+
begin
|
51
|
+
yield
|
52
|
+
ensure
|
53
|
+
system('kill -KILL `cat /tmp/spider-memcached.pid`')
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def be_static_server_pages
|
58
|
+
BeStaticServerPages.new
|
59
|
+
end
|
60
|
+
|
61
|
+
class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
|
62
|
+
def do_GET(req, res)
|
63
|
+
res['Content-type'] = 'text/plain'
|
64
|
+
res.body = "response\n"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
|
69
|
+
def do_GET(req, res)
|
70
|
+
res['Content-type'] = 'text/html'
|
71
|
+
if req.path == '/foo'
|
72
|
+
res.body = <<-END
|
73
|
+
<a href="/">a</a>
|
74
|
+
END
|
75
|
+
else
|
76
|
+
res.body = <<-END
|
77
|
+
<a href="/foo">b</a>
|
78
|
+
END
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def null_logger
|
84
|
+
l = stub
|
85
|
+
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
86
|
+
l.stubs(k)
|
87
|
+
l.stubs("#{k}?".to_sym)
|
88
|
+
end
|
89
|
+
l
|
90
|
+
end
|
@@ -1,8 +1,7 @@
|
|
1
|
-
require '
|
2
|
-
require 'spec'
|
1
|
+
require File.dirname(__FILE__)+'/../spec_helper'
|
3
2
|
|
4
3
|
def before_specing_memcached
|
5
|
-
|
4
|
+
local_require 'spider/included_in_memcached'
|
6
5
|
system('memcached -d -P /tmp/spider-memcached.pid')
|
7
6
|
end
|
8
7
|
|
@@ -1,49 +1,35 @@
|
|
1
|
-
require '
|
2
|
-
require 'spec'
|
1
|
+
require File.dirname(__FILE__)+'/../spec_helper'
|
3
2
|
require 'webrick'
|
4
3
|
require 'webrick/https'
|
5
|
-
|
6
|
-
require File.dirname(__FILE__)+'/../lib/included_in_memcached'
|
4
|
+
local_require 'spider', 'spider/included_in_memcached'
|
7
5
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
<a href="/foo">b</a>
|
27
|
-
END
|
6
|
+
describe 'SpiderInstance' do
|
7
|
+
# http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
|
8
|
+
# URL. Bug reported by Henri Cook.
|
9
|
+
it 'should construct a complete redirect URL' do
|
10
|
+
@response_called = false
|
11
|
+
redirected_resp = stub(:redirect? => true,
|
12
|
+
:[] => '/default.htm')
|
13
|
+
success_resp = stub(:redirect? => false)
|
14
|
+
http_req = stub(:request => true)
|
15
|
+
http_mock_redir = stub(:use_ssl= => true)
|
16
|
+
http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
|
17
|
+
http_mock_success = stub(:use_ssl= => true)
|
18
|
+
http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
|
19
|
+
Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
|
20
|
+
returns(http_mock_success)
|
21
|
+
si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
|
22
|
+
si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
|
23
|
+
@response_called = true
|
28
24
|
end
|
25
|
+
@response_called.should be_true
|
29
26
|
end
|
30
|
-
end
|
31
|
-
|
32
|
-
def null_logger
|
33
|
-
l = stub
|
34
|
-
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
35
|
-
l.stubs(k)
|
36
|
-
l.stubs("#{k}?".to_sym)
|
37
|
-
end
|
38
|
-
l
|
39
|
-
end
|
40
27
|
|
41
|
-
describe 'SpiderInstance' do
|
42
28
|
it 'should prevent cycles with an IncludedInMemcached' do
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
29
|
+
with_memcached do
|
30
|
+
cacher = IncludedInMemcached.new('localhost:11211')
|
31
|
+
it_should_prevent_cycles_with(cacher)
|
32
|
+
end
|
47
33
|
end
|
48
34
|
|
49
35
|
it 'should prevent cycles with an Array' do
|
@@ -129,15 +115,12 @@ describe 'SpiderInstance' do
|
|
129
115
|
u = 'http://localhost:8888?s=1'
|
130
116
|
u_p = URI.parse(u)
|
131
117
|
@block_called = false
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
si.get_page(u_p) do
|
138
|
-
@block_called = true
|
118
|
+
with_web_server(QueryServlet) do
|
119
|
+
si = SpiderInstance.new({nil => [u]})
|
120
|
+
si.get_page(u_p) do
|
121
|
+
@block_called = true
|
122
|
+
end
|
139
123
|
end
|
140
|
-
server.shutdown
|
141
124
|
@block_called.should be_true
|
142
125
|
end
|
143
126
|
|
@@ -413,15 +396,10 @@ describe 'SpiderInstance' do
|
|
413
396
|
u2 = 'http://localhost:8888/foo'
|
414
397
|
u_p2 = URI.parse(u2)
|
415
398
|
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
si = SpiderInstance.new(nil => [u])
|
422
|
-
si.check_already_seen_with cacher
|
423
|
-
si.start!
|
424
|
-
|
425
|
-
server.shutdown
|
399
|
+
with_web_server(LoopingServlet) do
|
400
|
+
si = SpiderInstance.new(nil => [u])
|
401
|
+
si.check_already_seen_with cacher
|
402
|
+
si.start!
|
403
|
+
end
|
426
404
|
end
|
427
405
|
end
|
data/spec/spider_spec.rb
CHANGED
@@ -1,10 +1,33 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
require File.dirname(__FILE__)+'/../lib/spider'
|
1
|
+
require File.dirname(__FILE__)+'/spec_helper'
|
2
|
+
local_require 'spider', 'spider/included_in_memcached'
|
4
3
|
|
5
4
|
describe 'Spider' do
|
6
|
-
it 'should
|
7
|
-
|
8
|
-
|
5
|
+
it 'should find two pages without cycles using defaults' do
|
6
|
+
u = []
|
7
|
+
with_web_server(LoopingServlet) do
|
8
|
+
u = find_pages_with_static_server
|
9
|
+
end
|
10
|
+
u.should be_static_server_pages
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should find two pages without cycles using memcached' do
|
14
|
+
u = []
|
15
|
+
with_web_server(LoopingServlet) do
|
16
|
+
with_memcached do
|
17
|
+
u = find_pages_with_static_server do |s|
|
18
|
+
s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
u.should be_static_server_pages
|
23
|
+
end
|
24
|
+
|
25
|
+
def find_pages_with_static_server(&block)
|
26
|
+
pages = []
|
27
|
+
Spider.start_at('http://localhost:8888/') do |s|
|
28
|
+
block.call(s) unless block.nil?
|
29
|
+
s.on(:every){ |u,r,p| pages << u }
|
30
|
+
end
|
31
|
+
pages
|
9
32
|
end
|
10
33
|
end
|
data/spider.gemspec
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: spider
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.
|
7
|
-
date: 2007-11-
|
6
|
+
version: 0.4.1
|
7
|
+
date: 2007-11-10 00:00:00 -05:00
|
8
8
|
summary: A Web spidering library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,8 +34,9 @@ files:
|
|
34
34
|
- doc/files
|
35
35
|
- doc/files/lib
|
36
36
|
- doc/files/lib/spider_rb.html
|
37
|
-
- doc/files/lib/
|
38
|
-
- doc/files/lib/
|
37
|
+
- doc/files/lib/spider
|
38
|
+
- doc/files/lib/spider/spider_instance_rb.html
|
39
|
+
- doc/files/lib/spider/included_in_memcached_rb.html
|
39
40
|
- doc/files/README.html
|
40
41
|
- doc/classes
|
41
42
|
- doc/classes/IncludedInMemcached.html
|
@@ -47,26 +48,20 @@ files:
|
|
47
48
|
- doc/index.html
|
48
49
|
- doc/created.rid
|
49
50
|
- spec
|
51
|
+
- spec/spider
|
52
|
+
- spec/spider/included_in_memcached_spec.rb
|
53
|
+
- spec/spider/spider_instance_spec.rb
|
50
54
|
- spec/spider_spec.rb
|
51
|
-
- spec/
|
52
|
-
- spec/spider_instance_spec.rb
|
55
|
+
- spec/spec_helper.rb
|
53
56
|
- README
|
54
57
|
- spider.gemspec
|
55
58
|
- CHANGES
|
56
59
|
- lib
|
57
60
|
- lib/spider.rb
|
58
|
-
- lib/
|
59
|
-
- lib/
|
60
|
-
- lib/
|
61
|
-
-
|
62
|
-
- test_server/server1
|
63
|
-
- test_server/server1/page1.html
|
64
|
-
- test_server/server1/page2.html
|
65
|
-
- test_server/server2
|
66
|
-
- test_server/server2/page1.html
|
67
|
-
- test_server/server2/page2.html
|
68
|
-
- test_server/servers.rb
|
69
|
-
- test_server/client.rb
|
61
|
+
- lib/spider
|
62
|
+
- lib/spider/included_in_memcached.rb
|
63
|
+
- lib/spider/robot_rules.rb
|
64
|
+
- lib/spider/spider_instance.rb
|
70
65
|
test_files: []
|
71
66
|
|
72
67
|
rdoc_options: []
|
@@ -1,22 +0,0 @@
|
|
1
|
-
require 'memcache'
|
2
|
-
|
3
|
-
# A specialized class using memcached to track items stored. It supports
|
4
|
-
# three operations: new, <<, and include? . Together these can be used to
|
5
|
-
# add items to the memcache, then determine whether the item has been added.
|
6
|
-
class IncludedInMemcached
|
7
|
-
# Construct a new IncludedInMemcached instance. All arguments here are
|
8
|
-
# passed to MemCache (part of the memcache-client gem).
|
9
|
-
def initialize(*a)
|
10
|
-
@c = MemCache.new(*a)
|
11
|
-
end
|
12
|
-
|
13
|
-
# Add an item to the memcache.
|
14
|
-
def <<(v)
|
15
|
-
@c.add(v.to_s, v)
|
16
|
-
end
|
17
|
-
|
18
|
-
# True if the item is in the memcache.
|
19
|
-
def include?(v)
|
20
|
-
@c.get(v.to_s) == v
|
21
|
-
end
|
22
|
-
end
|
data/test_server/client.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby -w
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'spider'
|
5
|
-
|
6
|
-
Spider.start_at('http://localhost:8880/page1.html') do |s|
|
7
|
-
s.add_url_check do |a_url|
|
8
|
-
a_url =~ %r{^http://localhost:8880.*}
|
9
|
-
end
|
10
|
-
|
11
|
-
s.on 404 do |a_url, resp, prior|
|
12
|
-
puts "URL not found: #{a_url}"
|
13
|
-
end
|
14
|
-
|
15
|
-
s.on :success do |a_url, resp, prior|
|
16
|
-
puts "body: #{resp.body}"
|
17
|
-
end
|
18
|
-
|
19
|
-
s.on :any do |a_url, resp, prior|
|
20
|
-
puts "URL returned anything: #{a_url} with this code #{resp.code}"
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
%w(INT TERM).each do |signal|
|
25
|
-
trap(signal) { exit }
|
26
|
-
end
|
@@ -1 +0,0 @@
|
|
1
|
-
<a href="page2.html">See page two!</a>
|
@@ -1 +0,0 @@
|
|
1
|
-
<a href="page2.html">See page two!</a>
|
data/test_server/servers.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby -w
|
2
|
-
# Two Web servers, on different ports of localhost, serving two pages each.
|
3
|
-
# One page links to the next; the next links to both the first and to the the
|
4
|
-
# first on the other server.
|
5
|
-
# This is used to test: cycles, domain restrictions.
|
6
|
-
|
7
|
-
require 'webrick'
|
8
|
-
|
9
|
-
server1 = WEBrick::HTTPServer.new(:Port => 8880)
|
10
|
-
server1.mount('/', WEBrick::HTTPServlet::FileHandler,
|
11
|
-
File.dirname(__FILE__)+'/server1')
|
12
|
-
server2 = WEBrick::HTTPServer.new(:Port => 8881)
|
13
|
-
server2.mount('/', WEBrick::HTTPServlet::FileHandler,
|
14
|
-
File.dirname(__FILE__)+'/server2')
|
15
|
-
|
16
|
-
%w(INT TERM).each do |signal|
|
17
|
-
trap(signal) { [server1,server2].each { |server| server.shutdown } }
|
18
|
-
end
|
19
|
-
|
20
|
-
threads = []
|
21
|
-
[server1,server2].each do |server|
|
22
|
-
threads << Thread.new { server.start }
|
23
|
-
end
|
24
|
-
threads.each { |t| t.join }
|