spider 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +7 -0
- data/README +1 -1
- data/doc/classes/IncludedInMemcached.html +14 -5
- data/doc/classes/SpiderInstance.html +9 -9
- data/doc/created.rid +1 -1
- data/doc/files/README.html +2 -2
- data/doc/files/lib/{included_in_memcached_rb.html → spider/included_in_memcached_rb.html} +9 -3
- data/doc/files/lib/{spider_instance_rb.html → spider/spider_instance_rb.html} +4 -4
- data/doc/files/lib/spider_rb.html +2 -2
- data/doc/fr_file_index.html +2 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_memcached.rb +52 -0
- data/lib/{robot_rules.rb → spider/robot_rules.rb} +2 -0
- data/lib/{spider_instance.rb → spider/spider_instance.rb} +22 -18
- data/spec/spec_helper.rb +90 -0
- data/spec/{included_in_memcached_spec.rb → spider/included_in_memcached_spec.rb} +2 -3
- data/spec/{spider_instance_spec.rb → spider/spider_instance_spec.rb} +35 -57
- data/spec/spider_spec.rb +29 -6
- data/spider.gemspec +1 -1
- metadata +13 -18
- data/lib/included_in_memcached.rb +0 -22
- data/test_server/client.rb +0 -26
- data/test_server/server1/page1.html +0 -1
- data/test_server/server1/page2.html +0 -3
- data/test_server/server2/page1.html +0 -1
- data/test_server/server2/page2.html +0 -2
- data/test_server/servers.rb +0 -24
data/CHANGES
CHANGED
data/README
CHANGED
@@ -108,7 +108,7 @@ scraping, collecting, and looping so that you can just handle the data.
|
|
108
108
|
|
109
109
|
Mike Burns http://mike-burns.com mike@mike-burns.com
|
110
110
|
|
111
|
-
Help from Matt Horan and
|
111
|
+
Help from Matt Horan, John Nagro, and Henri Cook.
|
112
112
|
|
113
113
|
With `robot_rules' from James Edward Gray II via
|
114
114
|
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
@@ -55,8 +55,8 @@
|
|
55
55
|
<tr class="top-aligned-row">
|
56
56
|
<td><strong>In:</strong></td>
|
57
57
|
<td>
|
58
|
-
<a href="../files/lib/included_in_memcached_rb.html">
|
59
|
-
lib/included_in_memcached.rb
|
58
|
+
<a href="../files/lib/spider/included_in_memcached_rb.html">
|
59
|
+
lib/spider/included_in_memcached.rb
|
60
60
|
</a>
|
61
61
|
<br />
|
62
62
|
</td>
|
@@ -86,6 +86,15 @@ three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
|
|
86
86
|
Together these can be used to add items to the memcache, then determine
|
87
87
|
whether the item has been added.
|
88
88
|
</p>
|
89
|
+
<p>
|
90
|
+
To use it with <a href="Spider.html">Spider</a> use the
|
91
|
+
check_already_seen_with method:
|
92
|
+
</p>
|
93
|
+
<pre>
|
94
|
+
Spider.start_at('http://example.com/') do |s|
|
95
|
+
s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
96
|
+
end
|
97
|
+
</pre>
|
89
98
|
|
90
99
|
</div>
|
91
100
|
|
@@ -139,7 +148,7 @@ arguments here are passed to MemCache (part of the memcache-client gem).
|
|
139
148
|
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
140
149
|
<div class="method-source-code" id="M000001-source">
|
141
150
|
<pre>
|
142
|
-
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 9</span>
|
151
|
+
<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 39</span>
|
143
152
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
144
153
|
<span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
145
154
|
<span class="ruby-keyword kw">end</span>
|
@@ -167,7 +176,7 @@ Add an item to the memcache.
|
|
167
176
|
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
168
177
|
<div class="method-source-code" id="M000002-source">
|
169
178
|
<pre>
|
170
|
-
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 14</span>
|
179
|
+
<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 44</span>
|
171
180
|
<span class="ruby-keyword kw">def</span> <span class="ruby-operator"><<</span>(<span class="ruby-identifier">v</span>)
|
172
181
|
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
|
173
182
|
<span class="ruby-keyword kw">end</span>
|
@@ -193,7 +202,7 @@ True if the item is in the memcache.
|
|
193
202
|
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
194
203
|
<div class="method-source-code" id="M000003-source">
|
195
204
|
<pre>
|
196
|
-
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 19</span>
|
205
|
+
<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 49</span>
|
197
206
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
|
198
207
|
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
|
199
208
|
<span class="ruby-keyword kw">end</span>
|
@@ -55,8 +55,8 @@
|
|
55
55
|
<tr class="top-aligned-row">
|
56
56
|
<td><strong>In:</strong></td>
|
57
57
|
<td>
|
58
|
-
<a href="../files/lib/spider_instance_rb.html">
|
59
|
-
lib/spider_instance.rb
|
58
|
+
<a href="../files/lib/spider/spider_instance_rb.html">
|
59
|
+
lib/spider/spider_instance.rb
|
60
60
|
</a>
|
61
61
|
<br />
|
62
62
|
</td>
|
@@ -140,7 +140,7 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
140
140
|
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
141
141
|
<div class="method-source-code" id="M000004-source">
|
142
142
|
<pre>
|
143
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
143
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 70</span>
|
144
144
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
145
145
|
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
146
146
|
<span class="ruby-keyword kw">end</span>
|
@@ -186,7 +186,7 @@ understand just << and included? .
|
|
186
186
|
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
187
187
|
<div class="method-source-code" id="M000005-source">
|
188
188
|
<pre>
|
189
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
189
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 91</span>
|
190
190
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
|
191
191
|
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:<<</span>) <span class="ruby-operator">&&</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
|
192
192
|
<span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
|
@@ -216,7 +216,7 @@ Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
|
|
216
216
|
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
217
217
|
<div class="method-source-code" id="M000010-source">
|
218
218
|
<pre>
|
219
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
219
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 158</span>
|
220
220
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
|
221
221
|
<span class="ruby-ivar">@headers</span> = {}
|
222
222
|
<span class="ruby-keyword kw">end</span>
|
@@ -245,7 +245,7 @@ Use like a hash:
|
|
245
245
|
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
246
246
|
<div class="method-source-code" id="M000009-source">
|
247
247
|
<pre>
|
248
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
248
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 146</span>
|
249
249
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
|
250
250
|
<span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
|
251
251
|
<span class="ruby-keyword kw">end</span>
|
@@ -294,7 +294,7 @@ For example:
|
|
294
294
|
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
295
295
|
<div class="method-source-code" id="M000006-source">
|
296
296
|
<pre>
|
297
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
297
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 121</span>
|
298
298
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
299
299
|
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
300
300
|
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
@@ -331,7 +331,7 @@ Run before the HTTP request. Given the URL as a string.
|
|
331
331
|
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
332
332
|
<div class="method-source-code" id="M000007-source">
|
333
333
|
<pre>
|
334
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
334
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 135</span>
|
335
335
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
336
336
|
<span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
337
337
|
<span class="ruby-keyword kw">end</span>
|
@@ -357,7 +357,7 @@ Run last, once for each page. Given the URL as a string.
|
|
357
357
|
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
358
358
|
<div class="method-source-code" id="M000008-source">
|
359
359
|
<pre>
|
360
|
-
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line
|
360
|
+
<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 140</span>
|
361
361
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
362
362
|
<span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
363
363
|
<span class="ruby-keyword kw">end</span>
|
data/doc/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Sat, 10 Nov 2007 00:25:19 -0500
|
data/doc/files/README.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Thu Nov 08 17:51:17 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
|
|
182
182
|
mike@mike-burns.com
|
183
183
|
</p>
|
184
184
|
<p>
|
185
|
-
Help from Matt Horan and
|
185
|
+
Help from Matt Horan, John Nagro, and Henri Cook.
|
186
186
|
</p>
|
187
187
|
<p>
|
188
188
|
With `robot_rules’ from James Edward Gray II via <a
|
@@ -8,7 +8,7 @@
|
|
8
8
|
<title>File: included_in_memcached.rb</title>
|
9
9
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
10
|
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="
|
11
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
12
12
|
<script type="text/javascript">
|
13
13
|
// <![CDATA[
|
14
14
|
|
@@ -51,12 +51,12 @@
|
|
51
51
|
<table class="header-table">
|
52
52
|
<tr class="top-aligned-row">
|
53
53
|
<td><strong>Path:</strong></td>
|
54
|
-
<td>lib/included_in_memcached.rb
|
54
|
+
<td>lib/spider/included_in_memcached.rb
|
55
55
|
</td>
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Sat Nov 10 00:24:11 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -68,6 +68,12 @@
|
|
68
68
|
|
69
69
|
<div id="contextContent">
|
70
70
|
|
71
|
+
<div id="description">
|
72
|
+
<p>
|
73
|
+
Use memcached to track cycles.
|
74
|
+
</p>
|
75
|
+
|
76
|
+
</div>
|
71
77
|
|
72
78
|
<div id="requires-list">
|
73
79
|
<h3 class="section-bar">Required files</h3>
|
@@ -8,7 +8,7 @@
|
|
8
8
|
<title>File: spider_instance.rb</title>
|
9
9
|
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
10
|
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="
|
11
|
+
<link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
|
12
12
|
<script type="text/javascript">
|
13
13
|
// <![CDATA[
|
14
14
|
|
@@ -51,12 +51,12 @@
|
|
51
51
|
<table class="header-table">
|
52
52
|
<tr class="top-aligned-row">
|
53
53
|
<td><strong>Path:</strong></td>
|
54
|
-
<td>lib/spider_instance.rb
|
54
|
+
<td>lib/spider/spider_instance.rb
|
55
55
|
</td>
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Sat Nov 10 00:25:04 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -70,7 +70,7 @@
|
|
70
70
|
|
71
71
|
<div id="description">
|
72
72
|
<p>
|
73
|
-
|
73
|
+
Specialized spidering rules.
|
74
74
|
</p>
|
75
75
|
|
76
76
|
</div>
|
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Thu Nov 08 17:29:01 -0500 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
|
|
182
182
|
mike@mike-burns.com
|
183
183
|
</p>
|
184
184
|
<p>
|
185
|
-
Help from Matt Horan and
|
185
|
+
Help from Matt Horan, John Nagro, and Henri Cook.
|
186
186
|
</p>
|
187
187
|
<p>
|
188
188
|
With `robot_rules’ from James Edward Gray II via <a
|
data/doc/fr_file_index.html
CHANGED
@@ -21,9 +21,9 @@
|
|
21
21
|
<h1 class="section-bar">Files</h1>
|
22
22
|
<div id="index-entries">
|
23
23
|
<a href="files/README.html">README</a><br />
|
24
|
-
<a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
|
25
24
|
<a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
|
26
|
-
<a href="files/lib/spider_instance_rb.html">lib/spider_instance.rb</a><br />
|
25
|
+
<a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
|
26
|
+
<a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
|
27
27
|
</div>
|
28
28
|
</div>
|
29
29
|
</body>
|
data/lib/spider.rb
CHANGED
@@ -23,7 +23,7 @@
|
|
23
23
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
24
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
25
|
|
26
|
-
require File.dirname(__FILE__)+'/spider_instance'
|
26
|
+
require File.dirname(__FILE__)+'/spider/spider_instance'
|
27
27
|
|
28
28
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
29
29
|
# links, and doing it all over again.
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Use memcached to track cycles.
|
2
|
+
|
3
|
+
# Redistribution and use in source and binary forms, with or without
|
4
|
+
# modification, are permitted provided that the following conditions are met:
|
5
|
+
# * Redistributions of source code must retain the above copyright
|
6
|
+
# notice, this list of conditions and the following disclaimer.
|
7
|
+
# * Redistributions in binary form must reproduce the above copyright
|
8
|
+
# notice, this list of conditions and the following disclaimer in the
|
9
|
+
# documentation and/or other materials provided with the distribution.
|
10
|
+
# * Neither the name Mike Burns nor the
|
11
|
+
# names of his contributors may be used to endorse or promote products
|
12
|
+
# derived from this software without specific prior written permission.
|
13
|
+
#
|
14
|
+
# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
|
15
|
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
16
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
17
|
+
# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
|
18
|
+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
19
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
20
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
21
|
+
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
22
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
23
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
24
|
+
|
25
|
+
require 'memcache'
|
26
|
+
|
27
|
+
# A specialized class using memcached to track items stored. It supports
|
28
|
+
# three operations: new, <<, and include? . Together these can be used to
|
29
|
+
# add items to the memcache, then determine whether the item has been added.
|
30
|
+
#
|
31
|
+
# To use it with Spider use the check_already_seen_with method:
|
32
|
+
#
|
33
|
+
# Spider.start_at('http://example.com/') do |s|
|
34
|
+
# s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
35
|
+
# end
|
36
|
+
class IncludedInMemcached
|
37
|
+
# Construct a new IncludedInMemcached instance. All arguments here are
|
38
|
+
# passed to MemCache (part of the memcache-client gem).
|
39
|
+
def initialize(*a)
|
40
|
+
@c = MemCache.new(*a)
|
41
|
+
end
|
42
|
+
|
43
|
+
# Add an item to the memcache.
|
44
|
+
def <<(v)
|
45
|
+
@c.add(v.to_s, v)
|
46
|
+
end
|
47
|
+
|
48
|
+
# True if the item is in the memcache.
|
49
|
+
def include?(v)
|
50
|
+
@c.get(v.to_s) == v
|
51
|
+
end
|
52
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
|
-
#
|
1
|
+
# Specialized spidering rules.
|
2
2
|
|
3
|
+
# Copyright 2007 Mike Burns
|
3
4
|
# Redistribution and use in source and binary forms, with or without
|
4
5
|
# modification, are permitted provided that the following conditions are met:
|
5
6
|
# * Redistributions of source code must retain the above copyright
|
@@ -22,7 +23,7 @@
|
|
22
23
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
23
24
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
24
25
|
|
25
|
-
require 'robot_rules'
|
26
|
+
require File.dirname(__FILE__)+'/robot_rules.rb'
|
26
27
|
require 'open-uri'
|
27
28
|
require 'uri'
|
28
29
|
require 'net/http'
|
@@ -221,7 +222,7 @@ class SpiderInstance
|
|
221
222
|
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
|
222
223
|
@headers))}
|
223
224
|
if r.redirect?
|
224
|
-
get_page(URI.parse(r['Location']), &block)
|
225
|
+
get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
|
225
226
|
else
|
226
227
|
block.call(r)
|
227
228
|
end
|
@@ -252,21 +253,7 @@ class SpiderInstance
|
|
252
253
|
if parsed_link.fragment == '#'
|
253
254
|
nil
|
254
255
|
else
|
255
|
-
|
256
|
-
when 'http'
|
257
|
-
link
|
258
|
-
when nil
|
259
|
-
u = URI.parse(base_url)
|
260
|
-
if link[0].chr == '/'
|
261
|
-
"#{u.scheme}://#{u.host}:#{u.port}#{link}"
|
262
|
-
elsif u.path.nil? || u.path == ''
|
263
|
-
"#{u.scheme}://#{u.host}:#{u.port}/#{link}"
|
264
|
-
else
|
265
|
-
"#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
|
266
|
-
end
|
267
|
-
else
|
268
|
-
nil
|
269
|
-
end
|
256
|
+
construct_complete_url(base_url, link, parsed_link)
|
270
257
|
end
|
271
258
|
rescue
|
272
259
|
nil
|
@@ -274,6 +261,23 @@ class SpiderInstance
|
|
274
261
|
end.compact
|
275
262
|
end
|
276
263
|
|
264
|
+
def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
|
265
|
+
parsed_additional_url ||= URI.parse(additional_url)
|
266
|
+
case parsed_additional_url.scheme
|
267
|
+
when nil
|
268
|
+
u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
|
269
|
+
if additional_url[0].chr == '/'
|
270
|
+
"#{u.scheme}://#{u.host}:#{u.port}#{additional_url}"
|
271
|
+
elsif u.path.nil? || u.path == ''
|
272
|
+
"#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
|
273
|
+
else
|
274
|
+
"#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
|
275
|
+
end
|
276
|
+
else
|
277
|
+
additional_url
|
278
|
+
end
|
279
|
+
end
|
280
|
+
|
277
281
|
def remove_trailing_slash(s) #:nodoc:
|
278
282
|
s.sub(%r{/*$},'')
|
279
283
|
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'webrick'
|
3
|
+
require 'spec'
|
4
|
+
|
5
|
+
Spec::Runner.configure { |c| c.mock_with :mocha }
|
6
|
+
|
7
|
+
def local_require(*files)
|
8
|
+
files.each do |file|
|
9
|
+
require File.dirname(__FILE__)+'/../lib/'+file
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
class BeStaticServerPages
|
14
|
+
def initialize
|
15
|
+
@pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
|
16
|
+
@actual = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
attr :actual, true
|
20
|
+
|
21
|
+
def matches?(actual)
|
22
|
+
@actual = actual
|
23
|
+
actual == @pages
|
24
|
+
end
|
25
|
+
|
26
|
+
def failure_message
|
27
|
+
"expected #{@pages.inspect}, got #{@actual.inspect}"
|
28
|
+
end
|
29
|
+
|
30
|
+
def description
|
31
|
+
"be the pages returned by the static server (#{@pages.inspect})"
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def with_web_server(svlt)
|
36
|
+
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
|
37
|
+
:AccessLog => [])
|
38
|
+
server.mount('/', svlt)
|
39
|
+
Thread.new {server.start}
|
40
|
+
begin
|
41
|
+
yield
|
42
|
+
ensure
|
43
|
+
server.shutdown
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def with_memcached
|
48
|
+
system('memcached -d -P /tmp/spider-memcached.pid')
|
49
|
+
cacher = IncludedInMemcached.new('localhost:11211')
|
50
|
+
begin
|
51
|
+
yield
|
52
|
+
ensure
|
53
|
+
system('kill -KILL `cat /tmp/spider-memcached.pid`')
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def be_static_server_pages
|
58
|
+
BeStaticServerPages.new
|
59
|
+
end
|
60
|
+
|
61
|
+
class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
|
62
|
+
def do_GET(req, res)
|
63
|
+
res['Content-type'] = 'text/plain'
|
64
|
+
res.body = "response\n"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
|
69
|
+
def do_GET(req, res)
|
70
|
+
res['Content-type'] = 'text/html'
|
71
|
+
if req.path == '/foo'
|
72
|
+
res.body = <<-END
|
73
|
+
<a href="/">a</a>
|
74
|
+
END
|
75
|
+
else
|
76
|
+
res.body = <<-END
|
77
|
+
<a href="/foo">b</a>
|
78
|
+
END
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def null_logger
|
84
|
+
l = stub
|
85
|
+
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
86
|
+
l.stubs(k)
|
87
|
+
l.stubs("#{k}?".to_sym)
|
88
|
+
end
|
89
|
+
l
|
90
|
+
end
|
@@ -1,8 +1,7 @@
|
|
1
|
-
require '
|
2
|
-
require 'spec'
|
1
|
+
require File.dirname(__FILE__)+'/../spec_helper'
|
3
2
|
|
4
3
|
def before_specing_memcached
|
5
|
-
|
4
|
+
local_require 'spider/included_in_memcached'
|
6
5
|
system('memcached -d -P /tmp/spider-memcached.pid')
|
7
6
|
end
|
8
7
|
|
@@ -1,49 +1,35 @@
|
|
1
|
-
require '
|
2
|
-
require 'spec'
|
1
|
+
require File.dirname(__FILE__)+'/../spec_helper'
|
3
2
|
require 'webrick'
|
4
3
|
require 'webrick/https'
|
5
|
-
|
6
|
-
require File.dirname(__FILE__)+'/../lib/included_in_memcached'
|
4
|
+
local_require 'spider', 'spider/included_in_memcached'
|
7
5
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
<a href="/foo">b</a>
|
27
|
-
END
|
6
|
+
describe 'SpiderInstance' do
|
7
|
+
# http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
|
8
|
+
# URL. Bug reported by Henri Cook.
|
9
|
+
it 'should construct a complete redirect URL' do
|
10
|
+
@response_called = false
|
11
|
+
redirected_resp = stub(:redirect? => true,
|
12
|
+
:[] => '/default.htm')
|
13
|
+
success_resp = stub(:redirect? => false)
|
14
|
+
http_req = stub(:request => true)
|
15
|
+
http_mock_redir = stub(:use_ssl= => true)
|
16
|
+
http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
|
17
|
+
http_mock_success = stub(:use_ssl= => true)
|
18
|
+
http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
|
19
|
+
Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
|
20
|
+
returns(http_mock_success)
|
21
|
+
si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
|
22
|
+
si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
|
23
|
+
@response_called = true
|
28
24
|
end
|
25
|
+
@response_called.should be_true
|
29
26
|
end
|
30
|
-
end
|
31
|
-
|
32
|
-
def null_logger
|
33
|
-
l = stub
|
34
|
-
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
35
|
-
l.stubs(k)
|
36
|
-
l.stubs("#{k}?".to_sym)
|
37
|
-
end
|
38
|
-
l
|
39
|
-
end
|
40
27
|
|
41
|
-
describe 'SpiderInstance' do
|
42
28
|
it 'should prevent cycles with an IncludedInMemcached' do
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
29
|
+
with_memcached do
|
30
|
+
cacher = IncludedInMemcached.new('localhost:11211')
|
31
|
+
it_should_prevent_cycles_with(cacher)
|
32
|
+
end
|
47
33
|
end
|
48
34
|
|
49
35
|
it 'should prevent cycles with an Array' do
|
@@ -129,15 +115,12 @@ describe 'SpiderInstance' do
|
|
129
115
|
u = 'http://localhost:8888?s=1'
|
130
116
|
u_p = URI.parse(u)
|
131
117
|
@block_called = false
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
si.get_page(u_p) do
|
138
|
-
@block_called = true
|
118
|
+
with_web_server(QueryServlet) do
|
119
|
+
si = SpiderInstance.new({nil => [u]})
|
120
|
+
si.get_page(u_p) do
|
121
|
+
@block_called = true
|
122
|
+
end
|
139
123
|
end
|
140
|
-
server.shutdown
|
141
124
|
@block_called.should be_true
|
142
125
|
end
|
143
126
|
|
@@ -413,15 +396,10 @@ describe 'SpiderInstance' do
|
|
413
396
|
u2 = 'http://localhost:8888/foo'
|
414
397
|
u_p2 = URI.parse(u2)
|
415
398
|
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
si = SpiderInstance.new(nil => [u])
|
422
|
-
si.check_already_seen_with cacher
|
423
|
-
si.start!
|
424
|
-
|
425
|
-
server.shutdown
|
399
|
+
with_web_server(LoopingServlet) do
|
400
|
+
si = SpiderInstance.new(nil => [u])
|
401
|
+
si.check_already_seen_with cacher
|
402
|
+
si.start!
|
403
|
+
end
|
426
404
|
end
|
427
405
|
end
|
data/spec/spider_spec.rb
CHANGED
@@ -1,10 +1,33 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
require File.dirname(__FILE__)+'/../lib/spider'
|
1
|
+
require File.dirname(__FILE__)+'/spec_helper'
|
2
|
+
local_require 'spider', 'spider/included_in_memcached'
|
4
3
|
|
5
4
|
describe 'Spider' do
|
6
|
-
it 'should
|
7
|
-
|
8
|
-
|
5
|
+
it 'should find two pages without cycles using defaults' do
|
6
|
+
u = []
|
7
|
+
with_web_server(LoopingServlet) do
|
8
|
+
u = find_pages_with_static_server
|
9
|
+
end
|
10
|
+
u.should be_static_server_pages
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should find two pages without cycles using memcached' do
|
14
|
+
u = []
|
15
|
+
with_web_server(LoopingServlet) do
|
16
|
+
with_memcached do
|
17
|
+
u = find_pages_with_static_server do |s|
|
18
|
+
s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
u.should be_static_server_pages
|
23
|
+
end
|
24
|
+
|
25
|
+
def find_pages_with_static_server(&block)
|
26
|
+
pages = []
|
27
|
+
Spider.start_at('http://localhost:8888/') do |s|
|
28
|
+
block.call(s) unless block.nil?
|
29
|
+
s.on(:every){ |u,r,p| pages << u }
|
30
|
+
end
|
31
|
+
pages
|
9
32
|
end
|
10
33
|
end
|
data/spider.gemspec
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: spider
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.0
|
7
|
-
date: 2007-11-
|
6
|
+
version: 0.4.1
|
7
|
+
date: 2007-11-10 00:00:00 -05:00
|
8
8
|
summary: A Web spidering library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,8 +34,9 @@ files:
|
|
34
34
|
- doc/files
|
35
35
|
- doc/files/lib
|
36
36
|
- doc/files/lib/spider_rb.html
|
37
|
-
- doc/files/lib/
|
38
|
-
- doc/files/lib/
|
37
|
+
- doc/files/lib/spider
|
38
|
+
- doc/files/lib/spider/spider_instance_rb.html
|
39
|
+
- doc/files/lib/spider/included_in_memcached_rb.html
|
39
40
|
- doc/files/README.html
|
40
41
|
- doc/classes
|
41
42
|
- doc/classes/IncludedInMemcached.html
|
@@ -47,26 +48,20 @@ files:
|
|
47
48
|
- doc/index.html
|
48
49
|
- doc/created.rid
|
49
50
|
- spec
|
51
|
+
- spec/spider
|
52
|
+
- spec/spider/included_in_memcached_spec.rb
|
53
|
+
- spec/spider/spider_instance_spec.rb
|
50
54
|
- spec/spider_spec.rb
|
51
|
-
- spec/included_in_memcached_spec.rb
|
52
|
-
- spec/spider_instance_spec.rb
|
55
|
+
- spec/spec_helper.rb
|
53
56
|
- README
|
54
57
|
- spider.gemspec
|
55
58
|
- CHANGES
|
56
59
|
- lib
|
57
60
|
- lib/spider.rb
|
58
|
-
- lib/
|
59
|
-
- lib/
|
60
|
-
- lib/
|
61
|
-
-
|
62
|
-
- test_server/server1
|
63
|
-
- test_server/server1/page1.html
|
64
|
-
- test_server/server1/page2.html
|
65
|
-
- test_server/server2
|
66
|
-
- test_server/server2/page1.html
|
67
|
-
- test_server/server2/page2.html
|
68
|
-
- test_server/servers.rb
|
69
|
-
- test_server/client.rb
|
61
|
+
- lib/spider
|
62
|
+
- lib/spider/included_in_memcached.rb
|
63
|
+
- lib/spider/robot_rules.rb
|
64
|
+
- lib/spider/spider_instance.rb
|
70
65
|
test_files: []
|
71
66
|
|
72
67
|
rdoc_options: []
|
@@ -1,22 +0,0 @@
|
|
1
|
-
require 'memcache'
|
2
|
-
|
3
|
-
# A specialized class using memcached to track items stored. It supports
|
4
|
-
# three operations: new, <<, and include? . Together these can be used to
|
5
|
-
# add items to the memcache, then determine whether the item has been added.
|
6
|
-
class IncludedInMemcached
|
7
|
-
# Construct a new IncludedInMemcached instance. All arguments here are
|
8
|
-
# passed to MemCache (part of the memcache-client gem).
|
9
|
-
def initialize(*a)
|
10
|
-
@c = MemCache.new(*a)
|
11
|
-
end
|
12
|
-
|
13
|
-
# Add an item to the memcache.
|
14
|
-
def <<(v)
|
15
|
-
@c.add(v.to_s, v)
|
16
|
-
end
|
17
|
-
|
18
|
-
# True if the item is in the memcache.
|
19
|
-
def include?(v)
|
20
|
-
@c.get(v.to_s) == v
|
21
|
-
end
|
22
|
-
end
|
data/test_server/client.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby -w
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'spider'
|
5
|
-
|
6
|
-
Spider.start_at('http://localhost:8880/page1.html') do |s|
|
7
|
-
s.add_url_check do |a_url|
|
8
|
-
a_url =~ %r{^http://localhost:8880.*}
|
9
|
-
end
|
10
|
-
|
11
|
-
s.on 404 do |a_url, resp, prior|
|
12
|
-
puts "URL not found: #{a_url}"
|
13
|
-
end
|
14
|
-
|
15
|
-
s.on :success do |a_url, resp, prior|
|
16
|
-
puts "body: #{resp.body}"
|
17
|
-
end
|
18
|
-
|
19
|
-
s.on :any do |a_url, resp, prior|
|
20
|
-
puts "URL returned anything: #{a_url} with this code #{resp.code}"
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
%w(INT TERM).each do |signal|
|
25
|
-
trap(signal) { exit }
|
26
|
-
end
|
@@ -1 +0,0 @@
|
|
1
|
-
<a href="page2.html">See page two!</a>
|
@@ -1 +0,0 @@
|
|
1
|
-
<a href="page2.html">See page two!</a>
|
data/test_server/servers.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
#!/usr/local/bin/ruby -w
|
2
|
-
# Two Web servers, on different ports of localhost, serving two pages each.
|
3
|
-
# One page links to the next; the next links to both the first and to the the
|
4
|
-
# first on the other server.
|
5
|
-
# This is used to test: cycles, domain restrictions.
|
6
|
-
|
7
|
-
require 'webrick'
|
8
|
-
|
9
|
-
server1 = WEBrick::HTTPServer.new(:Port => 8880)
|
10
|
-
server1.mount('/', WEBrick::HTTPServlet::FileHandler,
|
11
|
-
File.dirname(__FILE__)+'/server1')
|
12
|
-
server2 = WEBrick::HTTPServer.new(:Port => 8881)
|
13
|
-
server2.mount('/', WEBrick::HTTPServlet::FileHandler,
|
14
|
-
File.dirname(__FILE__)+'/server2')
|
15
|
-
|
16
|
-
%w(INT TERM).each do |signal|
|
17
|
-
trap(signal) { [server1,server2].each { |server| server.shutdown } }
|
18
|
-
end
|
19
|
-
|
20
|
-
threads = []
|
21
|
-
[server1,server2].each do |server|
|
22
|
-
threads << Thread.new { server.start }
|
23
|
-
end
|
24
|
-
threads.each { |t| t.join }
|