spider 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
data/CHANGES
CHANGED
data/README
CHANGED
@@ -1,41 +1,114 @@
|
|
1
1
|
Spider, a Web spidering library for Ruby. It handles the robots.txt,
|
2
2
|
scraping, collecting, and looping so that you can just handle the data.
|
3
3
|
|
4
|
-
==
|
4
|
+
== Examples
|
5
5
|
|
6
|
+
=== Crawl the Web, loading each page in turn, until you run out of memory
|
7
|
+
|
8
|
+
require 'spider'
|
9
|
+
Spider.start_at('http://mike-burns.com/') {}
|
10
|
+
|
11
|
+
=== To handle erroneous responses
|
12
|
+
|
13
|
+
require 'spider'
|
14
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
15
|
+
s.on :failure do |a_url, resp, prior_url|
|
16
|
+
puts "URL failed: #{a_url}"
|
17
|
+
puts " linked from #{prior_url}"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
=== Or handle successful responses
|
22
|
+
|
23
|
+
require 'spider'
|
24
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
25
|
+
s.on :success do |a_url, resp, prior_url|
|
26
|
+
puts "#{a_url}: #{resp.code}"
|
27
|
+
puts resp.body
|
28
|
+
puts
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
=== Limit to just one domain
|
33
|
+
|
34
|
+
require 'spider'
|
6
35
|
Spider.start_at('http://mike-burns.com/') do |s|
|
7
|
-
# Limit the pages to just this domain.
|
8
36
|
s.add_url_check do |a_url|
|
9
37
|
a_url =~ %r{^http://mike-burns.com.*}
|
10
38
|
end
|
39
|
+
end
|
11
40
|
|
12
|
-
|
13
|
-
s.on 404 do |a_url, resp, prior_url|
|
14
|
-
puts "URL not found: #{a_url}"
|
15
|
-
end
|
41
|
+
=== Pass headers to some requests
|
16
42
|
|
17
|
-
|
18
|
-
|
19
|
-
|
43
|
+
require 'spider'
|
44
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
45
|
+
s.setup do |a_url|
|
46
|
+
if a_url =~ %r{^http://.*wikipedia.*}
|
47
|
+
headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
48
|
+
end
|
20
49
|
end
|
50
|
+
end
|
21
51
|
|
22
|
-
|
23
|
-
|
24
|
-
|
52
|
+
=== Use memcached to track cycles
|
53
|
+
|
54
|
+
require 'spider'
|
55
|
+
require 'spider/included_in_memcached'
|
56
|
+
SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
|
57
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
58
|
+
s.check_already_seen_with IncludedInMemcached.new(SERVERS)
|
59
|
+
end
|
60
|
+
|
61
|
+
=== Track cycles with a custom object
|
62
|
+
|
63
|
+
require 'spider'
|
64
|
+
|
65
|
+
class ExpireLinks < Hash
|
66
|
+
def <<(v)
|
67
|
+
self[v] = Time.now
|
68
|
+
end
|
69
|
+
def include?(v)
|
70
|
+
self[v] && (Time.now + 86400) <= self[v]
|
25
71
|
end
|
26
72
|
end
|
27
73
|
|
74
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
75
|
+
s.check_already_seen_with ExpireLinks.new
|
76
|
+
end
|
77
|
+
|
78
|
+
=== Create a URL graph
|
79
|
+
|
80
|
+
require 'spider'
|
81
|
+
nodes = {}
|
82
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
83
|
+
s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
|
28
84
|
|
29
|
-
|
85
|
+
s.on(:every) do |a_url, resp, prior_url|
|
86
|
+
nodes[prior_url] ||= []
|
87
|
+
nodes[prior_url] << a_url
|
88
|
+
end
|
89
|
+
end
|
30
90
|
|
31
|
-
|
32
|
-
|
33
|
-
|
91
|
+
=== Use a proxy
|
92
|
+
|
93
|
+
require 'net/http_configuration'
|
94
|
+
require 'spider'
|
95
|
+
http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
|
96
|
+
:proxy_port => 8881)
|
97
|
+
http_conf.apply do
|
98
|
+
Spider.start_at('http://img.4chan.org/b/') do |s|
|
99
|
+
s.on(:success) do |a_url, resp, prior_url|
|
100
|
+
File.open(a_url.gsub('/',':'),'w') do |f|
|
101
|
+
f.write(resp.body)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
34
106
|
|
35
107
|
== Author
|
36
108
|
|
37
109
|
Mike Burns http://mike-burns.com mike@mike-burns.com
|
38
110
|
|
39
|
-
|
111
|
+
Help from Matt Horan and John Nagro.
|
112
|
+
|
40
113
|
With `robot_rules' from James Edward Gray II via
|
41
114
|
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
@@ -0,0 +1,217 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: IncludedInMemcached</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">IncludedInMemcached</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../files/lib/included_in_memcached_rb.html">
|
59
|
+
lib/included_in_memcached.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
Object
|
69
|
+
</td>
|
70
|
+
</tr>
|
71
|
+
</table>
|
72
|
+
</div>
|
73
|
+
<!-- banner header -->
|
74
|
+
|
75
|
+
<div id="bodyContent">
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<div id="contextContent">
|
80
|
+
|
81
|
+
<div id="description">
|
82
|
+
<p>
|
83
|
+
A specialized class using memcached to track items stored. It supports
|
84
|
+
three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
|
85
|
+
<<, and <a href="IncludedInMemcached.html#M000003">include?</a> .
|
86
|
+
Together these can be used to add items to the memcache, then determine
|
87
|
+
whether the item has been added.
|
88
|
+
</p>
|
89
|
+
|
90
|
+
</div>
|
91
|
+
|
92
|
+
|
93
|
+
</div>
|
94
|
+
|
95
|
+
<div id="method-list">
|
96
|
+
<h3 class="section-bar">Methods</h3>
|
97
|
+
|
98
|
+
<div class="name-list">
|
99
|
+
<a href="#M000002"><<</a>
|
100
|
+
<a href="#M000003">include?</a>
|
101
|
+
<a href="#M000001">new</a>
|
102
|
+
</div>
|
103
|
+
</div>
|
104
|
+
|
105
|
+
</div>
|
106
|
+
|
107
|
+
|
108
|
+
<!-- if includes -->
|
109
|
+
|
110
|
+
<div id="section">
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
<!-- if method_list -->
|
120
|
+
<div id="methods">
|
121
|
+
<h3 class="section-bar">Public Class methods</h3>
|
122
|
+
|
123
|
+
<div id="method-M000001" class="method-detail">
|
124
|
+
<a name="M000001"></a>
|
125
|
+
|
126
|
+
<div class="method-heading">
|
127
|
+
<a href="#M000001" class="method-signature">
|
128
|
+
<span class="method-name">new</span><span class="method-args">(*a)</span>
|
129
|
+
</a>
|
130
|
+
</div>
|
131
|
+
|
132
|
+
<div class="method-description">
|
133
|
+
<p>
|
134
|
+
Construct a <a href="IncludedInMemcached.html#M000001">new</a> <a
|
135
|
+
href="IncludedInMemcached.html">IncludedInMemcached</a> instance. All
|
136
|
+
arguments here are passed to MemCache (part of the memcache-client gem).
|
137
|
+
</p>
|
138
|
+
<p><a class="source-toggle" href="#"
|
139
|
+
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
140
|
+
<div class="method-source-code" id="M000001-source">
|
141
|
+
<pre>
|
142
|
+
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 9</span>
|
143
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
144
|
+
<span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
145
|
+
<span class="ruby-keyword kw">end</span>
|
146
|
+
</pre>
|
147
|
+
</div>
|
148
|
+
</div>
|
149
|
+
</div>
|
150
|
+
|
151
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
152
|
+
|
153
|
+
<div id="method-M000002" class="method-detail">
|
154
|
+
<a name="M000002"></a>
|
155
|
+
|
156
|
+
<div class="method-heading">
|
157
|
+
<a href="#M000002" class="method-signature">
|
158
|
+
<span class="method-name"><<</span><span class="method-args">(v)</span>
|
159
|
+
</a>
|
160
|
+
</div>
|
161
|
+
|
162
|
+
<div class="method-description">
|
163
|
+
<p>
|
164
|
+
Add an item to the memcache.
|
165
|
+
</p>
|
166
|
+
<p><a class="source-toggle" href="#"
|
167
|
+
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
168
|
+
<div class="method-source-code" id="M000002-source">
|
169
|
+
<pre>
|
170
|
+
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 14</span>
|
171
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-operator"><<</span>(<span class="ruby-identifier">v</span>)
|
172
|
+
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
|
173
|
+
<span class="ruby-keyword kw">end</span>
|
174
|
+
</pre>
|
175
|
+
</div>
|
176
|
+
</div>
|
177
|
+
</div>
|
178
|
+
|
179
|
+
<div id="method-M000003" class="method-detail">
|
180
|
+
<a name="M000003"></a>
|
181
|
+
|
182
|
+
<div class="method-heading">
|
183
|
+
<a href="#M000003" class="method-signature">
|
184
|
+
<span class="method-name">include?</span><span class="method-args">(v)</span>
|
185
|
+
</a>
|
186
|
+
</div>
|
187
|
+
|
188
|
+
<div class="method-description">
|
189
|
+
<p>
|
190
|
+
True if the item is in the memcache.
|
191
|
+
</p>
|
192
|
+
<p><a class="source-toggle" href="#"
|
193
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
194
|
+
<div class="method-source-code" id="M000003-source">
|
195
|
+
<pre>
|
196
|
+
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 19</span>
|
197
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
|
198
|
+
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
|
199
|
+
<span class="ruby-keyword kw">end</span>
|
200
|
+
</pre>
|
201
|
+
</div>
|
202
|
+
</div>
|
203
|
+
</div>
|
204
|
+
|
205
|
+
|
206
|
+
</div>
|
207
|
+
|
208
|
+
|
209
|
+
</div>
|
210
|
+
|
211
|
+
|
212
|
+
<div id="validator-badges">
|
213
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
214
|
+
</div>
|
215
|
+
|
216
|
+
</body>
|
217
|
+
</html>
|
data/doc/classes/Spider.html
CHANGED
@@ -93,7 +93,7 @@ links, and doing it all over again.
|
|
93
93
|
<h3 class="section-bar">Methods</h3>
|
94
94
|
|
95
95
|
<div class="name-list">
|
96
|
-
<a href="#
|
96
|
+
<a href="#M000011">start_at</a>
|
97
97
|
</div>
|
98
98
|
</div>
|
99
99
|
|
@@ -115,11 +115,11 @@ links, and doing it all over again.
|
|
115
115
|
<div id="methods">
|
116
116
|
<h3 class="section-bar">Public Class methods</h3>
|
117
117
|
|
118
|
-
<div id="method-
|
119
|
-
<a name="
|
118
|
+
<div id="method-M000011" class="method-detail">
|
119
|
+
<a name="M000011"></a>
|
120
120
|
|
121
121
|
<div class="method-heading">
|
122
|
-
<a href="#
|
122
|
+
<a href="#M000011" class="method-signature">
|
123
123
|
<span class="method-name">start_at</span><span class="method-args">(a_url, &block)</span>
|
124
124
|
</a>
|
125
125
|
</div>
|
@@ -128,7 +128,9 @@ links, and doing it all over again.
|
|
128
128
|
<p>
|
129
129
|
Runs the spider starting at the given URL. Also takes a block that is given
|
130
130
|
the <a href="SpiderInstance.html">SpiderInstance</a>. Use the block to
|
131
|
-
define the rules and handlers for the discovered Web pages.
|
131
|
+
define the rules and handlers for the discovered Web pages. See <a
|
132
|
+
href="SpiderInstance.html">SpiderInstance</a> for the possible rules and
|
133
|
+
handlers.
|
132
134
|
</p>
|
133
135
|
<pre>
|
134
136
|
Spider.start_at('http://mike-burns.com/') do |s|
|
@@ -150,10 +152,10 @@ define the rules and handlers for the discovered Web pages.
|
|
150
152
|
end
|
151
153
|
</pre>
|
152
154
|
<p><a class="source-toggle" href="#"
|
153
|
-
onclick="toggleCode('
|
154
|
-
<div class="method-source-code" id="
|
155
|
+
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
156
|
+
<div class="method-source-code" id="M000011-source">
|
155
157
|
<pre>
|
156
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
158
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 54</span>
|
157
159
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
158
160
|
<span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
|
159
161
|
<span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>({<span class="ruby-keyword kw">nil</span> =<span class="ruby-operator">></span> <span class="ruby-identifier">a_url</span>}, [], <span class="ruby-identifier">rules</span>, [])
|
@@ -55,8 +55,8 @@
|
|
55
55
|
<tr class="top-aligned-row">
|
56
56
|
<td><strong>In:</strong></td>
|
57
57
|
<td>
|
58
|
-
<a href="../files/lib/
|
59
|
-
lib/
|
58
|
+
<a href="../files/lib/spider_instance_rb.html">
|
59
|
+
lib/spider_instance.rb
|
60
60
|
</a>
|
61
61
|
<br />
|
62
62
|
</td>
|
@@ -86,12 +86,13 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
94
|
-
<a href="#
|
89
|
+
<a href="#M000004">add_url_check</a>
|
90
|
+
<a href="#M000005">check_already_seen_with</a>
|
91
|
+
<a href="#M000010">clear_headers</a>
|
92
|
+
<a href="#M000009">headers</a>
|
93
|
+
<a href="#M000006">on</a>
|
94
|
+
<a href="#M000007">setup</a>
|
95
|
+
<a href="#M000008">teardown</a>
|
95
96
|
</div>
|
96
97
|
</div>
|
97
98
|
|
@@ -113,11 +114,11 @@
|
|
113
114
|
<div id="methods">
|
114
115
|
<h3 class="section-bar">Public Instance methods</h3>
|
115
116
|
|
116
|
-
<div id="method-
|
117
|
-
<a name="
|
117
|
+
<div id="method-M000004" class="method-detail">
|
118
|
+
<a name="M000004"></a>
|
118
119
|
|
119
120
|
<div class="method-heading">
|
120
|
-
<a href="#
|
121
|
+
<a href="#M000004" class="method-signature">
|
121
122
|
<span class="method-name">add_url_check</span><span class="method-args">(&block)</span>
|
122
123
|
</a>
|
123
124
|
</div>
|
@@ -136,10 +137,10 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
136
137
|
add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
|
137
138
|
</pre>
|
138
139
|
<p><a class="source-toggle" href="#"
|
139
|
-
onclick="toggleCode('
|
140
|
-
<div class="method-source-code" id="
|
140
|
+
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
141
|
+
<div class="method-source-code" id="M000004-source">
|
141
142
|
<pre>
|
142
|
-
<span class="ruby-comment cmt"># File lib/
|
143
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 69</span>
|
143
144
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
144
145
|
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
145
146
|
<span class="ruby-keyword kw">end</span>
|
@@ -148,24 +149,74 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
148
149
|
</div>
|
149
150
|
</div>
|
150
151
|
|
151
|
-
<div id="method-
|
152
|
-
<a name="
|
152
|
+
<div id="method-M000005" class="method-detail">
|
153
|
+
<a name="M000005"></a>
|
153
154
|
|
154
155
|
<div class="method-heading">
|
155
|
-
<a href="#
|
156
|
+
<a href="#M000005" class="method-signature">
|
157
|
+
<span class="method-name">check_already_seen_with</span><span class="method-args">(cacher)</span>
|
158
|
+
</a>
|
159
|
+
</div>
|
160
|
+
|
161
|
+
<div class="method-description">
|
162
|
+
<p>
|
163
|
+
The Web is a graph; to avoid cycles we store the nodes (URLs) already
|
164
|
+
visited. The Web is a really, really, really big graph; as such, this list
|
165
|
+
of visited nodes grows really, really, really big.
|
166
|
+
</p>
|
167
|
+
<p>
|
168
|
+
Change the object used to store these seen nodes with this. The default
|
169
|
+
object is an instance of Array. Available with <a
|
170
|
+
href="Spider.html">Spider</a> is a wrapper of memcached.
|
171
|
+
</p>
|
172
|
+
<p>
|
173
|
+
You can implement a custom class for this; any object passed to <a
|
174
|
+
href="SpiderInstance.html#M000005">check_already_seen_with</a> must
|
175
|
+
understand just << and include? .
|
176
|
+
</p>
|
177
|
+
<pre>
|
178
|
+
# default
|
179
|
+
check_already_seen_with Array.new
|
180
|
+
|
181
|
+
# memcached
|
182
|
+
require 'spider/included_in_memcached'
|
183
|
+
check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
184
|
+
</pre>
|
185
|
+
<p><a class="source-toggle" href="#"
|
186
|
+
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
187
|
+
<div class="method-source-code" id="M000005-source">
|
188
|
+
<pre>
|
189
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 90</span>
|
190
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
|
191
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:<<</span>) <span class="ruby-operator">&&</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
|
192
|
+
<span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
|
193
|
+
<span class="ruby-keyword kw">else</span>
|
194
|
+
<span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">'expected something that responds to << and included?'</span>
|
195
|
+
<span class="ruby-keyword kw">end</span>
|
196
|
+
<span class="ruby-keyword kw">end</span>
|
197
|
+
</pre>
|
198
|
+
</div>
|
199
|
+
</div>
|
200
|
+
</div>
|
201
|
+
|
202
|
+
<div id="method-M000010" class="method-detail">
|
203
|
+
<a name="M000010"></a>
|
204
|
+
|
205
|
+
<div class="method-heading">
|
206
|
+
<a href="#M000010" class="method-signature">
|
156
207
|
<span class="method-name">clear_headers</span><span class="method-args">()</span>
|
157
208
|
</a>
|
158
209
|
</div>
|
159
210
|
|
160
211
|
<div class="method-description">
|
161
212
|
<p>
|
162
|
-
Reset the <a href="SpiderInstance.html#
|
213
|
+
Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
|
163
214
|
</p>
|
164
215
|
<p><a class="source-toggle" href="#"
|
165
|
-
onclick="toggleCode('
|
166
|
-
<div class="method-source-code" id="
|
216
|
+
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
217
|
+
<div class="method-source-code" id="M000010-source">
|
167
218
|
<pre>
|
168
|
-
<span class="ruby-comment cmt"># File lib/
|
219
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 157</span>
|
169
220
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
|
170
221
|
<span class="ruby-ivar">@headers</span> = {}
|
171
222
|
<span class="ruby-keyword kw">end</span>
|
@@ -174,11 +225,11 @@ Reset the <a href="SpiderInstance.html#M000005">headers</a> hash.
|
|
174
225
|
</div>
|
175
226
|
</div>
|
176
227
|
|
177
|
-
<div id="method-
|
178
|
-
<a name="
|
228
|
+
<div id="method-M000009" class="method-detail">
|
229
|
+
<a name="M000009"></a>
|
179
230
|
|
180
231
|
<div class="method-heading">
|
181
|
-
<a href="#
|
232
|
+
<a href="#M000009" class="method-signature">
|
182
233
|
<span class="method-name">headers</span><span class="method-args">()</span>
|
183
234
|
</a>
|
184
235
|
</div>
|
@@ -191,10 +242,10 @@ Use like a hash:
|
|
191
242
|
headers['Cookies'] = 'user_id=1;password=btrross3'
|
192
243
|
</pre>
|
193
244
|
<p><a class="source-toggle" href="#"
|
194
|
-
onclick="toggleCode('
|
195
|
-
<div class="method-source-code" id="
|
245
|
+
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
246
|
+
<div class="method-source-code" id="M000009-source">
|
196
247
|
<pre>
|
197
|
-
<span class="ruby-comment cmt"># File lib/
|
248
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 145</span>
|
198
249
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
|
199
250
|
<span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
|
200
251
|
<span class="ruby-keyword kw">end</span>
|
@@ -203,11 +254,11 @@ Use like a hash:
|
|
203
254
|
</div>
|
204
255
|
</div>
|
205
256
|
|
206
|
-
<div id="method-
|
207
|
-
<a name="
|
257
|
+
<div id="method-M000006" class="method-detail">
|
258
|
+
<a name="M000006"></a>
|
208
259
|
|
209
260
|
<div class="method-heading">
|
210
|
-
<a href="#
|
261
|
+
<a href="#M000006" class="method-signature">
|
211
262
|
<span class="method-name">on</span><span class="method-args">(code, p = nil, &block)</span>
|
212
263
|
</a>
|
213
264
|
</div>
|
@@ -240,10 +291,10 @@ For example:
|
|
240
291
|
end
|
241
292
|
</pre>
|
242
293
|
<p><a class="source-toggle" href="#"
|
243
|
-
onclick="toggleCode('
|
244
|
-
<div class="method-source-code" id="
|
294
|
+
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
295
|
+
<div class="method-source-code" id="M000006-source">
|
245
296
|
<pre>
|
246
|
-
<span class="ruby-comment cmt"># File lib/
|
297
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 120</span>
|
247
298
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
248
299
|
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
249
300
|
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
@@ -258,11 +309,11 @@ For example:
|
|
258
309
|
</div>
|
259
310
|
</div>
|
260
311
|
|
261
|
-
<div id="method-
|
262
|
-
<a name="
|
312
|
+
<div id="method-M000007" class="method-detail">
|
313
|
+
<a name="M000007"></a>
|
263
314
|
|
264
315
|
<div class="method-heading">
|
265
|
-
<a href="#
|
316
|
+
<a href="#M000007" class="method-signature">
|
266
317
|
<span class="method-name">setup</span><span class="method-args">(p = nil, &block)</span>
|
267
318
|
</a>
|
268
319
|
</div>
|
@@ -277,10 +328,10 @@ Run before the HTTP request. Given the URL as a string.
|
|
277
328
|
end
|
278
329
|
</pre>
|
279
330
|
<p><a class="source-toggle" href="#"
|
280
|
-
onclick="toggleCode('
|
281
|
-
<div class="method-source-code" id="
|
331
|
+
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
332
|
+
<div class="method-source-code" id="M000007-source">
|
282
333
|
<pre>
|
283
|
-
<span class="ruby-comment cmt"># File lib/
|
334
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 134</span>
|
284
335
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
285
336
|
<span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
286
337
|
<span class="ruby-keyword kw">end</span>
|
@@ -289,11 +340,11 @@ Run before the HTTP request. Given the URL as a string.
|
|
289
340
|
</div>
|
290
341
|
</div>
|
291
342
|
|
292
|
-
<div id="method-
|
293
|
-
<a name="
|
343
|
+
<div id="method-M000008" class="method-detail">
|
344
|
+
<a name="M000008"></a>
|
294
345
|
|
295
346
|
<div class="method-heading">
|
296
|
-
<a href="#
|
347
|
+
<a href="#M000008" class="method-signature">
|
297
348
|
<span class="method-name">teardown</span><span class="method-args">(p = nil, &block)</span>
|
298
349
|
</a>
|
299
350
|
</div>
|
@@ -303,10 +354,10 @@ Run before the HTTP request. Given the URL as a string.
|
|
303
354
|
Run last, once for each page. Given the URL as a string.
|
304
355
|
</p>
|
305
356
|
<p><a class="source-toggle" href="#"
|
306
|
-
onclick="toggleCode('
|
307
|
-
<div class="method-source-code" id="
|
357
|
+
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
358
|
+
<div class="method-source-code" id="M000008-source">
|
308
359
|
<pre>
|
309
|
-
<span class="ruby-comment cmt"># File lib/
|
360
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 139</span>
|
310
361
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
311
362
|
<span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
312
363
|
<span class="ruby-keyword kw">end</span>
|