spider 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
data/CHANGES
CHANGED
data/README
CHANGED
@@ -1,41 +1,114 @@
|
|
1
1
|
Spider, a Web spidering library for Ruby. It handles the robots.txt,
|
2
2
|
scraping, collecting, and looping so that you can just handle the data.
|
3
3
|
|
4
|
-
==
|
4
|
+
== Examples
|
5
5
|
|
6
|
+
=== Crawl the Web, loading each page in turn, until you run out of memory
|
7
|
+
|
8
|
+
require 'spider'
|
9
|
+
Spider.start_at('http://mike-burns.com/') {}
|
10
|
+
|
11
|
+
=== To handle erroneous responses
|
12
|
+
|
13
|
+
require 'spider'
|
14
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
15
|
+
s.on :failure do |a_url, resp, prior_url|
|
16
|
+
puts "URL failed: #{a_url}"
|
17
|
+
puts " linked from #{prior_url}"
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
=== Or handle successful responses
|
22
|
+
|
23
|
+
require 'spider'
|
24
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
25
|
+
s.on :success do |a_url, resp, prior_url|
|
26
|
+
puts "#{a_url}: #{resp.code}"
|
27
|
+
puts resp.body
|
28
|
+
puts
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
=== Limit to just one domain
|
33
|
+
|
34
|
+
require 'spider'
|
6
35
|
Spider.start_at('http://mike-burns.com/') do |s|
|
7
|
-
# Limit the pages to just this domain.
|
8
36
|
s.add_url_check do |a_url|
|
9
37
|
a_url =~ %r{^http://mike-burns.com.*}
|
10
38
|
end
|
39
|
+
end
|
11
40
|
|
12
|
-
|
13
|
-
s.on 404 do |a_url, resp, prior_url|
|
14
|
-
puts "URL not found: #{a_url}"
|
15
|
-
end
|
41
|
+
=== Pass headers to some requests
|
16
42
|
|
17
|
-
|
18
|
-
|
19
|
-
|
43
|
+
require 'spider'
|
44
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
45
|
+
s.setup do |a_url|
|
46
|
+
if a_url =~ %r{^http://.*wikipedia.*}
|
47
|
+
headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
48
|
+
end
|
20
49
|
end
|
50
|
+
end
|
21
51
|
|
22
|
-
|
23
|
-
|
24
|
-
|
52
|
+
=== Use memcached to track cycles
|
53
|
+
|
54
|
+
require 'spider'
|
55
|
+
require 'spider/included_in_memcached'
|
56
|
+
SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
|
57
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
58
|
+
s.check_already_seen_with IncludedInMemcached.new(SERVERS)
|
59
|
+
end
|
60
|
+
|
61
|
+
=== Track cycles with a custom object
|
62
|
+
|
63
|
+
require 'spider'
|
64
|
+
|
65
|
+
class ExpireLinks < Hash
|
66
|
+
def <<(v)
|
67
|
+
self[v] = Time.now
|
68
|
+
end
|
69
|
+
def include?(v)
|
70
|
+
self[v] && Time.now <= self[v] + 86400
|
25
71
|
end
|
26
72
|
end
|
27
73
|
|
74
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
75
|
+
s.check_already_seen_with ExpireLinks.new
|
76
|
+
end
|
77
|
+
|
78
|
+
=== Create a URL graph
|
79
|
+
|
80
|
+
require 'spider'
|
81
|
+
nodes = {}
|
82
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
83
|
+
s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
|
28
84
|
|
29
|
-
|
85
|
+
s.on(:every) do |a_url, resp, prior_url|
|
86
|
+
nodes[prior_url] ||= []
|
87
|
+
nodes[prior_url] << a_url
|
88
|
+
end
|
89
|
+
end
|
30
90
|
|
31
|
-
|
32
|
-
|
33
|
-
|
91
|
+
=== Use a proxy
|
92
|
+
|
93
|
+
require 'net/http_configuration'
|
94
|
+
require 'spider'
|
95
|
+
http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
|
96
|
+
:proxy_port => 8881)
|
97
|
+
http_conf.apply do
|
98
|
+
Spider.start_at('http://img.4chan.org/b/') do |s|
|
99
|
+
s.on(:success) do |a_url, resp, prior_url|
|
100
|
+
File.open(a_url.gsub('/',':'),'w') do |f|
|
101
|
+
f.write(resp.body)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
34
106
|
|
35
107
|
== Author
|
36
108
|
|
37
109
|
Mike Burns http://mike-burns.com mike@mike-burns.com
|
38
110
|
|
39
|
-
|
111
|
+
Help from Matt Horan and John Nagro.
|
112
|
+
|
40
113
|
With `robot_rules' from James Edward Gray II via
|
41
114
|
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
@@ -0,0 +1,217 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: IncludedInMemcached</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">IncludedInMemcached</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../files/lib/included_in_memcached_rb.html">
|
59
|
+
lib/included_in_memcached.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
Object
|
69
|
+
</td>
|
70
|
+
</tr>
|
71
|
+
</table>
|
72
|
+
</div>
|
73
|
+
<!-- banner header -->
|
74
|
+
|
75
|
+
<div id="bodyContent">
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<div id="contextContent">
|
80
|
+
|
81
|
+
<div id="description">
|
82
|
+
<p>
|
83
|
+
A specialized class using memcached to track items stored. It supports
|
84
|
+
three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
|
85
|
+
<<, and <a href="IncludedInMemcached.html#M000003">include?</a> .
|
86
|
+
Together these can be used to add items to the memcache, then determine
|
87
|
+
whether the item has been added.
|
88
|
+
</p>
|
89
|
+
|
90
|
+
</div>
|
91
|
+
|
92
|
+
|
93
|
+
</div>
|
94
|
+
|
95
|
+
<div id="method-list">
|
96
|
+
<h3 class="section-bar">Methods</h3>
|
97
|
+
|
98
|
+
<div class="name-list">
|
99
|
+
<a href="#M000002"><<</a>
|
100
|
+
<a href="#M000003">include?</a>
|
101
|
+
<a href="#M000001">new</a>
|
102
|
+
</div>
|
103
|
+
</div>
|
104
|
+
|
105
|
+
</div>
|
106
|
+
|
107
|
+
|
108
|
+
<!-- if includes -->
|
109
|
+
|
110
|
+
<div id="section">
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
<!-- if method_list -->
|
120
|
+
<div id="methods">
|
121
|
+
<h3 class="section-bar">Public Class methods</h3>
|
122
|
+
|
123
|
+
<div id="method-M000001" class="method-detail">
|
124
|
+
<a name="M000001"></a>
|
125
|
+
|
126
|
+
<div class="method-heading">
|
127
|
+
<a href="#M000001" class="method-signature">
|
128
|
+
<span class="method-name">new</span><span class="method-args">(*a)</span>
|
129
|
+
</a>
|
130
|
+
</div>
|
131
|
+
|
132
|
+
<div class="method-description">
|
133
|
+
<p>
|
134
|
+
Construct a <a href="IncludedInMemcached.html#M000001">new</a> <a
|
135
|
+
href="IncludedInMemcached.html">IncludedInMemcached</a> instance. All
|
136
|
+
arguments here are passed to MemCache (part of the memcache-client gem).
|
137
|
+
</p>
|
138
|
+
<p><a class="source-toggle" href="#"
|
139
|
+
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
140
|
+
<div class="method-source-code" id="M000001-source">
|
141
|
+
<pre>
|
142
|
+
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 9</span>
|
143
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
144
|
+
<span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
|
145
|
+
<span class="ruby-keyword kw">end</span>
|
146
|
+
</pre>
|
147
|
+
</div>
|
148
|
+
</div>
|
149
|
+
</div>
|
150
|
+
|
151
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
152
|
+
|
153
|
+
<div id="method-M000002" class="method-detail">
|
154
|
+
<a name="M000002"></a>
|
155
|
+
|
156
|
+
<div class="method-heading">
|
157
|
+
<a href="#M000002" class="method-signature">
|
158
|
+
<span class="method-name"><<</span><span class="method-args">(v)</span>
|
159
|
+
</a>
|
160
|
+
</div>
|
161
|
+
|
162
|
+
<div class="method-description">
|
163
|
+
<p>
|
164
|
+
Add an item to the memcache.
|
165
|
+
</p>
|
166
|
+
<p><a class="source-toggle" href="#"
|
167
|
+
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
168
|
+
<div class="method-source-code" id="M000002-source">
|
169
|
+
<pre>
|
170
|
+
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 14</span>
|
171
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-operator"><<</span>(<span class="ruby-identifier">v</span>)
|
172
|
+
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
|
173
|
+
<span class="ruby-keyword kw">end</span>
|
174
|
+
</pre>
|
175
|
+
</div>
|
176
|
+
</div>
|
177
|
+
</div>
|
178
|
+
|
179
|
+
<div id="method-M000003" class="method-detail">
|
180
|
+
<a name="M000003"></a>
|
181
|
+
|
182
|
+
<div class="method-heading">
|
183
|
+
<a href="#M000003" class="method-signature">
|
184
|
+
<span class="method-name">include?</span><span class="method-args">(v)</span>
|
185
|
+
</a>
|
186
|
+
</div>
|
187
|
+
|
188
|
+
<div class="method-description">
|
189
|
+
<p>
|
190
|
+
True if the item is in the memcache.
|
191
|
+
</p>
|
192
|
+
<p><a class="source-toggle" href="#"
|
193
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
194
|
+
<div class="method-source-code" id="M000003-source">
|
195
|
+
<pre>
|
196
|
+
<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 19</span>
|
197
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
|
198
|
+
<span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
|
199
|
+
<span class="ruby-keyword kw">end</span>
|
200
|
+
</pre>
|
201
|
+
</div>
|
202
|
+
</div>
|
203
|
+
</div>
|
204
|
+
|
205
|
+
|
206
|
+
</div>
|
207
|
+
|
208
|
+
|
209
|
+
</div>
|
210
|
+
|
211
|
+
|
212
|
+
<div id="validator-badges">
|
213
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
214
|
+
</div>
|
215
|
+
|
216
|
+
</body>
|
217
|
+
</html>
|
data/doc/classes/Spider.html
CHANGED
@@ -93,7 +93,7 @@ links, and doing it all over again.
|
|
93
93
|
<h3 class="section-bar">Methods</h3>
|
94
94
|
|
95
95
|
<div class="name-list">
|
96
|
-
<a href="#
|
96
|
+
<a href="#M000011">start_at</a>
|
97
97
|
</div>
|
98
98
|
</div>
|
99
99
|
|
@@ -115,11 +115,11 @@ links, and doing it all over again.
|
|
115
115
|
<div id="methods">
|
116
116
|
<h3 class="section-bar">Public Class methods</h3>
|
117
117
|
|
118
|
-
<div id="method-
|
119
|
-
<a name="
|
118
|
+
<div id="method-M000011" class="method-detail">
|
119
|
+
<a name="M000011"></a>
|
120
120
|
|
121
121
|
<div class="method-heading">
|
122
|
-
<a href="#
|
122
|
+
<a href="#M000011" class="method-signature">
|
123
123
|
<span class="method-name">start_at</span><span class="method-args">(a_url, &block)</span>
|
124
124
|
</a>
|
125
125
|
</div>
|
@@ -128,7 +128,9 @@ links, and doing it all over again.
|
|
128
128
|
<p>
|
129
129
|
Runs the spider starting at the given URL. Also takes a block that is given
|
130
130
|
the <a href="SpiderInstance.html">SpiderInstance</a>. Use the block to
|
131
|
-
define the rules and handlers for the discovered Web pages.
|
131
|
+
define the rules and handlers for the discovered Web pages. See <a
|
132
|
+
href="SpiderInstance.html">SpiderInstance</a> for the possible rules and
|
133
|
+
handlers.
|
132
134
|
</p>
|
133
135
|
<pre>
|
134
136
|
Spider.start_at('http://mike-burns.com/') do |s|
|
@@ -150,10 +152,10 @@ define the rules and handlers for the discovered Web pages.
|
|
150
152
|
end
|
151
153
|
</pre>
|
152
154
|
<p><a class="source-toggle" href="#"
|
153
|
-
onclick="toggleCode('
|
154
|
-
<div class="method-source-code" id="
|
155
|
+
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
156
|
+
<div class="method-source-code" id="M000011-source">
|
155
157
|
<pre>
|
156
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
158
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 54</span>
|
157
159
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
158
160
|
<span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
|
159
161
|
<span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>({<span class="ruby-keyword kw">nil</span> =<span class="ruby-operator">></span> <span class="ruby-identifier">a_url</span>}, [], <span class="ruby-identifier">rules</span>, [])
|
@@ -55,8 +55,8 @@
|
|
55
55
|
<tr class="top-aligned-row">
|
56
56
|
<td><strong>In:</strong></td>
|
57
57
|
<td>
|
58
|
-
<a href="../files/lib/
|
59
|
-
lib/
|
58
|
+
<a href="../files/lib/spider_instance_rb.html">
|
59
|
+
lib/spider_instance.rb
|
60
60
|
</a>
|
61
61
|
<br />
|
62
62
|
</td>
|
@@ -86,12 +86,13 @@
|
|
86
86
|
<h3 class="section-bar">Methods</h3>
|
87
87
|
|
88
88
|
<div class="name-list">
|
89
|
-
<a href="#
|
90
|
-
<a href="#
|
91
|
-
<a href="#
|
92
|
-
<a href="#
|
93
|
-
<a href="#
|
94
|
-
<a href="#
|
89
|
+
<a href="#M000004">add_url_check</a>
|
90
|
+
<a href="#M000005">check_already_seen_with</a>
|
91
|
+
<a href="#M000010">clear_headers</a>
|
92
|
+
<a href="#M000009">headers</a>
|
93
|
+
<a href="#M000006">on</a>
|
94
|
+
<a href="#M000007">setup</a>
|
95
|
+
<a href="#M000008">teardown</a>
|
95
96
|
</div>
|
96
97
|
</div>
|
97
98
|
|
@@ -113,11 +114,11 @@
|
|
113
114
|
<div id="methods">
|
114
115
|
<h3 class="section-bar">Public Instance methods</h3>
|
115
116
|
|
116
|
-
<div id="method-
|
117
|
-
<a name="
|
117
|
+
<div id="method-M000004" class="method-detail">
|
118
|
+
<a name="M000004"></a>
|
118
119
|
|
119
120
|
<div class="method-heading">
|
120
|
-
<a href="#
|
121
|
+
<a href="#M000004" class="method-signature">
|
121
122
|
<span class="method-name">add_url_check</span><span class="method-args">(&block)</span>
|
122
123
|
</a>
|
123
124
|
</div>
|
@@ -136,10 +137,10 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
136
137
|
add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
|
137
138
|
</pre>
|
138
139
|
<p><a class="source-toggle" href="#"
|
139
|
-
onclick="toggleCode('
|
140
|
-
<div class="method-source-code" id="
|
140
|
+
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
141
|
+
<div class="method-source-code" id="M000004-source">
|
141
142
|
<pre>
|
142
|
-
<span class="ruby-comment cmt"># File lib/
|
143
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 69</span>
|
143
144
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
144
145
|
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
145
146
|
<span class="ruby-keyword kw">end</span>
|
@@ -148,24 +149,74 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
148
149
|
</div>
|
149
150
|
</div>
|
150
151
|
|
151
|
-
<div id="method-
|
152
|
-
<a name="
|
152
|
+
<div id="method-M000005" class="method-detail">
|
153
|
+
<a name="M000005"></a>
|
153
154
|
|
154
155
|
<div class="method-heading">
|
155
|
-
<a href="#
|
156
|
+
<a href="#M000005" class="method-signature">
|
157
|
+
<span class="method-name">check_already_seen_with</span><span class="method-args">(cacher)</span>
|
158
|
+
</a>
|
159
|
+
</div>
|
160
|
+
|
161
|
+
<div class="method-description">
|
162
|
+
<p>
|
163
|
+
The Web is a graph; to avoid cycles we store the nodes (URLs) already
|
164
|
+
visited. The Web is a really, really, really big graph; as such, this list
|
165
|
+
of visited nodes grows really, really, really big.
|
166
|
+
</p>
|
167
|
+
<p>
|
168
|
+
Change the object used to store these seen nodes with this. The default
|
169
|
+
object is an instance of Array. Available with <a
|
170
|
+
href="Spider.html">Spider</a> is a wrapper of memcached.
|
171
|
+
</p>
|
172
|
+
<p>
|
173
|
+
You can implement a custom class for this; any object passed to <a
|
174
|
+
href="SpiderInstance.html#M000005">check_already_seen_with</a> must
|
175
|
+
understand just << and include? .
|
176
|
+
</p>
|
177
|
+
<pre>
|
178
|
+
# default
|
179
|
+
check_already_seen_with Array.new
|
180
|
+
|
181
|
+
# memcached
|
182
|
+
require 'spider/included_in_memcached'
|
183
|
+
check_already_seen_with IncludedInMemcached.new('localhost:11211')
|
184
|
+
</pre>
|
185
|
+
<p><a class="source-toggle" href="#"
|
186
|
+
onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
|
187
|
+
<div class="method-source-code" id="M000005-source">
|
188
|
+
<pre>
|
189
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 90</span>
|
190
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
|
191
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:<<</span>) <span class="ruby-operator">&&</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
|
192
|
+
<span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
|
193
|
+
<span class="ruby-keyword kw">else</span>
|
194
|
+
<span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">'expected something that responds to << and included?'</span>
|
195
|
+
<span class="ruby-keyword kw">end</span>
|
196
|
+
<span class="ruby-keyword kw">end</span>
|
197
|
+
</pre>
|
198
|
+
</div>
|
199
|
+
</div>
|
200
|
+
</div>
|
201
|
+
|
202
|
+
<div id="method-M000010" class="method-detail">
|
203
|
+
<a name="M000010"></a>
|
204
|
+
|
205
|
+
<div class="method-heading">
|
206
|
+
<a href="#M000010" class="method-signature">
|
156
207
|
<span class="method-name">clear_headers</span><span class="method-args">()</span>
|
157
208
|
</a>
|
158
209
|
</div>
|
159
210
|
|
160
211
|
<div class="method-description">
|
161
212
|
<p>
|
162
|
-
Reset the <a href="SpiderInstance.html#
|
213
|
+
Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
|
163
214
|
</p>
|
164
215
|
<p><a class="source-toggle" href="#"
|
165
|
-
onclick="toggleCode('
|
166
|
-
<div class="method-source-code" id="
|
216
|
+
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
217
|
+
<div class="method-source-code" id="M000010-source">
|
167
218
|
<pre>
|
168
|
-
<span class="ruby-comment cmt"># File lib/
|
219
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 157</span>
|
169
220
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
|
170
221
|
<span class="ruby-ivar">@headers</span> = {}
|
171
222
|
<span class="ruby-keyword kw">end</span>
|
@@ -174,11 +225,11 @@ Reset the <a href="SpiderInstance.html#M000005">headers</a> hash.
|
|
174
225
|
</div>
|
175
226
|
</div>
|
176
227
|
|
177
|
-
<div id="method-
|
178
|
-
<a name="
|
228
|
+
<div id="method-M000009" class="method-detail">
|
229
|
+
<a name="M000009"></a>
|
179
230
|
|
180
231
|
<div class="method-heading">
|
181
|
-
<a href="#
|
232
|
+
<a href="#M000009" class="method-signature">
|
182
233
|
<span class="method-name">headers</span><span class="method-args">()</span>
|
183
234
|
</a>
|
184
235
|
</div>
|
@@ -191,10 +242,10 @@ Use like a hash:
|
|
191
242
|
headers['Cookies'] = 'user_id=1;password=btrross3'
|
192
243
|
</pre>
|
193
244
|
<p><a class="source-toggle" href="#"
|
194
|
-
onclick="toggleCode('
|
195
|
-
<div class="method-source-code" id="
|
245
|
+
onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
|
246
|
+
<div class="method-source-code" id="M000009-source">
|
196
247
|
<pre>
|
197
|
-
<span class="ruby-comment cmt"># File lib/
|
248
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 145</span>
|
198
249
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
|
199
250
|
<span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
|
200
251
|
<span class="ruby-keyword kw">end</span>
|
@@ -203,11 +254,11 @@ Use like a hash:
|
|
203
254
|
</div>
|
204
255
|
</div>
|
205
256
|
|
206
|
-
<div id="method-
|
207
|
-
<a name="
|
257
|
+
<div id="method-M000006" class="method-detail">
|
258
|
+
<a name="M000006"></a>
|
208
259
|
|
209
260
|
<div class="method-heading">
|
210
|
-
<a href="#
|
261
|
+
<a href="#M000006" class="method-signature">
|
211
262
|
<span class="method-name">on</span><span class="method-args">(code, p = nil, &block)</span>
|
212
263
|
</a>
|
213
264
|
</div>
|
@@ -240,10 +291,10 @@ For example:
|
|
240
291
|
end
|
241
292
|
</pre>
|
242
293
|
<p><a class="source-toggle" href="#"
|
243
|
-
onclick="toggleCode('
|
244
|
-
<div class="method-source-code" id="
|
294
|
+
onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
|
295
|
+
<div class="method-source-code" id="M000006-source">
|
245
296
|
<pre>
|
246
|
-
<span class="ruby-comment cmt"># File lib/
|
297
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 120</span>
|
247
298
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
248
299
|
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
249
300
|
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
@@ -258,11 +309,11 @@ For example:
|
|
258
309
|
</div>
|
259
310
|
</div>
|
260
311
|
|
261
|
-
<div id="method-
|
262
|
-
<a name="
|
312
|
+
<div id="method-M000007" class="method-detail">
|
313
|
+
<a name="M000007"></a>
|
263
314
|
|
264
315
|
<div class="method-heading">
|
265
|
-
<a href="#
|
316
|
+
<a href="#M000007" class="method-signature">
|
266
317
|
<span class="method-name">setup</span><span class="method-args">(p = nil, &block)</span>
|
267
318
|
</a>
|
268
319
|
</div>
|
@@ -277,10 +328,10 @@ Run before the HTTP request. Given the URL as a string.
|
|
277
328
|
end
|
278
329
|
</pre>
|
279
330
|
<p><a class="source-toggle" href="#"
|
280
|
-
onclick="toggleCode('
|
281
|
-
<div class="method-source-code" id="
|
331
|
+
onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
|
332
|
+
<div class="method-source-code" id="M000007-source">
|
282
333
|
<pre>
|
283
|
-
<span class="ruby-comment cmt"># File lib/
|
334
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 134</span>
|
284
335
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
285
336
|
<span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
286
337
|
<span class="ruby-keyword kw">end</span>
|
@@ -289,11 +340,11 @@ Run before the HTTP request. Given the URL as a string.
|
|
289
340
|
</div>
|
290
341
|
</div>
|
291
342
|
|
292
|
-
<div id="method-
|
293
|
-
<a name="
|
343
|
+
<div id="method-M000008" class="method-detail">
|
344
|
+
<a name="M000008"></a>
|
294
345
|
|
295
346
|
<div class="method-heading">
|
296
|
-
<a href="#
|
347
|
+
<a href="#M000008" class="method-signature">
|
297
348
|
<span class="method-name">teardown</span><span class="method-args">(p = nil, &block)</span>
|
298
349
|
</a>
|
299
350
|
</div>
|
@@ -303,10 +354,10 @@ Run before the HTTP request. Given the URL as a string.
|
|
303
354
|
Run last, once for each page. Given the URL as a string.
|
304
355
|
</p>
|
305
356
|
<p><a class="source-toggle" href="#"
|
306
|
-
onclick="toggleCode('
|
307
|
-
<div class="method-source-code" id="
|
357
|
+
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
358
|
+
<div class="method-source-code" id="M000008-source">
|
308
359
|
<pre>
|
309
|
-
<span class="ruby-comment cmt"># File lib/
|
360
|
+
<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 139</span>
|
310
361
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
311
362
|
<span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
312
363
|
<span class="ruby-keyword kw">end</span>
|