spider 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,144 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: NilClass</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">NilClass</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider_rb.html">
59
+ lib/spider.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000008">merge</a>&nbsp;&nbsp;
90
+ </div>
91
+ </div>
92
+
93
+ </div>
94
+
95
+
96
+ <!-- if includes -->
97
+
98
+ <div id="section">
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+ <!-- if method_list -->
108
+ <div id="methods">
109
+ <h3 class="section-bar">Public Instance methods</h3>
110
+
111
+ <div id="method-M000008" class="method-detail">
112
+ <a name="M000008"></a>
113
+
114
+ <div class="method-heading">
115
+ <a href="#M000008" class="method-signature">
116
+ <span class="method-name">merge</span><span class="method-args">(h)</span>
117
+ </a>
118
+ </div>
119
+
120
+ <div class="method-description">
121
+ <p><a class="source-toggle" href="#"
122
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
123
+ <div class="method-source-code" id="M000008-source">
124
+ <pre>
125
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 44</span>
126
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">merge</span>(<span class="ruby-identifier">h</span>); <span class="ruby-identifier">h</span>; <span class="ruby-keyword kw">end</span>
127
+ </pre>
128
+ </div>
129
+ </div>
130
+ </div>
131
+
132
+
133
+ </div>
134
+
135
+
136
+ </div>
137
+
138
+
139
+ <div id="validator-badges">
140
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
141
+ </div>
142
+
143
+ </body>
144
+ </html>
@@ -93,7 +93,7 @@ links, and doing it all over again.
93
93
  <h3 class="section-bar">Methods</h3>
94
94
 
95
95
  <div class="name-list">
96
- <a href="#M000004">start_at</a>&nbsp;&nbsp;
96
+ <a href="#M000007">start_at</a>&nbsp;&nbsp;
97
97
  </div>
98
98
  </div>
99
99
 
@@ -115,11 +115,11 @@ links, and doing it all over again.
115
115
  <div id="methods">
116
116
  <h3 class="section-bar">Public Class methods</h3>
117
117
 
118
- <div id="method-M000004" class="method-detail">
119
- <a name="M000004"></a>
118
+ <div id="method-M000007" class="method-detail">
119
+ <a name="M000007"></a>
120
120
 
121
121
  <div class="method-heading">
122
- <a href="#M000004" class="method-signature">
122
+ <a href="#M000007" class="method-signature">
123
123
  <span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
124
124
  </a>
125
125
  </div>
@@ -136,27 +136,27 @@ define the rules and handlers for the discovered Web pages.
136
136
  a_url =~ %r{^http://mike-burns.com.*}
137
137
  end
138
138
 
139
- s.on 404 do |a_url, err_code|
139
+ s.on 404 do |a_url, resp, prior_url|
140
140
  puts &quot;URL not found: #{a_url}&quot;
141
141
  end
142
142
 
143
- s.on :success do |a_url, code, headers, body|
144
- puts &quot;body: #{body}&quot;
143
+ s.on :success do |a_url, resp, prior_url|
144
+ puts &quot;body: #{resp.body}&quot;
145
145
  end
146
146
 
147
- s.on :any do |a_url, resp|
147
+ s.on :every do |a_url, resp, prior_url|
148
148
  puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
149
149
  end
150
150
  end
151
151
  </pre>
152
152
  <p><a class="source-toggle" href="#"
153
- onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
154
- <div class="method-source-code" id="M000004-source">
153
+ onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
154
+ <div class="method-source-code" id="M000007-source">
155
155
  <pre>
156
- <span class="ruby-comment cmt"># File lib/spider.rb, line 68</span>
156
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 74</span>
157
157
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
158
158
  <span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
159
- <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
159
+ <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>({<span class="ruby-keyword kw">nil</span> =<span class="ruby-operator">&gt;</span> <span class="ruby-identifier">a_url</span>}, [], <span class="ruby-identifier">rules</span>, [])
160
160
  <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">a_spider</span>)
161
161
  <span class="ruby-identifier">a_spider</span>.<span class="ruby-identifier">start!</span>
162
162
  <span class="ruby-keyword kw">end</span>
@@ -87,8 +87,11 @@
87
87
 
88
88
  <div class="name-list">
89
89
  <a href="#M000001">add_url_check</a>&nbsp;&nbsp;
90
+ <a href="#M000006">clear_headers</a>&nbsp;&nbsp;
91
+ <a href="#M000005">headers</a>&nbsp;&nbsp;
90
92
  <a href="#M000002">on</a>&nbsp;&nbsp;
91
- <a href="#M000003">remove_trailing_slash</a>&nbsp;&nbsp;
93
+ <a href="#M000003">setup</a>&nbsp;&nbsp;
94
+ <a href="#M000004">teardown</a>&nbsp;&nbsp;
92
95
  </div>
93
96
  </div>
94
97
 
@@ -136,7 +139,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
136
139
  onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
137
140
  <div class="method-source-code" id="M000001-source">
138
141
  <pre>
139
- <span class="ruby-comment cmt"># File lib/spider.rb, line 94</span>
142
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 103</span>
140
143
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
141
144
  <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
142
145
  <span class="ruby-keyword kw">end</span>
@@ -145,6 +148,61 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
145
148
  </div>
146
149
  </div>
147
150
 
151
+ <div id="method-M000006" class="method-detail">
152
+ <a name="M000006"></a>
153
+
154
+ <div class="method-heading">
155
+ <a href="#M000006" class="method-signature">
156
+ <span class="method-name">clear_headers</span><span class="method-args">()</span>
157
+ </a>
158
+ </div>
159
+
160
+ <div class="method-description">
161
+ <p>
162
+ Reset the <a href="SpiderInstance.html#M000005">headers</a> hash.
163
+ </p>
164
+ <p><a class="source-toggle" href="#"
165
+ onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
166
+ <div class="method-source-code" id="M000006-source">
167
+ <pre>
168
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 170</span>
169
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
170
+ <span class="ruby-ivar">@headers</span> = {}
171
+ <span class="ruby-keyword kw">end</span>
172
+ </pre>
173
+ </div>
174
+ </div>
175
+ </div>
176
+
177
+ <div id="method-M000005" class="method-detail">
178
+ <a name="M000005"></a>
179
+
180
+ <div class="method-heading">
181
+ <a href="#M000005" class="method-signature">
182
+ <span class="method-name">headers</span><span class="method-args">()</span>
183
+ </a>
184
+ </div>
185
+
186
+ <div class="method-description">
187
+ <p>
188
+ Use like a hash:
189
+ </p>
190
+ <pre>
191
+ headers['Cookie'] = 'user_id=1;password=btrross3'
192
+ </pre>
193
+ <p><a class="source-toggle" href="#"
194
+ onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
195
+ <div class="method-source-code" id="M000005-source">
196
+ <pre>
197
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 158</span>
198
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
199
+ <span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
200
+ <span class="ruby-keyword kw">end</span>
201
+ </pre>
202
+ </div>
203
+ </div>
204
+ </div>
205
+
148
206
  <div id="method-M000002" class="method-detail">
149
207
  <a name="M000002"></a>
150
208
 
@@ -156,39 +214,28 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
156
214
 
157
215
  <div class="method-description">
158
216
  <p>
159
- Add a response handler. A response handler&#8216;s trigger can be :any,
217
+ Add a response handler. A response handler&#8217;s trigger can be :every,
160
218
  :success, :failure, or any HTTP status code. The handler itself can be
161
- either a Proc or a block. The arguments to the block depends <a
162
- href="SpiderInstance.html#M000002">on</a> the trigger:
163
- </p>
164
- <p>
165
- If the trigger is :any, the arguments are the URL as a string and an
166
- instance of Net::HTTPResponse.
167
- </p>
168
- <p>
169
- If the trigger is :success or any HTTP status code that represents a
170
- successful result, the arguments are the URL as a string, the HTTP status
171
- code, an instance of Net::HTTPSuccess, and the body of the result as a
172
- string.
219
+ either a Proc or a block.
173
220
  </p>
174
221
  <p>
175
- If the trigger is :failure or any HTTP status code that represents a failed
176
- result, the arguments are the URL as a string and the HTTP status code.
222
+ The arguments to the block are: the URL as a string, an instance of
223
+ Net::HTTPResponse, and the prior URL as a string.
177
224
  </p>
178
225
  <p>
179
226
  For example:
180
227
  </p>
181
228
  <pre>
182
- on 404 do |a_url, code|
229
+ on 404 do |a_url, resp, prior_url|
183
230
  puts &quot;URL not found: #{a_url}&quot;
184
231
  end
185
232
 
186
- on :success do |a_url, code, resp, body|
233
+ on :success do |a_url, resp, prior_url|
187
234
  puts a_url
188
- puts body
235
+ puts resp.body
189
236
  end
190
237
 
191
- on :any do |a_url, resp|
238
+ on :every do |a_url, resp, prior_url|
192
239
  puts &quot;Given this code: #{resp.code}&quot;
193
240
  end
194
241
  </pre>
@@ -196,18 +243,14 @@ For example:
196
243
  onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
197
244
  <div class="method-source-code" id="M000002-source">
198
245
  <pre>
199
- <span class="ruby-comment cmt"># File lib/spider.rb, line 131</span>
246
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 133</span>
200
247
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
201
248
  <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
202
249
  <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
203
250
  <span class="ruby-keyword kw">when</span> <span class="ruby-constant">Fixnum</span>
204
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">success_or_failure</span>(<span class="ruby-identifier">code</span>)][<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
251
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
205
252
  <span class="ruby-keyword kw">else</span>
206
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">:any</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>
207
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
208
- <span class="ruby-keyword kw">else</span>
209
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>][<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
210
- <span class="ruby-keyword kw">end</span>
253
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>] = <span class="ruby-identifier">f</span>
211
254
  <span class="ruby-keyword kw">end</span>
212
255
  <span class="ruby-keyword kw">end</span>
213
256
  </pre>
@@ -220,18 +263,52 @@ For example:
220
263
 
221
264
  <div class="method-heading">
222
265
  <a href="#M000003" class="method-signature">
223
- <span class="method-name">remove_trailing_slash</span><span class="method-args">(s)</span>
266
+ <span class="method-name">setup</span><span class="method-args">(p = nil, &amp;block)</span>
224
267
  </a>
225
268
  </div>
226
269
 
227
270
  <div class="method-description">
271
+ <p>
272
+ Run before the HTTP request. The block is given the URL as a string.
273
+ </p>
274
+ <pre>
275
+ setup do |a_url|
276
+ headers['Cookie'] = 'user_id=1;admin=true'
277
+ end
278
+ </pre>
228
279
  <p><a class="source-toggle" href="#"
229
280
  onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
230
281
  <div class="method-source-code" id="M000003-source">
231
282
  <pre>
232
- <span class="ruby-comment cmt"># File lib/spider.rb, line 257</span>
233
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_trailing_slash</span>(<span class="ruby-identifier">s</span>)
234
- <span class="ruby-identifier">s</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">%r{/*$}</span>,<span class="ruby-value str">''</span>)
283
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 147</span>
284
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
285
+ <span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
286
+ <span class="ruby-keyword kw">end</span>
287
+ </pre>
288
+ </div>
289
+ </div>
290
+ </div>
291
+
292
+ <div id="method-M000004" class="method-detail">
293
+ <a name="M000004"></a>
294
+
295
+ <div class="method-heading">
296
+ <a href="#M000004" class="method-signature">
297
+ <span class="method-name">teardown</span><span class="method-args">(p = nil, &amp;block)</span>
298
+ </a>
299
+ </div>
300
+
301
+ <div class="method-description">
302
+ <p>
303
+ Run last, once for each page. The block is given the URL as a string.
304
+ </p>
305
+ <p><a class="source-toggle" href="#"
306
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
307
+ <div class="method-source-code" id="M000004-source">
308
+ <pre>
309
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 152</span>
310
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
311
+ <span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
235
312
  <span class="ruby-keyword kw">end</span>
236
313
  </pre>
237
314
  </div>
@@ -1 +1 @@
1
- Tue, 23 Oct 2007 23:14:46 -0400
1
+ Wed, 31 Oct 2007 23:51:58 -0400
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Mon Oct 22 07:34:31 -0400 2007</td>
59
+ <td>Wed Oct 31 23:26:17 -0400 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -83,17 +83,17 @@ you can just handle the data.
83
83
  end
84
84
 
85
85
  # Handle 404s.
86
- s.on 404 do |a_url, err_code|
86
+ s.on 404 do |a_url, resp, prior_url|
87
87
  puts &quot;URL not found: #{a_url}&quot;
88
88
  end
89
89
 
90
90
  # Handle 2xx.
91
- s.on :success do |a_url, code, headers, body|
92
- puts &quot;body: #{body}&quot;
91
+ s.on :success do |a_url, resp, prior_url|
92
+ puts &quot;body: #{resp.body}&quot;
93
93
  end
94
94
 
95
95
  # Handle everything.
96
- s.on :any do |a_url, resp|
96
+ s.on :every do |a_url, resp, prior_url|
97
97
  puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
98
98
  end
99
99
  end
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Tue Oct 23 23:11:42 -0400 2007</td>
59
+ <td>Wed Oct 31 23:25:57 -0400 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -83,17 +83,17 @@ collecting, and looping so that you can just handle the data.
83
83
  end
84
84
 
85
85
  # Handle 404s.
86
- s.on 404 do |a_url, err_code|
86
+ s.on 404 do |a_url, resp, prior_url|
87
87
  puts &quot;URL not found: #{a_url}&quot;
88
88
  end
89
89
 
90
90
  # Handle 2xx.
91
- s.on :success do |a_url, code, headers, body|
92
- puts &quot;body: #{body}&quot;
91
+ s.on :success do |a_url, resp, prior_url|
92
+ puts &quot;body: #{resp.body}&quot;
93
93
  end
94
94
 
95
95
  # Handle everything.
96
- s.on :any do |a_url, resp|
96
+ s.on :every do |a_url, resp, prior_url|
97
97
  puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
98
98
  end
99
99
  end