spider 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: NilClass</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">NilClass</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider_rb.html">
59
+ lib/spider.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000008">merge</a>&nbsp;&nbsp;
90
+ </div>
91
+ </div>
92
+
93
+ </div>
94
+
95
+
96
+ <!-- if includes -->
97
+
98
+ <div id="section">
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+ <!-- if method_list -->
108
+ <div id="methods">
109
+ <h3 class="section-bar">Public Instance methods</h3>
110
+
111
+ <div id="method-M000008" class="method-detail">
112
+ <a name="M000008"></a>
113
+
114
+ <div class="method-heading">
115
+ <a href="#M000008" class="method-signature">
116
+ <span class="method-name">merge</span><span class="method-args">(h)</span>
117
+ </a>
118
+ </div>
119
+
120
+ <div class="method-description">
121
+ <p><a class="source-toggle" href="#"
122
+ onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
123
+ <div class="method-source-code" id="M000008-source">
124
+ <pre>
125
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 44</span>
126
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">merge</span>(<span class="ruby-identifier">h</span>); <span class="ruby-identifier">h</span>; <span class="ruby-keyword kw">end</span>
127
+ </pre>
128
+ </div>
129
+ </div>
130
+ </div>
131
+
132
+
133
+ </div>
134
+
135
+
136
+ </div>
137
+
138
+
139
+ <div id="validator-badges">
140
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
141
+ </div>
142
+
143
+ </body>
144
+ </html>
@@ -93,7 +93,7 @@ links, and doing it all over again.
93
93
  <h3 class="section-bar">Methods</h3>
94
94
 
95
95
  <div class="name-list">
96
- <a href="#M000004">start_at</a>&nbsp;&nbsp;
96
+ <a href="#M000007">start_at</a>&nbsp;&nbsp;
97
97
  </div>
98
98
  </div>
99
99
 
@@ -115,11 +115,11 @@ links, and doing it all over again.
115
115
  <div id="methods">
116
116
  <h3 class="section-bar">Public Class methods</h3>
117
117
 
118
- <div id="method-M000004" class="method-detail">
119
- <a name="M000004"></a>
118
+ <div id="method-M000007" class="method-detail">
119
+ <a name="M000007"></a>
120
120
 
121
121
  <div class="method-heading">
122
- <a href="#M000004" class="method-signature">
122
+ <a href="#M000007" class="method-signature">
123
123
  <span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
124
124
  </a>
125
125
  </div>
@@ -136,27 +136,27 @@ define the rules and handlers for the discovered Web pages.
136
136
  a_url =~ %r{^http://mike-burns.com.*}
137
137
  end
138
138
 
139
- s.on 404 do |a_url, err_code|
139
+ s.on 404 do |a_url, resp, prior_url|
140
140
  puts &quot;URL not found: #{a_url}&quot;
141
141
  end
142
142
 
143
- s.on :success do |a_url, code, headers, body|
144
- puts &quot;body: #{body}&quot;
143
+ s.on :success do |a_url, resp, prior_url|
144
+ puts &quot;body: #{resp.body}&quot;
145
145
  end
146
146
 
147
- s.on :any do |a_url, resp|
147
+ s.on :every do |a_url, resp, prior_url|
148
148
  puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
149
149
  end
150
150
  end
151
151
  </pre>
152
152
  <p><a class="source-toggle" href="#"
153
- onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
154
- <div class="method-source-code" id="M000004-source">
153
+ onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
154
+ <div class="method-source-code" id="M000007-source">
155
155
  <pre>
156
- <span class="ruby-comment cmt"># File lib/spider.rb, line 68</span>
156
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 74</span>
157
157
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
158
158
  <span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
159
- <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
159
+ <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>({<span class="ruby-keyword kw">nil</span> =<span class="ruby-operator">&gt;</span> <span class="ruby-identifier">a_url</span>}, [], <span class="ruby-identifier">rules</span>, [])
160
160
  <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">a_spider</span>)
161
161
  <span class="ruby-identifier">a_spider</span>.<span class="ruby-identifier">start!</span>
162
162
  <span class="ruby-keyword kw">end</span>
@@ -87,8 +87,11 @@
87
87
 
88
88
  <div class="name-list">
89
89
  <a href="#M000001">add_url_check</a>&nbsp;&nbsp;
90
+ <a href="#M000006">clear_headers</a>&nbsp;&nbsp;
91
+ <a href="#M000005">headers</a>&nbsp;&nbsp;
90
92
  <a href="#M000002">on</a>&nbsp;&nbsp;
91
- <a href="#M000003">remove_trailing_slash</a>&nbsp;&nbsp;
93
+ <a href="#M000003">setup</a>&nbsp;&nbsp;
94
+ <a href="#M000004">teardown</a>&nbsp;&nbsp;
92
95
  </div>
93
96
  </div>
94
97
 
@@ -136,7 +139,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
136
139
  onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
137
140
  <div class="method-source-code" id="M000001-source">
138
141
  <pre>
139
- <span class="ruby-comment cmt"># File lib/spider.rb, line 94</span>
142
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 103</span>
140
143
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
141
144
  <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
142
145
  <span class="ruby-keyword kw">end</span>
@@ -145,6 +148,61 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
145
148
  </div>
146
149
  </div>
147
150
 
151
+ <div id="method-M000006" class="method-detail">
152
+ <a name="M000006"></a>
153
+
154
+ <div class="method-heading">
155
+ <a href="#M000006" class="method-signature">
156
+ <span class="method-name">clear_headers</span><span class="method-args">()</span>
157
+ </a>
158
+ </div>
159
+
160
+ <div class="method-description">
161
+ <p>
162
+ Reset the <a href="SpiderInstance.html#M000005">headers</a> hash.
163
+ </p>
164
+ <p><a class="source-toggle" href="#"
165
+ onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
166
+ <div class="method-source-code" id="M000006-source">
167
+ <pre>
168
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 170</span>
169
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
170
+ <span class="ruby-ivar">@headers</span> = {}
171
+ <span class="ruby-keyword kw">end</span>
172
+ </pre>
173
+ </div>
174
+ </div>
175
+ </div>
176
+
177
+ <div id="method-M000005" class="method-detail">
178
+ <a name="M000005"></a>
179
+
180
+ <div class="method-heading">
181
+ <a href="#M000005" class="method-signature">
182
+ <span class="method-name">headers</span><span class="method-args">()</span>
183
+ </a>
184
+ </div>
185
+
186
+ <div class="method-description">
187
+ <p>
188
+ Use like a hash:
189
+ </p>
190
+ <pre>
191
+ headers['Cookie'] = 'user_id=1;password=btrross3'
192
+ </pre>
193
+ <p><a class="source-toggle" href="#"
194
+ onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
195
+ <div class="method-source-code" id="M000005-source">
196
+ <pre>
197
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 158</span>
198
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
199
+ <span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
200
+ <span class="ruby-keyword kw">end</span>
201
+ </pre>
202
+ </div>
203
+ </div>
204
+ </div>
205
+
148
206
  <div id="method-M000002" class="method-detail">
149
207
  <a name="M000002"></a>
150
208
 
@@ -156,39 +214,28 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
156
214
 
157
215
  <div class="method-description">
158
216
  <p>
159
- Add a response handler. A response handler&#8216;s trigger can be :any,
217
+ Add a response handler. A response handler&#8217;s trigger can be :every,
160
218
  :success, :failure, or any HTTP status code. The handler itself can be
161
- either a Proc or a block. The arguments to the block depends <a
162
- href="SpiderInstance.html#M000002">on</a> the trigger:
163
- </p>
164
- <p>
165
- If the trigger is :any, the arguments are the URL as a string and an
166
- instance of Net::HTTPResponse.
167
- </p>
168
- <p>
169
- If the trigger is :success or any HTTP status code that represents a
170
- successful result, the arguments are the URL as a string, the HTTP status
171
- code, an instance of Net::HTTPSuccess, and the body of the result as a
172
- string.
219
+ either a Proc or a block.
173
220
  </p>
174
221
  <p>
175
- If the trigger is :failure or any HTTP status code that represents a failed
176
- result, the arguments are the URL as a string and the HTTP status code.
222
+ The arguments to the block are: the URL as a string, an instance of
223
+ Net::HTTPResponse, and the prior URL as a string.
177
224
  </p>
178
225
  <p>
179
226
  For example:
180
227
  </p>
181
228
  <pre>
182
- on 404 do |a_url, code|
229
+ on 404 do |a_url, resp, prior_url|
183
230
  puts &quot;URL not found: #{a_url}&quot;
184
231
  end
185
232
 
186
- on :success do |a_url, code, resp, body|
233
+ on :success do |a_url, resp, prior_url|
187
234
  puts a_url
188
- puts body
235
+ puts resp.body
189
236
  end
190
237
 
191
- on :any do |a_url, resp|
238
+ on :every do |a_url, resp, prior_url|
192
239
  puts &quot;Given this code: #{resp.code}&quot;
193
240
  end
194
241
  </pre>
@@ -196,18 +243,14 @@ For example:
196
243
  onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
197
244
  <div class="method-source-code" id="M000002-source">
198
245
  <pre>
199
- <span class="ruby-comment cmt"># File lib/spider.rb, line 131</span>
246
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 133</span>
200
247
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
201
248
  <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
202
249
  <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
203
250
  <span class="ruby-keyword kw">when</span> <span class="ruby-constant">Fixnum</span>
204
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">success_or_failure</span>(<span class="ruby-identifier">code</span>)][<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
251
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
205
252
  <span class="ruby-keyword kw">else</span>
206
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">:any</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>
207
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
208
- <span class="ruby-keyword kw">else</span>
209
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>][<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
210
- <span class="ruby-keyword kw">end</span>
253
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>] = <span class="ruby-identifier">f</span>
211
254
  <span class="ruby-keyword kw">end</span>
212
255
  <span class="ruby-keyword kw">end</span>
213
256
  </pre>
@@ -220,18 +263,52 @@ For example:
220
263
 
221
264
  <div class="method-heading">
222
265
  <a href="#M000003" class="method-signature">
223
- <span class="method-name">remove_trailing_slash</span><span class="method-args">(s)</span>
266
+ <span class="method-name">setup</span><span class="method-args">(p = nil, &amp;block)</span>
224
267
  </a>
225
268
  </div>
226
269
 
227
270
  <div class="method-description">
271
+ <p>
272
+ Run before the HTTP request. The handler is given the URL as a string.
273
+ </p>
274
+ <pre>
275
+ setup do |a_url|
276
+ headers['Cookie'] = 'user_id=1;admin=true'
277
+ end
278
+ </pre>
228
279
  <p><a class="source-toggle" href="#"
229
280
  onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
230
281
  <div class="method-source-code" id="M000003-source">
231
282
  <pre>
232
- <span class="ruby-comment cmt"># File lib/spider.rb, line 257</span>
233
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_trailing_slash</span>(<span class="ruby-identifier">s</span>)
234
- <span class="ruby-identifier">s</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">%r{/*$}</span>,<span class="ruby-value str">''</span>)
283
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 147</span>
284
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
285
+ <span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
286
+ <span class="ruby-keyword kw">end</span>
287
+ </pre>
288
+ </div>
289
+ </div>
290
+ </div>
291
+
292
+ <div id="method-M000004" class="method-detail">
293
+ <a name="M000004"></a>
294
+
295
+ <div class="method-heading">
296
+ <a href="#M000004" class="method-signature">
297
+ <span class="method-name">teardown</span><span class="method-args">(p = nil, &amp;block)</span>
298
+ </a>
299
+ </div>
300
+
301
+ <div class="method-description">
302
+ <p>
303
+ Run last, once for each page. The handler is given the URL as a string.
304
+ </p>
305
+ <p><a class="source-toggle" href="#"
306
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
307
+ <div class="method-source-code" id="M000004-source">
308
+ <pre>
309
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 152</span>
310
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
311
+ <span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
235
312
  <span class="ruby-keyword kw">end</span>
236
313
  </pre>
237
314
  </div>
@@ -1 +1 @@
1
- Tue, 23 Oct 2007 23:14:46 -0400
1
+ Wed, 31 Oct 2007 23:51:58 -0400
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Mon Oct 22 07:34:31 -0400 2007</td>
59
+ <td>Wed Oct 31 23:26:17 -0400 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -83,17 +83,17 @@ you can just handle the data.
83
83
  end
84
84
 
85
85
  # Handle 404s.
86
- s.on 404 do |a_url, err_code|
86
+ s.on 404 do |a_url, resp, prior_url|
87
87
  puts &quot;URL not found: #{a_url}&quot;
88
88
  end
89
89
 
90
90
  # Handle 2xx.
91
- s.on :success do |a_url, code, headers, body|
92
- puts &quot;body: #{body}&quot;
91
+ s.on :success do |a_url, resp, prior_url|
92
+ puts &quot;body: #{resp.body}&quot;
93
93
  end
94
94
 
95
95
  # Handle everything.
96
- s.on :any do |a_url, resp|
96
+ s.on :every do |a_url, resp, prior_url|
97
97
  puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
98
98
  end
99
99
  end
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Tue Oct 23 23:11:42 -0400 2007</td>
59
+ <td>Wed Oct 31 23:25:57 -0400 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -83,17 +83,17 @@ collecting, and looping so that you can just handle the data.
83
83
  end
84
84
 
85
85
  # Handle 404s.
86
- s.on 404 do |a_url, err_code|
86
+ s.on 404 do |a_url, resp, prior_url|
87
87
  puts &quot;URL not found: #{a_url}&quot;
88
88
  end
89
89
 
90
90
  # Handle 2xx.
91
- s.on :success do |a_url, code, headers, body|
92
- puts &quot;body: #{body}&quot;
91
+ s.on :success do |a_url, resp, prior_url|
92
+ puts &quot;body: #{resp.body}&quot;
93
93
  end
94
94
 
95
95
  # Handle everything.
96
- s.on :any do |a_url, resp|
96
+ s.on :every do |a_url, resp, prior_url|
97
97
  puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
98
98
  end
99
99
  end