spider 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +10 -0
- data/README +29 -25
- data/doc/classes/Net.html +101 -0
- data/doc/classes/Spider.html +180 -0
- data/doc/classes/SpiderInstance.html +229 -0
- data/doc/created.rid +1 -0
- data/doc/files/README.html +149 -0
- data/doc/files/lib/spider_rb.html +159 -0
- data/doc/fr_class_index.html +29 -0
- data/doc/fr_file_index.html +28 -0
- data/doc/fr_method_index.html +29 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/lib/spider.rb +208 -80
- data/spec/spider_instance_spec.rb +219 -0
- data/spec/spider_spec.rb +10 -0
- data/spider.gemspec +2 -2
- data/test_server/client.rb +22 -0
- data/test_server/server1/page1.html +1 -0
- data/test_server/server1/page2.html +3 -0
- data/test_server/server2/page1.html +1 -0
- data/test_server/server2/page2.html +2 -0
- data/test_server/servers.rb +24 -0
- metadata +32 -6
- data/LICENSE +0 -339
data/CHANGES
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
2007-10-22:
|
2
|
+
* Use RSpec to ensure that it mostly works.
|
3
|
+
* Use WEBrick to create a small test server for additional testing.
|
4
|
+
* Completely re-do the API to prepare for future expansion.
|
5
|
+
* Add the ability to apply each URL to a series of custom allowed?-like
|
6
|
+
matchers.
|
7
|
+
|
8
|
+
2007-03-30:
|
9
|
+
* Clean up the documentation.
|
10
|
+
|
1
11
|
2007-03-28:
|
2
12
|
* Change the tail recursion to a `while' loop, to please Ruby.
|
3
13
|
* Documentation.
|
data/README
CHANGED
@@ -1,37 +1,41 @@
|
|
1
1
|
Spider, a Web spidering library for Ruby. It handles the robots.txt,
|
2
2
|
scraping, collecting, and looping so that you can just handle the data.
|
3
3
|
|
4
|
-
==
|
4
|
+
== Usage
|
5
|
+
|
6
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
7
|
+
# Limit the pages to just this domain.
|
8
|
+
s.add_url_check do |a_url|
|
9
|
+
a_url =~ %r{^http://mike-burns.com.*}
|
10
|
+
end
|
11
|
+
|
12
|
+
# Handle 404s.
|
13
|
+
s.on 404 do |a_url, err_code|
|
14
|
+
puts "URL not found: #{a_url}"
|
15
|
+
end
|
16
|
+
|
17
|
+
# Handle 2xx.
|
18
|
+
s.on :success do |a_url, code, headers, body|
|
19
|
+
puts "body: #{body}"
|
20
|
+
end
|
21
|
+
|
22
|
+
# Handle everything.
|
23
|
+
s.on :any do |a_url, resp|
|
24
|
+
puts "URL returned anything: #{a_url} with this code #{resp.code}"
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
== Requirements
|
5
30
|
|
6
31
|
This library uses `robot_rules' (included), `open-uri', and `uri'. Any modern
|
7
32
|
Ruby should work; if yours doesn't, let me know so I can update this with your
|
8
33
|
version number.
|
9
34
|
|
10
|
-
==
|
11
|
-
|
12
|
-
One function: `spider'. It takes a list of seed URLs and a block; this block is
|
13
|
-
passed each URL and its Web page. This function never returns, ideally.
|
14
|
-
|
15
|
-
spider : [String] (String String -> a) -> omega
|
16
|
-
|
17
|
-
Examples:
|
18
|
-
|
19
|
-
require 'spider'
|
20
|
-
include Spider
|
21
|
-
|
22
|
-
spider(['http://yahoo.com']) do |a_url, web_page|
|
23
|
-
puts "At #{a_url}"
|
24
|
-
end
|
25
|
-
|
26
|
-
spider(['http://mike-burns.com','http://matthoran.com']) do |u, page|
|
27
|
-
# assumes `scrape_images' and `store_image!' functions.
|
28
|
-
scrape_images(page).each { |img| store_image!(img) }
|
29
|
-
end
|
30
|
-
|
31
|
-
== Author ==
|
35
|
+
== Author
|
32
36
|
|
33
37
|
Mike Burns http://mike-burns.com mike@mike-burns.com
|
34
38
|
|
35
|
-
With help from Matt Horan.
|
39
|
+
With help from Matt Horan and John Nagro.
|
36
40
|
With `robot_rules' from James Edward Gray II via
|
37
|
-
|
41
|
+
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
@@ -0,0 +1,101 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Module: Net</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Module</strong></td>
|
53
|
+
<td class="class-name-in-header">Net</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
</td>
|
59
|
+
</tr>
|
60
|
+
|
61
|
+
</table>
|
62
|
+
</div>
|
63
|
+
<!-- banner header -->
|
64
|
+
|
65
|
+
<div id="bodyContent">
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
<div id="contextContent">
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
</div>
|
74
|
+
|
75
|
+
|
76
|
+
</div>
|
77
|
+
|
78
|
+
|
79
|
+
<!-- if includes -->
|
80
|
+
|
81
|
+
<div id="section">
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
<!-- if method_list -->
|
91
|
+
|
92
|
+
|
93
|
+
</div>
|
94
|
+
|
95
|
+
|
96
|
+
<div id="validator-badges">
|
97
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
98
|
+
</div>
|
99
|
+
|
100
|
+
</body>
|
101
|
+
</html>
|
@@ -0,0 +1,180 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: Spider</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">Spider</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../files/lib/spider_rb.html">
|
59
|
+
lib/spider.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
Object
|
69
|
+
</td>
|
70
|
+
</tr>
|
71
|
+
</table>
|
72
|
+
</div>
|
73
|
+
<!-- banner header -->
|
74
|
+
|
75
|
+
<div id="bodyContent">
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<div id="contextContent">
|
80
|
+
|
81
|
+
<div id="description">
|
82
|
+
<p>
|
83
|
+
A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
84
|
+
links, and doing it all over again.
|
85
|
+
</p>
|
86
|
+
|
87
|
+
</div>
|
88
|
+
|
89
|
+
|
90
|
+
</div>
|
91
|
+
|
92
|
+
<div id="method-list">
|
93
|
+
<h3 class="section-bar">Methods</h3>
|
94
|
+
|
95
|
+
<div class="name-list">
|
96
|
+
<a href="#M000003">start_at</a>
|
97
|
+
</div>
|
98
|
+
</div>
|
99
|
+
|
100
|
+
</div>
|
101
|
+
|
102
|
+
|
103
|
+
<!-- if includes -->
|
104
|
+
|
105
|
+
<div id="section">
|
106
|
+
|
107
|
+
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
<!-- if method_list -->
|
115
|
+
<div id="methods">
|
116
|
+
<h3 class="section-bar">Public Class methods</h3>
|
117
|
+
|
118
|
+
<div id="method-M000003" class="method-detail">
|
119
|
+
<a name="M000003"></a>
|
120
|
+
|
121
|
+
<div class="method-heading">
|
122
|
+
<a href="#M000003" class="method-signature">
|
123
|
+
<span class="method-name">start_at</span><span class="method-args">(a_url, &block)</span>
|
124
|
+
</a>
|
125
|
+
</div>
|
126
|
+
|
127
|
+
<div class="method-description">
|
128
|
+
<p>
|
129
|
+
Runs the spider starting at the given URL. Also takes a block that is given
|
130
|
+
the <a href="SpiderInstance.html">SpiderInstance</a>. Use the block to
|
131
|
+
define the rules and handlers for the discovered Web pages.
|
132
|
+
</p>
|
133
|
+
<pre>
|
134
|
+
Spider.start_at('http://mike-burns.com/') do |s|
|
135
|
+
s.add_url_check do |a_url|
|
136
|
+
a_url =~ %r{^http://mike-burns.com.*}
|
137
|
+
end
|
138
|
+
|
139
|
+
s.on 404 do |a_url, err_code|
|
140
|
+
puts "URL not found: #{a_url}"
|
141
|
+
end
|
142
|
+
|
143
|
+
s.on :success do |a_url, code, headers, body|
|
144
|
+
puts "body: #{body}"
|
145
|
+
end
|
146
|
+
|
147
|
+
s.on :any do |a_url, resp|
|
148
|
+
puts "URL returned anything: #{a_url} with this code #{resp.code}"
|
149
|
+
end
|
150
|
+
end
|
151
|
+
</pre>
|
152
|
+
<p><a class="source-toggle" href="#"
|
153
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
154
|
+
<div class="method-source-code" id="M000003-source">
|
155
|
+
<pre>
|
156
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 55</span>
|
157
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
158
|
+
<span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
|
159
|
+
<span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
|
160
|
+
<span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">a_spider</span>)
|
161
|
+
<span class="ruby-identifier">a_spider</span>.<span class="ruby-identifier">start!</span>
|
162
|
+
<span class="ruby-keyword kw">end</span>
|
163
|
+
</pre>
|
164
|
+
</div>
|
165
|
+
</div>
|
166
|
+
</div>
|
167
|
+
|
168
|
+
|
169
|
+
</div>
|
170
|
+
|
171
|
+
|
172
|
+
</div>
|
173
|
+
|
174
|
+
|
175
|
+
<div id="validator-badges">
|
176
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
177
|
+
</div>
|
178
|
+
|
179
|
+
</body>
|
180
|
+
</html>
|
@@ -0,0 +1,229 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
+
<head>
|
8
|
+
<title>Class: SpiderInstance</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
+
<link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
|
12
|
+
<script type="text/javascript">
|
13
|
+
// <![CDATA[
|
14
|
+
|
15
|
+
function popupCode( url ) {
|
16
|
+
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
+
}
|
18
|
+
|
19
|
+
function toggleCode( id ) {
|
20
|
+
if ( document.getElementById )
|
21
|
+
elem = document.getElementById( id );
|
22
|
+
else if ( document.all )
|
23
|
+
elem = eval( "document.all." + id );
|
24
|
+
else
|
25
|
+
return false;
|
26
|
+
|
27
|
+
elemStyle = elem.style;
|
28
|
+
|
29
|
+
if ( elemStyle.display != "block" ) {
|
30
|
+
elemStyle.display = "block"
|
31
|
+
} else {
|
32
|
+
elemStyle.display = "none"
|
33
|
+
}
|
34
|
+
|
35
|
+
return true;
|
36
|
+
}
|
37
|
+
|
38
|
+
// Make codeblocks hidden by default
|
39
|
+
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
+
|
41
|
+
// ]]>
|
42
|
+
</script>
|
43
|
+
|
44
|
+
</head>
|
45
|
+
<body>
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
<div id="classHeader">
|
50
|
+
<table class="header-table">
|
51
|
+
<tr class="top-aligned-row">
|
52
|
+
<td><strong>Class</strong></td>
|
53
|
+
<td class="class-name-in-header">SpiderInstance</td>
|
54
|
+
</tr>
|
55
|
+
<tr class="top-aligned-row">
|
56
|
+
<td><strong>In:</strong></td>
|
57
|
+
<td>
|
58
|
+
<a href="../files/lib/spider_rb.html">
|
59
|
+
lib/spider.rb
|
60
|
+
</a>
|
61
|
+
<br />
|
62
|
+
</td>
|
63
|
+
</tr>
|
64
|
+
|
65
|
+
<tr class="top-aligned-row">
|
66
|
+
<td><strong>Parent:</strong></td>
|
67
|
+
<td>
|
68
|
+
Object
|
69
|
+
</td>
|
70
|
+
</tr>
|
71
|
+
</table>
|
72
|
+
</div>
|
73
|
+
<!-- banner header -->
|
74
|
+
|
75
|
+
<div id="bodyContent">
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
<div id="contextContent">
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
</div>
|
84
|
+
|
85
|
+
<div id="method-list">
|
86
|
+
<h3 class="section-bar">Methods</h3>
|
87
|
+
|
88
|
+
<div class="name-list">
|
89
|
+
<a href="#M000001">add_url_check</a>
|
90
|
+
<a href="#M000002">on</a>
|
91
|
+
</div>
|
92
|
+
</div>
|
93
|
+
|
94
|
+
</div>
|
95
|
+
|
96
|
+
|
97
|
+
<!-- if includes -->
|
98
|
+
|
99
|
+
<div id="section">
|
100
|
+
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
<!-- if method_list -->
|
109
|
+
<div id="methods">
|
110
|
+
<h3 class="section-bar">Public Instance methods</h3>
|
111
|
+
|
112
|
+
<div id="method-M000001" class="method-detail">
|
113
|
+
<a name="M000001"></a>
|
114
|
+
|
115
|
+
<div class="method-heading">
|
116
|
+
<a href="#M000001" class="method-signature">
|
117
|
+
<span class="method-name">add_url_check</span><span class="method-args">(&block)</span>
|
118
|
+
</a>
|
119
|
+
</div>
|
120
|
+
|
121
|
+
<div class="method-description">
|
122
|
+
<p>
|
123
|
+
Add a predicate that determines whether to continue down this URL's
|
124
|
+
path. All predicates must be true in order for a URL to proceed.
|
125
|
+
</p>
|
126
|
+
<p>
|
127
|
+
Takes a block that takes a string and produces a boolean. For example, this
|
128
|
+
will ensure that the URL starts with ‘<a
|
129
|
+
href="http://mike-burns.com">mike-burns.com</a>’:
|
130
|
+
</p>
|
131
|
+
<pre>
|
132
|
+
add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
|
133
|
+
</pre>
|
134
|
+
<p><a class="source-toggle" href="#"
|
135
|
+
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
136
|
+
<div class="method-source-code" id="M000001-source">
|
137
|
+
<pre>
|
138
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 81</span>
|
139
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
140
|
+
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
141
|
+
<span class="ruby-keyword kw">end</span>
|
142
|
+
</pre>
|
143
|
+
</div>
|
144
|
+
</div>
|
145
|
+
</div>
|
146
|
+
|
147
|
+
<div id="method-M000002" class="method-detail">
|
148
|
+
<a name="M000002"></a>
|
149
|
+
|
150
|
+
<div class="method-heading">
|
151
|
+
<a href="#M000002" class="method-signature">
|
152
|
+
<span class="method-name">on</span><span class="method-args">(code, p = nil, &block)</span>
|
153
|
+
</a>
|
154
|
+
</div>
|
155
|
+
|
156
|
+
<div class="method-description">
|
157
|
+
<p>
|
158
|
+
Add a response handler. A response handler's trigger can be :any,
|
159
|
+
:success, :failure, or any HTTP status code. The handler itself can be
|
160
|
+
either a Proc or a block. The arguments to the block depend <a
|
161
|
+
href="SpiderInstance.html#M000002">on</a> the trigger:
|
162
|
+
</p>
|
163
|
+
<p>
|
164
|
+
If the trigger is :any, the arguments are the URL as a string and an
|
165
|
+
instance of Net::HTTPResponse.
|
166
|
+
</p>
|
167
|
+
<p>
|
168
|
+
If the trigger is :success or any HTTP status code that represents a
|
169
|
+
successful result, the arguments are the URL as a string, the HTTP status
|
170
|
+
code, an instance of Net::HTTPSuccess, and the body of the result as a
|
171
|
+
string.
|
172
|
+
</p>
|
173
|
+
<p>
|
174
|
+
If the trigger is :failure or any HTTP status code that represents a failed
|
175
|
+
result, the arguments are the URL as a string and the HTTP status code.
|
176
|
+
</p>
|
177
|
+
<p>
|
178
|
+
For example:
|
179
|
+
</p>
|
180
|
+
<pre>
|
181
|
+
on 404 do |a_url, code|
|
182
|
+
puts "URL not found: #{a_url}"
|
183
|
+
end
|
184
|
+
|
185
|
+
on :success do |a_url, code, resp, body|
|
186
|
+
puts a_url
|
187
|
+
puts body
|
188
|
+
end
|
189
|
+
|
190
|
+
on :any do |a_url, resp|
|
191
|
+
puts "Given this code: #{resp.code}"
|
192
|
+
end
|
193
|
+
</pre>
|
194
|
+
<p><a class="source-toggle" href="#"
|
195
|
+
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
196
|
+
<div class="method-source-code" id="M000002-source">
|
197
|
+
<pre>
|
198
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 118</span>
|
199
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
200
|
+
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
201
|
+
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
202
|
+
<span class="ruby-keyword kw">when</span> <span class="ruby-constant">Fixnum</span>
|
203
|
+
<span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">success_or_failure</span>(<span class="ruby-identifier">code</span>)][<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
|
204
|
+
<span class="ruby-keyword kw">else</span>
|
205
|
+
<span class="ruby-keyword kw">if</span> <span class="ruby-identifier">:any</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>
|
206
|
+
<span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
|
207
|
+
<span class="ruby-keyword kw">else</span>
|
208
|
+
<span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>][<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
|
209
|
+
<span class="ruby-keyword kw">end</span>
|
210
|
+
<span class="ruby-keyword kw">end</span>
|
211
|
+
<span class="ruby-keyword kw">end</span>
|
212
|
+
</pre>
|
213
|
+
</div>
|
214
|
+
</div>
|
215
|
+
</div>
|
216
|
+
|
217
|
+
|
218
|
+
</div>
|
219
|
+
|
220
|
+
|
221
|
+
</div>
|
222
|
+
|
223
|
+
|
224
|
+
<div id="validator-badges">
|
225
|
+
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
226
|
+
</div>
|
227
|
+
|
228
|
+
</body>
|
229
|
+
</html>
|