spider 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,3 +1,13 @@
1
+ 2007-10-22:
2
+ * Use RSpec to ensure that it mostly works.
3
+ * Use WEBrick to create a small test server for additional testing.
4
+ * Completely re-do the API to prepare for future expansion.
5
+ * Add the ability to apply each URL to a series of custom allowed?-like
6
+ matchers.
7
+
8
+ 2007-03-30:
9
+ * Clean up the documentation.
10
+
1
11
  2007-03-28:
2
12
  * Change the tail recursion to a `while' loop, to please Ruby.
3
13
  * Documentation.
data/README CHANGED
@@ -1,37 +1,41 @@
1
1
  Spider, a Web spidering library for Ruby. It handles the robots.txt,
2
2
  scraping, collecting, and looping so that you can just handle the data.
3
3
 
4
- == Requirements ==
4
+ == Usage
5
+
6
+ Spider.start_at('http://mike-burns.com/') do |s|
7
+ # Limit the pages to just this domain.
8
+ s.add_url_check do |a_url|
9
+ a_url =~ %r{^http://mike-burns.com.*}
10
+ end
11
+
12
+ # Handle 404s.
13
+ s.on 404 do |a_url, err_code|
14
+ puts "URL not found: #{a_url}"
15
+ end
16
+
17
+ # Handle 2xx.
18
+ s.on :success do |a_url, code, headers, body|
19
+ puts "body: #{body}"
20
+ end
21
+
22
+ # Handle everything.
23
+ s.on :any do |a_url, resp|
24
+ puts "URL returned anything: #{a_url} with this code #{resp.code}"
25
+ end
26
+ end
27
+
28
+
29
+ == Requirements
5
30
 
6
31
  This library uses `robot_rules' (included), `open-uri', and `uri'. Any modern
7
32
  Ruby should work; if yours doesn't, let me know so I can update this with your
8
33
  version number.
9
34
 
10
- == Usage ==
11
-
12
- One function: `spider'. It takes a list of seed URLs and a block; this block is
13
- passed each URL and its Web page. This function never returns, ideally.
14
-
15
- spider : [String] (String String -> a) -> omega
16
-
17
- Examples:
18
-
19
- require 'spider'
20
- include Spider
21
-
22
- spider(['http://yahoo.com']) do |a_url, web_page|
23
- puts "At #{a_url}"
24
- end
25
-
26
- spider(['http://mike-burns.com','http://matthoran.com']) do |u, page|
27
- # assumes `scrape_images' and `store_image!' functions.
28
- scrape_images(page).each { |img| store_image!(img) }
29
- end
30
-
31
- == Author ==
35
+ == Author
32
36
 
33
37
  Mike Burns http://mike-burns.com mike@mike-burns.com
34
38
 
35
- With help from Matt Horan.
39
+ With help from Matt Horan and John Nagro.
36
40
  With `robot_rules' from James Edward Gray II via
37
- http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
41
+ http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
@@ -0,0 +1,101 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Module: Net</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Module</strong></td>
53
+ <td class="class-name-in-header">Net</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ </td>
59
+ </tr>
60
+
61
+ </table>
62
+ </div>
63
+ <!-- banner header -->
64
+
65
+ <div id="bodyContent">
66
+
67
+
68
+
69
+ <div id="contextContent">
70
+
71
+
72
+
73
+ </div>
74
+
75
+
76
+ </div>
77
+
78
+
79
+ <!-- if includes -->
80
+
81
+ <div id="section">
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+ <!-- if method_list -->
91
+
92
+
93
+ </div>
94
+
95
+
96
+ <div id="validator-badges">
97
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
98
+ </div>
99
+
100
+ </body>
101
+ </html>
@@ -0,0 +1,180 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Spider</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Spider</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider_rb.html">
59
+ lib/spider.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+ <div id="description">
82
+ <p>
83
+ A spidering library for Ruby. Handles robots.txt, scraping, finding more
84
+ links, and doing it all over again.
85
+ </p>
86
+
87
+ </div>
88
+
89
+
90
+ </div>
91
+
92
+ <div id="method-list">
93
+ <h3 class="section-bar">Methods</h3>
94
+
95
+ <div class="name-list">
96
+ <a href="#M000003">start_at</a>&nbsp;&nbsp;
97
+ </div>
98
+ </div>
99
+
100
+ </div>
101
+
102
+
103
+ <!-- if includes -->
104
+
105
+ <div id="section">
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+ <!-- if method_list -->
115
+ <div id="methods">
116
+ <h3 class="section-bar">Public Class methods</h3>
117
+
118
+ <div id="method-M000003" class="method-detail">
119
+ <a name="M000003"></a>
120
+
121
+ <div class="method-heading">
122
+ <a href="#M000003" class="method-signature">
123
+ <span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
124
+ </a>
125
+ </div>
126
+
127
+ <div class="method-description">
128
+ <p>
129
+ Runs the spider starting at the given URL. Also takes a block that is given
130
+ the <a href="SpiderInstance.html">SpiderInstance</a>. Use the block to
131
+ define the rules and handlers for the discovered Web pages.
132
+ </p>
133
+ <pre>
134
+ Spider.start_at('http://mike-burns.com/') do |s|
135
+ s.add_url_check do |a_url|
136
+ a_url =~ %r{^http://mike-burns.com.*}
137
+ end
138
+
139
+ s.on 404 do |a_url, err_code|
140
+ puts &quot;URL not found: #{a_url}&quot;
141
+ end
142
+
143
+ s.on :success do |a_url, code, headers, body|
144
+ puts &quot;body: #{body}&quot;
145
+ end
146
+
147
+ s.on :any do |a_url, resp|
148
+ puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
149
+ end
150
+ end
151
+ </pre>
152
+ <p><a class="source-toggle" href="#"
153
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
154
+ <div class="method-source-code" id="M000003-source">
155
+ <pre>
156
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 55</span>
157
+ <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
158
+ <span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
159
+ <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
160
+ <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">a_spider</span>)
161
+ <span class="ruby-identifier">a_spider</span>.<span class="ruby-identifier">start!</span>
162
+ <span class="ruby-keyword kw">end</span>
163
+ </pre>
164
+ </div>
165
+ </div>
166
+ </div>
167
+
168
+
169
+ </div>
170
+
171
+
172
+ </div>
173
+
174
+
175
+ <div id="validator-badges">
176
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
177
+ </div>
178
+
179
+ </body>
180
+ </html>
@@ -0,0 +1,229 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: SpiderInstance</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">SpiderInstance</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider_rb.html">
59
+ lib/spider.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000001">add_url_check</a>&nbsp;&nbsp;
90
+ <a href="#M000002">on</a>&nbsp;&nbsp;
91
+ </div>
92
+ </div>
93
+
94
+ </div>
95
+
96
+
97
+ <!-- if includes -->
98
+
99
+ <div id="section">
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+ <!-- if method_list -->
109
+ <div id="methods">
110
+ <h3 class="section-bar">Public Instance methods</h3>
111
+
112
+ <div id="method-M000001" class="method-detail">
113
+ <a name="M000001"></a>
114
+
115
+ <div class="method-heading">
116
+ <a href="#M000001" class="method-signature">
117
+ <span class="method-name">add_url_check</span><span class="method-args">(&amp;block)</span>
118
+ </a>
119
+ </div>
120
+
121
+ <div class="method-description">
122
+ <p>
123
+ Add a predicate that determines whether to continue down this URL&#8217;s
124
+ path. All predicates must be true in order for a URL to proceed.
125
+ </p>
126
+ <p>
127
+ Takes a block that takes a string and produces a boolean. For example, this
128
+ will ensure that the URL starts with &#8216;<a
129
+ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
130
+ </p>
131
+ <pre>
132
+ add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
133
+ </pre>
134
+ <p><a class="source-toggle" href="#"
135
+ onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
136
+ <div class="method-source-code" id="M000001-source">
137
+ <pre>
138
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 81</span>
139
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
140
+ <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
141
+ <span class="ruby-keyword kw">end</span>
142
+ </pre>
143
+ </div>
144
+ </div>
145
+ </div>
146
+
147
+ <div id="method-M000002" class="method-detail">
148
+ <a name="M000002"></a>
149
+
150
+ <div class="method-heading">
151
+ <a href="#M000002" class="method-signature">
152
+ <span class="method-name">on</span><span class="method-args">(code, p = nil, &amp;block)</span>
153
+ </a>
154
+ </div>
155
+
156
+ <div class="method-description">
157
+ <p>
158
+ Add a response handler. A response handler&#8217;s trigger can be :any,
159
+ :success, :failure, or any HTTP status code. The handler itself can be
160
+ either a Proc or a block. The arguments to the block depend <a
161
+ href="SpiderInstance.html#M000002">on</a> the trigger:
162
+ </p>
163
+ <p>
164
+ If the trigger is :any, the arguments are the URL as a string and an
165
+ instance of Net::HTTPResponse.
166
+ </p>
167
+ <p>
168
+ If the trigger is :success or any HTTP status code that represents a
169
+ successful result, the arguments are the URL as a string, the HTTP status
170
+ code, an instance of Net::HTTPSuccess, and the body of the result as a
171
+ string.
172
+ </p>
173
+ <p>
174
+ If the trigger is :failure or any HTTP status code that represents a failed
175
+ result, the arguments are the URL as a string and the HTTP status code.
176
+ </p>
177
+ <p>
178
+ For example:
179
+ </p>
180
+ <pre>
181
+ on 404 do |a_url, code|
182
+ puts &quot;URL not found: #{a_url}&quot;
183
+ end
184
+
185
+ on :success do |a_url, code, resp, body|
186
+ puts a_url
187
+ puts body
188
+ end
189
+
190
+ on :any do |a_url, resp|
191
+ puts &quot;Given this code: #{resp.code}&quot;
192
+ end
193
+ </pre>
194
+ <p><a class="source-toggle" href="#"
195
+ onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
196
+ <div class="method-source-code" id="M000002-source">
197
+ <pre>
198
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 118</span>
199
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
200
+ <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
201
+ <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
202
+ <span class="ruby-keyword kw">when</span> <span class="ruby-constant">Fixnum</span>
203
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">success_or_failure</span>(<span class="ruby-identifier">code</span>)][<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
204
+ <span class="ruby-keyword kw">else</span>
205
+ <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">:any</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>
206
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
207
+ <span class="ruby-keyword kw">else</span>
208
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>][<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
209
+ <span class="ruby-keyword kw">end</span>
210
+ <span class="ruby-keyword kw">end</span>
211
+ <span class="ruby-keyword kw">end</span>
212
+ </pre>
213
+ </div>
214
+ </div>
215
+ </div>
216
+
217
+
218
+ </div>
219
+
220
+
221
+ </div>
222
+
223
+
224
+ <div id="validator-badges">
225
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
226
+ </div>
227
+
228
+ </body>
229
+ </html>