spider 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,3 +1,13 @@
1
+ 2007-10-22:
2
+ * Use RSpec to ensure that it mostly works.
3
+ * Use WEBrick to create a small test server for additional testing.
4
+ * Completely re-do the API to prepare for future expansion.
5
+ * Add the ability to apply each URL to a series of custom allowed?-like
6
+ matchers.
7
+
8
+ 2007-03-30:
9
+ * Clean up the documentation.
10
+
1
11
  2007-03-28:
2
12
  * Change the tail recursion to a `while' loop, to please Ruby.
3
13
  * Documentation.
data/README CHANGED
@@ -1,37 +1,41 @@
1
1
  Spider, a Web spidering library for Ruby. It handles the robots.txt,
2
2
  scraping, collecting, and looping so that you can just handle the data.
3
3
 
4
- == Requirements ==
4
+ == Usage
5
+
6
+ Spider.start_at('http://mike-burns.com/') do |s|
7
+ # Limit the pages to just this domain.
8
+ s.add_url_check do |a_url|
9
+ a_url =~ %r{^http://mike-burns.com.*}
10
+ end
11
+
12
+ # Handle 404s.
13
+ s.on 404 do |a_url, err_code|
14
+ puts "URL not found: #{a_url}"
15
+ end
16
+
17
+ # Handle 2xx.
18
+ s.on :success do |a_url, code, headers, body|
19
+ puts "body: #{body}"
20
+ end
21
+
22
+ # Handle everything.
23
+ s.on :any do |a_url, resp|
24
+ puts "URL returned anything: #{a_url} with this code #{resp.code}"
25
+ end
26
+ end
27
+
28
+
29
+ == Requirements
5
30
 
6
31
  This library uses `robot_rules' (included), `open-uri', and `uri'. Any modern
7
32
  Ruby should work; if yours doesn't, let me know so I can update this with your
8
33
  version number.
9
34
 
10
- == Usage ==
11
-
12
- One function: `spider'. It takes a list of seed URLs and a block; this block is
13
- passed each URL and its Web page. This function never returns, ideally.
14
-
15
- spider : [String] (String String -> a) -> omega
16
-
17
- Examples:
18
-
19
- require 'spider'
20
- include Spider
21
-
22
- spider(['http://yahoo.com']) do |a_url, web_page|
23
- puts "At #{a_url}"
24
- end
25
-
26
- spider(['http://mike-burns.com','http://matthoran.com']) do |u, page|
27
- # assumes `scrape_images' and `store_image!' functions.
28
- scrape_images(page).each { |img| store_image!(img) }
29
- end
30
-
31
- == Author ==
35
+ == Author
32
36
 
33
37
  Mike Burns http://mike-burns.com mike@mike-burns.com
34
38
 
35
- With help from Matt Horan.
39
+ With help from Matt Horan and John Nagro.
36
40
  With `robot_rules' from James Edward Gray II via
37
- http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
41
+ http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
@@ -0,0 +1,101 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Module: Net</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Module</strong></td>
53
+ <td class="class-name-in-header">Net</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ </td>
59
+ </tr>
60
+
61
+ </table>
62
+ </div>
63
+ <!-- banner header -->
64
+
65
+ <div id="bodyContent">
66
+
67
+
68
+
69
+ <div id="contextContent">
70
+
71
+
72
+
73
+ </div>
74
+
75
+
76
+ </div>
77
+
78
+
79
+ <!-- if includes -->
80
+
81
+ <div id="section">
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+ <!-- if method_list -->
91
+
92
+
93
+ </div>
94
+
95
+
96
+ <div id="validator-badges">
97
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
98
+ </div>
99
+
100
+ </body>
101
+ </html>
@@ -0,0 +1,180 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: Spider</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">Spider</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider_rb.html">
59
+ lib/spider.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+ <div id="description">
82
+ <p>
83
+ A spidering library for Ruby. Handles robots.txt, scraping, finding more
84
+ links, and doing it all over again.
85
+ </p>
86
+
87
+ </div>
88
+
89
+
90
+ </div>
91
+
92
+ <div id="method-list">
93
+ <h3 class="section-bar">Methods</h3>
94
+
95
+ <div class="name-list">
96
+ <a href="#M000003">start_at</a>&nbsp;&nbsp;
97
+ </div>
98
+ </div>
99
+
100
+ </div>
101
+
102
+
103
+ <!-- if includes -->
104
+
105
+ <div id="section">
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+ <!-- if method_list -->
115
+ <div id="methods">
116
+ <h3 class="section-bar">Public Class methods</h3>
117
+
118
+ <div id="method-M000003" class="method-detail">
119
+ <a name="M000003"></a>
120
+
121
+ <div class="method-heading">
122
+ <a href="#M000003" class="method-signature">
123
+ <span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
124
+ </a>
125
+ </div>
126
+
127
+ <div class="method-description">
128
+ <p>
129
+ Runs the spider starting at the given URL. Also takes a block that is given
130
+ the <a href="SpiderInstance.html">SpiderInstance</a>. Use the block to
131
+ define the rules and handlers for the discovered Web pages.
132
+ </p>
133
+ <pre>
134
+ Spider.start_at('http://mike-burns.com/') do |s|
135
+ s.add_url_check do |a_url|
136
+ a_url =~ %r{^http://mike-burns.com.*}
137
+ end
138
+
139
+ s.on 404 do |a_url, err_code|
140
+ puts &quot;URL not found: #{a_url}&quot;
141
+ end
142
+
143
+ s.on :success do |a_url, code, headers, body|
144
+ puts &quot;body: #{body}&quot;
145
+ end
146
+
147
+ s.on :any do |a_url, resp|
148
+ puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
149
+ end
150
+ end
151
+ </pre>
152
+ <p><a class="source-toggle" href="#"
153
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
154
+ <div class="method-source-code" id="M000003-source">
155
+ <pre>
156
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 55</span>
157
+ <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
158
+ <span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
159
+ <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
160
+ <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">a_spider</span>)
161
+ <span class="ruby-identifier">a_spider</span>.<span class="ruby-identifier">start!</span>
162
+ <span class="ruby-keyword kw">end</span>
163
+ </pre>
164
+ </div>
165
+ </div>
166
+ </div>
167
+
168
+
169
+ </div>
170
+
171
+
172
+ </div>
173
+
174
+
175
+ <div id="validator-badges">
176
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
177
+ </div>
178
+
179
+ </body>
180
+ </html>
@@ -0,0 +1,229 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: SpiderInstance</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">SpiderInstance</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/spider_rb.html">
59
+ lib/spider.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000001">add_url_check</a>&nbsp;&nbsp;
90
+ <a href="#M000002">on</a>&nbsp;&nbsp;
91
+ </div>
92
+ </div>
93
+
94
+ </div>
95
+
96
+
97
+ <!-- if includes -->
98
+
99
+ <div id="section">
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+ <!-- if method_list -->
109
+ <div id="methods">
110
+ <h3 class="section-bar">Public Instance methods</h3>
111
+
112
+ <div id="method-M000001" class="method-detail">
113
+ <a name="M000001"></a>
114
+
115
+ <div class="method-heading">
116
+ <a href="#M000001" class="method-signature">
117
+ <span class="method-name">add_url_check</span><span class="method-args">(&amp;block)</span>
118
+ </a>
119
+ </div>
120
+
121
+ <div class="method-description">
122
+ <p>
123
+ Add a predicate that determines whether to continue down this URL&#8217;s
124
+ path. All predicates must be true in order for a URL to proceed.
125
+ </p>
126
+ <p>
127
+ Takes a block that takes a string and produces a boolean. For example, this
128
+ will ensure that the URL starts with &#8216;<a
129
+ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
130
+ </p>
131
+ <pre>
132
+ add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
133
+ </pre>
134
+ <p><a class="source-toggle" href="#"
135
+ onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
136
+ <div class="method-source-code" id="M000001-source">
137
+ <pre>
138
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 81</span>
139
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
140
+ <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
141
+ <span class="ruby-keyword kw">end</span>
142
+ </pre>
143
+ </div>
144
+ </div>
145
+ </div>
146
+
147
+ <div id="method-M000002" class="method-detail">
148
+ <a name="M000002"></a>
149
+
150
+ <div class="method-heading">
151
+ <a href="#M000002" class="method-signature">
152
+ <span class="method-name">on</span><span class="method-args">(code, p = nil, &amp;block)</span>
153
+ </a>
154
+ </div>
155
+
156
+ <div class="method-description">
157
+ <p>
158
+ Add a response handler. A response handler&#8217;s trigger can be :any,
159
+ :success, :failure, or any HTTP status code. The handler itself can be
160
+ either a Proc or a block. The arguments to the block depend <a
161
+ href="SpiderInstance.html#M000002">on</a> the trigger:
162
+ </p>
163
+ <p>
164
+ If the trigger is :any, the arguments are the URL as a string and an
165
+ instance of Net::HTTPResponse.
166
+ </p>
167
+ <p>
168
+ If the trigger is :success or any HTTP status code that represents a
169
+ successful result, the arguments are the URL as a string, the HTTP status
170
+ code, an instance of Net::HTTPSuccess, and the body of the result as a
171
+ string.
172
+ </p>
173
+ <p>
174
+ If the trigger is :failure or any HTTP status code that represents a failed
175
+ result, the arguments are the URL as a string and the HTTP status code.
176
+ </p>
177
+ <p>
178
+ For example:
179
+ </p>
180
+ <pre>
181
+ on 404 do |a_url, code|
182
+ puts &quot;URL not found: #{a_url}&quot;
183
+ end
184
+
185
+ on :success do |a_url, code, resp, body|
186
+ puts a_url
187
+ puts body
188
+ end
189
+
190
+ on :any do |a_url, resp|
191
+ puts &quot;Given this code: #{resp.code}&quot;
192
+ end
193
+ </pre>
194
+ <p><a class="source-toggle" href="#"
195
+ onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
196
+ <div class="method-source-code" id="M000002-source">
197
+ <pre>
198
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 118</span>
199
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
200
+ <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
201
+ <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
202
+ <span class="ruby-keyword kw">when</span> <span class="ruby-constant">Fixnum</span>
203
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">success_or_failure</span>(<span class="ruby-identifier">code</span>)][<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
204
+ <span class="ruby-keyword kw">else</span>
205
+ <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">:any</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>
206
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
207
+ <span class="ruby-keyword kw">else</span>
208
+ <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>][<span class="ruby-identifier">:any</span>] = <span class="ruby-identifier">f</span>
209
+ <span class="ruby-keyword kw">end</span>
210
+ <span class="ruby-keyword kw">end</span>
211
+ <span class="ruby-keyword kw">end</span>
212
+ </pre>
213
+ </div>
214
+ </div>
215
+ </div>
216
+
217
+
218
+ </div>
219
+
220
+
221
+ </div>
222
+
223
+
224
+ <div id="validator-badges">
225
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
226
+ </div>
227
+
228
+ </body>
229
+ </html>