spider 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
- Wed, 31 Oct 2007 23:51:58 -0400
+ Fri, 02 Nov 2007 17:20:02 -0400
@@ -56,7 +56,7 @@
  </tr>
  <tr class="top-aligned-row">
  <td><strong>Last Update:</strong></td>
- <td>Wed Oct 31 23:26:17 -0400 2007</td>
+ <td>Fri Nov 02 17:19:47 -0400 2007</td>
  </tr>
  </table>
  </div>
@@ -74,44 +74,118 @@
  Ruby. It handles the robots.txt, scraping, collecting, and looping so that
  you can just handle the data.
  </p>
- <h2>Usage</h2>
+ <h2>Examples</h2>
+ <h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
  <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') {}
+ </pre>
+ <h3>To handle erroneous responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.on :failure do |a_url, resp, prior_url|
+ puts &quot;URL failed: #{a_url}&quot;
+ puts &quot; linked from #{prior_url}&quot;
+ end
+ end
+ </pre>
+ <h3>Or handle successful responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.on :success do |a_url, resp, prior_url|
+ puts &quot;#{a_url}: #{resp.code}&quot;
+ puts resp.body
+ puts
+ end
+ end
+ </pre>
+ <h3>Limit to just one domain</h3>
+ <pre>
+ require 'spider'
  Spider.start_at('http://mike-burns.com/') do |s|
- # Limit the pages to just this domain.
  s.add_url_check do |a_url|
  a_url =~ %r{^http://mike-burns.com.*}
  end
-
- # Handle 404s.
- s.on 404 do |a_url, resp, prior_url|
- puts &quot;URL not found: #{a_url}&quot;
+ end
+ </pre>
+ <h3>Pass headers to some requests</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.setup do |a_url|
+ if a_url =~ %r{^http://.*wikipedia.*}
+ headers['User-Agent'] = &quot;Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)&quot;
+ end
  end
+ end
+ </pre>
+ <h3>Use memcached to track cycles</h3>
+ <pre>
+ require 'spider'
+ require 'spider/included_in_memcached'
+ SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.check_already_seen_with IncludedInMemcached.new(SERVERS)
+ end
+ </pre>
+ <h3>Track cycles with a custom object</h3>
+ <pre>
+ require 'spider'
 
- # Handle 2xx.
- s.on :success do |a_url, resp, prior_url|
- puts &quot;body: #{resp.body}&quot;
+ class ExpireLinks &lt; Hash
+ def &lt;&lt;(v)
+ self[v] = Time.now
+ end
+ def include?(v)
+ self[v] &amp;&amp; (self[v] + 86400) &gt;= Time.now
  end
+ end
 
- # Handle everything.
- s.on :every do |a_url, resp, prior_url|
- puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.check_already_seen_with ExpireLinks.new
+ end
+ </pre>
+ <h3>Create a URL graph</h3>
+ <pre>
+ require 'spider'
+ nodes = {}
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+
+ s.on(:every) do |a_url, resp, prior_url|
+ nodes[prior_url] ||= []
+ nodes[prior_url] &lt;&lt; a_url
+ end
+ end
+ </pre>
+ <h3>Use a proxy</h3>
+ <pre>
+ require 'net/http_configuration'
+ require 'spider'
+ http_conf = Net::HTTP::Configuration.new(:proxy_host =&gt; '7proxies.org',
+ :proxy_port =&gt; 8881)
+ http_conf.apply do
+ Spider.start_at('http://img.4chan.org/b/') do |s|
+ s.on(:success) do |a_url, resp, prior_url|
+ File.open(a_url.gsub('/',':'),'w') do |f|
+ f.write(resp.body)
+ end
+ end
  end
  end
  </pre>
- <h2>Requirements</h2>
- <p>
- This library uses `robot_rules&#8217; (included), `open-uri&#8217;, and
- `uri&#8217;. Any modern Ruby should work; if yours doesn&#8216;t, let me
- know so I can update this with your version number.
- </p>
  <h2>Author</h2>
  <p>
  Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
  mike@mike-burns.com
  </p>
  <p>
- With help from Matt Horan and John Nagro. With `robot_rules&#8217; from
- James Edward Gray II via <a
+ Help from Matt Horan and John Nagro.
+ </p>
+ <p>
+ With `robot_rules&#8217; from James Edward Gray II via <a
  href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
  </p>
 
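The Examples hunk above replaces the old Usage block, whose removed lines show that `on' also accepts a specific HTTP status code (`s.on 404 ...') alongside the symbolic :failure, :success, and :every forms. A minimal sketch combining both forms, reusing only calls that appear in the diff above:

    require 'spider'

    Spider.start_at('http://mike-burns.com/') do |s|
      # Numeric form, as in the removed "Handle 404s" example.
      s.on 404 do |a_url, resp, prior_url|
        puts "URL not found: #{a_url} (linked from #{prior_url})"
      end

      # Symbolic form, as in the "Create a URL graph" example.
      s.on :every do |a_url, resp, prior_url|
        puts "#{a_url} returned #{resp.code}"
      end
    end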
@@ -5,10 +5,10 @@
 
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
- <title>Module: Net</title>
+ <title>File: included_in_memcached.rb</title>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
  <meta http-equiv="Content-Script-Type" content="text/javascript" />
- <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
  <script type="text/javascript">
  // <![CDATA[
 
@@ -46,20 +46,20 @@
 
 
 
- <div id="classHeader">
- <table class="header-table">
- <tr class="top-aligned-row">
- <td><strong>Module</strong></td>
- <td class="class-name-in-header">Net</td>
- </tr>
- <tr class="top-aligned-row">
- <td><strong>In:</strong></td>
- <td>
- </td>
- </tr>
-
- </table>
- </div>
+ <div id="fileHeader">
+ <h1>included_in_memcached.rb</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>lib/included_in_memcached.rb
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>Fri Nov 02 15:04:14 -0400 2007</td>
+ </tr>
+ </table>
+ </div>
  <!-- banner header -->
 
  <div id="bodyContent">
@@ -69,6 +69,13 @@
  <div id="contextContent">
 
 
+ <div id="requires-list">
+ <h3 class="section-bar">Required files</h3>
+
+ <div class="name-list">
+ memcache&nbsp;&nbsp;
+ </div>
+ </div>
 
  </div>
 
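This hunk records that included_in_memcached.rb requires only `memcache', the Ruby memcache-client gem. For orientation, here is a minimal sketch of the duck type that `check_already_seen_with' expects (an object answering << and include?), backed by that gem; the class name and body are illustrative, inferred from the examples above rather than copied from the gem's source:

    require 'memcache'

    # Hypothetical cycle tracker: stores visited URLs in memcached.
    # Spider only ever calls << and include? on it.
    class MemcachedSeenList
      def initialize(servers)
        @cache = MemCache.new(servers)
      end

      # Record a URL as visited.
      def <<(url)
        @cache.set(url.to_s, true)
      end

      # True if this URL was already visited.
      def include?(url)
        !@cache.get(url.to_s).nil?
      end
    end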
@@ -0,0 +1,118 @@
+ <?xml version="1.0" encoding="iso-8859-1"?>
+ <!DOCTYPE html
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <title>File: spider_instance.rb</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+ if ( document.getElementById )
+ elem = document.getElementById( id );
+ else if ( document.all )
+ elem = eval( "document.all." + id );
+ else
+ return false;
+
+ elemStyle = elem.style;
+
+ if ( elemStyle.display != "block" ) {
+ elemStyle.display = "block"
+ } else {
+ elemStyle.display = "none"
+ }
+
+ return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
+
+ // ]]>
+ </script>
+
+ </head>
+ <body>
+
+
+
+ <div id="fileHeader">
+ <h1>spider_instance.rb</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>lib/spider_instance.rb
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>Fri Nov 02 17:05:49 -0400 2007</td>
+ </tr>
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+
+
+ <div id="contextContent">
+
+ <div id="description">
+ <p>
+ Copyright 2007 Mike Burns
+ </p>
+
+ </div>
+
+ <div id="requires-list">
+ <h3 class="section-bar">Required files</h3>
+
+ <div class="name-list">
+ robot_rules&nbsp;&nbsp;
+ open-uri&nbsp;&nbsp;
+ uri&nbsp;&nbsp;
+ net/http&nbsp;&nbsp;
+ net/https&nbsp;&nbsp;
+ </div>
+ </div>
+
+ </div>
+
+
+ </div>
+
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+
+
+
+
+
+ <!-- if method_list -->
+
+
+ </div>
+
+
+ <div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+ </div>
+
+ </body>
+ </html>
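The new spider_instance.rb page lists `robot_rules' among its required files; this is the robots.txt parser credited to James Edward Gray II in the README hunks. A hedged sketch of how such a parser is typically driven, assuming the RobotRules interface from the cited ruby-talk post (new takes a user-agent string, parse takes a URL plus the fetched robots.txt body, allowed? takes a URL):

    require 'open-uri'
    require 'robot_rules'  # bundled with spider, per the README

    # Assumed interface, per the ruby-talk post cited in the README.
    rules = RobotRules.new('Ruby Spider 1.0')
    robots_url = 'http://mike-burns.com/robots.txt'
    rules.parse(robots_url, URI.parse(robots_url).read)

    puts rules.allowed?('http://mike-burns.com/')  # false if robots.txt disallows it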
@@ -56,7 +56,7 @@
  </tr>
  <tr class="top-aligned-row">
  <td><strong>Last Update:</strong></td>
- <td>Wed Oct 31 23:25:57 -0400 2007</td>
+ <td>Fri Nov 02 12:32:39 -0400 2007</td>
  </tr>
  </table>
  </div>
@@ -74,60 +74,123 @@ Copyright 2007 Mike Burns <a href="../../classes/Spider.html">Spider</a>, a
  Web spidering library for Ruby. It handles the robots.txt, scraping,
  collecting, and looping so that you can just handle the data.
  </p>
- <h2>Usage</h2>
+ <h2>Examples</h2>
+ <h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
  <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') {}
+ </pre>
+ <h3>To handle erroneous responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.on :failure do |a_url, resp, prior_url|
+ puts &quot;URL failed: #{a_url}&quot;
+ puts &quot; linked from #{prior_url}&quot;
+ end
+ end
+ </pre>
+ <h3>Or handle successful responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.on :success do |a_url, resp, prior_url|
+ puts &quot;#{a_url}: #{resp.code}&quot;
+ puts resp.body
+ puts
+ end
+ end
+ </pre>
+ <h3>Limit to just one domain</h3>
+ <pre>
+ require 'spider'
  Spider.start_at('http://mike-burns.com/') do |s|
- # Limit the pages to just this domain.
  s.add_url_check do |a_url|
  a_url =~ %r{^http://mike-burns.com.*}
  end
-
- # Handle 404s.
- s.on 404 do |a_url, resp, prior_url|
- puts &quot;URL not found: #{a_url}&quot;
+ end
+ </pre>
+ <h3>Pass headers to some requests</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.setup do |a_url|
+ if a_url =~ %r{^http://.*wikipedia.*}
+ headers['User-Agent'] = &quot;Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)&quot;
+ end
  end
+ end
+ </pre>
+ <h3>Use memcached to track cycles</h3>
+ <pre>
+ require 'spider'
+ require 'spider/included_in_memcached'
+ SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.check_already_seen_with IncludedInMemcached.new(SERVERS)
+ end
+ </pre>
+ <h3>Track cycles with a custom object</h3>
+ <pre>
+ require 'spider'
 
- # Handle 2xx.
- s.on :success do |a_url, resp, prior_url|
- puts &quot;body: #{resp.body}&quot;
+ class ExpireLinks &lt; Hash
+ def &lt;&lt;(v)
+ self[v] = Time.now
+ end
+ def include?(v)
+ self[v] &amp;&amp; (self[v] + 86400) &gt;= Time.now
  end
+ end
+
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.check_already_seen_with ExpireLinks.new
+ end
+ </pre>
+ <h3>Create a URL graph</h3>
+ <pre>
+ require 'spider'
+ nodes = {}
+ Spider.start_at('http://mike-burns.com/') do |s|
+ s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
 
- # Handle everything.
- s.on :every do |a_url, resp, prior_url|
- puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
+ s.on(:every) do |a_url, resp, prior_url|
+ nodes[prior_url] ||= []
+ nodes[prior_url] &lt;&lt; a_url
+ end
+ end
+ </pre>
+ <h3>Use a proxy</h3>
+ <pre>
+ require 'net/http_configuration'
+ require 'spider'
+ http_conf = Net::HTTP::Configuration.new(:proxy_host =&gt; '7proxies.org',
+ :proxy_port =&gt; 8881)
+ http_conf.apply do
+ Spider.start_at('http://img.4chan.org/b/') do |s|
+ s.on(:success) do |a_url, resp, prior_url|
+ File.open(a_url.gsub('/',':'),'w') do |f|
+ f.write(resp.body)
+ end
+ end
  end
  end
  </pre>
- <h2>Requirements</h2>
- <p>
- This library uses `robot_rules&#8217; (included), `open-uri&#8217;, and
- `uri&#8217;. Any modern Ruby should work; if yours doesn&#8216;t, let me
- know so I can update this with your version number.
- </p>
  <h2>Author</h2>
  <p>
  Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
  mike@mike-burns.com
  </p>
  <p>
- With help from Matt Horan and John Nagro. With `robot_rules&#8217; from
- James Edward Gray II via <a
+ Help from Matt Horan and John Nagro.
+ </p>
+ <p>
+ With `robot_rules&#8217; from James Edward Gray II via <a
  href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
  </p>
 
  </div>
 
- <div id="requires-list">
- <h3 class="section-bar">Required files</h3>
-
- <div class="name-list">
- robot_rules&nbsp;&nbsp;
- open-uri&nbsp;&nbsp;
- uri&nbsp;&nbsp;
- net/http&nbsp;&nbsp;
- net/https&nbsp;&nbsp;
- </div>
- </div>
 
  </div>
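The "Create a URL graph" example in both README hunks leaves `nodes' as a hash mapping each prior_url to the URLs found on it. A short follow-on sketch, using nothing beyond that example, that prints the collected edges once the crawl returns:

    # After Spider.start_at(...) finishes, dump each edge of the URL graph.
    nodes.each do |from, links|
      links.each { |to| puts "#{from} -> #{to}" }
    end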