spider 0.3.0 → 0.4.0

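The diffs below compare the rdoc pages shipped with 0.3.0 and 0.4.0. The headline change is the README's move from a single Usage walkthrough to per-outcome callbacks (:success, :failure, :every), a setup hook for per-request headers, and pluggable cycle checking via check_already_seen_with. A minimal sketch combining the new hooks, assembled from the examples in the README diff below (the start URL is the README's own placeholder):

  require 'spider'

  Spider.start_at('http://mike-burns.com/') do |s|
    # Stay on one domain, as in the "Limit to just one domain" example.
    s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }

    # Log failures along with the page that linked to them.
    s.on :failure do |a_url, resp, prior_url|
      puts "URL failed: #{a_url} (linked from #{prior_url})"
    end

    # Print each successful response code.
    s.on :success do |a_url, resp, prior_url|
      puts "#{a_url}: #{resp.code}"
    end
  end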
@@ -1 +1 @@
- Wed, 31 Oct 2007 23:51:58 -0400
+ Fri, 02 Nov 2007 17:20:02 -0400
@@ -56,7 +56,7 @@
  </tr>
  <tr class="top-aligned-row">
  <td><strong>Last Update:</strong></td>
- <td>Wed Oct 31 23:26:17 -0400 2007</td>
+ <td>Fri Nov 02 17:19:47 -0400 2007</td>
  </tr>
  </table>
  </div>
@@ -74,44 +74,118 @@
  Ruby. It handles the robots.txt, scraping, collecting, and looping so that
  you can just handle the data.
  </p>
- <h2>Usage</h2>
+ <h2>Examples</h2>
+ <h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
  <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') {}
+ </pre>
+ <h3>To handle erroneous responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.on :failure do |a_url, resp, prior_url|
+     puts &quot;URL failed: #{a_url}&quot;
+     puts &quot; linked from #{prior_url}&quot;
+   end
+ end
+ </pre>
+ <h3>Or handle successful responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.on :success do |a_url, resp, prior_url|
+     puts &quot;#{a_url}: #{resp.code}&quot;
+     puts resp.body
+     puts
+   end
+ end
+ </pre>
+ <h3>Limit to just one domain</h3>
+ <pre>
+ require 'spider'
  Spider.start_at('http://mike-burns.com/') do |s|
-   # Limit the pages to just this domain.
    s.add_url_check do |a_url|
      a_url =~ %r{^http://mike-burns.com.*}
    end
-
-   # Handle 404s.
-   s.on 404 do |a_url, resp, prior_url|
-     puts &quot;URL not found: #{a_url}&quot;
+ end
+ </pre>
+ <h3>Pass headers to some requests</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.setup do |a_url|
+     if a_url =~ %r{^http://.*wikipedia.*}
+       headers['User-Agent'] = &quot;Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)&quot;
+     end
    end
+ end
+ </pre>
+ <h3>Use memcached to track cycles</h3>
+ <pre>
+ require 'spider'
+ require 'spider/included_in_memcached'
+ SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
+ end
+ </pre>
+ <h3>Track cycles with a custom object</h3>
+ <pre>
+ require 'spider'

-   # Handle 2xx.
-   s.on :success do |a_url, resp, prior_url|
-     puts &quot;body: #{resp.body}&quot;
+ class ExpireLinks &lt; Hash
+   def &lt;&lt;(v)
+     self[v] = Time.now
+   end
+   def include?(v)
+     self[v] &amp;&amp; (Time.now - 86400) &lt;= self[v]
    end
+ end

-   # Handle everything.
-   s.on :every do |a_url, resp, prior_url|
-     puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.check_already_seen_with ExpireLinks.new
+ end
+ </pre>
+ <h3>Create a URL graph</h3>
+ <pre>
+ require 'spider'
+ nodes = {}
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+
+   s.on(:every) do |a_url, resp, prior_url|
+     nodes[prior_url] ||= []
+     nodes[prior_url] &lt;&lt; a_url
+   end
+ end
+ </pre>
+ <h3>Use a proxy</h3>
+ <pre>
+ require 'net/http_configuration'
+ require 'spider'
+ http_conf = Net::HTTP::Configuration.new(:proxy_host =&gt; '7proxies.org',
+                                          :proxy_port =&gt; 8881)
+ http_conf.apply do
+   Spider.start_at('http://img.4chan.org/b/') do |s|
+     s.on(:success) do |a_url, resp, prior_url|
+       File.open(a_url.gsub('/',':'),'w') do |f|
+         f.write(resp.body)
+       end
+     end
    end
  end
  </pre>
- <h2>Requirements</h2>
- <p>
- This library uses `robot_rules&#8217; (included), `open-uri&#8217;, and
- `uri&#8217;. Any modern Ruby should work; if yours doesn&#8216;t, let me
- know so I can update this with your version number.
- </p>
  <h2>Author</h2>
  <p>
  Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
  mike@mike-burns.com
  </p>
  <p>
- With help from Matt Horan and John Nagro. With `robot_rules&#8217; from
- James Edward Gray II via <a
+ Help from Matt Horan and John Nagro.
+ </p>
+ <p>
+ With `robot_rules&#8217; from James Edward Gray II via <a
  href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
  </p>

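The ExpireLinks example above works because check_already_seen_with accepts any object responding to &lt;&lt; and include?, as both it and IncludedInMemcached imply: the hash stores a Time per URL, and include? treats a URL as seen only while that timestamp is under a day old, so stale pages eventually get re-fetched. A short illustration of that intended behavior (the URL is the README's own placeholder):

  links = ExpireLinks.new
  links << 'http://mike-burns.com/'
  links.include?('http://mike-burns.com/')   # => true: stamped within the last day
  # Once the stored Time is more than 86400 seconds old, include? returns
  # false and the spider will fetch that URL again.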
@@ -5,10 +5,10 @@

  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
- <title>Module: Net</title>
+ <title>File: included_in_memcached.rb</title>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
  <meta http-equiv="Content-Script-Type" content="text/javascript" />
- <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
  <script type="text/javascript">
  // <![CDATA[

@@ -46,20 +46,20 @@



- <div id="classHeader">
- <table class="header-table">
- <tr class="top-aligned-row">
- <td><strong>Module</strong></td>
- <td class="class-name-in-header">Net</td>
- </tr>
- <tr class="top-aligned-row">
- <td><strong>In:</strong></td>
- <td>
- </td>
- </tr>
-
- </table>
- </div>
+ <div id="fileHeader">
+ <h1>included_in_memcached.rb</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>lib/included_in_memcached.rb
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>Fri Nov 02 15:04:14 -0400 2007</td>
+ </tr>
+ </table>
+ </div>
  <!-- banner header -->

  <div id="bodyContent">
@@ -69,6 +69,13 @@
  <div id="contextContent">


+ <div id="requires-list">
+ <h3 class="section-bar">Required files</h3>
+
+ <div class="name-list">
+ memcache&nbsp;&nbsp;
+ </div>
+ </div>

  </div>

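The included_in_memcached.rb page above shows that the new memcached-backed cycle tracking pulls in only the `memcache' library. The gem's actual IncludedInMemcached source is not part of this diff, but any adapter just has to satisfy the same &lt;&lt; / include? duck type as ExpireLinks. A hypothetical sketch of such an adapter (the class name and method bodies are assumptions, not the gem's code):

  require 'memcache'

  class MemcachedSeenList
    def initialize(servers)
      @cache = MemCache.new(servers)   # e.g. ['10.0.10.2:11211']
    end

    # Record a URL as visited.
    def <<(url)
      @cache.set(url.to_s, true)
    end

    # Report whether a URL was already visited.
    def include?(url)
      !@cache.get(url.to_s).nil?
    end
  end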
@@ -0,0 +1,118 @@
+ <?xml version="1.0" encoding="iso-8859-1"?>
+ <!DOCTYPE html
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <title>File: spider_instance.rb</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+ <script type="text/javascript">
+ // <![CDATA[
+
+ function popupCode( url ) {
+   window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+ }
+
+ function toggleCode( id ) {
+   if ( document.getElementById )
+     elem = document.getElementById( id );
+   else if ( document.all )
+     elem = eval( "document.all." + id );
+   else
+     return false;
+
+   elemStyle = elem.style;
+
+   if ( elemStyle.display != "block" ) {
+     elemStyle.display = "block"
+   } else {
+     elemStyle.display = "none"
+   }
+
+   return true;
+ }
+
+ // Make codeblocks hidden by default
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
+
+ // ]]>
+ </script>
+
+ </head>
+ <body>
+
+
+
+ <div id="fileHeader">
+ <h1>spider_instance.rb</h1>
+ <table class="header-table">
+ <tr class="top-aligned-row">
+ <td><strong>Path:</strong></td>
+ <td>lib/spider_instance.rb
+ </td>
+ </tr>
+ <tr class="top-aligned-row">
+ <td><strong>Last Update:</strong></td>
+ <td>Fri Nov 02 17:05:49 -0400 2007</td>
+ </tr>
+ </table>
+ </div>
+ <!-- banner header -->
+
+ <div id="bodyContent">
+
+
+
+ <div id="contextContent">
+
+ <div id="description">
+ <p>
+ Copyright 2007 Mike Burns
+ </p>
+
+ </div>
+
+ <div id="requires-list">
+ <h3 class="section-bar">Required files</h3>
+
+ <div class="name-list">
+ robot_rules&nbsp;&nbsp;
+ open-uri&nbsp;&nbsp;
+ uri&nbsp;&nbsp;
+ net/http&nbsp;&nbsp;
+ net/https&nbsp;&nbsp;
+ </div>
+ </div>
+
+ </div>
+
+
+ </div>
+
+
+ <!-- if includes -->
+
+ <div id="section">
+
+
+
+
+
+
+
+
+ <!-- if method_list -->
+
+
+ </div>
+
+
+ <div id="validator-badges">
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+ </div>
+
+ </body>
+ </html>
@@ -56,7 +56,7 @@
  </tr>
  <tr class="top-aligned-row">
  <td><strong>Last Update:</strong></td>
- <td>Wed Oct 31 23:25:57 -0400 2007</td>
+ <td>Fri Nov 02 12:32:39 -0400 2007</td>
  </tr>
  </table>
  </div>
@@ -74,60 +74,123 @@ Copyright 2007 Mike Burns <a href="../../classes/Spider.html">Spider</a>, a
  Web spidering library for Ruby. It handles the robots.txt, scraping,
  collecting, and looping so that you can just handle the data.
  </p>
- <h2>Usage</h2>
+ <h2>Examples</h2>
+ <h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
  <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') {}
+ </pre>
+ <h3>To handle erroneous responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.on :failure do |a_url, resp, prior_url|
+     puts &quot;URL failed: #{a_url}&quot;
+     puts &quot; linked from #{prior_url}&quot;
+   end
+ end
+ </pre>
+ <h3>Or handle successful responses</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.on :success do |a_url, resp, prior_url|
+     puts &quot;#{a_url}: #{resp.code}&quot;
+     puts resp.body
+     puts
+   end
+ end
+ </pre>
+ <h3>Limit to just one domain</h3>
+ <pre>
+ require 'spider'
  Spider.start_at('http://mike-burns.com/') do |s|
-   # Limit the pages to just this domain.
    s.add_url_check do |a_url|
      a_url =~ %r{^http://mike-burns.com.*}
    end
-
-   # Handle 404s.
-   s.on 404 do |a_url, resp, prior_url|
-     puts &quot;URL not found: #{a_url}&quot;
+ end
+ </pre>
+ <h3>Pass headers to some requests</h3>
+ <pre>
+ require 'spider'
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.setup do |a_url|
+     if a_url =~ %r{^http://.*wikipedia.*}
+       headers['User-Agent'] = &quot;Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)&quot;
+     end
    end
+ end
+ </pre>
+ <h3>Use memcached to track cycles</h3>
+ <pre>
+ require 'spider'
+ require 'spider/included_in_memcached'
+ SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.check_already_seen_with IncludedInMemcached.new(SERVERS)
+ end
+ </pre>
+ <h3>Track cycles with a custom object</h3>
+ <pre>
+ require 'spider'

-   # Handle 2xx.
-   s.on :success do |a_url, resp, prior_url|
-     puts &quot;body: #{resp.body}&quot;
+ class ExpireLinks &lt; Hash
+   def &lt;&lt;(v)
+     self[v] = Time.now
+   end
+   def include?(v)
+     self[v] &amp;&amp; (Time.now - 86400) &lt;= self[v]
    end
+ end
+
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.check_already_seen_with ExpireLinks.new
+ end
+ </pre>
+ <h3>Create a URL graph</h3>
+ <pre>
+ require 'spider'
+ nodes = {}
+ Spider.start_at('http://mike-burns.com/') do |s|
+   s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }

-   # Handle everything.
-   s.on :every do |a_url, resp, prior_url|
-     puts &quot;URL returned anything: #{a_url} with this code #{resp.code}&quot;
+   s.on(:every) do |a_url, resp, prior_url|
+     nodes[prior_url] ||= []
+     nodes[prior_url] &lt;&lt; a_url
+   end
+ end
+ </pre>
+ <h3>Use a proxy</h3>
+ <pre>
+ require 'net/http_configuration'
+ require 'spider'
+ http_conf = Net::HTTP::Configuration.new(:proxy_host =&gt; '7proxies.org',
+                                          :proxy_port =&gt; 8881)
+ http_conf.apply do
+   Spider.start_at('http://img.4chan.org/b/') do |s|
+     s.on(:success) do |a_url, resp, prior_url|
+       File.open(a_url.gsub('/',':'),'w') do |f|
+         f.write(resp.body)
+       end
+     end
    end
  end
  </pre>
- <h2>Requirements</h2>
- <p>
- This library uses `robot_rules&#8217; (included), `open-uri&#8217;, and
- `uri&#8217;. Any modern Ruby should work; if yours doesn&#8216;t, let me
- know so I can update this with your version number.
- </p>
  <h2>Author</h2>
  <p>
  Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
  mike@mike-burns.com
  </p>
  <p>
- With help from Matt Horan and John Nagro. With `robot_rules&#8217; from
- James Edward Gray II via <a
+ Help from Matt Horan and John Nagro.
+ </p>
+ <p>
+ With `robot_rules&#8217; from James Edward Gray II via <a
  href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
  </p>

  </div>

- <div id="requires-list">
- <h3 class="section-bar">Required files</h3>
-
- <div class="name-list">
- robot_rules&nbsp;&nbsp;
- open-uri&nbsp;&nbsp;
- uri&nbsp;&nbsp;
- net/http&nbsp;&nbsp;
- net/https&nbsp;&nbsp;
- </div>
- </div>

  </div>