spider 0.3.0 → 0.4.0
This diff shows the changes between publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
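
The headline change in 0.4.0 is the split of the 246-line spider.rb into spider.rb plus the new lib/spider_instance.rb, and a new lib/included_in_memcached.rb that tracks visited URLs in memcached. The 22-line implementation itself is not shown in this diff; the sketch below is an assumption based on the file's sole require ('memcache', i.e. the memcache-client gem) and on the documented duck type, which only needs << and include?:

  # Hypothetical reconstruction of lib/included_in_memcached.rb -- the diff
  # shows only the file's existence and its 'memcache' require, not its body.
  require 'memcache'

  class IncludedInMemcached
    def initialize(*args)
      @cache = MemCache.new(*args)   # e.g. ['10.0.10.2:11211', ...]
    end

    def <<(v)                        # record a URL as crawled
      @cache.add(v.to_s, v)
    end

    def include?(v)                  # true if the URL was crawled before
      @cache.get(v.to_s) == v
    end
  end

Any object exposing that pair of methods can be handed to check_already_seen_with, as the README examples below demonstrate.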
data/doc/created.rid
CHANGED
@@ -1 +1 @@
-
+Fri, 02 Nov 2007 17:20:02 -0400
data/doc/files/README.html
CHANGED
@@ -56,7 +56,7 @@
 </tr>
 <tr class="top-aligned-row">
   <td><strong>Last Update:</strong></td>
-  <td>
+  <td>Fri Nov 02 17:19:47 -0400 2007</td>
 </tr>
 </table>
 </div>
@@ -74,44 +74,118 @@
 Ruby. It handles the robots.txt, scraping, collecting, and looping so that
 you can just handle the data.
 </p>
-<h2>
+<h2>Examples</h2>
+<h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
 <pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') {}
+</pre>
+<h3>To handle erroneous responses</h3>
+<pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.on :failure do |a_url, resp, prior_url|
+      puts "URL failed: #{a_url}"
+      puts " linked from #{prior_url}"
+    end
+  end
+</pre>
+<h3>Or handle successful responses</h3>
+<pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.on :success do |a_url, resp, prior_url|
+      puts "#{a_url}: #{resp.code}"
+      puts resp.body
+      puts
+    end
+  end
+</pre>
+<h3>Limit to just one domain</h3>
+<pre>
+  require 'spider'
   Spider.start_at('http://mike-burns.com/') do |s|
-  # Limit the pages to just this domain.
     s.add_url_check do |a_url|
      a_url =~ %r{^http://mike-burns.com.*}
    end
-
-
-
-
+  end
+</pre>
+<h3>Pass headers to some requests</h3>
+<pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.setup do |a_url|
+      if a_url =~ %r{^http://.*wikipedia.*}
+        headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+      end
    end
+  end
+</pre>
+<h3>Use memcached to track cycles</h3>
+<pre>
+  require 'spider'
+  require 'spider/included_in_memcached'
+  SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.check_already_seen_with IncludedInMemcached.new(SERVERS)
+  end
+</pre>
+<h3>Track cycles with a custom object</h3>
+<pre>
+  require 'spider'
 
-
-
-
+  class ExpireLinks < Hash
+    def <<(v)
+      self[v] = Time.now
+    end
+    def include?(v)
+      self[v] && (Time.now + 86400) <= self[v]
    end
+  end
 
-
-  s.
-
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.check_already_seen_with ExpireLinks.new
+  end
+</pre>
+<h3>Create a URL graph</h3>
+<pre>
+  require 'spider'
+  nodes = {}
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+
+    s.on(:every) do |a_url, resp, prior_url|
+      nodes[prior_url] ||= []
+      nodes[prior_url] << a_url
+    end
+  end
+</pre>
+<h3>Use a proxy</h3>
+<pre>
+  require 'net/http_configuration'
+  require 'spider'
+  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
+                                           :proxy_port => 8881)
+  http_conf.apply do
+    Spider.start_at('http://img.4chan.org/b/') do |s|
+      s.on(:success) do |a_url, resp, prior_url|
+        File.open(a_url.gsub('/',':'),'w') do |f|
+          f.write(resp.body)
+        end
+      end
    end
  end
 </pre>
-<h2>Requirements</h2>
-<p>
-This library uses `robot_rules’ (included), `open-uri’, and
-`uri’. Any modern Ruby should work; if yours doesn’t, let me
-know so I can update this with your version number.
-</p>
 <h2>Author</h2>
 <p>
 Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
 mike@mike-burns.com
 </p>
 <p>
-
-
+Help from Matt Horan and John Nagro.
+</p>
+<p>
+With `robot_rules’ from James Edward Gray II via <a
 href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
 </p>
 
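
A note on the ExpireLinks example above: as written, include? keeps an entry only while (Time.now + 86400) <= self[v], i.e. only if the recorded timestamp lies at least a day in the future, so nothing would ever count as already seen. If the intent is "treat a URL as seen for a day after crawling it", a corrected sketch (an assumption about intent, not the gem's code) would be:

  class ExpireLinks < Hash
    def <<(v)
      self[v] = Time.now                       # remember when we saw the URL
    end

    def include?(v)
      self[v] && Time.now <= self[v] + 86400   # seen within the last day
    end
  end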
data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html}
CHANGED
@@ -5,10 +5,10 @@
 
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
-  <title>
+  <title>File: included_in_memcached.rb</title>
   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
   <meta http-equiv="Content-Script-Type" content="text/javascript" />
-  <link rel="stylesheet" href="
+  <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
   <script type="text/javascript">
   // <![CDATA[
 
@@ -46,20 +46,20 @@
 
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  <div id="fileHeader">
+    <h1>included_in_memcached.rb</h1>
+    <table class="header-table">
+    <tr class="top-aligned-row">
+      <td><strong>Path:</strong></td>
+      <td>lib/included_in_memcached.rb
+      </td>
+    </tr>
+    <tr class="top-aligned-row">
+      <td><strong>Last Update:</strong></td>
+      <td>Fri Nov 02 15:04:14 -0400 2007</td>
+    </tr>
+    </table>
+  </div>
 <!-- banner header -->
 
 <div id="bodyContent">
@@ -69,6 +69,13 @@
 <div id="contextContent">
 
 
+  <div id="requires-list">
+    <h3 class="section-bar">Required files</h3>
+
+    <div class="name-list">
+    memcache
+    </div>
+  </div>
 
 </div>
 
data/doc/files/lib/spider_instance_rb.html
ADDED
@@ -0,0 +1,118 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE html
+     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+  <title>File: spider_instance.rb</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+  <meta http-equiv="Content-Script-Type" content="text/javascript" />
+  <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+  <script type="text/javascript">
+  // <![CDATA[
+
+  function popupCode( url ) {
+    window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
+  }
+
+  function toggleCode( id ) {
+    if ( document.getElementById )
+      elem = document.getElementById( id );
+    else if ( document.all )
+      elem = eval( "document.all." + id );
+    else
+      return false;
+
+    elemStyle = elem.style;
+
+    if ( elemStyle.display != "block" ) {
+      elemStyle.display = "block"
+    } else {
+      elemStyle.display = "none"
+    }
+
+    return true;
+  }
+
+  // Make codeblocks hidden by default
+  document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
+
+  // ]]>
+  </script>
+
+</head>
+<body>
+
+
+
+  <div id="fileHeader">
+    <h1>spider_instance.rb</h1>
+    <table class="header-table">
+    <tr class="top-aligned-row">
+      <td><strong>Path:</strong></td>
+      <td>lib/spider_instance.rb
+      </td>
+    </tr>
+    <tr class="top-aligned-row">
+      <td><strong>Last Update:</strong></td>
+      <td>Fri Nov 02 17:05:49 -0400 2007</td>
+    </tr>
+    </table>
+  </div>
+<!-- banner header -->
+
+<div id="bodyContent">
+
+
+
+  <div id="contextContent">
+
+    <div id="description">
+      <p>
+      Copyright 2007 Mike Burns
+      </p>
+
+    </div>
+
+    <div id="requires-list">
+      <h3 class="section-bar">Required files</h3>
+
+      <div class="name-list">
+      robot_rules
+      open-uri
+      uri
+      net/http
+      net/https
+      </div>
+    </div>
+
+  </div>
+
+
+</div>
+
+
+  <!-- if includes -->
+
+<div id="section">
+
+
+
+
+
+
+
+
+  <!-- if method_list -->
+
+
+</div>
+
+
+<div id="validator-badges">
+  <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
+</div>
+
+</body>
+</html>
data/doc/files/lib/spider_rb.html
CHANGED
@@ -56,7 +56,7 @@
 </tr>
 <tr class="top-aligned-row">
   <td><strong>Last Update:</strong></td>
-  <td>
+  <td>Fri Nov 02 12:32:39 -0400 2007</td>
 </tr>
 </table>
 </div>
@@ -74,60 +74,123 @@ Copyright 2007 Mike Burns <a href="../../classes/Spider.html">Spider</a>, a
 Web spidering library for Ruby. It handles the robots.txt, scraping,
 collecting, and looping so that you can just handle the data.
 </p>
-<h2>
+<h2>Examples</h2>
+<h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
 <pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') {}
+</pre>
+<h3>To handle erroneous responses</h3>
+<pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.on :failure do |a_url, resp, prior_url|
+      puts "URL failed: #{a_url}"
+      puts " linked from #{prior_url}"
+    end
+  end
+</pre>
+<h3>Or handle successful responses</h3>
+<pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.on :success do |a_url, resp, prior_url|
+      puts "#{a_url}: #{resp.code}"
+      puts resp.body
+      puts
+    end
+  end
+</pre>
+<h3>Limit to just one domain</h3>
+<pre>
+  require 'spider'
   Spider.start_at('http://mike-burns.com/') do |s|
-  # Limit the pages to just this domain.
     s.add_url_check do |a_url|
      a_url =~ %r{^http://mike-burns.com.*}
    end
-
-
-
-
+  end
+</pre>
+<h3>Pass headers to some requests</h3>
+<pre>
+  require 'spider'
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.setup do |a_url|
+      if a_url =~ %r{^http://.*wikipedia.*}
+        headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+      end
    end
+  end
+</pre>
+<h3>Use memcached to track cycles</h3>
+<pre>
+  require 'spider'
+  require 'spider/included_in_memcached'
+  SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.check_already_seen_with IncludedInMemcached.new(SERVERS)
+  end
+</pre>
+<h3>Track cycles with a custom object</h3>
+<pre>
+  require 'spider'
 
-
-
-
+  class ExpireLinks < Hash
+    def <<(v)
+      self[v] = Time.now
+    end
+    def include?(v)
+      self[v] && (Time.now + 86400) <= self[v]
    end
+  end
 
-
-  s.
-
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.check_already_seen_with ExpireLinks.new
+  end
+</pre>
+<h3>Create a URL graph</h3>
+<pre>
+  require 'spider'
+  nodes = {}
+  Spider.start_at('http://mike-burns.com/') do |s|
+    s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+
+    s.on(:every) do |a_url, resp, prior_url|
+      nodes[prior_url] ||= []
+      nodes[prior_url] << a_url
+    end
+  end
+</pre>
+<h3>Use a proxy</h3>
+<pre>
+  require 'net/http_configuration'
+  require 'spider'
+  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
+                                           :proxy_port => 8881)
+  http_conf.apply do
+    Spider.start_at('http://img.4chan.org/b/') do |s|
+      s.on(:success) do |a_url, resp, prior_url|
+        File.open(a_url.gsub('/',':'),'w') do |f|
+          f.write(resp.body)
+        end
+      end
    end
  end
 </pre>
-<h2>Requirements</h2>
-<p>
-This library uses `robot_rules’ (included), `open-uri’, and
-`uri’. Any modern Ruby should work; if yours doesn’t, let me
-know so I can update this with your version number.
-</p>
 <h2>Author</h2>
 <p>
 Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
 mike@mike-burns.com
 </p>
 <p>
-
-
+Help from Matt Horan and John Nagro.
+</p>
+<p>
+With `robot_rules’ from James Edward Gray II via <a
 href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
 </p>
 
 </div>
 
-<div id="requires-list">
-<h3 class="section-bar">Required files</h3>
-
-<div class="name-list">
-robot_rules
-open-uri
-uri
-net/http
-net/https
-</div>
-</div>
 
 </div>
 