spider 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spec'
|
3
|
+
|
4
|
+
def before_specing_memcached
|
5
|
+
require File.dirname(__FILE__)+'/../lib/included_in_memcached'
|
6
|
+
system('memcached -d -P /tmp/spider-memcached.pid')
|
7
|
+
end
|
8
|
+
|
9
|
+
def after_specing_memcached
|
10
|
+
system('kill -KILL `cat /tmp/spider-memcached.pid`')
|
11
|
+
end
|
12
|
+
|
13
|
+
Spec::Runner.configure { |c| c.mock_with :mocha }
|
14
|
+
|
15
|
+
describe 'Object to halt cycles' do
|
16
|
+
before do
|
17
|
+
before_specing_memcached
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should understand <<' do
|
21
|
+
c = IncludedInMemcached.new('localhost:11211')
|
22
|
+
c.should respond_to(:<<)
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should understand included?' do
|
26
|
+
c = IncludedInMemcached.new('localhost:11211')
|
27
|
+
c.should respond_to(:include?)
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should produce false if the object is not included' do
|
31
|
+
c = IncludedInMemcached.new('localhost:11211')
|
32
|
+
c.include?('a').should be_false
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should produce true if the object is included' do
|
36
|
+
c = IncludedInMemcached.new('localhost:11211')
|
37
|
+
c << 'a'
|
38
|
+
c.include?('a').should be_true
|
39
|
+
end
|
40
|
+
|
41
|
+
after do
|
42
|
+
after_specing_memcached
|
43
|
+
end
|
44
|
+
end
|
@@ -3,6 +3,7 @@ require 'spec'
|
|
3
3
|
require 'webrick'
|
4
4
|
require 'webrick/https'
|
5
5
|
require File.dirname(__FILE__)+'/../lib/spider'
|
6
|
+
require File.dirname(__FILE__)+'/../lib/included_in_memcached'
|
6
7
|
|
7
8
|
Spec::Runner.configure { |c| c.mock_with :mocha }
|
8
9
|
|
@@ -13,6 +14,21 @@ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
|
|
13
14
|
end
|
14
15
|
end
|
15
16
|
|
17
|
+
class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
|
18
|
+
def do_GET(req, res)
|
19
|
+
res['Content-type'] = 'text/html'
|
20
|
+
if req.path == '/foo'
|
21
|
+
res.body = <<-END
|
22
|
+
<a href="/">a</a>
|
23
|
+
END
|
24
|
+
else
|
25
|
+
res.body = <<-END
|
26
|
+
<a href="/foo">b</a>
|
27
|
+
END
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
16
32
|
def null_logger
|
17
33
|
l = stub
|
18
34
|
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
@@ -23,6 +39,18 @@ def null_logger
|
|
23
39
|
end
|
24
40
|
|
25
41
|
describe 'SpiderInstance' do
|
42
|
+
it 'should prevent cycles with an IncludedInMemcached' do
|
43
|
+
system('memcached -d -P /tmp/spider-memcached.pid')
|
44
|
+
cacher = IncludedInMemcached.new('localhost:11211')
|
45
|
+
it_should_prevent_cycles_with(cacher)
|
46
|
+
system('kill -KILL `cat /tmp/spider-memcached.pid`')
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'should prevent cycles with an Array' do
|
50
|
+
cacher = Array.new
|
51
|
+
it_should_prevent_cycles_with(cacher)
|
52
|
+
end
|
53
|
+
|
26
54
|
it 'should call the "setup" callback before loading the Web page' do
|
27
55
|
mock_successful_http
|
28
56
|
@on_called = false
|
@@ -70,8 +98,6 @@ describe 'SpiderInstance' do
|
|
70
98
|
si.start!
|
71
99
|
end
|
72
100
|
|
73
|
-
it 'should allow for a proxy' # fill in more
|
74
|
-
|
75
101
|
it 'should call the :every callback with the current URL, the response, and the prior URL' do
|
76
102
|
mock_successful_http
|
77
103
|
callback_arguments_on(:every)
|
@@ -146,8 +172,6 @@ describe 'SpiderInstance' do
|
|
146
172
|
@page_called.should be_true
|
147
173
|
end
|
148
174
|
|
149
|
-
it 'should maintain the entire graph within some external object (or memory, or memcached)'
|
150
|
-
|
151
175
|
it 'should skip URLs when allowable_url? is false' do
|
152
176
|
u = 'http://example.com/'
|
153
177
|
u_p = URI.parse(u)
|
@@ -382,4 +406,22 @@ describe 'SpiderInstance' do
|
|
382
406
|
end
|
383
407
|
si.start!
|
384
408
|
end
|
409
|
+
|
410
|
+
def it_should_prevent_cycles_with(cacher)
|
411
|
+
u = 'http://localhost:8888/'
|
412
|
+
u_p = URI.parse(u)
|
413
|
+
u2 = 'http://localhost:8888/foo'
|
414
|
+
u_p2 = URI.parse(u2)
|
415
|
+
|
416
|
+
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
|
417
|
+
:AccessLog => [])
|
418
|
+
server.mount('/', LoopingServlet)
|
419
|
+
Thread.new {server.start}
|
420
|
+
|
421
|
+
si = SpiderInstance.new(nil => [u])
|
422
|
+
si.check_already_seen_with cacher
|
423
|
+
si.start!
|
424
|
+
|
425
|
+
server.shutdown
|
426
|
+
end
|
385
427
|
end
|
data/spider.gemspec
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: spider
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date: 2007-11-
|
6
|
+
version: 0.4.0
|
7
|
+
date: 2007-11-02 00:00:00 -04:00
|
8
8
|
summary: A Web spidering library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,16 +34,13 @@ files:
|
|
34
34
|
- doc/files
|
35
35
|
- doc/files/lib
|
36
36
|
- doc/files/lib/spider_rb.html
|
37
|
+
- doc/files/lib/spider_instance_rb.html
|
38
|
+
- doc/files/lib/included_in_memcached_rb.html
|
37
39
|
- doc/files/README.html
|
38
40
|
- doc/classes
|
41
|
+
- doc/classes/IncludedInMemcached.html
|
39
42
|
- doc/classes/SpiderInstance.html
|
40
43
|
- doc/classes/Spider.html
|
41
|
-
- doc/classes/Net.html
|
42
|
-
- doc/classes/NilClass.html
|
43
|
-
- doc/classes/Net
|
44
|
-
- doc/classes/Net/HTTPRedirection.html
|
45
|
-
- doc/classes/Net/HTTPSuccess.html
|
46
|
-
- doc/classes/Net/HTTPResponse.html
|
47
44
|
- doc/fr_file_index.html
|
48
45
|
- doc/fr_class_index.html
|
49
46
|
- doc/fr_method_index.html
|
@@ -51,6 +48,7 @@ files:
|
|
51
48
|
- doc/created.rid
|
52
49
|
- spec
|
53
50
|
- spec/spider_spec.rb
|
51
|
+
- spec/included_in_memcached_spec.rb
|
54
52
|
- spec/spider_instance_spec.rb
|
55
53
|
- README
|
56
54
|
- spider.gemspec
|
@@ -58,6 +56,8 @@ files:
|
|
58
56
|
- lib
|
59
57
|
- lib/spider.rb
|
60
58
|
- lib/robot_rules.rb
|
59
|
+
- lib/spider_instance.rb
|
60
|
+
- lib/included_in_memcached.rb
|
61
61
|
- test_server
|
62
62
|
- test_server/server1
|
63
63
|
- test_server/server1/page1.html
|
@@ -1,144 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Class: Net::HTTPRedirection</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Class</strong></td>
|
53
|
-
<td class="class-name-in-header">Net::HTTPRedirection</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/spider_rb.html">
|
59
|
-
lib/spider.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
<tr class="top-aligned-row">
|
66
|
-
<td><strong>Parent:</strong></td>
|
67
|
-
<td>
|
68
|
-
Object
|
69
|
-
</td>
|
70
|
-
</tr>
|
71
|
-
</table>
|
72
|
-
</div>
|
73
|
-
<!-- banner header -->
|
74
|
-
|
75
|
-
<div id="bodyContent">
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
<div id="contextContent">
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
</div>
|
84
|
-
|
85
|
-
<div id="method-list">
|
86
|
-
<h3 class="section-bar">Methods</h3>
|
87
|
-
|
88
|
-
<div class="name-list">
|
89
|
-
<a href="#M000008">redirect?</a>
|
90
|
-
</div>
|
91
|
-
</div>
|
92
|
-
|
93
|
-
</div>
|
94
|
-
|
95
|
-
|
96
|
-
<!-- if includes -->
|
97
|
-
|
98
|
-
<div id="section">
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
<!-- if method_list -->
|
108
|
-
<div id="methods">
|
109
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
110
|
-
|
111
|
-
<div id="method-M000008" class="method-detail">
|
112
|
-
<a name="M000008"></a>
|
113
|
-
|
114
|
-
<div class="method-heading">
|
115
|
-
<a href="#M000008" class="method-signature">
|
116
|
-
<span class="method-name">redirect?</span><span class="method-args">()</span>
|
117
|
-
</a>
|
118
|
-
</div>
|
119
|
-
|
120
|
-
<div class="method-description">
|
121
|
-
<p><a class="source-toggle" href="#"
|
122
|
-
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
123
|
-
<div class="method-source-code" id="M000008-source">
|
124
|
-
<pre>
|
125
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line 41</span>
|
126
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">true</span>; <span class="ruby-keyword kw">end</span>
|
127
|
-
</pre>
|
128
|
-
</div>
|
129
|
-
</div>
|
130
|
-
</div>
|
131
|
-
|
132
|
-
|
133
|
-
</div>
|
134
|
-
|
135
|
-
|
136
|
-
</div>
|
137
|
-
|
138
|
-
|
139
|
-
<div id="validator-badges">
|
140
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
141
|
-
</div>
|
142
|
-
|
143
|
-
</body>
|
144
|
-
</html>
|
@@ -1,166 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Class: Net::HTTPResponse</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Class</strong></td>
|
53
|
-
<td class="class-name-in-header">Net::HTTPResponse</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/spider_rb.html">
|
59
|
-
lib/spider.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
<tr class="top-aligned-row">
|
66
|
-
<td><strong>Parent:</strong></td>
|
67
|
-
<td>
|
68
|
-
Object
|
69
|
-
</td>
|
70
|
-
</tr>
|
71
|
-
</table>
|
72
|
-
</div>
|
73
|
-
<!-- banner header -->
|
74
|
-
|
75
|
-
<div id="bodyContent">
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
<div id="contextContent">
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
</div>
|
84
|
-
|
85
|
-
<div id="method-list">
|
86
|
-
<h3 class="section-bar">Methods</h3>
|
87
|
-
|
88
|
-
<div class="name-list">
|
89
|
-
<a href="#M000011">redirect?</a>
|
90
|
-
<a href="#M000010">success?</a>
|
91
|
-
</div>
|
92
|
-
</div>
|
93
|
-
|
94
|
-
</div>
|
95
|
-
|
96
|
-
|
97
|
-
<!-- if includes -->
|
98
|
-
|
99
|
-
<div id="section">
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
<!-- if method_list -->
|
109
|
-
<div id="methods">
|
110
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
111
|
-
|
112
|
-
<div id="method-M000011" class="method-detail">
|
113
|
-
<a name="M000011"></a>
|
114
|
-
|
115
|
-
<div class="method-heading">
|
116
|
-
<a href="#M000011" class="method-signature">
|
117
|
-
<span class="method-name">redirect?</span><span class="method-args">()</span>
|
118
|
-
</a>
|
119
|
-
</div>
|
120
|
-
|
121
|
-
<div class="method-description">
|
122
|
-
<p><a class="source-toggle" href="#"
|
123
|
-
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
124
|
-
<div class="method-source-code" id="M000011-source">
|
125
|
-
<pre>
|
126
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line 35</span>
|
127
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
|
128
|
-
</pre>
|
129
|
-
</div>
|
130
|
-
</div>
|
131
|
-
</div>
|
132
|
-
|
133
|
-
<div id="method-M000010" class="method-detail">
|
134
|
-
<a name="M000010"></a>
|
135
|
-
|
136
|
-
<div class="method-heading">
|
137
|
-
<a href="#M000010" class="method-signature">
|
138
|
-
<span class="method-name">success?</span><span class="method-args">()</span>
|
139
|
-
</a>
|
140
|
-
</div>
|
141
|
-
|
142
|
-
<div class="method-description">
|
143
|
-
<p><a class="source-toggle" href="#"
|
144
|
-
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
145
|
-
<div class="method-source-code" id="M000010-source">
|
146
|
-
<pre>
|
147
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line 34</span>
|
148
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">success?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
|
149
|
-
</pre>
|
150
|
-
</div>
|
151
|
-
</div>
|
152
|
-
</div>
|
153
|
-
|
154
|
-
|
155
|
-
</div>
|
156
|
-
|
157
|
-
|
158
|
-
</div>
|
159
|
-
|
160
|
-
|
161
|
-
<div id="validator-badges">
|
162
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
163
|
-
</div>
|
164
|
-
|
165
|
-
</body>
|
166
|
-
</html>
|