spider 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spec'
|
3
|
+
|
4
|
+
def before_specing_memcached
|
5
|
+
require File.dirname(__FILE__)+'/../lib/included_in_memcached'
|
6
|
+
system('memcached -d -P /tmp/spider-memcached.pid')
|
7
|
+
end
|
8
|
+
|
9
|
+
def after_specing_memcached
|
10
|
+
system('kill -KILL `cat /tmp/spider-memcached.pid`')
|
11
|
+
end
|
12
|
+
|
13
|
+
Spec::Runner.configure { |c| c.mock_with :mocha }
|
14
|
+
|
15
|
+
describe 'Object to halt cycles' do
|
16
|
+
before do
|
17
|
+
before_specing_memcached
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should understand <<' do
|
21
|
+
c = IncludedInMemcached.new('localhost:11211')
|
22
|
+
c.should respond_to(:<<)
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should understand included?' do
|
26
|
+
c = IncludedInMemcached.new('localhost:11211')
|
27
|
+
c.should respond_to(:include?)
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should produce false if the object is not included' do
|
31
|
+
c = IncludedInMemcached.new('localhost:11211')
|
32
|
+
c.include?('a').should be_false
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should produce true if the object is included' do
|
36
|
+
c = IncludedInMemcached.new('localhost:11211')
|
37
|
+
c << 'a'
|
38
|
+
c.include?('a').should be_true
|
39
|
+
end
|
40
|
+
|
41
|
+
after do
|
42
|
+
after_specing_memcached
|
43
|
+
end
|
44
|
+
end
|
@@ -3,6 +3,7 @@ require 'spec'
|
|
3
3
|
require 'webrick'
|
4
4
|
require 'webrick/https'
|
5
5
|
require File.dirname(__FILE__)+'/../lib/spider'
|
6
|
+
require File.dirname(__FILE__)+'/../lib/included_in_memcached'
|
6
7
|
|
7
8
|
Spec::Runner.configure { |c| c.mock_with :mocha }
|
8
9
|
|
@@ -13,6 +14,21 @@ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
|
|
13
14
|
end
|
14
15
|
end
|
15
16
|
|
17
|
+
class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
|
18
|
+
def do_GET(req, res)
|
19
|
+
res['Content-type'] = 'text/html'
|
20
|
+
if req.path == '/foo'
|
21
|
+
res.body = <<-END
|
22
|
+
<a href="/">a</a>
|
23
|
+
END
|
24
|
+
else
|
25
|
+
res.body = <<-END
|
26
|
+
<a href="/foo">b</a>
|
27
|
+
END
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
16
32
|
def null_logger
|
17
33
|
l = stub
|
18
34
|
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
@@ -23,6 +39,18 @@ def null_logger
|
|
23
39
|
end
|
24
40
|
|
25
41
|
describe 'SpiderInstance' do
|
42
|
+
it 'should prevent cycles with an IncludedInMemcached' do
|
43
|
+
system('memcached -d -P /tmp/spider-memcached.pid')
|
44
|
+
cacher = IncludedInMemcached.new('localhost:11211')
|
45
|
+
it_should_prevent_cycles_with(cacher)
|
46
|
+
system('kill -KILL `cat /tmp/spider-memcached.pid`')
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'should prevent cycles with an Array' do
|
50
|
+
cacher = Array.new
|
51
|
+
it_should_prevent_cycles_with(cacher)
|
52
|
+
end
|
53
|
+
|
26
54
|
it 'should call the "setup" callback before loading the Web page' do
|
27
55
|
mock_successful_http
|
28
56
|
@on_called = false
|
@@ -70,8 +98,6 @@ describe 'SpiderInstance' do
|
|
70
98
|
si.start!
|
71
99
|
end
|
72
100
|
|
73
|
-
it 'should allow for a proxy' # fill in more
|
74
|
-
|
75
101
|
it 'should call the :every callback with the current URL, the response, and the prior URL' do
|
76
102
|
mock_successful_http
|
77
103
|
callback_arguments_on(:every)
|
@@ -146,8 +172,6 @@ describe 'SpiderInstance' do
|
|
146
172
|
@page_called.should be_true
|
147
173
|
end
|
148
174
|
|
149
|
-
it 'should maintain the entire graph within some external object (or memory, or memcached)'
|
150
|
-
|
151
175
|
it 'should skip URLs when allowable_url? is false' do
|
152
176
|
u = 'http://example.com/'
|
153
177
|
u_p = URI.parse(u)
|
@@ -382,4 +406,22 @@ describe 'SpiderInstance' do
|
|
382
406
|
end
|
383
407
|
si.start!
|
384
408
|
end
|
409
|
+
|
410
|
+
def it_should_prevent_cycles_with(cacher)
|
411
|
+
u = 'http://localhost:8888/'
|
412
|
+
u_p = URI.parse(u)
|
413
|
+
u2 = 'http://localhost:8888/foo'
|
414
|
+
u_p2 = URI.parse(u2)
|
415
|
+
|
416
|
+
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
|
417
|
+
:AccessLog => [])
|
418
|
+
server.mount('/', LoopingServlet)
|
419
|
+
Thread.new {server.start}
|
420
|
+
|
421
|
+
si = SpiderInstance.new(nil => [u])
|
422
|
+
si.check_already_seen_with cacher
|
423
|
+
si.start!
|
424
|
+
|
425
|
+
server.shutdown
|
426
|
+
end
|
385
427
|
end
|
data/spider.gemspec
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: spider
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2007-11-
|
6
|
+
version: 0.4.0
|
7
|
+
date: 2007-11-02 00:00:00 -04:00
|
8
8
|
summary: A Web spidering library
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -34,16 +34,13 @@ files:
|
|
34
34
|
- doc/files
|
35
35
|
- doc/files/lib
|
36
36
|
- doc/files/lib/spider_rb.html
|
37
|
+
- doc/files/lib/spider_instance_rb.html
|
38
|
+
- doc/files/lib/included_in_memcached_rb.html
|
37
39
|
- doc/files/README.html
|
38
40
|
- doc/classes
|
41
|
+
- doc/classes/IncludedInMemcached.html
|
39
42
|
- doc/classes/SpiderInstance.html
|
40
43
|
- doc/classes/Spider.html
|
41
|
-
- doc/classes/Net.html
|
42
|
-
- doc/classes/NilClass.html
|
43
|
-
- doc/classes/Net
|
44
|
-
- doc/classes/Net/HTTPRedirection.html
|
45
|
-
- doc/classes/Net/HTTPSuccess.html
|
46
|
-
- doc/classes/Net/HTTPResponse.html
|
47
44
|
- doc/fr_file_index.html
|
48
45
|
- doc/fr_class_index.html
|
49
46
|
- doc/fr_method_index.html
|
@@ -51,6 +48,7 @@ files:
|
|
51
48
|
- doc/created.rid
|
52
49
|
- spec
|
53
50
|
- spec/spider_spec.rb
|
51
|
+
- spec/included_in_memcached_spec.rb
|
54
52
|
- spec/spider_instance_spec.rb
|
55
53
|
- README
|
56
54
|
- spider.gemspec
|
@@ -58,6 +56,8 @@ files:
|
|
58
56
|
- lib
|
59
57
|
- lib/spider.rb
|
60
58
|
- lib/robot_rules.rb
|
59
|
+
- lib/spider_instance.rb
|
60
|
+
- lib/included_in_memcached.rb
|
61
61
|
- test_server
|
62
62
|
- test_server/server1
|
63
63
|
- test_server/server1/page1.html
|
@@ -1,144 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Class: Net::HTTPRedirection</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Class</strong></td>
|
53
|
-
<td class="class-name-in-header">Net::HTTPRedirection</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/spider_rb.html">
|
59
|
-
lib/spider.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
<tr class="top-aligned-row">
|
66
|
-
<td><strong>Parent:</strong></td>
|
67
|
-
<td>
|
68
|
-
Object
|
69
|
-
</td>
|
70
|
-
</tr>
|
71
|
-
</table>
|
72
|
-
</div>
|
73
|
-
<!-- banner header -->
|
74
|
-
|
75
|
-
<div id="bodyContent">
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
<div id="contextContent">
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
</div>
|
84
|
-
|
85
|
-
<div id="method-list">
|
86
|
-
<h3 class="section-bar">Methods</h3>
|
87
|
-
|
88
|
-
<div class="name-list">
|
89
|
-
<a href="#M000008">redirect?</a>
|
90
|
-
</div>
|
91
|
-
</div>
|
92
|
-
|
93
|
-
</div>
|
94
|
-
|
95
|
-
|
96
|
-
<!-- if includes -->
|
97
|
-
|
98
|
-
<div id="section">
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
<!-- if method_list -->
|
108
|
-
<div id="methods">
|
109
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
110
|
-
|
111
|
-
<div id="method-M000008" class="method-detail">
|
112
|
-
<a name="M000008"></a>
|
113
|
-
|
114
|
-
<div class="method-heading">
|
115
|
-
<a href="#M000008" class="method-signature">
|
116
|
-
<span class="method-name">redirect?</span><span class="method-args">()</span>
|
117
|
-
</a>
|
118
|
-
</div>
|
119
|
-
|
120
|
-
<div class="method-description">
|
121
|
-
<p><a class="source-toggle" href="#"
|
122
|
-
onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
|
123
|
-
<div class="method-source-code" id="M000008-source">
|
124
|
-
<pre>
|
125
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line 41</span>
|
126
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">true</span>; <span class="ruby-keyword kw">end</span>
|
127
|
-
</pre>
|
128
|
-
</div>
|
129
|
-
</div>
|
130
|
-
</div>
|
131
|
-
|
132
|
-
|
133
|
-
</div>
|
134
|
-
|
135
|
-
|
136
|
-
</div>
|
137
|
-
|
138
|
-
|
139
|
-
<div id="validator-badges">
|
140
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
141
|
-
</div>
|
142
|
-
|
143
|
-
</body>
|
144
|
-
</html>
|
@@ -1,166 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
-
<!DOCTYPE html
|
3
|
-
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
-
|
6
|
-
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
7
|
-
<head>
|
8
|
-
<title>Class: Net::HTTPResponse</title>
|
9
|
-
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
-
<meta http-equiv="Content-Script-Type" content="text/javascript" />
|
11
|
-
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
12
|
-
<script type="text/javascript">
|
13
|
-
// <![CDATA[
|
14
|
-
|
15
|
-
function popupCode( url ) {
|
16
|
-
window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
|
17
|
-
}
|
18
|
-
|
19
|
-
function toggleCode( id ) {
|
20
|
-
if ( document.getElementById )
|
21
|
-
elem = document.getElementById( id );
|
22
|
-
else if ( document.all )
|
23
|
-
elem = eval( "document.all." + id );
|
24
|
-
else
|
25
|
-
return false;
|
26
|
-
|
27
|
-
elemStyle = elem.style;
|
28
|
-
|
29
|
-
if ( elemStyle.display != "block" ) {
|
30
|
-
elemStyle.display = "block"
|
31
|
-
} else {
|
32
|
-
elemStyle.display = "none"
|
33
|
-
}
|
34
|
-
|
35
|
-
return true;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Make codeblocks hidden by default
|
39
|
-
document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
|
40
|
-
|
41
|
-
// ]]>
|
42
|
-
</script>
|
43
|
-
|
44
|
-
</head>
|
45
|
-
<body>
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
<div id="classHeader">
|
50
|
-
<table class="header-table">
|
51
|
-
<tr class="top-aligned-row">
|
52
|
-
<td><strong>Class</strong></td>
|
53
|
-
<td class="class-name-in-header">Net::HTTPResponse</td>
|
54
|
-
</tr>
|
55
|
-
<tr class="top-aligned-row">
|
56
|
-
<td><strong>In:</strong></td>
|
57
|
-
<td>
|
58
|
-
<a href="../../files/lib/spider_rb.html">
|
59
|
-
lib/spider.rb
|
60
|
-
</a>
|
61
|
-
<br />
|
62
|
-
</td>
|
63
|
-
</tr>
|
64
|
-
|
65
|
-
<tr class="top-aligned-row">
|
66
|
-
<td><strong>Parent:</strong></td>
|
67
|
-
<td>
|
68
|
-
Object
|
69
|
-
</td>
|
70
|
-
</tr>
|
71
|
-
</table>
|
72
|
-
</div>
|
73
|
-
<!-- banner header -->
|
74
|
-
|
75
|
-
<div id="bodyContent">
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
<div id="contextContent">
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
</div>
|
84
|
-
|
85
|
-
<div id="method-list">
|
86
|
-
<h3 class="section-bar">Methods</h3>
|
87
|
-
|
88
|
-
<div class="name-list">
|
89
|
-
<a href="#M000011">redirect?</a>
|
90
|
-
<a href="#M000010">success?</a>
|
91
|
-
</div>
|
92
|
-
</div>
|
93
|
-
|
94
|
-
</div>
|
95
|
-
|
96
|
-
|
97
|
-
<!-- if includes -->
|
98
|
-
|
99
|
-
<div id="section">
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
<!-- if method_list -->
|
109
|
-
<div id="methods">
|
110
|
-
<h3 class="section-bar">Public Instance methods</h3>
|
111
|
-
|
112
|
-
<div id="method-M000011" class="method-detail">
|
113
|
-
<a name="M000011"></a>
|
114
|
-
|
115
|
-
<div class="method-heading">
|
116
|
-
<a href="#M000011" class="method-signature">
|
117
|
-
<span class="method-name">redirect?</span><span class="method-args">()</span>
|
118
|
-
</a>
|
119
|
-
</div>
|
120
|
-
|
121
|
-
<div class="method-description">
|
122
|
-
<p><a class="source-toggle" href="#"
|
123
|
-
onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
|
124
|
-
<div class="method-source-code" id="M000011-source">
|
125
|
-
<pre>
|
126
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line 35</span>
|
127
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
|
128
|
-
</pre>
|
129
|
-
</div>
|
130
|
-
</div>
|
131
|
-
</div>
|
132
|
-
|
133
|
-
<div id="method-M000010" class="method-detail">
|
134
|
-
<a name="M000010"></a>
|
135
|
-
|
136
|
-
<div class="method-heading">
|
137
|
-
<a href="#M000010" class="method-signature">
|
138
|
-
<span class="method-name">success?</span><span class="method-args">()</span>
|
139
|
-
</a>
|
140
|
-
</div>
|
141
|
-
|
142
|
-
<div class="method-description">
|
143
|
-
<p><a class="source-toggle" href="#"
|
144
|
-
onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
|
145
|
-
<div class="method-source-code" id="M000010-source">
|
146
|
-
<pre>
|
147
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line 34</span>
|
148
|
-
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">success?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
|
149
|
-
</pre>
|
150
|
-
</div>
|
151
|
-
</div>
|
152
|
-
</div>
|
153
|
-
|
154
|
-
|
155
|
-
</div>
|
156
|
-
|
157
|
-
|
158
|
-
</div>
|
159
|
-
|
160
|
-
|
161
|
-
<div id="validator-badges">
|
162
|
-
<p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
|
163
|
-
</div>
|
164
|
-
|
165
|
-
</body>
|
166
|
-
</html>
|