spider 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,44 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+
4
+ def before_specing_memcached
5
+ require File.dirname(__FILE__)+'/../lib/included_in_memcached'
6
+ system('memcached -d -P /tmp/spider-memcached.pid')
7
+ end
8
+
9
+ def after_specing_memcached
10
+ system('kill -KILL `cat /tmp/spider-memcached.pid`')
11
+ end
12
+
13
+ Spec::Runner.configure { |c| c.mock_with :mocha }
14
+
15
+ describe 'Object to halt cycles' do
16
+ before do
17
+ before_specing_memcached
18
+ end
19
+
20
+ it 'should understand <<' do
21
+ c = IncludedInMemcached.new('localhost:11211')
22
+ c.should respond_to(:<<)
23
+ end
24
+
25
+ it 'should understand included?' do
26
+ c = IncludedInMemcached.new('localhost:11211')
27
+ c.should respond_to(:include?)
28
+ end
29
+
30
+ it 'should produce false if the object is not included' do
31
+ c = IncludedInMemcached.new('localhost:11211')
32
+ c.include?('a').should be_false
33
+ end
34
+
35
+ it 'should produce true if the object is included' do
36
+ c = IncludedInMemcached.new('localhost:11211')
37
+ c << 'a'
38
+ c.include?('a').should be_true
39
+ end
40
+
41
+ after do
42
+ after_specing_memcached
43
+ end
44
+ end
@@ -3,6 +3,7 @@ require 'spec'
3
3
  require 'webrick'
4
4
  require 'webrick/https'
5
5
  require File.dirname(__FILE__)+'/../lib/spider'
6
+ require File.dirname(__FILE__)+'/../lib/included_in_memcached'
6
7
 
7
8
  Spec::Runner.configure { |c| c.mock_with :mocha }
8
9
 
@@ -13,6 +14,21 @@ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
13
14
  end
14
15
  end
15
16
 
17
+ class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
18
+ def do_GET(req, res)
19
+ res['Content-type'] = 'text/html'
20
+ if req.path == '/foo'
21
+ res.body = <<-END
22
+ <a href="/">a</a>
23
+ END
24
+ else
25
+ res.body = <<-END
26
+ <a href="/foo">b</a>
27
+ END
28
+ end
29
+ end
30
+ end
31
+
16
32
  def null_logger
17
33
  l = stub
18
34
  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
@@ -23,6 +39,18 @@ def null_logger
23
39
  end
24
40
 
25
41
  describe 'SpiderInstance' do
42
+ it 'should prevent cycles with an IncludedInMemcached' do
43
+ system('memcached -d -P /tmp/spider-memcached.pid')
44
+ cacher = IncludedInMemcached.new('localhost:11211')
45
+ it_should_prevent_cycles_with(cacher)
46
+ system('kill -KILL `cat /tmp/spider-memcached.pid`')
47
+ end
48
+
49
+ it 'should prevent cycles with an Array' do
50
+ cacher = Array.new
51
+ it_should_prevent_cycles_with(cacher)
52
+ end
53
+
26
54
  it 'should call the "setup" callback before loading the Web page' do
27
55
  mock_successful_http
28
56
  @on_called = false
@@ -70,8 +98,6 @@ describe 'SpiderInstance' do
70
98
  si.start!
71
99
  end
72
100
 
73
- it 'should allow for a proxy' # fill in more
74
-
75
101
  it 'should call the :every callback with the current URL, the response, and the prior URL' do
76
102
  mock_successful_http
77
103
  callback_arguments_on(:every)
@@ -146,8 +172,6 @@ describe 'SpiderInstance' do
146
172
  @page_called.should be_true
147
173
  end
148
174
 
149
- it 'should maintain the entire graph within some external object (or memory, or memcached)'
150
-
151
175
  it 'should skip URLs when allowable_url? is false' do
152
176
  u = 'http://example.com/'
153
177
  u_p = URI.parse(u)
@@ -382,4 +406,22 @@ describe 'SpiderInstance' do
382
406
  end
383
407
  si.start!
384
408
  end
409
+
410
+ def it_should_prevent_cycles_with(cacher)
411
+ u = 'http://localhost:8888/'
412
+ u_p = URI.parse(u)
413
+ u2 = 'http://localhost:8888/foo'
414
+ u_p2 = URI.parse(u2)
415
+
416
+ server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
417
+ :AccessLog => [])
418
+ server.mount('/', LoopingServlet)
419
+ Thread.new {server.start}
420
+
421
+ si = SpiderInstance.new(nil => [u])
422
+ si.check_already_seen_with cacher
423
+ si.start!
424
+
425
+ server.shutdown
426
+ end
385
427
  end
@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
13
13
  A Web spidering library: handles robots.txt, scraping, finding more
14
14
  links, and doing it all over again.
15
15
  EOF
16
- s.version = '0.3.0'
16
+ s.version = '0.4.0'
17
17
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: spider
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2007-11-01 00:00:00 -04:00
6
+ version: 0.4.0
7
+ date: 2007-11-02 00:00:00 -04:00
8
8
  summary: A Web spidering library
9
9
  require_paths:
10
10
  - lib
@@ -34,16 +34,13 @@ files:
34
34
  - doc/files
35
35
  - doc/files/lib
36
36
  - doc/files/lib/spider_rb.html
37
+ - doc/files/lib/spider_instance_rb.html
38
+ - doc/files/lib/included_in_memcached_rb.html
37
39
  - doc/files/README.html
38
40
  - doc/classes
41
+ - doc/classes/IncludedInMemcached.html
39
42
  - doc/classes/SpiderInstance.html
40
43
  - doc/classes/Spider.html
41
- - doc/classes/Net.html
42
- - doc/classes/NilClass.html
43
- - doc/classes/Net
44
- - doc/classes/Net/HTTPRedirection.html
45
- - doc/classes/Net/HTTPSuccess.html
46
- - doc/classes/Net/HTTPResponse.html
47
44
  - doc/fr_file_index.html
48
45
  - doc/fr_class_index.html
49
46
  - doc/fr_method_index.html
@@ -51,6 +48,7 @@ files:
51
48
  - doc/created.rid
52
49
  - spec
53
50
  - spec/spider_spec.rb
51
+ - spec/included_in_memcached_spec.rb
54
52
  - spec/spider_instance_spec.rb
55
53
  - README
56
54
  - spider.gemspec
@@ -58,6 +56,8 @@ files:
58
56
  - lib
59
57
  - lib/spider.rb
60
58
  - lib/robot_rules.rb
59
+ - lib/spider_instance.rb
60
+ - lib/included_in_memcached.rb
61
61
  - test_server
62
62
  - test_server/server1
63
63
  - test_server/server1/page1.html
@@ -1,144 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: Net::HTTPRedirection</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">Net::HTTPRedirection</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/spider_rb.html">
59
- lib/spider.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- Object
69
- </td>
70
- </tr>
71
- </table>
72
- </div>
73
- <!-- banner header -->
74
-
75
- <div id="bodyContent">
76
-
77
-
78
-
79
- <div id="contextContent">
80
-
81
-
82
-
83
- </div>
84
-
85
- <div id="method-list">
86
- <h3 class="section-bar">Methods</h3>
87
-
88
- <div class="name-list">
89
- <a href="#M000008">redirect?</a>&nbsp;&nbsp;
90
- </div>
91
- </div>
92
-
93
- </div>
94
-
95
-
96
- <!-- if includes -->
97
-
98
- <div id="section">
99
-
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
- <!-- if method_list -->
108
- <div id="methods">
109
- <h3 class="section-bar">Public Instance methods</h3>
110
-
111
- <div id="method-M000008" class="method-detail">
112
- <a name="M000008"></a>
113
-
114
- <div class="method-heading">
115
- <a href="#M000008" class="method-signature">
116
- <span class="method-name">redirect?</span><span class="method-args">()</span>
117
- </a>
118
- </div>
119
-
120
- <div class="method-description">
121
- <p><a class="source-toggle" href="#"
122
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
123
- <div class="method-source-code" id="M000008-source">
124
- <pre>
125
- <span class="ruby-comment cmt"># File lib/spider.rb, line 41</span>
126
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">true</span>; <span class="ruby-keyword kw">end</span>
127
- </pre>
128
- </div>
129
- </div>
130
- </div>
131
-
132
-
133
- </div>
134
-
135
-
136
- </div>
137
-
138
-
139
- <div id="validator-badges">
140
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
141
- </div>
142
-
143
- </body>
144
- </html>
@@ -1,166 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: Net::HTTPResponse</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">Net::HTTPResponse</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/spider_rb.html">
59
- lib/spider.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- Object
69
- </td>
70
- </tr>
71
- </table>
72
- </div>
73
- <!-- banner header -->
74
-
75
- <div id="bodyContent">
76
-
77
-
78
-
79
- <div id="contextContent">
80
-
81
-
82
-
83
- </div>
84
-
85
- <div id="method-list">
86
- <h3 class="section-bar">Methods</h3>
87
-
88
- <div class="name-list">
89
- <a href="#M000011">redirect?</a>&nbsp;&nbsp;
90
- <a href="#M000010">success?</a>&nbsp;&nbsp;
91
- </div>
92
- </div>
93
-
94
- </div>
95
-
96
-
97
- <!-- if includes -->
98
-
99
- <div id="section">
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
-
108
- <!-- if method_list -->
109
- <div id="methods">
110
- <h3 class="section-bar">Public Instance methods</h3>
111
-
112
- <div id="method-M000011" class="method-detail">
113
- <a name="M000011"></a>
114
-
115
- <div class="method-heading">
116
- <a href="#M000011" class="method-signature">
117
- <span class="method-name">redirect?</span><span class="method-args">()</span>
118
- </a>
119
- </div>
120
-
121
- <div class="method-description">
122
- <p><a class="source-toggle" href="#"
123
- onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
124
- <div class="method-source-code" id="M000011-source">
125
- <pre>
126
- <span class="ruby-comment cmt"># File lib/spider.rb, line 35</span>
127
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
128
- </pre>
129
- </div>
130
- </div>
131
- </div>
132
-
133
- <div id="method-M000010" class="method-detail">
134
- <a name="M000010"></a>
135
-
136
- <div class="method-heading">
137
- <a href="#M000010" class="method-signature">
138
- <span class="method-name">success?</span><span class="method-args">()</span>
139
- </a>
140
- </div>
141
-
142
- <div class="method-description">
143
- <p><a class="source-toggle" href="#"
144
- onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
145
- <div class="method-source-code" id="M000010-source">
146
- <pre>
147
- <span class="ruby-comment cmt"># File lib/spider.rb, line 34</span>
148
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">success?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
149
- </pre>
150
- </div>
151
- </div>
152
- </div>
153
-
154
-
155
- </div>
156
-
157
-
158
- </div>
159
-
160
-
161
- <div id="validator-badges">
162
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
163
- </div>
164
-
165
- </body>
166
- </html>