spider 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+
4
+ def before_specing_memcached
5
+ require File.dirname(__FILE__)+'/../lib/included_in_memcached'
6
+ system('memcached -d -P /tmp/spider-memcached.pid')
7
+ end
8
+
9
+ def after_specing_memcached
10
+ system('kill -KILL `cat /tmp/spider-memcached.pid`')
11
+ end
12
+
13
+ Spec::Runner.configure { |c| c.mock_with :mocha }
14
+
15
+ describe 'Object to halt cycles' do
16
+ before do
17
+ before_specing_memcached
18
+ end
19
+
20
+ it 'should understand <<' do
21
+ c = IncludedInMemcached.new('localhost:11211')
22
+ c.should respond_to(:<<)
23
+ end
24
+
25
+ it 'should understand included?' do
26
+ c = IncludedInMemcached.new('localhost:11211')
27
+ c.should respond_to(:include?)
28
+ end
29
+
30
+ it 'should produce false if the object is not included' do
31
+ c = IncludedInMemcached.new('localhost:11211')
32
+ c.include?('a').should be_false
33
+ end
34
+
35
+ it 'should produce true if the object is included' do
36
+ c = IncludedInMemcached.new('localhost:11211')
37
+ c << 'a'
38
+ c.include?('a').should be_true
39
+ end
40
+
41
+ after do
42
+ after_specing_memcached
43
+ end
44
+ end
@@ -3,6 +3,7 @@ require 'spec'
3
3
  require 'webrick'
4
4
  require 'webrick/https'
5
5
  require File.dirname(__FILE__)+'/../lib/spider'
6
+ require File.dirname(__FILE__)+'/../lib/included_in_memcached'
6
7
 
7
8
  Spec::Runner.configure { |c| c.mock_with :mocha }
8
9
 
@@ -13,6 +14,21 @@ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
13
14
  end
14
15
  end
15
16
 
17
+ class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
18
+ def do_GET(req, res)
19
+ res['Content-type'] = 'text/html'
20
+ if req.path == '/foo'
21
+ res.body = <<-END
22
+ <a href="/">a</a>
23
+ END
24
+ else
25
+ res.body = <<-END
26
+ <a href="/foo">b</a>
27
+ END
28
+ end
29
+ end
30
+ end
31
+
16
32
  def null_logger
17
33
  l = stub
18
34
  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
@@ -23,6 +39,18 @@ def null_logger
23
39
  end
24
40
 
25
41
  describe 'SpiderInstance' do
42
+ it 'should prevent cycles with an IncludedInMemcached' do
43
+ system('memcached -d -P /tmp/spider-memcached.pid')
44
+ cacher = IncludedInMemcached.new('localhost:11211')
45
+ it_should_prevent_cycles_with(cacher)
46
+ system('kill -KILL `cat /tmp/spider-memcached.pid`')
47
+ end
48
+
49
+ it 'should prevent cycles with an Array' do
50
+ cacher = Array.new
51
+ it_should_prevent_cycles_with(cacher)
52
+ end
53
+
26
54
  it 'should call the "setup" callback before loading the Web page' do
27
55
  mock_successful_http
28
56
  @on_called = false
@@ -70,8 +98,6 @@ describe 'SpiderInstance' do
70
98
  si.start!
71
99
  end
72
100
 
73
- it 'should allow for a proxy' # fill in more
74
-
75
101
  it 'should call the :every callback with the current URL, the response, and the prior URL' do
76
102
  mock_successful_http
77
103
  callback_arguments_on(:every)
@@ -146,8 +172,6 @@ describe 'SpiderInstance' do
146
172
  @page_called.should be_true
147
173
  end
148
174
 
149
- it 'should maintain the entire graph within some external object (or memory, or memcached)'
150
-
151
175
  it 'should skip URLs when allowable_url? is false' do
152
176
  u = 'http://example.com/'
153
177
  u_p = URI.parse(u)
@@ -382,4 +406,22 @@ describe 'SpiderInstance' do
382
406
  end
383
407
  si.start!
384
408
  end
409
+
410
+ def it_should_prevent_cycles_with(cacher)
411
+ u = 'http://localhost:8888/'
412
+ u_p = URI.parse(u)
413
+ u2 = 'http://localhost:8888/foo'
414
+ u_p2 = URI.parse(u2)
415
+
416
+ server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
417
+ :AccessLog => [])
418
+ server.mount('/', LoopingServlet)
419
+ Thread.new {server.start}
420
+
421
+ si = SpiderInstance.new(nil => [u])
422
+ si.check_already_seen_with cacher
423
+ si.start!
424
+
425
+ server.shutdown
426
+ end
385
427
  end
@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
13
13
  A Web spidering library: handles robots.txt, scraping, finding more
14
14
  links, and doing it all over again.
15
15
  EOF
16
- s.version = '0.3.0'
16
+ s.version = '0.4.0'
17
17
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: spider
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2007-11-01 00:00:00 -04:00
6
+ version: 0.4.0
7
+ date: 2007-11-02 00:00:00 -04:00
8
8
  summary: A Web spidering library
9
9
  require_paths:
10
10
  - lib
@@ -34,16 +34,13 @@ files:
34
34
  - doc/files
35
35
  - doc/files/lib
36
36
  - doc/files/lib/spider_rb.html
37
+ - doc/files/lib/spider_instance_rb.html
38
+ - doc/files/lib/included_in_memcached_rb.html
37
39
  - doc/files/README.html
38
40
  - doc/classes
41
+ - doc/classes/IncludedInMemcached.html
39
42
  - doc/classes/SpiderInstance.html
40
43
  - doc/classes/Spider.html
41
- - doc/classes/Net.html
42
- - doc/classes/NilClass.html
43
- - doc/classes/Net
44
- - doc/classes/Net/HTTPRedirection.html
45
- - doc/classes/Net/HTTPSuccess.html
46
- - doc/classes/Net/HTTPResponse.html
47
44
  - doc/fr_file_index.html
48
45
  - doc/fr_class_index.html
49
46
  - doc/fr_method_index.html
@@ -51,6 +48,7 @@ files:
51
48
  - doc/created.rid
52
49
  - spec
53
50
  - spec/spider_spec.rb
51
+ - spec/included_in_memcached_spec.rb
54
52
  - spec/spider_instance_spec.rb
55
53
  - README
56
54
  - spider.gemspec
@@ -58,6 +56,8 @@ files:
58
56
  - lib
59
57
  - lib/spider.rb
60
58
  - lib/robot_rules.rb
59
+ - lib/spider_instance.rb
60
+ - lib/included_in_memcached.rb
61
61
  - test_server
62
62
  - test_server/server1
63
63
  - test_server/server1/page1.html
@@ -1,144 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: Net::HTTPRedirection</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">Net::HTTPRedirection</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/spider_rb.html">
59
- lib/spider.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- Object
69
- </td>
70
- </tr>
71
- </table>
72
- </div>
73
- <!-- banner header -->
74
-
75
- <div id="bodyContent">
76
-
77
-
78
-
79
- <div id="contextContent">
80
-
81
-
82
-
83
- </div>
84
-
85
- <div id="method-list">
86
- <h3 class="section-bar">Methods</h3>
87
-
88
- <div class="name-list">
89
- <a href="#M000008">redirect?</a>&nbsp;&nbsp;
90
- </div>
91
- </div>
92
-
93
- </div>
94
-
95
-
96
- <!-- if includes -->
97
-
98
- <div id="section">
99
-
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
- <!-- if method_list -->
108
- <div id="methods">
109
- <h3 class="section-bar">Public Instance methods</h3>
110
-
111
- <div id="method-M000008" class="method-detail">
112
- <a name="M000008"></a>
113
-
114
- <div class="method-heading">
115
- <a href="#M000008" class="method-signature">
116
- <span class="method-name">redirect?</span><span class="method-args">()</span>
117
- </a>
118
- </div>
119
-
120
- <div class="method-description">
121
- <p><a class="source-toggle" href="#"
122
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
123
- <div class="method-source-code" id="M000008-source">
124
- <pre>
125
- <span class="ruby-comment cmt"># File lib/spider.rb, line 41</span>
126
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">true</span>; <span class="ruby-keyword kw">end</span>
127
- </pre>
128
- </div>
129
- </div>
130
- </div>
131
-
132
-
133
- </div>
134
-
135
-
136
- </div>
137
-
138
-
139
- <div id="validator-badges">
140
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
141
- </div>
142
-
143
- </body>
144
- </html>
@@ -1,166 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>Class: Net::HTTPResponse</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="classHeader">
50
- <table class="header-table">
51
- <tr class="top-aligned-row">
52
- <td><strong>Class</strong></td>
53
- <td class="class-name-in-header">Net::HTTPResponse</td>
54
- </tr>
55
- <tr class="top-aligned-row">
56
- <td><strong>In:</strong></td>
57
- <td>
58
- <a href="../../files/lib/spider_rb.html">
59
- lib/spider.rb
60
- </a>
61
- <br />
62
- </td>
63
- </tr>
64
-
65
- <tr class="top-aligned-row">
66
- <td><strong>Parent:</strong></td>
67
- <td>
68
- Object
69
- </td>
70
- </tr>
71
- </table>
72
- </div>
73
- <!-- banner header -->
74
-
75
- <div id="bodyContent">
76
-
77
-
78
-
79
- <div id="contextContent">
80
-
81
-
82
-
83
- </div>
84
-
85
- <div id="method-list">
86
- <h3 class="section-bar">Methods</h3>
87
-
88
- <div class="name-list">
89
- <a href="#M000011">redirect?</a>&nbsp;&nbsp;
90
- <a href="#M000010">success?</a>&nbsp;&nbsp;
91
- </div>
92
- </div>
93
-
94
- </div>
95
-
96
-
97
- <!-- if includes -->
98
-
99
- <div id="section">
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
-
108
- <!-- if method_list -->
109
- <div id="methods">
110
- <h3 class="section-bar">Public Instance methods</h3>
111
-
112
- <div id="method-M000011" class="method-detail">
113
- <a name="M000011"></a>
114
-
115
- <div class="method-heading">
116
- <a href="#M000011" class="method-signature">
117
- <span class="method-name">redirect?</span><span class="method-args">()</span>
118
- </a>
119
- </div>
120
-
121
- <div class="method-description">
122
- <p><a class="source-toggle" href="#"
123
- onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
124
- <div class="method-source-code" id="M000011-source">
125
- <pre>
126
- <span class="ruby-comment cmt"># File lib/spider.rb, line 35</span>
127
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">redirect?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
128
- </pre>
129
- </div>
130
- </div>
131
- </div>
132
-
133
- <div id="method-M000010" class="method-detail">
134
- <a name="M000010"></a>
135
-
136
- <div class="method-heading">
137
- <a href="#M000010" class="method-signature">
138
- <span class="method-name">success?</span><span class="method-args">()</span>
139
- </a>
140
- </div>
141
-
142
- <div class="method-description">
143
- <p><a class="source-toggle" href="#"
144
- onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
145
- <div class="method-source-code" id="M000010-source">
146
- <pre>
147
- <span class="ruby-comment cmt"># File lib/spider.rb, line 34</span>
148
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">success?</span>; <span class="ruby-keyword kw">false</span>; <span class="ruby-keyword kw">end</span>
149
- </pre>
150
- </div>
151
- </div>
152
- </div>
153
-
154
-
155
- </div>
156
-
157
-
158
- </div>
159
-
160
-
161
- <div id="validator-badges">
162
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
163
- </div>
164
-
165
- </body>
166
- </html>