spider 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66)
  1. data/CHANGES +6 -0
  2. data/README +3 -3
  3. data/doc/classes/BeStaticServerPages.html +197 -0
  4. data/doc/classes/BeStaticServerPages.src/M000030.html +19 -0
  5. data/doc/classes/BeStaticServerPages.src/M000031.html +19 -0
  6. data/doc/classes/BeStaticServerPages.src/M000032.html +18 -0
  7. data/doc/classes/BeStaticServerPages.src/M000033.html +18 -0
  8. data/doc/classes/IncludedInMemcached.html +18 -45
  9. data/doc/classes/IncludedInMemcached.src/M000015.html +18 -0
  10. data/doc/classes/IncludedInMemcached.src/M000016.html +18 -0
  11. data/doc/classes/IncludedInMemcached.src/M000017.html +18 -0
  12. data/doc/classes/LoopingServlet.html +137 -0
  13. data/doc/classes/LoopingServlet.src/M000037.html +23 -0
  14. data/doc/classes/NextUrlsInSQS.html +204 -0
  15. data/doc/classes/NextUrlsInSQS.src/M000018.html +19 -0
  16. data/doc/classes/NextUrlsInSQS.src/M000019.html +22 -0
  17. data/doc/classes/NextUrlsInSQS.src/M000020.html +19 -0
  18. data/doc/classes/QueryServlet.html +137 -0
  19. data/doc/classes/QueryServlet.src/M000038.html +19 -0
  20. data/doc/classes/RobotRules.html +175 -0
  21. data/doc/classes/RobotRules.src/M000034.html +19 -0
  22. data/doc/classes/RobotRules.src/M000035.html +67 -0
  23. data/doc/classes/RobotRules.src/M000036.html +24 -0
  24. data/doc/classes/Spider.html +5 -17
  25. data/doc/classes/Spider.src/M000029.html +21 -0
  26. data/doc/classes/SpiderInstance.html +72 -108
  27. data/doc/classes/SpiderInstance.src/M000021.html +18 -0
  28. data/doc/classes/SpiderInstance.src/M000022.html +22 -0
  29. data/doc/classes/SpiderInstance.src/M000023.html +22 -0
  30. data/doc/classes/SpiderInstance.src/M000024.html +24 -0
  31. data/doc/classes/SpiderInstance.src/M000025.html +18 -0
  32. data/doc/classes/SpiderInstance.src/M000026.html +18 -0
  33. data/doc/classes/SpiderInstance.src/M000027.html +18 -0
  34. data/doc/classes/SpiderInstance.src/M000028.html +18 -0
  35. data/doc/created.rid +1 -1
  36. data/doc/files/lib/spider/included_in_memcached_rb.html +29 -1
  37. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +144 -0
  38. data/doc/files/lib/spider/robot_rules_rb.html +114 -0
  39. data/doc/files/lib/spider/spider_instance_rb.html +1 -2
  40. data/doc/files/lib/spider_rb.html +40 -9
  41. data/doc/files/spec/spec_helper_rb.html +196 -0
  42. data/doc/files/spec/spec_helper_rb.src/M000001.html +20 -0
  43. data/doc/files/spec/spec_helper_rb.src/M000002.html +26 -0
  44. data/doc/files/spec/spec_helper_rb.src/M000003.html +24 -0
  45. data/doc/files/spec/spec_helper_rb.src/M000004.html +18 -0
  46. data/doc/files/spec/spec_helper_rb.src/M000005.html +23 -0
  47. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +142 -0
  48. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +19 -0
  49. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +18 -0
  50. data/doc/files/spec/spider/spider_instance_spec_rb.html +210 -0
  51. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +21 -0
  52. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +19 -0
  53. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +19 -0
  54. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +27 -0
  55. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +26 -0
  56. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +27 -0
  57. data/doc/files/spec/spider_spec_rb.html +127 -0
  58. data/doc/files/spec/spider_spec_rb.src/M000014.html +23 -0
  59. data/doc/fr_class_index.html +5 -0
  60. data/doc/fr_file_index.html +6 -1
  61. data/doc/fr_method_index.html +38 -11
  62. data/doc/index.html +1 -1
  63. data/lib/spider/spider_instance.rb +15 -7
  64. data/spider.gemspec +1 -1
  65. metadata +84 -22
  66. data/lib/test.rb +0 -27
data/doc/files/spec/spider_spec_rb.src/M000014.html ADDED
@@ -0,0 +1,23 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>find_pages_with_static_server (spec/spider_spec.rb)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File spec/spider_spec.rb, line 25</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">find_pages_with_static_server</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
15
+ <span class="ruby-identifier">pages</span> = []
16
+ <span class="ruby-constant">Spider</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-value str">'http://localhost:8888/'</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">s</span><span class="ruby-operator">|</span>
17
+ <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">s</span>) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">block</span>.<span class="ruby-identifier">nil?</span>
18
+ <span class="ruby-identifier">s</span>.<span class="ruby-identifier">on</span>(<span class="ruby-identifier">:every</span>){ <span class="ruby-operator">|</span><span class="ruby-identifier">u</span>,<span class="ruby-identifier">r</span>,<span class="ruby-identifier">p</span><span class="ruby-operator">|</span> <span class="ruby-identifier">pages</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">u</span> }
19
+ <span class="ruby-keyword kw">end</span>
20
+ <span class="ruby-identifier">pages</span>
21
+ <span class="ruby-keyword kw">end</span></pre>
22
+ </body>
23
+ </html>
data/doc/fr_class_index.html CHANGED
@@ -20,7 +20,12 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Classes</h1>
22
22
  <div id="index-entries">
23
+ <a href="classes/BeStaticServerPages.html">BeStaticServerPages</a><br />
23
24
  <a href="classes/IncludedInMemcached.html">IncludedInMemcached</a><br />
25
+ <a href="classes/LoopingServlet.html">LoopingServlet</a><br />
26
+ <a href="classes/NextUrlsInSQS.html">NextUrlsInSQS</a><br />
27
+ <a href="classes/QueryServlet.html">QueryServlet</a><br />
28
+ <a href="classes/RobotRules.html">RobotRules</a><br />
24
29
  <a href="classes/Spider.html">Spider</a><br />
25
30
  <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
26
31
  </div>
data/doc/fr_file_index.html CHANGED
@@ -20,10 +20,15 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Files</h1>
22
22
  <div id="index-entries">
23
- <a href="files/README.html">README</a><br />
24
23
  <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
25
24
  <a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
25
+ <a href="files/lib/spider/next_urls_in_sqs_rb.html">lib/spider/next_urls_in_sqs.rb</a><br />
26
+ <a href="files/lib/spider/robot_rules_rb.html">lib/spider/robot_rules.rb</a><br />
26
27
  <a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
28
+ <a href="files/spec/spec_helper_rb.html">spec/spec_helper.rb</a><br />
29
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html">spec/spider/included_in_memcached_spec.rb</a><br />
30
+ <a href="files/spec/spider/spider_instance_spec_rb.html">spec/spider/spider_instance_spec.rb</a><br />
31
+ <a href="files/spec/spider_spec_rb.html">spec/spider_spec.rb</a><br />
27
32
  </div>
28
33
  </div>
29
34
  </body>
data/doc/fr_method_index.html CHANGED
@@ -20,17 +20,44 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/IncludedInMemcached.html#M000002"><< (IncludedInMemcached)</a><br />
24
- <a href="classes/SpiderInstance.html#M000004">add_url_check (SpiderInstance)</a><br />
25
- <a href="classes/SpiderInstance.html#M000005">check_already_seen_with (SpiderInstance)</a><br />
26
- <a href="classes/SpiderInstance.html#M000010">clear_headers (SpiderInstance)</a><br />
27
- <a href="classes/SpiderInstance.html#M000009">headers (SpiderInstance)</a><br />
28
- <a href="classes/IncludedInMemcached.html#M000003">include? (IncludedInMemcached)</a><br />
29
- <a href="classes/IncludedInMemcached.html#M000001">new (IncludedInMemcached)</a><br />
30
- <a href="classes/SpiderInstance.html#M000006">on (SpiderInstance)</a><br />
31
- <a href="classes/SpiderInstance.html#M000007">setup (SpiderInstance)</a><br />
32
- <a href="classes/Spider.html#M000011">start_at (Spider)</a><br />
33
- <a href="classes/SpiderInstance.html#M000008">teardown (SpiderInstance)</a><br />
23
+ <a href="classes/IncludedInMemcached.html#M000016"><< (IncludedInMemcached)</a><br />
24
+ <a href="classes/SpiderInstance.html#M000021">add_url_check (SpiderInstance)</a><br />
25
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000007">after_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
26
+ <a href="classes/RobotRules.html#M000036">allowed? (RobotRules)</a><br />
27
+ <a href="files/spec/spec_helper_rb.html#M000004">be_static_server_pages (spec/spec_helper.rb)</a><br />
28
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000006">before_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
29
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000012">callback_arguments_on (spec/spider/spider_instance_spec.rb)</a><br />
30
+ <a href="classes/SpiderInstance.html#M000022">check_already_seen_with (SpiderInstance)</a><br />
31
+ <a href="classes/SpiderInstance.html#M000028">clear_headers (SpiderInstance)</a><br />
32
+ <a href="classes/BeStaticServerPages.html#M000033">description (BeStaticServerPages)</a><br />
33
+ <a href="classes/QueryServlet.html#M000038">do_GET (QueryServlet)</a><br />
34
+ <a href="classes/LoopingServlet.html#M000037">do_GET (LoopingServlet)</a><br />
35
+ <a href="classes/BeStaticServerPages.html#M000032">failure_message (BeStaticServerPages)</a><br />
36
+ <a href="files/spec/spider_spec_rb.html#M000014">find_pages_with_static_server (spec/spider_spec.rb)</a><br />
37
+ <a href="classes/SpiderInstance.html#M000027">headers (SpiderInstance)</a><br />
38
+ <a href="classes/IncludedInMemcached.html#M000017">include? (IncludedInMemcached)</a><br />
39
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000013">it_should_prevent_cycles_with (spec/spider/spider_instance_spec.rb)</a><br />
40
+ <a href="files/spec/spec_helper_rb.html#M000001">local_require (spec/spec_helper.rb)</a><br />
41
+ <a href="classes/BeStaticServerPages.html#M000031">matches? (BeStaticServerPages)</a><br />
42
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000010">mock_failed_http (spec/spider/spider_instance_spec.rb)</a><br />
43
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000008">mock_http (spec/spider/spider_instance_spec.rb)</a><br />
44
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000011">mock_redirect_http (spec/spider/spider_instance_spec.rb)</a><br />
45
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000009">mock_successful_http (spec/spider/spider_instance_spec.rb)</a><br />
46
+ <a href="classes/IncludedInMemcached.html#M000015">new (IncludedInMemcached)</a><br />
47
+ <a href="classes/NextUrlsInSQS.html#M000018">new (NextUrlsInSQS)</a><br />
48
+ <a href="classes/BeStaticServerPages.html#M000030">new (BeStaticServerPages)</a><br />
49
+ <a href="classes/RobotRules.html#M000034">new (RobotRules)</a><br />
50
+ <a href="files/spec/spec_helper_rb.html#M000005">null_logger (spec/spec_helper.rb)</a><br />
51
+ <a href="classes/SpiderInstance.html#M000024">on (SpiderInstance)</a><br />
52
+ <a href="classes/RobotRules.html#M000035">parse (RobotRules)</a><br />
53
+ <a href="classes/NextUrlsInSQS.html#M000019">pop (NextUrlsInSQS)</a><br />
54
+ <a href="classes/NextUrlsInSQS.html#M000020">push (NextUrlsInSQS)</a><br />
55
+ <a href="classes/SpiderInstance.html#M000025">setup (SpiderInstance)</a><br />
56
+ <a href="classes/Spider.html#M000029">start_at (Spider)</a><br />
57
+ <a href="classes/SpiderInstance.html#M000023">store_next_urls_with (SpiderInstance)</a><br />
58
+ <a href="classes/SpiderInstance.html#M000026">teardown (SpiderInstance)</a><br />
59
+ <a href="files/spec/spec_helper_rb.html#M000003">with_memcached (spec/spec_helper.rb)</a><br />
60
+ <a href="files/spec/spec_helper_rb.html#M000002">with_web_server (spec/spec_helper.rb)</a><br />
34
61
  </div>
35
62
  </div>
36
63
  </body>
data/doc/index.html CHANGED
@@ -19,6 +19,6 @@
19
19
  <frame src="fr_class_index.html" name="Classes" />
20
20
  <frame src="fr_method_index.html" name="Methods" />
21
21
  </frameset>
22
- <frame src="files/lib/spider_rb.html" name="docwin" />
22
+ <frame src="files/spec/spec_helper_rb.html" name="docwin" />
23
23
  </frameset>
24
24
  </html>
data/lib/spider/spider_instance.rb CHANGED
@@ -53,7 +53,7 @@ class SpiderInstance
53
53
  @callbacks = {}
54
54
  @next_urls = [next_urls]
55
55
  @seen = seen
56
- @rules = rules || RobotRules.new('Ruby Spider 1.0')
56
+ @rules = rules || RobotRules.new('Ruby Spider 0.4.4')
57
57
  @robots_seen = robots_seen
58
58
  @headers = {}
59
59
  @setup = nil
@@ -227,12 +227,18 @@ class SpiderInstance
227
227
 
228
228
  # True if the robots.txt for that URL allows access to it.
229
229
  def allowed?(a_url, parsed_url) # :nodoc:
230
+ return false unless ['http','https'].include?(parsed_url.scheme)
230
231
  u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
232
+ parsed_u = URI.parse(u)
233
+ return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
231
234
  begin
232
235
  unless @robots_seen.include?(u)
233
- open(u, 'User-Agent' => 'Ruby Spider',
234
- 'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
235
- @rules.parse(u, url.read)
236
+ #open(u, 'User-Agent' => 'Ruby Spider',
237
+ # 'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url|
238
+ # @rules.parse(u, url.read)
239
+ #end
240
+ get_page(parsed_u) do |r|
241
+ @rules.parse(u, r.body)
236
242
  end
237
243
  @robots_seen << u
238
244
  end
@@ -248,10 +254,12 @@ class SpiderInstance
248
254
  @seen << parsed_url
249
255
  begin
250
256
  http = Net::HTTP.new(parsed_url.host, parsed_url.port)
251
- http.use_ssl = parsed_url.scheme == 'https'
257
+ if parsed_url.scheme == 'https'
258
+ http.use_ssl = true
259
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
260
+ end
252
261
  # Uses start because http.finish cannot be called.
253
- r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
254
- @headers))}
262
+ r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
255
263
  if r.redirect?
256
264
  get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
257
265
  else
data/spider.gemspec CHANGED
@@ -14,5 +14,5 @@ spec = Gem::Specification.new do |s|
14
14
  A Web spidering library: handles robots.txt, scraping, finding more
15
15
  links, and doing it all over again.
16
16
  EOF
17
- s.version = '0.4.3'
17
+ s.version = '0.4.4'
18
18
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Nagro
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-10-09 00:00:00 -04:00
12
+ date: 2009-05-21 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -22,41 +22,103 @@ extensions: []
22
22
  extra_rdoc_files: []
23
23
 
24
24
  files:
25
- - CHANGES
25
+ - README
26
26
  - doc
27
+ - doc/fr_class_index.html
28
+ - doc/rdoc-style.css
29
+ - doc/fr_file_index.html
30
+ - doc/created.rid
31
+ - doc/fr_method_index.html
27
32
  - doc/classes
28
- - doc/classes/IncludedInMemcached.html
33
+ - doc/classes/RobotRules.html
34
+ - doc/classes/Spider.src
35
+ - doc/classes/Spider.src/M000029.html
36
+ - doc/classes/NextUrlsInSQS.src
37
+ - doc/classes/NextUrlsInSQS.src/M000018.html
38
+ - doc/classes/NextUrlsInSQS.src/M000019.html
39
+ - doc/classes/NextUrlsInSQS.src/M000020.html
40
+ - doc/classes/LoopingServlet.html
29
41
  - doc/classes/Spider.html
42
+ - doc/classes/BeStaticServerPages.html
43
+ - doc/classes/SpiderInstance.src
44
+ - doc/classes/SpiderInstance.src/M000021.html
45
+ - doc/classes/SpiderInstance.src/M000023.html
46
+ - doc/classes/SpiderInstance.src/M000027.html
47
+ - doc/classes/SpiderInstance.src/M000025.html
48
+ - doc/classes/SpiderInstance.src/M000026.html
49
+ - doc/classes/SpiderInstance.src/M000028.html
50
+ - doc/classes/SpiderInstance.src/M000022.html
51
+ - doc/classes/SpiderInstance.src/M000024.html
52
+ - doc/classes/LoopingServlet.src
53
+ - doc/classes/LoopingServlet.src/M000037.html
54
+ - doc/classes/IncludedInMemcached.html
55
+ - doc/classes/RobotRules.src
56
+ - doc/classes/RobotRules.src/M000035.html
57
+ - doc/classes/RobotRules.src/M000034.html
58
+ - doc/classes/RobotRules.src/M000036.html
59
+ - doc/classes/BeStaticServerPages.src
60
+ - doc/classes/BeStaticServerPages.src/M000032.html
61
+ - doc/classes/BeStaticServerPages.src/M000033.html
62
+ - doc/classes/BeStaticServerPages.src/M000030.html
63
+ - doc/classes/BeStaticServerPages.src/M000031.html
64
+ - doc/classes/NextUrlsInSQS.html
65
+ - doc/classes/IncludedInMemcached.src
66
+ - doc/classes/IncludedInMemcached.src/M000017.html
67
+ - doc/classes/IncludedInMemcached.src/M000015.html
68
+ - doc/classes/IncludedInMemcached.src/M000016.html
30
69
  - doc/classes/SpiderInstance.html
31
- - doc/created.rid
70
+ - doc/classes/QueryServlet.src
71
+ - doc/classes/QueryServlet.src/M000038.html
72
+ - doc/classes/QueryServlet.html
32
73
  - doc/files
74
+ - doc/files/README.html
75
+ - doc/files/spec
76
+ - doc/files/spec/spec_helper_rb.src
77
+ - doc/files/spec/spec_helper_rb.src/M000002.html
78
+ - doc/files/spec/spec_helper_rb.src/M000005.html
79
+ - doc/files/spec/spec_helper_rb.src/M000001.html
80
+ - doc/files/spec/spec_helper_rb.src/M000003.html
81
+ - doc/files/spec/spec_helper_rb.src/M000004.html
82
+ - doc/files/spec/spider_spec_rb.src
83
+ - doc/files/spec/spider_spec_rb.src/M000014.html
84
+ - doc/files/spec/spec_helper_rb.html
85
+ - doc/files/spec/spider
86
+ - doc/files/spec/spider/included_in_memcached_spec_rb.src
87
+ - doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html
88
+ - doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html
89
+ - doc/files/spec/spider/spider_instance_spec_rb.src
90
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html
91
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html
92
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html
93
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html
94
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html
95
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html
96
+ - doc/files/spec/spider/included_in_memcached_spec_rb.html
97
+ - doc/files/spec/spider/spider_instance_spec_rb.html
98
+ - doc/files/spec/spider_spec_rb.html
33
99
  - doc/files/lib
100
+ - doc/files/lib/spider_rb.html
34
101
  - doc/files/lib/spider
35
102
  - doc/files/lib/spider/included_in_memcached_rb.html
103
+ - doc/files/lib/spider/robot_rules_rb.html
104
+ - doc/files/lib/spider/next_urls_in_sqs_rb.html
36
105
  - doc/files/lib/spider/spider_instance_rb.html
37
- - doc/files/lib/spider_rb.html
38
- - doc/files/README.html
39
- - doc/fr_class_index.html
40
- - doc/fr_file_index.html
41
- - doc/fr_method_index.html
42
106
  - doc/index.html
43
- - doc/rdoc-style.css
44
- - lib
45
- - lib/spider
46
- - lib/spider/included_in_memcached.rb
47
- - lib/spider/next_urls_in_sqs.rb
48
- - lib/spider/robot_rules.rb
49
- - lib/spider/spider_instance.rb
50
- - lib/spider.rb
51
- - lib/test.rb
52
- - README
107
+ - spider.gemspec
53
108
  - spec
54
109
  - spec/spec_helper.rb
55
110
  - spec/spider
56
111
  - spec/spider/included_in_memcached_spec.rb
57
112
  - spec/spider/spider_instance_spec.rb
58
113
  - spec/spider_spec.rb
59
- - spider.gemspec
114
+ - CHANGES
115
+ - lib
116
+ - lib/spider.rb
117
+ - lib/spider
118
+ - lib/spider/robot_rules.rb
119
+ - lib/spider/next_urls_in_sqs.rb
120
+ - lib/spider/spider_instance.rb
121
+ - lib/spider/included_in_memcached.rb
60
122
  has_rdoc: true
61
123
  homepage: http://spider.rubyforge.org/
62
124
  post_install_message:
@@ -79,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
141
  requirements: []
80
142
 
81
143
  rubyforge_project: spider
82
- rubygems_version: 1.0.1
144
+ rubygems_version: 1.3.0
83
145
  signing_key:
84
146
  specification_version: 2
85
147
  summary: A Web spidering library
data/lib/test.rb DELETED
@@ -1,27 +0,0 @@
1
- require 'spider.rb'
2
- require 'spider/next_urls_in_sqs.rb'
3
-
4
- class MyArray < Array
5
- def pop
6
- a_msg = super
7
- puts "pop: #{a_msg.inspect}"
8
- return a_msg
9
- end
10
-
11
- def push(a_msg)
12
- puts "push: #{a_msg.inspect}"
13
- super(a_msg)
14
- end
15
- end
16
-
17
- AWS_ACCESS_KEY = '0YA99M8Y09J2D4FEC602'
18
- AWS_SECRET_ACCESS_KEY = 'Sc9R9uiwbFYz7XhQqkPvSK3Bbq4tPYPVMWyDlF+a'
19
-
20
- #Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
21
- Spider.start_at("http://www.google.com") do |s|
22
- #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
23
- s.store_next_urls_with MyArray.new
24
- s.on(:every) do |a_url, resp, prior_url|
25
- puts a_url
26
- end
27
- end