spider 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/CHANGES +6 -0
  2. data/README +3 -3
  3. data/doc/classes/BeStaticServerPages.html +197 -0
  4. data/doc/classes/BeStaticServerPages.src/M000030.html +19 -0
  5. data/doc/classes/BeStaticServerPages.src/M000031.html +19 -0
  6. data/doc/classes/BeStaticServerPages.src/M000032.html +18 -0
  7. data/doc/classes/BeStaticServerPages.src/M000033.html +18 -0
  8. data/doc/classes/IncludedInMemcached.html +18 -45
  9. data/doc/classes/IncludedInMemcached.src/M000015.html +18 -0
  10. data/doc/classes/IncludedInMemcached.src/M000016.html +18 -0
  11. data/doc/classes/IncludedInMemcached.src/M000017.html +18 -0
  12. data/doc/classes/LoopingServlet.html +137 -0
  13. data/doc/classes/LoopingServlet.src/M000037.html +23 -0
  14. data/doc/classes/NextUrlsInSQS.html +204 -0
  15. data/doc/classes/NextUrlsInSQS.src/M000018.html +19 -0
  16. data/doc/classes/NextUrlsInSQS.src/M000019.html +22 -0
  17. data/doc/classes/NextUrlsInSQS.src/M000020.html +19 -0
  18. data/doc/classes/QueryServlet.html +137 -0
  19. data/doc/classes/QueryServlet.src/M000038.html +19 -0
  20. data/doc/classes/RobotRules.html +175 -0
  21. data/doc/classes/RobotRules.src/M000034.html +19 -0
  22. data/doc/classes/RobotRules.src/M000035.html +67 -0
  23. data/doc/classes/RobotRules.src/M000036.html +24 -0
  24. data/doc/classes/Spider.html +5 -17
  25. data/doc/classes/Spider.src/M000029.html +21 -0
  26. data/doc/classes/SpiderInstance.html +72 -108
  27. data/doc/classes/SpiderInstance.src/M000021.html +18 -0
  28. data/doc/classes/SpiderInstance.src/M000022.html +22 -0
  29. data/doc/classes/SpiderInstance.src/M000023.html +22 -0
  30. data/doc/classes/SpiderInstance.src/M000024.html +24 -0
  31. data/doc/classes/SpiderInstance.src/M000025.html +18 -0
  32. data/doc/classes/SpiderInstance.src/M000026.html +18 -0
  33. data/doc/classes/SpiderInstance.src/M000027.html +18 -0
  34. data/doc/classes/SpiderInstance.src/M000028.html +18 -0
  35. data/doc/created.rid +1 -1
  36. data/doc/files/lib/spider/included_in_memcached_rb.html +29 -1
  37. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +144 -0
  38. data/doc/files/lib/spider/robot_rules_rb.html +114 -0
  39. data/doc/files/lib/spider/spider_instance_rb.html +1 -2
  40. data/doc/files/lib/spider_rb.html +40 -9
  41. data/doc/files/spec/spec_helper_rb.html +196 -0
  42. data/doc/files/spec/spec_helper_rb.src/M000001.html +20 -0
  43. data/doc/files/spec/spec_helper_rb.src/M000002.html +26 -0
  44. data/doc/files/spec/spec_helper_rb.src/M000003.html +24 -0
  45. data/doc/files/spec/spec_helper_rb.src/M000004.html +18 -0
  46. data/doc/files/spec/spec_helper_rb.src/M000005.html +23 -0
  47. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +142 -0
  48. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +19 -0
  49. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +18 -0
  50. data/doc/files/spec/spider/spider_instance_spec_rb.html +210 -0
  51. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +21 -0
  52. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +19 -0
  53. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +19 -0
  54. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +27 -0
  55. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +26 -0
  56. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +27 -0
  57. data/doc/files/spec/spider_spec_rb.html +127 -0
  58. data/doc/files/spec/spider_spec_rb.src/M000014.html +23 -0
  59. data/doc/fr_class_index.html +5 -0
  60. data/doc/fr_file_index.html +6 -1
  61. data/doc/fr_method_index.html +38 -11
  62. data/doc/index.html +1 -1
  63. data/lib/spider/spider_instance.rb +15 -7
  64. data/spider.gemspec +1 -1
  65. metadata +84 -22
  66. data/lib/test.rb +0 -27
@@ -0,0 +1,23 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>find_pages_with_static_server (spec/spider_spec.rb)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File spec/spider_spec.rb, line 25</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">find_pages_with_static_server</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
15
+ <span class="ruby-identifier">pages</span> = []
16
+ <span class="ruby-constant">Spider</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-value str">'http://localhost:8888/'</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">s</span><span class="ruby-operator">|</span>
17
+ <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">s</span>) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">block</span>.<span class="ruby-identifier">nil?</span>
18
+ <span class="ruby-identifier">s</span>.<span class="ruby-identifier">on</span>(<span class="ruby-identifier">:every</span>){ <span class="ruby-operator">|</span><span class="ruby-identifier">u</span>,<span class="ruby-identifier">r</span>,<span class="ruby-identifier">p</span><span class="ruby-operator">|</span> <span class="ruby-identifier">pages</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">u</span> }
19
+ <span class="ruby-keyword kw">end</span>
20
+ <span class="ruby-identifier">pages</span>
21
+ <span class="ruby-keyword kw">end</span></pre>
22
+ </body>
23
+ </html>
@@ -20,7 +20,12 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Classes</h1>
22
22
  <div id="index-entries">
23
+ <a href="classes/BeStaticServerPages.html">BeStaticServerPages</a><br />
23
24
  <a href="classes/IncludedInMemcached.html">IncludedInMemcached</a><br />
25
+ <a href="classes/LoopingServlet.html">LoopingServlet</a><br />
26
+ <a href="classes/NextUrlsInSQS.html">NextUrlsInSQS</a><br />
27
+ <a href="classes/QueryServlet.html">QueryServlet</a><br />
28
+ <a href="classes/RobotRules.html">RobotRules</a><br />
24
29
  <a href="classes/Spider.html">Spider</a><br />
25
30
  <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
26
31
  </div>
@@ -20,10 +20,15 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Files</h1>
22
22
  <div id="index-entries">
23
- <a href="files/README.html">README</a><br />
24
23
  <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
25
24
  <a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
25
+ <a href="files/lib/spider/next_urls_in_sqs_rb.html">lib/spider/next_urls_in_sqs.rb</a><br />
26
+ <a href="files/lib/spider/robot_rules_rb.html">lib/spider/robot_rules.rb</a><br />
26
27
  <a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
28
+ <a href="files/spec/spec_helper_rb.html">spec/spec_helper.rb</a><br />
29
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html">spec/spider/included_in_memcached_spec.rb</a><br />
30
+ <a href="files/spec/spider/spider_instance_spec_rb.html">spec/spider/spider_instance_spec.rb</a><br />
31
+ <a href="files/spec/spider_spec_rb.html">spec/spider_spec.rb</a><br />
27
32
  </div>
28
33
  </div>
29
34
  </body>
@@ -20,17 +20,44 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/IncludedInMemcached.html#M000002"><< (IncludedInMemcached)</a><br />
24
- <a href="classes/SpiderInstance.html#M000004">add_url_check (SpiderInstance)</a><br />
25
- <a href="classes/SpiderInstance.html#M000005">check_already_seen_with (SpiderInstance)</a><br />
26
- <a href="classes/SpiderInstance.html#M000010">clear_headers (SpiderInstance)</a><br />
27
- <a href="classes/SpiderInstance.html#M000009">headers (SpiderInstance)</a><br />
28
- <a href="classes/IncludedInMemcached.html#M000003">include? (IncludedInMemcached)</a><br />
29
- <a href="classes/IncludedInMemcached.html#M000001">new (IncludedInMemcached)</a><br />
30
- <a href="classes/SpiderInstance.html#M000006">on (SpiderInstance)</a><br />
31
- <a href="classes/SpiderInstance.html#M000007">setup (SpiderInstance)</a><br />
32
- <a href="classes/Spider.html#M000011">start_at (Spider)</a><br />
33
- <a href="classes/SpiderInstance.html#M000008">teardown (SpiderInstance)</a><br />
23
+ <a href="classes/IncludedInMemcached.html#M000016"><< (IncludedInMemcached)</a><br />
24
+ <a href="classes/SpiderInstance.html#M000021">add_url_check (SpiderInstance)</a><br />
25
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000007">after_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
26
+ <a href="classes/RobotRules.html#M000036">allowed? (RobotRules)</a><br />
27
+ <a href="files/spec/spec_helper_rb.html#M000004">be_static_server_pages (spec/spec_helper.rb)</a><br />
28
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000006">before_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
29
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000012">callback_arguments_on (spec/spider/spider_instance_spec.rb)</a><br />
30
+ <a href="classes/SpiderInstance.html#M000022">check_already_seen_with (SpiderInstance)</a><br />
31
+ <a href="classes/SpiderInstance.html#M000028">clear_headers (SpiderInstance)</a><br />
32
+ <a href="classes/BeStaticServerPages.html#M000033">description (BeStaticServerPages)</a><br />
33
+ <a href="classes/QueryServlet.html#M000038">do_GET (QueryServlet)</a><br />
34
+ <a href="classes/LoopingServlet.html#M000037">do_GET (LoopingServlet)</a><br />
35
+ <a href="classes/BeStaticServerPages.html#M000032">failure_message (BeStaticServerPages)</a><br />
36
+ <a href="files/spec/spider_spec_rb.html#M000014">find_pages_with_static_server (spec/spider_spec.rb)</a><br />
37
+ <a href="classes/SpiderInstance.html#M000027">headers (SpiderInstance)</a><br />
38
+ <a href="classes/IncludedInMemcached.html#M000017">include? (IncludedInMemcached)</a><br />
39
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000013">it_should_prevent_cycles_with (spec/spider/spider_instance_spec.rb)</a><br />
40
+ <a href="files/spec/spec_helper_rb.html#M000001">local_require (spec/spec_helper.rb)</a><br />
41
+ <a href="classes/BeStaticServerPages.html#M000031">matches? (BeStaticServerPages)</a><br />
42
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000010">mock_failed_http (spec/spider/spider_instance_spec.rb)</a><br />
43
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000008">mock_http (spec/spider/spider_instance_spec.rb)</a><br />
44
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000011">mock_redirect_http (spec/spider/spider_instance_spec.rb)</a><br />
45
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000009">mock_successful_http (spec/spider/spider_instance_spec.rb)</a><br />
46
+ <a href="classes/IncludedInMemcached.html#M000015">new (IncludedInMemcached)</a><br />
47
+ <a href="classes/NextUrlsInSQS.html#M000018">new (NextUrlsInSQS)</a><br />
48
+ <a href="classes/BeStaticServerPages.html#M000030">new (BeStaticServerPages)</a><br />
49
+ <a href="classes/RobotRules.html#M000034">new (RobotRules)</a><br />
50
+ <a href="files/spec/spec_helper_rb.html#M000005">null_logger (spec/spec_helper.rb)</a><br />
51
+ <a href="classes/SpiderInstance.html#M000024">on (SpiderInstance)</a><br />
52
+ <a href="classes/RobotRules.html#M000035">parse (RobotRules)</a><br />
53
+ <a href="classes/NextUrlsInSQS.html#M000019">pop (NextUrlsInSQS)</a><br />
54
+ <a href="classes/NextUrlsInSQS.html#M000020">push (NextUrlsInSQS)</a><br />
55
+ <a href="classes/SpiderInstance.html#M000025">setup (SpiderInstance)</a><br />
56
+ <a href="classes/Spider.html#M000029">start_at (Spider)</a><br />
57
+ <a href="classes/SpiderInstance.html#M000023">store_next_urls_with (SpiderInstance)</a><br />
58
+ <a href="classes/SpiderInstance.html#M000026">teardown (SpiderInstance)</a><br />
59
+ <a href="files/spec/spec_helper_rb.html#M000003">with_memcached (spec/spec_helper.rb)</a><br />
60
+ <a href="files/spec/spec_helper_rb.html#M000002">with_web_server (spec/spec_helper.rb)</a><br />
34
61
  </div>
35
62
  </div>
36
63
  </body>
data/doc/index.html CHANGED
@@ -19,6 +19,6 @@
19
19
  <frame src="fr_class_index.html" name="Classes" />
20
20
  <frame src="fr_method_index.html" name="Methods" />
21
21
  </frameset>
22
- <frame src="files/lib/spider_rb.html" name="docwin" />
22
+ <frame src="files/spec/spec_helper_rb.html" name="docwin" />
23
23
  </frameset>
24
24
  </html>
@@ -53,7 +53,7 @@ class SpiderInstance
53
53
  @callbacks = {}
54
54
  @next_urls = [next_urls]
55
55
  @seen = seen
56
- @rules = rules || RobotRules.new('Ruby Spider 1.0')
56
+ @rules = rules || RobotRules.new('Ruby Spider 0.4.4')
57
57
  @robots_seen = robots_seen
58
58
  @headers = {}
59
59
  @setup = nil
@@ -227,12 +227,18 @@ class SpiderInstance
227
227
 
228
228
  # True if the robots.txt for that URL allows access to it.
229
229
  def allowed?(a_url, parsed_url) # :nodoc:
230
+ return false unless ['http','https'].include?(parsed_url.scheme)
230
231
  u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
232
+ parsed_u = URI.parse(u)
233
+ return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
231
234
  begin
232
235
  unless @robots_seen.include?(u)
233
- open(u, 'User-Agent' => 'Ruby Spider',
234
- 'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
235
- @rules.parse(u, url.read)
236
+ #open(u, 'User-Agent' => 'Ruby Spider',
237
+ # 'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url|
238
+ # @rules.parse(u, url.read)
239
+ #end
240
+ get_page(parsed_u) do |r|
241
+ @rules.parse(u, r.body)
236
242
  end
237
243
  @robots_seen << u
238
244
  end
@@ -248,10 +254,12 @@ class SpiderInstance
248
254
  @seen << parsed_url
249
255
  begin
250
256
  http = Net::HTTP.new(parsed_url.host, parsed_url.port)
251
- http.use_ssl = parsed_url.scheme == 'https'
257
+ if parsed_url.scheme == 'https'
258
+ http.use_ssl = true
259
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
260
+ end
252
261
  # Uses start because http.finish cannot be called.
253
- r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
254
- @headers))}
262
+ r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
255
263
  if r.redirect?
256
264
  get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
257
265
  else
data/spider.gemspec CHANGED
@@ -14,5 +14,5 @@ spec = Gem::Specification.new do |s|
14
14
  A Web spidering library: handles robots.txt, scraping, finding more
15
15
  links, and doing it all over again.
16
16
  EOF
17
- s.version = '0.4.3'
17
+ s.version = '0.4.4'
18
18
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Nagro
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-10-09 00:00:00 -04:00
12
+ date: 2009-05-21 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -22,41 +22,103 @@ extensions: []
22
22
  extra_rdoc_files: []
23
23
 
24
24
  files:
25
- - CHANGES
25
+ - README
26
26
  - doc
27
+ - doc/fr_class_index.html
28
+ - doc/rdoc-style.css
29
+ - doc/fr_file_index.html
30
+ - doc/created.rid
31
+ - doc/fr_method_index.html
27
32
  - doc/classes
28
- - doc/classes/IncludedInMemcached.html
33
+ - doc/classes/RobotRules.html
34
+ - doc/classes/Spider.src
35
+ - doc/classes/Spider.src/M000029.html
36
+ - doc/classes/NextUrlsInSQS.src
37
+ - doc/classes/NextUrlsInSQS.src/M000018.html
38
+ - doc/classes/NextUrlsInSQS.src/M000019.html
39
+ - doc/classes/NextUrlsInSQS.src/M000020.html
40
+ - doc/classes/LoopingServlet.html
29
41
  - doc/classes/Spider.html
42
+ - doc/classes/BeStaticServerPages.html
43
+ - doc/classes/SpiderInstance.src
44
+ - doc/classes/SpiderInstance.src/M000021.html
45
+ - doc/classes/SpiderInstance.src/M000023.html
46
+ - doc/classes/SpiderInstance.src/M000027.html
47
+ - doc/classes/SpiderInstance.src/M000025.html
48
+ - doc/classes/SpiderInstance.src/M000026.html
49
+ - doc/classes/SpiderInstance.src/M000028.html
50
+ - doc/classes/SpiderInstance.src/M000022.html
51
+ - doc/classes/SpiderInstance.src/M000024.html
52
+ - doc/classes/LoopingServlet.src
53
+ - doc/classes/LoopingServlet.src/M000037.html
54
+ - doc/classes/IncludedInMemcached.html
55
+ - doc/classes/RobotRules.src
56
+ - doc/classes/RobotRules.src/M000035.html
57
+ - doc/classes/RobotRules.src/M000034.html
58
+ - doc/classes/RobotRules.src/M000036.html
59
+ - doc/classes/BeStaticServerPages.src
60
+ - doc/classes/BeStaticServerPages.src/M000032.html
61
+ - doc/classes/BeStaticServerPages.src/M000033.html
62
+ - doc/classes/BeStaticServerPages.src/M000030.html
63
+ - doc/classes/BeStaticServerPages.src/M000031.html
64
+ - doc/classes/NextUrlsInSQS.html
65
+ - doc/classes/IncludedInMemcached.src
66
+ - doc/classes/IncludedInMemcached.src/M000017.html
67
+ - doc/classes/IncludedInMemcached.src/M000015.html
68
+ - doc/classes/IncludedInMemcached.src/M000016.html
30
69
  - doc/classes/SpiderInstance.html
31
- - doc/created.rid
70
+ - doc/classes/QueryServlet.src
71
+ - doc/classes/QueryServlet.src/M000038.html
72
+ - doc/classes/QueryServlet.html
32
73
  - doc/files
74
+ - doc/files/README.html
75
+ - doc/files/spec
76
+ - doc/files/spec/spec_helper_rb.src
77
+ - doc/files/spec/spec_helper_rb.src/M000002.html
78
+ - doc/files/spec/spec_helper_rb.src/M000005.html
79
+ - doc/files/spec/spec_helper_rb.src/M000001.html
80
+ - doc/files/spec/spec_helper_rb.src/M000003.html
81
+ - doc/files/spec/spec_helper_rb.src/M000004.html
82
+ - doc/files/spec/spider_spec_rb.src
83
+ - doc/files/spec/spider_spec_rb.src/M000014.html
84
+ - doc/files/spec/spec_helper_rb.html
85
+ - doc/files/spec/spider
86
+ - doc/files/spec/spider/included_in_memcached_spec_rb.src
87
+ - doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html
88
+ - doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html
89
+ - doc/files/spec/spider/spider_instance_spec_rb.src
90
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html
91
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html
92
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html
93
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html
94
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html
95
+ - doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html
96
+ - doc/files/spec/spider/included_in_memcached_spec_rb.html
97
+ - doc/files/spec/spider/spider_instance_spec_rb.html
98
+ - doc/files/spec/spider_spec_rb.html
33
99
  - doc/files/lib
100
+ - doc/files/lib/spider_rb.html
34
101
  - doc/files/lib/spider
35
102
  - doc/files/lib/spider/included_in_memcached_rb.html
103
+ - doc/files/lib/spider/robot_rules_rb.html
104
+ - doc/files/lib/spider/next_urls_in_sqs_rb.html
36
105
  - doc/files/lib/spider/spider_instance_rb.html
37
- - doc/files/lib/spider_rb.html
38
- - doc/files/README.html
39
- - doc/fr_class_index.html
40
- - doc/fr_file_index.html
41
- - doc/fr_method_index.html
42
106
  - doc/index.html
43
- - doc/rdoc-style.css
44
- - lib
45
- - lib/spider
46
- - lib/spider/included_in_memcached.rb
47
- - lib/spider/next_urls_in_sqs.rb
48
- - lib/spider/robot_rules.rb
49
- - lib/spider/spider_instance.rb
50
- - lib/spider.rb
51
- - lib/test.rb
52
- - README
107
+ - spider.gemspec
53
108
  - spec
54
109
  - spec/spec_helper.rb
55
110
  - spec/spider
56
111
  - spec/spider/included_in_memcached_spec.rb
57
112
  - spec/spider/spider_instance_spec.rb
58
113
  - spec/spider_spec.rb
59
- - spider.gemspec
114
+ - CHANGES
115
+ - lib
116
+ - lib/spider.rb
117
+ - lib/spider
118
+ - lib/spider/robot_rules.rb
119
+ - lib/spider/next_urls_in_sqs.rb
120
+ - lib/spider/spider_instance.rb
121
+ - lib/spider/included_in_memcached.rb
60
122
  has_rdoc: true
61
123
  homepage: http://spider.rubyforge.org/
62
124
  post_install_message:
@@ -79,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
141
  requirements: []
80
142
 
81
143
  rubyforge_project: spider
82
- rubygems_version: 1.0.1
144
+ rubygems_version: 1.3.0
83
145
  signing_key:
84
146
  specification_version: 2
85
147
  summary: A Web spidering library
data/lib/test.rb DELETED
@@ -1,27 +0,0 @@
1
- require 'spider.rb'
2
- require 'spider/next_urls_in_sqs.rb'
3
-
4
- class MyArray < Array
5
- def pop
6
- a_msg = super
7
- puts "pop: #{a_msg.inspect}"
8
- return a_msg
9
- end
10
-
11
- def push(a_msg)
12
- puts "push: #{a_msg.inspect}"
13
- super(a_msg)
14
- end
15
- end
16
-
17
- AWS_ACCESS_KEY = '0YA99M8Y09J2D4FEC602'
18
- AWS_SECRET_ACCESS_KEY = 'Sc9R9uiwbFYz7XhQqkPvSK3Bbq4tPYPVMWyDlF+a'
19
-
20
- #Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
21
- Spider.start_at("http://www.google.com") do |s|
22
- #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
23
- s.store_next_urls_with MyArray.new
24
- s.on(:every) do |a_url, resp, prior_url|
25
- puts a_url
26
- end
27
- end