spider 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS +12 -0
  3. data/CHANGES +6 -0
  4. data/LICENSE +21 -0
  5. data/{README → README.md} +50 -43
  6. data/lib/spider.rb +12 -29
  7. data/lib/spider/included_in_memcached.rb +1 -24
  8. data/lib/spider/next_urls_in_sqs.rb +6 -29
  9. data/lib/spider/robot_rules.rb +61 -57
  10. data/lib/spider/spider_instance.rb +8 -31
  11. data/spider.gemspec +4 -2
  12. metadata +33 -124
  13. data/doc/classes/BeStaticServerPages.html +0 -197
  14. data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
  15. data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
  16. data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
  17. data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
  18. data/doc/classes/IncludedInMemcached.html +0 -199
  19. data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
  20. data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
  21. data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
  22. data/doc/classes/LoopingServlet.html +0 -137
  23. data/doc/classes/LoopingServlet.src/M000037.html +0 -23
  24. data/doc/classes/NextUrlsInSQS.html +0 -204
  25. data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
  26. data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
  27. data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
  28. data/doc/classes/QueryServlet.html +0 -137
  29. data/doc/classes/QueryServlet.src/M000038.html +0 -19
  30. data/doc/classes/RobotRules.html +0 -175
  31. data/doc/classes/RobotRules.src/M000034.html +0 -19
  32. data/doc/classes/RobotRules.src/M000035.html +0 -67
  33. data/doc/classes/RobotRules.src/M000036.html +0 -24
  34. data/doc/classes/Spider.html +0 -170
  35. data/doc/classes/Spider.src/M000029.html +0 -21
  36. data/doc/classes/SpiderInstance.html +0 -345
  37. data/doc/classes/SpiderInstance.src/M000021.html +0 -18
  38. data/doc/classes/SpiderInstance.src/M000022.html +0 -22
  39. data/doc/classes/SpiderInstance.src/M000023.html +0 -22
  40. data/doc/classes/SpiderInstance.src/M000024.html +0 -24
  41. data/doc/classes/SpiderInstance.src/M000025.html +0 -18
  42. data/doc/classes/SpiderInstance.src/M000026.html +0 -18
  43. data/doc/classes/SpiderInstance.src/M000027.html +0 -18
  44. data/doc/classes/SpiderInstance.src/M000028.html +0 -18
  45. data/doc/created.rid +0 -1
  46. data/doc/files/README.html +0 -223
  47. data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
  48. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
  49. data/doc/files/lib/spider/robot_rules_rb.html +0 -114
  50. data/doc/files/lib/spider/spider_instance_rb.html +0 -117
  51. data/doc/files/lib/spider_rb.html +0 -254
  52. data/doc/files/spec/spec_helper_rb.html +0 -196
  53. data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
  54. data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
  55. data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
  56. data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
  57. data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
  58. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
  59. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
  60. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
  61. data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
  62. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
  63. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
  64. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
  65. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
  66. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
  67. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
  68. data/doc/files/spec/spider_spec_rb.html +0 -127
  69. data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
  70. data/doc/fr_class_index.html +0 -34
  71. data/doc/fr_file_index.html +0 -35
  72. data/doc/fr_method_index.html +0 -64
  73. data/doc/index.html +0 -24
  74. data/doc/rdoc-style.css +0 -208
@@ -1,114 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>File: robot_rules.rb</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="fileHeader">
50
- <h1>robot_rules.rb</h1>
51
- <table class="header-table">
52
- <tr class="top-aligned-row">
53
- <td><strong>Path:</strong></td>
54
- <td>lib/spider/robot_rules.rb
55
- </td>
56
- </tr>
57
- <tr class="top-aligned-row">
58
- <td><strong>Last Update:</strong></td>
59
- <td>Thu May 21 13:19:06 +0000 2009</td>
60
- </tr>
61
- </table>
62
- </div>
63
- <!-- banner header -->
64
-
65
- <div id="bodyContent">
66
-
67
-
68
-
69
- <div id="contextContent">
70
-
71
- <div id="description">
72
- <p>
73
- Understand robots.txt.
74
- </p>
75
-
76
- </div>
77
-
78
- <div id="requires-list">
79
- <h3 class="section-bar">Required files</h3>
80
-
81
- <div class="name-list">
82
- uri&nbsp;&nbsp;
83
- </div>
84
- </div>
85
-
86
- </div>
87
-
88
-
89
- </div>
90
-
91
-
92
- <!-- if includes -->
93
-
94
- <div id="section">
95
-
96
-
97
-
98
-
99
-
100
-
101
-
102
-
103
- <!-- if method_list -->
104
-
105
-
106
- </div>
107
-
108
-
109
- <div id="validator-badges">
110
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
111
- </div>
112
-
113
- </body>
114
- </html>
@@ -1,117 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>File: spider_instance.rb</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="fileHeader">
50
- <h1>spider_instance.rb</h1>
51
- <table class="header-table">
52
- <tr class="top-aligned-row">
53
- <td><strong>Path:</strong></td>
54
- <td>lib/spider/spider_instance.rb
55
- </td>
56
- </tr>
57
- <tr class="top-aligned-row">
58
- <td><strong>Last Update:</strong></td>
59
- <td>Thu May 21 15:38:44 +0000 2009</td>
60
- </tr>
61
- </table>
62
- </div>
63
- <!-- banner header -->
64
-
65
- <div id="bodyContent">
66
-
67
-
68
-
69
- <div id="contextContent">
70
-
71
- <div id="description">
72
- <p>
73
- Specialized spidering rules.
74
- </p>
75
-
76
- </div>
77
-
78
- <div id="requires-list">
79
- <h3 class="section-bar">Required files</h3>
80
-
81
- <div class="name-list">
82
- open-uri&nbsp;&nbsp;
83
- uri&nbsp;&nbsp;
84
- net/http&nbsp;&nbsp;
85
- net/https&nbsp;&nbsp;
86
- </div>
87
- </div>
88
-
89
- </div>
90
-
91
-
92
- </div>
93
-
94
-
95
- <!-- if includes -->
96
-
97
- <div id="section">
98
-
99
-
100
-
101
-
102
-
103
-
104
-
105
-
106
- <!-- if method_list -->
107
-
108
-
109
- </div>
110
-
111
-
112
- <div id="validator-badges">
113
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
114
- </div>
115
-
116
- </body>
117
- </html>
@@ -1,254 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
- <head>
8
- <title>File: spider.rb</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
12
- <script type="text/javascript">
13
- // <![CDATA[
14
-
15
- function popupCode( url ) {
16
- window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
- }
18
-
19
- function toggleCode( id ) {
20
- if ( document.getElementById )
21
- elem = document.getElementById( id );
22
- else if ( document.all )
23
- elem = eval( "document.all." + id );
24
- else
25
- return false;
26
-
27
- elemStyle = elem.style;
28
-
29
- if ( elemStyle.display != "block" ) {
30
- elemStyle.display = "block"
31
- } else {
32
- elemStyle.display = "none"
33
- }
34
-
35
- return true;
36
- }
37
-
38
- // Make codeblocks hidden by default
39
- document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
-
41
- // ]]>
42
- </script>
43
-
44
- </head>
45
- <body>
46
-
47
-
48
-
49
- <div id="fileHeader">
50
- <h1>spider.rb</h1>
51
- <table class="header-table">
52
- <tr class="top-aligned-row">
53
- <td><strong>Path:</strong></td>
54
- <td>lib/spider.rb
55
- </td>
56
- </tr>
57
- <tr class="top-aligned-row">
58
- <td><strong>Last Update:</strong></td>
59
- <td>Thu May 21 13:19:06 +0000 2009</td>
60
- </tr>
61
- </table>
62
- </div>
63
- <!-- banner header -->
64
-
65
- <div id="bodyContent">
66
-
67
-
68
-
69
- <div id="contextContent">
70
-
71
- <div id="description">
72
- <p>
73
- Copyright 2007-2008 Mike Burns &amp; John Nagro
74
- </p>
75
- <p>
76
- <a href="../../classes/Spider.html">Spider</a>, a Web spidering library for
77
- Ruby. It handles the robots.txt, scraping, collecting, and looping so that
78
- you can just handle the data.
79
- </p>
80
- <h2>Examples</h2>
81
- <h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
82
- <pre>
83
- require 'spider'
84
- Spider.start_at('http://mike-burns.com/') {}
85
- </pre>
86
- <h3>To handle erroneous responses</h3>
87
- <pre>
88
- require 'spider'
89
- Spider.start_at('http://mike-burns.com/') do |s|
90
- s.on :failure do |a_url, resp, prior_url|
91
- puts &quot;URL failed: #{a_url}&quot;
92
- puts &quot; linked from #{prior_url}&quot;
93
- end
94
- end
95
- </pre>
96
- <h3>Or handle successful responses</h3>
97
- <pre>
98
- require 'spider'
99
- Spider.start_at('http://mike-burns.com/') do |s|
100
- s.on :success do |a_url, resp, prior_url|
101
- puts &quot;#{a_url}: #{resp.code}&quot;
102
- puts resp.body
103
- puts
104
- end
105
- end
106
- </pre>
107
- <h3>Limit to just one domain</h3>
108
- <pre>
109
- require 'spider'
110
- Spider.start_at('http://mike-burns.com/') do |s|
111
- s.add_url_check do |a_url|
112
- a_url =~ %r{^http://mike-burns.com.*}
113
- end
114
- end
115
- </pre>
116
- <h3>Pass headers to some requests</h3>
117
- <pre>
118
- require 'spider'
119
- Spider.start_at('http://mike-burns.com/') do |s|
120
- s.setup do |a_url|
121
- if a_url =~ %r{^http://.*wikipedia.*}
122
- headers['User-Agent'] = &quot;Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)&quot;
123
- end
124
- end
125
- end
126
- </pre>
127
- <h3>Use memcached to track cycles</h3>
128
- <pre>
129
- require 'spider'
130
- require 'spider/included_in_memcached'
131
- SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
132
- Spider.start_at('http://mike-burns.com/') do |s|
133
- s.check_already_seen_with IncludedInMemcached.new(SERVERS)
134
- end
135
- </pre>
136
- <h3>Track cycles with a custom object</h3>
137
- <pre>
138
- require 'spider'
139
- class ExpireLinks &lt; Hash
140
- def &lt;&lt;(v)
141
- self[v] = Time.now
142
- end
143
- def include?(v)
144
- self[v].kind_of?(Time) &amp;&amp; (self[v] + 86400) &gt;= Time.now
145
- end
146
- end
147
-
148
- Spider.start_at('http://mike-burns.com/') do |s|
149
- s.check_already_seen_with ExpireLinks.new
150
- end
151
- </pre>
152
- <h3>Store nodes to visit with Amazon SQS</h3>
153
- <pre>
154
- require 'spider'
155
- require 'spider/next_urls_in_sqs'
156
- Spider.start_at('http://mike-burns.com') do |s|
157
- s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
158
- end
159
- </pre>
160
- <h4>Store nodes to visit with a custom object</h4>
161
- <pre>
162
- require 'spider'
163
- class MyArray &lt; Array
164
- def pop
165
- super
166
- end
167
-
168
- def push(a_msg)
169
- super(a_msg)
170
- end
171
- end
172
-
173
- Spider.start_at('http://mike-burns.com') do |s|
174
- s.store_next_urls_with MyArray.new
175
- end
176
- </pre>
177
- <h3>Create a URL graph</h3>
178
- <pre>
179
- require 'spider'
180
- nodes = {}
181
- Spider.start_at('http://mike-burns.com/') do |s|
182
- s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
183
-
184
- s.on(:every) do |a_url, resp, prior_url|
185
- nodes[prior_url] ||= []
186
- nodes[prior_url] &lt;&lt; a_url
187
- end
188
- end
189
- </pre>
190
- <h3>Use a proxy</h3>
191
- <pre>
192
- require 'net/http_configuration'
193
- require 'spider'
194
- http_conf = Net::HTTP::Configuration.new(:proxy_host =&gt; '7proxies.org',
195
- :proxy_port =&gt; 8881)
196
- http_conf.apply do
197
- Spider.start_at('http://img.4chan.org/b/') do |s|
198
- s.on(:success) do |a_url, resp, prior_url|
199
- File.open(a_url.gsub('/',':'),'w') do |f|
200
- f.write(resp.body)
201
- end
202
- end
203
- end
204
- end
205
- </pre>
206
- <h2>Author</h2>
207
- <p>
208
- John Nagro john.nagro@gmail.com
209
- </p>
210
- <p>
211
- Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
212
- mike@mike-burns.com (original author)
213
- </p>
214
- <p>
215
- Many thanks to: Matt Horan Henri Cook Sander van der Vliet John Buckley
216
- Brian Campbell
217
- </p>
218
- <p>
219
- With `robot_rules&#8217; from James Edward Gray II via <a
220
- href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589</a>
221
- </p>
222
-
223
- </div>
224
-
225
-
226
- </div>
227
-
228
-
229
- </div>
230
-
231
-
232
- <!-- if includes -->
233
-
234
- <div id="section">
235
-
236
-
237
-
238
-
239
-
240
-
241
-
242
-
243
- <!-- if method_list -->
244
-
245
-
246
- </div>
247
-
248
-
249
- <div id="validator-badges">
250
- <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
251
- </div>
252
-
253
- </body>
254
- </html>