spider 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+ require File.dirname(__FILE__)+'/../lib/spider'
4
+
5
+ describe 'Spider' do
6
+ it 'should start at the given URL when given a string' do
7
+ #Spider.start_at('http://example.com/') {}
8
+ pending 'this will be a while'
9
+ end
10
+ end
data/spider.gemspec CHANGED
@@ -4,7 +4,7 @@ spec = Gem::Specification.new do |s|
4
4
  s.author = 'Mike Burns'
5
5
  s.email = 'mike@mike-burns.com'
6
6
  s.has_rdoc = true
7
- ###s.homepage = ''
7
+ s.homepage = 'http://spider.rubyforge.org/'
8
8
  s.name = 'spider'
9
9
  s.summary = 'A Web spidering library'
10
10
  s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
13
13
  A Web spidering library: handles robots.txt, scraping, finding more
14
14
  links, and doing it all over again.
15
15
  EOF
16
- s.version = '0.1.0'
16
+ s.version = '0.2.0'
17
17
  end
@@ -0,0 +1,22 @@
1
+ #!/usr/local/bin/ruby -w
2
+
3
+ require 'rubygems'
4
+ require 'spider'
5
+
6
+ Spider.start_at('http://localhost:8880/page1.html') do |s|
7
+ s.add_url_check do |a_url|
8
+ a_url =~ %r{^http://localhost:8880.*}
9
+ end
10
+
11
+ s.on 404 do |a_url, err_code|
12
+ puts "URL not found: #{a_url}"
13
+ end
14
+
15
+ s.on :success do |a_url, code, headers, body|
16
+ puts "body: #{body}"
17
+ end
18
+
19
+ s.on :any do |a_url, resp|
20
+ puts "URL returned anything: #{a_url} with this code #{resp.code}"
21
+ end
22
+ end
@@ -0,0 +1 @@
1
+ <a href="page2.html">See page two!</a>
@@ -0,0 +1,3 @@
1
+ <a href="page1.html">See page one!</a>
2
+ <a href="http://localhost:8881/page1.html">See server two!</a>
3
+
@@ -0,0 +1 @@
1
+ <a href="page2.html">See page two!</a>
@@ -0,0 +1,2 @@
1
+ <a href="page1.html">See page one!</a>
2
+ <a href="http://localhost:8880/page1.html">See server one!</a>
@@ -0,0 +1,24 @@
1
+ #!/usr/local/bin/ruby -w
2
+ # Two Web servers, on different ports of localhost, serving two pages each.
3
+ # One page links to the next; the next links to both the first and to the the
4
+ # first on the other server.
5
+ # This is used to test: cycles, domain restrictions.
6
+
7
+ require 'webrick'
8
+
9
+ server1 = WEBrick::HTTPServer.new(:Port => 8880)
10
+ server1.mount('/', WEBrick::HTTPServlet::FileHandler,
11
+ File.dirname(__FILE__)+'/server1')
12
+ server2 = WEBrick::HTTPServer.new(:Port => 8881)
13
+ server2.mount('/', WEBrick::HTTPServlet::FileHandler,
14
+ File.dirname(__FILE__)+'/server2')
15
+
16
+ %w(INT TERM).each do |signal|
17
+ trap(signal) { [server1,server2].each { |server| server.shutdown } }
18
+ end
19
+
20
+ threads = []
21
+ [server1,server2].each do |server|
22
+ threads << Thread.new { server.start }
23
+ end
24
+ threads.each { |t| t.join }
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: spider
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
7
- date: 2007-03-30 00:00:00 -04:00
6
+ version: 0.2.0
7
+ date: 2007-10-22 00:00:00 -04:00
8
8
  summary: A Web spidering library
9
9
  require_paths:
10
10
  - lib
11
11
  email: mike@mike-burns.com
12
- homepage:
12
+ homepage: http://spider.rubyforge.org/
13
13
  rubyforge_project:
14
14
  description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
15
15
  autorequire:
@@ -29,13 +29,39 @@ post_install_message:
29
29
  authors:
30
30
  - Mike Burns
31
31
  files:
32
- - spider.gemspec
32
+ - doc
33
+ - doc/rdoc-style.css
34
+ - doc/files
35
+ - doc/files/lib
36
+ - doc/files/lib/spider_rb.html
37
+ - doc/files/README.html
38
+ - doc/classes
39
+ - doc/classes/SpiderInstance.html
40
+ - doc/classes/Spider.html
41
+ - doc/classes/Net.html
42
+ - doc/fr_file_index.html
43
+ - doc/fr_class_index.html
44
+ - doc/fr_method_index.html
45
+ - doc/index.html
46
+ - doc/created.rid
47
+ - spec
48
+ - spec/spider_spec.rb
49
+ - spec/spider_instance_spec.rb
33
50
  - README
51
+ - spider.gemspec
34
52
  - CHANGES
35
- - LICENSE
36
53
  - lib
37
54
  - lib/spider.rb
38
55
  - lib/robot_rules.rb
56
+ - test_server
57
+ - test_server/server1
58
+ - test_server/server1/page1.html
59
+ - test_server/server1/page2.html
60
+ - test_server/server2
61
+ - test_server/server2/page1.html
62
+ - test_server/server2/page2.html
63
+ - test_server/servers.rb
64
+ - test_server/client.rb
39
65
  test_files: []
40
66
 
41
67
  rdoc_options: []
data/LICENSE DELETED
@@ -1,339 +0,0 @@
1
- GNU GENERAL PUBLIC LICENSE
2
- Version 2, June 1991
3
-
4
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6
- Everyone is permitted to copy and distribute verbatim copies
7
- of this license document, but changing it is not allowed.
8
-
9
- Preamble
10
-
11
- The licenses for most software are designed to take away your
12
- freedom to share and change it. By contrast, the GNU General Public
13
- License is intended to guarantee your freedom to share and change free
14
- software--to make sure the software is free for all its users. This
15
- General Public License applies to most of the Free Software
16
- Foundation's software and to any other program whose authors commit to
17
- using it. (Some other Free Software Foundation software is covered by
18
- the GNU Lesser General Public License instead.) You can apply it to
19
- your programs, too.
20
-
21
- When we speak of free software, we are referring to freedom, not
22
- price. Our General Public Licenses are designed to make sure that you
23
- have the freedom to distribute copies of free software (and charge for
24
- this service if you wish), that you receive source code or can get it
25
- if you want it, that you can change the software or use pieces of it
26
- in new free programs; and that you know you can do these things.
27
-
28
- To protect your rights, we need to make restrictions that forbid
29
- anyone to deny you these rights or to ask you to surrender the rights.
30
- These restrictions translate to certain responsibilities for you if you
31
- distribute copies of the software, or if you modify it.
32
-
33
- For example, if you distribute copies of such a program, whether
34
- gratis or for a fee, you must give the recipients all the rights that
35
- you have. You must make sure that they, too, receive or can get the
36
- source code. And you must show them these terms so they know their
37
- rights.
38
-
39
- We protect your rights with two steps: (1) copyright the software, and
40
- (2) offer you this license which gives you legal permission to copy,
41
- distribute and/or modify the software.
42
-
43
- Also, for each author's protection and ours, we want to make certain
44
- that everyone understands that there is no warranty for this free
45
- software. If the software is modified by someone else and passed on, we
46
- want its recipients to know that what they have is not the original, so
47
- that any problems introduced by others will not reflect on the original
48
- authors' reputations.
49
-
50
- Finally, any free program is threatened constantly by software
51
- patents. We wish to avoid the danger that redistributors of a free
52
- program will individually obtain patent licenses, in effect making the
53
- program proprietary. To prevent this, we have made it clear that any
54
- patent must be licensed for everyone's free use or not licensed at all.
55
-
56
- The precise terms and conditions for copying, distribution and
57
- modification follow.
58
-
59
- GNU GENERAL PUBLIC LICENSE
60
- TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61
-
62
- 0. This License applies to any program or other work which contains
63
- a notice placed by the copyright holder saying it may be distributed
64
- under the terms of this General Public License. The "Program", below,
65
- refers to any such program or work, and a "work based on the Program"
66
- means either the Program or any derivative work under copyright law:
67
- that is to say, a work containing the Program or a portion of it,
68
- either verbatim or with modifications and/or translated into another
69
- language. (Hereinafter, translation is included without limitation in
70
- the term "modification".) Each licensee is addressed as "you".
71
-
72
- Activities other than copying, distribution and modification are not
73
- covered by this License; they are outside its scope. The act of
74
- running the Program is not restricted, and the output from the Program
75
- is covered only if its contents constitute a work based on the
76
- Program (independent of having been made by running the Program).
77
- Whether that is true depends on what the Program does.
78
-
79
- 1. You may copy and distribute verbatim copies of the Program's
80
- source code as you receive it, in any medium, provided that you
81
- conspicuously and appropriately publish on each copy an appropriate
82
- copyright notice and disclaimer of warranty; keep intact all the
83
- notices that refer to this License and to the absence of any warranty;
84
- and give any other recipients of the Program a copy of this License
85
- along with the Program.
86
-
87
- You may charge a fee for the physical act of transferring a copy, and
88
- you may at your option offer warranty protection in exchange for a fee.
89
-
90
- 2. You may modify your copy or copies of the Program or any portion
91
- of it, thus forming a work based on the Program, and copy and
92
- distribute such modifications or work under the terms of Section 1
93
- above, provided that you also meet all of these conditions:
94
-
95
- a) You must cause the modified files to carry prominent notices
96
- stating that you changed the files and the date of any change.
97
-
98
- b) You must cause any work that you distribute or publish, that in
99
- whole or in part contains or is derived from the Program or any
100
- part thereof, to be licensed as a whole at no charge to all third
101
- parties under the terms of this License.
102
-
103
- c) If the modified program normally reads commands interactively
104
- when run, you must cause it, when started running for such
105
- interactive use in the most ordinary way, to print or display an
106
- announcement including an appropriate copyright notice and a
107
- notice that there is no warranty (or else, saying that you provide
108
- a warranty) and that users may redistribute the program under
109
- these conditions, and telling the user how to view a copy of this
110
- License. (Exception: if the Program itself is interactive but
111
- does not normally print such an announcement, your work based on
112
- the Program is not required to print an announcement.)
113
-
114
- These requirements apply to the modified work as a whole. If
115
- identifiable sections of that work are not derived from the Program,
116
- and can be reasonably considered independent and separate works in
117
- themselves, then this License, and its terms, do not apply to those
118
- sections when you distribute them as separate works. But when you
119
- distribute the same sections as part of a whole which is a work based
120
- on the Program, the distribution of the whole must be on the terms of
121
- this License, whose permissions for other licensees extend to the
122
- entire whole, and thus to each and every part regardless of who wrote it.
123
-
124
- Thus, it is not the intent of this section to claim rights or contest
125
- your rights to work written entirely by you; rather, the intent is to
126
- exercise the right to control the distribution of derivative or
127
- collective works based on the Program.
128
-
129
- In addition, mere aggregation of another work not based on the Program
130
- with the Program (or with a work based on the Program) on a volume of
131
- a storage or distribution medium does not bring the other work under
132
- the scope of this License.
133
-
134
- 3. You may copy and distribute the Program (or a work based on it,
135
- under Section 2) in object code or executable form under the terms of
136
- Sections 1 and 2 above provided that you also do one of the following:
137
-
138
- a) Accompany it with the complete corresponding machine-readable
139
- source code, which must be distributed under the terms of Sections
140
- 1 and 2 above on a medium customarily used for software interchange; or,
141
-
142
- b) Accompany it with a written offer, valid for at least three
143
- years, to give any third party, for a charge no more than your
144
- cost of physically performing source distribution, a complete
145
- machine-readable copy of the corresponding source code, to be
146
- distributed under the terms of Sections 1 and 2 above on a medium
147
- customarily used for software interchange; or,
148
-
149
- c) Accompany it with the information you received as to the offer
150
- to distribute corresponding source code. (This alternative is
151
- allowed only for noncommercial distribution and only if you
152
- received the program in object code or executable form with such
153
- an offer, in accord with Subsection b above.)
154
-
155
- The source code for a work means the preferred form of the work for
156
- making modifications to it. For an executable work, complete source
157
- code means all the source code for all modules it contains, plus any
158
- associated interface definition files, plus the scripts used to
159
- control compilation and installation of the executable. However, as a
160
- special exception, the source code distributed need not include
161
- anything that is normally distributed (in either source or binary
162
- form) with the major components (compiler, kernel, and so on) of the
163
- operating system on which the executable runs, unless that component
164
- itself accompanies the executable.
165
-
166
- If distribution of executable or object code is made by offering
167
- access to copy from a designated place, then offering equivalent
168
- access to copy the source code from the same place counts as
169
- distribution of the source code, even though third parties are not
170
- compelled to copy the source along with the object code.
171
-
172
- 4. You may not copy, modify, sublicense, or distribute the Program
173
- except as expressly provided under this License. Any attempt
174
- otherwise to copy, modify, sublicense or distribute the Program is
175
- void, and will automatically terminate your rights under this License.
176
- However, parties who have received copies, or rights, from you under
177
- this License will not have their licenses terminated so long as such
178
- parties remain in full compliance.
179
-
180
- 5. You are not required to accept this License, since you have not
181
- signed it. However, nothing else grants you permission to modify or
182
- distribute the Program or its derivative works. These actions are
183
- prohibited by law if you do not accept this License. Therefore, by
184
- modifying or distributing the Program (or any work based on the
185
- Program), you indicate your acceptance of this License to do so, and
186
- all its terms and conditions for copying, distributing or modifying
187
- the Program or works based on it.
188
-
189
- 6. Each time you redistribute the Program (or any work based on the
190
- Program), the recipient automatically receives a license from the
191
- original licensor to copy, distribute or modify the Program subject to
192
- these terms and conditions. You may not impose any further
193
- restrictions on the recipients' exercise of the rights granted herein.
194
- You are not responsible for enforcing compliance by third parties to
195
- this License.
196
-
197
- 7. If, as a consequence of a court judgment or allegation of patent
198
- infringement or for any other reason (not limited to patent issues),
199
- conditions are imposed on you (whether by court order, agreement or
200
- otherwise) that contradict the conditions of this License, they do not
201
- excuse you from the conditions of this License. If you cannot
202
- distribute so as to satisfy simultaneously your obligations under this
203
- License and any other pertinent obligations, then as a consequence you
204
- may not distribute the Program at all. For example, if a patent
205
- license would not permit royalty-free redistribution of the Program by
206
- all those who receive copies directly or indirectly through you, then
207
- the only way you could satisfy both it and this License would be to
208
- refrain entirely from distribution of the Program.
209
-
210
- If any portion of this section is held invalid or unenforceable under
211
- any particular circumstance, the balance of the section is intended to
212
- apply and the section as a whole is intended to apply in other
213
- circumstances.
214
-
215
- It is not the purpose of this section to induce you to infringe any
216
- patents or other property right claims or to contest validity of any
217
- such claims; this section has the sole purpose of protecting the
218
- integrity of the free software distribution system, which is
219
- implemented by public license practices. Many people have made
220
- generous contributions to the wide range of software distributed
221
- through that system in reliance on consistent application of that
222
- system; it is up to the author/donor to decide if he or she is willing
223
- to distribute software through any other system and a licensee cannot
224
- impose that choice.
225
-
226
- This section is intended to make thoroughly clear what is believed to
227
- be a consequence of the rest of this License.
228
-
229
- 8. If the distribution and/or use of the Program is restricted in
230
- certain countries either by patents or by copyrighted interfaces, the
231
- original copyright holder who places the Program under this License
232
- may add an explicit geographical distribution limitation excluding
233
- those countries, so that distribution is permitted only in or among
234
- countries not thus excluded. In such case, this License incorporates
235
- the limitation as if written in the body of this License.
236
-
237
- 9. The Free Software Foundation may publish revised and/or new versions
238
- of the General Public License from time to time. Such new versions will
239
- be similar in spirit to the present version, but may differ in detail to
240
- address new problems or concerns.
241
-
242
- Each version is given a distinguishing version number. If the Program
243
- specifies a version number of this License which applies to it and "any
244
- later version", you have the option of following the terms and conditions
245
- either of that version or of any later version published by the Free
246
- Software Foundation. If the Program does not specify a version number of
247
- this License, you may choose any version ever published by the Free Software
248
- Foundation.
249
-
250
- 10. If you wish to incorporate parts of the Program into other free
251
- programs whose distribution conditions are different, write to the author
252
- to ask for permission. For software which is copyrighted by the Free
253
- Software Foundation, write to the Free Software Foundation; we sometimes
254
- make exceptions for this. Our decision will be guided by the two goals
255
- of preserving the free status of all derivatives of our free software and
256
- of promoting the sharing and reuse of software generally.
257
-
258
- NO WARRANTY
259
-
260
- 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261
- FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262
- OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263
- PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264
- OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265
- MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266
- TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267
- PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268
- REPAIR OR CORRECTION.
269
-
270
- 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271
- WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272
- REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273
- INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274
- OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275
- TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276
- YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277
- PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278
- POSSIBILITY OF SUCH DAMAGES.
279
-
280
- END OF TERMS AND CONDITIONS
281
-
282
- How to Apply These Terms to Your New Programs
283
-
284
- If you develop a new program, and you want it to be of the greatest
285
- possible use to the public, the best way to achieve this is to make it
286
- free software which everyone can redistribute and change under these terms.
287
-
288
- To do so, attach the following notices to the program. It is safest
289
- to attach them to the start of each source file to most effectively
290
- convey the exclusion of warranty; and each file should have at least
291
- the "copyright" line and a pointer to where the full notice is found.
292
-
293
- <one line to give the program's name and a brief idea of what it does.>
294
- Copyright (C) <year> <name of author>
295
-
296
- This program is free software; you can redistribute it and/or modify
297
- it under the terms of the GNU General Public License as published by
298
- the Free Software Foundation; either version 2 of the License, or
299
- (at your option) any later version.
300
-
301
- This program is distributed in the hope that it will be useful,
302
- but WITHOUT ANY WARRANTY; without even the implied warranty of
303
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304
- GNU General Public License for more details.
305
-
306
- You should have received a copy of the GNU General Public License along
307
- with this program; if not, write to the Free Software Foundation, Inc.,
308
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309
-
310
- Also add information on how to contact you by electronic and paper mail.
311
-
312
- If the program is interactive, make it output a short notice like this
313
- when it starts in an interactive mode:
314
-
315
- Gnomovision version 69, Copyright (C) year name of author
316
- Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317
- This is free software, and you are welcome to redistribute it
318
- under certain conditions; type `show c' for details.
319
-
320
- The hypothetical commands `show w' and `show c' should show the appropriate
321
- parts of the General Public License. Of course, the commands you use may
322
- be called something other than `show w' and `show c'; they could even be
323
- mouse-clicks or menu items--whatever suits your program.
324
-
325
- You should also get your employer (if you work as a programmer) or your
326
- school, if any, to sign a "copyright disclaimer" for the program, if
327
- necessary. Here is a sample; alter the names:
328
-
329
- Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330
- `Gnomovision' (which makes passes at compilers) written by James Hacker.
331
-
332
- <signature of Ty Coon>, 1 April 1989
333
- Ty Coon, President of Vice
334
-
335
- This General Public License does not permit incorporating your program into
336
- proprietary programs. If your program is a subroutine library, you may
337
- consider it more useful to permit linking proprietary applications with the
338
- library. If this is what you want to do, use the GNU Lesser General
339
- Public License instead of this License.