spidr 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +13 -0
- data/Manifest.txt +11 -0
- data/README.txt +55 -0
- data/Rakefile +13 -0
- data/lib/spidr.rb +3 -0
- data/lib/spidr/agent.rb +490 -0
- data/lib/spidr/page.rb +159 -0
- data/lib/spidr/rules.rb +61 -0
- data/lib/spidr/spidr.rb +48 -0
- data/lib/spidr/version.rb +3 -0
- data/test/test_spidr.rb +0 -0
- metadata +84 -0
data/History.txt
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
=== 0.1.0 / 2008-05-23
|
2
|
+
|
3
|
+
* Initial release.
|
4
|
+
* Black-list or white-list URLs based upon:
|
5
|
+
* Host name
|
6
|
+
* Port number
|
7
|
+
* Full link
|
8
|
+
* URL extension
|
9
|
+
* Provides call-backs for:
|
10
|
+
* Every visited Page.
|
11
|
+
* Every visited URL.
|
12
|
+
* Every visited URL that matches a specified pattern.
|
13
|
+
|
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= Spidr
|
2
|
+
|
3
|
+
* http://spidr.rubyforge.org/
|
4
|
+
* Postmodern Modulus III (postmodern.mod3@gmail.com)
|
5
|
+
|
6
|
+
== DESCRIPTION:
|
7
|
+
|
8
|
+
Spidr is a versatile Ruby web spidering library that can spider a site,
|
9
|
+
multiple domains, certain links or infinitely. Spidr is designed to be fast
|
10
|
+
and easy to use.
|
11
|
+
|
12
|
+
== FEATURES/PROBLEMS:
|
13
|
+
|
14
|
+
* Black-list or white-list URLs based upon:
|
15
|
+
* Host name
|
16
|
+
* Port number
|
17
|
+
* Full link
|
18
|
+
* URL extension
|
19
|
+
* Provides call-backs for:
|
20
|
+
* Every visited Page.
|
21
|
+
* Every visited URL.
|
22
|
+
* Every visited URL that matches a specified pattern.
|
23
|
+
|
24
|
+
== REQUIREMENTS:
|
25
|
+
|
26
|
+
* Hpricot
|
27
|
+
|
28
|
+
== INSTALL:
|
29
|
+
|
30
|
+
$ sudo gem install spidr
|
31
|
+
|
32
|
+
== LICENSE:
|
33
|
+
|
34
|
+
The MIT License
|
35
|
+
|
36
|
+
Copyright (c) 2008 Hal Brodigan
|
37
|
+
|
38
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
39
|
+
a copy of this software and associated documentation files (the
|
40
|
+
'Software'), to deal in the Software without restriction, including
|
41
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
42
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
43
|
+
permit persons to whom the Software is furnished to do so, subject to
|
44
|
+
the following conditions:
|
45
|
+
|
46
|
+
The above copyright notice and this permission notice shall be
|
47
|
+
included in all copies or substantial portions of the Software.
|
48
|
+
|
49
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
50
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
51
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
52
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
53
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
54
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
55
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# -*- ruby -*-

require 'rubygems'
require 'hoe'
# Defines Spidr::VERSION, used below so the gem version lives in one place.
require './lib/spidr/version.rb'

# Hoe supplies the standard gem rake tasks (package, docs, release),
# configured here for the spidr gem.
Hoe.new('spidr', Spidr::VERSION) do |p|
  p.rubyforge_name = 'spidr'
  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
  # hpricot is the gem's only runtime dependency (HTML parsing in Page)
  p.extra_deps = ['hpricot']
end

# vim: syntax=Ruby
|
data/lib/spidr.rb
ADDED
data/lib/spidr/agent.rb
ADDED
@@ -0,0 +1,490 @@
|
|
1
|
+
require 'spidr/rules'
require 'spidr/page'
require 'spidr/spidr'

require 'net/http'
require 'hpricot'

module Spidr
  #
  # Agent crawls the web starting from a given URL. It keeps a queue of
  # URLs to visit and a history of URLs already visited; whether a URL is
  # visited at all is decided by accept/reject Rules applied to its host,
  # port, full link and file extension.
  #
  class Agent

    # URL schemes to visit
    SCHEMES = ['http', 'https']

    # Proxy to use
    attr_accessor :proxy

    # User-Agent to use
    attr_accessor :user_agent

    # Referer to use
    attr_accessor :referer

    # Delay in seconds between fetching pages
    attr_accessor :delay

    # History containing visited URLs
    attr_accessor :history

    #
    # Creates a new Agent object with the given _options_ and _block_.
    # If a _block_ is given, it will be passed the newly created
    # Agent object.
    #
    # _options_ may contain the following keys:
    # <tt>:proxy</tt>:: The proxy to use while spidering.
    # <tt>:user_agent</tt>:: The User-Agent string to send.
    # <tt>:referer</tt>:: The referer URL to send.
    # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
    #                   link. Defaults to 0.
    # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
    # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
    # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
    # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
    # <tt>:links</tt>:: An +Array+ of link patterns to visit.
    # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
    # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
    # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
    #                         visit.
    #
    def initialize(options={},&block)
      @proxy = (options[:proxy] || Spidr.proxy)
      @user_agent = (options[:user_agent] || Spidr.user_agent)
      @referer = options[:referer]

      @host_rules = Rules.new(:accept => options[:hosts],
                              :reject => options[:ignore_hosts])
      @port_rules = Rules.new(:accept => options[:ports],
                              :reject => options[:ignore_ports])
      @link_rules = Rules.new(:accept => options[:links],
                              :reject => options[:ignore_links])
      @ext_rules = Rules.new(:accept => options[:exts],
                             :reject => options[:ignore_exts])

      @every_url_blocks = []
      # block-form Hash.new gives each pattern its own Array
      # (Hash.new([]) would share one Array between every key)
      @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }

      @every_page_blocks = []

      @delay = (options[:delay] || 0)
      @history = []
      @queue = []

      block.call(self) if block
    end

    #
    # Creates a new Agent object with the given _options_ and will begin
    # spidering at the specified _url_. If a _block_ is given it will be
    # passed the newly created Agent object, before the agent begins
    # spidering.
    #
    def self.start_at(url,options={},&block)
      self.new(options) do |spider|
        block.call(spider) if block

        spider.start_at(url)
      end
    end

    #
    # Creates a new Agent object with the given _options_ and will begin
    # spidering the specified host _name_. If a _block_ is given it will be
    # passed the newly created Agent object, before the agent begins
    # spidering.
    #
    def self.host(name,options={},&block)
      self.new(options.merge(:hosts => [name.to_s])) do |spider|
        block.call(spider) if block

        spider.start_at("http://#{name}/")
      end
    end

    #
    # Creates a new Agent object with the given _options_ and will begin
    # spidering the host of the specified _url_. If a _block_ is given it
    # will be passed the newly created Agent object, before the agent
    # begins spidering.
    #
    def self.site(url,options={},&block)
      url = URI(url.to_s)

      return self.new(options.merge(:hosts => [url.host])) do |spider|
        block.call(spider) if block

        spider.start_at(url)
      end
    end

    #
    # Returns the +Array+ of host patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds the given _pattern_ to the visit_hosts. If a _block_ is given,
    # it will be added to the visit_hosts.
    #
    def visit_hosts_like(pattern=nil,&block)
      if pattern
        visit_hosts << pattern
      elsif block
        visit_hosts << block
      end

      return self
    end

    #
    # Returns the +Array+ of URL host patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given,
    # it will be added to the ignore_hosts.
    #
    def ignore_hosts_like(pattern=nil,&block)
      if pattern
        ignore_hosts << pattern
      elsif block
        ignore_hosts << block
      end

      return self
    end

    #
    # Returns the +Array+ of URL port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds the given _pattern_ to the visit_ports. If a _block_ is given,
    # it will be added to the visit_ports.
    #
    def visit_ports_like(pattern=nil,&block)
      if pattern
        visit_ports << pattern
      elsif block
        visit_ports << block
      end

      return self
    end

    #
    # Returns the +Array+ of URL port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds the given _pattern_ to the ignore_ports. If a _block_ is given,
    # it will be added to the ignore_ports.
    #
    def ignore_ports_like(pattern=nil,&block)
      if pattern
        ignore_ports << pattern
      elsif block
        ignore_ports << block
      end

      return self
    end

    #
    # Returns the +Array+ of link patterns to visit.
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds the given _pattern_ to the visit_links. If a _block_ is given,
    # it will be added to the visit_links.
    #
    def visit_links_like(pattern=nil,&block)
      if pattern
        visit_links << pattern
      elsif block
        visit_links << block
      end

      return self
    end

    #
    # Returns the +Array+ of link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds the given _pattern_ to the ignore_links. If a _block_ is given,
    # it will be added to the ignore_links.
    #
    def ignore_links_like(pattern=nil,&block)
      if pattern
        ignore_links << pattern
      elsif block
        ignore_links << block
      end

      return self
    end

    #
    # Returns the +Array+ of URL extension patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds the given _pattern_ to the visit_exts. If a _block_ is given,
    # it will be added to the visit_exts.
    #
    def visit_exts_like(pattern=nil,&block)
      if pattern
        visit_exts << pattern
      elsif block
        visit_exts << block
      end

      return self
    end

    #
    # Returns the +Array+ of URL extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
    # it will be added to the ignore_exts.
    #
    def ignore_exts_like(pattern=nil,&block)
      # BUG FIX: the signature previously omitted the pattern argument,
      # so the `if pattern` below raised NameError whenever this method
      # was called. Now matches every other *_like method.
      if pattern
        ignore_exts << pattern
      elsif block
        ignore_exts << block
      end

      return self
    end

    #
    # For every URL that the agent visits it will be passed to the
    # specified _block_.
    #
    def every_url(&block)
      @every_url_blocks << block
      return self
    end

    #
    # For every URL that the agent visits and matches the specified
    # _pattern_, it will be passed to the specified _block_.
    #
    def urls_like(pattern,&block)
      @urls_like_blocks[pattern] << block
      return self
    end

    #
    # For every Page that the agent visits it will be passed to the
    # specified _block_.
    #
    def every_page(&block)
      @every_page_blocks << block
      return self
    end

    #
    # Clear the history and start spidering at the specified _url_.
    #
    def start_at(url)
      @history.clear
      return run(url)
    end

    #
    # Start spidering at the specified _url_.
    #
    def run(url)
      enqueue(url)

      until @queue.empty?
        visit_page(dequeue)
      end

      return self
    end

    #
    # Returns the +Array+ of visited URLs.
    #
    def visited_urls
      @history
    end

    #
    # Returns the +Array+ of visited links (URLs as +String+s).
    #
    def visited_links
      @history.map { |uri| uri.to_s }
    end

    #
    # Return the +Array+ of hosts that were visited.
    #
    def visited_hosts
      @history.map { |uri| uri.host }.uniq
    end

    #
    # Returns +true+ if the specified _url_ was visited, returns +false+
    # otherwise.
    #
    def visited?(url)
      # @history stores URI objects (see visit_page), so normalize String
      # arguments to a URI before comparing. The previous code compared
      # URI(url).to_s (a String) against the stored URIs, which never
      # matched and caused pages to be revisited.
      url = URI(url.to_s) unless url.kind_of?(URI)

      return @history.include?(url)
    end

    protected

    #
    # Returns +true+ if the specified _url_ is queued for visiting, returns
    # +false+ otherwise.
    #
    def queued?(url)
      @queue.include?(url)
    end

    #
    # Enqueues the specified _url_ for visiting, only if it passes all the
    # agent's rules for visiting a given URL. Returns +true+ if the _url_
    # was successfully enqueued, returns +false+ otherwise.
    #
    def enqueue(url)
      link = url.to_s

      begin
        url = URI(link)
      rescue URI::InvalidURIError
        # skip unparsable hrefs instead of aborting the whole crawl
        return false
      end

      if (!(queued?(url)) && visit?(url))
        @every_url_blocks.each { |block| block.call(url) }

        @urls_like_blocks.each do |pattern,blocks|
          if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
            blocks.each { |url_block| url_block.call(url) }
          end
        end

        @queue << url
        return true
      end

      return false
    end

    #
    # Dequeues a URL that will later be visited.
    #
    def dequeue
      @queue.shift
    end

    #
    # Returns +true+ if the specified URL should be visited, returns
    # +false+ otherwise.
    #
    def visit?(url)
      (!(visited?(url)) &&
       visit_scheme?(url) &&
       visit_host?(url) &&
       visit_port?(url) &&
       visit_link?(url) &&
       visit_ext?(url))
    end

    #
    # Visits the specified _url_ and enqueues its links for visiting. If a
    # _block_ is given, it will be passed a newly created Page object
    # for the specified _url_.
    #
    def visit_page(url,&block)
      # honor the :delay option documented in #initialize (it was stored
      # but previously never used)
      sleep(@delay) if @delay > 0

      get_page(url) do |page|
        @history << page.url

        page.urls.each { |next_url| enqueue(next_url) }

        @every_page_blocks.each { |page_block| page_block.call(page) }

        block.call(page) if block
      end
    end

    private

    # Visit only http/https URLs; scheme-less (relative) URLs pass.
    def visit_scheme?(url)
      if url.scheme
        return SCHEMES.include?(url.scheme)
      else
        return true
      end
    end

    def visit_host?(url)
      @host_rules.accept?(url.host)
    end

    def visit_port?(url)
      @port_rules.accept?(url.port)
    end

    def visit_link?(url)
      @link_rules.accept?(url.to_s)
    end

    def visit_ext?(url)
      # File.extname includes the leading '.'; strip it for the rules
      @ext_rules.accept?(File.extname(url.path)[1..-1])
    end

    #
    # Fetches the specified _url_ and returns a new Page for it. If a
    # _block_ is given, it will be passed the newly created Page.
    #
    def get_page(url,&block)
      host = url.host
      port = url.port

      proxy_host = @proxy[:host]
      proxy_port = @proxy[:port]
      proxy_user = @proxy[:user]
      proxy_password = @proxy[:password]

      # NOTE(review): https URLs are requested over plain HTTP here; the
      # session is never switched to SSL — confirm whether that is intended.
      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
        headers = {}

        headers['User-Agent'] = @user_agent if @user_agent
        headers['Referer'] = @referer if @referer

        # keep the query string and default to '/' for URLs with an empty
        # path (url.path alone dropped both, fetching the wrong resource)
        path = url.path
        path = '/' if path.empty?
        path = "#{path}?#{url.query}" if url.query

        new_page = Page.new(url,sess.get(path,headers))

        block.call(new_page) if block
        return new_page
      end
    end

  end
end
|
data/lib/spidr/page.rb
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
require 'uri'
require 'hpricot'

module Spidr
  #
  # Page wraps a URL together with the HTTP response received for it,
  # exposing the body, content-type predicates and the links the page
  # contains.
  #
  class Page

    # URL of the page
    attr_reader :url

    #
    # Creates a new Page object from the specified _url_ and HTTP
    # _response_.
    #
    def initialize(url,response)
      @url = url
      @response = response
      # memoized Hpricot document, built lazily by #doc
      @doc = nil
    end

    #
    # Returns the content-type of the page.
    #
    def content_type
      @response['Content-Type']
    end

    #
    # Returns the headers returned with the body, as a +Hash+ of
    # header names to value arrays.
    #
    def headers
      # BUG FIX: this was an attr_reader over a @headers variable that
      # was never assigned, so it always returned nil.
      @response.to_hash
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a XML document, returns +false+
    # otherwise.
    #
    def xml?
      (content_type =~ /text\/xml/) == 0
    end

    #
    # Returns +true+ if the page is a Javascript file, returns +false+
    # otherwise.
    #
    def javascript?
      (content_type =~ /(text|application)\/javascript/) == 0
    end

    #
    # Returns +true+ if the page is a CSS file, returns +false+
    # otherwise.
    #
    def css?
      (content_type =~ /text\/css/) == 0
    end

    #
    # Returns +true+ if the page is a RSS/RDF feed, returns +false+
    # otherwise.
    #
    def rss?
      (content_type =~ /application\/(rss|rdf)\+xml/) == 0
    end

    #
    # Returns +true+ if the page is a Atom feed, returns +false+
    # otherwise.
    #
    def atom?
      (content_type =~ /application\/atom\+xml/) == 0
    end

    #
    # Returns the body of the page in +String+ form.
    #
    def body
      @response.body
    end

    #
    # Returns an Hpricot::Doc if the page represents a HTML document,
    # returns +nil+ otherwise.
    #
    def doc
      if html?
        return @doc ||= Hpricot(body)
      end
    end

    #
    # Returns all links from the HTML page.
    #
    def links
      if html?
        return doc.search('a[@href]').map do |a|
          a.attributes['href'].strip
        end
      end

      return []
    end

    #
    # Returns all links from the HTML page as absolute URLs.
    #
    def urls
      links.map { |link| to_absolute(link) }
    end

    protected

    #
    # Converts the specified _link_ into an absolute URL
    # based on the url of the page.
    #
    def to_absolute(link)
      # strip the fragment before parsing
      # NOTE(review): URI.encode is obsolete in modern Ruby — confirm
      # the supported Ruby versions before replacing it.
      link = URI.encode(link.to_s.gsub(/#.*$/,''))
      relative = URI(link)

      if relative.scheme.nil?
        new_url = @url.clone

        if relative.path[0..0] == '/'
          # absolute path: replaces the page's path wholesale
          new_url.path = relative.path
        elsif relative.path[-1..-1] == '/'
          # directory-style relative path: join onto the page's path
          new_url.path = File.expand_path(File.join(new_url.path,relative.path))
        elsif !(relative.path.empty?)
          # file-style relative path: join onto the page's directory
          new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
        end

        return new_url
      end

      return relative
    end

    #
    # Provides transparent access to response header values: a call such
    # as +page.content_length+ reads the "content-length" header.
    #
    def method_missing(sym,*args,&block)
      if (args.empty? && block.nil?)
        # BUG FIX: sub('_','-') only replaced the FIRST underscore, so
        # multi-word headers like x_forwarded_for never resolved; tr
        # translates every occurrence.
        name = sym.id2name.tr('_','-')

        return @response[name] if @response.has_key?(name)
      end

      return super(sym,*args,&block)
    end

    # Keep respond_to? consistent with the header access provided by
    # method_missing above.
    def respond_to_missing?(sym,include_private=false)
      @response.has_key?(sym.id2name.tr('_','-')) || super
    end

  end
end
|
data/lib/spidr/rules.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
module Spidr
  #
  # Rules holds two lists of patterns — accept rules and reject rules —
  # and decides whether a given field (host, port, link, extension, ...)
  # should be accepted. Patterns may be Procs, Regexps or plain values.
  #
  class Rules

    # Accept rules
    attr_reader :accept

    # Reject rules
    attr_reader :reject

    def initialize(options={})
      @accept = (options[:accept] || [])
      @reject = (options[:reject] || [])
    end

    #
    # Returns +true+ if the _field_ is accepted by the rules,
    # returns +false+ otherwise. When any accept rules exist they form a
    # white-list (the field must match one of them); otherwise the reject
    # rules form a black-list (the field must match none of them).
    #
    def accept?(field)
      if @accept.empty?
        @reject.none? { |rule| matches?(field,rule) }
      else
        @accept.any? { |rule| matches?(field,rule) }
      end
    end

    #
    # Returns +true+ if the _field_ is rejected by the rules,
    # returns +false+ otherwise.
    #
    def reject?(field)
      !(accept?(field))
    end

    protected

    #
    # Tests the specified _field_ against the specified _rule_. Returns
    # +true+ when the _rule_ matches the specified _field_, returns
    # +false+ otherwise.
    #
    def matches?(field,rule)
      case rule
      when Proc
        # a Proc rule must return exactly true to match
        rule.call(field) == true
      when Regexp
        !((field.to_s =~ rule).nil?)
      else
        field == rule
      end
    end

  end
end
|
data/lib/spidr/spidr.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'spidr/agent'

module Spidr
  # Common proxy port.
  COMMON_PROXY_PORT = 8080

  #
  # Returns the +Hash+ of the Spidr proxy information, with the keys
  # :host, :port, :user and :password.
  #
  def Spidr.proxy
    # module-level instance variable instead of a @@class variable
    # (class variables are shared across the inheritance tree and are
    # a well-known Ruby pitfall)
    @spidr_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
  end

  #
  # Sets the Spidr proxy information to the specified _new_proxy_ +Hash+.
  #
  def Spidr.proxy=(new_proxy)
    @spidr_proxy = new_proxy
  end

  #
  # Returns the Spidr User-Agent
  #
  def Spidr.user_agent
    @spidr_user_agent
  end

  #
  # Sets the Spidr Web User-Agent to the specified _new_agent_.
  #
  def Spidr.user_agent=(new_agent)
    @spidr_user_agent = new_agent
  end

  #
  # See Agent.start_at.
  #
  def Spidr.start_at(url,options={},&block)
    Agent.start_at(url,options,&block)
  end

  #
  # See Agent.host.
  #
  def Spidr.host(name,options={},&block)
    Agent.host(name,options,&block)
  end

  #
  # See Agent.site.
  #
  def Spidr.site(url,options={},&block)
    Agent.site(url,options,&block)
  end
end
|
data/test/test_spidr.rb
ADDED
File without changes
|
metadata
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spidr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Postmodern Modulus III
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-05-23 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: hoe
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 1.5.3
|
32
|
+
version:
|
33
|
+
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
34
|
+
email:
|
35
|
+
- postmodern.mod3@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- History.txt
|
42
|
+
- Manifest.txt
|
43
|
+
- README.txt
|
44
|
+
files:
|
45
|
+
- History.txt
|
46
|
+
- Manifest.txt
|
47
|
+
- README.txt
|
48
|
+
- Rakefile
|
49
|
+
- lib/spidr.rb
|
50
|
+
- lib/spidr/page.rb
|
51
|
+
- lib/spidr/rules.rb
|
52
|
+
- lib/spidr/agent.rb
|
53
|
+
- lib/spidr/spidr.rb
|
54
|
+
- lib/spidr/version.rb
|
55
|
+
- test/test_spidr.rb
|
56
|
+
has_rdoc: true
|
57
|
+
homepage: http://spidr.rubyforge.org/
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options:
|
60
|
+
- --main
|
61
|
+
- README.txt
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: "0"
|
69
|
+
version:
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: "0"
|
75
|
+
version:
|
76
|
+
requirements: []
|
77
|
+
|
78
|
+
rubyforge_project: spidr
|
79
|
+
rubygems_version: 1.1.1
|
80
|
+
signing_key:
|
81
|
+
specification_version: 2
|
82
|
+
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|
83
|
+
test_files:
|
84
|
+
- test/test_spidr.rb
|