spidr 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,13 @@
1
+ === 0.1.0 / 2008-05-23
2
+
3
+ * Initial release.
4
+ * Black-list or white-list URLs based upon:
5
+ * Host name
6
+ * Port number
7
+ * Full link
8
+ * URL extension
9
+ * Provides call-backs for:
10
+ * Every visited Page.
11
+ * Every visited URL.
12
+ * Every visited URL that matches a specified pattern.
13
+
@@ -0,0 +1,11 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ lib/spidr.rb
6
+ lib/spidr/page.rb
7
+ lib/spidr/rules.rb
8
+ lib/spidr/agent.rb
9
+ lib/spidr/spidr.rb
10
+ lib/spidr/version.rb
11
+ test/test_spidr.rb
@@ -0,0 +1,55 @@
1
+ = Spidr
2
+
3
+ * http://spidr.rubyforge.org/
4
+ * Postmodern Modulus III (postmodern.mod3@gmail.com)
5
+
6
+ == DESCRIPTION:
7
+
8
+ Spidr is a versatile Ruby web spidering library that can spider a site,
9
+ multiple domains, certain links or infinitely. Spidr is designed to be fast
10
+ and easy to use.
11
+
12
+ == FEATURES/PROBLEMS:
13
+
14
+ * Black-list or white-list URLs based upon:
15
+ * Host name
16
+ * Port number
17
+ * Full link
18
+ * URL extension
19
+ * Provides call-backs for:
20
+ * Every visited Page.
21
+ * Every visited URL.
22
+ * Every visited URL that matches a specified pattern.
23
+
24
+ == REQUIREMENTS:
25
+
26
+ * Hpricot
27
+
28
+ == INSTALL:
29
+
30
+ $ sudo gem install spidr
31
+
32
+ == LICENSE:
33
+
34
+ The MIT License
35
+
36
+ Copyright (c) 2008 Hal Brodigan
37
+
38
+ Permission is hereby granted, free of charge, to any person obtaining
39
+ a copy of this software and associated documentation files (the
40
+ 'Software'), to deal in the Software without restriction, including
41
+ without limitation the rights to use, copy, modify, merge, publish,
42
+ distribute, sublicense, and/or sell copies of the Software, and to
43
+ permit persons to whom the Software is furnished to do so, subject to
44
+ the following conditions:
45
+
46
+ The above copyright notice and this permission notice shall be
47
+ included in all copies or substantial portions of the Software.
48
+
49
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
50
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
51
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
52
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
53
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
54
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
55
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,13 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/spidr/version.rb'
6
+
7
+ Hoe.new('spidr', Spidr::VERSION) do |p|
8
+ p.rubyforge_name = 'spidr'
9
+ p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
10
+ p.extra_deps = ['hpricot']
11
+ end
12
+
13
+ # vim: syntax=Ruby
@@ -0,0 +1,3 @@
1
+ require 'spidr/agent'
2
+ require 'spidr/spidr'
3
+ require 'spidr/version'
@@ -0,0 +1,490 @@
1
+ require 'spidr/rules'
2
+ require 'spidr/page'
3
+ require 'spidr/spidr'
4
+
5
+ require 'net/http'
6
+ require 'hpricot'
7
+
8
+ module Spidr
9
+ class Agent
10
+
11
+ # URL schemes to visit
12
+ SCHEMES = ['http', 'https']
13
+
14
+ # Proxy to use
15
+ attr_accessor :proxy
16
+
17
+ # User-Agent to use
18
+ attr_accessor :user_agent
19
+
20
+ # Referer to use
21
+ attr_accessor :referer
22
+
23
+ # Delay in between fetching pages
24
+ attr_accessor :delay
25
+
26
+ # History containing visited URLs
27
+ attr_accessor :history
28
+
29
+ #
30
+ # Creates a new Agent object with the given _options_ and _block_.
31
+ # If a _block_ is given, it will be passed the newly created
32
+ # Agent object.
33
+ #
34
+ # _options_ may contain the following keys:
35
+ # <tt>:proxy</tt>:: The proxy to use while spidering.
36
+ # <tt>:user_agent</tt>:: the User-Agent string to send.
37
+ # <tt>:referer</tt>:: The referer URL to send.
38
+ # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
39
+ # link. Defaults to 0.
40
+ # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
41
+ # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
42
+ # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
43
+ # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
44
+ # <tt>:links</tt>:: An +Array+ of link patterns to visit.
45
+ # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
46
+ # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
47
+ # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
48
+ # visit.
49
+ #
50
+ def initialize(options={},&block)
51
+ @proxy = (options[:proxy] || Spidr.proxy)
52
+ @user_agent = (options[:user_agent] || Spidr.user_agent)
53
+ @referer = options[:referer]
54
+
55
+ @host_rules = Rules.new(:accept => options[:hosts],
56
+ :reject => options[:ignore_hosts])
57
+ @port_rules = Rules.new(:accept => options[:ports],
58
+ :reject => options[:ignore_ports])
59
+ @link_rules = Rules.new(:accept => options[:links],
60
+ :reject => options[:ignore_links])
61
+ @ext_rules = Rules.new(:accept => options[:exts],
62
+ :reject => options[:ignore_exts])
63
+
64
+ @every_url_blocks = []
65
+ @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
66
+
67
+ @every_page_blocks = []
68
+
69
+ @delay = (options[:delay] || 0)
70
+ @history = []
71
+ @queue = []
72
+
73
+ block.call(self) if block
74
+ end
75
+
76
+ #
77
+ # Creates a new Agent object with the given _options_ and will begin
78
+ # spidering at the specified _url_. If a _block_ is given it will be
79
+ # passed the newly created Agent object, before the agent begins
80
+ # spidering.
81
+ #
82
+ def self.start_at(url,options={},&block)
83
+ self.new(options) do |spider|
84
+ block.call(spider) if block
85
+
86
+ spider.start_at(url)
87
+ end
88
+ end
89
+
90
+ #
91
+ # Creates a new Agent object with the given _options_ and will begin
92
+ # spidering the specified host _name_. If a _block_ is given it will be
93
+ # passed the newly created Agent object, before the agent begins
94
+ # spidering.
95
+ #
96
+ def self.host(name,options={},&block)
97
+ self.new(options.merge(:hosts => [name.to_s])) do |spider|
98
+ block.call(spider) if block
99
+
100
+ spider.start_at("http://#{name}/")
101
+ end
102
+ end
103
+
104
+ #
105
+ # Creates a new Agent object with the given _options_ and will begin
106
+ # spidering the host of the specified _url_. If a _block_ is given it
107
+ # will be passed the newly created Agent object, before the agent
108
+ # begins spidering.
109
+ #
110
+ def self.site(url,options={},&block)
111
+ url = URI(url.to_s)
112
+
113
+ return self.new(options.merge(:hosts => [url.host])) do |spider|
114
+ block.call(spider) if block
115
+
116
+ spider.start_at(url)
117
+ end
118
+ end
119
+
120
+ #
121
+ # Returns the +Array+ of host patterns to visit.
122
+ #
123
+ def visit_hosts
124
+ @host_rules.accept
125
+ end
126
+
127
+ #
128
+ # Adds the given _pattern_ to the visit_hosts. If a _block_ is given,
129
+ # it will be added to the visit_hosts.
130
+ #
131
+ def visit_hosts_like(pattern=nil,&block)
132
+ if pattern
133
+ visit_hosts << pattern
134
+ elsif block
135
+ visit_hosts << block
136
+ end
137
+
138
+ return self
139
+ end
140
+
141
+ #
142
+ # Returns the +Array+ of URL host patterns to not visit.
143
+ #
144
+ def ignore_hosts
145
+ @host_rules.reject
146
+ end
147
+
148
+ #
149
+ # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given,
150
+ # it will be added to the ignore_hosts.
151
+ #
152
+ def ignore_hosts_like(pattern=nil,&block)
153
+ if pattern
154
+ ignore_hosts << pattern
155
+ elsif block
156
+ ignore_hosts << block
157
+ end
158
+
159
+ return self
160
+ end
161
+
162
+ #
163
+ # Returns the +Array+ of URL port patterns to visit.
164
+ #
165
+ def visit_ports
166
+ @port_rules.accept
167
+ end
168
+
169
+ #
170
+ # Adds the given _pattern_ to the visit_ports. If a _block_ is given,
171
+ # it will be added to the visit_ports.
172
+ #
173
+ def visit_ports_like(pattern=nil,&block)
174
+ if pattern
175
+ visit_ports << pattern
176
+ elsif block
177
+ visit_ports << block
178
+ end
179
+
180
+ return self
181
+ end
182
+
183
+ #
184
+ # Returns the +Array+ of URL port patterns to not visit.
185
+ #
186
+ def ignore_ports
187
+ @port_rules.reject
188
+ end
189
+
190
+ #
191
+ # Adds the given _pattern_ to the ignore_ports. If a _block_ is given,
192
+ # it will be added to the ignore_ports.
193
+ #
194
+ def ignore_ports_like(pattern=nil,&block)
195
+ if pattern
196
+ ignore_ports << pattern
197
+ elsif block
198
+ ignore_ports << block
199
+ end
200
+
201
+ return self
202
+ end
203
+
204
+ #
205
+ # Returns the +Array+ of link patterns to visit.
206
+ #
207
+ def visit_links
208
+ @link_rules.accept
209
+ end
210
+
211
+ #
212
+ # Adds the given _pattern_ to the visit_links. If a _block_ is given,
213
+ # it will be added to the visit_links.
214
+ #
215
+ def visit_links_like(pattern=nil,&block)
216
+ if pattern
217
+ visit_links << pattern
218
+ elsif block
219
+ visit_links << block
220
+ end
221
+
222
+ return self
223
+ end
224
+
225
+ #
226
+ # Returns the +Array+ of link patterns to not visit.
227
+ #
228
+ def ignore_links
229
+ @link_rules.reject
230
+ end
231
+
232
+ #
233
+ # Adds the given _pattern_ to the ignore_links. If a _block_ is given,
234
+ # it will be added to the ignore_links.
235
+ #
236
+ def ignore_links_like(pattern=nil,&block)
237
+ if pattern
238
+ ignore_links << pattern
239
+ elsif block
240
+ ignore_links << block
241
+ end
242
+
243
+ return self
244
+ end
245
+
246
+ #
247
+ # Returns the +Array+ of URL extension patterns to visit.
248
+ #
249
+ def visit_exts
250
+ @ext_rules.accept
251
+ end
252
+
253
+ #
254
+ # Adds the given _pattern_ to the visit_exts. If a _block_ is given,
255
+ # it will be added to the visit_exts.
256
+ #
257
+ def visit_exts_like(pattern=nil,&block)
258
+ if pattern
259
+ visit_exts << pattern
260
+ elsif block
261
+ visit_exts << block
262
+ end
263
+
264
+ return self
265
+ end
266
+
267
+ #
268
+ # Returns the +Array+ of URL extension patterns to not visit.
269
+ #
270
+ def ignore_exts
271
+ @ext_rules.reject
272
+ end
273
+
274
+ #
275
+ # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
276
+ # it will be added to the ignore_exts.
277
+ #
278
+ def ignore_exts_like(&block)
279
+ if pattern
280
+ ignore_exts << pattern
281
+ elsif block
282
+ ignore_exts << block
283
+ end
284
+
285
+ return self
286
+ end
287
+
288
+ #
289
+ # For every URL that the agent visits it will be passed to the
290
+ # specified _block_.
291
+ #
292
+ def every_url(&block)
293
+ @every_url_blocks << block
294
+ return self
295
+ end
296
+
297
+ #
298
+ # For every URL that the agent visits and matches the specified
299
+ # _pattern_, it will be passed to the specified _block_.
300
+ #
301
+ def urls_like(pattern,&block)
302
+ @urls_like_blocks[pattern] << block
303
+ return self
304
+ end
305
+
306
+ #
307
+ # For every Page that the agent visits it will be passed to the
308
+ # specified _block_.
309
+ #
310
+ def every_page(&block)
311
+ @every_page_blocks << block
312
+ return self
313
+ end
314
+
315
+ #
316
+ # Clear the history and start spidering at the specified _url_.
317
+ #
318
+ def start_at(url)
319
+ @history.clear
320
+ return run(url)
321
+ end
322
+
323
+ #
324
+ # Start spidering at the specified _url_.
325
+ #
326
+ def run(url)
327
+ enqueue(url)
328
+
329
+ until @queue.empty?
330
+ visit_page(dequeue)
331
+ end
332
+
333
+ return self
334
+ end
335
+
336
+ #
337
+ # Returns the +Array+ of visited URLs.
338
+ #
339
+ def visited_urls
340
+ @history
341
+ end
342
+
343
+ #
344
+ # Returns the +Array+ of visited URLs, converted to +Strings+.
345
+ #
346
+ def visited_links
347
+ @history.map { |uri| uri.to_s }
348
+ end
349
+
350
+ #
351
+ # Return the +Array+ of hosts that were visited.
352
+ #
353
+ def visited_hosts
354
+ @history.map { |uri| uri.host }.uniq
355
+ end
356
+
357
+ #
358
+ # Returns +true+ if the specified _url_ was visited, returns +false+
359
+ # otherwise.
360
+ #
361
+ def visited?(url)
362
+ if url.kind_of?(URI)
363
+ return @history.include?(url)
364
+ else
365
+ return @history.include?(URI(url).to_s)
366
+ end
367
+ end
368
+
369
+ protected
370
+
371
+ #
372
+ # Returns +true+ if the specified _url_ is queued for visiting, returns
373
+ # +false+ otherwise.
374
+ #
375
+ def queued?(url)
376
+ @queue.include?(url)
377
+ end
378
+
379
+ #
380
+ # Enqueues the specified _url_ for visiting, only if it passes all the
381
+ # agent's rules for visiting a given URL. Returns +true+ if the _url_
382
+ # was successfully enqueued, returns +false+ otherwise.
383
+ #
384
+ def enqueue(url)
385
+ link = url.to_s
386
+ url = URI(link)
387
+
388
+ if (!(queued?(url)) && visit?(url))
389
+ @every_url_blocks.each { |block| block.call(url) }
390
+
391
+ @urls_like_blocks.each do |pattern,blocks|
392
+ if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
393
+ blocks.each { |url_block| url_block.call(url) }
394
+ end
395
+ end
396
+
397
+ @queue << url
398
+ return true
399
+ end
400
+
401
+ return false
402
+ end
403
+
404
+ #
405
+ # Dequeues a URL that will later be visited.
406
+ #
407
+ def dequeue
408
+ @queue.shift
409
+ end
410
+
411
+ #
412
+ # Returns +true+ if the specified URL should be visited, returns
413
+ # +false+ otherwise.
414
+ #
415
+ def visit?(url)
416
+ (!(visited?(url)) &&
417
+ visit_scheme?(url) &&
418
+ visit_host?(url) &&
419
+ visit_port?(url) &&
420
+ visit_link?(url) &&
421
+ visit_ext?(url))
422
+ end
423
+
424
+ #
425
+ # Visits the specified _url_ and enqueues its links for visiting. If a
426
+ # _block_ is given, it will be passed a newly created Page object
427
+ # for the specified _url_.
428
+ #
429
+ def visit_page(url,&block)
430
+ get_page(url) do |page|
431
+ @history << page.url
432
+
433
+ page.urls.each { |next_url| enqueue(next_url) }
434
+
435
+ @every_page_blocks.each { |page_block| page_block.call(page) }
436
+
437
+ block.call(page) if block
438
+ end
439
+ end
440
+
441
+ private
442
+
443
+ def visit_scheme?(url)
444
+ if url.scheme
445
+ return SCHEMES.include?(url.scheme)
446
+ else
447
+ return true
448
+ end
449
+ end
450
+
451
+ def visit_host?(url)
452
+ @host_rules.accept?(url.host)
453
+ end
454
+
455
+ def visit_port?(url)
456
+ @port_rules.accept?(url.port)
457
+ end
458
+
459
+ def visit_link?(url)
460
+ @link_rules.accept?(url.to_s)
461
+ end
462
+
463
+ def visit_ext?(url)
464
+ @ext_rules.accept?(File.extname(url.path)[1..-1])
465
+ end
466
+
467
+ def get_page(url,&block)
468
+ host = url.host
469
+ port = url.port
470
+
471
+ proxy_host = @proxy[:host]
472
+ proxy_port = @proxy[:port]
473
+ proxy_user = @proxy[:user]
474
+ proxy_password = @proxy[:password]
475
+
476
+ Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
477
+ headers = {}
478
+
479
+ headers['User-Agent'] = @user_agent if @user_agent
480
+ headers['Referer'] = @referer if @referer
481
+
482
+ new_page = Page.new(url,sess.get(url.path,headers))
483
+
484
+ block.call(new_page) if block
485
+ return new_page
486
+ end
487
+ end
488
+
489
+ end
490
+ end
@@ -0,0 +1,159 @@
1
+ require 'uri'
2
+ require 'hpricot'
3
+
4
+ module Spidr
5
+ class Page
6
+
7
+ # URL of the page
8
+ attr_reader :url
9
+
10
+ # Body returned for the page
11
+ attr_reader :body
12
+
13
+ # Headers returned with the body
14
+ attr_reader :headers
15
+
16
+ #
17
+ # Creates a new Page object from the specified _url_ and HTTP
18
+ # _response_.
19
+ #
20
+ def initialize(url,response)
21
+ @url = url
22
+ @response = response
23
+ @doc = nil
24
+ end
25
+
26
+ #
27
+ # Returns the content-type of the page.
28
+ #
29
+ def content_type
30
+ @response['Content-Type']
31
+ end
32
+
33
+ #
34
+ # Returns +true+ if the page is a HTML document, returns +false+
35
+ # otherwise.
36
+ #
37
+ def html?
38
+ (content_type =~ /text\/html/) == 0
39
+ end
40
+
41
+ #
42
+ # Returns +true+ if the page is a XML document, returns +false+
43
+ # otherwise.
44
+ #
45
+ def xml?
46
+ (content_type =~ /text\/xml/) == 0
47
+ end
48
+
49
+ #
50
+ # Returns +true+ if the page is a Javascript file, returns +false+
51
+ # otherwise.
52
+ #
53
+ def javascript?
54
+ (content_type =~ /(text|application)\/javascript/) == 0
55
+ end
56
+
57
+ #
58
+ # Returns +true+ if the page is a CSS file, returns +false+
59
+ # otherwise.
60
+ #
61
+ def css?
62
+ (content_type =~ /text\/css/) == 0
63
+ end
64
+
65
+ #
66
+ # Returns +true+ if the page is a RSS/RDF feed, returns +false+
67
+ # otherwise.
68
+ #
69
+ def rss?
70
+ (content_type =~ /application\/(rss|rdf)\+xml/) == 0
71
+ end
72
+
73
+ #
74
+ # Returns +true+ if the page is a Atom feed, returns +false+
75
+ # otherwise.
76
+ #
77
+ def atom?
78
+ (content_type =~ /application\/atom\+xml/) == 0
79
+ end
80
+
81
+ #
82
+ # Returns the body of the page in +String+ form.
83
+ #
84
+ def body
85
+ @response.body
86
+ end
87
+
88
+ #
89
+ # Returns an Hpricot::Doc if the page represents a HTML document,
90
+ # returns +nil+ otherwise.
91
+ #
92
+ def doc
93
+ if html?
94
+ return @doc ||= Hpricot(body)
95
+ end
96
+ end
97
+
98
+ #
99
+ # Returns all links from the HTML page.
100
+ #
101
+ def links
102
+ if html?
103
+ return doc.search('a[@href]').map do |a|
104
+ a.attributes['href'].strip
105
+ end
106
+ end
107
+
108
+ return []
109
+ end
110
+
111
+ #
112
+ # Returns all links from the HTML page as absolute URLs.
113
+ #
114
+ def urls
115
+ links.map { |link| to_absolute(link) }
116
+ end
117
+
118
+ protected
119
+
120
+ #
121
+ # Converts the specified _link_ into an absolute URL
122
+ # based on the url of the page.
123
+ #
124
+ def to_absolute(link)
125
+ link = URI.encode(link.to_s.gsub(/#.*$/,''))
126
+ relative = URI(link)
127
+
128
+ if relative.scheme.nil?
129
+ new_url = @url.clone
130
+
131
+ if relative.path[0..0] == '/'
132
+ new_url.path = relative.path
133
+ elsif relative.path[-1..-1] == '/'
134
+ new_url.path = File.expand_path(File.join(new_url.path,relative.path))
135
+ elsif !(relative.path.empty?)
136
+ new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
137
+ end
138
+
139
+ return new_url
140
+ end
141
+
142
+ return relative
143
+ end
144
+
145
+ #
146
+ # Provides transparent access to the values in the +headers+ +Hash+.
147
+ #
148
+ def method_missing(sym,*args,&block)
149
+ if (args.empty? && block.nil?)
150
+ name = sym.id2name.sub('_','-')
151
+
152
+ return @response[name] if @response.has_key?(name)
153
+ end
154
+
155
+ return super(sym,*args,&block)
156
+ end
157
+
158
+ end
159
+ end
@@ -0,0 +1,61 @@
1
+ module Spidr
2
+ class Rules
3
+
4
+ # Accept rules
5
+ attr_reader :accept
6
+
7
+ # Reject rules
8
+ attr_reader :reject
9
+
10
+ def initialize(options={})
11
+ @accept = (options[:accept] || [])
12
+ @reject = (options[:reject] || [])
13
+ end
14
+
15
+ #
16
+ # Returns +true+ if the _field_ is accepted by the rules,
17
+ # returns +false+ otherwise.
18
+ #
19
+ def accept?(field)
20
+ unless @accept.empty?
21
+ @accept.each do |rule|
22
+ return true if test_field(field,rule)
23
+ end
24
+
25
+ return false
26
+ else
27
+ @reject.each do |rule|
28
+ return false if test_field(field,rule)
29
+ end
30
+
31
+ return true
32
+ end
33
+ end
34
+
35
+ #
36
+ # Returns +true+ if the _field_ is rejected by the rules,
37
+ # returns +false+ otherwise.
38
+ #
39
+ def reject?(field)
40
+ !(accept?(field))
41
+ end
42
+
43
+ protected
44
+
45
+ #
46
+ # Tests the specified _field_ against the specified _rule_. Returns
47
+ # +true+ when the _rule_ matches the specified _field_, returns
48
+ # +false+ otherwise.
49
+ #
50
+ def test_field(field,rule)
51
+ if rule.kind_of?(Proc)
52
+ return (rule.call(field) == true)
53
+ elsif rule.kind_of?(Regexp)
54
+ return !((field.to_s =~ rule).nil?)
55
+ else
56
+ return field == rule
57
+ end
58
+ end
59
+
60
+ end
61
+ end
@@ -0,0 +1,48 @@
1
+ require 'spidr/agent'
2
+
3
+ module Spidr
4
+ # Common proxy port.
5
+ COMMON_PROXY_PORT = 8080
6
+
7
+ #
8
+ # Returns the +Hash+ of the Spidr proxy information.
9
+ #
10
+ def Spidr.proxy
11
+ @@spidr_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
12
+ end
13
+
14
+ #
15
+ # Returns the Spidr User-Agent
16
+ #
17
+ def Spidr.user_agent
18
+ @@spidr_user_agent ||= nil
19
+ end
20
+
21
+ #
22
+ # Sets the Spidr Web User-Agent to the specified _new_agent_.
23
+ #
24
+ def Spidr.user_agent=(new_agent)
25
+ @@spidr_user_agent = new_agent
26
+ end
27
+
28
+ #
29
+ # See Agent.start_at.
30
+ #
31
+ def Spidr.start_at(url,options={},&block)
32
+ Agent.start_at(url,options,&block)
33
+ end
34
+
35
+ #
36
+ # See Agent.host.
37
+ #
38
+ def Spidr.host(name,options={},&block)
39
+ Agent.host(name,options,&block)
40
+ end
41
+
42
+ #
43
+ # See Agent.site.
44
+ #
45
+ def Spidr.site(url,options={},&block)
46
+ Agent.site(url,options,&block)
47
+ end
48
+ end
@@ -0,0 +1,3 @@
1
+ module Spidr
2
+ VERSION = '0.1.0'
3
+ end
File without changes
metadata ADDED
@@ -0,0 +1,84 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spidr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Postmodern Modulus III
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-05-23 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: hoe
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: 1.5.3
32
+ version:
33
+ description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
34
+ email:
35
+ - postmodern.mod3@gmail.com
36
+ executables: []
37
+
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - History.txt
42
+ - Manifest.txt
43
+ - README.txt
44
+ files:
45
+ - History.txt
46
+ - Manifest.txt
47
+ - README.txt
48
+ - Rakefile
49
+ - lib/spidr.rb
50
+ - lib/spidr/page.rb
51
+ - lib/spidr/rules.rb
52
+ - lib/spidr/agent.rb
53
+ - lib/spidr/spidr.rb
54
+ - lib/spidr/version.rb
55
+ - test/test_spidr.rb
56
+ has_rdoc: true
57
+ homepage: http://spidr.rubyforge.org/
58
+ post_install_message:
59
+ rdoc_options:
60
+ - --main
61
+ - README.txt
62
+ require_paths:
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: "0"
69
+ version:
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: "0"
75
+ version:
76
+ requirements: []
77
+
78
+ rubyforge_project: spidr
79
+ rubygems_version: 1.1.1
80
+ signing_key:
81
+ specification_version: 2
82
+ summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
83
+ test_files:
84
+ - test/test_spidr.rb