spidr 0.1.0

History.txt ADDED
@@ -0,0 +1,13 @@
+ === 0.1.0 / 2008-05-23
+
+ * Initial release.
+ * Black-list or white-list URLs based upon:
+   * Host name
+   * Port number
+   * Full link
+   * URL extension
+ * Provides call-backs for:
+   * Every visited Page.
+   * Every visited URL.
+   * Every visited URL that matches a specified pattern.
+
Manifest.txt ADDED
@@ -0,0 +1,11 @@
+ History.txt
+ Manifest.txt
+ README.txt
+ Rakefile
+ lib/spidr.rb
+ lib/spidr/page.rb
+ lib/spidr/rules.rb
+ lib/spidr/agent.rb
+ lib/spidr/spidr.rb
+ lib/spidr/version.rb
+ test/test_spidr.rb
README.txt ADDED
@@ -0,0 +1,55 @@
+ = Spidr
+
+ * http://spidr.rubyforge.org/
+ * Postmodern Modulus III (postmodern.mod3@gmail.com)
+
+ == DESCRIPTION:
+
+ Spidr is a versatile Ruby web spidering library that can spider a site,
+ multiple domains, certain links, or spider infinitely. Spidr is designed
+ to be fast and easy to use.
+
+ == FEATURES/PROBLEMS:
+
+ * Black-list or white-list URLs based upon:
+   * Host name
+   * Port number
+   * Full link
+   * URL extension
+ * Provides call-backs for:
+   * Every visited Page.
+   * Every visited URL.
+   * Every visited URL that matches a specified pattern.
+
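+ For example, a spider that prints every visited URL might look like
+ this (an illustrative sketch; the host name is hypothetical):
+
+   require 'spidr'
+
+   Spidr.site('http://www.example.com/') do |spider|
+     spider.every_url { |url| puts url }
+   end
+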
+ == REQUIREMENTS:
+
+ * Hpricot
+
+ == INSTALL:
+
+   $ sudo gem install spidr
+
+ == LICENSE:
+
+ The MIT License
+
+ Copyright (c) 2008 Hal Brodigan
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ 'Software'), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Rakefile ADDED
@@ -0,0 +1,13 @@
+ # -*- ruby -*-
+
+ require 'rubygems'
+ require 'hoe'
+ require './lib/spidr/version.rb'
+
+ Hoe.new('spidr', Spidr::VERSION) do |p|
+   p.rubyforge_name = 'spidr'
+   p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
+   p.extra_deps = ['hpricot']
+ end
+
+ # vim: syntax=Ruby
lib/spidr.rb ADDED
@@ -0,0 +1,3 @@
+ require 'spidr/agent'
+ require 'spidr/spidr'
+ require 'spidr/version'
lib/spidr/agent.rb ADDED
@@ -0,0 +1,490 @@
+ require 'spidr/rules'
+ require 'spidr/page'
+ require 'spidr/spidr'
+
+ require 'net/http'
+ require 'hpricot'
+
+ module Spidr
+   class Agent
+
+     # URL schemes to visit
+     SCHEMES = ['http', 'https']
+
+     # Proxy to use
+     attr_accessor :proxy
+
+     # User-Agent to use
+     attr_accessor :user_agent
+
+     # Referer to use
+     attr_accessor :referer
+
+     # Delay in between fetching pages
+     attr_accessor :delay
+
+     # History containing visited URLs
+     attr_accessor :history
+
+     #
+     # Creates a new Agent object with the given _options_ and _block_.
+     # If a _block_ is given, it will be passed the newly created
+     # Agent object.
+     #
+     # _options_ may contain the following keys:
+     # <tt>:proxy</tt>:: The proxy to use while spidering.
+     # <tt>:user_agent</tt>:: The User-Agent string to send.
+     # <tt>:referer</tt>:: The Referer URL to send.
+     # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
+     #                   link. Defaults to 0.
+     # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
+     # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
+     # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
+     # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
+     # <tt>:links</tt>:: An +Array+ of link patterns to visit.
+     # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
+     # <tt>:exts</tt>:: An +Array+ of file extension patterns to visit.
+     # <tt>:ignore_exts</tt>:: An +Array+ of file extension patterns to not
+     #                         visit.
+     #
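+     # Example (an illustrative sketch of the options above; the host
+     # pattern is hypothetical):
+     #
+     #   Agent.new(:hosts => [/example\.com$/], :delay => 1) do |agent|
+     #     agent.every_page { |page| puts page.url }
+     #   end
+     #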
+     def initialize(options={},&block)
+       @proxy = (options[:proxy] || Spidr.proxy)
+       @user_agent = (options[:user_agent] || Spidr.user_agent)
+       @referer = options[:referer]
+
+       @host_rules = Rules.new(:accept => options[:hosts],
+                               :reject => options[:ignore_hosts])
+       @port_rules = Rules.new(:accept => options[:ports],
+                               :reject => options[:ignore_ports])
+       @link_rules = Rules.new(:accept => options[:links],
+                               :reject => options[:ignore_links])
+       @ext_rules = Rules.new(:accept => options[:exts],
+                              :reject => options[:ignore_exts])
+
+       @every_url_blocks = []
+       @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+
+       @every_page_blocks = []
+
+       @delay = (options[:delay] || 0)
+       @history = []
+       @queue = []
+
+       block.call(self) if block
+     end
+
+     #
+     # Creates a new Agent object with the given _options_ and will begin
+     # spidering at the specified _url_. If a _block_ is given it will be
+     # passed the newly created Agent object, before the agent begins
+     # spidering.
+     #
+     def self.start_at(url,options={},&block)
+       self.new(options) do |spider|
+         block.call(spider) if block
+
+         spider.start_at(url)
+       end
+     end
+
+     #
+     # Creates a new Agent object with the given _options_ and will begin
+     # spidering the specified host _name_. If a _block_ is given it will be
+     # passed the newly created Agent object, before the agent begins
+     # spidering.
+     #
+     def self.host(name,options={},&block)
+       self.new(options.merge(:hosts => [name.to_s])) do |spider|
+         block.call(spider) if block
+
+         spider.start_at("http://#{name}/")
+       end
+     end
+
+     #
+     # Creates a new Agent object with the given _options_ and will begin
+     # spidering the host of the specified _url_. If a _block_ is given it
+     # will be passed the newly created Agent object, before the agent
+     # begins spidering.
+     #
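+     # Example (an illustrative sketch; the URL is hypothetical):
+     #
+     #   Agent.site('http://www.example.com/') do |spider|
+     #     spider.every_url { |url| puts url }
+     #   end
+     #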
+     def self.site(url,options={},&block)
+       url = URI(url.to_s)
+
+       return self.new(options.merge(:hosts => [url.host])) do |spider|
+         block.call(spider) if block
+
+         spider.start_at(url)
+       end
+     end
+
+     #
+     # Returns the +Array+ of host patterns to visit.
+     #
+     def visit_hosts
+       @host_rules.accept
+     end
+
+     #
+     # Adds the given _pattern_ to visit_hosts. If a _block_ is given,
+     # it will be added to visit_hosts instead.
+     #
+     def visit_hosts_like(pattern=nil,&block)
+       if pattern
+         visit_hosts << pattern
+       elsif block
+         visit_hosts << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of URL host patterns to not visit.
+     #
+     def ignore_hosts
+       @host_rules.reject
+     end
+
+     #
+     # Adds the given _pattern_ to ignore_hosts. If a _block_ is given,
+     # it will be added to ignore_hosts instead.
+     #
+     def ignore_hosts_like(pattern=nil,&block)
+       if pattern
+         ignore_hosts << pattern
+       elsif block
+         ignore_hosts << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of URL port patterns to visit.
+     #
+     def visit_ports
+       @port_rules.accept
+     end
+
+     #
+     # Adds the given _pattern_ to visit_ports. If a _block_ is given,
+     # it will be added to visit_ports instead.
+     #
+     def visit_ports_like(pattern=nil,&block)
+       if pattern
+         visit_ports << pattern
+       elsif block
+         visit_ports << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of URL port patterns to not visit.
+     #
+     def ignore_ports
+       @port_rules.reject
+     end
+
+     #
+     # Adds the given _pattern_ to ignore_ports. If a _block_ is given,
+     # it will be added to ignore_ports instead.
+     #
+     def ignore_ports_like(pattern=nil,&block)
+       if pattern
+         ignore_ports << pattern
+       elsif block
+         ignore_ports << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of link patterns to visit.
+     #
+     def visit_links
+       @link_rules.accept
+     end
+
+     #
+     # Adds the given _pattern_ to visit_links. If a _block_ is given,
+     # it will be added to visit_links instead.
+     #
+     def visit_links_like(pattern=nil,&block)
+       if pattern
+         visit_links << pattern
+       elsif block
+         visit_links << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of link patterns to not visit.
+     #
+     def ignore_links
+       @link_rules.reject
+     end
+
+     #
+     # Adds the given _pattern_ to ignore_links. If a _block_ is given,
+     # it will be added to ignore_links instead.
+     #
+     def ignore_links_like(pattern=nil,&block)
+       if pattern
+         ignore_links << pattern
+       elsif block
+         ignore_links << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of URL extension patterns to visit.
+     #
+     def visit_exts
+       @ext_rules.accept
+     end
+
+     #
+     # Adds the given _pattern_ to visit_exts. If a _block_ is given,
+     # it will be added to visit_exts instead.
+     #
+     def visit_exts_like(pattern=nil,&block)
+       if pattern
+         visit_exts << pattern
+       elsif block
+         visit_exts << block
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of URL extension patterns to not visit.
+     #
+     def ignore_exts
+       @ext_rules.reject
+     end
+
+     #
+     # Adds the given _pattern_ to ignore_exts. If a _block_ is given,
+     # it will be added to ignore_exts instead.
+     #
+     def ignore_exts_like(pattern=nil,&block)
+       if pattern
+         ignore_exts << pattern
+       elsif block
+         ignore_exts << block
+       end
+
+       return self
+     end
+
+     #
+     # For every URL that the agent visits it will be passed to the
+     # specified _block_.
+     #
+     def every_url(&block)
+       @every_url_blocks << block
+       return self
+     end
+
+     #
+     # For every URL that the agent visits and matches the specified
+     # _pattern_, it will be passed to the specified _block_.
+     #
+     def urls_like(pattern,&block)
+       @urls_like_blocks[pattern] << block
+       return self
+     end
+
+     #
+     # For every Page that the agent visits it will be passed to the
+     # specified _block_.
+     #
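+     # Example (an illustrative usage sketch):
+     #
+     #   agent.every_page do |page|
+     #     puts "#{page.content_type}: #{page.url}"
+     #   end
+     #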
+     def every_page(&block)
+       @every_page_blocks << block
+       return self
+     end
+
+     #
+     # Clear the history and start spidering at the specified _url_.
+     #
+     def start_at(url)
+       @history.clear
+       return run(url)
+     end
+
+     #
+     # Start spidering at the specified _url_.
+     #
+     def run(url)
+       enqueue(url)
+
+       until @queue.empty?
+         visit_page(dequeue)
+
+         # honor the documented :delay option between pages
+         sleep(@delay) if @delay > 0
+       end
+
+       return self
+     end
+
+     #
+     # Returns the +Array+ of visited URLs.
+     #
+     def visited_urls
+       @history
+     end
+
+     #
+     # Returns the +Array+ of visited links, as +String+s.
+     #
+     def visited_links
+       @history.map { |uri| uri.to_s }
+     end
+
+     #
+     # Returns the +Array+ of hosts that were visited.
+     #
+     def visited_hosts
+       @history.map { |uri| uri.host }.uniq
+     end
+
+     #
+     # Returns +true+ if the specified _url_ was visited, returns +false+
+     # otherwise.
+     #
+     def visited?(url)
+       # @history contains URI objects, so normalize _url_ to a URI first
+       url = URI(url.to_s) unless url.kind_of?(URI)
+
+       return @history.include?(url)
+     end
+
+     protected
+
+     #
+     # Returns +true+ if the specified _url_ is queued for visiting, returns
+     # +false+ otherwise.
+     #
+     def queued?(url)
+       @queue.include?(url)
+     end
+
+     #
+     # Enqueues the specified _url_ for visiting, only if it passes all the
+     # agent's rules for visiting a given URL. Returns +true+ if the _url_
+     # was successfully enqueued, returns +false+ otherwise.
+     #
+     def enqueue(url)
+       link = url.to_s
+       url = URI(link)
+
+       if (!(queued?(url)) && visit?(url))
+         @every_url_blocks.each { |block| block.call(url) }
+
+         @urls_like_blocks.each do |pattern,blocks|
+           if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
+             blocks.each { |url_block| url_block.call(url) }
+           end
+         end
+
+         @queue << url
+         return true
+       end
+
+       return false
+     end
+
+     #
+     # Dequeues a URL that will later be visited.
+     #
+     def dequeue
+       @queue.shift
+     end
+
+     #
+     # Returns +true+ if the specified URL should be visited, returns
+     # +false+ otherwise.
+     #
+     def visit?(url)
+       (!(visited?(url)) &&
+        visit_scheme?(url) &&
+        visit_host?(url) &&
+        visit_port?(url) &&
+        visit_link?(url) &&
+        visit_ext?(url))
+     end
+
+     #
+     # Visits the specified _url_ and enqueues its links for visiting. If a
+     # _block_ is given, it will be passed a newly created Page object
+     # for the specified _url_.
+     #
+     def visit_page(url,&block)
+       get_page(url) do |page|
+         @history << page.url
+
+         page.urls.each { |next_url| enqueue(next_url) }
+
+         @every_page_blocks.each { |page_block| page_block.call(page) }
+
+         block.call(page) if block
+       end
+     end
+
+     private
+
+     def visit_scheme?(url)
+       if url.scheme
+         return SCHEMES.include?(url.scheme)
+       else
+         return true
+       end
+     end
+
+     def visit_host?(url)
+       @host_rules.accept?(url.host)
+     end
+
+     def visit_port?(url)
+       @port_rules.accept?(url.port)
+     end
+
+     def visit_link?(url)
+       @link_rules.accept?(url.to_s)
+     end
+
+     def visit_ext?(url)
+       @ext_rules.accept?(File.extname(url.path)[1..-1])
+     end
+
+     def get_page(url,&block)
+       host = url.host
+       port = url.port
+
+       proxy_host = @proxy[:host]
+       proxy_port = @proxy[:port]
+       proxy_user = @proxy[:user]
+       proxy_password = @proxy[:password]
+
+       Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+         headers = {}
+
+         headers['User-Agent'] = @user_agent if @user_agent
+         headers['Referer'] = @referer if @referer
+
+         # request the full request URI, so the query string is not dropped
+         new_page = Page.new(url,sess.get(url.request_uri,headers))
+
+         block.call(new_page) if block
+         return new_page
+       end
+     end
+
+   end
+ end
lib/spidr/page.rb ADDED
@@ -0,0 +1,159 @@
+ require 'uri'
+ require 'hpricot'
+
+ module Spidr
+   class Page
+
+     # URL of the page
+     attr_reader :url
+
+     # Headers returned with the body
+     def headers
+       @response.to_hash
+     end
+
+     #
+     # Creates a new Page object from the specified _url_ and HTTP
+     # _response_.
+     #
+     def initialize(url,response)
+       @url = url
+       @response = response
+       @doc = nil
+     end
+
+     #
+     # Returns the content-type of the page.
+     #
+     def content_type
+       @response['Content-Type']
+     end
+
+     #
+     # Returns +true+ if the page is an HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       (content_type =~ /text\/html/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is an XML document, returns +false+
+     # otherwise.
+     #
+     def xml?
+       (content_type =~ /text\/xml/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is a JavaScript file, returns +false+
+     # otherwise.
+     #
+     def javascript?
+       (content_type =~ /(text|application)\/javascript/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is a CSS file, returns +false+
+     # otherwise.
+     #
+     def css?
+       (content_type =~ /text\/css/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is an RSS/RDF feed, returns +false+
+     # otherwise.
+     #
+     def rss?
+       (content_type =~ /application\/(rss|rdf)\+xml/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is an Atom feed, returns +false+
+     # otherwise.
+     #
+     def atom?
+       (content_type =~ /application\/atom\+xml/) == 0
+     end
+
+     #
+     # Returns the body of the page in +String+ form.
+     #
+     def body
+       @response.body
+     end
+
+     #
+     # Returns an Hpricot::Doc if the page represents an HTML document,
+     # returns +nil+ otherwise.
+     #
+     def doc
+       if html?
+         return @doc ||= Hpricot(body)
+       end
+     end
+
+     #
+     # Returns all links from the HTML page.
+     #
+     def links
+       if html?
+         return doc.search('a[@href]').map do |a|
+           a.attributes['href'].strip
+         end
+       end
+
+       return []
+     end
+
+     #
+     # Returns all links from the HTML page as absolute URLs.
+     #
+     def urls
+       links.map { |link| to_absolute(link) }
+     end
+
+     protected
+
+     #
+     # Converts the specified _link_ into an absolute URL
+     # based on the url of the page.
+     #
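+     # Example (an illustrative sketch, for a page at
+     # http://www.example.com/a/b.html):
+     #
+     #   to_absolute('../c.html')
+     #   # => URI('http://www.example.com/c.html')
+     #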
+     def to_absolute(link)
+       link = URI.encode(link.to_s.gsub(/#.*$/,''))
+       relative = URI(link)
+
+       if relative.scheme.nil?
+         new_url = @url.clone
+
+         if relative.path[0..0] == '/'
+           new_url.path = relative.path
+         elsif relative.path[-1..-1] == '/'
+           new_url.path = File.expand_path(File.join(new_url.path,relative.path))
+         elsif !(relative.path.empty?)
+           new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
+         end
+
+         return new_url
+       end
+
+       return relative
+     end
+
+     #
+     # Provides transparent access to the values in the +headers+ +Hash+.
+     #
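+     # Example (an illustrative sketch):
+     #
+     #   page.content_length   # => @response['content-length']
+     #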
+     def method_missing(sym,*args,&block)
+       if (args.empty? && block.nil?)
+         # translate the method name into a header name
+         name = sym.id2name.gsub('_','-')
+
+         return @response[name] if @response.key?(name)
+       end
+
+       return super(sym,*args,&block)
+     end
+
+   end
+ end
lib/spidr/rules.rb ADDED
@@ -0,0 +1,61 @@
+ module Spidr
+   class Rules
+
+     # Accept rules
+     attr_reader :accept
+
+     # Reject rules
+     attr_reader :reject
+
+     def initialize(options={})
+       @accept = (options[:accept] || [])
+       @reject = (options[:reject] || [])
+     end
+
+     #
+     # Returns +true+ if the _field_ is accepted by the rules,
+     # returns +false+ otherwise.
+     #
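+     # Example (an illustrative sketch):
+     #
+     #   rules = Rules.new(:accept => [/\.html$/])
+     #   rules.accept?('index.html')   # => true
+     #   rules.accept?('logo.png')     # => false
+     #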
+     def accept?(field)
+       if @accept.empty?
+         # with no accept rules, accept anything not explicitly rejected
+         @reject.each do |rule|
+           return false if test_field(field,rule)
+         end
+
+         return true
+       else
+         @accept.each do |rule|
+           return true if test_field(field,rule)
+         end
+
+         return false
+       end
+     end
+
+     #
+     # Returns +true+ if the _field_ is rejected by the rules,
+     # returns +false+ otherwise.
+     #
+     def reject?(field)
+       !(accept?(field))
+     end
+
+     protected
+
+     #
+     # Tests the specified _field_ against the specified _rule_. Returns
+     # +true+ when the _rule_ matches the specified _field_, returns
+     # +false+ otherwise.
+     #
+     def test_field(field,rule)
+       if rule.kind_of?(Proc)
+         return (rule.call(field) == true)
+       elsif rule.kind_of?(Regexp)
+         return !((field.to_s =~ rule).nil?)
+       else
+         return field == rule
+       end
+     end
+
+   end
+ end
lib/spidr/spidr.rb ADDED
@@ -0,0 +1,48 @@
+ require 'spidr/agent'
+
+ module Spidr
+   # Common proxy port.
+   COMMON_PROXY_PORT = 8080
+
+   #
+   # Returns the +Hash+ of the Spidr proxy information.
+   #
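+   # Example (an illustrative sketch; the host name is hypothetical):
+   #
+   #   Spidr.proxy[:host] = 'proxy.example.com'
+   #   Spidr.proxy[:port] = Spidr::COMMON_PROXY_PORT
+   #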
+   def Spidr.proxy
+     @@spidr_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
+   end
+
+   #
+   # Returns the Spidr User-Agent.
+   #
+   def Spidr.user_agent
+     @@spidr_user_agent ||= nil
+   end
+
+   #
+   # Sets the Spidr User-Agent to the specified _new_agent_.
+   #
+   def Spidr.user_agent=(new_agent)
+     @@spidr_user_agent = new_agent
+   end
+
+   #
+   # See Agent.start_at.
+   #
+   def Spidr.start_at(url,options={},&block)
+     Agent.start_at(url,options,&block)
+   end
+
+   #
+   # See Agent.host.
+   #
+   def Spidr.host(name,options={},&block)
+     Agent.host(name,options,&block)
+   end
+
+   #
+   # See Agent.site.
+   #
+   def Spidr.site(url,options={},&block)
+     Agent.site(url,options,&block)
+   end
+ end
lib/spidr/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Spidr
+   VERSION = '0.1.0'
+ end
File without changes
metadata ADDED
@@ -0,0 +1,84 @@
+ --- !ruby/object:Gem::Specification
+ name: spidr
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Postmodern Modulus III
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-05-23 00:00:00 -07:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: "0"
+     version:
+ - !ruby/object:Gem::Dependency
+   name: hoe
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.5.3
+     version:
+ description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
+ email:
+ - postmodern.mod3@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - History.txt
+ - Manifest.txt
+ - README.txt
+ files:
+ - History.txt
+ - Manifest.txt
+ - README.txt
+ - Rakefile
+ - lib/spidr.rb
+ - lib/spidr/page.rb
+ - lib/spidr/rules.rb
+ - lib/spidr/agent.rb
+ - lib/spidr/spidr.rb
+ - lib/spidr/version.rb
+ - test/test_spidr.rb
+ has_rdoc: true
+ homepage: http://spidr.rubyforge.org/
+ post_install_message:
+ rdoc_options:
+ - --main
+ - README.txt
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: spidr
+ rubygems_version: 1.1.1
+ signing_key:
+ specification_version: 2
+ summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
+ test_files:
+ - test/test_spidr.rb