spidr 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data.tar.gz.sig CHANGED
Binary file
@@ -1,3 +1,34 @@
1
+ === 0.2.1 / 2009-11-25
2
+
3
+ * Added Spidr::Events#every_ok_page.
4
+ * Added Spidr::Events#every_redirect_page.
5
+ * Added Spidr::Events#every_timedout_page.
6
+ * Added Spidr::Events#every_bad_request_page.
7
+ * Added Spidr::Events#every_unauthorized_page.
8
+ * Added Spidr::Events#every_forbidden_page.
9
+ * Added Spidr::Events#every_missing_page.
10
+ * Added Spidr::Events#every_internal_server_error_page.
11
+ * Added Spidr::Events#every_txt_page.
12
+ * Added Spidr::Events#every_html_page.
13
+ * Added Spidr::Events#every_xml_page.
14
+ * Added Spidr::Events#every_xsl_page.
15
+ * Added Spidr::Events#every_doc.
16
+ * Added Spidr::Events#every_html_doc.
17
+ * Added Spidr::Events#every_xml_doc.
18
+ * Added Spidr::Events#every_xsl_doc.
19
+ * Added Spidr::Events#every_rss_doc.
20
+ * Added Spidr::Events#every_atom_doc.
21
+ * Added Spidr::Events#every_javascript_page.
22
+ * Added Spidr::Events#every_css_page.
23
+ * Added Spidr::Events#every_rss_page.
24
+ * Added Spidr::Events#every_atom_page.
25
+ * Added Spidr::Events#every_ms_word_page.
26
+ * Added Spidr::Events#every_pdf_page.
27
+ * Added Spidr::Events#every_zip_page.
28
+ * Fixed a bug where Spidr::Agent#delay was not being used to delay
29
+ requesting pages.
30
+ * Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
31
+
1
32
  === 0.2.0 / 2009-10-10
2
33
 
3
34
  * Added URI.expand_path.
data/README.txt CHANGED
@@ -1,8 +1,10 @@
1
1
  = Spidr
2
2
 
3
- * http://spidr.rubyforge.org/
4
- * http://github.com/postmodern/spidr/
5
- * Postmodern (postmodern.mod3 at gmail.com)
3
+ * http://spidr.rubyforge.org
4
+ * http://github.com/postmodern/spidr
5
+ * http://github.com/postmodern/spidr/issues
6
+ * http://groups.google.com/group/spidr
7
+ * irc.freenode.net #spidr
6
8
 
7
9
  == DESCRIPTION:
8
10
 
@@ -56,7 +58,7 @@ and easy to use.
56
58
  'http://company.com/',
57
59
  :hosts => [
58
60
  'company.com',
59
- /host\d\.company\.com/
61
+ /host\d\.company\.com/
60
62
  ]
61
63
  )
62
64
 
@@ -90,10 +92,10 @@ and easy to use.
90
92
  puts "[-] #{page.url}"
91
93
 
92
94
  page.search('//meta').each do |meta|
93
- name = (meta.attributes['name'] || meta.attributes['http-equiv'])
94
- value = meta.attributes['content']
95
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
96
+ value = meta.attributes['content']
95
97
 
96
- puts " #{name} = #{value}"
98
+ puts " #{name} = #{value}"
97
99
  end
98
100
  end
99
101
  end
@@ -101,8 +103,8 @@ and easy to use.
101
103
  * Print out the titles from every page:
102
104
 
103
105
  Spidr.site('http://www.rubypulse.com/') do |spider|
104
- spider.every_page do |page|
105
- puts page.title if page.html?
106
+ spider.every_html_page do |page|
107
+ puts page.title
106
108
  end
107
109
  end
108
110
 
@@ -119,16 +121,16 @@ and easy to use.
119
121
  * Pause the spider on a forbidden page:
120
122
 
121
123
  spider = Spidr.host('overnight.startup.com') do |spider|
122
- spider.every_page do |page|
123
- spider.pause! if page.forbidden?
124
+ spider.every_forbidden_page do |page|
125
+ spider.pause!
124
126
  end
125
127
  end
126
128
 
127
129
  * Skip the processing of a page:
128
130
 
129
131
  Spidr.host('sketchy.content.com') do |spider|
130
- spider.every_page do |page|
131
- spider.skip_page! if page.not_found?
132
+ spider.every_missing_page do |page|
133
+ spider.skip_page!
132
134
  end
133
135
  end
134
136
 
@@ -137,8 +139,8 @@ and easy to use.
137
139
  Spidr.host('sketchy.content.com') do |spider|
138
140
  spider.every_url do |url|
139
141
  if url.path.split('/').find { |dir| dir.to_i > 1000 }
140
- spider.skip_link!
141
- end
142
+ spider.skip_link!
143
+ end
142
144
  end
143
145
  end
144
146
 
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ Hoe.spec('spidr') do
18
18
 
19
19
  self.extra_dev_deps = [
20
20
  ['rspec', '>=1.2.8'],
21
- ['yard', '>=0.2.3.5']
21
+ ['yard', '>=0.4.0']
22
22
  ]
23
23
 
24
24
  self.spec_extras = {:has_rdoc => 'yard'}
@@ -458,6 +458,8 @@ module Spidr
458
458
  path += "?#{url.query}" if url.query
459
459
 
460
460
  begin
461
+ sleep(@delay) if @delay > 0
462
+
461
463
  get_session(url.scheme,host,port) do |sess|
462
464
  headers = {}
463
465
  headers['User-Agent'] = @user_agent if @user_agent
@@ -56,6 +56,20 @@ module Spidr
56
56
  return self
57
57
  end
58
58
 
59
+ #
60
+ # Pass the headers from every response the agent receives to a given
61
+ # block.
62
+ #
63
+ # @yield [headers]
64
+ # The block will be passed the headers of every response.
65
+ #
66
+ # @yieldparam [Hash] headers
67
+ # The headers from a response.
68
+ #
69
+ def all_headers(&block)
70
+ every_page { |page| block.call(page.headers) }
71
+ end
72
+
59
73
  #
60
74
  # Pass every page that the agent visits to a given block.
61
75
  #
@@ -71,17 +85,419 @@ module Spidr
71
85
  end
72
86
 
73
87
  #
74
- # Pass the headers from every response the agent receives to a given
88
+ # Pass every OK page that the agent visits to a given block.
89
+ #
90
+ # @yield [page]
91
+ # The block will be passed every OK page visited.
92
+ #
93
+ # @yieldparam [Page] page
94
+ # A visited page.
95
+ #
96
+ def every_ok_page(&block)
97
+ every_page do |page|
98
+ block.call(page) if (block && page.ok?)
99
+ end
100
+ end
101
+
102
+ #
103
+ # Pass every Redirect page that the agent visits to a given block.
104
+ #
105
+ # @yield [page]
106
+ # The block will be passed every Redirect page visited.
107
+ #
108
+ # @yieldparam [Page] page
109
+ # A visited page.
110
+ #
111
+ def every_redirect_page(&block)
112
+ every_page do |page|
113
+ block.call(page) if (block && page.redirect?)
114
+ end
115
+ end
116
+
117
+ #
118
+ # Pass every Timeout page that the agent visits to a given block.
119
+ #
120
+ # @yield [page]
121
+ # The block will be passed every Timeout page visited.
122
+ #
123
+ # @yieldparam [Page] page
124
+ # A visited page.
125
+ #
126
+ def every_timedout_page(&block)
127
+ every_page do |page|
128
+ block.call(page) if (block && page.timedout?)
129
+ end
130
+ end
131
+
132
+ #
133
+ # Pass every Bad Request page that the agent visits to a given block.
134
+ #
135
+ # @yield [page]
136
+ # The block will be passed every Bad Request page visited.
137
+ #
138
+ # @yieldparam [Page] page
139
+ # A visited page.
140
+ #
141
+ def every_bad_request_page(&block)
142
+ every_page do |page|
143
+ block.call(page) if (block && page.bad_request?)
144
+ end
145
+ end
146
+
147
+ #
148
+ # Pass every Unauthorized page that the agent visits to a given block.
149
+ #
150
+ # @yield [page]
151
+ # The block will be passed every Unauthorized page visited.
152
+ #
153
+ # @yieldparam [Page] page
154
+ # A visited page.
155
+ #
156
+ def every_unauthorized_page(&block)
157
+ every_page do |page|
158
+ block.call(page) if (block && page.unauthorized?)
159
+ end
160
+ end
161
+
162
+ #
163
+ # Pass every Forbidden page that the agent visits to a given block.
164
+ #
165
+ # @yield [page]
166
+ # The block will be passed every Forbidden page visited.
167
+ #
168
+ # @yieldparam [Page] page
169
+ # A visited page.
170
+ #
171
+ def every_forbidden_page(&block)
172
+ every_page do |page|
173
+ block.call(page) if (block && page.forbidden?)
174
+ end
175
+ end
176
+
177
+ #
178
+ # Pass every Missing page that the agent visits to a given block.
179
+ #
180
+ # @yield [page]
181
+ # The block will be passed every Missing page visited.
182
+ #
183
+ # @yieldparam [Page] page
184
+ # A visited page.
185
+ #
186
+ def every_missing_page(&block)
187
+ every_page do |page|
188
+ block.call(page) if (block && page.missing?)
189
+ end
190
+ end
191
+
192
+ #
193
+ # Pass every Internal Server Error page that the agent visits to a
194
+ # given block.
195
+ #
196
+ # @yield [page]
197
+ # The block will be passed every Internal Server Error page visited.
198
+ #
199
+ # @yieldparam [Page] page
200
+ # A visited page.
201
+ #
202
+ def every_internal_server_error_page(&block)
203
+ every_page do |page|
204
+ block.call(page) if (block && page.had_internal_server_error?)
205
+ end
206
+ end
207
+
208
+ #
209
+ # Pass every Plain Text page that the agent visits to a given block.
210
+ #
211
+ # @yield [page]
212
+ # The block will be passed every Plain Text page visited.
213
+ #
214
+ # @yieldparam [Page] page
215
+ # A visited page.
216
+ #
217
+ def every_txt_page(&block)
218
+ every_page do |page|
219
+ block.call(page) if (block && page.txt?)
220
+ end
221
+ end
222
+
223
+ #
224
+ # Pass every HTML page that the agent visits to a given block.
225
+ #
226
+ # @yield [page]
227
+ # The block will be passed every HTML page visited.
228
+ #
229
+ # @yieldparam [Page] page
230
+ # A visited page.
231
+ #
232
+ def every_html_page(&block)
233
+ every_page do |page|
234
+ block.call(page) if (block && page.html?)
235
+ end
236
+ end
237
+
238
+ #
239
+ # Pass every XML page that the agent visits to a given block.
240
+ #
241
+ # @yield [page]
242
+ # The block will be passed every XML page visited.
243
+ #
244
+ # @yieldparam [Page] page
245
+ # A visited page.
246
+ #
247
+ def every_xml_page(&block)
248
+ every_page do |page|
249
+ block.call(page) if (block && page.xml?)
250
+ end
251
+ end
252
+
253
+ #
254
+ # Pass every XML Stylesheet (XSL) page that the agent visits to a
255
+ # given block.
256
+ #
257
+ # @yield [page]
258
+ # The block will be passed every XML Stylesheet (XSL) page visited.
259
+ #
260
+ # @yieldparam [Page] page
261
+ # A visited page.
262
+ #
263
+ def every_xsl_page(&block)
264
+ every_page do |page|
265
+ block.call(page) if (block && page.xsl?)
266
+ end
267
+ end
268
+
269
+ #
270
+ # Pass every HTML or XML document that the agent parses to a given
75
271
  # block.
76
272
  #
77
- # @yield [headers]
78
- # The block will be passed the headers of every response.
273
+ # @yield [doc]
274
+ # The block will be passed every HTML or XML document parsed.
79
275
  #
80
- # @yieldparam [Hash] headers
81
- # The headers from a response.
276
+ # @yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc
277
+ # A parsed HTML or XML document.
82
278
  #
83
- def all_headers(&block)
84
- every_page { |page| block.call(page.headers) }
279
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
280
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
281
+ #
282
+ def every_doc(&block)
283
+ every_page do |page|
284
+ if block
285
+ if (doc = page.doc)
286
+ block.call(doc)
287
+ end
288
+ end
289
+ end
290
+ end
291
+
292
+ #
293
+ # Pass every HTML document that the agent parses to a given block.
294
+ #
295
+ # @yield [doc]
296
+ # The block will be passed every HTML document parsed.
297
+ #
298
+ # @yieldparam [Nokogiri::HTML::Document] doc
299
+ # A parsed HTML document.
300
+ #
301
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
302
+ #
303
+ def every_html_doc(&block)
304
+ every_page do |page|
305
+ if (block && page.html?)
306
+ if (doc = page.doc)
307
+ block.call(doc)
308
+ end
309
+ end
310
+ end
311
+ end
312
+
313
+ #
314
+ # Pass every XML document that the agent parses to a given block.
315
+ #
316
+ # @yield [doc]
317
+ # The block will be passed every XML document parsed.
318
+ #
319
+ # @yieldparam [Nokogiri::XML::Document] doc
320
+ # A parsed XML document.
321
+ #
322
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
323
+ #
324
+ def every_xml_doc(&block)
325
+ every_page do |page|
326
+ if (block && page.xml?)
327
+ if (doc = page.doc)
328
+ block.call(doc)
329
+ end
330
+ end
331
+ end
332
+ end
333
+
334
+ #
335
+ # Pass every XML Stylesheet (XSL) that the agent parses to a given
336
+ # block.
337
+ #
338
+ # @yield [doc]
339
+ # The block will be passed every XML Stylesheet (XSL) parsed.
340
+ #
341
+ # @yieldparam [Nokogiri::XML::Document] doc
342
+ # A parsed XML document.
343
+ #
344
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
345
+ #
346
+ def every_xsl_doc(&block)
347
+ every_page do |page|
348
+ if (block && page.xsl?)
349
+ if (doc = page.doc)
350
+ block.call(doc)
351
+ end
352
+ end
353
+ end
354
+ end
355
+
356
+ #
357
+ # Pass every RSS document that the agent parses to a given block.
358
+ #
359
+ # @yield [doc]
360
+ # The block will be passed every RSS document parsed.
361
+ #
362
+ # @yieldparam [Nokogiri::XML::Document] doc
363
+ # A parsed XML document.
364
+ #
365
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
366
+ #
367
+ def every_rss_doc(&block)
368
+ every_page do |page|
369
+ if (block && page.rss?)
370
+ if (doc = page.doc)
371
+ block.call(doc)
372
+ end
373
+ end
374
+ end
375
+ end
376
+
377
+ #
378
+ # Pass every Atom document that the agent parses to a given block.
379
+ #
380
+ # @yield [doc]
381
+ # The block will be passed every Atom document parsed.
382
+ #
383
+ # @yieldparam [Nokogiri::XML::Document] doc
384
+ # A parsed XML document.
385
+ #
386
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
387
+ #
388
+ def every_atom_doc(&block)
389
+ every_page do |page|
390
+ if (block && page.atom?)
391
+ if (doc = page.doc)
392
+ block.call(doc)
393
+ end
394
+ end
395
+ end
396
+ end
397
+
398
+ #
399
+ # Pass every JavaScript page that the agent visits to a given block.
400
+ #
401
+ # @yield [page]
402
+ # The block will be passed every JavaScript page visited.
403
+ #
404
+ # @yieldparam [Page] page
405
+ # A visited page.
406
+ #
407
+ def every_javascript_page(&block)
408
+ every_page do |page|
409
+ block.call(page) if (block && page.javascript?)
410
+ end
411
+ end
412
+
413
+ #
414
+ # Pass every CSS page that the agent visits to a given block.
415
+ #
416
+ # @yield [page]
417
+ # The block will be passed every CSS page visited.
418
+ #
419
+ # @yieldparam [Page] page
420
+ # A visited page.
421
+ #
422
+ def every_css_page(&block)
423
+ every_page do |page|
424
+ block.call(page) if (block && page.css?)
425
+ end
426
+ end
427
+
428
+ #
429
+ # Pass every RSS feed that the agent visits to a given block.
430
+ #
431
+ # @yield [feed]
432
+ # The block will be passed every RSS feed visited.
433
+ #
434
+ # @yieldparam [Page] feed
435
+ # A visited page.
436
+ #
437
+ def every_rss_page(&block)
438
+ every_page do |page|
439
+ block.call(page) if (block && page.rss?)
440
+ end
441
+ end
442
+
443
+ #
444
+ # Pass every Atom feed that the agent visits to a given block.
445
+ #
446
+ # @yield [feed]
447
+ # The block will be passed every Atom feed visited.
448
+ #
449
+ # @yieldparam [Page] feed
450
+ # A visited page.
451
+ #
452
+ def every_atom_page(&block)
453
+ every_page do |page|
454
+ block.call(page) if (block && page.atom?)
455
+ end
456
+ end
457
+
458
+ #
459
+ # Pass every MS Word page that the agent visits to a given block.
460
+ #
461
+ # @yield [page]
462
+ # The block will be passed every MS Word page visited.
463
+ #
464
+ # @yieldparam [Page] page
465
+ # A visited page.
466
+ #
467
+ def every_ms_word_page(&block)
468
+ every_page do |page|
469
+ block.call(page) if (block && page.ms_word?)
470
+ end
471
+ end
472
+
473
+ #
474
+ # Pass every PDF page that the agent visits to a given block.
475
+ #
476
+ # @yield [page]
477
+ # The block will be passed every PDF page visited.
478
+ #
479
+ # @yieldparam [Page] page
480
+ # A visited page.
481
+ #
482
+ def every_pdf_page(&block)
483
+ every_page do |page|
484
+ block.call(page) if (block && page.pdf?)
485
+ end
486
+ end
487
+
488
+ #
489
+ # Pass every ZIP page that the agent visits to a given block.
490
+ #
491
+ # @yield [page]
492
+ # The block will be passed every ZIP page visited.
493
+ #
494
+ # @yieldparam [Page] page
495
+ # A visited page.
496
+ #
497
+ def every_zip_page(&block)
498
+ every_page do |page|
499
+ block.call(page) if (block && page.zip?)
500
+ end
85
501
  end
86
502
  end
87
503
  end
@@ -173,6 +173,16 @@ module Spidr
173
173
  (content_type =~ /text\/xml/) == 0
174
174
  end
175
175
 
176
+ #
177
+ # Determines if the page is an XML Stylesheet (XSL).
178
+ #
179
+ # @return [Boolean]
180
+ # Specifies whether the page is an XML Stylesheet (XSL).
181
+ #
182
+ def xsl?
183
+ (content_type =~ /text\/xsl/) == 0
184
+ end
185
+
176
186
  #
177
187
  # Determines if the page is JavaScript.
178
188
  #
@@ -261,13 +271,16 @@ module Spidr
261
271
  # Returns +nil+ if the page is not HTML, XML, RSS or Atom, or if
262
272
  # the page could not be parsed properly.
263
273
  #
274
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
275
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
276
+ #
264
277
  def doc
265
278
  return nil if (body.nil? || body.empty?)
266
279
 
267
280
  begin
268
281
  if html?
269
282
  return @doc ||= Nokogiri::HTML(body)
270
- elsif (xml? || rss? || atom?)
283
+ elsif (xml? || xsl? || rss? || atom?)
271
284
  return @doc ||= Nokogiri::XML(body)
272
285
  end
273
286
  rescue
@@ -373,6 +386,14 @@ module Spidr
373
386
  doc.search('iframe[@src]').each do |iframe|
374
387
  add_url.call(iframe.get_attribute('src'))
375
388
  end
389
+
390
+ doc.search('link[@href]').each do |link|
391
+ add_url.call(link.get_attribute('href'))
392
+ end
393
+
394
+ doc.search('script[@src]').each do |script|
395
+ add_url.call(script.get_attribute('src'))
396
+ end
376
397
  end
377
398
 
378
399
  return urls
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.0'
3
+ VERSION = '0.2.1'
4
4
  end
@@ -9,4 +9,4 @@ YARD::Rake::YardocTask.new do |t|
9
9
  ]
10
10
  end
11
11
 
12
- task :docs => :yardoc
12
+ task :docs => :yard
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
30
30
  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2009-10-10 00:00:00 -07:00
33
+ date: 2009-11-25 00:00:00 -08:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -61,7 +61,7 @@ dependencies:
61
61
  requirements:
62
62
  - - ">="
63
63
  - !ruby/object:Gem::Version
64
- version: 0.2.3.5
64
+ version: 0.4.0
65
65
  version:
66
66
  - !ruby/object:Gem::Dependency
67
67
  name: hoe
@@ -154,7 +154,7 @@ files:
154
154
  - static/course/frames/frame_next.html
155
155
  - static/course/specs.json
156
156
  has_rdoc: yard
157
- homepage: http://spidr.rubyforge.org/
157
+ homepage: http://spidr.rubyforge.org
158
158
  licenses: []
159
159
 
160
160
  post_install_message:
metadata.gz.sig CHANGED
Binary file