spidr 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data.tar.gz.sig CHANGED
Binary file
@@ -1,3 +1,34 @@
1
+ === 0.2.1 / 2009-11-25
2
+
3
+ * Added Spidr::Events#every_ok_page.
4
+ * Added Spidr::Events#every_redirect_page.
5
+ * Added Spidr::Events#every_timedout_page.
6
+ * Added Spidr::Events#every_bad_request_page.
7
+ * Added Spidr::Events#every_unauthorized_page.
8
+ * Added Spidr::Events#every_forbidden_page.
9
+ * Added Spidr::Events#every_missing_page.
10
+ * Added Spidr::Events#every_internal_server_error_page.
11
+ * Added Spidr::Events#every_txt_page.
12
+ * Added Spidr::Events#every_html_page.
13
+ * Added Spidr::Events#every_xml_page.
14
+ * Added Spidr::Events#every_xsl_page.
15
+ * Added Spidr::Events#every_doc.
16
+ * Added Spidr::Events#every_html_doc.
17
+ * Added Spidr::Events#every_xml_doc.
18
+ * Added Spidr::Events#every_xsl_doc.
19
+ * Added Spidr::Events#every_rss_doc.
20
+ * Added Spidr::Events#every_atom_doc.
21
+ * Added Spidr::Events#every_javascript_page.
22
+ * Added Spidr::Events#every_css_page.
23
+ * Added Spidr::Events#every_rss_page.
24
+ * Added Spidr::Events#every_atom_page.
25
+ * Added Spidr::Events#every_ms_word_page.
26
+ * Added Spidr::Events#every_pdf_page.
27
+ * Added Spidr::Events#every_zip_page.
28
+ * Fixed a bug where Spidr::Agent#delay was not being used to delay
29
+ requesting pages.
30
+ * Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
31
+
1
32
  === 0.2.0 / 2009-10-10
2
33
 
3
34
  * Added URI.expand_path.
data/README.txt CHANGED
@@ -1,8 +1,10 @@
1
1
  = Spidr
2
2
 
3
- * http://spidr.rubyforge.org/
4
- * http://github.com/postmodern/spidr/
5
- * Postmodern (postmodern.mod3 at gmail.com)
3
+ * http://spidr.rubyforge.org
4
+ * http://github.com/postmodern/spidr
5
+ * http://github.com/postmodern/spidr/issues
6
+ * http://groups.google.com/group/spidr
7
+ * irc.freenode.net #spidr
6
8
 
7
9
  == DESCRIPTION:
8
10
 
@@ -56,7 +58,7 @@ and easy to use.
56
58
  'http://company.com/',
57
59
  :hosts => [
58
60
  'company.com',
59
- /host\d\.company\.com/
61
+ /host\d\.company\.com/
60
62
  ]
61
63
  )
62
64
 
@@ -90,10 +92,10 @@ and easy to use.
90
92
  puts "[-] #{page.url}"
91
93
 
92
94
  page.search('//meta').each do |meta|
93
- name = (meta.attributes['name'] || meta.attributes['http-equiv'])
94
- value = meta.attributes['content']
95
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
96
+ value = meta.attributes['content']
95
97
 
96
- puts " #{name} = #{value}"
98
+ puts " #{name} = #{value}"
97
99
  end
98
100
  end
99
101
  end
@@ -101,8 +103,8 @@ and easy to use.
101
103
  * Print out the titles from every page:
102
104
 
103
105
  Spidr.site('http://www.rubypulse.com/') do |spider|
104
- spider.every_page do |page|
105
- puts page.title if page.html?
106
+ spider.every_html_page do |page|
107
+ puts page.title
106
108
  end
107
109
  end
108
110
 
@@ -119,16 +121,16 @@ and easy to use.
119
121
  * Pause the spider on a forbidden page:
120
122
 
121
123
  spider = Spidr.host('overnight.startup.com') do |spider|
122
- spider.every_page do |page|
123
- spider.pause! if page.forbidden?
124
+ spider.every_forbidden_page do |page|
125
+ spider.pause!
124
126
  end
125
127
  end
126
128
 
127
129
  * Skip the processing of a page:
128
130
 
129
131
  Spidr.host('sketchy.content.com') do |spider|
130
- spider.every_page do |page|
131
- spider.skip_page! if page.not_found?
132
+ spider.every_missing_page do |page|
133
+ spider.skip_page!
132
134
  end
133
135
  end
134
136
 
@@ -137,8 +139,8 @@ and easy to use.
137
139
  Spidr.host('sketchy.content.com') do |spider|
138
140
  spider.every_url do |url|
139
141
  if url.path.split('/').find { |dir| dir.to_i > 1000 }
140
- spider.skip_link!
141
- end
142
+ spider.skip_link!
143
+ end
142
144
  end
143
145
  end
144
146
 
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ Hoe.spec('spidr') do
18
18
 
19
19
  self.extra_dev_deps = [
20
20
  ['rspec', '>=1.2.8'],
21
- ['yard', '>=0.2.3.5']
21
+ ['yard', '>=0.4.0']
22
22
  ]
23
23
 
24
24
  self.spec_extras = {:has_rdoc => 'yard'}
@@ -458,6 +458,8 @@ module Spidr
458
458
  path += "?#{url.query}" if url.query
459
459
 
460
460
  begin
461
+ sleep(@delay) if @delay > 0
462
+
461
463
  get_session(url.scheme,host,port) do |sess|
462
464
  headers = {}
463
465
  headers['User-Agent'] = @user_agent if @user_agent
@@ -56,6 +56,20 @@ module Spidr
56
56
  return self
57
57
  end
58
58
 
59
+ #
60
+ # Pass the headers from every response the agent receives to a given
61
+ # block.
62
+ #
63
+ # @yield [headers]
64
+ # The block will be passed the headers of every response.
65
+ #
66
+ # @yieldparam [Hash] headers
67
+ # The headers from a response.
68
+ #
69
+ def all_headers(&block)
70
+ every_page { |page| block.call(page.headers) }
71
+ end
72
+
59
73
  #
60
74
  # Pass every page that the agent visits to a given block.
61
75
  #
@@ -71,17 +85,419 @@ module Spidr
71
85
  end
72
86
 
73
87
  #
74
- # Pass the headers from every response the agent receives to a given
88
+ # Pass every OK page that the agent visits to a given block.
89
+ #
90
+ # @yield [page]
91
+ # The block will be passed every OK page visited.
92
+ #
93
+ # @yieldparam [Page] page
94
+ # A visited page.
95
+ #
96
+ def every_ok_page(&block)
97
+ every_page do |page|
98
+ block.call(page) if (block && page.ok?)
99
+ end
100
+ end
101
+
102
+ #
103
+ # Pass every Redirect page that the agent visits to a given block.
104
+ #
105
+ # @yield [page]
106
+ # The block will be passed every Redirect page visited.
107
+ #
108
+ # @yieldparam [Page] page
109
+ # A visited page.
110
+ #
111
+ def every_redirect_page(&block)
112
+ every_page do |page|
113
+ block.call(page) if (block && page.redirect?)
114
+ end
115
+ end
116
+
117
+ #
118
+ # Pass every Timeout page that the agent visits to a given block.
119
+ #
120
+ # @yield [page]
121
+ # The block will be passed every Timeout page visited.
122
+ #
123
+ # @yieldparam [Page] page
124
+ # A visited page.
125
+ #
126
+ def every_timedout_page(&block)
127
+ every_page do |page|
128
+ block.call(page) if (block && page.timedout?)
129
+ end
130
+ end
131
+
132
+ #
133
+ # Pass every Bad Request page that the agent visits to a given block.
134
+ #
135
+ # @yield [page]
136
+ # The block will be passed every Bad Request page visited.
137
+ #
138
+ # @yieldparam [Page] page
139
+ # A visited page.
140
+ #
141
+ def every_bad_request_page(&block)
142
+ every_page do |page|
143
+ block.call(page) if (block && page.bad_request?)
144
+ end
145
+ end
146
+
147
+ #
148
+ # Pass every Unauthorized page that the agent visits to a given block.
149
+ #
150
+ # @yield [page]
151
+ # The block will be passed every Unauthorized page visited.
152
+ #
153
+ # @yieldparam [Page] page
154
+ # A visited page.
155
+ #
156
+ def every_unauthorized_page(&block)
157
+ every_page do |page|
158
+ block.call(page) if (block && page.unauthorized?)
159
+ end
160
+ end
161
+
162
+ #
163
+ # Pass every Forbidden page that the agent visits to a given block.
164
+ #
165
+ # @yield [page]
166
+ # The block will be passed every Forbidden page visited.
167
+ #
168
+ # @yieldparam [Page] page
169
+ # A visited page.
170
+ #
171
+ def every_forbidden_page(&block)
172
+ every_page do |page|
173
+ block.call(page) if (block && page.forbidden?)
174
+ end
175
+ end
176
+
177
+ #
178
+ # Pass every Missing page that the agent visits to a given block.
179
+ #
180
+ # @yield [page]
181
+ # The block will be passed every Missing page visited.
182
+ #
183
+ # @yieldparam [Page] page
184
+ # A visited page.
185
+ #
186
+ def every_missing_page(&block)
187
+ every_page do |page|
188
+ block.call(page) if (block && page.missing?)
189
+ end
190
+ end
191
+
192
+ #
193
+ # Pass every Internal Server Error page that the agent visits to a
194
+ # given block.
195
+ #
196
+ # @yield [page]
197
+ # The block will be passed every Internal Server Error page visited.
198
+ #
199
+ # @yieldparam [Page] page
200
+ # A visited page.
201
+ #
202
+ def every_internal_server_error_page(&block)
203
+ every_page do |page|
204
+ block.call(page) if (block && page.had_internal_server_error?)
205
+ end
206
+ end
207
+
208
+ #
209
+ # Pass every Plain Text page that the agent visits to a given block.
210
+ #
211
+ # @yield [page]
212
+ # The block will be passed every Plain Text page visited.
213
+ #
214
+ # @yieldparam [Page] page
215
+ # A visited page.
216
+ #
217
+ def every_txt_page(&block)
218
+ every_page do |page|
219
+ block.call(page) if (block && page.txt?)
220
+ end
221
+ end
222
+
223
+ #
224
+ # Pass every HTML page that the agent visits to a given block.
225
+ #
226
+ # @yield [page]
227
+ # The block will be passed every HTML page visited.
228
+ #
229
+ # @yieldparam [Page] page
230
+ # A visited page.
231
+ #
232
+ def every_html_page(&block)
233
+ every_page do |page|
234
+ block.call(page) if (block && page.html?)
235
+ end
236
+ end
237
+
238
+ #
239
+ # Pass every XML page that the agent visits to a given block.
240
+ #
241
+ # @yield [page]
242
+ # The block will be passed every XML page visited.
243
+ #
244
+ # @yieldparam [Page] page
245
+ # A visited page.
246
+ #
247
+ def every_xml_page(&block)
248
+ every_page do |page|
249
+ block.call(page) if (block && page.xml?)
250
+ end
251
+ end
252
+
253
+ #
254
+ # Pass every XML Stylesheet (XSL) page that the agent visits to a
255
+ # given block.
256
+ #
257
+ # @yield [page]
258
+ # The block will be passed every XML Stylesheet (XSL) page visited.
259
+ #
260
+ # @yieldparam [Page] page
261
+ # A visited page.
262
+ #
263
+ def every_xsl_page(&block)
264
+ every_page do |page|
265
+ block.call(page) if (block && page.xsl?)
266
+ end
267
+ end
268
+
269
+ #
270
+ # Pass every HTML or XML document that the agent parses to a given
75
271
  # block.
76
272
  #
77
- # @yield [headers]
78
- # The block will be passed the headers of every response.
273
+ # @yield [doc]
274
+ # The block will be passed every HTML or XML document parsed.
79
275
  #
80
- # @yieldparam [Hash] headers
81
- # The headers from a response.
276
+ # @yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc
277
+ # A parsed HTML or XML document.
82
278
  #
83
- def all_headers(&block)
84
- every_page { |page| block.call(page.headers) }
279
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
280
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
281
+ #
282
+ def every_doc(&block)
283
+ every_page do |page|
284
+ if block
285
+ if (doc = page.doc)
286
+ block.call(doc)
287
+ end
288
+ end
289
+ end
290
+ end
291
+
292
+ #
293
+ # Pass every HTML document that the agent parses to a given block.
294
+ #
295
+ # @yield [doc]
296
+ # The block will be passed every HTML document parsed.
297
+ #
298
+ # @yieldparam [Nokogiri::HTML::Document] doc
299
+ # A parsed HTML document.
300
+ #
301
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
302
+ #
303
+ def every_html_doc(&block)
304
+ every_page do |page|
305
+ if (block && page.html?)
306
+ if (doc = page.doc)
307
+ block.call(doc)
308
+ end
309
+ end
310
+ end
311
+ end
312
+
313
+ #
314
+ # Pass every XML document that the agent parses to a given block.
315
+ #
316
+ # @yield [doc]
317
+ # The block will be passed every XML document parsed.
318
+ #
319
+ # @yieldparam [Nokogiri::XML::Document] doc
320
+ # A parsed XML document.
321
+ #
322
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
323
+ #
324
+ def every_xml_doc(&block)
325
+ every_page do |page|
326
+ if (block && page.xml?)
327
+ if (doc = page.doc)
328
+ block.call(doc)
329
+ end
330
+ end
331
+ end
332
+ end
333
+
334
+ #
335
+ # Pass every XML Stylesheet (XSL) that the agent parses to a given
336
+ # block.
337
+ #
338
+ # @yield [doc]
339
+ # The block will be passed every XML Stylesheet (XSL) parsed.
340
+ #
341
+ # @yieldparam [Nokogiri::XML::Document] doc
342
+ # A parsed XML document.
343
+ #
344
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
345
+ #
346
+ def every_xsl_doc(&block)
347
+ every_page do |page|
348
+ if (block && page.xsl?)
349
+ if (doc = page.doc)
350
+ block.call(doc)
351
+ end
352
+ end
353
+ end
354
+ end
355
+
356
+ #
357
+ # Pass every RSS document that the agent parses to a given block.
358
+ #
359
+ # @yield [doc]
360
+ # The block will be passed every RSS document parsed.
361
+ #
362
+ # @yieldparam [Nokogiri::XML::Document] doc
363
+ # A parsed XML document.
364
+ #
365
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
366
+ #
367
+ def every_rss_doc(&block)
368
+ every_page do |page|
369
+ if (block && page.rss?)
370
+ if (doc = page.doc)
371
+ block.call(doc)
372
+ end
373
+ end
374
+ end
375
+ end
376
+
377
+ #
378
+ # Pass every Atom document that the agent parses to a given block.
379
+ #
380
+ # @yield [doc]
381
+ # The block will be passed every Atom document parsed.
382
+ #
383
+ # @yieldparam [Nokogiri::XML::Document] doc
384
+ # A parsed XML document.
385
+ #
386
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
387
+ #
388
+ def every_atom_doc(&block)
389
+ every_page do |page|
390
+ if (block && page.atom?)
391
+ if (doc = page.doc)
392
+ block.call(doc)
393
+ end
394
+ end
395
+ end
396
+ end
397
+
398
+ #
399
+ # Pass every JavaScript page that the agent visits to a given block.
400
+ #
401
+ # @yield [page]
402
+ # The block will be passed every JavaScript page visited.
403
+ #
404
+ # @yieldparam [Page] page
405
+ # A visited page.
406
+ #
407
+ def every_javascript_page(&block)
408
+ every_page do |page|
409
+ block.call(page) if (block && page.javascript?)
410
+ end
411
+ end
412
+
413
+ #
414
+ # Pass every CSS page that the agent visits to a given block.
415
+ #
416
+ # @yield [page]
417
+ # The block will be passed every CSS page visited.
418
+ #
419
+ # @yieldparam [Page] page
420
+ # A visited page.
421
+ #
422
+ def every_css_page(&block)
423
+ every_page do |page|
424
+ block.call(page) if (block && page.css?)
425
+ end
426
+ end
427
+
428
+ #
429
+ # Pass every RSS feed that the agent visits to a given block.
430
+ #
431
+ # @yield [feed]
432
+ # The block will be passed every RSS feed visited.
433
+ #
434
+ # @yieldparam [Page] feed
435
+ # A visited page.
436
+ #
437
+ def every_rss_page(&block)
438
+ every_page do |page|
439
+ block.call(page) if (block && page.rss?)
440
+ end
441
+ end
442
+
443
+ #
444
+ # Pass every Atom feed that the agent visits to a given block.
445
+ #
446
+ # @yield [feed]
447
+ # The block will be passed every Atom feed visited.
448
+ #
449
+ # @yieldparam [Page] feed
450
+ # A visited page.
451
+ #
452
+ def every_atom_page(&block)
453
+ every_page do |page|
454
+ block.call(page) if (block && page.atom?)
455
+ end
456
+ end
457
+
458
+ #
459
+ # Pass every MS Word page that the agent visits to a given block.
460
+ #
461
+ # @yield [page]
462
+ # The block will be passed every MS Word page visited.
463
+ #
464
+ # @yieldparam [Page] page
465
+ # A visited page.
466
+ #
467
+ def every_ms_word_page(&block)
468
+ every_page do |page|
469
+ block.call(page) if (block && page.ms_word?)
470
+ end
471
+ end
472
+
473
+ #
474
+ # Pass every PDF page that the agent visits to a given block.
475
+ #
476
+ # @yield [page]
477
+ # The block will be passed every PDF page visited.
478
+ #
479
+ # @yieldparam [Page] page
480
+ # A visited page.
481
+ #
482
+ def every_pdf_page(&block)
483
+ every_page do |page|
484
+ block.call(page) if (block && page.pdf?)
485
+ end
486
+ end
487
+
488
+ #
489
+ # Pass every ZIP page that the agent visits to a given block.
490
+ #
491
+ # @yield [page]
492
+ # The block will be passed every ZIP page visited.
493
+ #
494
+ # @yieldparam [Page] page
495
+ # A visited page.
496
+ #
497
+ def every_zip_page(&block)
498
+ every_page do |page|
499
+ block.call(page) if (block && page.zip?)
500
+ end
85
501
  end
86
502
  end
87
503
  end
@@ -173,6 +173,16 @@ module Spidr
173
173
  (content_type =~ /text\/xml/) == 0
174
174
  end
175
175
 
176
+ #
177
+ # Determines if the page is an XML Stylesheet (XSL).
178
+ #
179
+ # @return [Boolean]
180
+ # Specifies whether the page is an XML Stylesheet (XSL).
181
+ #
182
+ def xsl?
183
+ (content_type =~ /text\/xsl/) == 0
184
+ end
185
+
176
186
  #
177
187
  # Determines if the page is JavaScript.
178
188
  #
@@ -261,13 +271,16 @@ module Spidr
261
271
  # Returns +nil+ if the page is not HTML, XML, XSL, RSS or Atom, or if
262
272
  # the page could not be parsed properly.
263
273
  #
274
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
275
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
276
+ #
264
277
  def doc
265
278
  return nil if (body.nil? || body.empty?)
266
279
 
267
280
  begin
268
281
  if html?
269
282
  return @doc ||= Nokogiri::HTML(body)
270
- elsif (xml? || rss? || atom?)
283
+ elsif (xml? || xsl? || rss? || atom?)
271
284
  return @doc ||= Nokogiri::XML(body)
272
285
  end
273
286
  rescue
@@ -373,6 +386,14 @@ module Spidr
373
386
  doc.search('iframe[@src]').each do |iframe|
374
387
  add_url.call(iframe.get_attribute('src'))
375
388
  end
389
+
390
+ doc.search('link[@href]').each do |link|
391
+ add_url.call(link.get_attribute('href'))
392
+ end
393
+
394
+ doc.search('script[@src]').each do |script|
395
+ add_url.call(script.get_attribute('src'))
396
+ end
376
397
  end
377
398
 
378
399
  return urls
@@ -1,4 +1,4 @@
1
1
  module Spidr
2
2
  # Spidr version
3
- VERSION = '0.2.0'
3
+ VERSION = '0.2.1'
4
4
  end
@@ -9,4 +9,4 @@ YARD::Rake::YardocTask.new do |t|
9
9
  ]
10
10
  end
11
11
 
12
- task :docs => :yardoc
12
+ task :docs => :yard
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
30
30
  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2009-10-10 00:00:00 -07:00
33
+ date: 2009-11-25 00:00:00 -08:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -61,7 +61,7 @@ dependencies:
61
61
  requirements:
62
62
  - - ">="
63
63
  - !ruby/object:Gem::Version
64
- version: 0.2.3.5
64
+ version: 0.4.0
65
65
  version:
66
66
  - !ruby/object:Gem::Dependency
67
67
  name: hoe
@@ -154,7 +154,7 @@ files:
154
154
  - static/course/frames/frame_next.html
155
155
  - static/course/specs.json
156
156
  has_rdoc: yard
157
- homepage: http://spidr.rubyforge.org/
157
+ homepage: http://spidr.rubyforge.org
158
158
  licenses: []
159
159
 
160
160
  post_install_message:
metadata.gz.sig CHANGED
Binary file