spidr 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +31 -0
- data/README.txt +17 -15
- data/Rakefile +1 -1
- data/lib/spidr/agent.rb +2 -0
- data/lib/spidr/events.rb +423 -7
- data/lib/spidr/page.rb +22 -1
- data/lib/spidr/version.rb +1 -1
- data/tasks/yard.rb +1 -1
- metadata +4 -4
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/History.txt
CHANGED
@@ -1,3 +1,34 @@
|
|
1
|
+
=== 0.2.1 / 2009-11-25
|
2
|
+
|
3
|
+
* Added Spidr::Events#every_ok_page.
|
4
|
+
* Added Spidr::Events#every_redirect_page.
|
5
|
+
* Added Spidr::Events#every_timedout_page.
|
6
|
+
* Added Spidr::Events#every_bad_request_page.
|
7
|
+
* Added Spidr::Events#every_unauthorized_page.
|
8
|
+
* Added Spidr::Events#every_forbidden_page.
|
9
|
+
* Added Spidr::Events#every_missing_page.
|
10
|
+
* Added Spidr::Events#every_internal_server_error_page.
|
11
|
+
* Added Spidr::Events#every_txt_page.
|
12
|
+
* Added Spidr::Events#every_html_page.
|
13
|
+
* Added Spidr::Events#every_xml_page.
|
14
|
+
* Added Spidr::Events#every_xsl_page.
|
15
|
+
* Added Spidr::Events#every_doc.
|
16
|
+
* Added Spidr::Events#every_html_doc.
|
17
|
+
* Added Spidr::Events#every_xml_doc.
|
18
|
+
* Added Spidr::Events#every_xsl_doc.
|
19
|
+
* Added Spidr::Events#every_rss_doc.
|
20
|
+
* Added Spidr::Events#every_atom_doc.
|
21
|
+
* Added Spidr::Events#every_javascript_page.
|
22
|
+
* Added Spidr::Events#every_css_page.
|
23
|
+
* Added Spidr::Events#every_rss_page.
|
24
|
+
* Added Spidr::Events#every_atom_page.
|
25
|
+
* Added Spidr::Events#every_ms_word_page.
|
26
|
+
* Added Spidr::Events#every_pdf_page.
|
27
|
+
* Added Spidr::Events#every_zip_page.
|
28
|
+
* Fixed a bug where Spidr::Agent#delay was not being used to delay
|
29
|
+
requesting pages.
|
30
|
+
* Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
|
31
|
+
|
1
32
|
=== 0.2.0 / 2009-10-10
|
2
33
|
|
3
34
|
* Added URI.expand_path.
|
data/README.txt
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
= Spidr
|
2
2
|
|
3
|
-
* http://spidr.rubyforge.org
|
4
|
-
* http://github.com/postmodern/spidr
|
5
|
-
*
|
3
|
+
* http://spidr.rubyforge.org
|
4
|
+
* http://github.com/postmodern/spidr
|
5
|
+
* http://github.com/postmodern/spidr/issues
|
6
|
+
* http://groups.google.com/group/spidr
|
7
|
+
* irc.freenode.net #spidr
|
6
8
|
|
7
9
|
== DESCRIPTION:
|
8
10
|
|
@@ -56,7 +58,7 @@ and easy to use.
|
|
56
58
|
'http://company.com/',
|
57
59
|
:hosts => [
|
58
60
|
'company.com',
|
59
|
-
|
61
|
+
/host\d\.company\.com/
|
60
62
|
]
|
61
63
|
)
|
62
64
|
|
@@ -90,10 +92,10 @@ and easy to use.
|
|
90
92
|
puts "[-] #{page.url}"
|
91
93
|
|
92
94
|
page.search('//meta').each do |meta|
|
93
|
-
|
94
|
-
|
95
|
+
name = (meta.attributes['name'] || meta.attributes['http-equiv'])
|
96
|
+
value = meta.attributes['content']
|
95
97
|
|
96
|
-
|
98
|
+
puts " #{name} = #{value}"
|
97
99
|
end
|
98
100
|
end
|
99
101
|
end
|
@@ -101,8 +103,8 @@ and easy to use.
|
|
101
103
|
* Print out the titles from every page:
|
102
104
|
|
103
105
|
Spidr.site('http://www.rubypulse.com/') do |spider|
|
104
|
-
spider.
|
105
|
-
puts page.title
|
106
|
+
spider.every_html_page do |page|
|
107
|
+
puts page.title
|
106
108
|
end
|
107
109
|
end
|
108
110
|
|
@@ -119,16 +121,16 @@ and easy to use.
|
|
119
121
|
* Pause the spider on a forbidden page:
|
120
122
|
|
121
123
|
spider = Spidr.host('overnight.startup.com') do |spider|
|
122
|
-
spider.
|
123
|
-
spider.pause!
|
124
|
+
spider.every_forbidden_page do |page|
|
125
|
+
spider.pause!
|
124
126
|
end
|
125
127
|
end
|
126
128
|
|
127
129
|
* Skip the processing of a page:
|
128
130
|
|
129
131
|
Spidr.host('sketchy.content.com') do |spider|
|
130
|
-
spider.
|
131
|
-
spider.skip_page!
|
132
|
+
spider.every_missing_page do |page|
|
133
|
+
spider.skip_page!
|
132
134
|
end
|
133
135
|
end
|
134
136
|
|
@@ -137,8 +139,8 @@ and easy to use.
|
|
137
139
|
Spidr.host('sketchy.content.com') do |spider|
|
138
140
|
spider.every_url do |url|
|
139
141
|
if url.path.split('/').find { |dir| dir.to_i > 1000 }
|
140
|
-
|
141
|
-
|
142
|
+
spider.skip_link!
|
143
|
+
end
|
142
144
|
end
|
143
145
|
end
|
144
146
|
|
data/Rakefile
CHANGED
data/lib/spidr/agent.rb
CHANGED
data/lib/spidr/events.rb
CHANGED
@@ -56,6 +56,20 @@ module Spidr
|
|
56
56
|
return self
|
57
57
|
end
|
58
58
|
|
59
|
+
#
|
60
|
+
# Pass the headers from every response the agent receives to a given
|
61
|
+
# block.
|
62
|
+
#
|
63
|
+
# @yield [headers]
|
64
|
+
# The block will be passed the headers of every response.
|
65
|
+
#
|
66
|
+
# @yieldparam [Hash] headers
|
67
|
+
# The headers from a response.
|
68
|
+
#
|
69
|
+
def all_headers(&block)
|
70
|
+
every_page { |page| block.call(page.headers) }
|
71
|
+
end
|
72
|
+
|
59
73
|
#
|
60
74
|
# Pass every page that the agent visits to a given block.
|
61
75
|
#
|
@@ -71,17 +85,419 @@ module Spidr
|
|
71
85
|
end
|
72
86
|
|
73
87
|
#
|
74
|
-
# Pass
|
88
|
+
# Pass every OK page that the agent visits to a given block.
|
89
|
+
#
|
90
|
+
# @yield [page]
|
91
|
+
# The block will be passed every OK page visited.
|
92
|
+
#
|
93
|
+
# @yieldparam [Page] page
|
94
|
+
# A visited page.
|
95
|
+
#
|
96
|
+
def every_ok_page(&block)
|
97
|
+
every_page do |page|
|
98
|
+
block.call(page) if (block && page.ok?)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
#
|
103
|
+
# Pass every Redirect page that the agent visits to a given block.
|
104
|
+
#
|
105
|
+
# @yield [page]
|
106
|
+
# The block will be passed every Redirect page visited.
|
107
|
+
#
|
108
|
+
# @yieldparam [Page] page
|
109
|
+
# A visited page.
|
110
|
+
#
|
111
|
+
def every_redirect_page(&block)
|
112
|
+
every_page do |page|
|
113
|
+
block.call(page) if (block && page.redirect?)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
#
|
118
|
+
# Pass every Timeout page that the agent visits to a given block.
|
119
|
+
#
|
120
|
+
# @yield [page]
|
121
|
+
# The block will be passed every Timeout page visited.
|
122
|
+
#
|
123
|
+
# @yieldparam [Page] page
|
124
|
+
# A visited page.
|
125
|
+
#
|
126
|
+
def every_timedout_page(&block)
|
127
|
+
every_page do |page|
|
128
|
+
block.call(page) if (block && page.timedout?)
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
#
|
133
|
+
# Pass every Bad Request page that the agent visits to a given block.
|
134
|
+
#
|
135
|
+
# @yield [page]
|
136
|
+
# The block will be passed every Bad Request page visited.
|
137
|
+
#
|
138
|
+
# @yieldparam [Page] page
|
139
|
+
# A visited page.
|
140
|
+
#
|
141
|
+
def every_bad_request_page(&block)
|
142
|
+
every_page do |page|
|
143
|
+
block.call(page) if (block && page.bad_request?)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
#
|
148
|
+
# Pass every Unauthorized page that the agent visits to a given block.
|
149
|
+
#
|
150
|
+
# @yield [page]
|
151
|
+
# The block will be passed every Unauthorized page visited.
|
152
|
+
#
|
153
|
+
# @yieldparam [Page] page
|
154
|
+
# A visited page.
|
155
|
+
#
|
156
|
+
def every_unauthorized_page(&block)
|
157
|
+
every_page do |page|
|
158
|
+
block.call(page) if (block && page.unauthorized?)
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
#
|
163
|
+
# Pass every Forbidden page that the agent visits to a given block.
|
164
|
+
#
|
165
|
+
# @yield [page]
|
166
|
+
# The block will be passed every Forbidden page visited.
|
167
|
+
#
|
168
|
+
# @yieldparam [Page] page
|
169
|
+
# A visited page.
|
170
|
+
#
|
171
|
+
def every_forbidden_page(&block)
|
172
|
+
every_page do |page|
|
173
|
+
block.call(page) if (block && page.forbidden?)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
#
|
178
|
+
# Pass every Missing page that the agent visits to a given block.
|
179
|
+
#
|
180
|
+
# @yield [page]
|
181
|
+
# The block will be passed every Missing page visited.
|
182
|
+
#
|
183
|
+
# @yieldparam [Page] page
|
184
|
+
# A visited page.
|
185
|
+
#
|
186
|
+
def every_missing_page(&block)
|
187
|
+
every_page do |page|
|
188
|
+
block.call(page) if (block && page.missing?)
|
189
|
+
end
|
190
|
+
end
|
191
|
+
|
192
|
+
#
|
193
|
+
# Pass every Internal Server Error page that the agent visits to a
|
194
|
+
# given block.
|
195
|
+
#
|
196
|
+
# @yield [page]
|
197
|
+
# The block will be passed every Internal Server Error page visited.
|
198
|
+
#
|
199
|
+
# @yieldparam [Page] page
|
200
|
+
# A visited page.
|
201
|
+
#
|
202
|
+
def every_internal_server_error_page(&block)
|
203
|
+
every_page do |page|
|
204
|
+
block.call(page) if (block && page.had_internal_server_error?)
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
#
|
209
|
+
# Pass every Plain Text page that the agent visits to a given block.
|
210
|
+
#
|
211
|
+
# @yield [page]
|
212
|
+
# The block will be passed every Plain Text page visited.
|
213
|
+
#
|
214
|
+
# @yieldparam [Page] page
|
215
|
+
# A visited page.
|
216
|
+
#
|
217
|
+
def every_txt_page(&block)
|
218
|
+
every_page do |page|
|
219
|
+
block.call(page) if (block && page.txt?)
|
220
|
+
end
|
221
|
+
end
|
222
|
+
|
223
|
+
#
|
224
|
+
# Pass every HTML page that the agent visits to a given block.
|
225
|
+
#
|
226
|
+
# @yield [page]
|
227
|
+
# The block will be passed every HTML page visited.
|
228
|
+
#
|
229
|
+
# @yieldparam [Page] page
|
230
|
+
# A visited page.
|
231
|
+
#
|
232
|
+
def every_html_page(&block)
|
233
|
+
every_page do |page|
|
234
|
+
block.call(page) if (block && page.html?)
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
#
|
239
|
+
# Pass every XML page that the agent visits to a given block.
|
240
|
+
#
|
241
|
+
# @yield [page]
|
242
|
+
# The block will be passed every XML page visited.
|
243
|
+
#
|
244
|
+
# @yieldparam [Page] page
|
245
|
+
# A visited page.
|
246
|
+
#
|
247
|
+
def every_xml_page(&block)
|
248
|
+
every_page do |page|
|
249
|
+
block.call(page) if (block && page.xml?)
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
#
|
254
|
+
# Pass every XML Stylesheet (XSL) page that the agent visits to a
|
255
|
+
# given block.
|
256
|
+
#
|
257
|
+
# @yield [page]
|
258
|
+
# The block will be passed every XML Stylesheet (XSL) page visited.
|
259
|
+
#
|
260
|
+
# @yieldparam [Page] page
|
261
|
+
# A visited page.
|
262
|
+
#
|
263
|
+
def every_xsl_page(&block)
|
264
|
+
every_page do |page|
|
265
|
+
block.call(page) if (block && page.xsl?)
|
266
|
+
end
|
267
|
+
end
|
268
|
+
|
269
|
+
#
|
270
|
+
# Pass every HTML or XML document that the agent parses to a given
|
75
271
|
# block.
|
76
272
|
#
|
77
|
-
# @yield [
|
78
|
-
# The block will be passed
|
273
|
+
# @yield [doc]
|
274
|
+
# The block will be passed every HTML or XML document parsed.
|
79
275
|
#
|
80
|
-
# @yieldparam [
|
81
|
-
#
|
276
|
+
# @yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc
|
277
|
+
# A parsed HTML or XML document.
|
82
278
|
#
|
83
|
-
|
84
|
-
|
279
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
280
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
281
|
+
#
|
282
|
+
def every_doc(&block)
|
283
|
+
every_page do |page|
|
284
|
+
if block
|
285
|
+
if (doc = page.doc)
|
286
|
+
block.call(doc)
|
287
|
+
end
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
#
|
293
|
+
# Pass every HTML document that the agent parses to a given block.
|
294
|
+
#
|
295
|
+
# @yield [doc]
|
296
|
+
# The block will be passed every HTML document parsed.
|
297
|
+
#
|
298
|
+
# @yieldparam [Nokogiri::HTML::Document] doc
|
299
|
+
# A parsed HTML document.
|
300
|
+
#
|
301
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
302
|
+
#
|
303
|
+
def every_html_doc(&block)
|
304
|
+
every_page do |page|
|
305
|
+
if (block && page.html?)
|
306
|
+
if (doc = page.doc)
|
307
|
+
block.call(doc)
|
308
|
+
end
|
309
|
+
end
|
310
|
+
end
|
311
|
+
end
|
312
|
+
|
313
|
+
#
|
314
|
+
# Pass every XML document that the agent parses to a given block.
|
315
|
+
#
|
316
|
+
# @yield [doc]
|
317
|
+
# The block will be passed every XML document parsed.
|
318
|
+
#
|
319
|
+
# @yieldparam [Nokogiri::XML::Document] doc
|
320
|
+
# A parsed XML document.
|
321
|
+
#
|
322
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
323
|
+
#
|
324
|
+
def every_xml_doc(&block)
|
325
|
+
every_page do |page|
|
326
|
+
if (block && page.xml?)
|
327
|
+
if (doc = page.doc)
|
328
|
+
block.call(doc)
|
329
|
+
end
|
330
|
+
end
|
331
|
+
end
|
332
|
+
end
|
333
|
+
|
334
|
+
#
|
335
|
+
# Pass every XML Stylesheet (XSL) that the agent parses to a given
|
336
|
+
# block.
|
337
|
+
#
|
338
|
+
# @yield [doc]
|
339
|
+
# The block will be passed every XML Stylesheet (XSL) parsed.
|
340
|
+
#
|
341
|
+
# @yieldparam [Nokogiri::XML::Document] doc
|
342
|
+
# A parsed XML document.
|
343
|
+
#
|
344
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
345
|
+
#
|
346
|
+
def every_xsl_doc(&block)
|
347
|
+
every_page do |page|
|
348
|
+
if (block && page.xsl?)
|
349
|
+
if (doc = page.doc)
|
350
|
+
block.call(doc)
|
351
|
+
end
|
352
|
+
end
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
#
|
357
|
+
# Pass every RSS document that the agent parses to a given block.
|
358
|
+
#
|
359
|
+
# @yield [doc]
|
360
|
+
# The block will be passed every RSS document parsed.
|
361
|
+
#
|
362
|
+
# @yieldparam [Nokogiri::XML::Document] doc
|
363
|
+
# A parsed XML document.
|
364
|
+
#
|
365
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
366
|
+
#
|
367
|
+
def every_rss_doc(&block)
|
368
|
+
every_page do |page|
|
369
|
+
if (block && page.rss?)
|
370
|
+
if (doc = page.doc)
|
371
|
+
block.call(doc)
|
372
|
+
end
|
373
|
+
end
|
374
|
+
end
|
375
|
+
end
|
376
|
+
|
377
|
+
#
|
378
|
+
# Pass every Atom document that the agent parses to a given block.
|
379
|
+
#
|
380
|
+
# @yield [doc]
|
381
|
+
# The block will be passed every Atom document parsed.
|
382
|
+
#
|
383
|
+
# @yieldparam [Nokogiri::XML::Document] doc
|
384
|
+
# A parsed XML document.
|
385
|
+
#
|
386
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
387
|
+
#
|
388
|
+
def every_atom_doc(&block)
|
389
|
+
every_page do |page|
|
390
|
+
if (block && page.atom?)
|
391
|
+
if (doc = page.doc)
|
392
|
+
block.call(doc)
|
393
|
+
end
|
394
|
+
end
|
395
|
+
end
|
396
|
+
end
|
397
|
+
|
398
|
+
#
|
399
|
+
# Pass every JavaScript page that the agent visits to a given block.
|
400
|
+
#
|
401
|
+
# @yield [page]
|
402
|
+
# The block will be passed every JavaScript page visited.
|
403
|
+
#
|
404
|
+
# @yieldparam [Page] page
|
405
|
+
# A visited page.
|
406
|
+
#
|
407
|
+
def every_javascript_page(&block)
|
408
|
+
every_page do |page|
|
409
|
+
block.call(page) if (block && page.javascript?)
|
410
|
+
end
|
411
|
+
end
|
412
|
+
|
413
|
+
#
|
414
|
+
# Pass every CSS page that the agent visits to a given block.
|
415
|
+
#
|
416
|
+
# @yield [page]
|
417
|
+
# The block will be passed every CSS page visited.
|
418
|
+
#
|
419
|
+
# @yieldparam [Page] page
|
420
|
+
# A visited page.
|
421
|
+
#
|
422
|
+
def every_css_page(&block)
|
423
|
+
every_page do |page|
|
424
|
+
block.call(page) if (block && page.css?)
|
425
|
+
end
|
426
|
+
end
|
427
|
+
|
428
|
+
#
|
429
|
+
# Pass every RSS feed that the agent visits to a given block.
|
430
|
+
#
|
431
|
+
# @yield [feed]
|
432
|
+
# The block will be passed every RSS feed visited.
|
433
|
+
#
|
434
|
+
# @yieldparam [Page] feed
|
435
|
+
# A visited page.
|
436
|
+
#
|
437
|
+
def every_rss_page(&block)
|
438
|
+
every_page do |page|
|
439
|
+
block.call(page) if (block && page.rss?)
|
440
|
+
end
|
441
|
+
end
|
442
|
+
|
443
|
+
#
|
444
|
+
# Pass every Atom feed that the agent visits to a given block.
|
445
|
+
#
|
446
|
+
# @yield [feed]
|
447
|
+
# The block will be passed every Atom feed visited.
|
448
|
+
#
|
449
|
+
# @yieldparam [Page] feed
|
450
|
+
# A visited page.
|
451
|
+
#
|
452
|
+
def every_atom_page(&block)
|
453
|
+
every_page do |page|
|
454
|
+
block.call(page) if (block && page.atom?)
|
455
|
+
end
|
456
|
+
end
|
457
|
+
|
458
|
+
#
|
459
|
+
# Pass every MS Word page that the agent visits to a given block.
|
460
|
+
#
|
461
|
+
# @yield [page]
|
462
|
+
# The block will be passed every MS Word page visited.
|
463
|
+
#
|
464
|
+
# @yieldparam [Page] page
|
465
|
+
# A visited page.
|
466
|
+
#
|
467
|
+
def every_ms_word_page(&block)
|
468
|
+
every_page do |page|
|
469
|
+
block.call(page) if (block && page.ms_word?)
|
470
|
+
end
|
471
|
+
end
|
472
|
+
|
473
|
+
#
|
474
|
+
# Pass every PDF page that the agent visits to a given block.
|
475
|
+
#
|
476
|
+
# @yield [page]
|
477
|
+
# The block will be passed every PDF page visited.
|
478
|
+
#
|
479
|
+
# @yieldparam [Page] page
|
480
|
+
# A visited page.
|
481
|
+
#
|
482
|
+
def every_pdf_page(&block)
|
483
|
+
every_page do |page|
|
484
|
+
block.call(page) if (block && page.pdf?)
|
485
|
+
end
|
486
|
+
end
|
487
|
+
|
488
|
+
#
|
489
|
+
# Pass every ZIP page that the agent visits to a given block.
|
490
|
+
#
|
491
|
+
# @yield [page]
|
492
|
+
# The block will be passed every ZIP page visited.
|
493
|
+
#
|
494
|
+
# @yieldparam [Page] page
|
495
|
+
# A visited page.
|
496
|
+
#
|
497
|
+
def every_zip_page(&block)
|
498
|
+
every_page do |page|
|
499
|
+
block.call(page) if (block && page.zip?)
|
500
|
+
end
|
85
501
|
end
|
86
502
|
end
|
87
503
|
end
|
data/lib/spidr/page.rb
CHANGED
@@ -173,6 +173,16 @@ module Spidr
|
|
173
173
|
(content_type =~ /text\/xml/) == 0
|
174
174
|
end
|
175
175
|
|
176
|
+
#
|
177
|
+
# Determines if the page is an XML Stylesheet (XSL).
|
178
|
+
#
|
179
|
+
# @return [Boolean]
|
180
|
+
# Specifies whether the page is an XML Stylesheet (XSL).
|
181
|
+
#
|
182
|
+
def xsl?
|
183
|
+
(content_type =~ /text\/xsl/) == 0
|
184
|
+
end
|
185
|
+
|
176
186
|
#
|
177
187
|
# Determines if the page is JavaScript.
|
178
188
|
#
|
@@ -261,13 +271,16 @@ module Spidr
|
|
261
271
|
# Returns +nil+ if the page is not HTML, XML, XSL, RSS or Atom, or if
|
262
272
|
# the page could not be parsed properly.
|
263
273
|
#
|
274
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
275
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
276
|
+
#
|
264
277
|
def doc
|
265
278
|
return nil if (body.nil? || body.empty?)
|
266
279
|
|
267
280
|
begin
|
268
281
|
if html?
|
269
282
|
return @doc ||= Nokogiri::HTML(body)
|
270
|
-
elsif (xml? || rss? || atom?)
|
283
|
+
elsif (xml? || xsl? || rss? || atom?)
|
271
284
|
return @doc ||= Nokogiri::XML(body)
|
272
285
|
end
|
273
286
|
rescue
|
@@ -373,6 +386,14 @@ module Spidr
|
|
373
386
|
doc.search('iframe[@src]').each do |iframe|
|
374
387
|
add_url.call(iframe.get_attribute('src'))
|
375
388
|
end
|
389
|
+
|
390
|
+
doc.search('link[@href]').each do |link|
|
391
|
+
add_url.call(link.get_attribute('href'))
|
392
|
+
end
|
393
|
+
|
394
|
+
doc.search('script[@src]').each do |script|
|
395
|
+
add_url.call(script.get_attribute('src'))
|
396
|
+
end
|
376
397
|
end
|
377
398
|
|
378
399
|
return urls
|
data/lib/spidr/version.rb
CHANGED
data/tasks/yard.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2009-
|
33
|
+
date: 2009-11-25 00:00:00 -08:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -61,7 +61,7 @@ dependencies:
|
|
61
61
|
requirements:
|
62
62
|
- - ">="
|
63
63
|
- !ruby/object:Gem::Version
|
64
|
-
version: 0.
|
64
|
+
version: 0.4.0
|
65
65
|
version:
|
66
66
|
- !ruby/object:Gem::Dependency
|
67
67
|
name: hoe
|
@@ -154,7 +154,7 @@ files:
|
|
154
154
|
- static/course/frames/frame_next.html
|
155
155
|
- static/course/specs.json
|
156
156
|
has_rdoc: yard
|
157
|
-
homepage: http://spidr.rubyforge.org
|
157
|
+
homepage: http://spidr.rubyforge.org
|
158
158
|
licenses: []
|
159
159
|
|
160
160
|
post_install_message:
|
metadata.gz.sig
CHANGED
Binary file
|