spidr 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/ChangeLog.md +56 -31
- data/Gemfile +7 -21
- data/LICENSE.txt +1 -2
- data/README.md +7 -6
- data/Rakefile +13 -23
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +1 -1
- data/lib/spidr/agent.rb +21 -6
- data/lib/spidr/auth_store.rb +1 -1
- data/lib/spidr/body.rb +99 -0
- data/lib/spidr/extensions/uri.rb +14 -7
- data/lib/spidr/headers.rb +323 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +32 -536
- data/lib/spidr/sanitizers.rb +3 -3
- data/lib/spidr/session_cache.rb +1 -0
- data/lib/spidr/version.rb +1 -1
- data/spec/actions_spec.rb +6 -8
- data/spec/auth_store_spec.rb +28 -28
- data/spec/cookie_jar_spec.rb +49 -60
- data/spec/extensions/uri_spec.rb +4 -0
- data/spec/filters_spec.rb +8 -0
- data/spec/page_spec.rb +0 -7
- data/spec/rules_spec.rb +8 -6
- data/spec/sanitizers_spec.rb +10 -16
- data/spec/spec_helper.rb +1 -12
- data/spec/spidr_spec.rb +11 -11
- data/spidr.gemspec +11 -110
- metadata +24 -52
- data/.gitignore +0 -9
- data/.specopts +0 -1
- data/Gemfile.lock +0 -39
data/lib/spidr/page.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
require 'spidr/
|
2
|
-
|
3
|
-
require '
|
4
|
-
require 'uri'
|
5
|
-
require 'nokogiri'
|
1
|
+
require 'spidr/headers'
|
2
|
+
require 'spidr/body'
|
3
|
+
require 'spidr/links'
|
6
4
|
|
7
5
|
module Spidr
|
8
6
|
#
|
@@ -10,8 +8,9 @@ module Spidr
|
|
10
8
|
#
|
11
9
|
class Page
|
12
10
|
|
13
|
-
|
14
|
-
|
11
|
+
include Headers
|
12
|
+
include Body
|
13
|
+
include Links
|
15
14
|
|
16
15
|
# URL of the page
|
17
16
|
attr_reader :url
|
@@ -39,26 +38,21 @@ module Spidr
|
|
39
38
|
end
|
40
39
|
|
41
40
|
#
|
42
|
-
# The
|
43
|
-
#
|
44
|
-
# @return [Integer]
|
45
|
-
# Response code from the page.
|
46
|
-
#
|
47
|
-
def code
|
48
|
-
@response.code.to_i
|
49
|
-
end
|
50
|
-
|
41
|
+
# The meta-redirect links of the page.
|
51
42
|
#
|
52
|
-
#
|
43
|
+
# @return [Array<String>]
|
44
|
+
# All meta-redirect links in the page.
|
53
45
|
#
|
54
|
-
# @
|
55
|
-
#
|
46
|
+
# @deprecated
|
47
|
+
# Deprecated in 0.3.0 and will be removed in 0.4.0.
|
48
|
+
# Use {#meta_redirects} instead.
|
56
49
|
#
|
57
|
-
def
|
58
|
-
|
59
|
-
|
50
|
+
def meta_redirect
|
51
|
+
STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
|
52
|
+
STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
|
60
53
|
|
61
|
-
|
54
|
+
meta_redirects
|
55
|
+
end
|
62
56
|
|
63
57
|
#
|
64
58
|
# Determines if the response code is `300`, `301`, `302`, `303`
|
@@ -81,531 +75,33 @@ module Spidr
|
|
81
75
|
|
82
76
|
alias redirect? is_redirect?
|
83
77
|
|
84
|
-
|
85
|
-
# Determines if the response code is `308`.
|
86
|
-
#
|
87
|
-
# @return [Boolean]
|
88
|
-
# Specifies whether the response code is `308`.
|
89
|
-
#
|
90
|
-
def timedout?
|
91
|
-
code == 308
|
92
|
-
end
|
93
|
-
|
94
|
-
#
|
95
|
-
# Determines if the response code is `400`.
|
96
|
-
#
|
97
|
-
# @return [Boolean]
|
98
|
-
# Specifies whether the response code is `400`.
|
99
|
-
#
|
100
|
-
def bad_request?
|
101
|
-
code == 400
|
102
|
-
end
|
103
|
-
|
104
|
-
#
|
105
|
-
# Determines if the response code is `401`.
|
106
|
-
#
|
107
|
-
# @return [Boolean]
|
108
|
-
# Specifies whether the response code is `401`.
|
109
|
-
#
|
110
|
-
def is_unauthorized?
|
111
|
-
code == 401
|
112
|
-
end
|
113
|
-
|
114
|
-
alias unauthorized? is_unauthorized?
|
115
|
-
|
116
|
-
#
|
117
|
-
# Determines if the response code is `403`.
|
118
|
-
#
|
119
|
-
# @return [Boolean]
|
120
|
-
# Specifies whether the response code is `403`.
|
121
|
-
#
|
122
|
-
def is_forbidden?
|
123
|
-
code == 403
|
124
|
-
end
|
125
|
-
|
126
|
-
alias forbidden? is_forbidden?
|
127
|
-
|
128
|
-
#
|
129
|
-
# Determines if the response code is `404`.
|
130
|
-
#
|
131
|
-
# @return [Boolean]
|
132
|
-
# Specifies whether the response code is `404`.
|
133
|
-
#
|
134
|
-
def is_missing?
|
135
|
-
code == 404
|
136
|
-
end
|
137
|
-
|
138
|
-
alias missing? is_missing?
|
139
|
-
|
140
|
-
#
|
141
|
-
# Determines if the response code is `500`.
|
142
|
-
#
|
143
|
-
# @return [Boolean]
|
144
|
-
# Specifies whether the response code is `500`.
|
145
|
-
#
|
146
|
-
def had_internal_server_error?
|
147
|
-
code == 500
|
148
|
-
end
|
149
|
-
|
150
|
-
#
|
151
|
-
# The Content-Type of the page.
|
152
|
-
#
|
153
|
-
# @return [String]
|
154
|
-
# The Content-Type of the page.
|
155
|
-
#
|
156
|
-
def content_type
|
157
|
-
(@response['Content-Type'] || '')
|
158
|
-
end
|
159
|
-
|
160
|
-
#
|
161
|
-
# The content types of the page.
|
162
|
-
#
|
163
|
-
# @return [Array<String>]
|
164
|
-
# The values within the Content-Type header.
|
165
|
-
#
|
166
|
-
# @since 0.2.2
|
167
|
-
#
|
168
|
-
def content_types
|
169
|
-
(@headers['content-type'] || [])
|
170
|
-
end
|
171
|
-
|
172
|
-
#
|
173
|
-
# Determines if the page is plain-text.
|
174
|
-
#
|
175
|
-
# @return [Boolean]
|
176
|
-
# Specifies whether the page is plain-text.
|
177
|
-
#
|
178
|
-
def plain_text?
|
179
|
-
is_content_type?('text/plain')
|
180
|
-
end
|
181
|
-
|
182
|
-
alias txt? plain_text?
|
183
|
-
|
184
|
-
#
|
185
|
-
# Determines if the page is HTML document.
|
186
|
-
#
|
187
|
-
# @return [Boolean]
|
188
|
-
# Specifies whether the page is HTML document.
|
189
|
-
#
|
190
|
-
def html?
|
191
|
-
is_content_type?('text/html')
|
192
|
-
end
|
193
|
-
|
194
|
-
#
|
195
|
-
# Determines if the page is XML document.
|
196
|
-
#
|
197
|
-
# @return [Boolean]
|
198
|
-
# Specifies whether the page is XML document.
|
199
|
-
#
|
200
|
-
def xml?
|
201
|
-
is_content_type?('text/xml')
|
202
|
-
end
|
203
|
-
|
204
|
-
#
|
205
|
-
# Determines if the page is XML Stylesheet (XSL).
|
206
|
-
#
|
207
|
-
# @return [Boolean]
|
208
|
-
# Specifies whether the page is XML Stylesheet (XSL).
|
209
|
-
#
|
210
|
-
def xsl?
|
211
|
-
is_content_type?('text/xsl')
|
212
|
-
end
|
213
|
-
|
214
|
-
#
|
215
|
-
# Determines if the page is JavaScript.
|
216
|
-
#
|
217
|
-
# @return [Boolean]
|
218
|
-
# Specifies whether the page is JavaScript.
|
219
|
-
#
|
220
|
-
def javascript?
|
221
|
-
is_content_type?('text/javascript') || \
|
222
|
-
is_content_type?('application/javascript')
|
223
|
-
end
|
224
|
-
|
225
|
-
#
|
226
|
-
# Determines if the page is a CSS stylesheet.
|
227
|
-
#
|
228
|
-
# @return [Boolean]
|
229
|
-
# Specifies whether the page is a CSS stylesheet.
|
230
|
-
#
|
231
|
-
def css?
|
232
|
-
is_content_type?('text/css')
|
233
|
-
end
|
234
|
-
|
235
|
-
#
|
236
|
-
# Determines if the page is a RSS feed.
|
237
|
-
#
|
238
|
-
# @return [Boolean]
|
239
|
-
# Specifies whether the page is a RSS feed.
|
240
|
-
#
|
241
|
-
def rss?
|
242
|
-
is_content_type?('application/rss+xml') || \
|
243
|
-
is_content_type?('application/rdf+xml')
|
244
|
-
end
|
245
|
-
|
246
|
-
#
|
247
|
-
# Determines if the page is an Atom feed.
|
248
|
-
#
|
249
|
-
# @return [Boolean]
|
250
|
-
# Specifies whether the page is an Atom feed.
|
251
|
-
#
|
252
|
-
def atom?
|
253
|
-
is_content_type?('application/atom+xml')
|
254
|
-
end
|
255
|
-
|
256
|
-
#
|
257
|
-
# Determines if the page is a MS Word document.
|
258
|
-
#
|
259
|
-
# @return [Boolean]
|
260
|
-
# Specifies whether the page is a MS Word document.
|
261
|
-
#
|
262
|
-
def ms_word?
|
263
|
-
is_content_type?('application/msword')
|
264
|
-
end
|
265
|
-
|
266
|
-
#
|
267
|
-
# Determines if the page is a PDF document.
|
268
|
-
#
|
269
|
-
# @return [Boolean]
|
270
|
-
# Specifies whether the page is a PDF document.
|
271
|
-
#
|
272
|
-
def pdf?
|
273
|
-
is_content_type?('application/pdf')
|
274
|
-
end
|
275
|
-
|
276
|
-
#
|
277
|
-
# Determines if the page is a ZIP archive.
|
278
|
-
#
|
279
|
-
# @return [Boolean]
|
280
|
-
# Specifies whether the page is a ZIP archive.
|
281
|
-
#
|
282
|
-
def zip?
|
283
|
-
is_content_type?('application/zip')
|
284
|
-
end
|
285
|
-
|
286
|
-
#
|
287
|
-
# The raw Cookie String sent along with the page.
|
288
|
-
#
|
289
|
-
# @return [String]
|
290
|
-
# The raw Cookie from the response.
|
291
|
-
#
|
292
|
-
# @since 0.2.7
|
293
|
-
#
|
294
|
-
def raw_cookie
|
295
|
-
(@response['Set-Cookie'] || '')
|
296
|
-
end
|
297
|
-
|
298
|
-
#
|
299
|
-
# The raw Cookie String sent along with the page.
|
300
|
-
#
|
301
|
-
# @return [String]
|
302
|
-
# The raw Cookie from the response.
|
303
|
-
#
|
304
|
-
# @deprecated
|
305
|
-
# Deprecated in 0.2.7 and will be removed in 0.3.0.
|
306
|
-
# Use {#raw_cookie} instead.
|
307
|
-
#
|
308
|
-
# @since 0.2.2
|
309
|
-
#
|
310
|
-
def cookie
|
311
|
-
STDERR.puts 'DEPRECATION: Spidr::Page#cookie will be removed in 0.3.0'
|
312
|
-
STDERR.puts 'DEPRECATION: Use Spidr::Page#raw_cookie instead'
|
313
|
-
|
314
|
-
return raw_cookie
|
315
|
-
end
|
316
|
-
|
317
|
-
#
|
318
|
-
# The Cookie values sent along with the page.
|
319
|
-
#
|
320
|
-
# @return [Array<String>]
|
321
|
-
# The Cookies from the response.
|
322
|
-
#
|
323
|
-
# @since 0.2.2
|
324
|
-
#
|
325
|
-
def cookies
|
326
|
-
(@headers['set-cookie'] || [])
|
327
|
-
end
|
328
|
-
|
329
|
-
#
|
330
|
-
# The Cookie key -> value pairs returned with the response.
|
331
|
-
#
|
332
|
-
# @return [Hash{String => String}]
|
333
|
-
# The cookie keys and values.
|
334
|
-
#
|
335
|
-
# @since 0.2.2
|
336
|
-
#
|
337
|
-
def cookie_params
|
338
|
-
params = {}
|
339
|
-
|
340
|
-
cookies.each do |cookie|
|
341
|
-
cookie.split('; ').each do |key_value|
|
342
|
-
key, value = key_value.split('=',2)
|
343
|
-
|
344
|
-
next if RESERVED_COOKIE_NAMES.include?(key)
|
345
|
-
|
346
|
-
params[key] = (value || '')
|
347
|
-
end
|
348
|
-
end
|
349
|
-
|
350
|
-
return params
|
351
|
-
end
|
352
|
-
|
353
|
-
#
|
354
|
-
# The body of the response.
|
355
|
-
#
|
356
|
-
# @return [String]
|
357
|
-
# The body of the response.
|
358
|
-
#
|
359
|
-
def body
|
360
|
-
(@response.body || '')
|
361
|
-
end
|
362
|
-
|
363
|
-
#
|
364
|
-
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
|
365
|
-
#
|
366
|
-
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
367
|
-
# The document that represents HTML or XML pages.
|
368
|
-
# Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
|
369
|
-
# the page could not be parsed properly.
|
370
|
-
#
|
371
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
372
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
373
|
-
#
|
374
|
-
def doc
|
375
|
-
return nil if body.empty?
|
376
|
-
|
377
|
-
begin
|
378
|
-
if html?
|
379
|
-
return @doc ||= Nokogiri::HTML(body)
|
380
|
-
elsif (xml? || xsl? || rss? || atom?)
|
381
|
-
return @doc ||= Nokogiri::XML(body)
|
382
|
-
end
|
383
|
-
rescue
|
384
|
-
return nil
|
385
|
-
end
|
386
|
-
end
|
387
|
-
|
388
|
-
#
|
389
|
-
# Searches the document for XPath or CSS Path paths.
|
390
|
-
#
|
391
|
-
# @param [Array<String>] paths
|
392
|
-
# CSS or XPath expressions to search the document with.
|
393
|
-
#
|
394
|
-
# @return [Array]
|
395
|
-
# The matched nodes from the document.
|
396
|
-
# Returns an empty Array if no nodes were matched, or if the page
|
397
|
-
# is not an HTML or XML document.
|
398
|
-
#
|
399
|
-
# @example
|
400
|
-
# page.search('//a[@href]')
|
401
|
-
#
|
402
|
-
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
|
403
|
-
#
|
404
|
-
def search(*paths)
|
405
|
-
if doc
|
406
|
-
doc.search(*paths)
|
407
|
-
else
|
408
|
-
[]
|
409
|
-
end
|
410
|
-
end
|
78
|
+
protected
|
411
79
|
|
412
80
|
#
|
413
|
-
#
|
414
|
-
#
|
415
|
-
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
416
|
-
# The first matched node. Returns `nil` if no nodes could be matched,
|
417
|
-
# or if the page is not a HTML or XML document.
|
418
|
-
#
|
419
|
-
# @example
|
420
|
-
# page.at('//title')
|
81
|
+
# Provides transparent access to the values in {#headers}.
|
421
82
|
#
|
422
|
-
# @
|
83
|
+
# @param [Symbol] name
|
84
|
+
# The name of the missing method.
|
423
85
|
#
|
424
|
-
|
425
|
-
|
426
|
-
doc.at(*arguments)
|
427
|
-
end
|
428
|
-
end
|
429
|
-
|
430
|
-
alias / search
|
431
|
-
alias % at
|
432
|
-
|
433
|
-
#
|
434
|
-
# The title of the HTML page.
|
86
|
+
# @param [Array] arguments
|
87
|
+
# Additional arguments for the missing method.
|
435
88
|
#
|
436
89
|
# @return [String]
|
437
|
-
# The
|
438
|
-
#
|
439
|
-
def title
|
440
|
-
if (node = at('//title'))
|
441
|
-
node.inner_text
|
442
|
-
end
|
443
|
-
end
|
444
|
-
|
445
|
-
#
|
446
|
-
# The links from within the page.
|
447
|
-
#
|
448
|
-
# @return [Array<String>]
|
449
|
-
# All links within the HTML page, frame/iframe source URLs and any
|
450
|
-
# links in the `Location` header.
|
90
|
+
# The missing method mapped to a header in {#headers}.
|
451
91
|
#
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
add_url = lambda { |url|
|
456
|
-
urls << url unless (url.nil? || url.empty?)
|
457
|
-
}
|
458
|
-
|
459
|
-
self.redirects_to.each(&add_url) if self.is_redirect?
|
460
|
-
|
461
|
-
if (html? && doc)
|
462
|
-
doc.search('a[@href]').each do |a|
|
463
|
-
add_url.call(a.get_attribute('href'))
|
464
|
-
end
|
465
|
-
|
466
|
-
doc.search('frame[@src]').each do |iframe|
|
467
|
-
add_url.call(iframe.get_attribute('src'))
|
468
|
-
end
|
469
|
-
|
470
|
-
doc.search('iframe[@src]').each do |iframe|
|
471
|
-
add_url.call(iframe.get_attribute('src'))
|
472
|
-
end
|
473
|
-
|
474
|
-
doc.search('link[@href]').each do |link|
|
475
|
-
add_url.call(link.get_attribute('href'))
|
476
|
-
end
|
477
|
-
|
478
|
-
doc.search('script[@src]').each do |script|
|
479
|
-
add_url.call(script.get_attribute('src'))
|
480
|
-
end
|
481
|
-
end
|
482
|
-
|
483
|
-
return urls
|
484
|
-
end
|
485
|
-
|
486
|
-
#
|
487
|
-
# URL(s) that this document redirects to.
|
488
|
-
#
|
489
|
-
# @return [Array<String>]
|
490
|
-
# The links that this page redirects to (usually found in a
|
491
|
-
# location header or by way of a page-level meta redirect).
|
492
|
-
#
|
493
|
-
def redirects_to
|
494
|
-
location = @headers['location']
|
495
|
-
|
496
|
-
if location.nil?
|
497
|
-
# check page-level meta redirects if there isn't a location header
|
498
|
-
meta_redirect
|
499
|
-
elsif location.kind_of?(Array)
|
500
|
-
location
|
501
|
-
else
|
502
|
-
# usually the location header contains a single String
|
503
|
-
[location]
|
504
|
-
end
|
505
|
-
end
|
506
|
-
|
507
|
-
#
|
508
|
-
# Absolute URIs from within the page.
|
509
|
-
#
|
510
|
-
# @return [Array<URI::HTTP>]
|
511
|
-
# The links from within the page, converted to absolute URIs.
|
512
|
-
#
|
513
|
-
def urls
|
514
|
-
links.map { |link| to_absolute(link) }.compact
|
515
|
-
end
|
516
|
-
|
517
|
-
#
|
518
|
-
# Normalizes and expands a given link into a proper URI.
|
519
|
-
#
|
520
|
-
# @param [String] link
|
521
|
-
# The link to normalize and expand.
|
522
|
-
#
|
523
|
-
# @return [URI::HTTP]
|
524
|
-
# The normalized URI.
|
525
|
-
#
|
526
|
-
def to_absolute(link)
|
527
|
-
begin
|
528
|
-
url = @url.merge(link.to_s)
|
529
|
-
rescue URI::InvalidURIError, URI::InvalidComponentError
|
530
|
-
return nil
|
531
|
-
end
|
532
|
-
|
533
|
-
unless (url.path.nil? || url.path.empty?)
|
534
|
-
# make sure the path does not contain any .. or . directories,
|
535
|
-
# since URI::Generic#merge cannot normalize paths such as
|
536
|
-
# "/stuff/../"
|
537
|
-
url.path = URI.expand_path(url.path)
|
538
|
-
end
|
539
|
-
|
540
|
-
return url
|
541
|
-
end
|
542
|
-
|
543
|
-
#
|
544
|
-
# Determines if a page-level "soft" redirect is present. If yes,
|
545
|
-
# returns an array of those redirects (usually a single URL).
|
546
|
-
# Otherwise, returns false.
|
547
|
-
#
|
548
|
-
# @return [Array<String>]
|
549
|
-
# An array of redirect URLs
|
92
|
+
# @raise [NoMethodError]
|
93
|
+
# The missing method did not map to a header in {#headers}.
|
550
94
|
#
|
551
|
-
def
|
552
|
-
|
553
|
-
|
554
|
-
if (html? && doc)
|
555
|
-
search('//meta[@http-equiv and @content]').each do |node|
|
556
|
-
if node.get_attribute('http-equiv') =~ /refresh/i
|
557
|
-
content = node.get_attribute('content')
|
95
|
+
def method_missing(name,*arguments,&block)
|
96
|
+
if (arguments.empty? && block.nil?)
|
97
|
+
header_name = name.to_s.sub('_','-')
|
558
98
|
|
559
|
-
|
560
|
-
|
561
|
-
end
|
562
|
-
end
|
99
|
+
if @response.key?(header_name)
|
100
|
+
return @response[header_name]
|
563
101
|
end
|
564
102
|
end
|
565
103
|
|
566
|
-
return
|
567
|
-
end
|
568
|
-
|
569
|
-
#
|
570
|
-
# Returns a boolean indicating whether or not page-level meta
|
571
|
-
# redirects are present in this page.
|
572
|
-
#
|
573
|
-
# @return [Boolean]
|
574
|
-
# Specifies whether the page includes page-level redirects.
|
575
|
-
#
|
576
|
-
def meta_redirect?
|
577
|
-
!meta_redirect.empty?
|
578
|
-
end
|
579
|
-
|
580
|
-
protected
|
581
|
-
|
582
|
-
#
|
583
|
-
# Determines if any of the content-types of the page include a given
|
584
|
-
# type.
|
585
|
-
#
|
586
|
-
# @param [String] type
|
587
|
-
# The content-type to test for.
|
588
|
-
#
|
589
|
-
# @return [Boolean]
|
590
|
-
# Specifies whether the page includes the given content-type.
|
591
|
-
#
|
592
|
-
# @since 0.2.4
|
593
|
-
#
|
594
|
-
def is_content_type?(type)
|
595
|
-
content_types.any? { |content| content.include?(type) }
|
596
|
-
end
|
597
|
-
|
598
|
-
#
|
599
|
-
# Provides transparent access to the values in `headers`.
|
600
|
-
#
|
601
|
-
def method_missing(sym,*args,&block)
|
602
|
-
if (args.empty? && block.nil?)
|
603
|
-
name = sym.id2name.sub('_','-')
|
604
|
-
|
605
|
-
return @response[name] if @response.key?(name)
|
606
|
-
end
|
607
|
-
|
608
|
-
return super(sym,*args,&block)
|
104
|
+
return super(name,*arguments,&block)
|
609
105
|
end
|
610
106
|
|
611
107
|
end
|