spidr 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
data/lib/spidr/page.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'spidr/extensions/uri'
|
2
|
+
|
1
3
|
require 'uri'
|
2
4
|
require 'nokogiri'
|
3
5
|
|
@@ -10,15 +12,17 @@ module Spidr
|
|
10
12
|
# HTTP Response
|
11
13
|
attr_reader :response
|
12
14
|
|
13
|
-
# Body returned for the page
|
14
|
-
attr_reader :body
|
15
|
-
|
16
15
|
# Headers returned with the body
|
17
16
|
attr_reader :headers
|
18
17
|
|
19
18
|
#
|
20
|
-
# Creates a new Page object
|
21
|
-
#
|
19
|
+
# Creates a new Page object.
|
20
|
+
#
|
21
|
+
# @param [URI::HTTP] url
|
22
|
+
# The URL of the page.
|
23
|
+
#
|
24
|
+
# @param [Net::HTTP::Response] response
|
25
|
+
# The response from the request for the page.
|
22
26
|
#
|
23
27
|
def initialize(url,response)
|
24
28
|
@url = url
|
@@ -28,169 +32,234 @@ module Spidr
|
|
28
32
|
end
|
29
33
|
|
30
34
|
#
|
31
|
-
#
|
35
|
+
# The response code from the page.
|
36
|
+
#
|
37
|
+
# @return [Integer]
|
38
|
+
# Response code from the page.
|
32
39
|
#
|
33
40
|
def code
|
34
|
-
@response.code
|
41
|
+
@response.code.to_i
|
35
42
|
end
|
36
43
|
|
37
44
|
#
|
38
|
-
#
|
45
|
+
# Determines if the response code is +200+.
|
46
|
+
#
|
47
|
+
# @return [Boolean]
|
48
|
+
# Specifies whether the response code is +200+.
|
39
49
|
#
|
40
50
|
def is_ok?
|
41
51
|
code == 200
|
42
52
|
end
|
43
53
|
|
54
|
+
alias ok? is_ok?
|
55
|
+
|
56
|
+
#
|
57
|
+
# Determines if the response code is +301+ or +307+.
|
44
58
|
#
|
45
|
-
#
|
46
|
-
#
|
59
|
+
# @return [Boolean]
|
60
|
+
# Specifies whether the response code is +301+ or +307+.
|
47
61
|
#
|
48
62
|
def is_redirect?
|
49
63
|
(code == 301 || code == 307)
|
50
64
|
end
|
51
65
|
|
66
|
+
alias redirect? is_redirect?
|
67
|
+
|
68
|
+
#
|
69
|
+
# Determines if the response code is +308+.
|
52
70
|
#
|
53
|
-
#
|
71
|
+
# @return [Boolean]
|
72
|
+
# Specifies whether the response code is +308+.
|
54
73
|
#
|
55
74
|
def timedout?
|
56
75
|
code == 308
|
57
76
|
end
|
58
77
|
|
59
78
|
#
|
60
|
-
#
|
79
|
+
# Determines if the response code is +400+.
|
80
|
+
#
|
81
|
+
# @return [Boolean]
|
82
|
+
# Specifies whether the response code is +400+.
|
61
83
|
#
|
62
84
|
def bad_request?
|
63
85
|
code == 400
|
64
86
|
end
|
65
87
|
|
66
88
|
#
|
67
|
-
#
|
89
|
+
# Determines if the response code is +401+.
|
90
|
+
#
|
91
|
+
# @return [Boolean]
|
92
|
+
# Specifies whether the response code is +401+.
|
68
93
|
#
|
69
94
|
def is_unauthorized?
|
70
95
|
code == 401
|
71
96
|
end
|
72
97
|
|
98
|
+
alias unauthorized? is_unauthorized?
|
99
|
+
|
73
100
|
#
|
74
|
-
#
|
101
|
+
# Determines if the response code is +403+.
|
102
|
+
#
|
103
|
+
# @return [Boolean]
|
104
|
+
# Specifies whether the response code is +403+.
|
75
105
|
#
|
76
106
|
def is_forbidden?
|
77
107
|
code == 403
|
78
108
|
end
|
79
109
|
|
110
|
+
alias forbidden? is_forbidden?
|
111
|
+
|
80
112
|
#
|
81
|
-
#
|
113
|
+
# Determines if the response code is +404+.
|
114
|
+
#
|
115
|
+
# @return [Boolean]
|
116
|
+
# Specifies whether the response code is +404+.
|
82
117
|
#
|
83
118
|
def is_missing?
|
84
119
|
code == 404
|
85
120
|
end
|
86
121
|
|
122
|
+
alias missing? is_missing?
|
123
|
+
|
87
124
|
#
|
88
|
-
#
|
125
|
+
# Determines if the response code is +500+.
|
126
|
+
#
|
127
|
+
# @return [Boolean]
|
128
|
+
# Specifies whether the response code is +500+.
|
89
129
|
#
|
90
130
|
def had_internal_server_error?
|
91
131
|
code == 500
|
92
132
|
end
|
93
133
|
|
94
134
|
#
|
95
|
-
#
|
135
|
+
# The Content-Type of the page.
|
136
|
+
#
|
137
|
+
# @return [String]
|
138
|
+
# The Content-Type of the page.
|
96
139
|
#
|
97
140
|
def content_type
|
98
141
|
@response['Content-Type']
|
99
142
|
end
|
100
143
|
|
101
144
|
#
|
102
|
-
#
|
103
|
-
#
|
145
|
+
# Determines if the page is plain-text.
|
146
|
+
#
|
147
|
+
# @return [Boolean]
|
148
|
+
# Specifies whether the page is plain-text.
|
104
149
|
#
|
105
150
|
def plain_text?
|
106
151
|
(content_type =~ /text\/plain/) == 0
|
107
152
|
end
|
108
153
|
|
154
|
+
alias txt? plain_text?
|
155
|
+
|
156
|
+
#
|
157
|
+
# Determines if the page is an HTML document.
|
109
158
|
#
|
110
|
-
#
|
111
|
-
#
|
159
|
+
# @return [Boolean]
|
160
|
+
# Specifies whether the page is an HTML document.
|
112
161
|
#
|
113
162
|
def html?
|
114
163
|
(content_type =~ /text\/html/) == 0
|
115
164
|
end
|
116
165
|
|
117
166
|
#
|
118
|
-
#
|
119
|
-
#
|
167
|
+
# Determines if the page is an XML document.
|
168
|
+
#
|
169
|
+
# @return [Boolean]
|
170
|
+
# Specifies whether the page is an XML document.
|
120
171
|
#
|
121
172
|
def xml?
|
122
173
|
(content_type =~ /text\/xml/) == 0
|
123
174
|
end
|
124
175
|
|
125
176
|
#
|
126
|
-
#
|
127
|
-
#
|
177
|
+
# Determines if the page is JavaScript.
|
178
|
+
#
|
179
|
+
# @return [Boolean]
|
180
|
+
# Specifies whether the page is JavaScript.
|
128
181
|
#
|
129
182
|
def javascript?
|
130
183
|
(content_type =~ /(text|application)\/javascript/) == 0
|
131
184
|
end
|
132
185
|
|
133
186
|
#
|
134
|
-
#
|
135
|
-
#
|
187
|
+
# Determines if the page is a CSS stylesheet.
|
188
|
+
#
|
189
|
+
# @return [Boolean]
|
190
|
+
# Specifies whether the page is a CSS stylesheet.
|
136
191
|
#
|
137
192
|
def css?
|
138
193
|
(content_type =~ /text\/css/) == 0
|
139
194
|
end
|
140
195
|
|
141
196
|
#
|
142
|
-
#
|
143
|
-
#
|
197
|
+
# Determines if the page is an RSS feed.
|
198
|
+
#
|
199
|
+
# @return [Boolean]
|
200
|
+
# Specifies whether the page is an RSS feed.
|
144
201
|
#
|
145
202
|
def rss?
|
146
203
|
(content_type =~ /application\/(rss|rdf)\+xml/) == 0
|
147
204
|
end
|
148
205
|
|
149
206
|
#
|
150
|
-
#
|
151
|
-
#
|
207
|
+
# Determines if the page is an Atom feed.
|
208
|
+
#
|
209
|
+
# @return [Boolean]
|
210
|
+
# Specifies whether the page is an Atom feed.
|
152
211
|
#
|
153
212
|
def atom?
|
154
213
|
(content_type =~ /application\/atom\+xml/) == 0
|
155
214
|
end
|
156
215
|
|
157
216
|
#
|
158
|
-
#
|
159
|
-
#
|
217
|
+
# Determines if the page is an MS Word document.
|
218
|
+
#
|
219
|
+
# @return [Boolean]
|
220
|
+
# Specifies whether the page is an MS Word document.
|
160
221
|
#
|
161
222
|
def ms_word?
|
162
223
|
(content_type =~ /application\/msword/) == 0
|
163
224
|
end
|
164
225
|
|
165
226
|
#
|
166
|
-
#
|
167
|
-
#
|
227
|
+
# Determines if the page is a PDF document.
|
228
|
+
#
|
229
|
+
# @return [Boolean]
|
230
|
+
# Specifies whether the page is a PDF document.
|
168
231
|
#
|
169
232
|
def pdf?
|
170
233
|
(content_type =~ /application\/pdf/) == 0
|
171
234
|
end
|
172
235
|
|
173
236
|
#
|
174
|
-
#
|
175
|
-
#
|
237
|
+
# Determines if the page is a ZIP archive.
|
238
|
+
#
|
239
|
+
# @return [Boolean]
|
240
|
+
# Specifies whether the page is a ZIP archive.
|
176
241
|
#
|
177
242
|
def zip?
|
178
243
|
(content_type =~ /application\/zip/) == 0
|
179
244
|
end
|
180
245
|
|
181
246
|
#
|
182
|
-
#
|
247
|
+
# The body of the response.
|
248
|
+
#
|
249
|
+
# @return [String]
|
250
|
+
# The body of the response.
|
183
251
|
#
|
184
252
|
def body
|
185
253
|
@response.body
|
186
254
|
end
|
187
255
|
|
188
256
|
#
|
189
|
-
#
|
190
|
-
#
|
191
|
-
#
|
192
|
-
#
|
193
|
-
#
|
257
|
+
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
|
258
|
+
#
|
259
|
+
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
260
|
+
# The document that represents HTML or XML pages.
|
261
|
+
# Returns +nil+ if the page is not HTML, XML, RSS or Atom, or if
|
262
|
+
# the page could not be parsed properly.
|
194
263
|
#
|
195
264
|
def doc
|
196
265
|
return nil if (body.nil? || body.empty?)
|
@@ -198,7 +267,7 @@ module Spidr
|
|
198
267
|
begin
|
199
268
|
if html?
|
200
269
|
return @doc ||= Nokogiri::HTML(body)
|
201
|
-
elsif xml?
|
270
|
+
elsif (xml? || rss? || atom?)
|
202
271
|
return @doc ||= Nokogiri::XML(body)
|
203
272
|
end
|
204
273
|
rescue
|
@@ -207,7 +276,70 @@ module Spidr
|
|
207
276
|
end
|
208
277
|
|
209
278
|
#
|
210
|
-
#
|
279
|
+
# Searches the document using XPath or CSS Path expressions.
|
280
|
+
#
|
281
|
+
# @param [Array<String>] paths
|
282
|
+
# CSS or XPath expressions to search the document with.
|
283
|
+
#
|
284
|
+
# @return [Array]
|
285
|
+
# The matched nodes from the document.
|
286
|
+
# Returns an empty Array if no nodes were matched, or if the page
|
287
|
+
# is not an HTML or XML document.
|
288
|
+
#
|
289
|
+
# @example
|
290
|
+
# page.search('//a[@href]')
|
291
|
+
#
|
292
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
|
293
|
+
#
|
294
|
+
def search(*paths)
|
295
|
+
if doc
|
296
|
+
return doc.search(*paths)
|
297
|
+
end
|
298
|
+
|
299
|
+
return []
|
300
|
+
end
|
301
|
+
|
302
|
+
#
|
303
|
+
# Searches for the first occurrence of an XPath or CSS Path expression.
|
304
|
+
#
|
305
|
+
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
306
|
+
# The first matched node. Returns +nil+ if no nodes could be matched,
|
307
|
+
# or if the page is not an HTML or XML document.
|
308
|
+
#
|
309
|
+
# @example
|
310
|
+
# page.at('//title')
|
311
|
+
#
|
312
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
|
313
|
+
#
|
314
|
+
def at(*arguments)
|
315
|
+
if doc
|
316
|
+
return doc.at(*arguments)
|
317
|
+
end
|
318
|
+
|
319
|
+
return nil
|
320
|
+
end
|
321
|
+
|
322
|
+
alias / search
|
323
|
+
alias % at
|
324
|
+
|
325
|
+
#
|
326
|
+
# The title of the HTML page.
|
327
|
+
#
|
328
|
+
# @return [String]
|
329
|
+
# The inner-text of the title element of the page.
|
330
|
+
#
|
331
|
+
def title
|
332
|
+
if (node = at('//title'))
|
333
|
+
return node.inner_text
|
334
|
+
end
|
335
|
+
end
|
336
|
+
|
337
|
+
#
|
338
|
+
# The links from within the page.
|
339
|
+
#
|
340
|
+
# @return [Array<String>]
|
341
|
+
# All links within the HTML page, frame/iframe source URLs and any
|
342
|
+
# links in the +Location+ header.
|
211
343
|
#
|
212
344
|
def links
|
213
345
|
urls = []
|
@@ -218,7 +350,15 @@ module Spidr
|
|
218
350
|
|
219
351
|
case code
|
220
352
|
when 300..303, 307
|
221
|
-
|
353
|
+
location = @headers['location']
|
354
|
+
|
355
|
+
if location.kind_of?(Array)
|
356
|
+
# handle multiple location URLs
|
357
|
+
location.each(&add_url)
|
358
|
+
else
|
359
|
+
# usually the location header contains a single String
|
360
|
+
add_url.call(location)
|
361
|
+
end
|
222
362
|
end
|
223
363
|
|
224
364
|
if (html? && doc)
|
@@ -239,44 +379,45 @@ module Spidr
|
|
239
379
|
end
|
240
380
|
|
241
381
|
#
|
242
|
-
#
|
382
|
+
# Absolute URIs from within the page.
|
383
|
+
#
|
384
|
+
# @return [Array<URI::HTTP>]
|
385
|
+
# The links from within the page, converted to absolute URIs.
|
243
386
|
#
|
244
387
|
def urls
|
245
388
|
links.map { |link| to_absolute(link) }.compact
|
246
389
|
end
|
247
390
|
|
248
|
-
protected
|
249
|
-
|
250
391
|
#
|
251
|
-
#
|
252
|
-
#
|
392
|
+
# Normalizes and expands a given link into a proper URI.
|
393
|
+
#
|
394
|
+
# @param [String] link
|
395
|
+
# The link to normalize and expand.
|
396
|
+
#
|
397
|
+
# @return [URI::HTTP]
|
398
|
+
# The normalized URI.
|
253
399
|
#
|
254
400
|
def to_absolute(link)
|
255
|
-
# decode, clean then re-encode the URL
|
256
|
-
link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
|
257
|
-
|
258
401
|
begin
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
if absolute.path
|
263
|
-
if absolute.path.empty?
|
264
|
-
# default the absolute path to '/'
|
265
|
-
absolute.path = '/'
|
266
|
-
else
|
267
|
-
# make sure the path does not contain any .. or . directories.
|
268
|
-
absolute.path = File.expand_path(absolute.path)
|
269
|
-
end
|
270
|
-
end
|
271
|
-
|
272
|
-
return absolute
|
273
|
-
rescue URI::InvalidURIError => e
|
402
|
+
url = @url.merge(link.to_s)
|
403
|
+
rescue URI::InvalidURIError
|
274
404
|
return nil
|
275
405
|
end
|
406
|
+
|
407
|
+
unless (url.path.nil? || url.path.empty?)
|
408
|
+
# make sure the path does not contain any .. or . directories,
|
409
|
+
# since URI::Generic#merge cannot normalize paths such as
|
410
|
+
# "/stuff/../"
|
411
|
+
url.path = URI.expand_path(url.path)
|
412
|
+
end
|
413
|
+
|
414
|
+
return url
|
276
415
|
end
|
277
416
|
|
417
|
+
protected
|
418
|
+
|
278
419
|
#
|
279
|
-
# Provides transparent access to the values in
|
420
|
+
# Provides transparent access to the values in +headers+.
|
280
421
|
#
|
281
422
|
def method_missing(sym,*args,&block)
|
282
423
|
if (args.empty? && block.nil?)
|