spidr_epg 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +10 -0
- data/.rspec +1 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +291 -0
- data/ChangeLog.md~ +291 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +49 -0
- data/Gemfile~ +16 -0
- data/LICENSE.txt +20 -0
- data/README.md +193 -0
- data/README.md~ +190 -0
- data/Rakefile +29 -0
- data/gemspec.yml +19 -0
- data/lib/spidr/actions/actions.rb +83 -0
- data/lib/spidr/actions/exceptions/action.rb +9 -0
- data/lib/spidr/actions/exceptions/paused.rb +11 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/agent.rb +866 -0
- data/lib/spidr/auth_credential.rb +28 -0
- data/lib/spidr/auth_store.rb +161 -0
- data/lib/spidr/body.rb +98 -0
- data/lib/spidr/cookie_jar.rb +202 -0
- data/lib/spidr/events.rb +537 -0
- data/lib/spidr/extensions/uri.rb +52 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/filters.rb +539 -0
- data/lib/spidr/headers.rb +370 -0
- data/lib/spidr/links.rb +229 -0
- data/lib/spidr/page.rb +108 -0
- data/lib/spidr/rules.rb +79 -0
- data/lib/spidr/sanitizers.rb +56 -0
- data/lib/spidr/session_cache.rb +145 -0
- data/lib/spidr/spidr.rb +107 -0
- data/lib/spidr/version.rb +4 -0
- data/lib/spidr/version.rb~ +4 -0
- data/lib/spidr.rb +3 -0
- data/pkg/spidr-1.0.0.gem +0 -0
- data/spec/actions_spec.rb +59 -0
- data/spec/agent_spec.rb +81 -0
- data/spec/auth_store_spec.rb +85 -0
- data/spec/cookie_jar_spec.rb +144 -0
- data/spec/extensions/uri_spec.rb +43 -0
- data/spec/filters_spec.rb +61 -0
- data/spec/helpers/history.rb +34 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/helpers/wsoc.rb +83 -0
- data/spec/page_examples.rb +21 -0
- data/spec/page_spec.rb +125 -0
- data/spec/rules_spec.rb +45 -0
- data/spec/sanitizers_spec.rb +61 -0
- data/spec/session_cache.rb +58 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spidr_spec.rb +39 -0
- data/spidr.gemspec +133 -0
- data/spidr.gemspec~ +131 -0
- metadata +158 -0
@@ -0,0 +1,370 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Spidr
  #
  # Query methods for the status code, Content-Type and cookies of an HTTP
  # response.
  #
  # The including class is expected to provide `response` (responding to
  # `code` and `[]`) and `headers` (a Hash mapping lower-case header names
  # to Arrays of values).
  #
  module Headers
    # Lower-case names of `Set-Cookie` attributes. These describe a cookie
    # rather than carry a value, so {#cookie_params} skips them.
    RESERVED_COOKIE_NAMES = Set[
      'path', 'expires', 'domain', 'max-age', 'secure', 'httponly', 'samesite'
    ].freeze

    #
    # The response code from the page.
    #
    # @return [Integer]
    #   Response code from the page.
    #
    def code
      response.code.to_i
    end

    #
    # Determines if the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `200`.
    #
    def is_ok?
      code == 200
    end

    alias ok? is_ok?

    #
    # Determines if the response code is `408` (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is `408`.
    #
    def timedout?
      # 308 is "Permanent Redirect"; the time-out status is 408
      code == 408
    end

    #
    # Determines if the response code is `400`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `400`.
    #
    def bad_request?
      code == 400
    end

    #
    # Determines if the response code is `401`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `401`.
    #
    def is_unauthorized?
      code == 401
    end

    alias unauthorized? is_unauthorized?

    #
    # Determines if the response code is `403`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `403`.
    #
    def is_forbidden?
      code == 403
    end

    alias forbidden? is_forbidden?

    #
    # Determines if the response code is `404`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `404`.
    #
    def is_missing?
      code == 404
    end

    alias missing? is_missing?

    #
    # Determines if the response code is `500`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `500`.
    #
    def had_internal_server_error?
      code == 500
    end

    #
    # The Content-Type of the page.
    #
    # @return [String]
    #   The raw Content-Type header, or an empty String if it is absent.
    #
    def content_type
      (response['Content-Type'] || '')
    end

    #
    # The content types of the page.
    #
    # @return [Array<String>]
    #   The values within the Content-Type header (empty if absent).
    #
    # @since 0.2.2
    #
    def content_types
      (headers['content-type'] || [])
    end

    #
    # The charset included in the Content-Type.
    #
    # The parameter name is matched case-insensitively and surrounding
    # double quotes are removed from the value.
    #
    # @return [String, nil]
    #   The charset of the content, or `nil` if none was declared.
    #
    # @since 0.4.0
    #
    def content_charset
      content_types.each do |value|
        # the first segment is the media type itself, not a parameter
        value.split(';').drop(1).each do |param|
          name, charset = param.strip.split('=', 2)

          if name && charset && name.strip.downcase == 'charset'
            return charset.strip.gsub(/\A"|"\z/, '')
          end
        end
      end

      nil
    end

    #
    # Determines if any of the content-types of the page include a given
    # type. Parameters such as `; charset=utf-8` are ignored and the
    # comparison is case-insensitive.
    #
    # @param [String] type
    #   The content-type to test for. If it contains a `/` the full media
    #   type is compared, otherwise only the sub-type.
    #
    # @return [Boolean]
    #   Specifies whether the page includes the given content-type.
    #
    # @example Match the Content-Type
    #   page.is_content_type?('application/json')
    #
    # @example Match the sub-type of the Content-Type
    #   page.is_content_type?('json')
    #
    # @since 0.4.0
    #
    def is_content_type?(type)
      wanted        = type.downcase
      sub_type_only = !wanted.include?('/')

      content_types.any? do |value|
        # `first` is nil for an empty header value, hence the `to_s`
        media_type = value.split(';', 2).first.to_s.strip.downcase
        media_type = media_type.split('/', 2).last.to_s if sub_type_only

        media_type == wanted
      end
    end

    #
    # Determines if the page is plain-text.
    #
    # @return [Boolean]
    #   Specifies whether the page is plain-text.
    #
    def plain_text?
      is_content_type?('text/plain')
    end

    alias txt? plain_text?

    #
    # Determines if the page is a Directory Listing.
    #
    # @return [Boolean]
    #   Specifies whether the page is a Directory Listing.
    #
    # @since 0.3.0
    #
    def directory?
      is_content_type?('text/directory')
    end

    #
    # Determines if the page is an HTML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is an HTML document.
    #
    def html?
      is_content_type?('text/html')
    end

    #
    # Determines if the page is an XML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is an XML document.
    #
    def xml?
      is_content_type?('text/xml') ||
        is_content_type?('application/xml')
    end

    #
    # Determines if the page is an XML Stylesheet (XSL).
    #
    # @return [Boolean]
    #   Specifies whether the page is an XML Stylesheet (XSL).
    #
    def xsl?
      is_content_type?('text/xsl')
    end

    #
    # Determines if the page is JavaScript.
    #
    # @return [Boolean]
    #   Specifies whether the page is JavaScript.
    #
    def javascript?
      is_content_type?('text/javascript') ||
        is_content_type?('application/javascript')
    end

    #
    # Determines if the page is JSON.
    #
    # @return [Boolean]
    #   Specifies whether the page is JSON.
    #
    # @since 0.3.0
    #
    def json?
      is_content_type?('application/json')
    end

    #
    # Determines if the page is a CSS stylesheet.
    #
    # @return [Boolean]
    #   Specifies whether the page is a CSS stylesheet.
    #
    def css?
      is_content_type?('text/css')
    end

    #
    # Determines if the page is an RSS feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is an RSS feed.
    #
    def rss?
      is_content_type?('application/rss+xml') ||
        is_content_type?('application/rdf+xml')
    end

    #
    # Determines if the page is an Atom feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is an Atom feed.
    #
    def atom?
      is_content_type?('application/atom+xml')
    end

    #
    # Determines if the page is a MS Word document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a MS Word document.
    #
    def ms_word?
      is_content_type?('application/msword')
    end

    #
    # Determines if the page is a PDF document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a PDF document.
    #
    def pdf?
      is_content_type?('application/pdf')
    end

    #
    # Determines if the page is a ZIP archive.
    #
    # @return [Boolean]
    #   Specifies whether the page is a ZIP archive.
    #
    def zip?
      is_content_type?('application/zip')
    end

    #
    # The raw Cookie String sent along with the page.
    #
    # @return [String]
    #   The raw `Set-Cookie` header, or an empty String if it is absent.
    #
    # @since 0.2.7
    #
    def cookie
      (response['Set-Cookie'] || '')
    end

    alias raw_cookie cookie

    #
    # The Cookie values sent along with the page.
    #
    # @return [Array<String>]
    #   The `Set-Cookie` values from the response (empty if absent).
    #
    # @since 0.2.2
    #
    def cookies
      (headers['set-cookie'] || [])
    end

    #
    # The Cookie key -> value pairs returned with the response.
    #
    # Cookie attributes listed in {RESERVED_COOKIE_NAMES} are matched
    # case-insensitively and left out, as are empty segments.
    #
    # @return [Hash{String => String}]
    #   The cookie keys and values. A cookie without a value maps to an
    #   empty String.
    #
    # @since 0.2.2
    #
    def cookie_params
      params = {}

      cookies.each do |cookie|
        cookie.split(';').each do |param|
          name, value = param.strip.split('=', 2)

          # an empty segment (e.g. from ";;") has no name at all
          next if name.nil? || name.empty?
          # attributes are usually sent capitalized ("Path", "Expires")
          next if RESERVED_COOKIE_NAMES.include?(name.downcase)

          params[name] = (value || '')
        end
      end

      params
    end
  end
end
|
data/lib/spidr/links.rb
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'spidrs/extensions/uri'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Spidr
  #
  # Link and redirect extraction for a fetched page.
  #
  # The including class is expected to provide `url`, `headers`, `doc`,
  # `search`, `html?` and `is_redirect?`.
  #
  module Links
    include Enumerable

    # XPath query and link-carrying attribute for each kind of element
    # visited by {#each_link}, in the order they are searched.
    LINK_SOURCES = [
      ['//a[@href]',      'href'],
      ['//frame[@src]',   'src'],
      ['//iframe[@src]',  'src'],
      ['//link[@href]',   'href'],
      ['//script[@src]',  'src']
    ].freeze

    #
    # Enumerates over the meta-redirect links in the page.
    #
    # The `url=` part of the refresh directive is matched
    # case-insensitively and quotes surrounding the URL are removed.
    #
    # @yield [link]
    #   If a block is given, it will be passed every meta-redirect link
    #   from the page.
    #
    # @yieldparam [String] link
    #   A meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_meta_redirect
      return enum_for(:each_meta_redirect) unless block_given?
      return unless html? && doc

      search('//meta[@http-equiv and @content]').each do |node|
        next unless node.get_attribute('http-equiv') =~ /refresh/i

        content = node.get_attribute('content')

        # e.g. content="0; URL=http://example.com/" or "5;url='/next'"
        if (redirect = content.match(/url\s*=\s*(\S+)\s*$/i))
          yield redirect[1].gsub(/\A["']|["']\z/, '')
        end
      end
    end

    #
    # Returns a boolean indicating whether or not page-level meta
    # redirects are present in this page.
    #
    # @return [Boolean]
    #   Specifies whether the page includes page-level redirects.
    #
    def meta_redirect?
      !each_meta_redirect.first.nil?
    end

    #
    # The meta-redirect links of the page.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @since 0.3.0
    #
    def meta_redirects
      each_meta_redirect.to_a
    end

    #
    # Enumerates over every HTTP or meta-redirect link in the page.
    #
    # @yield [link]
    #   The given block will be passed every redirection link from the page.
    #
    # @yieldparam [String] link
    #   A HTTP or meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_redirect(&block)
      return enum_for(:each_redirect) unless block

      location = headers['location']

      if location.nil?
        # check page-level meta redirects if there isn't a location header
        each_meta_redirect(&block)
      elsif location.kind_of?(Array)
        location.each(&block)
      else
        # usually the location header contains a single String
        yield location
      end
    end

    #
    # URLs that this document redirects to.
    #
    # @return [Array<String>]
    #   The links that this page redirects to (usually found in a
    #   location header or by way of a page-level meta redirect).
    #
    def redirects_to
      each_redirect.to_a
    end

    #
    # Enumerates over every link in the page: redirect targets first (when
    # the page is a redirect), then the elements in {LINK_SOURCES}.
    #
    # @yield [link]
    #   The given block will be passed every non-empty link in the page.
    #
    # @yieldparam [String] link
    #   A link in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_link
      return enum_for(:each_link) unless block_given?

      # passes on only links that actually contain something
      visit = lambda do |link|
        yield link unless link.nil? || link.empty?
      end

      each_redirect(&visit) if is_redirect?

      return unless html? && doc

      LINK_SOURCES.each do |xpath, attribute|
        doc.search(xpath).each do |node|
          visit.call(node.get_attribute(attribute))
        end
      end
    end

    #
    # The links from within the page.
    #
    # @return [Array<String>]
    #   All links within the HTML page, frame/iframe source URLs and any
    #   links in the `Location` header.
    #
    def links
      each_link.to_a
    end

    #
    # Enumerates over every absolute URL in the page. Links that cannot be
    # converted by {#to_absolute} are skipped.
    #
    # @yield [url]
    #   The given block will be passed every URL in the page.
    #
    # @yieldparam [URI::HTTP] url
    #   An absolute URL in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_url
      return enum_for(:each_url) unless block_given?

      each_link do |link|
        absolute = to_absolute(link)

        yield absolute if absolute
      end
    end

    alias each each_url

    #
    # Absolute URIs from within the page.
    #
    # @return [Array<URI::HTTP>]
    #   The links from within the page, converted to absolute URIs.
    #
    def urls
      each_url.to_a
    end

    #
    # Normalizes and expands a given link into a proper URI, relative to
    # the URL of the page.
    #
    # @param [String] link
    #   The link to normalize and expand.
    #
    # @return [URI::HTTP, nil]
    #   The normalized URI, or `nil` if the link could not be merged with
    #   the page URL.
    #
    def to_absolute(link)
      begin
        new_url = url.merge(link.to_s)
      rescue StandardError
        # a malformed link is skipped; Interrupt, SystemExit and the like
        # are deliberately not rescued
        return nil
      end

      if (path = new_url.path)
        # ensure that paths begin with a leading '/' for URI::FTP
        path = "/#{path}" if new_url.scheme == 'ftp' && !path.start_with?('/')

        # make sure the path does not contain any .. or . directories,
        # since URI::Generic#merge cannot normalize paths such as
        # "/stuff/../"
        new_url.path = URI.expand_path(path)
      end

      new_url
    end
  end
end
|
data/lib/spidr/page.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'spidrs/headers'
|
2
|
+
require 'spidrs/body'
|
3
|
+
require 'spidrs/links'
|
4
|
+
|
5
|
+
module Spidr
  #
  # Represents a requested page from a website.
  #
  class Page

    include Headers
    include Body
    include Links

    # URL of the page
    attr_reader :url

    # HTTP Response
    attr_reader :response

    # Headers returned with the body
    attr_reader :headers

    #
    # Creates a new Page object.
    #
    # @param [URI::HTTP] url
    #   The URL of the page.
    #
    # @param [Net::HTTP::Response] response
    #   The response from the request for the page.
    #
    def initialize(url,response)
      @url      = url
      @response = response
      @headers  = response.to_hash
      @doc      = nil
    end

    #
    # The meta-redirect links of the page.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @deprecated
    #   Deprecated in 0.3.0 and will be removed in 0.4.0.
    #   Use {#meta_redirects} instead.
    #
    def meta_redirect
      # NOTE(review): the message names 0.3.0 as the removal version while
      # the documentation above says 0.4.0 -- confirm which is intended.
      STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
      STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'

      meta_redirects
    end

    #
    # Determines if the response code is `300`, `301`, `302`, `303`,
    # `307` or `308`. Also checks for "soft" redirects added at the page
    # level by a meta refresh tag.
    #
    # @return [Boolean]
    #   Specifies whether the response code is a HTTP Redirect code.
    #
    def is_redirect?
      case code
      when 300..303, 307, 308
        true
      when 200
        # a successful response may still redirect via a meta refresh tag
        meta_redirect?
      else
        false
      end
    end

    alias redirect? is_redirect?

    protected

    #
    # Provides transparent access to the values in {#headers}.
    #
    # Every underscore in the method name is mapped to a dash, so
    # `x_frame_options` reads the `x-frame-options` header.
    #
    # @param [Symbol] name
    #   The name of the missing method.
    #
    # @param [Array] arguments
    #   Additional arguments for the missing method.
    #
    # @return [String]
    #   The missing method mapped to a header in {#headers}.
    #
    # @raise [NoMethodError]
    #   The missing method did not map to a header in {#headers}.
    #
    def method_missing(name,*arguments,&block)
      if (arguments.empty? && block.nil?)
        # `tr` replaces every underscore; `sub` only replaced the first
        header_name = name.to_s.tr('_','-')

        return @response[header_name] if @response.key?(header_name)
      end

      return super(name,*arguments,&block)
    end

    private

    #
    # Keeps `respond_to?` consistent with {#method_missing}.
    #
    # @param [Symbol] name
    #   The name of the method being queried.
    #
    # @param [Boolean] include_private
    #   Whether private methods should be considered.
    #
    # @return [Boolean]
    #   Specifies whether the name maps to a header of the response.
    #
    def respond_to_missing?(name,include_private=false)
      (!@response.nil? && @response.key?(name.to_s.tr('_','-'))) || super
    end

  end
end
|