spidr 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,19 +27,26 @@ module URI
27
27
  # # => "/path"
28
28
  #
29
29
  def URI.expand_path(path)
30
- dirs = path.gsub(/[\/]{2,}/,'/').scan(/[^\/]*\/|[^\/]+$/)
30
+ dirs = path.split(/\/+/)
31
+
32
+ # append any tailing '/' chars, lost due to String#split
33
+ dirs << '' if path[-1,1] == '/'
34
+
31
35
  new_dirs = []
32
36
 
33
37
  dirs.each do |dir|
34
- if (dir == '..' || dir == '../')
35
- unless new_dirs == ['/']
36
- new_dirs.pop
37
- end
38
- elsif (dir != '.' && dir != './')
38
+ if dir == '..'
39
+ new_dirs.pop
40
+ elsif dir != '.'
39
41
  new_dirs.push(dir)
40
42
  end
41
43
  end
42
44
 
43
- return new_dirs.join
45
+ full_path = new_dirs.join('/')
46
+
47
+ # default empty paths to '/'
48
+ full_path = '/' if full_path.empty?
49
+
50
+ return full_path
44
51
  end
45
52
  end
@@ -0,0 +1,323 @@
1
require 'set'

module Spidr
  #
  # Convenience predicates and accessors for the status code,
  # Content-Type and Set-Cookie headers of a response.
  #
  # The methods in this module call `response` (for the status code and
  # raw header Strings) and `headers` (for header values as Arrays), so
  # the including object must provide both.
  #
  module Headers
    # Attribute names used within Set-Cookie strings (lower-case); these
    # describe a cookie rather than being cookies themselves.
    RESERVED_COOKIE_NAMES = Set[
      'path', 'expires', 'domain', 'max-age', 'secure', 'httponly',
      'samesite'
    ].freeze

    #
    # The response code from the page.
    #
    # @return [Integer]
    #   Response code from the page.
    #
    def code
      response.code.to_i
    end

    #
    # Determines if the response code is `200`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `200`.
    #
    def is_ok?
      code == 200
    end

    alias ok? is_ok?

    #
    # Determines if the response code is `408` (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is `408`.
    #
    def timedout?
      code == 408
    end

    #
    # Determines if the response code is `400`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `400`.
    #
    def bad_request?
      code == 400
    end

    #
    # Determines if the response code is `401`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `401`.
    #
    def is_unauthorized?
      code == 401
    end

    alias unauthorized? is_unauthorized?

    #
    # Determines if the response code is `403`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `403`.
    #
    def is_forbidden?
      code == 403
    end

    alias forbidden? is_forbidden?

    #
    # Determines if the response code is `404`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `404`.
    #
    def is_missing?
      code == 404
    end

    alias missing? is_missing?

    #
    # Determines if the response code is `500`.
    #
    # @return [Boolean]
    #   Specifies whether the response code is `500`.
    #
    def had_internal_server_error?
      code == 500
    end

    #
    # The Content-Type of the page.
    #
    # @return [String]
    #   The raw Content-Type header, or an empty String if it is absent.
    #
    def content_type
      response['Content-Type'] || ''
    end

    #
    # The content types of the page.
    #
    # @return [Array<String>]
    #   The values within the Content-Type header, or an empty Array if
    #   the header is absent.
    #
    # @since 0.2.2
    #
    def content_types
      headers['content-type'] || []
    end

    #
    # Determines if the page is plain-text.
    #
    # @return [Boolean]
    #   Specifies whether the page is plain-text.
    #
    def plain_text?
      is_content_type?('text/plain')
    end

    alias txt? plain_text?

    #
    # Determines if the page is a Directory Listing.
    #
    # @return [Boolean]
    #   Specifies whether the page is a Directory Listing.
    #
    # @since 0.3.0
    #
    def directory?
      is_content_type?('text/directory')
    end

    #
    # Determines if the page is an HTML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is an HTML document.
    #
    def html?
      is_content_type?('text/html')
    end

    #
    # Determines if the page is an XML document.
    #
    # @return [Boolean]
    #   Specifies whether the page is an XML document.
    #
    def xml?
      is_content_type?('text/xml') ||
        is_content_type?('application/xml')
    end

    #
    # Determines if the page is an XML Stylesheet (XSL).
    #
    # @return [Boolean]
    #   Specifies whether the page is an XML Stylesheet (XSL).
    #
    def xsl?
      is_content_type?('text/xsl')
    end

    #
    # Determines if the page is JavaScript.
    #
    # @return [Boolean]
    #   Specifies whether the page is JavaScript.
    #
    def javascript?
      is_content_type?('text/javascript') ||
        is_content_type?('application/javascript')
    end

    #
    # Determines if the page is JSON.
    #
    # @return [Boolean]
    #   Specifies whether the page is JSON.
    #
    # @since 0.3.0
    #
    def json?
      is_content_type?('application/json')
    end

    #
    # Determines if the page is a CSS stylesheet.
    #
    # @return [Boolean]
    #   Specifies whether the page is a CSS stylesheet.
    #
    def css?
      is_content_type?('text/css')
    end

    #
    # Determines if the page is an RSS feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is an RSS feed.
    #
    def rss?
      is_content_type?('application/rss+xml') ||
        is_content_type?('application/rdf+xml')
    end

    #
    # Determines if the page is an Atom feed.
    #
    # @return [Boolean]
    #   Specifies whether the page is an Atom feed.
    #
    def atom?
      is_content_type?('application/atom+xml')
    end

    #
    # Determines if the page is a MS Word document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a MS Word document.
    #
    def ms_word?
      is_content_type?('application/msword')
    end

    #
    # Determines if the page is a PDF document.
    #
    # @return [Boolean]
    #   Specifies whether the page is a PDF document.
    #
    def pdf?
      is_content_type?('application/pdf')
    end

    #
    # Determines if the page is a ZIP archive.
    #
    # @return [Boolean]
    #   Specifies whether the page is a ZIP archive.
    #
    def zip?
      is_content_type?('application/zip')
    end

    #
    # The raw Cookie String sent along with the page.
    #
    # @return [String]
    #   The raw Set-Cookie header, or an empty String if it is absent.
    #
    # @since 0.2.7
    #
    def cookie
      response['Set-Cookie'] || ''
    end

    alias raw_cookie cookie

    #
    # The Cookie values sent along with the page.
    #
    # @return [Array<String>]
    #   The Set-Cookie header values, or an empty Array if the header is
    #   absent.
    #
    # @since 0.2.2
    #
    def cookies
      headers['set-cookie'] || []
    end

    #
    # The Cookie key -> value pairs returned with the response.
    #
    # Cookie attributes listed in {RESERVED_COOKIE_NAMES} are skipped,
    # regardless of their letter-case.
    #
    # @return [Hash{String => String}]
    #   The cookie keys and values. Cookies without a value map to an
    #   empty String.
    #
    # @since 0.2.2
    #
    def cookie_params
      params = {}

      cookies.each do |cookie|
        # Split on ';' alone, since the space after it is optional.
        cookie.split(';').each do |key_value|
          key, value = key_value.strip.split('=', 2)

          # skip empty segments such as those left by a trailing ';'
          next if key.nil? || key.empty?
          next if RESERVED_COOKIE_NAMES.include?(key.downcase)

          params[key] = (value || '')
        end
      end

      params
    end

    protected

    #
    # Determines if any of the content-types of the page include a given
    # type.
    #
    # @param [String] type
    #   The content-type to test for.
    #
    # @return [Boolean]
    #   Specifies whether the page includes the given content-type.
    #
    # @since 0.2.4
    #
    def is_content_type?(type)
      content_types.any? { |content| content.include?(type) }
    end
  end
end
@@ -0,0 +1,229 @@
1
+ require 'spidr/extensions/uri'
2
+ require 'uri'
3
+
4
module Spidr
  #
  # Link extraction helpers: redirects, links found in the document and
  # their absolute URL forms.
  #
  # The methods in this module call `html?`, `doc`, `headers`,
  # `is_redirect?` and `url`, so the including object must provide them.
  #
  module Links
    include Enumerable

    #
    # Enumerates over the meta-redirect links in the page.
    #
    # The `url=` keyword within the `content` attribute is matched
    # case-insensitively, and quotes surrounding the URL are dropped.
    #
    # @yield [link]
    #   If a block is given, it will be passed every meta-redirect link
    #   from the page.
    #
    # @yieldparam [String] link
    #   A meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_meta_redirect
      return enum_for(:each_meta_redirect) unless block_given?

      if html? && doc
        doc.search('//meta[@http-equiv and @content]').each do |node|
          next unless node.get_attribute('http-equiv') =~ /refresh/i

          content = node.get_attribute('content')

          if (redirect = content.match(/url=['"]?([^\s'"]+)['"]?\s*$/i))
            yield redirect[1]
          end
        end
      end
    end

    #
    # Returns a boolean indicating whether or not page-level meta
    # redirects are present in this page.
    #
    # @return [Boolean]
    #   Specifies whether the page includes page-level redirects.
    #
    def meta_redirect?
      !each_meta_redirect.first.nil?
    end

    #
    # The meta-redirect links of the page.
    #
    # @return [Array<String>]
    #   All meta-redirect links in the page.
    #
    # @since 0.3.0
    #
    def meta_redirects
      each_meta_redirect.to_a
    end

    #
    # Enumerates over every HTTP or meta-redirect link in the page.
    #
    # @yield [link]
    #   The given block will be passed every redirection link from the page.
    #
    # @yieldparam [String] link
    #   A HTTP or meta-redirect link from the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_redirect(&block)
      return enum_for(:each_redirect) unless block

      location = headers['location']

      case location
      when nil
        # check page-level meta redirects if there isn't a location header
        each_meta_redirect(&block)
      when Array
        location.each(&block)
      else
        # usually the location header contains a single String
        block.call(location)
      end
    end

    #
    # URLs that this document redirects to.
    #
    # @return [Array<String>]
    #   The links that this page redirects to (usually found in a
    #   location header or by way of a page-level meta redirect).
    #
    def redirects_to
      each_redirect.to_a
    end

    #
    # Enumerates over every link in the page.
    #
    # Redirect links are yielded first (only when `is_redirect?` is
    # true), followed by `a`, `frame`, `iframe`, `link` and `script`
    # links, in that order.
    #
    # @yield [link]
    #   The given block will be passed every non-empty link in the page.
    #
    # @yieldparam [String] link
    #   A link in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_link
      return enum_for(:each_link) unless block_given?

      # only pass along links that actually contain something
      filter = lambda { |url|
        yield url unless (url.nil? || url.empty?)
      }

      each_redirect(&filter) if is_redirect?

      if html? && doc
        # [search path, attribute holding the link], in yield order
        link_sources = [
          ['a[@href]',      'href'],
          ['frame[@src]',   'src'],
          ['iframe[@src]',  'src'],
          ['link[@href]',   'href'],
          ['script[@src]',  'src']
        ]

        link_sources.each do |search_path, attribute|
          doc.search(search_path).each do |node|
            filter.call(node.get_attribute(attribute))
          end
        end
      end
    end

    #
    # The links from within the page.
    #
    # @return [Array<String>]
    #   All links within the HTML page, frame/iframe source URLs and any
    #   links in the `Location` header.
    #
    def links
      each_link.to_a
    end

    #
    # Enumerates over every absolute URL in the page.
    #
    # Links that cannot be converted into an absolute URL are skipped.
    #
    # @yield [url]
    #   The given block will be passed every URL in the page.
    #
    # @yieldparam [URI::HTTP] url
    #   An absolute URL in the page.
    #
    # @return [Enumerator]
    #   If no block is given, an enumerator object will be returned.
    #
    # @since 0.3.0
    #
    def each_url
      return enum_for(:each_url) unless block_given?

      each_link do |link|
        if (url = to_absolute(link))
          yield url
        end
      end
    end

    alias each each_url

    #
    # Absolute URIs from within the page.
    #
    # @return [Array<URI::HTTP>]
    #   The links from within the page, converted to absolute URIs.
    #
    def urls
      each_url.to_a
    end

    #
    # Normalizes and expands a given link into a proper URI.
    #
    # @param [String] link
    #   The link to normalize and expand.
    #
    # @return [URI::HTTP, nil]
    #   The normalized URI, or `nil` if the link could not be merged
    #   with the URL of the page.
    #
    def to_absolute(link)
      begin
        new_url = url.merge(link.to_s)
      rescue StandardError
        # Unparsable links are skipped; StandardError (rather than
        # Exception) lets Interrupt and SystemExit propagate.
        return nil
      end

      if (path = new_url.path)
        # ensure that paths begin with a leading '/' for URI::FTP
        if new_url.scheme == 'ftp' && path[0, 1] != '/'
          path = "/#{path}"
        end

        # make sure the path does not contain any .. or . directories,
        # since URI::Generic#merge cannot normalize paths such as
        # "/stuff/../"
        new_url.path = URI.expand_path(path)
      end

      new_url
    end
  end
end