wgit 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +155 -66
- data/lib/wgit/database/database.rb +9 -8
- data/lib/wgit/database/model.rb +2 -2
- data/lib/wgit/document.rb +55 -62
- data/lib/wgit/document_extensions.rb +2 -2
- data/lib/wgit/indexer.rb +27 -15
- data/lib/wgit/response.rb +144 -0
- data/lib/wgit/url.rb +149 -85
- data/lib/wgit/utils.rb +6 -3
- data/lib/wgit/version.rb +7 -2
- metadata +3 -2
@@ -34,8 +34,8 @@ Wgit::Document.define_extension(
|
|
34
34
|
'//meta[@name="keywords"]/@content',
|
35
35
|
singleton: true,
|
36
36
|
text_content_only: true
|
37
|
-
) do |keywords,
|
38
|
-
if keywords && (
|
37
|
+
) do |keywords, _source, type|
|
38
|
+
if keywords && (type == :document)
|
39
39
|
keywords = keywords.split(',')
|
40
40
|
Wgit::Utils.process_arr(keywords)
|
41
41
|
end
|
data/lib/wgit/indexer.rb
CHANGED
@@ -44,12 +44,16 @@ module Wgit
|
|
44
44
|
# inserted into the database allowing for prior manipulation.
|
45
45
|
# @return [Integer] The total number of pages crawled within the website.
|
46
46
|
def self.index_site(
|
47
|
-
url, connection_string: nil, insert_externals: true,
|
47
|
+
url, connection_string: nil, insert_externals: true,
|
48
|
+
allow_paths: nil, disallow_paths: nil, &block
|
48
49
|
)
|
49
50
|
url = Wgit::Url.parse(url)
|
50
51
|
db = Wgit::Database.new(connection_string)
|
51
52
|
indexer = Wgit::Indexer.new(db)
|
52
|
-
indexer.index_site(
|
53
|
+
indexer.index_site(
|
54
|
+
url, insert_externals: insert_externals,
|
55
|
+
allow_paths: allow_paths, disallow_paths: disallow_paths, &block
|
56
|
+
)
|
53
57
|
end
|
54
58
|
|
55
59
|
# Convenience method to index a single webpage using
|
@@ -215,10 +219,13 @@ the next iteration.")
|
|
215
219
|
# nil or false from the block to prevent the document from being saved
|
216
220
|
# into the database.
|
217
221
|
# @return [Integer] The total number of webpages/documents indexed.
|
218
|
-
def index_site(
|
222
|
+
def index_site(
|
223
|
+
url, insert_externals: true, allow_paths: nil, disallow_paths: nil
|
224
|
+
)
|
225
|
+
crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
219
226
|
total_pages_indexed = 0
|
220
227
|
|
221
|
-
ext_urls = @crawler.crawl_site(url) do |doc|
|
228
|
+
ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
|
222
229
|
result = true
|
223
230
|
result = yield(doc) if block_given?
|
224
231
|
|
@@ -231,8 +238,8 @@ the next iteration.")
|
|
231
238
|
@db.url?(url) ? @db.update(url) : @db.insert(url)
|
232
239
|
|
233
240
|
if insert_externals && ext_urls
|
234
|
-
write_urls_to_db(ext_urls)
|
235
|
-
Wgit.logger.info("Found and saved #{
|
241
|
+
num_inserted_urls = write_urls_to_db(ext_urls)
|
242
|
+
Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
|
236
243
|
end
|
237
244
|
|
238
245
|
Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
|
@@ -266,8 +273,8 @@ site: #{url}")
|
|
266
273
|
|
267
274
|
ext_urls = document&.external_links
|
268
275
|
if insert_externals && ext_urls
|
269
|
-
write_urls_to_db(ext_urls)
|
270
|
-
Wgit.logger.info("Found and saved #{
|
276
|
+
num_inserted_urls = write_urls_to_db(ext_urls)
|
277
|
+
Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
|
271
278
|
end
|
272
279
|
|
273
280
|
nil
|
@@ -315,14 +322,19 @@ site: #{url}")
|
|
315
322
|
def write_urls_to_db(urls)
|
316
323
|
count = 0
|
317
324
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
Wgit.logger.info("
|
323
|
-
|
324
|
-
Wgit.logger.info("Url already exists: #{url}")
|
325
|
+
return count unless urls.respond_to?(:each)
|
326
|
+
|
327
|
+
urls.each do |url|
|
328
|
+
if url.invalid?
|
329
|
+
Wgit.logger.info("Ignoring invalid external url: #{url}")
|
330
|
+
next
|
325
331
|
end
|
332
|
+
|
333
|
+
@db.insert(url)
|
334
|
+
count += 1
|
335
|
+
Wgit.logger.info("Inserted external url: #{url}")
|
336
|
+
rescue Mongo::Error::OperationFailure
|
337
|
+
Wgit.logger.info("External url already exists: #{url}")
|
326
338
|
end
|
327
339
|
|
328
340
|
count
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module Wgit
|
2
|
+
# Response class representing a generic HTTP crawl response.
|
3
|
+
class Response
|
4
|
+
# The underlying HTTP adapter/library response object.
|
5
|
+
attr_accessor :adapter_response
|
6
|
+
|
7
|
+
# The HTML response body.
|
8
|
+
attr_reader :body
|
9
|
+
|
10
|
+
# The HTTP response headers.
|
11
|
+
attr_reader :headers
|
12
|
+
|
13
|
+
# The server's IP address.
|
14
|
+
attr_accessor :ip_address
|
15
|
+
|
16
|
+
# The redirections of the response.
|
17
|
+
attr_reader :redirections
|
18
|
+
|
19
|
+
# The number of redirections for the response.
|
20
|
+
attr_reader :redirect_count
|
21
|
+
|
22
|
+
# The HTTP response status code.
|
23
|
+
attr_reader :status
|
24
|
+
|
25
|
+
# The total crawl/network time for the response.
|
26
|
+
attr_reader :total_time
|
27
|
+
|
28
|
+
# The HTTP request URL.
|
29
|
+
attr_accessor :url
|
30
|
+
|
31
|
+
# Defaults some values and returns a "blank" Wgit::Response object.
|
32
|
+
def initialize
|
33
|
+
@body = ''
|
34
|
+
@headers = {}
|
35
|
+
@redirections = {}
|
36
|
+
@total_time = 0.0
|
37
|
+
end
|
38
|
+
|
39
|
+
# Adds time to @total_time (incrementally).
|
40
|
+
#
|
41
|
+
# @param time [Float] The time to add to @total_time.
|
42
|
+
# @return [Float] @total_time's new value.
|
43
|
+
def add_total_time(time)
|
44
|
+
@total_time += (time || 0.0)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Sets the HTML response body.
|
48
|
+
#
|
49
|
+
# @param str [String] The new HTML body.
|
50
|
+
# @return [String] @body's new value.
|
51
|
+
def body=(str)
|
52
|
+
@body = (str || '')
|
53
|
+
end
|
54
|
+
|
55
|
+
# Returns the HTML response body or nil (if it's empty).
|
56
|
+
#
|
57
|
+
# @return [String, NilClass] The HTML body or nil if empty.
|
58
|
+
def body_or_nil
|
59
|
+
@body.empty? ? nil : @body
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns true if the response isn't a #success? or a #redirect?
|
63
|
+
#
|
64
|
+
# @return [Boolean] True if failed, false otherwise.
|
65
|
+
def failure?
|
66
|
+
!success? && !redirect?
|
67
|
+
end
|
68
|
+
|
69
|
+
# Sets the headers Hash to the given value. The header keys are mapped
|
70
|
+
# to snake_cased Symbols for consistency.
|
71
|
+
#
|
72
|
+
# @param headers [Hash] The new response headers.
|
73
|
+
# @return [Hash] @headers's new value.
|
74
|
+
def headers=(headers)
|
75
|
+
return @headers = {} unless headers
|
76
|
+
|
77
|
+
@headers = headers.map do |k, v|
|
78
|
+
k = k.downcase.gsub('-', '_').to_sym
|
79
|
+
[k, v]
|
80
|
+
end.to_h
|
81
|
+
end
|
82
|
+
|
83
|
+
# Returns whether or not the response is 404 Not Found.
|
84
|
+
#
|
85
|
+
# @return [Boolean] True if 404 Not Found, false otherwise.
|
86
|
+
def not_found?
|
87
|
+
@status == 404
|
88
|
+
end
|
89
|
+
|
90
|
+
# Returns whether or not the response is 200 OK.
|
91
|
+
#
|
92
|
+
# @return [Boolean] True if 200 OK, false otherwise.
|
93
|
+
def ok?
|
94
|
+
@status == 200
|
95
|
+
end
|
96
|
+
|
97
|
+
# Returns whether or not the response is a 3xx Redirect.
|
98
|
+
#
|
99
|
+
# @return [Boolean] True if 3xx Redirect, false otherwise.
|
100
|
+
def redirect?
|
101
|
+
return false unless @status
|
102
|
+
|
103
|
+
@status.between?(300, 399)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Returns the number of redirects this response has had.
|
107
|
+
#
|
108
|
+
# @return [Integer] The number of response redirects.
|
109
|
+
def redirect_count
|
110
|
+
@redirections.size
|
111
|
+
end
|
112
|
+
|
113
|
+
# Returns the size of the response body.
|
114
|
+
#
|
115
|
+
# @return [Integer] The response body size (number of characters).
|
116
|
+
def size
|
117
|
+
@body.size
|
118
|
+
end
|
119
|
+
|
120
|
+
# Sets the HTTP response status.
|
121
|
+
#
|
122
|
+
# @param int [Integer] The new response status.
|
123
|
+
# @return [Integer] @status' new value.
|
124
|
+
def status=(int)
|
125
|
+
@status = int.positive? ? int : nil
|
126
|
+
end
|
127
|
+
|
128
|
+
# Returns whether or not the response is a 2xx Success.
|
129
|
+
#
|
130
|
+
# @return [Boolean] True if 2xx Success, false otherwise.
|
131
|
+
def success?
|
132
|
+
return false unless @status
|
133
|
+
|
134
|
+
@status.between?(200, 299)
|
135
|
+
end
|
136
|
+
|
137
|
+
alias code status
|
138
|
+
alias content body
|
139
|
+
alias crawl_time total_time
|
140
|
+
alias to_s body
|
141
|
+
alias redirects redirections
|
142
|
+
alias length size
|
143
|
+
end
|
144
|
+
end
|
data/lib/wgit/url.rb
CHANGED
@@ -8,15 +8,19 @@ require 'addressable/uri'
|
|
8
8
|
module Wgit
|
9
9
|
# Class modeling a web based HTTP URL.
|
10
10
|
#
|
11
|
-
# Can be an internal/relative link e.g. "about.html" or
|
11
|
+
# Can be an internal/relative link e.g. "about.html" or an absolute URL
|
12
12
|
# e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
|
13
13
|
# 'addressable/uri' internally.
|
14
|
+
#
|
15
|
+
# Most of the methods in this class return new Wgit::Url instances making the
|
16
|
+
# method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
|
17
|
+
# also try to be idempotent where possible.
|
14
18
|
class Url < String
|
15
19
|
include Assertable
|
16
20
|
|
17
21
|
# Whether or not the Url has been crawled. A custom crawled= method
|
18
22
|
# is provided by this class, overriding the default one.
|
19
|
-
|
23
|
+
attr_reader :crawled
|
20
24
|
|
21
25
|
# The Time stamp of when this Url was crawled.
|
22
26
|
attr_accessor :date_crawled
|
@@ -110,7 +114,7 @@ module Wgit
|
|
110
114
|
|
111
115
|
# Returns true if self is a relative Url; false if absolute.
|
112
116
|
#
|
113
|
-
# All external links in a page are expected to have a
|
117
|
+
# All external links in a page are expected to have a scheme prefix e.g.
|
114
118
|
# 'http://', otherwise the link is treated as an internal link (regardless
|
115
119
|
# of whether it's valid or not). The only exception is if an opts arg is
|
116
120
|
# provided and self is a page belonging to that arg type e.g. host; then
|
@@ -118,7 +122,7 @@ module Wgit
|
|
118
122
|
#
|
119
123
|
# @param opts [Hash] The options with which to check relativity. Only one
|
120
124
|
# opts param should be provided. The provided opts param Url must be
|
121
|
-
# absolute and be prefixed with a
|
125
|
+
# absolute and be prefixed with a scheme. Consider using the output of
|
122
126
|
# Wgit::Url#to_base which should work unless it's nil.
|
123
127
|
# @option opts [Wgit::Url, String] :base The Url base e.g.
|
124
128
|
# http://www.google.com/how which gives a base of
|
@@ -147,8 +151,10 @@ module Wgit
|
|
147
151
|
|
148
152
|
type, url = opts.first
|
149
153
|
url = Wgit::Url.new(url)
|
150
|
-
|
151
|
-
|
154
|
+
unless url.to_base
|
155
|
+
raise "Invalid opts param value, Url must be absolute and contain \
|
156
|
+
protocol scheme: #{url}"
|
157
|
+
end
|
152
158
|
|
153
159
|
case type
|
154
160
|
when :base # http://www.google.com
|
@@ -182,19 +188,29 @@ protocol: #{url}" unless url.to_base
|
|
182
188
|
true
|
183
189
|
end
|
184
190
|
|
185
|
-
#
|
191
|
+
# Returns whether or not self is an invalid (relative) HTTP Url.
|
192
|
+
#
|
193
|
+
# @return [Boolean] True if invalid, otherwise false.
|
194
|
+
def invalid?
|
195
|
+
!valid?
|
196
|
+
end
|
197
|
+
|
198
|
+
# Concats self and other together before returning a new Url. Self is not
|
186
199
|
# modified.
|
187
200
|
#
|
188
|
-
# @param
|
189
|
-
# @return [Wgit::Url] self + separator +
|
190
|
-
def concat(
|
191
|
-
|
192
|
-
raise '
|
201
|
+
# @param other [Wgit::Url, String] The other to concat to the end of self.
|
202
|
+
# @return [Wgit::Url] self + separator + other, separator depends on other.
|
203
|
+
def concat(other)
|
204
|
+
other = Wgit::Url.new(other)
|
205
|
+
raise 'other must be relative' unless other.relative?
|
206
|
+
|
207
|
+
other = other.omit_leading_slash
|
208
|
+
separator = other.start_with?('#') || other.start_with?('?') ? '' : '/'
|
193
209
|
|
194
|
-
|
195
|
-
|
210
|
+
# We use to_s below to call String#+, not Wgit::Url#+ (alias for concat).
|
211
|
+
concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
|
196
212
|
|
197
|
-
Wgit::Url.new(
|
213
|
+
Wgit::Url.new(concatted)
|
198
214
|
end
|
199
215
|
|
200
216
|
# Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
|
@@ -204,21 +220,47 @@ protocol: #{url}" unless url.to_base
|
|
204
220
|
Wgit::Url.new(@uri.normalize.to_s)
|
205
221
|
end
|
206
222
|
|
207
|
-
#
|
208
|
-
#
|
223
|
+
# Returns an absolute form of self within the context of doc. Doesn't
|
224
|
+
# modify the receiver.
|
209
225
|
#
|
210
|
-
#
|
211
|
-
#
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
226
|
+
# If self is absolute then it's returned as is, making this method
|
227
|
+
# idempotent. The doc's <base> element is used if present, otherwise
|
228
|
+
# doc.url is used as the base; which is concatted with self.
|
229
|
+
#
|
230
|
+
# Typically used to build an absolute link obtained from a document e.g.
|
231
|
+
#
|
232
|
+
# link = Wgit::Url.new('/favicon.png')
|
233
|
+
# doc = Wgit::Document.new('http://example.com')
|
234
|
+
#
|
235
|
+
# link.prefix_base(doc) # => "http://example.com/favicon.png"
|
236
|
+
#
|
237
|
+
# @param doc [Wgit::Document] The doc whose base Url is concatted with
|
238
|
+
# self.
|
239
|
+
# @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
|
240
|
+
# raises an Exception.
|
241
|
+
# @return [Wgit::Url] Self in absolute form.
|
242
|
+
def prefix_base(doc)
|
243
|
+
assert_type(doc, Wgit::Document)
|
216
244
|
|
217
|
-
|
218
|
-
|
219
|
-
end
|
245
|
+
absolute? ? self : doc.base_url(link: self).concat(self)
|
246
|
+
end
|
220
247
|
|
221
|
-
|
248
|
+
# Returns self having prefixed a protocol scheme. Doesn't modify receiver.
|
249
|
+
# Returns self even if absolute (with scheme); therefore is idempotent.
|
250
|
+
#
|
251
|
+
# @param protocol [Symbol] Either :http or :https.
|
252
|
+
# @return [Wgit::Url] Self with a protocol scheme prefix.
|
253
|
+
def prefix_scheme(protocol: :http)
|
254
|
+
return self if absolute?
|
255
|
+
|
256
|
+
case protocol
|
257
|
+
when :http
|
258
|
+
Wgit::Url.new("http://#{url}")
|
259
|
+
when :https
|
260
|
+
Wgit::Url.new("https://#{url}")
|
261
|
+
else
|
262
|
+
raise "protocol must be :http or :https, not :#{protocol}"
|
263
|
+
end
|
222
264
|
end
|
223
265
|
|
224
266
|
# Returns a Hash containing this Url's instance vars excluding @uri.
|
@@ -238,6 +280,13 @@ protocol: #{url}" unless url.to_base
|
|
238
280
|
URI(normalize)
|
239
281
|
end
|
240
282
|
|
283
|
+
# Returns the Addressable::URI object for this URL.
|
284
|
+
#
|
285
|
+
# @return [Addressable::URI] The Addressable::URI object of self.
|
286
|
+
def to_addressable_uri
|
287
|
+
@uri
|
288
|
+
end
|
289
|
+
|
241
290
|
# Returns self.
|
242
291
|
#
|
243
292
|
# @return [Wgit::Url] This (self) Url.
|
@@ -245,10 +294,10 @@ protocol: #{url}" unless url.to_base
|
|
245
294
|
self
|
246
295
|
end
|
247
296
|
|
248
|
-
# Returns a new Wgit::Url containing just the scheme
|
297
|
+
# Returns a new Wgit::Url containing just the scheme of this URL
|
249
298
|
# e.g. Given http://www.google.co.uk, http is returned.
|
250
299
|
#
|
251
|
-
# @return [Wgit::Url, nil] Containing just the scheme
|
300
|
+
# @return [Wgit::Url, nil] Containing just the scheme or nil.
|
252
301
|
def to_scheme
|
253
302
|
scheme = @uri.scheme
|
254
303
|
scheme ? Wgit::Url.new(scheme) : nil
|
@@ -281,9 +330,11 @@ protocol: #{url}" unless url.to_base
|
|
281
330
|
domain ? Wgit::Url.new(domain.split('.').first) : nil
|
282
331
|
end
|
283
332
|
|
284
|
-
# Returns only the base of this URL e.g. the protocol and host
|
333
|
+
# Returns only the base of this URL e.g. the protocol scheme and host
|
334
|
+
# combined.
|
285
335
|
#
|
286
|
-
# @return [Wgit::Url, nil]
|
336
|
+
# @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
|
337
|
+
# nil.
|
287
338
|
def to_base
|
288
339
|
return nil if @uri.scheme.nil? || @uri.host.nil?
|
289
340
|
|
@@ -302,7 +353,7 @@ protocol: #{url}" unless url.to_base
|
|
302
353
|
return nil if path.nil? || path.empty?
|
303
354
|
return Wgit::Url.new('/') if path == '/'
|
304
355
|
|
305
|
-
Wgit::Url.new(path).
|
356
|
+
Wgit::Url.new(path).omit_slashes
|
306
357
|
end
|
307
358
|
|
308
359
|
# Returns the endpoint of this URL e.g. the bit after the host with any
|
@@ -324,16 +375,16 @@ protocol: #{url}" unless url.to_base
|
|
324
375
|
# @return [Wgit::Url, nil] Containing just the query string or nil.
|
325
376
|
def to_query
|
326
377
|
query = @uri.query
|
327
|
-
query ? Wgit::Url.new(
|
378
|
+
query ? Wgit::Url.new(query) : nil
|
328
379
|
end
|
329
380
|
|
330
|
-
# Returns a new Wgit::Url containing just the
|
381
|
+
# Returns a new Wgit::Url containing just the fragment string of this URL
|
331
382
|
# e.g. Given http://google.com#about, about is returned.
|
332
383
|
#
|
333
|
-
# @return [Wgit::Url, nil] Containing just the
|
334
|
-
def
|
335
|
-
|
336
|
-
|
384
|
+
# @return [Wgit::Url, nil] Containing just the fragment string or nil.
|
385
|
+
def to_fragment
|
386
|
+
fragment = @uri.fragment
|
387
|
+
fragment ? Wgit::Url.new(fragment) : nil
|
337
388
|
end
|
338
389
|
|
339
390
|
# Returns a new Wgit::Url containing just the file extension of this URL
|
@@ -348,12 +399,27 @@ protocol: #{url}" unless url.to_base
|
|
348
399
|
segs.length > 1 ? Wgit::Url.new(segs.last) : nil
|
349
400
|
end
|
350
401
|
|
402
|
+
# Omits the given URL components from self and returns a new Wgit::Url.
|
403
|
+
#
|
404
|
+
# Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
|
405
|
+
# the output. See the Addressable::URI docs for more information.
|
406
|
+
#
|
407
|
+
# @param components [*Symbol] One or more Symbols representing the URL
|
408
|
+
# components to omit. The following components are supported: :scheme,
|
409
|
+
# :user, :password, :userinfo, :host, :port, :authority, :path, :query,
|
410
|
+
# :fragment.
|
411
|
+
# @return [Wgit::Url] Self's URL value with the given components omitted.
|
412
|
+
def omit(*components)
|
413
|
+
omitted = @uri.omit(*components)
|
414
|
+
Wgit::Url.new(omitted.to_s)
|
415
|
+
end
|
416
|
+
|
351
417
|
# Returns a new Wgit::Url containing self without a leading slash. Is
|
352
418
|
# idempotent meaning self will always be returned regardless of whether
|
353
419
|
# there's a leading slash or not.
|
354
420
|
#
|
355
421
|
# @return [Wgit::Url] Self without a leading slash.
|
356
|
-
def
|
422
|
+
def omit_leading_slash
|
357
423
|
start_with?('/') ? Wgit::Url.new(self[1..-1]) : self
|
358
424
|
end
|
359
425
|
|
@@ -362,7 +428,7 @@ protocol: #{url}" unless url.to_base
|
|
362
428
|
# there's a trailing slash or not.
|
363
429
|
#
|
364
430
|
# @return [Wgit::Url] Self without a trailing slash.
|
365
|
-
def
|
431
|
+
def omit_trailing_slash
|
366
432
|
end_with?('/') ? Wgit::Url.new(chop) : self
|
367
433
|
end
|
368
434
|
|
@@ -371,9 +437,9 @@ protocol: #{url}" unless url.to_base
|
|
371
437
|
# present or not.
|
372
438
|
#
|
373
439
|
# @return [Wgit::Url] Self without leading or trailing slashes.
|
374
|
-
def
|
375
|
-
|
376
|
-
|
440
|
+
def omit_slashes
|
441
|
+
omit_leading_slash
|
442
|
+
.omit_trailing_slash
|
377
443
|
end
|
378
444
|
|
379
445
|
# Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
|
@@ -382,13 +448,13 @@ protocol: #{url}" unless url.to_base
|
|
382
448
|
# Leading and trailing slashes are always stripped from the return value.
|
383
449
|
#
|
384
450
|
# @return [Wgit::Url] Self containing everything after the base.
|
385
|
-
def
|
451
|
+
def omit_base
|
386
452
|
base_url = to_base
|
387
|
-
|
453
|
+
omit_base = base_url ? gsub(base_url, '') : self
|
388
454
|
|
389
|
-
return self if ['', '/'].include?(
|
455
|
+
return self if ['', '/'].include?(omit_base)
|
390
456
|
|
391
|
-
Wgit::Url.new(
|
457
|
+
Wgit::Url.new(omit_base).omit_slashes
|
392
458
|
end
|
393
459
|
|
394
460
|
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
@@ -398,26 +464,26 @@ protocol: #{url}" unless url.to_base
|
|
398
464
|
# URL.
|
399
465
|
#
|
400
466
|
# @return [Wgit::Url] Self with the query string portion removed.
|
401
|
-
def
|
467
|
+
def omit_query
|
402
468
|
query = to_query
|
403
|
-
|
469
|
+
omit_query_string = query ? gsub("?#{query}", '') : self
|
404
470
|
|
405
|
-
Wgit::Url.new(
|
471
|
+
Wgit::Url.new(omit_query_string)
|
406
472
|
end
|
407
473
|
|
408
|
-
# Returns a new Wgit::Url with the
|
474
|
+
# Returns a new Wgit::Url with the fragment portion removed e.g. Given
|
409
475
|
# http://google.com/search#about, http://google.com/search is
|
410
|
-
# returned. Self is returned as is if no
|
411
|
-
# consisting of only
|
412
|
-
# This method assumes that the
|
476
|
+
# returned. Self is returned as is if no fragment is present. A URL
|
477
|
+
# consisting of only a fragment e.g. '#about' will return an empty URL.
|
478
|
+
# This method assumes that the fragment is correctly placed at the very end
|
413
479
|
# of the URL.
|
414
480
|
#
|
415
|
-
# @return [Wgit::Url] Self with the
|
416
|
-
def
|
417
|
-
|
418
|
-
|
481
|
+
# @return [Wgit::Url] Self with the fragment portion removed.
|
482
|
+
def omit_fragment
|
483
|
+
fragment = to_fragment
|
484
|
+
omit_fragment = fragment ? gsub("##{fragment}", '') : self
|
419
485
|
|
420
|
-
Wgit::Url.new(
|
486
|
+
Wgit::Url.new(omit_fragment)
|
421
487
|
end
|
422
488
|
|
423
489
|
# Returns true if self is a URL query string e.g. ?q=hello etc. Note this
|
@@ -428,35 +494,33 @@ protocol: #{url}" unless url.to_base
|
|
428
494
|
start_with?('?')
|
429
495
|
end
|
430
496
|
|
431
|
-
# Returns true if self is a URL
|
432
|
-
# shouldn't be used to determine if self contains
|
497
|
+
# Returns true if self is a URL fragment e.g. #top etc. Note this
|
498
|
+
# shouldn't be used to determine if self contains a fragment.
|
433
499
|
#
|
434
|
-
# @return [Boolean] True if self is a
|
435
|
-
def
|
500
|
+
# @return [Boolean] True if self is a fragment, false otherwise.
|
501
|
+
def fragment?
|
436
502
|
start_with?('#')
|
437
503
|
end
|
438
504
|
|
439
|
-
alias
|
440
|
-
alias
|
441
|
-
alias
|
442
|
-
alias
|
443
|
-
alias
|
444
|
-
alias
|
445
|
-
alias
|
446
|
-
alias
|
447
|
-
alias
|
448
|
-
alias
|
449
|
-
alias
|
450
|
-
alias
|
451
|
-
alias
|
452
|
-
alias
|
453
|
-
alias
|
454
|
-
alias
|
455
|
-
alias
|
456
|
-
alias
|
457
|
-
alias
|
458
|
-
alias
|
459
|
-
alias is_anchor? anchor?
|
460
|
-
alias fragment? anchor?
|
505
|
+
alias + concat
|
506
|
+
alias crawled? crawled
|
507
|
+
alias normalise normalize
|
508
|
+
alias is_relative? relative?
|
509
|
+
alias is_absolute? absolute?
|
510
|
+
alias is_valid? valid?
|
511
|
+
alias is_query? query?
|
512
|
+
alias is_fragment? fragment?
|
513
|
+
alias uri to_uri
|
514
|
+
alias url to_url
|
515
|
+
alias scheme to_scheme
|
516
|
+
alias host to_host
|
517
|
+
alias domain to_domain
|
518
|
+
alias brand to_brand
|
519
|
+
alias base to_base
|
520
|
+
alias path to_path
|
521
|
+
alias endpoint to_endpoint
|
522
|
+
alias query to_query
|
523
|
+
alias fragment to_fragment
|
524
|
+
alias extension to_extension
|
461
525
|
end
|
462
526
|
end
|