wgit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +27 -24
- data/bin/wgit +72 -18
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +91 -20
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -663
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +187 -77
- data/lib/wgit/document_extractors.rb +15 -23
- data/lib/wgit/dsl.rb +64 -67
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +29 -10
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +5 -8
- data/lib/wgit/robots_parser.rb +8 -8
- data/lib/wgit/url.rb +38 -38
- data/lib/wgit/utils.rb +124 -14
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -14
- metadata +74 -30
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/model.rb
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "./utils"
|
|
4
|
+
|
|
5
|
+
module Wgit
|
|
6
|
+
# Module used to build the Database collection objects, forming a data model.
|
|
7
|
+
# The models produced are Hash like and therefore DB agnostic. Each model
|
|
8
|
+
# will contain a unique field used for searching and avoiding duplicates,
|
|
9
|
+
# this is typically a `url` field. Also contained in the model are the
|
|
10
|
+
# search fields used in Database and Document #search calls.
|
|
11
|
+
module Model
|
|
12
|
+
# The default search fields used in Database and Document #search calls.
|
|
13
|
+
# The number of matches for each field is multiplied by the field weight,
|
|
14
|
+
# the total is the search score, used to sort the search results.
|
|
15
|
+
# Call `Wgit::Model.set_default_search_fields` to revert to default.
|
|
16
|
+
DEFAULT_SEARCH_FIELDS = {
|
|
17
|
+
title: 2,
|
|
18
|
+
description: 2,
|
|
19
|
+
keywords: 2,
|
|
20
|
+
text: 1
|
|
21
|
+
}.freeze
|
|
22
|
+
|
|
23
|
+
# The search fields used in Database and Document #search calls.
|
|
24
|
+
# The number of matches for each field is multiplied by the field weight,
|
|
25
|
+
# the total is the search score, used to sort the search results.
|
|
26
|
+
# Call `Wgit::Model.set_default_search_fields` to revert to default.
|
|
27
|
+
@search_fields = DEFAULT_SEARCH_FIELDS
|
|
28
|
+
|
|
29
|
+
# Whether or not to include the Document#html in the #document model.
|
|
30
|
+
@include_doc_html = false
|
|
31
|
+
|
|
32
|
+
# Whether or not to include the Document#score in the #document model.
|
|
33
|
+
@include_doc_score = false
|
|
34
|
+
|
|
35
|
+
class << self
|
|
36
|
+
# The search fields used in Database and Document #search calls.
|
|
37
|
+
# A custom setter method is also provided for changing these fields.
|
|
38
|
+
attr_reader :search_fields
|
|
39
|
+
|
|
40
|
+
# Whether or not to include the Document#html in the #document model.
|
|
41
|
+
attr_accessor :include_doc_html
|
|
42
|
+
|
|
43
|
+
# Whether or not to include the Document#score in the #document model.
|
|
44
|
+
attr_accessor :include_doc_score
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Sets the search fields used in Database and Document #search calls.
|
|
48
|
+
#
|
|
49
|
+
# You can pass the fields as an Array of Symbols which gives each field a
|
|
50
|
+
# weight of 1 meaning all fields are considered of equal value. Or you can
|
|
51
|
+
# pass a Hash of Symbol => Int and specify the weights yourself, allowing
|
|
52
|
+
# you to customise the search rankings.
|
|
53
|
+
#
|
|
54
|
+
# Use like:
|
|
55
|
+
# ```
|
|
56
|
+
# Wgit::Model.set_search_fields [:title, :text], db
|
|
57
|
+
# => { title: 1, text: 1 }
|
|
58
|
+
# Wgit::Model.set_search_fields({ title: 2, text: 1 }, db)
|
|
59
|
+
# => { title: 2, text: 1 }
|
|
60
|
+
# ```
|
|
61
|
+
#
|
|
62
|
+
# If the given db (database) param responds to #search_fields= then it will
|
|
63
|
+
# be called and given the fields to set. This should perform whatever the
|
|
64
|
+
# database adapter needs in order to search using the given fields e.g.
|
|
65
|
+
# creating a search index. Calling the DB enables the search_fields to be
|
|
66
|
+
# set globally within Wgit by one method call, this one.
|
|
67
|
+
#
|
|
68
|
+
# @param fields [Array<Symbol>, Hash<Symbol, Integer>] The field names or
|
|
69
|
+
# the field names with their corresponding search weights.
|
|
70
|
+
# @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
|
|
71
|
+
# db responds to #search_fields=, it will be called and given the fields.
|
|
72
|
+
# @raise [StandardError] If fields is of an incorrect type.
|
|
73
|
+
# @return [Hash<Symbol, Integer>] The fields and their weights.
|
|
74
|
+
def self.set_search_fields(fields, db = nil)
|
|
75
|
+
# We need a Hash of fields => weights (Symbols => Integers).
|
|
76
|
+
case fields
|
|
77
|
+
when Array # of Strings/Symbols.
|
|
78
|
+
fields = fields.map { |field| [field.to_sym, 1] }
|
|
79
|
+
when Hash # of Strings/Symbols and Integers.
|
|
80
|
+
fields = fields.map { |field, weight| [field.to_sym, weight.to_i] }
|
|
81
|
+
else
|
|
82
|
+
raise "fields must be an Array or Hash, not a #{fields.class}"
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
@search_fields = fields.to_h
|
|
86
|
+
db.search_fields = @search_fields if db.respond_to?(:search_fields=)
|
|
87
|
+
|
|
88
|
+
@search_fields
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Resets the search fields used in Database and Document #search calls to DEFAULT_SEARCH_FIELDS.
|
|
92
|
+
#
|
|
93
|
+
# If the given db (database) param responds to #search_fields= then it will
|
|
94
|
+
# be called and given the fields to set. This should perform whatever the
|
|
95
|
+
# database adapter needs in order to search using the given fields e.g.
|
|
96
|
+
# creating a search index. Calling the DB enables the search_fields to be
|
|
97
|
+
# set globally within Wgit by one method call, this one.
|
|
98
|
+
#
|
|
99
|
+
# @param db [Wgit::Database::DatabaseAdapter] A connected db instance. If
|
|
100
|
+
# db responds to #search_fields=, it will be called and given the fields.
|
|
101
|
+
# @return [Hash<Symbol, Integer>] The fields and their weights.
|
|
102
|
+
def self.set_default_search_fields(db = nil)
|
|
103
|
+
set_search_fields(DEFAULT_SEARCH_FIELDS, db)
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# The data model for a Wgit::Url collection object and for an embedded
|
|
107
|
+
# 'url' inside a Wgit::Document collection object.
|
|
108
|
+
#
|
|
109
|
+
# The unique field for this model is `model['url']`.
|
|
110
|
+
#
|
|
111
|
+
# @param url [Wgit::Url] The Url data object.
|
|
112
|
+
# @return [Hash] The URL model ready for DB insertion.
|
|
113
|
+
def self.url(url)
|
|
114
|
+
raise "url must respond_to? :to_h" unless url.respond_to?(:to_h)
|
|
115
|
+
|
|
116
|
+
model = url.to_h
|
|
117
|
+
select_bson_types(model)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# The data model for a Wgit::Document collection object.
|
|
121
|
+
#
|
|
122
|
+
# The unique field for this model is `model['url']['url']`.
|
|
123
|
+
#
|
|
124
|
+
# @param doc [Wgit::Document] The Document data object.
|
|
125
|
+
# @return [Hash] The Document model ready for DB insertion.
|
|
126
|
+
def self.document(doc)
|
|
127
|
+
raise "doc must respond_to? :to_h" unless doc.respond_to?(:to_h)
|
|
128
|
+
|
|
129
|
+
model = doc.to_h(
|
|
130
|
+
include_html: @include_doc_html, include_score: @include_doc_score
|
|
131
|
+
)
|
|
132
|
+
model["url"] = url(doc.url) # Expand Url String into full object.
|
|
133
|
+
|
|
134
|
+
select_bson_types(model)
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Common fields when inserting a record into the DB.
|
|
138
|
+
#
|
|
139
|
+
# @return [Hash] Insertion fields common to all models.
|
|
140
|
+
def self.common_insert_data
|
|
141
|
+
{
|
|
142
|
+
date_added: Wgit::Utils.time_stamp,
|
|
143
|
+
date_modified: Wgit::Utils.time_stamp
|
|
144
|
+
}
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
# Common fields when updating a record in the DB.
|
|
148
|
+
#
|
|
149
|
+
# @return [Hash] Update fields common to all models.
|
|
150
|
+
def self.common_update_data
|
|
151
|
+
{
|
|
152
|
+
date_modified: Wgit::Utils.time_stamp
|
|
153
|
+
}
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
|
157
|
+
#
|
|
158
|
+
# @param model_hash [Hash] The model Hash to sanitize.
|
|
159
|
+
# @return [Hash] The model Hash with non bson types removed.
|
|
160
|
+
def self.select_bson_types(model_hash)
|
|
161
|
+
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
end
|
data/lib/wgit/response.rb
CHANGED
|
@@ -27,7 +27,7 @@ module Wgit
|
|
|
27
27
|
|
|
28
28
|
# Defaults some values and returns a "blank" Wgit::Response object.
|
|
29
29
|
def initialize
|
|
30
|
-
@body =
|
|
30
|
+
@body = ""
|
|
31
31
|
@headers = {}
|
|
32
32
|
@redirections = {}
|
|
33
33
|
@total_time = 0.0
|
|
@@ -45,7 +45,7 @@ module Wgit
|
|
|
45
45
|
# @param time [Float] The time to add to @total_time.
|
|
46
46
|
# @return [Float] @total_time's new value.
|
|
47
47
|
def add_total_time(time)
|
|
48
|
-
@total_time +=
|
|
48
|
+
@total_time += time || 0.0
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
# Sets the HTML response body.
|
|
@@ -53,7 +53,7 @@ module Wgit
|
|
|
53
53
|
# @param str [String] The new HTML body.
|
|
54
54
|
# @return [String] @body's new value.
|
|
55
55
|
def body=(str)
|
|
56
|
-
@body =
|
|
56
|
+
@body = str || ""
|
|
57
57
|
end
|
|
58
58
|
|
|
59
59
|
# Returns the HTML response body or nil (if it's empty).
|
|
@@ -81,10 +81,7 @@ module Wgit
|
|
|
81
81
|
return
|
|
82
82
|
end
|
|
83
83
|
|
|
84
|
-
@headers = headers.
|
|
85
|
-
k = k.downcase.gsub('-', '_').to_sym
|
|
86
|
-
[k, v]
|
|
87
|
-
end.to_h
|
|
84
|
+
@headers = headers.transform_keys { |k| k.downcase.gsub("-", "_").to_sym }
|
|
88
85
|
end
|
|
89
86
|
|
|
90
87
|
# Returns whether or not the response is 404 Not Found.
|
|
@@ -146,7 +143,7 @@ module Wgit
|
|
|
146
143
|
# @return [Boolean] True if Wgit should not index this site, false
|
|
147
144
|
# otherwise.
|
|
148
145
|
def no_index?
|
|
149
|
-
headers.fetch(:x_robots_tag,
|
|
146
|
+
headers.fetch(:x_robots_tag, "").downcase.strip == "noindex"
|
|
150
147
|
end
|
|
151
148
|
|
|
152
149
|
alias_method :code, :status
|
data/lib/wgit/robots_parser.rb
CHANGED
|
@@ -7,15 +7,15 @@ module Wgit
|
|
|
7
7
|
include Wgit::Assertable
|
|
8
8
|
|
|
9
9
|
# Key representing the start of a comment.
|
|
10
|
-
KEY_COMMENT =
|
|
10
|
+
KEY_COMMENT = "#"
|
|
11
11
|
# Key value separator used in robots.txt files.
|
|
12
|
-
KEY_SEPARATOR =
|
|
12
|
+
KEY_SEPARATOR = ":"
|
|
13
13
|
# Key representing a user agent.
|
|
14
|
-
KEY_USER_AGENT =
|
|
14
|
+
KEY_USER_AGENT = "User-agent"
|
|
15
15
|
# Key representing an allow URL rule.
|
|
16
|
-
KEY_ALLOW =
|
|
16
|
+
KEY_ALLOW = "Allow"
|
|
17
17
|
# Key representing a disallow URL rule.
|
|
18
|
-
KEY_DISALLOW =
|
|
18
|
+
KEY_DISALLOW = "Disallow"
|
|
19
19
|
|
|
20
20
|
# Value representing the Wgit user agent.
|
|
21
21
|
USER_AGENT_WGIT = :wgit
|
|
@@ -143,7 +143,7 @@ module Wgit
|
|
|
143
143
|
return line unless line.count(KEY_SEPARATOR) == 1
|
|
144
144
|
|
|
145
145
|
segs = line.split(KEY_SEPARATOR)
|
|
146
|
-
return
|
|
146
|
+
return "" if segs.size == 1
|
|
147
147
|
|
|
148
148
|
segs.last.strip
|
|
149
149
|
end
|
|
@@ -176,13 +176,13 @@ module Wgit
|
|
|
176
176
|
|
|
177
177
|
def parse_special_syntax(path)
|
|
178
178
|
# Remove $ e.g. "/blah$" becomes "/blah"
|
|
179
|
-
path = path.gsub(
|
|
179
|
+
path = path.gsub("$", "")
|
|
180
180
|
|
|
181
181
|
# Remove any inline comments e.g. "/blah # comment" becomes "/blah"
|
|
182
182
|
path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
|
|
183
183
|
|
|
184
184
|
# Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
|
|
185
|
-
path =
|
|
185
|
+
path = "*" if path.empty?
|
|
186
186
|
|
|
187
187
|
path
|
|
188
188
|
end
|
data/lib/wgit/url.rb
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require_relative
|
|
4
|
-
require_relative
|
|
5
|
-
require
|
|
6
|
-
require
|
|
3
|
+
require_relative "utils"
|
|
4
|
+
require_relative "assertable"
|
|
5
|
+
require "uri"
|
|
6
|
+
require "addressable/uri"
|
|
7
7
|
|
|
8
8
|
module Wgit
|
|
9
9
|
# Class modeling/serialising a web based HTTP URL.
|
|
@@ -56,11 +56,11 @@ module Wgit
|
|
|
56
56
|
obj = url_or_obj
|
|
57
57
|
assert_respond_to(obj, :fetch)
|
|
58
58
|
|
|
59
|
-
url = obj.fetch(
|
|
60
|
-
crawled = obj.fetch(
|
|
61
|
-
date_crawled = obj.fetch(
|
|
62
|
-
crawl_duration = obj.fetch(
|
|
63
|
-
redirects = obj.fetch(
|
|
59
|
+
url = obj.fetch("url") # Should always be present.
|
|
60
|
+
crawled = obj.fetch("crawled", false)
|
|
61
|
+
date_crawled = obj.fetch("date_crawled", nil)
|
|
62
|
+
crawl_duration = obj.fetch("crawl_duration", nil)
|
|
63
|
+
redirects = obj.fetch("redirects", {})
|
|
64
64
|
end
|
|
65
65
|
|
|
66
66
|
@uri = Addressable::URI.parse(url)
|
|
@@ -89,7 +89,7 @@ module Wgit
|
|
|
89
89
|
# @raise [StandardError] If obj.is_a?(String) is false.
|
|
90
90
|
# @return [Wgit::Url] A Wgit::Url instance.
|
|
91
91
|
def self.parse(obj)
|
|
92
|
-
raise
|
|
92
|
+
raise "Can only parse if obj#is_a?(String)" unless obj.is_a?(String)
|
|
93
93
|
|
|
94
94
|
# Return a Wgit::Url as is to avoid losing state e.g. date_crawled etc.
|
|
95
95
|
obj.is_a?(Wgit::Url) ? obj : new(obj)
|
|
@@ -98,7 +98,7 @@ module Wgit
|
|
|
98
98
|
# Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
|
|
99
99
|
# be parsed successfully e.g. the String is invalid.
|
|
100
100
|
#
|
|
101
|
-
# Use this method when you can't
|
|
101
|
+
# Use this method when you can't guarantee that obj is parsable as a URL.
|
|
102
102
|
# See Wgit::Url.parse for more information.
|
|
103
103
|
#
|
|
104
104
|
# @param obj [Object] The object to parse, which #is_a?(String).
|
|
@@ -227,7 +227,7 @@ Addressable::URI::InvalidURIError")
|
|
|
227
227
|
def relative?(opts = {})
|
|
228
228
|
defaults = { origin: nil, host: nil, domain: nil, brand: nil }
|
|
229
229
|
opts = defaults.merge(opts)
|
|
230
|
-
raise
|
|
230
|
+
raise "Url (self) cannot be empty" if empty?
|
|
231
231
|
|
|
232
232
|
return false if scheme_relative?
|
|
233
233
|
return true if @uri.relative?
|
|
@@ -295,11 +295,11 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
295
295
|
# @return [Wgit::Url] self + separator + other, separator depends on other.
|
|
296
296
|
def join(other)
|
|
297
297
|
other = Wgit::Url.new(other)
|
|
298
|
-
raise
|
|
298
|
+
raise "other must be relative" unless other.relative?
|
|
299
299
|
|
|
300
300
|
other = other.omit_leading_slash
|
|
301
|
-
separator = %w[# ? .].include?(other[0]) ?
|
|
302
|
-
separator =
|
|
301
|
+
separator = %w[# ? .].include?(other[0]) ? "" : "/"
|
|
302
|
+
separator = "" if end_with?("/")
|
|
303
303
|
joined = self + separator + other
|
|
304
304
|
|
|
305
305
|
Wgit::Url.new(joined)
|
|
@@ -335,7 +335,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
335
335
|
# @return [Wgit::Url] Self in absolute form.
|
|
336
336
|
def make_absolute(doc)
|
|
337
337
|
assert_type(doc, Wgit::Document)
|
|
338
|
-
raise
|
|
338
|
+
raise "Cannot make absolute when Document @url is not valid" \
|
|
339
339
|
unless doc.url.valid?
|
|
340
340
|
|
|
341
341
|
return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
|
|
@@ -355,7 +355,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
355
355
|
|
|
356
356
|
return self if absolute? && !scheme_relative?
|
|
357
357
|
|
|
358
|
-
separator = scheme_relative? ?
|
|
358
|
+
separator = scheme_relative? ? "" : "//"
|
|
359
359
|
Wgit::Url.new("#{scheme}:#{separator}#{self}")
|
|
360
360
|
end
|
|
361
361
|
|
|
@@ -364,8 +364,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
364
364
|
#
|
|
365
365
|
# @return [Hash] self's instance vars as a Hash.
|
|
366
366
|
def to_h
|
|
367
|
-
h = Wgit::Utils.to_h(self, ignore: [
|
|
368
|
-
Hash[h.to_a.insert(0, [
|
|
367
|
+
h = Wgit::Utils.to_h(self, ignore: ["@uri"])
|
|
368
|
+
Hash[h.to_a.insert(0, ["url", to_s])] # Insert url at position 0.
|
|
369
369
|
end
|
|
370
370
|
|
|
371
371
|
# Returns a normalised URI object for this URL.
|
|
@@ -440,7 +440,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
440
440
|
dot_domain = ".#{to_domain}"
|
|
441
441
|
return nil unless include?(dot_domain)
|
|
442
442
|
|
|
443
|
-
sub_domain = to_host.sub(dot_domain,
|
|
443
|
+
sub_domain = to_host.sub(dot_domain, "")
|
|
444
444
|
Wgit::Url.new(sub_domain)
|
|
445
445
|
end
|
|
446
446
|
|
|
@@ -450,7 +450,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
450
450
|
# @return [Wgit::Url, nil] Containing just the brand or nil.
|
|
451
451
|
def to_brand
|
|
452
452
|
domain = to_domain
|
|
453
|
-
domain ? Wgit::Url.new(domain.split(
|
|
453
|
+
domain ? Wgit::Url.new(domain.split(".").first) : nil
|
|
454
454
|
end
|
|
455
455
|
|
|
456
456
|
# Returns only the base of this URL e.g. the protocol scheme and host
|
|
@@ -486,7 +486,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
486
486
|
def to_path
|
|
487
487
|
path = @uri.path
|
|
488
488
|
return nil if path.nil? || path.empty?
|
|
489
|
-
return Wgit::Url.new(
|
|
489
|
+
return Wgit::Url.new("/") if path == "/"
|
|
490
490
|
|
|
491
491
|
Wgit::Url.new(path).omit_leading_slash
|
|
492
492
|
end
|
|
@@ -500,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
500
500
|
# an endpoint, / is returned.
|
|
501
501
|
def to_endpoint
|
|
502
502
|
endpoint = @uri.path
|
|
503
|
-
endpoint = "/#{endpoint}" unless endpoint.start_with?(
|
|
503
|
+
endpoint = "/#{endpoint}" unless endpoint.start_with?("/")
|
|
504
504
|
Wgit::Url.new(endpoint)
|
|
505
505
|
end
|
|
506
506
|
|
|
@@ -524,8 +524,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
524
524
|
query_str = to_query
|
|
525
525
|
return {} unless query_str
|
|
526
526
|
|
|
527
|
-
query_str.split(
|
|
528
|
-
k, v = param.split(
|
|
527
|
+
query_str.split("&").each_with_object({}) do |param, hash|
|
|
528
|
+
k, v = param.split("=")
|
|
529
529
|
k = k.to_sym if symbolize_keys
|
|
530
530
|
hash[k] = v
|
|
531
531
|
end
|
|
@@ -548,7 +548,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
548
548
|
path = to_path&.omit_trailing_slash
|
|
549
549
|
return nil unless path
|
|
550
550
|
|
|
551
|
-
segs = path.split(
|
|
551
|
+
segs = path.split(".")
|
|
552
552
|
segs.length > 1 ? Wgit::Url.new(segs.last) : nil
|
|
553
553
|
end
|
|
554
554
|
|
|
@@ -591,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
591
591
|
#
|
|
592
592
|
# @return [Wgit::Url] Self without a leading slash.
|
|
593
593
|
def omit_leading_slash
|
|
594
|
-
start_with?(
|
|
594
|
+
start_with?("/") ? Wgit::Url.new(self[1..]) : self
|
|
595
595
|
end
|
|
596
596
|
|
|
597
597
|
# Returns a new Wgit::Url containing self without a trailing slash. Is
|
|
@@ -600,7 +600,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
600
600
|
#
|
|
601
601
|
# @return [Wgit::Url] Self without a trailing slash.
|
|
602
602
|
def omit_trailing_slash
|
|
603
|
-
end_with?(
|
|
603
|
+
end_with?("/") ? Wgit::Url.new(chop) : self
|
|
604
604
|
end
|
|
605
605
|
|
|
606
606
|
# Returns a new Wgit::Url containing self without a leading or trailing
|
|
@@ -621,9 +621,9 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
621
621
|
# @return [Wgit::Url] Self containing everything after the base.
|
|
622
622
|
def omit_base
|
|
623
623
|
base_url = to_base
|
|
624
|
-
omit_base = base_url ? gsub(base_url,
|
|
624
|
+
omit_base = base_url ? gsub(base_url, "") : self
|
|
625
625
|
|
|
626
|
-
return self if [
|
|
626
|
+
return self if ["", "/"].include?(omit_base)
|
|
627
627
|
|
|
628
628
|
Wgit::Url.new(omit_base).omit_leading_slash
|
|
629
629
|
end
|
|
@@ -636,9 +636,9 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
636
636
|
# @return [Wgit::Url] Self containing everything after the origin.
|
|
637
637
|
def omit_origin
|
|
638
638
|
origin = to_origin
|
|
639
|
-
omit_origin = origin ? gsub(origin,
|
|
639
|
+
omit_origin = origin ? gsub(origin, "") : self
|
|
640
640
|
|
|
641
|
-
return self if [
|
|
641
|
+
return self if ["", "/"].include?(omit_origin)
|
|
642
642
|
|
|
643
643
|
Wgit::Url.new(omit_origin).omit_leading_slash
|
|
644
644
|
end
|
|
@@ -652,7 +652,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
652
652
|
# @return [Wgit::Url] Self with the query string portion removed.
|
|
653
653
|
def omit_query
|
|
654
654
|
query = to_query
|
|
655
|
-
omit_query_string = query ? gsub("?#{query}",
|
|
655
|
+
omit_query_string = query ? gsub("?#{query}", "") : self
|
|
656
656
|
|
|
657
657
|
Wgit::Url.new(omit_query_string)
|
|
658
658
|
end
|
|
@@ -667,7 +667,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
667
667
|
# @return [Wgit::Url] Self with the fragment portion removed.
|
|
668
668
|
def omit_fragment
|
|
669
669
|
fragment = to_fragment
|
|
670
|
-
omit_fragment = fragment ? gsub("##{fragment}",
|
|
670
|
+
omit_fragment = fragment ? gsub("##{fragment}", "") : self
|
|
671
671
|
|
|
672
672
|
Wgit::Url.new(omit_fragment)
|
|
673
673
|
end
|
|
@@ -677,7 +677,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
677
677
|
#
|
|
678
678
|
# @return [Boolean] True if self is a query string, false otherwise.
|
|
679
679
|
def query?
|
|
680
|
-
start_with?(
|
|
680
|
+
start_with?("?")
|
|
681
681
|
end
|
|
682
682
|
|
|
683
683
|
# Returns true if self is a URL fragment e.g. #top etc. Note this
|
|
@@ -685,14 +685,14 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
685
685
|
#
|
|
686
686
|
# @return [Boolean] True if self is a fragment, false otherwise.
|
|
687
687
|
def fragment?
|
|
688
|
-
start_with?(
|
|
688
|
+
start_with?("#")
|
|
689
689
|
end
|
|
690
690
|
|
|
691
691
|
# Returns true if self equals '/' a.k.a. index.
|
|
692
692
|
#
|
|
693
693
|
# @return [Boolean] True if self equals '/', false otherwise.
|
|
694
694
|
def index?
|
|
695
|
-
self ==
|
|
695
|
+
self == "/"
|
|
696
696
|
end
|
|
697
697
|
|
|
698
698
|
# Returns true if self starts with '//' a.k.a a scheme/protocol relative
|
|
@@ -700,7 +700,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
|
700
700
|
#
|
|
701
701
|
# @return [Boolean] True if self starts with '//', false otherwise.
|
|
702
702
|
def scheme_relative?
|
|
703
|
-
start_with?(
|
|
703
|
+
start_with?("//")
|
|
704
704
|
end
|
|
705
705
|
|
|
706
706
|
alias_method :crawled?, :crawled
|