wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
@@ -0,0 +1,193 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Wgit
|
4
|
+
# The RobotsParser class handles parsing and processing of a web server's
|
5
|
+
# robots.txt file.
|
6
|
+
class RobotsParser
|
7
|
+
include Wgit::Assertable
|
8
|
+
|
9
|
+
# Key representing the start of a comment.
|
10
|
+
KEY_COMMENT = "#"
|
11
|
+
# Key value separator used in robots.txt files.
|
12
|
+
KEY_SEPARATOR = ":"
|
13
|
+
# Key representing a user agent.
|
14
|
+
KEY_USER_AGENT = "User-agent"
|
15
|
+
# Key representing an allow URL rule.
|
16
|
+
KEY_ALLOW = "Allow"
|
17
|
+
# Key representing a disallow URL rule.
|
18
|
+
KEY_DISALLOW = "Disallow"
|
19
|
+
|
20
|
+
# Value representing the Wgit user agent.
|
21
|
+
USER_AGENT_WGIT = :wgit
|
22
|
+
# Value representing any user agent including Wgit.
|
23
|
+
USER_AGENT_ANY = :*
|
24
|
+
|
25
|
+
# Value representing any and all paths.
|
26
|
+
PATHS_ALL = %w[/ *].freeze
|
27
|
+
|
28
|
+
# Hash containing the user-agent allow/disallow URL rules. Looks like:
|
29
|
+
# allow_paths: ["/public", ...]
|
30
|
+
# disallow_paths: ["/accounts", ...]
|
31
|
+
attr_reader :rules
|
32
|
+
|
33
|
+
# Initializes and returns a Wgit::RobotsParser instance having parsed the
|
34
|
+
# robots.txt contents.
|
35
|
+
#
|
36
|
+
# @param contents [String, #to_s] The contents of the robots.txt file to be
|
37
|
+
# parsed.
|
38
|
+
def initialize(contents)
|
39
|
+
@rules = {
|
40
|
+
allow_paths: Set.new,
|
41
|
+
disallow_paths: Set.new
|
42
|
+
}
|
43
|
+
|
44
|
+
assert_respond_to(contents, :to_s)
|
45
|
+
parse(contents.to_s)
|
46
|
+
end
|
47
|
+
|
48
|
+
# Overrides Object#inspect to shorten the printed output of a RobotsParser.
|
49
|
+
#
|
50
|
+
# @return [String] A short textual representation of this Parser.
|
51
|
+
def inspect
|
52
|
+
"#<Wgit::RobotsParser has_rules=#{rules?} no_index=#{no_index?}>"
|
53
|
+
end
|
54
|
+
|
55
|
+
# Returns the allow paths/rules for this parser's robots.txt contents.
|
56
|
+
#
|
57
|
+
# @return [Array<String>] The allow paths/rules to follow.
|
58
|
+
def allow_paths
|
59
|
+
@rules[:allow_paths].to_a
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns the disallow paths/rules for this parser's robots.txt contents.
|
63
|
+
#
|
64
|
+
# @return [Array<String>] The disallow paths/rules to follow.
|
65
|
+
def disallow_paths
|
66
|
+
@rules[:disallow_paths].to_a
|
67
|
+
end
|
68
|
+
|
69
|
+
# Returns whether or not there are rules applying to Wgit.
|
70
|
+
#
|
71
|
+
# @return [Boolean] True if there are rules for Wgit to follow, false
|
72
|
+
# otherwise.
|
73
|
+
def rules?
|
74
|
+
allow_rules? || disallow_rules?
|
75
|
+
end
|
76
|
+
|
77
|
+
# Returns whether or not there are allow rules applying to Wgit.
|
78
|
+
#
|
79
|
+
# @return [Boolean] True if there are allow rules for Wgit to follow,
|
80
|
+
# false otherwise.
|
81
|
+
def allow_rules?
|
82
|
+
@rules[:allow_paths].any?
|
83
|
+
end
|
84
|
+
|
85
|
+
# Returns whether or not there are disallow rules applying to Wgit.
|
86
|
+
#
|
87
|
+
# @return [Boolean] True if there are disallow rules for Wgit to follow,
|
88
|
+
# false otherwise.
|
89
|
+
def disallow_rules?
|
90
|
+
@rules[:disallow_paths].any?
|
91
|
+
end
|
92
|
+
|
93
|
+
# Returns whether or not Wgit is banned from indexing this site.
|
94
|
+
#
|
95
|
+
# @return [Boolean] True if Wgit should not index this site, false
|
96
|
+
# otherwise.
|
97
|
+
def no_index?
|
98
|
+
@rules[:disallow_paths].any? { |path| PATHS_ALL.include?(path) }
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
# Parses the file contents and sets @rules.
|
104
|
+
def parse(contents)
|
105
|
+
user_agents = []
|
106
|
+
new_block = false
|
107
|
+
|
108
|
+
contents.split("\n").each do |line|
|
109
|
+
line.strip!
|
110
|
+
next if line.empty? || line.start_with?(KEY_COMMENT)
|
111
|
+
|
112
|
+
# A user agent block is denoted by N User-agent's followed by M
|
113
|
+
# Allow/Disallow's. After which a new block is formed from scratch.
|
114
|
+
if start_with_any_case?(line, KEY_USER_AGENT)
|
115
|
+
if new_block
|
116
|
+
user_agents = []
|
117
|
+
new_block = false
|
118
|
+
end
|
119
|
+
user_agents << remove_key(line, KEY_USER_AGENT).downcase.to_sym
|
120
|
+
else
|
121
|
+
new_block = true
|
122
|
+
end
|
123
|
+
|
124
|
+
if start_with_any_case?(line, KEY_ALLOW)
|
125
|
+
append_allow_rule(user_agents, line)
|
126
|
+
elsif start_with_any_case?(line, KEY_DISALLOW)
|
127
|
+
append_disallow_rule(user_agents, line)
|
128
|
+
elsif !start_with_any_case?(line, KEY_USER_AGENT)
|
129
|
+
Wgit.logger.debug("Skipping unsupported robots.txt line: #{line}")
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# Implements start_with? but case insensitive.
|
135
|
+
def start_with_any_case?(str, prefix)
|
136
|
+
str.downcase.start_with?(prefix.downcase)
|
137
|
+
end
|
138
|
+
|
139
|
+
# Returns line with key removed (if present). Otherwise line is returned
|
140
|
+
# as given.
|
141
|
+
def remove_key(line, key)
|
142
|
+
return line unless start_with_any_case?(line, key)
|
143
|
+
return line unless line.count(KEY_SEPARATOR) == 1
|
144
|
+
|
145
|
+
segs = line.split(KEY_SEPARATOR)
|
146
|
+
return "" if segs.size == 1
|
147
|
+
|
148
|
+
segs.last.strip
|
149
|
+
end
|
150
|
+
|
151
|
+
# Don't append * or /, as this means all paths, which is the same as no
|
152
|
+
# allow_paths when passed to Wgit::Crawler.
|
153
|
+
def append_allow_rule(user_agents, line)
|
154
|
+
return unless wgit_user_agent?(user_agents)
|
155
|
+
|
156
|
+
path = remove_key(line, KEY_ALLOW)
|
157
|
+
path = parse_special_syntax(path)
|
158
|
+
return if PATHS_ALL.include?(path)
|
159
|
+
|
160
|
+
@rules[:allow_paths] << path
|
161
|
+
end
|
162
|
+
|
163
|
+
def append_disallow_rule(user_agents, line)
|
164
|
+
return unless wgit_user_agent?(user_agents)
|
165
|
+
|
166
|
+
path = remove_key(line, KEY_DISALLOW)
|
167
|
+
path = parse_special_syntax(path)
|
168
|
+
@rules[:disallow_paths] << path
|
169
|
+
end
|
170
|
+
|
171
|
+
def wgit_user_agent?(user_agents)
|
172
|
+
user_agents.any? do |agent|
|
173
|
+
[USER_AGENT_ANY, USER_AGENT_WGIT].include?(agent.downcase)
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def parse_special_syntax(path)
|
178
|
+
# Remove $ e.g. "/blah$" becomes "/blah"
|
179
|
+
path = path.gsub("$", "")
|
180
|
+
|
181
|
+
# Remove any inline comments e.g. "/blah # comment" becomes "/blah"
|
182
|
+
path = path.split(" #{KEY_COMMENT}").first if path.include?(" #{KEY_COMMENT}")
|
183
|
+
|
184
|
+
# Replace an empty path with * e.g. "Allow: " becomes "Allow: *"
|
185
|
+
path = "*" if path.empty?
|
186
|
+
|
187
|
+
path
|
188
|
+
end
|
189
|
+
|
190
|
+
alias_method :paths, :rules
|
191
|
+
alias_method :banned?, :no_index?
|
192
|
+
end
|
193
|
+
end
|
data/lib/wgit/url.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative
|
5
|
-
require
|
6
|
-
require
|
3
|
+
require_relative "utils"
|
4
|
+
require_relative "assertable"
|
5
|
+
require "uri"
|
6
|
+
require "addressable/uri"
|
7
7
|
|
8
8
|
module Wgit
|
9
9
|
# Class modeling/serialising a web based HTTP URL.
|
@@ -28,6 +28,9 @@ module Wgit
|
|
28
28
|
# The duration of the crawl for this Url (in seconds).
|
29
29
|
attr_accessor :crawl_duration
|
30
30
|
|
31
|
+
# Record the redirects from the initial Url to the final Url.
|
32
|
+
attr_reader :redirects
|
33
|
+
|
31
34
|
# Initializes a new instance of Wgit::Url which models a web based
|
32
35
|
# HTTP URL.
|
33
36
|
#
|
@@ -53,16 +56,18 @@ module Wgit
|
|
53
56
|
obj = url_or_obj
|
54
57
|
assert_respond_to(obj, :fetch)
|
55
58
|
|
56
|
-
url = obj.fetch(
|
57
|
-
crawled = obj.fetch(
|
58
|
-
date_crawled = obj.fetch(
|
59
|
-
crawl_duration = obj.fetch(
|
59
|
+
url = obj.fetch("url") # Should always be present.
|
60
|
+
crawled = obj.fetch("crawled", false)
|
61
|
+
date_crawled = obj.fetch("date_crawled", nil)
|
62
|
+
crawl_duration = obj.fetch("crawl_duration", nil)
|
63
|
+
redirects = obj.fetch("redirects", {})
|
60
64
|
end
|
61
65
|
|
62
66
|
@uri = Addressable::URI.parse(url)
|
63
67
|
@crawled = crawled
|
64
68
|
@date_crawled = date_crawled
|
65
69
|
@crawl_duration = crawl_duration
|
70
|
+
@redirects = redirects || {}
|
66
71
|
|
67
72
|
super(url)
|
68
73
|
end
|
@@ -84,7 +89,7 @@ module Wgit
|
|
84
89
|
# @raise [StandardError] If obj.is_a?(String) is false.
|
85
90
|
# @return [Wgit::Url] A Wgit::Url instance.
|
86
91
|
def self.parse(obj)
|
87
|
-
raise
|
92
|
+
raise "Can only parse if obj#is_a?(String)" unless obj.is_a?(String)
|
88
93
|
|
89
94
|
# Return a Wgit::Url as is to avoid losing state e.g. date_crawled etc.
|
90
95
|
obj.is_a?(Wgit::Url) ? obj : new(obj)
|
@@ -107,16 +112,6 @@ Addressable::URI::InvalidURIError")
|
|
107
112
|
nil
|
108
113
|
end
|
109
114
|
|
110
|
-
# Sets the @crawled instance var, also setting @date_crawled for
|
111
|
-
# convenience.
|
112
|
-
#
|
113
|
-
# @param bool [Boolean] True if this Url has been crawled, false otherwise.
|
114
|
-
# @return [Boolean] The value of bool having been set.
|
115
|
-
def crawled=(bool)
|
116
|
-
@crawled = bool
|
117
|
-
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
118
|
-
end
|
119
|
-
|
120
115
|
# Overrides String#inspect to distinguish this Url from a String.
|
121
116
|
#
|
122
117
|
# @return [String] A short textual representation of this Url.
|
@@ -134,6 +129,71 @@ Addressable::URI::InvalidURIError")
|
|
134
129
|
super(new_url)
|
135
130
|
end
|
136
131
|
|
132
|
+
# Overrides String#concat which oddly returns a Wgit::Url object, and
|
133
|
+
# instead returns a String. Therefore this method works the same as if
|
134
|
+
# you call String#concat, or the similar String#+, which is desired for
|
135
|
+
# this method. If you want to join two Urls, use Wgit::Url#join method.
|
136
|
+
#
|
137
|
+
# @param other [String] The String to concat onto this one.
|
138
|
+
# @return [String] The new concatted String, not a Wgit::Url.
|
139
|
+
def concat(other)
|
140
|
+
to_s.concat(other.to_s)
|
141
|
+
end
|
142
|
+
|
143
|
+
# Sets the @crawled instance var, also setting @date_crawled for
|
144
|
+
# convenience.
|
145
|
+
#
|
146
|
+
# @param bool [Boolean] True if this Url has been crawled, false otherwise.
|
147
|
+
# @return [Boolean] The value of bool having been set.
|
148
|
+
def crawled=(bool)
|
149
|
+
@crawled = bool
|
150
|
+
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
151
|
+
end
|
152
|
+
|
153
|
+
# Sets the @redirects instance var, mapping any Strings into Wgit::Urls.
|
154
|
+
#
|
155
|
+
# @param redirects [Hash] The redirects Hash to set for this Url.
|
156
|
+
def redirects=(redirects)
|
157
|
+
assert_type(redirects, Hash)
|
158
|
+
|
159
|
+
map_to_url = proc do |url|
|
160
|
+
Wgit::Url.new(url.to_s, crawled: @crawled, date_crawled: @date_crawled)
|
161
|
+
end
|
162
|
+
|
163
|
+
@redirects = redirects
|
164
|
+
.map { |from, to| [map_to_url.call(from), map_to_url.call(to)] }
|
165
|
+
.to_h
|
166
|
+
end
|
167
|
+
|
168
|
+
# Returns the Wgit::Url's starting with the originally requested Url to be
|
169
|
+
# crawled, followed by each redirected to Url, finishing with the final
|
170
|
+
# crawled Url e.g.
|
171
|
+
#
|
172
|
+
# Example Url redirects journey (dictated by the webserver):
|
173
|
+
#
|
174
|
+
# ```
|
175
|
+
# http://example.com => 301 to https://example.com
|
176
|
+
# https://example.com => 301 to https://example.com/
|
177
|
+
# https://example.com/ => 200 OK (no more redirects, crawl complete)
|
178
|
+
# ```
|
179
|
+
#
|
180
|
+
# Would return an Array of Wgit::Url's in the form of:
|
181
|
+
#
|
182
|
+
# ```
|
183
|
+
# %w(
|
184
|
+
# http://example.com
|
185
|
+
# https://example.com
|
186
|
+
# https://example.com/
|
187
|
+
# )
|
188
|
+
# ```
|
189
|
+
#
|
190
|
+
# @return [Array<Wgit::Url>] Each redirected to Url's finishing with the
|
191
|
+
# final (successfully) crawled Url. If no redirects took place, then just
|
192
|
+
# the originally requested Url is returned inside the Array.
|
193
|
+
def redirects_journey
|
194
|
+
[redirects.keys, self].flatten
|
195
|
+
end
|
196
|
+
|
137
197
|
# Returns true if self is a relative Url; false if absolute.
|
138
198
|
#
|
139
199
|
# An absolute URL must have a scheme prefix e.g.
|
@@ -167,10 +227,10 @@ Addressable::URI::InvalidURIError")
|
|
167
227
|
def relative?(opts = {})
|
168
228
|
defaults = { origin: nil, host: nil, domain: nil, brand: nil }
|
169
229
|
opts = defaults.merge(opts)
|
170
|
-
raise
|
230
|
+
raise "Url (self) cannot be empty" if empty?
|
171
231
|
|
172
232
|
return false if scheme_relative?
|
173
|
-
return true
|
233
|
+
return true if @uri.relative?
|
174
234
|
|
175
235
|
# Self is absolute but may be relative to the opts param e.g. host.
|
176
236
|
opts.select! { |_k, v| v }
|
@@ -226,22 +286,23 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
226
286
|
!valid?
|
227
287
|
end
|
228
288
|
|
229
|
-
#
|
230
|
-
# modified.
|
289
|
+
# Joins self and other together before returning a new Url. Self is not
|
290
|
+
# modified. Some magic occurs depending on what is being joined, see
|
291
|
+
# the source code for more information.
|
231
292
|
#
|
232
|
-
# @param other [Wgit::Url, String] The other to
|
293
|
+
# @param other [Wgit::Url, String] The other (relative) Url to join to the
|
294
|
+
# end of self.
|
233
295
|
# @return [Wgit::Url] self + separator + other, separator depends on other.
|
234
|
-
def
|
296
|
+
def join(other)
|
235
297
|
other = Wgit::Url.new(other)
|
236
|
-
raise
|
298
|
+
raise "other must be relative" unless other.relative?
|
237
299
|
|
238
300
|
other = other.omit_leading_slash
|
239
|
-
separator = %w[# ? .].include?(other[0]) ?
|
240
|
-
|
241
|
-
|
242
|
-
concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
|
301
|
+
separator = %w[# ? .].include?(other[0]) ? "" : "/"
|
302
|
+
separator = "" if end_with?("/")
|
303
|
+
joined = self + separator + other
|
243
304
|
|
244
|
-
Wgit::Url.new(
|
305
|
+
Wgit::Url.new(joined)
|
245
306
|
end
|
246
307
|
|
247
308
|
# Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
|
@@ -257,7 +318,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
257
318
|
#
|
258
319
|
# If self is absolute then it's returned as is, making this method
|
259
320
|
# idempotent. The doc's `<base>` element is used if present, otherwise
|
260
|
-
# `doc.url` is used as the base; which is
|
321
|
+
# `doc.url` is used as the base; which is joined with self.
|
261
322
|
#
|
262
323
|
# Typically used to build an absolute link obtained from a document.
|
263
324
|
#
|
@@ -267,19 +328,19 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
267
328
|
#
|
268
329
|
# link.make_absolute(doc) # => "http://example.com/favicon.png"
|
269
330
|
#
|
270
|
-
# @param doc [Wgit::Document] The doc whose base Url is
|
331
|
+
# @param doc [Wgit::Document] The doc whose base Url is joined with
|
271
332
|
# self.
|
272
333
|
# @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
|
273
334
|
# raises an Exception.
|
274
335
|
# @return [Wgit::Url] Self in absolute form.
|
275
336
|
def make_absolute(doc)
|
276
337
|
assert_type(doc, Wgit::Document)
|
277
|
-
raise
|
338
|
+
raise "Cannot make absolute when Document @url is not valid" \
|
278
339
|
unless doc.url.valid?
|
279
340
|
|
280
341
|
return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
|
281
342
|
|
282
|
-
absolute? ? self : doc.base_url(link: self).
|
343
|
+
absolute? ? self : doc.base_url(link: self).join(self)
|
283
344
|
end
|
284
345
|
|
285
346
|
# Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
|
@@ -294,7 +355,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
294
355
|
|
295
356
|
return self if absolute? && !scheme_relative?
|
296
357
|
|
297
|
-
separator = scheme_relative? ?
|
358
|
+
separator = scheme_relative? ? "" : "//"
|
298
359
|
Wgit::Url.new("#{scheme}:#{separator}#{self}")
|
299
360
|
end
|
300
361
|
|
@@ -303,8 +364,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
303
364
|
#
|
304
365
|
# @return [Hash] self's instance vars as a Hash.
|
305
366
|
def to_h
|
306
|
-
h = Wgit::Utils.to_h(self, ignore: [
|
307
|
-
Hash[h.to_a.insert(0, [
|
367
|
+
h = Wgit::Utils.to_h(self, ignore: ["@uri"])
|
368
|
+
Hash[h.to_a.insert(0, ["url", to_s])] # Insert url at position 0.
|
308
369
|
end
|
309
370
|
|
310
371
|
# Returns a normalised URI object for this URL.
|
@@ -379,7 +440,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
379
440
|
dot_domain = ".#{to_domain}"
|
380
441
|
return nil unless include?(dot_domain)
|
381
442
|
|
382
|
-
sub_domain = to_host.sub(dot_domain,
|
443
|
+
sub_domain = to_host.sub(dot_domain, "")
|
383
444
|
Wgit::Url.new(sub_domain)
|
384
445
|
end
|
385
446
|
|
@@ -389,7 +450,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
389
450
|
# @return [Wgit::Url, nil] Containing just the brand or nil.
|
390
451
|
def to_brand
|
391
452
|
domain = to_domain
|
392
|
-
domain ? Wgit::Url.new(domain.split(
|
453
|
+
domain ? Wgit::Url.new(domain.split(".").first) : nil
|
393
454
|
end
|
394
455
|
|
395
456
|
# Returns only the base of this URL e.g. the protocol scheme and host
|
@@ -425,9 +486,9 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
425
486
|
def to_path
|
426
487
|
path = @uri.path
|
427
488
|
return nil if path.nil? || path.empty?
|
428
|
-
return Wgit::Url.new(
|
489
|
+
return Wgit::Url.new("/") if path == "/"
|
429
490
|
|
430
|
-
Wgit::Url.new(path).
|
491
|
+
Wgit::Url.new(path).omit_leading_slash
|
431
492
|
end
|
432
493
|
|
433
494
|
# Returns the endpoint of this URL e.g. the bit after the host with any
|
@@ -439,7 +500,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
439
500
|
# an endpoint, / is returned.
|
440
501
|
def to_endpoint
|
441
502
|
endpoint = @uri.path
|
442
|
-
endpoint =
|
503
|
+
endpoint = "/#{endpoint}" unless endpoint.start_with?("/")
|
443
504
|
Wgit::Url.new(endpoint)
|
444
505
|
end
|
445
506
|
|
@@ -463,8 +524,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
463
524
|
query_str = to_query
|
464
525
|
return {} unless query_str
|
465
526
|
|
466
|
-
query_str.split(
|
467
|
-
k, v = param.split(
|
527
|
+
query_str.split("&").each_with_object({}) do |param, hash|
|
528
|
+
k, v = param.split("=")
|
468
529
|
k = k.to_sym if symbolize_keys
|
469
530
|
hash[k] = v
|
470
531
|
end
|
@@ -484,10 +545,10 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
484
545
|
#
|
485
546
|
# @return [Wgit::Url, nil] Containing just the extension string or nil.
|
486
547
|
def to_extension
|
487
|
-
path = to_path
|
548
|
+
path = to_path&.omit_trailing_slash
|
488
549
|
return nil unless path
|
489
550
|
|
490
|
-
segs = path.split(
|
551
|
+
segs = path.split(".")
|
491
552
|
segs.length > 1 ? Wgit::Url.new(segs.last) : nil
|
492
553
|
end
|
493
554
|
|
@@ -530,7 +591,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
530
591
|
#
|
531
592
|
# @return [Wgit::Url] Self without a leading slash.
|
532
593
|
def omit_leading_slash
|
533
|
-
start_with?(
|
594
|
+
start_with?("/") ? Wgit::Url.new(self[1..]) : self
|
534
595
|
end
|
535
596
|
|
536
597
|
# Returns a new Wgit::Url containing self without a trailing slash. Is
|
@@ -539,7 +600,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
539
600
|
#
|
540
601
|
# @return [Wgit::Url] Self without a trailing slash.
|
541
602
|
def omit_trailing_slash
|
542
|
-
end_with?(
|
603
|
+
end_with?("/") ? Wgit::Url.new(chop) : self
|
543
604
|
end
|
544
605
|
|
545
606
|
# Returns a new Wgit::Url containing self without a leading or trailing
|
@@ -560,11 +621,11 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
560
621
|
# @return [Wgit::Url] Self containing everything after the base.
|
561
622
|
def omit_base
|
562
623
|
base_url = to_base
|
563
|
-
omit_base = base_url ? gsub(base_url,
|
624
|
+
omit_base = base_url ? gsub(base_url, "") : self
|
564
625
|
|
565
|
-
return self if [
|
626
|
+
return self if ["", "/"].include?(omit_base)
|
566
627
|
|
567
|
-
Wgit::Url.new(omit_base).
|
628
|
+
Wgit::Url.new(omit_base).omit_leading_slash
|
568
629
|
end
|
569
630
|
|
570
631
|
# Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
|
@@ -575,11 +636,11 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
575
636
|
# @return [Wgit::Url] Self containing everything after the origin.
|
576
637
|
def omit_origin
|
577
638
|
origin = to_origin
|
578
|
-
omit_origin = origin ? gsub(origin,
|
639
|
+
omit_origin = origin ? gsub(origin, "") : self
|
579
640
|
|
580
|
-
return self if [
|
641
|
+
return self if ["", "/"].include?(omit_origin)
|
581
642
|
|
582
|
-
Wgit::Url.new(omit_origin).
|
643
|
+
Wgit::Url.new(omit_origin).omit_leading_slash
|
583
644
|
end
|
584
645
|
|
585
646
|
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
@@ -591,7 +652,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
591
652
|
# @return [Wgit::Url] Self with the query string portion removed.
|
592
653
|
def omit_query
|
593
654
|
query = to_query
|
594
|
-
omit_query_string = query ? gsub("?#{query}",
|
655
|
+
omit_query_string = query ? gsub("?#{query}", "") : self
|
595
656
|
|
596
657
|
Wgit::Url.new(omit_query_string)
|
597
658
|
end
|
@@ -606,7 +667,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
606
667
|
# @return [Wgit::Url] Self with the fragment portion removed.
|
607
668
|
def omit_fragment
|
608
669
|
fragment = to_fragment
|
609
|
-
omit_fragment = fragment ? gsub("##{fragment}",
|
670
|
+
omit_fragment = fragment ? gsub("##{fragment}", "") : self
|
610
671
|
|
611
672
|
Wgit::Url.new(omit_fragment)
|
612
673
|
end
|
@@ -616,7 +677,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
616
677
|
#
|
617
678
|
# @return [Boolean] True if self is a query string, false otherwise.
|
618
679
|
def query?
|
619
|
-
start_with?(
|
680
|
+
start_with?("?")
|
620
681
|
end
|
621
682
|
|
622
683
|
# Returns true if self is a URL fragment e.g. #top etc. Note this
|
@@ -624,14 +685,14 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
624
685
|
#
|
625
686
|
# @return [Boolean] True if self is a fragment, false otherwise.
|
626
687
|
def fragment?
|
627
|
-
start_with?(
|
688
|
+
start_with?("#")
|
628
689
|
end
|
629
690
|
|
630
691
|
# Returns true if self equals '/' a.k.a. index.
|
631
692
|
#
|
632
693
|
# @return [Boolean] True if self equals '/', false otherwise.
|
633
694
|
def index?
|
634
|
-
self ==
|
695
|
+
self == "/"
|
635
696
|
end
|
636
697
|
|
637
698
|
# Returns true if self starts with '//' a.k.a a scheme/protocol relative
|
@@ -639,35 +700,34 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
|
|
639
700
|
#
|
640
701
|
# @return [Boolean] True if self starts with '//', false otherwise.
|
641
702
|
def scheme_relative?
|
642
|
-
start_with?(
|
643
|
-
end
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
alias sub_domain to_sub_domain
|
703
|
+
start_with?("//")
|
704
|
+
end
|
705
|
+
|
706
|
+
alias_method :crawled?, :crawled
|
707
|
+
alias_method :is_relative?, :relative?
|
708
|
+
alias_method :is_absolute?, :absolute?
|
709
|
+
alias_method :is_valid?, :valid?
|
710
|
+
alias_method :is_query?, :query?
|
711
|
+
alias_method :is_fragment?, :fragment?
|
712
|
+
alias_method :is_index?, :index?
|
713
|
+
alias_method :is_scheme_relative?, :scheme_relative?
|
714
|
+
alias_method :uri, :to_uri
|
715
|
+
alias_method :url, :to_url
|
716
|
+
alias_method :scheme, :to_scheme
|
717
|
+
alias_method :host, :to_host
|
718
|
+
alias_method :port, :to_port
|
719
|
+
alias_method :domain, :to_domain
|
720
|
+
alias_method :brand, :to_brand
|
721
|
+
alias_method :base, :to_base
|
722
|
+
alias_method :origin, :to_origin
|
723
|
+
alias_method :path, :to_path
|
724
|
+
alias_method :endpoint, :to_endpoint
|
725
|
+
alias_method :query, :to_query
|
726
|
+
alias_method :query_hash, :to_query_hash
|
727
|
+
alias_method :fragment, :to_fragment
|
728
|
+
alias_method :extension, :to_extension
|
729
|
+
alias_method :user, :to_user
|
730
|
+
alias_method :password, :to_password
|
731
|
+
alias_method :sub_domain, :to_sub_domain
|
672
732
|
end
|
673
733
|
end
|