wgit 0.0.15 → 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +1 -1
- data/lib/wgit/url.rb +41 -43
- data/lib/wgit/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 269236ab66e07aaabe01d61f765689e3d997628ad76d5f61a9c477e35d67880b
|
4
|
+
data.tar.gz: 5fd11a994c23cd9569099109f8e2236873cf2d6267ea38bd661329620ece50b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c97aab9a225690205fcf10a99d2f632c45c08e7c3c5a543a0d374eb0595a6953baf77acd16eebf032c741d671d9e0fece030b01578af14fabc2acfa446734aa
|
7
|
+
data.tar.gz: eefb60a4462142fce4643dc12edac1fa11951c32f7e2d72f3295369fc8db83b3b126ea5e51410bc9ca9955cc1d7c386ac3c0aac77b1c6eaf9bc89ffc517f44ee
|
data/lib/wgit/crawler.rb
CHANGED
@@ -196,7 +196,7 @@ module Wgit
|
|
196
196
|
raise 'Too many redirects' if redirect_count >= redirect_limit
|
197
197
|
redirect_count += 1
|
198
198
|
|
199
|
-
response = Net::HTTP.get_response(url.
|
199
|
+
response = Net::HTTP.get_response(url.to_uri)
|
200
200
|
location = Wgit::Url.new(response.fetch('location', ''))
|
201
201
|
|
202
202
|
if not location.empty?
|
data/lib/wgit/url.rb
CHANGED
@@ -8,7 +8,7 @@ module Wgit
|
|
8
8
|
# Class modeling a web based URL.
|
9
9
|
# Can be an internal/relative link e.g. "about.html" or a full URL
|
10
10
|
# e.g. "http://www.google.co.uk". Is a subclass of String and uses
|
11
|
-
# 'addressable/uri' internally.
|
11
|
+
# 'uri' and 'addressable/uri' internally.
|
12
12
|
class Url < String
|
13
13
|
include Assertable
|
14
14
|
|
@@ -51,18 +51,27 @@ module Wgit
|
|
51
51
|
super(url)
|
52
52
|
end
|
53
53
|
|
54
|
+
# A class alias for Url.new.
|
55
|
+
#
|
56
|
+
# @param str [String] The URL string to parse.
|
57
|
+
# @return [Wgit::Url] The parsed Url object.
|
58
|
+
def self.parse(str)
|
59
|
+
self.new(str)
|
60
|
+
end
|
61
|
+
|
54
62
|
# Raises an exception if url is not a valid HTTP URL.
|
55
63
|
#
|
56
64
|
# @param url [Wgit::Url, String] The Url to validate.
|
57
65
|
# @raise [RuntimeError] If url is invalid.
|
58
66
|
def self.validate(url)
|
59
|
-
|
67
|
+
url = Wgit::Url.new(url)
|
68
|
+
if url.relative_link?
|
60
69
|
raise "Invalid url (or a relative link): #{url}"
|
61
70
|
end
|
62
71
|
unless url.start_with?("http://") or url.start_with?("https://")
|
63
72
|
raise "Invalid url (missing protocol prefix): #{url}"
|
64
73
|
end
|
65
|
-
if URI.regexp.match(url).nil?
|
74
|
+
if URI.regexp.match(url.normalise).nil?
|
66
75
|
raise "Invalid url: #{url}"
|
67
76
|
end
|
68
77
|
end
|
@@ -96,20 +105,31 @@ module Wgit
|
|
96
105
|
url
|
97
106
|
end
|
98
107
|
|
99
|
-
#
|
108
|
+
# Concats the host and link Strings and returns the result.
|
109
|
+
#
|
110
|
+
# @param host [Wgit::Url, String] The Url host.
|
111
|
+
# @param link [Wgit::Url, String] The link to add to the host prefix.
|
112
|
+
# @return [Wgit::Url] host + "/" + link
|
113
|
+
def self.concat(host, link)
|
114
|
+
host = Wgit::Url.new(host).without_trailing_slash
|
115
|
+
link = Wgit::Url.new(link).without_leading_slash
|
116
|
+
separator = (link.start_with?('#') or link.start_with?('?')) ? '' : '/'
|
117
|
+
Wgit::Url.new(host + separator + link)
|
118
|
+
end
|
119
|
+
|
120
|
+
# Returns true if self is a relative Url.
|
121
|
+
#
|
100
122
|
# All external links in a page are expected to have a protocol prefix e.g.
|
101
123
|
# "http://", otherwise the link is treated as an internal link (regardless
|
102
|
-
# of whether it
|
103
|
-
# and
|
124
|
+
# of whether it's valid or not). The only exception is if base is provided
|
125
|
+
# and self is a page within that site; then the link is relative.
|
104
126
|
#
|
105
|
-
# @param
|
106
|
-
# @param base [String] The Url base e.g. http://www.google.co.uk.
|
127
|
+
# @param base [Wgit::Url, String] The Url base e.g. http://www.google.com.
|
107
128
|
# @return [Boolean] True if relative, false if absolute.
|
108
|
-
# @raise [RuntimeError] If
|
109
|
-
def
|
110
|
-
raise "Invalid link: #{
|
129
|
+
# @raise [RuntimeError] If self is invalid e.g. empty.
|
130
|
+
def is_relative?(base: nil)
|
131
|
+
raise "Invalid link: #{self}" if nil? or empty?
|
111
132
|
|
112
|
-
link = Wgit::Url.new(link)
|
113
133
|
if base
|
114
134
|
base = Wgit::Url.new(base)
|
115
135
|
if base.to_scheme.nil?
|
@@ -117,35 +137,13 @@ module Wgit
|
|
117
137
|
end
|
118
138
|
end
|
119
139
|
|
120
|
-
if
|
140
|
+
if @uri.relative?
|
121
141
|
true
|
122
142
|
else
|
123
|
-
base ?
|
143
|
+
base ? to_host == base.to_host : false
|
124
144
|
end
|
125
145
|
end
|
126
146
|
|
127
|
-
# Concats the host and link Strings and returns the result.
|
128
|
-
#
|
129
|
-
# @param host [Wgit::Url, String] The Url host.
|
130
|
-
# @param link [Wgit::Url, String] The link to add to the host prefix.
|
131
|
-
# @return [Wgit::Url] host + "/" + link
|
132
|
-
def self.concat(host, link)
|
133
|
-
host = Wgit::Url.new(host).without_trailing_slash
|
134
|
-
link = Wgit::Url.new(link).without_leading_slash
|
135
|
-
separator = (link.start_with?('#') or link.start_with?('?')) ? '' : '/'
|
136
|
-
Wgit::Url.new(host + separator + link)
|
137
|
-
end
|
138
|
-
|
139
|
-
# Returns if self is a relative or absolute Url. If base is provided and
|
140
|
-
# self is a page within that site then the link is relative.
|
141
|
-
# See Wgit.relative_link? for more information.
|
142
|
-
#
|
143
|
-
# @return [Boolean] True if relative, false if absolute.
|
144
|
-
# @raise [RuntimeError] If the link is invalid.
|
145
|
-
def relative_link?(base: nil)
|
146
|
-
Wgit::Url.relative_link?(self, base: base)
|
147
|
-
end
|
148
|
-
|
149
147
|
# Determines if self is a valid Url or not.
|
150
148
|
#
|
151
149
|
# @return [Boolean] True if valid, otherwise false.
|
@@ -170,18 +168,18 @@ module Wgit
|
|
170
168
|
@date_crawled = bool ? Wgit::Utils.time_stamp : nil
|
171
169
|
end
|
172
170
|
|
173
|
-
# Normalises/
|
171
|
+
# Normalises/escapes self and returns a new Wgit::Url.
|
174
172
|
#
|
175
173
|
# @return [Wgit::Url] An encoded version of self.
|
176
174
|
def normalise
|
177
175
|
Wgit::Url.new(@uri.normalize.to_s)
|
178
176
|
end
|
179
177
|
|
180
|
-
# Returns
|
178
|
+
# Returns a normalised URI object for this URL.
|
181
179
|
#
|
182
|
-
# @return [
|
180
|
+
# @return [URI::HTTP, URI::HTTPS] The URI object of self.
|
183
181
|
def to_uri
|
184
|
-
|
182
|
+
URI(normalise)
|
185
183
|
end
|
186
184
|
|
187
185
|
# Returns self.
|
@@ -361,9 +359,9 @@ module Wgit
|
|
361
359
|
alias :fragment :to_anchor
|
362
360
|
alias :extension :to_extension
|
363
361
|
alias :without_fragment :without_anchor
|
364
|
-
alias :
|
365
|
-
alias :
|
366
|
-
alias :is_internal? :
|
362
|
+
alias :relative_link? :is_relative?
|
363
|
+
alias :internal_link? :is_relative?
|
364
|
+
alias :is_internal? :is_relative?
|
367
365
|
alias :crawled? :crawled
|
368
366
|
alias :normalize :normalise
|
369
367
|
end
|
data/lib/wgit/version.rb
CHANGED