wgit 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +174 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +399 -0
- data/lib/wgit/crawler.rb +135 -119
- data/lib/wgit/document.rb +45 -67
- data/lib/wgit/document_extensions.rb +1 -1
- data/lib/wgit/response.rb +6 -6
- data/lib/wgit/url.rb +23 -14
- data/lib/wgit/utils.rb +2 -2
- data/lib/wgit/version.rb +1 -1
- metadata +10 -5
data/lib/wgit/response.rb
CHANGED
@@ -56,11 +56,11 @@ module Wgit
|
|
56
56
|
@body.empty? ? nil : @body
|
57
57
|
end
|
58
58
|
|
59
|
-
# Returns
|
59
|
+
# Returns whether or not a server response is absent.
|
60
60
|
#
|
61
|
-
# @return [Boolean] True if
|
61
|
+
# @return [Boolean] True if the status is nil or < 1, false otherwise.
|
62
62
|
def failure?
|
63
|
-
!success?
|
63
|
+
!success?
|
64
64
|
end
|
65
65
|
|
66
66
|
# Sets the headers Hash to the given value. The header keys are mapped
|
@@ -122,13 +122,13 @@ module Wgit
|
|
122
122
|
@status = int.positive? ? int : nil
|
123
123
|
end
|
124
124
|
|
125
|
-
# Returns whether or not
|
125
|
+
# Returns whether or not a server response is present.
|
126
126
|
#
|
127
|
-
# @return [Boolean] True if
|
127
|
+
# @return [Boolean] True if the status is > 0, false otherwise.
|
128
128
|
def success?
|
129
129
|
return false unless @status
|
130
130
|
|
131
|
-
@status.
|
131
|
+
@status.positive?
|
132
132
|
end
|
133
133
|
|
134
134
|
alias code status
|
data/lib/wgit/url.rb
CHANGED
@@ -19,7 +19,7 @@ module Wgit
|
|
19
19
|
include Assertable
|
20
20
|
|
21
21
|
# Whether or not the Url has been crawled. A custom crawled= method
|
22
|
-
# is provided by this class
|
22
|
+
# is provided by this class.
|
23
23
|
attr_reader :crawled
|
24
24
|
|
25
25
|
# The Time stamp of when this Url was crawled.
|
@@ -31,7 +31,7 @@ module Wgit
|
|
31
31
|
# Initializes a new instance of Wgit::Url which represents a web based
|
32
32
|
# HTTP URL.
|
33
33
|
#
|
34
|
-
# @param url_or_obj [String, Wgit::Url,
|
34
|
+
# @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
|
35
35
|
# based URL or an object representing a Database record e.g. a MongoDB
|
36
36
|
# document/object.
|
37
37
|
# @param crawled [Boolean] Whether or not the HTML of the URL's web page
|
@@ -114,16 +114,22 @@ module Wgit
|
|
114
114
|
|
115
115
|
# Returns true if self is a relative Url; false if absolute.
|
116
116
|
#
|
117
|
-
#
|
118
|
-
# 'http://', otherwise the
|
117
|
+
# An absolute URL must have a scheme prefix e.g.
|
118
|
+
# 'http://', otherwise the URL is regarded as being relative (regardless
|
119
119
|
# of whether it's valid or not). The only exception is if an opts arg is
|
120
120
|
# provided and self is a page belonging to that arg type e.g. host; then
|
121
121
|
# the link is relative.
|
122
122
|
#
|
123
|
+
# @example
|
124
|
+
# url = Wgit::Url.new('http://example.com/about')
|
125
|
+
#
|
126
|
+
# url.relative? # => false
|
127
|
+
# url.relative?(host: 'http://example.com') # => true
|
128
|
+
#
|
123
129
|
# @param opts [Hash] The options with which to check relativity. Only one
|
124
130
|
# opts param should be provided. The provided opts param Url must be
|
125
131
|
# absolute and be prefixed with a scheme. Consider using the output of
|
126
|
-
# Wgit::Url#to_base which should work unless it's nil.
|
132
|
+
# Wgit::Url#to_base which should work (unless it's nil).
|
127
133
|
# @option opts [Wgit::Url, String] :base The Url base e.g.
|
128
134
|
# http://www.google.com/how which gives a base of
|
129
135
|
# 'http://www.google.com'.
|
@@ -133,7 +139,7 @@ module Wgit
|
|
133
139
|
# http://www.google.com/how which gives a domain of 'google.com'.
|
134
140
|
# @option opts [Wgit::Url, String] :brand The Url brand e.g.
|
135
141
|
# http://www.google.com/how which gives a brand of 'google'.
|
136
|
-
# @raise [StandardError] If self is invalid e.g. empty or an invalid opts
|
142
|
+
# @raise [StandardError] If self is invalid (e.g. empty) or an invalid opts
|
137
143
|
# param has been provided.
|
138
144
|
# @return [Boolean] True if relative, false if absolute.
|
139
145
|
def relative?(opts = {})
|
@@ -151,9 +157,9 @@ module Wgit
|
|
151
157
|
|
152
158
|
type, url = opts.first
|
153
159
|
url = Wgit::Url.new(url)
|
154
|
-
|
155
|
-
raise "Invalid opts param value,
|
156
|
-
protocol scheme: #{url}"
|
160
|
+
if url.invalid?
|
161
|
+
raise "Invalid opts param value, it must be absolute, containing a \
|
162
|
+
protocol scheme and domain (e.g. http://example.com): #{url}"
|
157
163
|
end
|
158
164
|
|
159
165
|
case type
|
@@ -177,18 +183,20 @@ protocol scheme: #{url}"
|
|
177
183
|
@uri.absolute?
|
178
184
|
end
|
179
185
|
|
180
|
-
# Returns if self is a valid and absolute HTTP
|
186
|
+
# Returns if self is a valid and absolute HTTP URL or not. Self should
|
187
|
+
# always be crawlable if this method returns true.
|
181
188
|
#
|
182
|
-
# @return [Boolean] True if valid and
|
189
|
+
# @return [Boolean] True if valid, absolute and crawlable, otherwise false.
|
183
190
|
def valid?
|
184
191
|
return false if relative?
|
185
|
-
return false unless
|
192
|
+
return false unless to_base && to_domain
|
186
193
|
return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
|
187
194
|
|
188
195
|
true
|
189
196
|
end
|
190
197
|
|
191
|
-
# Returns if self is an invalid (relative) HTTP
|
198
|
+
# Returns if self is an invalid (e.g. relative) HTTP URL. See
|
199
|
+
# Wgit::Url#valid? for the inverse (and more information).
|
192
200
|
#
|
193
201
|
# @return [Boolean] True if invalid, otherwise false.
|
194
202
|
def invalid?
|
@@ -227,8 +235,9 @@ protocol scheme: #{url}"
|
|
227
235
|
# idempotent. The doc's <base> element is used if present, otherwise
|
228
236
|
# doc.url is used as the base; which is concatted with self.
|
229
237
|
#
|
230
|
-
# Typically used to build an absolute link obtained from a document
|
238
|
+
# Typically used to build an absolute link obtained from a document.
|
231
239
|
#
|
240
|
+
# @example
|
232
241
|
# link = Wgit::Url.new('/favicon.png')
|
233
242
|
# doc = Wgit::Document.new('http://example.com')
|
234
243
|
#
|
data/lib/wgit/utils.rb
CHANGED
@@ -188,9 +188,9 @@ module Wgit
|
|
188
188
|
#
|
189
189
|
# @param arr [Enumerable] The Array to process. arr is modified.
|
190
190
|
# @return [Enumerable] The processed arr is both modified and then returned.
|
191
|
-
def self.process_arr(arr)
|
191
|
+
def self.process_arr(arr, encode: true)
|
192
192
|
if arr.is_a?(Array)
|
193
|
-
arr.map! { |str| process_str(str) }
|
193
|
+
arr.map! { |str| process_str(str, encode: encode) }
|
194
194
|
arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
|
195
195
|
arr.compact!
|
196
196
|
arr.uniq!
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -213,6 +213,12 @@ files:
|
|
213
213
|
- "./lib/wgit/url.rb"
|
214
214
|
- "./lib/wgit/utils.rb"
|
215
215
|
- "./lib/wgit/version.rb"
|
216
|
+
- ".yardopts"
|
217
|
+
- CHANGELOG.md
|
218
|
+
- CODE_OF_CONDUCT.md
|
219
|
+
- CONTRIBUTING.md
|
220
|
+
- LICENSE.txt
|
221
|
+
- README.md
|
216
222
|
homepage: https://github.com/michaeltelford/wgit
|
217
223
|
licenses:
|
218
224
|
- MIT
|
@@ -221,7 +227,7 @@ metadata:
|
|
221
227
|
source_code_uri: https://github.com/michaeltelford/wgit
|
222
228
|
changelog_uri: https://github.com/michaeltelford/wgit/blob/master/CHANGELOG.md
|
223
229
|
bug_tracker_uri: https://github.com/michaeltelford/wgit/issues
|
224
|
-
documentation_uri: https://www.rubydoc.info/
|
230
|
+
documentation_uri: https://www.rubydoc.info/github/michaeltelford/wgit/master
|
225
231
|
allowed_push_host: https://rubygems.org
|
226
232
|
post_install_message:
|
227
233
|
rdoc_options: []
|
@@ -238,8 +244,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
238
244
|
- !ruby/object:Gem::Version
|
239
245
|
version: '0'
|
240
246
|
requirements: []
|
241
|
-
|
242
|
-
rubygems_version: 2.7.6
|
247
|
+
rubygems_version: 3.0.6
|
243
248
|
signing_key:
|
244
249
|
specification_version: 4
|
245
250
|
summary: Wgit is a Ruby gem similar in nature to GNU's `wget` tool. It provides an
|