wgit 0.5.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/response.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module Wgit
2
- # Response class representing a generic HTTP crawl response.
2
+ # Response class modeling a generic HTTP GET response.
3
3
  class Response
4
4
  # The underlying HTTP adapter/library response object.
5
5
  attr_accessor :adapter_response
@@ -56,11 +56,11 @@ module Wgit
56
56
  @body.empty? ? nil : @body
57
57
  end
58
58
 
59
- # Returns true if the response isn't a #success? or a #redirect?
59
+ # Returns whether or not a server response is absent.
60
60
  #
61
- # @return [Boolean] True if failed, false otherwise.
61
+ # @return [Boolean] True if the status is nil or < 1, false otherwise.
62
62
  def failure?
63
- !success? && !redirect?
63
+ !success?
64
64
  end
65
65
 
66
66
  # Sets the headers Hash to the given value. The header keys are mapped
@@ -69,7 +69,10 @@ module Wgit
69
69
  # @param headers [Hash] The new response headers.
70
70
  # @return [Hash] @headers's new value.
71
71
  def headers=(headers)
72
- return @headers = {} unless headers
72
+ unless headers
73
+ @headers = {}
74
+ return
75
+ end
73
76
 
74
77
  @headers = headers.map do |k, v|
75
78
  k = k.downcase.gsub('-', '_').to_sym
@@ -122,20 +125,20 @@ module Wgit
122
125
  @status = int.positive? ? int : nil
123
126
  end
124
127
 
125
- # Returns whether or not the response is a 2xx Success.
128
+ # Returns whether or not a server response is present.
126
129
  #
127
- # @return [Boolean] True if 2xx Success, false otherwise.
130
+ # @return [Boolean] True if the status is > 0, false otherwise.
128
131
  def success?
129
132
  return false unless @status
130
133
 
131
- @status.between?(200, 299)
134
+ @status.positive?
132
135
  end
133
136
 
134
- alias code status
135
- alias content body
136
- alias crawl_time total_time
137
- alias to_s body
138
- alias redirects redirections
139
- alias length size
137
+ alias code status
138
+ alias content body
139
+ alias crawl_duration total_time
140
+ alias to_s body
141
+ alias redirects redirections
142
+ alias length size
140
143
  end
141
144
  end
data/lib/wgit/url.rb CHANGED
@@ -6,20 +6,20 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
21
21
  # Whether or not the Url has been crawled or not. A custom crawled= method
22
- # is provided by this class, overridding the default one.
22
+ # is provided by this class.
23
23
  attr_reader :crawled
24
24
 
25
25
  # The Time stamp of when this Url was crawled.
@@ -28,10 +28,10 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
- # @param url_or_obj [String, Wgit::Url, Object#fetch#[]] Is either a String
34
+ # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
35
35
  # based URL or an object representing a Database record e.g. a MongoDB
36
36
  # document/object.
37
37
  # @param crawled [Boolean] Whether or not the HTML of the URL's web page
@@ -90,6 +90,23 @@ module Wgit
90
90
  obj.is_a?(Wgit::Url) ? obj : new(obj)
91
91
  end
92
92
 
93
+ # Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
94
+ # be parsed successfully e.g. the String is invalid.
95
+ #
96
+ # Use this method when you can't guarantee that obj is parsable as a URL.
97
+ # See Wgit::Url.parse for more information.
98
+ #
99
+ # @param obj [Object] The object to parse, which #is_a?(String).
100
+ # @raise [StandardError] If obj.is_a?(String) is false.
101
+ # @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
102
+ def self.parse?(obj)
103
+ parse(obj)
104
+ rescue Addressable::URI::InvalidURIError
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
+ Addressable::URI::InvalidURIError")
107
+ nil
108
+ end
109
+
93
110
  # Sets the @crawled instance var, also setting @date_crawled for
94
111
  # convenience.
95
112
  #
@@ -98,8 +115,6 @@ module Wgit
98
115
  def crawled=(bool)
99
116
  @crawled = bool
100
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
101
-
102
- bool
103
118
  end
104
119
 
105
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -114,33 +129,40 @@ module Wgit
114
129
 
115
130
  # Returns true if self is a relative Url; false if absolute.
116
131
  #
117
- # All external links in a page are expected to have a scheme prefix e.g.
118
- # 'http://', otherwise the link is treated as an internal link (regardless
132
+ # An absolute URL must have a scheme prefix e.g.
133
+ # 'http://', otherwise the URL is regarded as being relative (regardless
119
134
  # of whether it's valid or not). The only exception is if an opts arg is
120
135
  # provided and self is a page belonging to that arg type e.g. host; then
121
136
  # the link is relative.
122
137
  #
138
+ # @example
139
+ # url = Wgit::Url.new('http://example.com/about')
140
+ #
141
+ # url.relative? # => false
142
+ # url.relative?(host: 'http://example.com') # => true
143
+ #
123
144
  # @param opts [Hash] The options with which to check relativity. Only one
124
145
  # opts param should be provided. The provided opts param Url must be
125
146
  # absolute and be prefixed with a scheme. Consider using the output of
126
- # Wgit::Url#to_base which should work unless it's nil.
127
- # @option opts [Wgit::Url, String] :base The Url base e.g.
128
- # http://www.google.com/how which gives a base of
129
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
130
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
131
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
132
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
133
154
  # http://www.google.com/how which gives a domain of 'google.com'.
134
155
  # @option opts [Wgit::Url, String] :brand The Url brand e.g.
135
156
  # http://www.google.com/how which gives a brand of 'google'.
136
- # @raise [StandardError] If self is invalid e.g. empty or an invalid opts
157
+ # @raise [StandardError] If self is invalid (e.g. empty) or an invalid opts
137
158
  # param has been provided.
138
159
  # @return [Boolean] True if relative, false if absolute.
139
160
  def relative?(opts = {})
140
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
141
162
  opts = defaults.merge(opts)
142
163
  raise 'Url (self) cannot be empty' if empty?
143
164
 
165
+ return false if scheme_relative?
144
166
  return true if @uri.relative?
145
167
 
146
168
  # Self is absolute but may be relative to the opts param e.g. host.
@@ -151,14 +173,14 @@ module Wgit
151
173
 
152
174
  type, url = opts.first
153
175
  url = Wgit::Url.new(url)
154
- unless url.to_base
155
- raise "Invalid opts param value, Url must be absolute and contain \
156
- protocol scheme: #{url}"
176
+ if url.invalid?
177
+ raise "Invalid opts param value, it must be absolute, containing a \
178
+ protocol scheme and domain (e.g. http://example.com): #{url}"
157
179
  end
158
180
 
159
181
  case type
160
- when :base # http://www.google.com
161
- to_base == url.to_base
182
+ when :origin # http://www.google.com:81
183
+ to_origin == url.to_origin
162
184
  when :host # www.google.com
163
185
  to_host == url.to_host
164
186
  when :domain # google.com
@@ -177,18 +199,20 @@ protocol scheme: #{url}"
177
199
  @uri.absolute?
178
200
  end
179
201
 
180
- # Returns if self is a valid and absolute HTTP Url or not.
202
+ # Returns if self is a valid and absolute HTTP URL or not. Self should
203
+ # always be crawlable if this method returns true.
181
204
  #
182
- # @return [Boolean] True if valid and absolute, otherwise false.
205
+ # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
183
206
  def valid?
184
207
  return false if relative?
185
- return false unless start_with?('http://') || start_with?('https://')
186
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
208
+ return false unless to_origin && to_domain
209
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
187
210
 
188
211
  true
189
212
  end
190
213
 
191
- # Returns if self is an invalid (relative) HTTP Url or not.
214
+ # Returns if self is an invalid (e.g. relative) HTTP URL. See
215
+ # Wgit::Url#valid? for the inverse (and more information).
192
216
  #
193
217
  # @return [Boolean] True if invalid, otherwise false.
194
218
  def invalid?
@@ -213,7 +237,8 @@ protocol scheme: #{url}"
213
237
  Wgit::Url.new(concatted)
214
238
  end
215
239
 
216
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
241
+ # This should be used before GET'ing the url, in case it has IRI chars.
217
242
  #
218
243
  # @return [Wgit::Url] An escaped version of self.
219
244
  def normalize
@@ -224,43 +249,46 @@ protocol scheme: #{url}"
224
249
  # modify the receiver.
225
250
  #
226
251
  # If self is absolute then it's returned as is, making this method
227
- # idempotent. The doc's <base> element is used if present, otherwise
228
- # doc.url is used as the base; which is concatted with self.
252
+ # idempotent. The doc's `<base>` element is used if present, otherwise
253
+ # `doc.url` is used as the base; which is concatted with self.
229
254
  #
230
- # Typically used to build an absolute link obtained from a document e.g.
255
+ # Typically used to build an absolute link obtained from a document.
231
256
  #
257
+ # @example
232
258
  # link = Wgit::Url.new('/favicon.png')
233
259
  # doc = Wgit::Document.new('http://example.com')
234
260
  #
235
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
261
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
236
262
  #
237
263
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
238
264
  # self.
239
265
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
240
266
  # raises an Exception.
241
267
  # @return [Wgit::Url] Self in absolute form.
242
- def prefix_base(doc)
268
+ def make_absolute(doc)
243
269
  assert_type(doc, Wgit::Document)
270
+ raise 'Cannot make absolute when Document @url is not valid' \
271
+ unless doc.url.valid?
272
+
273
+ return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
244
274
 
245
275
  absolute? ? self : doc.base_url(link: self).concat(self)
246
276
  end
247
277
 
248
- # Returns self having prefixed a protocol scheme. Doesn't modify receiver.
278
+ # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
249
279
  # Returns self even if absolute (with scheme); therefore is idempotent.
250
280
  #
251
- # @param protocol [Symbol] Either :http or :https.
252
- # @return [Wgit::Url] Self with a protocol scheme prefix.
253
- def prefix_scheme(protocol: :http)
254
- return self if absolute?
255
-
256
- case protocol
257
- when :http
258
- Wgit::Url.new("http://#{url}")
259
- when :https
260
- Wgit::Url.new("https://#{url}")
261
- else
262
- raise "protocol must be :http or :https, not :#{protocol}"
281
+ # @param scheme [Symbol] Either :http or :https.
282
+ # @return [Wgit::Url] Self with a scheme prefix.
283
+ def prefix_scheme(scheme = :http)
284
+ unless %i[http https].include?(scheme)
285
+ raise "scheme must be :http or :https, not :#{scheme}"
263
286
  end
287
+
288
+ return self if absolute? && !scheme_relative?
289
+
290
+ separator = scheme_relative? ? '' : '//'
291
+ Wgit::Url.new("#{scheme}:#{separator}#{self}")
264
292
  end
265
293
 
266
294
  # Returns a Hash containing this Url's instance vars excluding @uri.
@@ -268,8 +296,7 @@ protocol scheme: #{url}"
268
296
  #
269
297
  # @return [Hash] self's instance vars as a Hash.
270
298
  def to_h
271
- ignore = ['@uri']
272
- h = Wgit::Utils.to_h(self, ignore: ignore)
299
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
273
300
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
274
301
  end
275
302
 
@@ -312,6 +339,20 @@ protocol scheme: #{url}"
312
339
  host ? Wgit::Url.new(host) : nil
313
340
  end
314
341
 
342
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
343
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
344
+ #
345
+ # @return [Wgit::Url, nil] Containing just the port or nil.
346
+ def to_port
347
+ port = @uri.port
348
+
349
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
350
+ return nil unless port
351
+ return nil unless include?(":#{port}")
352
+
353
+ Wgit::Url.new(port.to_s)
354
+ end
355
+
315
356
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
316
357
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
317
358
  #
@@ -321,6 +362,20 @@ protocol scheme: #{url}"
321
362
  domain ? Wgit::Url.new(domain) : nil
322
363
  end
323
364
 
365
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
366
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
367
+ #
368
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
369
+ def to_sub_domain
370
+ return nil unless to_host
371
+
372
+ dot_domain = ".#{to_domain}"
373
+ return nil unless include?(dot_domain)
374
+
375
+ sub_domain = to_host.sub(dot_domain, '')
376
+ Wgit::Url.new(sub_domain)
377
+ end
378
+
324
379
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
325
380
  # Given http://www.google.co.uk/about.html, google is returned.
326
381
  #
@@ -336,12 +391,24 @@ protocol scheme: #{url}"
336
391
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
337
392
  # nil.
338
393
  def to_base
339
- return nil if @uri.scheme.nil? || @uri.host.nil?
394
+ return nil unless @uri.scheme && @uri.host
340
395
 
341
396
  base = "#{@uri.scheme}://#{@uri.host}"
342
397
  Wgit::Url.new(base)
343
398
  end
344
399
 
400
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
401
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
402
+ # returned. If there's no port present, then to_base is returned.
403
+ #
404
+ # @return [Wgit::Url, nil] The origin of self or nil.
405
+ def to_origin
406
+ return nil unless to_base
407
+ return to_base unless to_port
408
+
409
+ Wgit::Url.new("#{to_base}:#{to_port}")
410
+ end
411
+
345
412
  # Returns the path of this URL e.g. the bit after the host without slashes.
346
413
  # For example:
347
414
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -370,7 +437,7 @@ protocol scheme: #{url}"
370
437
  end
371
438
 
372
439
  # Returns a new Wgit::Url containing just the query string of this URL
373
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
440
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
374
441
  #
375
442
  # @return [Wgit::Url, nil] Containing just the query string or nil.
376
443
  def to_query
@@ -378,6 +445,24 @@ protocol scheme: #{url}"
378
445
  query ? Wgit::Url.new(query) : nil
379
446
  end
380
447
 
448
+ # Returns a Hash containing just the query string parameters of this URL
449
+ # e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
450
+ #
451
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
452
+ # true, Strings otherwise.
453
+ # @return [Hash<String | Symbol, String>] Containing the query string
454
+ # params or empty if the URL doesn't contain any query parameters.
455
+ def to_query_hash(symbolize_keys: false)
456
+ query_str = to_query
457
+ return {} unless query_str
458
+
459
+ query_str.split('&').each_with_object({}) do |param, hash|
460
+ k, v = param.split('=')
461
+ k = k.to_sym if symbolize_keys
462
+ hash[k] = v
463
+ end
464
+ end
465
+
381
466
  # Returns a new Wgit::Url containing just the fragment string of this URL
382
467
  # e.g. Given http://google.com#about, #about is returned.
383
468
  #
@@ -399,6 +484,24 @@ protocol scheme: #{url}"
399
484
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
400
485
  end
401
486
 
487
+ # Returns a new Wgit::Url containing just the username string of this URL
488
+ # e.g. Given http://me:pass1@example.com, me is returned.
489
+ #
490
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
491
+ def to_user
492
+ user = @uri.user
493
+ user ? Wgit::Url.new(user) : nil
494
+ end
495
+
496
+ # Returns a new Wgit::Url containing just the password string of this URL
497
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
498
+ #
499
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
500
+ def to_password
501
+ password = @uri.password
502
+ password ? Wgit::Url.new(password) : nil
503
+ end
504
+
402
505
  # Omits the given URL components from self and returns a new Wgit::Url.
403
506
  #
404
507
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -442,7 +545,7 @@ protocol scheme: #{url}"
442
545
  .omit_trailing_slash
443
546
  end
444
547
 
445
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
548
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
446
549
  # http://google.com/search?q=something#about, search?q=something#about is
447
550
  # returned. If relative and base isn't present then self is returned.
448
551
  # Leading and trailing slashes are always stripped from the return value.
@@ -457,6 +560,21 @@ protocol scheme: #{url}"
457
560
  Wgit::Url.new(omit_base).omit_slashes
458
561
  end
459
562
 
563
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
564
+ # http://google.com:81/search?q=something#about, search?q=something#about is
565
+ # returned. If relative and base isn't present then self is returned.
566
+ # Leading and trailing slashes are always stripped from the return value.
567
+ #
568
+ # @return [Wgit::Url] Self containing everything after the origin.
569
+ def omit_origin
570
+ origin = to_origin
571
+ omit_origin = origin ? gsub(origin, '') : self
572
+
573
+ return self if ['', '/'].include?(omit_origin)
574
+
575
+ Wgit::Url.new(omit_origin).omit_slashes
576
+ end
577
+
460
578
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
461
579
  # http://google.com/search?q=hello, http://google.com/search is
462
580
  # returned. Self is returned as is if no query string is present. A URL
@@ -502,25 +620,47 @@ protocol scheme: #{url}"
502
620
  start_with?('#')
503
621
  end
504
622
 
505
- alias + concat
506
- alias crawled? crawled
507
- alias normalise normalize
508
- alias is_relative? relative?
509
- alias is_absolute? absolute?
510
- alias is_valid? valid?
511
- alias is_query? query?
512
- alias is_fragment? fragment?
513
- alias uri to_uri
514
- alias url to_url
515
- alias scheme to_scheme
516
- alias host to_host
517
- alias domain to_domain
518
- alias brand to_brand
519
- alias base to_base
520
- alias path to_path
521
- alias endpoint to_endpoint
522
- alias query to_query
523
- alias fragment to_fragment
524
- alias extension to_extension
623
+ # Returns true if self equals '/' a.k.a. index.
624
+ #
625
+ # @return [Boolean] True if self equals '/', false otherwise.
626
+ def index?
627
+ self == '/'
628
+ end
629
+
630
+ # Returns true if self starts with '//' a.k.a. a scheme/protocol relative
631
+ # path.
632
+ #
633
+ # @return [Boolean] True if self starts with '//', false otherwise.
634
+ def scheme_relative?
635
+ start_with?('//')
636
+ end
637
+
638
+ alias + concat
639
+ alias crawled? crawled
640
+ alias is_relative? relative?
641
+ alias is_absolute? absolute?
642
+ alias is_valid? valid?
643
+ alias is_query? query?
644
+ alias is_fragment? fragment?
645
+ alias is_index? index?
646
+ alias is_scheme_relative? scheme_relative?
647
+ alias uri to_uri
648
+ alias url to_url
649
+ alias scheme to_scheme
650
+ alias host to_host
651
+ alias port to_port
652
+ alias domain to_domain
653
+ alias brand to_brand
654
+ alias base to_base
655
+ alias origin to_origin
656
+ alias path to_path
657
+ alias endpoint to_endpoint
658
+ alias query to_query
659
+ alias query_hash to_query_hash
660
+ alias fragment to_fragment
661
+ alias extension to_extension
662
+ alias user to_user
663
+ alias password to_password
664
+ alias sub_domain to_sub_domain
525
665
  end
526
666
  end