wgit 0.5.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  module Wgit
2
- # Response class representing a generic HTTP crawl response.
2
+ # Response class modeling a generic HTTP GET response.
3
3
  class Response
4
4
  # The underlying HTTP adapter/library response object.
5
5
  attr_accessor :adapter_response
@@ -16,9 +16,6 @@ module Wgit
16
16
  # The redirections of the response.
17
17
  attr_reader :redirections
18
18
 
19
- # The number of redirections for the response.
20
- attr_reader :redirect_count
21
-
22
19
  # The HTTP response status code.
23
20
  attr_reader :status
24
21
 
@@ -59,11 +56,11 @@ module Wgit
59
56
  @body.empty? ? nil : @body
60
57
  end
61
58
 
62
- # Returns true if the response isn't a #success? or a #redirect?
59
+ # Returns whether or not a server response is absent.
63
60
  #
64
- # @return [Boolean] True if failed, false otherwise.
61
+ # @return [Boolean] True if the status is nil or < 1, false otherwise.
65
62
  def failure?
66
- !success? && !redirect?
63
+ !success?
67
64
  end
68
65
 
69
66
  # Sets the headers Hash to the given value. The header keys are mapped
@@ -72,7 +69,10 @@ module Wgit
72
69
  # @param headers [Hash] The new response headers.
73
70
  # @return [Hash] @headers's new value.
74
71
  def headers=(headers)
75
- return @headers = {} unless headers
72
+ unless headers
73
+ @headers = {}
74
+ return
75
+ end
76
76
 
77
77
  @headers = headers.map do |k, v|
78
78
  k = k.downcase.gsub('-', '_').to_sym
@@ -125,20 +125,20 @@ module Wgit
125
125
  @status = int.positive? ? int : nil
126
126
  end
127
127
 
128
- # Returns whether or not the response is a 2xx Success.
128
+ # Returns whether or not a server response is present.
129
129
  #
130
- # @return [Boolean] True if 2xx Success, false otherwise.
130
+ # @return [Boolean] True if the status is > 0, false otherwise.
131
131
  def success?
132
132
  return false unless @status
133
133
 
134
- @status.between?(200, 299)
134
+ @status.positive?
135
135
  end
136
136
 
137
- alias code status
138
- alias content body
139
- alias crawl_time total_time
140
- alias to_s body
141
- alias redirects redirections
142
- alias length size
137
+ alias code status
138
+ alias content body
139
+ alias crawl_duration total_time
140
+ alias to_s body
141
+ alias redirects redirections
142
+ alias length size
143
143
  end
144
144
  end
@@ -6,20 +6,20 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
21
21
  # Whether or not the Url has been crawled. A custom crawled= method
22
- # is provided by this class, overridding the default one.
22
+ # is provided by this class.
23
23
  attr_reader :crawled
24
24
 
25
25
  # The Time stamp of when this Url was crawled.
@@ -28,10 +28,10 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
- # @param url_or_obj [String, Wgit::Url, Object#fetch#[]] Is either a String
34
+ # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
35
35
  # based URL or an object representing a Database record e.g. a MongoDB
36
36
  # document/object.
37
37
  # @param crawled [Boolean] Whether or not the HTML of the URL's web page
@@ -90,6 +90,23 @@ module Wgit
90
90
  obj.is_a?(Wgit::Url) ? obj : new(obj)
91
91
  end
92
92
 
93
+ # Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
94
+ # be parsed successfully e.g. the String is invalid.
95
+ #
96
+ # Use this method when you can't guarantee that obj is parsable as a URL.
97
+ # See Wgit::Url.parse for more information.
98
+ #
99
+ # @param obj [Object] The object to parse, which #is_a?(String).
100
+ # @raise [StandardError] If obj.is_a?(String) is false.
101
+ # @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
102
+ def self.parse?(obj)
103
+ parse(obj)
104
+ rescue Addressable::URI::InvalidURIError
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
+ Addressable::URI::InvalidURIError")
107
+ nil
108
+ end
109
+
93
110
  # Sets the @crawled instance var, also setting @date_crawled for
94
111
  # convenience.
95
112
  #
@@ -98,8 +115,6 @@ module Wgit
98
115
  def crawled=(bool)
99
116
  @crawled = bool
100
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
101
-
102
- bool
103
118
  end
104
119
 
105
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -114,30 +129,36 @@ module Wgit
114
129
 
115
130
  # Returns true if self is a relative Url; false if absolute.
116
131
  #
117
- # All external links in a page are expected to have a scheme prefix e.g.
118
- # 'http://', otherwise the link is treated as an internal link (regardless
132
+ # An absolute URL must have a scheme prefix e.g.
133
+ # 'http://', otherwise the URL is regarded as being relative (regardless
119
134
  # of whether it's valid or not). The only exception is if an opts arg is
120
135
  # provided and self is a page belonging to that arg type e.g. host; then
121
136
  # the link is relative.
122
137
  #
138
+ # @example
139
+ # url = Wgit::Url.new('http://example.com/about')
140
+ #
141
+ # url.relative? # => false
142
+ # url.relative?(host: 'http://example.com') # => true
143
+ #
123
144
  # @param opts [Hash] The options with which to check relativity. Only one
124
145
  # opts param should be provided. The provided opts param Url must be
125
146
  # absolute and be prefixed with a scheme. Consider using the output of
126
- # Wgit::Url#to_base which should work unless it's nil.
127
- # @option opts [Wgit::Url, String] :base The Url base e.g.
128
- # http://www.google.com/how which gives a base of
129
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
130
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
131
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
132
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
133
154
  # http://www.google.com/how which gives a domain of 'google.com'.
134
155
  # @option opts [Wgit::Url, String] :brand The Url brand e.g.
135
156
  # http://www.google.com/how which gives a brand of 'google'.
136
- # @raise [StandardError] If self is invalid e.g. empty or an invalid opts
157
+ # @raise [StandardError] If self is invalid (e.g. empty) or an invalid opts
137
158
  # param has been provided.
138
159
  # @return [Boolean] True if relative, false if absolute.
139
160
  def relative?(opts = {})
140
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
141
162
  opts = defaults.merge(opts)
142
163
  raise 'Url (self) cannot be empty' if empty?
143
164
 
@@ -151,14 +172,14 @@ module Wgit
151
172
 
152
173
  type, url = opts.first
153
174
  url = Wgit::Url.new(url)
154
- unless url.to_base
155
- raise "Invalid opts param value, Url must be absolute and contain \
156
- protocol scheme: #{url}"
175
+ if url.invalid?
176
+ raise "Invalid opts param value, it must be absolute, containing a \
177
+ protocol scheme and domain (e.g. http://example.com): #{url}"
157
178
  end
158
179
 
159
180
  case type
160
- when :base # http://www.google.com
161
- to_base == url.to_base
181
+ when :origin # http://www.google.com:81
182
+ to_origin == url.to_origin
162
183
  when :host # www.google.com
163
184
  to_host == url.to_host
164
185
  when :domain # google.com
@@ -177,18 +198,20 @@ protocol scheme: #{url}"
177
198
  @uri.absolute?
178
199
  end
179
200
 
180
- # Returns if self is a valid and absolute HTTP Url or not.
201
+ # Returns if self is a valid and absolute HTTP URL or not. Self should
202
+ # always be crawlable if this method returns true.
181
203
  #
182
- # @return [Boolean] True if valid and absolute, otherwise false.
204
+ # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
183
205
  def valid?
184
206
  return false if relative?
185
- return false unless start_with?('http://') || start_with?('https://')
186
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
207
+ return false unless to_origin && to_domain
208
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
187
209
 
188
210
  true
189
211
  end
190
212
 
191
- # Returns if self is an invalid (relative) HTTP Url or not.
213
+ # Returns if self is an invalid (e.g. relative) HTTP URL. See
214
+ # Wgit::Url#valid? for the inverse (and more information).
192
215
  #
193
216
  # @return [Boolean] True if invalid, otherwise false.
194
217
  def invalid?
@@ -205,7 +228,7 @@ protocol scheme: #{url}"
205
228
  raise 'other must be relative' unless other.relative?
206
229
 
207
230
  other = other.omit_leading_slash
208
- separator = other.start_with?('#') || other.start_with?('?') ? '' : '/'
231
+ separator = %w[# ? .].include?(other[0]) ? '' : '/'
209
232
 
210
233
  # We use to_s below to call String#+, not Wgit::Url#+ (alias for concat).
211
234
  concatted = omit_trailing_slash.to_s + separator.to_s + other.to_s
@@ -213,7 +236,8 @@ protocol scheme: #{url}"
213
236
  Wgit::Url.new(concatted)
214
237
  end
215
238
 
216
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
239
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # This should be used before GET'ing the url, in case it has IRI chars.
217
241
  #
218
242
  # @return [Wgit::Url] An escaped version of self.
219
243
  def normalize
@@ -224,22 +248,23 @@ protocol scheme: #{url}"
224
248
  # modify the receiver.
225
249
  #
226
250
  # If self is absolute then it's returned as is, making this method
227
- # idempotent. The doc's <base> element is used if present, otherwise
228
- # doc.url is used as the base; which is concatted with self.
251
+ # idempotent. The doc's `<base>` element is used if present, otherwise
252
+ # `doc.url` is used as the base; which is concatted with self.
229
253
  #
230
- # Typically used to build an absolute link obtained from a document e.g.
254
+ # Typically used to build an absolute link obtained from a document.
231
255
  #
256
+ # @example
232
257
  # link = Wgit::Url.new('/favicon.png')
233
258
  # doc = Wgit::Document.new('http://example.com')
234
259
  #
235
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
260
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
236
261
  #
237
262
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
238
263
  # self.
239
264
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
240
265
  # raises an Exception.
241
266
  # @return [Wgit::Url] Self in absolute form.
242
- def prefix_base(doc)
267
+ def make_absolute(doc)
243
268
  assert_type(doc, Wgit::Document)
244
269
 
245
270
  absolute? ? self : doc.base_url(link: self).concat(self)
@@ -268,8 +293,7 @@ protocol scheme: #{url}"
268
293
  #
269
294
  # @return [Hash] self's instance vars as a Hash.
270
295
  def to_h
271
- ignore = ['@uri']
272
- h = Wgit::Utils.to_h(self, ignore: ignore)
296
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
273
297
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
274
298
  end
275
299
 
@@ -312,6 +336,20 @@ protocol scheme: #{url}"
312
336
  host ? Wgit::Url.new(host) : nil
313
337
  end
314
338
 
339
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
340
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
341
+ #
342
+ # @return [Wgit::Url, nil] Containing just the port or nil.
343
+ def to_port
344
+ port = @uri.port
345
+
346
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
347
+ return nil unless port
348
+ return nil unless include?(":#{port}")
349
+
350
+ Wgit::Url.new(port.to_s)
351
+ end
352
+
315
353
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
316
354
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
317
355
  #
@@ -321,6 +359,20 @@ protocol scheme: #{url}"
321
359
  domain ? Wgit::Url.new(domain) : nil
322
360
  end
323
361
 
362
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
363
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
364
+ #
365
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
366
+ def to_sub_domain
367
+ return nil unless to_host
368
+
369
+ dot_domain = ".#{to_domain}"
370
+ return nil unless include?(dot_domain)
371
+
372
+ sub_domain = to_host.sub(dot_domain, '')
373
+ Wgit::Url.new(sub_domain)
374
+ end
375
+
324
376
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
325
377
  # Given http://www.google.co.uk/about.html, google is returned.
326
378
  #
@@ -336,12 +388,24 @@ protocol scheme: #{url}"
336
388
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
337
389
  # nil.
338
390
  def to_base
339
- return nil if @uri.scheme.nil? || @uri.host.nil?
391
+ return nil unless @uri.scheme && @uri.host
340
392
 
341
393
  base = "#{@uri.scheme}://#{@uri.host}"
342
394
  Wgit::Url.new(base)
343
395
  end
344
396
 
397
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
398
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
399
+ # returned. If there's no port present, then to_base is returned.
400
+ #
401
+ # @return [Wgit::Url, nil] The origin of self or nil.
402
+ def to_origin
403
+ return nil unless to_base
404
+ return to_base unless to_port
405
+
406
+ Wgit::Url.new("#{to_base}:#{to_port}")
407
+ end
408
+
345
409
  # Returns the path of this URL e.g. the bit after the host without slashes.
346
410
  # For example:
347
411
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -370,7 +434,7 @@ protocol scheme: #{url}"
370
434
  end
371
435
 
372
436
  # Returns a new Wgit::Url containing just the query string of this URL
373
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
437
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
374
438
  #
375
439
  # @return [Wgit::Url, nil] Containing just the query string or nil.
376
440
  def to_query
@@ -378,6 +442,24 @@ protocol scheme: #{url}"
378
442
  query ? Wgit::Url.new(query) : nil
379
443
  end
380
444
 
445
+ # Returns a Hash containing just the query string parameters of this URL
446
+ # e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
447
+ #
448
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
449
+ # true, Strings otherwise.
450
+ # @return [Hash<String | Symbol, String>] Containing the query string
451
+ # params or empty if the URL doesn't contain any query parameters.
452
+ def to_query_hash(symbolize_keys: false)
453
+ query_str = to_query
454
+ return {} unless query_str
455
+
456
+ query_str.split('&').each_with_object({}) do |param, hash|
457
+ k, v = param.split('=')
458
+ k = k.to_sym if symbolize_keys
459
+ hash[k] = v
460
+ end
461
+ end
462
+
381
463
  # Returns a new Wgit::Url containing just the fragment string of this URL
382
464
  # e.g. Given http://google.com#about, #about is returned.
383
465
  #
@@ -399,6 +481,24 @@ protocol scheme: #{url}"
399
481
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
400
482
  end
401
483
 
484
+ # Returns a new Wgit::Url containing just the username string of this URL
485
+ # e.g. Given http://me:pass1@example.com, me is returned.
486
+ #
487
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
488
+ def to_user
489
+ user = @uri.user
490
+ user ? Wgit::Url.new(user) : nil
491
+ end
492
+
493
+ # Returns a new Wgit::Url containing just the password string of this URL
494
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
495
+ #
496
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
497
+ def to_password
498
+ password = @uri.password
499
+ password ? Wgit::Url.new(password) : nil
500
+ end
501
+
402
502
  # Omits the given URL components from self and returns a new Wgit::Url.
403
503
  #
404
504
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -442,7 +542,7 @@ protocol scheme: #{url}"
442
542
  .omit_trailing_slash
443
543
  end
444
544
 
445
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
545
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
446
546
  # http://google.com/search?q=something#about, search?q=something#about is
447
547
  # returned. If relative and base isn't present then self is returned.
448
548
  # Leading and trailing slashes are always stripped from the return value.
@@ -457,6 +557,21 @@ protocol scheme: #{url}"
457
557
  Wgit::Url.new(omit_base).omit_slashes
458
558
  end
459
559
 
560
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
561
+ # http://google.com:81/search?q=something#about, search?q=something#about is
562
+ # returned. If relative and base isn't present then self is returned.
563
+ # Leading and trailing slashes are always stripped from the return value.
564
+ #
565
+ # @return [Wgit::Url] Self containing everything after the origin.
566
+ def omit_origin
567
+ origin = to_origin
568
+ omit_origin = origin ? gsub(origin, '') : self
569
+
570
+ return self if ['', '/'].include?(omit_origin)
571
+
572
+ Wgit::Url.new(omit_origin).omit_slashes
573
+ end
574
+
460
575
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
461
576
  # http://google.com/search?q=hello, http://google.com/search is
462
577
  # returned. Self is returned as is if no query string is present. A URL
@@ -502,25 +617,38 @@ protocol scheme: #{url}"
502
617
  start_with?('#')
503
618
  end
504
619
 
620
+ # Returns true if self equals '/' a.k.a. index.
621
+ #
622
+ # @return [Boolean] True if self equals '/', false otherwise.
623
+ def index?
624
+ self == '/'
625
+ end
626
+
505
627
  alias + concat
506
628
  alias crawled? crawled
507
- alias normalise normalize
508
629
  alias is_relative? relative?
509
630
  alias is_absolute? absolute?
510
631
  alias is_valid? valid?
511
632
  alias is_query? query?
512
633
  alias is_fragment? fragment?
634
+ alias is_index? index?
513
635
  alias uri to_uri
514
636
  alias url to_url
515
637
  alias scheme to_scheme
516
638
  alias host to_host
639
+ alias port to_port
517
640
  alias domain to_domain
518
641
  alias brand to_brand
519
642
  alias base to_base
643
+ alias origin to_origin
520
644
  alias path to_path
521
645
  alias endpoint to_endpoint
522
646
  alias query to_query
647
+ alias query_hash to_query_hash
523
648
  alias fragment to_fragment
524
649
  alias extension to_extension
650
+ alias user to_user
651
+ alias password to_password
652
+ alias sub_domain to_sub_domain
525
653
  end
526
654
  end