wgit 0.5.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/response.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module Wgit
2
- # Response class representing a generic HTTP crawl response.
2
+ # Response class modeling a generic HTTP GET response.
3
3
  class Response
4
4
  # The underlying HTTP adapter/library response object.
5
5
  attr_accessor :adapter_response
@@ -56,11 +56,11 @@ module Wgit
56
56
  @body.empty? ? nil : @body
57
57
  end
58
58
 
59
- # Returns true if the response isn't a #success? or a #redirect?
59
+ # Returns whether or not a server response is absent.
60
60
  #
61
- # @return [Boolean] True if failed, false otherwise.
61
+ # @return [Boolean] True if the status is nil or < 1, false otherwise.
62
62
  def failure?
63
- !success? && !redirect?
63
+ !success?
64
64
  end
65
65
 
66
66
  # Sets the headers Hash to the given value. The header keys are mapped
@@ -69,7 +69,10 @@ module Wgit
69
69
  # @param headers [Hash] The new response headers.
70
70
  # @return [Hash] @headers's new value.
71
71
  def headers=(headers)
72
- return @headers = {} unless headers
72
+ unless headers
73
+ @headers = {}
74
+ return
75
+ end
73
76
 
74
77
  @headers = headers.map do |k, v|
75
78
  k = k.downcase.gsub('-', '_').to_sym
@@ -122,20 +125,20 @@ module Wgit
122
125
  @status = int.positive? ? int : nil
123
126
  end
124
127
 
125
- # Returns whether or not the response is a 2xx Success.
128
+ # Returns whether or not a server response is present.
126
129
  #
127
- # @return [Boolean] True if 2xx Success, false otherwise.
130
+ # @return [Boolean] True if the status is > 0, false otherwise.
128
131
  def success?
129
132
  return false unless @status
130
133
 
131
- @status.between?(200, 299)
134
+ @status.positive?
132
135
  end
133
136
 
134
- alias code status
135
- alias content body
136
- alias crawl_time total_time
137
- alias to_s body
138
- alias redirects redirections
139
- alias length size
137
+ alias code status
138
+ alias content body
139
+ alias crawl_duration total_time
140
+ alias to_s body
141
+ alias redirects redirections
142
+ alias length size
140
143
  end
141
144
  end
data/lib/wgit/url.rb CHANGED
@@ -6,20 +6,20 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
21
21
  # Whether or not the Url has been crawled. A custom crawled= method
22
- # is provided by this class, overridding the default one.
22
+ # is provided by this class.
23
23
  attr_reader :crawled
24
24
 
25
25
  # The Time stamp of when this Url was crawled.
@@ -28,10 +28,10 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
- # @param url_or_obj [String, Wgit::Url, Object#fetch#[]] Is either a String
34
+ # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
35
35
  # based URL or an object representing a Database record e.g. a MongoDB
36
36
  # document/object.
37
37
  # @param crawled [Boolean] Whether or not the HTML of the URL's web page
@@ -90,6 +90,23 @@ module Wgit
90
90
  obj.is_a?(Wgit::Url) ? obj : new(obj)
91
91
  end
92
92
 
93
+ # Returns a Wgit::Url instance from Wgit::Url.parse, or nil if obj cannot
94
+ # be parsed successfully e.g. the String is invalid.
95
+ #
96
+ # Use this method when you can't guarantee that obj is parsable as a URL.
97
+ # See Wgit::Url.parse for more information.
98
+ #
99
+ # @param obj [Object] The object to parse, which #is_a?(String).
100
+ # @raise [StandardError] If obj.is_a?(String) is false.
101
+ # @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
102
+ def self.parse?(obj)
103
+ parse(obj)
104
+ rescue Addressable::URI::InvalidURIError
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
+ Addressable::URI::InvalidURIError")
107
+ nil
108
+ end
109
+
93
110
  # Sets the @crawled instance var, also setting @date_crawled for
94
111
  # convenience.
95
112
  #
@@ -98,8 +115,6 @@ module Wgit
98
115
  def crawled=(bool)
99
116
  @crawled = bool
100
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
101
-
102
- bool
103
118
  end
104
119
 
105
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -114,33 +129,40 @@ module Wgit
114
129
 
115
130
  # Returns true if self is a relative Url; false if absolute.
116
131
  #
117
- # All external links in a page are expected to have a scheme prefix e.g.
118
- # 'http://', otherwise the link is treated as an internal link (regardless
132
+ # An absolute URL must have a scheme prefix e.g.
133
+ # 'http://', otherwise the URL is regarded as being relative (regardless
119
134
  # of whether it's valid or not). The only exception is if an opts arg is
120
135
  # provided and self is a page belonging to that arg type e.g. host; then
121
136
  # the link is relative.
122
137
  #
138
+ # @example
139
+ # url = Wgit::Url.new('http://example.com/about')
140
+ #
141
+ # url.relative? # => false
142
+ # url.relative?(host: 'http://example.com') # => true
143
+ #
123
144
  # @param opts [Hash] The options with which to check relativity. Only one
124
145
  # opts param should be provided. The provided opts param Url must be
125
146
  # absolute and be prefixed with a scheme. Consider using the output of
126
- # Wgit::Url#to_base which should work unless it's nil.
127
- # @option opts [Wgit::Url, String] :base The Url base e.g.
128
- # http://www.google.com/how which gives a base of
129
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
130
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
131
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
132
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
133
154
  # http://www.google.com/how which gives a domain of 'google.com'.
134
155
  # @option opts [Wgit::Url, String] :brand The Url brand e.g.
135
156
  # http://www.google.com/how which gives a brand of 'google'.
136
- # @raise [StandardError] If self is invalid e.g. empty or an invalid opts
157
+ # @raise [StandardError] If self is invalid (e.g. empty) or an invalid opts
137
158
  # param has been provided.
138
159
  # @return [Boolean] True if relative, false if absolute.
139
160
  def relative?(opts = {})
140
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
141
162
  opts = defaults.merge(opts)
142
163
  raise 'Url (self) cannot be empty' if empty?
143
164
 
165
+ return false if scheme_relative?
144
166
  return true if @uri.relative?
145
167
 
146
168
  # Self is absolute but may be relative to the opts param e.g. host.
@@ -151,14 +173,14 @@ module Wgit
151
173
 
152
174
  type, url = opts.first
153
175
  url = Wgit::Url.new(url)
154
- unless url.to_base
155
- raise "Invalid opts param value, Url must be absolute and contain \
156
- protocol scheme: #{url}"
176
+ if url.invalid?
177
+ raise "Invalid opts param value, it must be absolute, containing a \
178
+ protocol scheme and domain (e.g. http://example.com): #{url}"
157
179
  end
158
180
 
159
181
  case type
160
- when :base # http://www.google.com
161
- to_base == url.to_base
182
+ when :origin # http://www.google.com:81
183
+ to_origin == url.to_origin
162
184
  when :host # www.google.com
163
185
  to_host == url.to_host
164
186
  when :domain # google.com
@@ -177,18 +199,20 @@ protocol scheme: #{url}"
177
199
  @uri.absolute?
178
200
  end
179
201
 
180
- # Returns if self is a valid and absolute HTTP Url or not.
202
+ # Returns if self is a valid and absolute HTTP URL or not. Self should
203
+ # always be crawlable if this method returns true.
181
204
  #
182
- # @return [Boolean] True if valid and absolute, otherwise false.
205
+ # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
183
206
  def valid?
184
207
  return false if relative?
185
- return false unless start_with?('http://') || start_with?('https://')
186
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
208
+ return false unless to_origin && to_domain
209
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
187
210
 
188
211
  true
189
212
  end
190
213
 
191
- # Returns if self is an invalid (relative) HTTP Url or not.
214
+ # Returns if self is an invalid (e.g. relative) HTTP URL. See
215
+ # Wgit::Url#valid? for the inverse (and more information).
192
216
  #
193
217
  # @return [Boolean] True if invalid, otherwise false.
194
218
  def invalid?
@@ -213,7 +237,8 @@ protocol scheme: #{url}"
213
237
  Wgit::Url.new(concatted)
214
238
  end
215
239
 
216
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
241
+ # This should be used before GET'ing the url, in case it has IRI chars.
217
242
  #
218
243
  # @return [Wgit::Url] An escaped version of self.
219
244
  def normalize
@@ -224,43 +249,46 @@ protocol scheme: #{url}"
224
249
  # modify the receiver.
225
250
  #
226
251
  # If self is absolute then it's returned as is, making this method
227
- # idempotent. The doc's <base> element is used if present, otherwise
228
- # doc.url is used as the base; which is concatted with self.
252
+ # idempotent. The doc's `<base>` element is used if present, otherwise
253
+ # `doc.url` is used as the base; which is concatted with self.
229
254
  #
230
- # Typically used to build an absolute link obtained from a document e.g.
255
+ # Typically used to build an absolute link obtained from a document.
231
256
  #
257
+ # @example
232
258
  # link = Wgit::Url.new('/favicon.png')
233
259
  # doc = Wgit::Document.new('http://example.com')
234
260
  #
235
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
261
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
236
262
  #
237
263
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
238
264
  # self.
239
265
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
240
266
  # raises an Exception.
241
267
  # @return [Wgit::Url] Self in absolute form.
242
- def prefix_base(doc)
268
+ def make_absolute(doc)
243
269
  assert_type(doc, Wgit::Document)
270
+ raise 'Cannot make absolute when Document @url is not valid' \
271
+ unless doc.url.valid?
272
+
273
+ return prefix_scheme(doc.url.to_scheme&.to_sym) if scheme_relative?
244
274
 
245
275
  absolute? ? self : doc.base_url(link: self).concat(self)
246
276
  end
247
277
 
248
- # Returns self having prefixed a protocol scheme. Doesn't modify receiver.
278
+ # Returns self having prefixed a scheme/protocol. Doesn't modify receiver.
249
279
  # Returns self even if absolute (with scheme); therefore is idempotent.
250
280
  #
251
- # @param protocol [Symbol] Either :http or :https.
252
- # @return [Wgit::Url] Self with a protocol scheme prefix.
253
- def prefix_scheme(protocol: :http)
254
- return self if absolute?
255
-
256
- case protocol
257
- when :http
258
- Wgit::Url.new("http://#{url}")
259
- when :https
260
- Wgit::Url.new("https://#{url}")
261
- else
262
- raise "protocol must be :http or :https, not :#{protocol}"
281
+ # @param scheme [Symbol] Either :http or :https.
282
+ # @return [Wgit::Url] Self with a scheme prefix.
283
+ def prefix_scheme(scheme = :http)
284
+ unless %i[http https].include?(scheme)
285
+ raise "scheme must be :http or :https, not :#{scheme}"
263
286
  end
287
+
288
+ return self if absolute? && !scheme_relative?
289
+
290
+ separator = scheme_relative? ? '' : '//'
291
+ Wgit::Url.new("#{scheme}:#{separator}#{self}")
264
292
  end
265
293
 
266
294
  # Returns a Hash containing this Url's instance vars excluding @uri.
@@ -268,8 +296,7 @@ protocol scheme: #{url}"
268
296
  #
269
297
  # @return [Hash] self's instance vars as a Hash.
270
298
  def to_h
271
- ignore = ['@uri']
272
- h = Wgit::Utils.to_h(self, ignore: ignore)
299
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
273
300
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
274
301
  end
275
302
 
@@ -312,6 +339,20 @@ protocol scheme: #{url}"
312
339
  host ? Wgit::Url.new(host) : nil
313
340
  end
314
341
 
342
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
343
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
344
+ #
345
+ # @return [Wgit::Url, nil] Containing just the port or nil.
346
+ def to_port
347
+ port = @uri.port
348
+
349
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
350
+ return nil unless port
351
+ return nil unless include?(":#{port}")
352
+
353
+ Wgit::Url.new(port.to_s)
354
+ end
355
+
315
356
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
316
357
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
317
358
  #
@@ -321,6 +362,20 @@ protocol scheme: #{url}"
321
362
  domain ? Wgit::Url.new(domain) : nil
322
363
  end
323
364
 
365
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
366
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
367
+ #
368
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
369
+ def to_sub_domain
370
+ return nil unless to_host
371
+
372
+ dot_domain = ".#{to_domain}"
373
+ return nil unless include?(dot_domain)
374
+
375
+ sub_domain = to_host.sub(dot_domain, '')
376
+ Wgit::Url.new(sub_domain)
377
+ end
378
+
324
379
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
325
380
  # Given http://www.google.co.uk/about.html, google is returned.
326
381
  #
@@ -336,12 +391,24 @@ protocol scheme: #{url}"
336
391
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
337
392
  # nil.
338
393
  def to_base
339
- return nil if @uri.scheme.nil? || @uri.host.nil?
394
+ return nil unless @uri.scheme && @uri.host
340
395
 
341
396
  base = "#{@uri.scheme}://#{@uri.host}"
342
397
  Wgit::Url.new(base)
343
398
  end
344
399
 
400
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
401
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
402
+ # returned. If there's no port present, then to_base is returned.
403
+ #
404
+ # @return [Wgit::Url, nil] The origin of self or nil.
405
+ def to_origin
406
+ return nil unless to_base
407
+ return to_base unless to_port
408
+
409
+ Wgit::Url.new("#{to_base}:#{to_port}")
410
+ end
411
+
345
412
  # Returns the path of this URL e.g. the bit after the host without slashes.
346
413
  # For example:
347
414
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -370,7 +437,7 @@ protocol scheme: #{url}"
370
437
  end
371
438
 
372
439
  # Returns a new Wgit::Url containing just the query string of this URL
373
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
440
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
374
441
  #
375
442
  # @return [Wgit::Url, nil] Containing just the query string or nil.
376
443
  def to_query
@@ -378,6 +445,24 @@ protocol scheme: #{url}"
378
445
  query ? Wgit::Url.new(query) : nil
379
446
  end
380
447
 
448
+ # Returns a Hash containing just the query string parameters of this URL
449
+ # e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
450
+ #
451
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
452
+ # true, Strings otherwise.
453
+ # @return [Hash<String | Symbol, String>] Containing the query string
454
+ # params or empty if the URL doesn't contain any query parameters.
455
+ def to_query_hash(symbolize_keys: false)
456
+ query_str = to_query
457
+ return {} unless query_str
458
+
459
+ query_str.split('&').each_with_object({}) do |param, hash|
460
+ k, v = param.split('=')
461
+ k = k.to_sym if symbolize_keys
462
+ hash[k] = v
463
+ end
464
+ end
465
+
381
466
  # Returns a new Wgit::Url containing just the fragment string of this URL
382
467
  # e.g. Given http://google.com#about, #about is returned.
383
468
  #
@@ -399,6 +484,24 @@ protocol scheme: #{url}"
399
484
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
400
485
  end
401
486
 
487
+ # Returns a new Wgit::Url containing just the username string of this URL
488
+ # e.g. Given http://me:pass1@example.com, me is returned.
489
+ #
490
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
491
+ def to_user
492
+ user = @uri.user
493
+ user ? Wgit::Url.new(user) : nil
494
+ end
495
+
496
+ # Returns a new Wgit::Url containing just the password string of this URL
497
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
498
+ #
499
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
500
+ def to_password
501
+ password = @uri.password
502
+ password ? Wgit::Url.new(password) : nil
503
+ end
504
+
402
505
  # Omits the given URL components from self and returns a new Wgit::Url.
403
506
  #
404
507
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -442,7 +545,7 @@ protocol scheme: #{url}"
442
545
  .omit_trailing_slash
443
546
  end
444
547
 
445
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
548
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
446
549
  # http://google.com/search?q=something#about, search?q=something#about is
447
550
  # returned. If relative and base isn't present then self is returned.
448
551
  # Leading and trailing slashes are always stripped from the return value.
@@ -457,6 +560,21 @@ protocol scheme: #{url}"
457
560
  Wgit::Url.new(omit_base).omit_slashes
458
561
  end
459
562
 
563
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
564
+ # http://google.com:81/search?q=something#about, search?q=something#about is
565
+ # returned. If relative and base isn't present then self is returned.
566
+ # Leading and trailing slashes are always stripped from the return value.
567
+ #
568
+ # @return [Wgit::Url] Self containing everything after the origin.
569
+ def omit_origin
570
+ origin = to_origin
571
+ omit_origin = origin ? gsub(origin, '') : self
572
+
573
+ return self if ['', '/'].include?(omit_origin)
574
+
575
+ Wgit::Url.new(omit_origin).omit_slashes
576
+ end
577
+
460
578
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
461
579
  # http://google.com/search?q=hello, http://google.com/search is
462
580
  # returned. Self is returned as is if no query string is present. A URL
@@ -502,25 +620,47 @@ protocol scheme: #{url}"
502
620
  start_with?('#')
503
621
  end
504
622
 
505
- alias + concat
506
- alias crawled? crawled
507
- alias normalise normalize
508
- alias is_relative? relative?
509
- alias is_absolute? absolute?
510
- alias is_valid? valid?
511
- alias is_query? query?
512
- alias is_fragment? fragment?
513
- alias uri to_uri
514
- alias url to_url
515
- alias scheme to_scheme
516
- alias host to_host
517
- alias domain to_domain
518
- alias brand to_brand
519
- alias base to_base
520
- alias path to_path
521
- alias endpoint to_endpoint
522
- alias query to_query
523
- alias fragment to_fragment
524
- alias extension to_extension
623
+ # Returns true if self equals '/' a.k.a. index.
624
+ #
625
+ # @return [Boolean] True if self equals '/', false otherwise.
626
+ def index?
627
+ self == '/'
628
+ end
629
+
630
+ # Returns true if self starts with '//' a.k.a. a scheme/protocol relative
631
+ # path.
632
+ #
633
+ # @return [Boolean] True if self starts with '//', false otherwise.
634
+ def scheme_relative?
635
+ start_with?('//')
636
+ end
637
+
638
+ alias + concat
639
+ alias crawled? crawled
640
+ alias is_relative? relative?
641
+ alias is_absolute? absolute?
642
+ alias is_valid? valid?
643
+ alias is_query? query?
644
+ alias is_fragment? fragment?
645
+ alias is_index? index?
646
+ alias is_scheme_relative? scheme_relative?
647
+ alias uri to_uri
648
+ alias url to_url
649
+ alias scheme to_scheme
650
+ alias host to_host
651
+ alias port to_port
652
+ alias domain to_domain
653
+ alias brand to_brand
654
+ alias base to_base
655
+ alias origin to_origin
656
+ alias path to_path
657
+ alias endpoint to_endpoint
658
+ alias query to_query
659
+ alias query_hash to_query_hash
660
+ alias fragment to_fragment
661
+ alias extension to_extension
662
+ alias user to_user
663
+ alias password to_password
664
+ alias sub_domain to_sub_domain
525
665
  end
526
666
  end