wgit 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  module Wgit
2
- # Response class representing a generic HTTP crawl response.
2
+ # Response class modeling a generic HTTP GET response.
3
3
  class Response
4
4
  # The underlying HTTP adapter/library response object.
5
5
  attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
69
69
  # @param headers [Hash] The new response headers.
70
70
  # @return [Hash] @headers's new value.
71
71
  def headers=(headers)
72
- return @headers = {} unless headers
72
+ unless headers
73
+ @headers = {}
74
+ return
75
+ end
73
76
 
74
77
  @headers = headers.map do |k, v|
75
78
  k = k.downcase.gsub('-', '_').to_sym
@@ -6,15 +6,15 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
@@ -28,7 +28,7 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
34
  # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
@@ -99,10 +99,10 @@ module Wgit
99
99
  # @param obj [Object] The object to parse, which #is_a?(String).
100
100
  # @raise [StandardError] If obj.is_a?(String) is false.
101
101
  # @return [Wgit::Url, nil] A Wgit::Url instance or nil (if obj is invalid).
102
- def self.parse_or_nil(obj)
102
+ def self.parse?(obj)
103
103
  parse(obj)
104
104
  rescue Addressable::URI::InvalidURIError
105
- Wgit.logger.debug("Wgit::Url.parse_or_nil('#{obj}') exception: \
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
106
  Addressable::URI::InvalidURIError")
107
107
  nil
108
108
  end
@@ -115,8 +115,6 @@ Addressable::URI::InvalidURIError")
115
115
  def crawled=(bool)
116
116
  @crawled = bool
117
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
-
119
- bool
120
118
  end
121
119
 
122
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -146,10 +144,10 @@ Addressable::URI::InvalidURIError")
146
144
  # @param opts [Hash] The options with which to check relativity. Only one
147
145
  # opts param should be provided. The provided opts param Url must be
148
146
  # absolute and be prefixed with a scheme. Consider using the output of
149
- # Wgit::Url#to_base which should work (unless it's nil).
150
- # @option opts [Wgit::Url, String] :base The Url base e.g.
151
- # http://www.google.com/how which gives a base of
152
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
153
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
154
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
155
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
@@ -160,7 +158,7 @@ Addressable::URI::InvalidURIError")
160
158
  # param has been provided.
161
159
  # @return [Boolean] True if relative, false if absolute.
162
160
  def relative?(opts = {})
163
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
164
162
  opts = defaults.merge(opts)
165
163
  raise 'Url (self) cannot be empty' if empty?
166
164
 
@@ -180,8 +178,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
180
178
  end
181
179
 
182
180
  case type
183
- when :base # http://www.google.com
184
- to_base == url.to_base
181
+ when :origin # http://www.google.com:81
182
+ to_origin == url.to_origin
185
183
  when :host # www.google.com
186
184
  to_host == url.to_host
187
185
  when :domain # google.com
@@ -206,8 +204,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
206
204
  # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
207
205
  def valid?
208
206
  return false if relative?
209
- return false unless to_base && to_domain
210
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
207
+ return false unless to_origin && to_domain
208
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
211
209
 
212
210
  true
213
211
  end
@@ -238,7 +236,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
238
236
  Wgit::Url.new(concatted)
239
237
  end
240
238
 
241
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
239
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # This should be used before GET'ing the url, in case it has IRI chars.
242
241
  #
243
242
  # @return [Wgit::Url] An escaped version of self.
244
243
  def normalize
@@ -249,8 +248,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
249
248
  # modify the receiver.
250
249
  #
251
250
  # If self is absolute then it's returned as is, making this method
252
- # idempotent. The doc's <base> element is used if present, otherwise
253
- # doc.url is used as the base; which is concatted with self.
251
+ # idempotent. The doc's `<base>` element is used if present, otherwise
252
+ # `doc.url` is used as the base; which is concatted with self.
254
253
  #
255
254
  # Typically used to build an absolute link obtained from a document.
256
255
  #
@@ -258,14 +257,14 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
258
257
  # link = Wgit::Url.new('/favicon.png')
259
258
  # doc = Wgit::Document.new('http://example.com')
260
259
  #
261
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
260
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
262
261
  #
263
262
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
264
263
  # self.
265
264
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
266
265
  # raises an Exception.
267
266
  # @return [Wgit::Url] Self in absolute form.
268
- def prefix_base(doc)
267
+ def make_absolute(doc)
269
268
  assert_type(doc, Wgit::Document)
270
269
 
271
270
  absolute? ? self : doc.base_url(link: self).concat(self)
@@ -294,8 +293,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
294
293
  #
295
294
  # @return [Hash] self's instance vars as a Hash.
296
295
  def to_h
297
- ignore = ['@uri']
298
- h = Wgit::Utils.to_h(self, ignore: ignore)
296
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
299
297
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
300
298
  end
301
299
 
@@ -338,6 +336,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
338
336
  host ? Wgit::Url.new(host) : nil
339
337
  end
340
338
 
339
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
340
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
341
+ #
342
+ # @return [Wgit::Url, nil] Containing just the port or nil.
343
+ def to_port
344
+ port = @uri.port
345
+
346
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
347
+ return nil unless port
348
+ return nil unless include?(":#{port}")
349
+
350
+ Wgit::Url.new(port.to_s)
351
+ end
352
+
341
353
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
342
354
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
343
355
  #
@@ -347,6 +359,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
347
359
  domain ? Wgit::Url.new(domain) : nil
348
360
  end
349
361
 
362
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
363
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
364
+ #
365
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
366
+ def to_sub_domain
367
+ return nil unless to_host
368
+
369
+ dot_domain = ".#{to_domain}"
370
+ return nil unless include?(dot_domain)
371
+
372
+ sub_domain = to_host.sub(dot_domain, '')
373
+ Wgit::Url.new(sub_domain)
374
+ end
375
+
350
376
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
351
377
  # Given http://www.google.co.uk/about.html, google is returned.
352
378
  #
@@ -362,12 +388,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
362
388
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
363
389
  # nil.
364
390
  def to_base
365
- return nil if @uri.scheme.nil? || @uri.host.nil?
391
+ return nil unless @uri.scheme && @uri.host
366
392
 
367
393
  base = "#{@uri.scheme}://#{@uri.host}"
368
394
  Wgit::Url.new(base)
369
395
  end
370
396
 
397
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
398
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
399
+ # returned. If there's no port present, then to_base is returned.
400
+ #
401
+ # @return [Wgit::Url, nil] The origin of self or nil.
402
+ def to_origin
403
+ return nil unless to_base
404
+ return to_base unless to_port
405
+
406
+ Wgit::Url.new("#{to_base}:#{to_port}")
407
+ end
408
+
371
409
  # Returns the path of this URL e.g. the bit after the host without slashes.
372
410
  # For example:
373
411
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -396,7 +434,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
396
434
  end
397
435
 
398
436
  # Returns a new Wgit::Url containing just the query string of this URL
399
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
437
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
400
438
  #
401
439
  # @return [Wgit::Url, nil] Containing just the query string or nil.
402
440
  def to_query
@@ -404,6 +442,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
404
442
  query ? Wgit::Url.new(query) : nil
405
443
  end
406
444
 
445
+ # Returns a Hash containing just the query string parameters of this URL
446
+ # e.g. Given http://google.com?q=ruby, { 'q' => 'ruby' } is returned.
447
+ #
448
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
449
+ # true, Strings otherwise.
450
+ # @return [Hash<String | Symbol, String>] Containing the query string
451
+ # params or empty if the URL doesn't contain any query parameters.
452
+ def to_query_hash(symbolize_keys: false)
453
+ query_str = to_query
454
+ return {} unless query_str
455
+
456
+ query_str.split('&').each_with_object({}) do |param, hash|
457
+ k, v = param.split('=')
458
+ k = k.to_sym if symbolize_keys
459
+ hash[k] = v
460
+ end
461
+ end
462
+
407
463
  # Returns a new Wgit::Url containing just the fragment string of this URL
408
464
  # e.g. Given http://google.com#about, #about is returned.
409
465
  #
@@ -425,6 +481,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
425
481
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
426
482
  end
427
483
 
484
+ # Returns a new Wgit::Url containing just the username string of this URL
485
+ # e.g. Given http://me:pass1@example.com, me is returned.
486
+ #
487
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
488
+ def to_user
489
+ user = @uri.user
490
+ user ? Wgit::Url.new(user) : nil
491
+ end
492
+
493
+ # Returns a new Wgit::Url containing just the password string of this URL
494
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
495
+ #
496
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
497
+ def to_password
498
+ password = @uri.password
499
+ password ? Wgit::Url.new(password) : nil
500
+ end
501
+
428
502
  # Omits the given URL components from self and returns a new Wgit::Url.
429
503
  #
430
504
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -468,7 +542,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
468
542
  .omit_trailing_slash
469
543
  end
470
544
 
471
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
545
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
472
546
  # http://google.com/search?q=something#about, search?q=something#about is
473
547
  # returned. If relative and base isn't present then self is returned.
474
548
  # Leading and trailing slashes are always stripped from the return value.
@@ -483,6 +557,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
483
557
  Wgit::Url.new(omit_base).omit_slashes
484
558
  end
485
559
 
560
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
561
+ # http://google.com:81/search?q=something#about, search?q=something#about is
562
+ # returned. If relative and base isn't present then self is returned.
563
+ # Leading and trailing slashes are always stripped from the return value.
564
+ #
565
+ # @return [Wgit::Url] Self containing everything after the origin.
566
+ def omit_origin
567
+ origin = to_origin
568
+ omit_origin = origin ? gsub(origin, '') : self
569
+
570
+ return self if ['', '/'].include?(omit_origin)
571
+
572
+ Wgit::Url.new(omit_origin).omit_slashes
573
+ end
574
+
486
575
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
487
576
  # http://google.com/search?q=hello, http://google.com/search is
488
577
  # returned. Self is returned as is if no query string is present. A URL
@@ -528,25 +617,38 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
528
617
  start_with?('#')
529
618
  end
530
619
 
620
+ # Returns true if self equals '/' a.k.a. index.
621
+ #
622
+ # @return [Boolean] True if self equals '/', false otherwise.
623
+ def index?
624
+ self == '/'
625
+ end
626
+
531
627
  alias + concat
532
628
  alias crawled? crawled
533
- alias normalise normalize
534
629
  alias is_relative? relative?
535
630
  alias is_absolute? absolute?
536
631
  alias is_valid? valid?
537
632
  alias is_query? query?
538
633
  alias is_fragment? fragment?
634
+ alias is_index? index?
539
635
  alias uri to_uri
540
636
  alias url to_url
541
637
  alias scheme to_scheme
542
638
  alias host to_host
639
+ alias port to_port
543
640
  alias domain to_domain
544
641
  alias brand to_brand
545
642
  alias base to_base
643
+ alias origin to_origin
546
644
  alias path to_path
547
645
  alias endpoint to_endpoint
548
646
  alias query to_query
647
+ alias query_hash to_query_hash
549
648
  alias fragment to_fragment
550
649
  alias extension to_extension
650
+ alias user to_user
651
+ alias password to_password
652
+ alias sub_domain to_sub_domain
551
653
  end
552
654
  end
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr, encode: true)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str, encode: encode) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
@@ -2,10 +2,11 @@
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
4
4
  # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.8.0'
9
+ VERSION = '0.9.0'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version