wgit 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  module Wgit
2
- # Response class representing a generic HTTP crawl response.
2
+ # Response class modeling a generic HTTP GET response.
3
3
  class Response
4
4
  # The underlying HTTP adapter/library response object.
5
5
  attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
69
69
  # @param headers [Hash] The new response headers.
70
70
  # @return [Hash] @headers's new value.
71
71
  def headers=(headers)
72
- return @headers = {} unless headers
72
+ unless headers
73
+ @headers = {}
74
+ return
75
+ end
73
76
 
74
77
  @headers = headers.map do |k, v|
75
78
  k = k.downcase.gsub('-', '_').to_sym
@@ -6,15 +6,15 @@ require 'uri'
6
6
  require 'addressable/uri'
7
7
 
8
8
  module Wgit
9
- # Class modeling a web based HTTP URL.
9
+ # Class modeling/serialising a web based HTTP URL.
10
10
  #
11
11
  # Can be an internal/relative link e.g. "about.html" or an absolute URL
12
- # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri' and
13
- # 'addressable/uri' internally.
12
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
13
+ # `addressable/uri` internally for parsing.
14
14
  #
15
- # Most of the methods in this class return new Wgit::Url instances making the
16
- # method calls chainable e.g. url.omit_base.omit_fragment etc. The methods
17
- # also try to be idempotent where possible.
15
+ # Most of the methods in this class return new `Wgit::Url` instances making
16
+ # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
17
+ # methods also try to be idempotent where possible.
18
18
  class Url < String
19
19
  include Assertable
20
20
 
@@ -28,7 +28,7 @@ module Wgit
28
28
  # The duration of the crawl for this Url (in seconds).
29
29
  attr_accessor :crawl_duration
30
30
 
31
- # Initializes a new instance of Wgit::Url which represents a web based
31
+ # Initializes a new instance of Wgit::Url which models a web based
32
32
  # HTTP URL.
33
33
  #
34
34
  # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
@@ -99,10 +99,10 @@ module Wgit
99
99
  # @param obj [Object] The object to parse, which #is_a?(String).
100
100
  # @raise [StandardError] If obj.is_a?(String) is false.
101
101
  # @return [Wgit::Url] A Wgit::Url instance or nil (if obj is invalid).
102
- def self.parse_or_nil(obj)
102
+ def self.parse?(obj)
103
103
  parse(obj)
104
104
  rescue Addressable::URI::InvalidURIError
105
- Wgit.logger.debug("Wgit::Url.parse_or_nil('#{obj}') exception: \
105
+ Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
106
106
  Addressable::URI::InvalidURIError")
107
107
  nil
108
108
  end
@@ -115,8 +115,6 @@ Addressable::URI::InvalidURIError")
115
115
  def crawled=(bool)
116
116
  @crawled = bool
117
117
  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
118
-
119
- bool
120
118
  end
121
119
 
122
120
  # Overrides String#replace setting the new_url @uri and String value.
@@ -146,10 +144,10 @@ Addressable::URI::InvalidURIError")
146
144
  # @param opts [Hash] The options with which to check relativity. Only one
147
145
  # opts param should be provided. The provided opts param Url must be
148
146
  # absolute and be prefixed with a scheme. Consider using the output of
149
- # Wgit::Url#to_base which should work (unless it's nil).
150
- # @option opts [Wgit::Url, String] :base The Url base e.g.
151
- # http://www.google.com/how which gives a base of
152
- # 'http://www.google.com'.
147
+ # Wgit::Url#to_origin which should work (unless it's nil).
148
+ # @option opts [Wgit::Url, String] :origin The Url origin e.g.
149
+ # http://www.google.com:81/how which gives an origin of
150
+ # 'http://www.google.com:81'.
153
151
  # @option opts [Wgit::Url, String] :host The Url host e.g.
154
152
  # http://www.google.com/how which gives a host of 'www.google.com'.
155
153
  # @option opts [Wgit::Url, String] :domain The Url domain e.g.
@@ -160,7 +158,7 @@ Addressable::URI::InvalidURIError")
160
158
  # param has been provided.
161
159
  # @return [Boolean] True if relative, false if absolute.
162
160
  def relative?(opts = {})
163
- defaults = { base: nil, host: nil, domain: nil, brand: nil }
161
+ defaults = { origin: nil, host: nil, domain: nil, brand: nil }
164
162
  opts = defaults.merge(opts)
165
163
  raise 'Url (self) cannot be empty' if empty?
166
164
 
@@ -180,8 +178,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
180
178
  end
181
179
 
182
180
  case type
183
- when :base # http://www.google.com
184
- to_base == url.to_base
181
+ when :origin # http://www.google.com:81
182
+ to_origin == url.to_origin
185
183
  when :host # www.google.com
186
184
  to_host == url.to_host
187
185
  when :domain # google.com
@@ -206,8 +204,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
206
204
  # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
207
205
  def valid?
208
206
  return false if relative?
209
- return false unless to_base && to_domain
210
- return false if URI::DEFAULT_PARSER.make_regexp.match(normalize).nil?
207
+ return false unless to_origin && to_domain
208
+ return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
211
209
 
212
210
  true
213
211
  end
@@ -238,7 +236,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
238
236
  Wgit::Url.new(concatted)
239
237
  end
240
238
 
241
- # Normalises/escapes self and returns a new Wgit::Url. Self isn't modified.
239
+ # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
240
+ # This should be used before GET'ing the url, in case it has IRI chars.
242
241
  #
243
242
  # @return [Wgit::Url] An escaped version of self.
244
243
  def normalize
@@ -249,8 +248,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
249
248
  # modify the receiver.
250
249
  #
251
250
  # If self is absolute then it's returned as is, making this method
252
- # idempotent. The doc's <base> element is used if present, otherwise
253
- # doc.url is used as the base; which is concatted with self.
251
+ # idempotent. The doc's `<base>` element is used if present, otherwise
252
+ # `doc.url` is used as the base; which is concatted with self.
254
253
  #
255
254
  # Typically used to build an absolute link obtained from a document.
256
255
  #
@@ -258,14 +257,14 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
258
257
  # link = Wgit::Url.new('/favicon.png')
259
258
  # doc = Wgit::Document.new('http://example.com')
260
259
  #
261
- # link.prefix_base(doc) # => "http://example.com/favicon.png"
260
+ # link.make_absolute(doc) # => "http://example.com/favicon.png"
262
261
  #
263
262
  # @param doc [Wgit::Document] The doc whose base Url is concatted with
264
263
  # self.
265
264
  # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
266
265
  # raises an Exception.
267
266
  # @return [Wgit::Url] Self in absolute form.
268
- def prefix_base(doc)
267
+ def make_absolute(doc)
269
268
  assert_type(doc, Wgit::Document)
270
269
 
271
270
  absolute? ? self : doc.base_url(link: self).concat(self)
@@ -294,8 +293,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
294
293
  #
295
294
  # @return [Hash] self's instance vars as a Hash.
296
295
  def to_h
297
- ignore = ['@uri']
298
- h = Wgit::Utils.to_h(self, ignore: ignore)
296
+ h = Wgit::Utils.to_h(self, ignore: ['@uri'])
299
297
  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
300
298
  end
301
299
 
@@ -338,6 +336,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
338
336
  host ? Wgit::Url.new(host) : nil
339
337
  end
340
338
 
339
+ # Returns a new Wgit::Url containing just the port of this URL e.g.
340
+ # Given http://www.google.co.uk:443/about.html, '443' is returned.
341
+ #
342
+ # @return [Wgit::Url, nil] Containing just the port or nil.
343
+ def to_port
344
+ port = @uri.port
345
+
346
+ # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
347
+ return nil unless port
348
+ return nil unless include?(":#{port}")
349
+
350
+ Wgit::Url.new(port.to_s)
351
+ end
352
+
341
353
  # Returns a new Wgit::Url containing just the domain of this URL e.g.
342
354
  # Given http://www.google.co.uk/about.html, google.co.uk is returned.
343
355
  #
@@ -347,6 +359,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
347
359
  domain ? Wgit::Url.new(domain) : nil
348
360
  end
349
361
 
362
+ # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
363
+ # Given http://scripts.dev.google.com, scripts.dev is returned.
364
+ #
365
+ # @return [Wgit::Url, nil] Containing just the sub domain or nil.
366
+ def to_sub_domain
367
+ return nil unless to_host
368
+
369
+ dot_domain = ".#{to_domain}"
370
+ return nil unless include?(dot_domain)
371
+
372
+ sub_domain = to_host.sub(dot_domain, '')
373
+ Wgit::Url.new(sub_domain)
374
+ end
375
+
350
376
  # Returns a new Wgit::Url containing just the brand of this URL e.g.
351
377
  # Given http://www.google.co.uk/about.html, google is returned.
352
378
  #
@@ -362,12 +388,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
362
388
  # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
363
389
  # nil.
364
390
  def to_base
365
- return nil if @uri.scheme.nil? || @uri.host.nil?
391
+ return nil unless @uri.scheme && @uri.host
366
392
 
367
393
  base = "#{@uri.scheme}://#{@uri.host}"
368
394
  Wgit::Url.new(base)
369
395
  end
370
396
 
397
+ # Returns only the origin of this URL e.g. the protocol scheme, host and
398
+ # port combined. For http://localhost:3000/api, http://localhost:3000 gets
399
+ # returned. If there's no port present, then to_base is returned.
400
+ #
401
+ # @return [Wgit::Url, nil] The origin of self or nil.
402
+ def to_origin
403
+ return nil unless to_base
404
+ return to_base unless to_port
405
+
406
+ Wgit::Url.new("#{to_base}:#{to_port}")
407
+ end
408
+
371
409
  # Returns the path of this URL e.g. the bit after the host without slashes.
372
410
  # For example:
373
411
  # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
@@ -396,7 +434,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
396
434
  end
397
435
 
398
436
  # Returns a new Wgit::Url containing just the query string of this URL
399
- # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
437
+ # e.g. Given http://google.com?q=foo&bar=1, 'q=foo&bar=1' is returned.
400
438
  #
401
439
  # @return [Wgit::Url, nil] Containing just the query string or nil.
402
440
  def to_query
@@ -404,6 +442,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
404
442
  query ? Wgit::Url.new(query) : nil
405
443
  end
406
444
 
445
+ # Returns a Hash containing just the query string parameters of this URL
446
+ # e.g. Given http://google.com?q=ruby, { 'q' => 'ruby' } is returned.
447
+ #
448
+ # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
449
+ # true, Strings otherwise.
450
+ # @return [Hash<String | Symbol, String>] Containing the query string
451
+ # params or empty if the URL doesn't contain any query parameters.
452
+ def to_query_hash(symbolize_keys: false)
453
+ query_str = to_query
454
+ return {} unless query_str
455
+
456
+ query_str.split('&').each_with_object({}) do |param, hash|
457
+ k, v = param.split('=')
458
+ k = k.to_sym if symbolize_keys
459
+ hash[k] = v
460
+ end
461
+ end
462
+
407
463
  # Returns a new Wgit::Url containing just the fragment string of this URL
408
464
  # e.g. Given http://google.com#about, #about is returned.
409
465
  #
@@ -425,6 +481,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
425
481
  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
426
482
  end
427
483
 
484
+ # Returns a new Wgit::Url containing just the username string of this URL
485
+ # e.g. Given http://me:pass1@example.com, me is returned.
486
+ #
487
+ # @return [Wgit::Url, nil] Containing just the user string or nil.
488
+ def to_user
489
+ user = @uri.user
490
+ user ? Wgit::Url.new(user) : nil
491
+ end
492
+
493
+ # Returns a new Wgit::Url containing just the password string of this URL
494
+ # e.g. Given http://me:pass1@example.com, pass1 is returned.
495
+ #
496
+ # @return [Wgit::Url, nil] Containing just the password string or nil.
497
+ def to_password
498
+ password = @uri.password
499
+ password ? Wgit::Url.new(password) : nil
500
+ end
501
+
428
502
  # Omits the given URL components from self and returns a new Wgit::Url.
429
503
  #
430
504
  # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
@@ -468,7 +542,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
468
542
  .omit_trailing_slash
469
543
  end
470
544
 
471
- # Returns a new Wgit::Url with the base (proto and host) removed e.g. Given
545
+ # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
472
546
  # http://google.com/search?q=something#about, search?q=something#about is
473
547
  # returned. If relative and base isn't present then self is returned.
474
548
  # Leading and trailing slashes are always stripped from the return value.
@@ -483,6 +557,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
483
557
  Wgit::Url.new(omit_base).omit_slashes
484
558
  end
485
559
 
560
+ # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
561
+ # http://google.com:81/search?q=something#about, search?q=something#about is
562
+ # returned. If relative and origin isn't present then self is returned.
563
+ # Leading and trailing slashes are always stripped from the return value.
564
+ #
565
+ # @return [Wgit::Url] Self containing everything after the origin.
566
+ def omit_origin
567
+ origin = to_origin
568
+ omit_origin = origin ? gsub(origin, '') : self
569
+
570
+ return self if ['', '/'].include?(omit_origin)
571
+
572
+ Wgit::Url.new(omit_origin).omit_slashes
573
+ end
574
+
486
575
  # Returns a new Wgit::Url with the query string portion removed e.g. Given
487
576
  # http://google.com/search?q=hello, http://google.com/search is
488
577
  # returned. Self is returned as is if no query string is present. A URL
@@ -528,25 +617,38 @@ protocol scheme and domain (e.g. http://example.com): #{url}"
528
617
  start_with?('#')
529
618
  end
530
619
 
620
+ # Returns true if self equals '/' a.k.a. index.
621
+ #
622
+ # @return [Boolean] True if self equals '/', false otherwise.
623
+ def index?
624
+ self == '/'
625
+ end
626
+
531
627
  alias + concat
532
628
  alias crawled? crawled
533
- alias normalise normalize
534
629
  alias is_relative? relative?
535
630
  alias is_absolute? absolute?
536
631
  alias is_valid? valid?
537
632
  alias is_query? query?
538
633
  alias is_fragment? fragment?
634
+ alias is_index? index?
539
635
  alias uri to_uri
540
636
  alias url to_url
541
637
  alias scheme to_scheme
542
638
  alias host to_host
639
+ alias port to_port
543
640
  alias domain to_domain
544
641
  alias brand to_brand
545
642
  alias base to_base
643
+ alias origin to_origin
546
644
  alias path to_path
547
645
  alias endpoint to_endpoint
548
646
  alias query to_query
647
+ alias query_hash to_query_hash
549
648
  alias fragment to_fragment
550
649
  alias extension to_extension
650
+ alias user to_user
651
+ alias password to_password
652
+ alias sub_domain to_sub_domain
551
653
  end
552
654
  end
@@ -145,7 +145,8 @@ module Wgit
145
145
  # @param keyword_limit [Integer] The max amount of keywords to be
146
146
  # outputted to the stream.
147
147
  # @param stream [#puts] Any object that respond_to?(:puts). It is used
148
- # to output text somewhere e.g. a file or STDOUT.
148
+ # to output text somewhere e.g. a file or STDERR.
149
+ # @return [Integer] The number of results.
149
150
  def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
150
151
  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
151
152
 
@@ -162,18 +163,37 @@ module Wgit
162
163
  stream.puts
163
164
  end
164
165
 
165
- nil
166
+ results.size
166
167
  end
167
168
 
168
- # Processes a String to make it uniform. Strips any leading/trailing white
169
+ # Sanitises the obj to make it uniform by calling the correct sanitize_*
170
+ # method for its type e.g. if obj.is_a? String then sanitize(obj). Any type
171
+ # not in the case statement will be ignored and returned as is.
172
+ #
173
+ # @param obj [Object] The object to be sanitized.
174
+ # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
175
+ # invalid characters.
176
+ # @return [Object] The sanitized obj is both modified and then returned.
177
+ def self.sanitize(obj, encode: true)
178
+ case obj
179
+ when String
180
+ sanitize_str(obj, encode: encode)
181
+ when Array
182
+ sanitize_arr(obj, encode: encode)
183
+ else
184
+ obj
185
+ end
186
+ end
187
+
188
+ # Sanitises a String to make it uniform. Strips any leading/trailing white
169
189
  # space. Also applies UTF-8 encoding (replacing invalid characters) if
170
190
  # `encode: true`.
171
191
  #
172
- # @param str [String] The String to process. str is modified.
192
+ # @param str [String] The String to sanitize. str is modified.
173
193
  # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
174
194
  # invalid characters.
175
- # @return [String] The processed str is both modified and then returned.
176
- def self.process_str(str, encode: true)
195
+ # @return [String] The sanitized str is both modified and then returned.
196
+ def self.sanitize_str(str, encode: true)
177
197
  if str.is_a?(String)
178
198
  str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
179
199
  str.strip!
@@ -182,15 +202,15 @@ module Wgit
182
202
  str
183
203
  end
184
204
 
185
- # Processes an Array to make it uniform. Removes empty Strings and nils,
186
- # processes non empty Strings using Wgit::Utils.process_str and removes
205
+ # Sanitises an Array to make it uniform. Removes empty Strings and nils,
206
+ # processes non empty Strings using Wgit::Utils.sanitize and removes
187
207
  # duplicates.
188
208
  #
189
- # @param arr [Enumerable] The Array to process. arr is modified.
190
- # @return [Enumerable] The processed arr is both modified and then returned.
191
- def self.process_arr(arr, encode: true)
209
+ # @param arr [Enumerable] The Array to sanitize. arr is modified.
210
+ # @return [Enumerable] The sanitized arr is both modified and then returned.
211
+ def self.sanitize_arr(arr, encode: true)
192
212
  if arr.is_a?(Array)
193
- arr.map! { |str| process_str(str, encode: encode) }
213
+ arr.map! { |str| sanitize(str, encode: encode) }
194
214
  arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
195
215
  arr.compact!
196
216
  arr.uniq!
@@ -198,13 +218,5 @@ module Wgit
198
218
 
199
219
  arr
200
220
  end
201
-
202
- # Returns the model having removed non bson types (for use with MongoDB).
203
- #
204
- # @param model_hash [Hash] The model Hash to process.
205
- # @return [Hash] The model Hash with non bson types removed.
206
- def self.remove_non_bson_types(model_hash)
207
- model_hash.select { |_k, v| v.respond_to?(:bson_type) }
208
- end
209
221
  end
210
222
  end
@@ -2,10 +2,11 @@
2
2
 
3
3
  # Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
4
4
  # contents for later use.
5
+ #
5
6
  # @author Michael Telford
6
7
  module Wgit
7
8
  # The current gem version of Wgit.
8
- VERSION = '0.8.0'
9
+ VERSION = '0.9.0'
9
10
 
10
11
  # Returns the current gem version of Wgit as a String.
11
12
  def self.version