wgit 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/.yardopts +1 -1
 - data/CHANGELOG.md +39 -0
 - data/LICENSE.txt +1 -1
 - data/README.md +118 -323
 - data/bin/wgit +9 -5
 - data/lib/wgit.rb +3 -1
 - data/lib/wgit/assertable.rb +3 -3
 - data/lib/wgit/base.rb +30 -0
 - data/lib/wgit/crawler.rb +206 -76
 - data/lib/wgit/database/database.rb +309 -134
 - data/lib/wgit/database/model.rb +10 -3
 - data/lib/wgit/document.rb +138 -95
 - data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
 - data/lib/wgit/dsl.rb +324 -0
 - data/lib/wgit/indexer.rb +65 -162
 - data/lib/wgit/response.rb +5 -2
 - data/lib/wgit/url.rb +133 -31
 - data/lib/wgit/utils.rb +32 -20
 - data/lib/wgit/version.rb +2 -1
 - metadata +26 -14
 
    
        data/lib/wgit/response.rb
    CHANGED
    
    | 
         @@ -1,5 +1,5 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Wgit
         
     | 
| 
       2 
     | 
    
         
            -
              # Response class  
     | 
| 
      
 2 
     | 
    
         
            +
              # Response class modeling a generic HTTP GET response.
         
     | 
| 
       3 
3 
     | 
    
         
             
              class Response
         
     | 
| 
       4 
4 
     | 
    
         
             
                # The underlying HTTP adapter/library response object.
         
     | 
| 
       5 
5 
     | 
    
         
             
                attr_accessor :adapter_response
         
     | 
| 
         @@ -69,7 +69,10 @@ module Wgit 
     | 
|
| 
       69 
69 
     | 
    
         
             
                # @param headers [Hash] The new response headers.
         
     | 
| 
       70 
70 
     | 
    
         
             
                # @return [Hash] @headers's new value.
         
     | 
| 
       71 
71 
     | 
    
         
             
                def headers=(headers)
         
     | 
| 
       72 
     | 
    
         
            -
                   
     | 
| 
      
 72 
     | 
    
         
            +
                  unless headers
         
     | 
| 
      
 73 
     | 
    
         
            +
                    @headers = {}
         
     | 
| 
      
 74 
     | 
    
         
            +
                    return
         
     | 
| 
      
 75 
     | 
    
         
            +
                  end
         
     | 
| 
       73 
76 
     | 
    
         | 
| 
       74 
77 
     | 
    
         
             
                  @headers = headers.map do |k, v|
         
     | 
| 
       75 
78 
     | 
    
         
             
                    k = k.downcase.gsub('-', '_').to_sym
         
     | 
    
        data/lib/wgit/url.rb
    CHANGED
    
    | 
         @@ -6,15 +6,15 @@ require 'uri' 
     | 
|
| 
       6 
6 
     | 
    
         
             
            require 'addressable/uri'
         
     | 
| 
       7 
7 
     | 
    
         | 
| 
       8 
8 
     | 
    
         
             
            module Wgit
         
     | 
| 
       9 
     | 
    
         
            -
              # Class modeling a web based HTTP URL.
         
     | 
| 
      
 9 
     | 
    
         
            +
              # Class modeling/serialising a web based HTTP URL.
         
     | 
| 
       10 
10 
     | 
    
         
             
              #
         
     | 
| 
       11 
11 
     | 
    
         
             
              # Can be an internal/relative link e.g. "about.html" or an absolute URL
         
     | 
| 
       12 
     | 
    
         
            -
              # e.g. "http://www.google.co.uk". Is a subclass of String and uses  
     | 
| 
       13 
     | 
    
         
            -
              #  
     | 
| 
      
 12 
     | 
    
         
            +
              # e.g. "http://www.google.co.uk". Is a subclass of String and uses `URI` and
         
     | 
| 
      
 13 
     | 
    
         
            +
              # `addressable/uri` internally for parsing.
         
     | 
| 
       14 
14 
     | 
    
         
             
              #
         
     | 
| 
       15 
     | 
    
         
            -
              # Most of the methods in this class return new Wgit::Url instances making 
     | 
| 
       16 
     | 
    
         
            -
              # method calls chainable e.g. url.omit_base.omit_fragment etc. The 
     | 
| 
       17 
     | 
    
         
            -
              # also try to be idempotent where possible.
         
     | 
| 
      
 15 
     | 
    
         
            +
              # Most of the methods in this class return new `Wgit::Url` instances making
         
     | 
| 
      
 16 
     | 
    
         
            +
              # the method calls chainable e.g. `url.omit_base.omit_fragment` etc. The
         
     | 
| 
      
 17 
     | 
    
         
            +
              # methods also try to be idempotent where possible.
         
     | 
| 
       18 
18 
     | 
    
         
             
              class Url < String
         
     | 
| 
       19 
19 
     | 
    
         
             
                include Assertable
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
         @@ -28,7 +28,7 @@ module Wgit 
     | 
|
| 
       28 
28 
     | 
    
         
             
                # The duration of the crawl for this Url (in seconds).
         
     | 
| 
       29 
29 
     | 
    
         
             
                attr_accessor :crawl_duration
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
       31 
     | 
    
         
            -
                # Initializes a new instance of Wgit::Url which  
     | 
| 
      
 31 
     | 
    
         
            +
                # Initializes a new instance of Wgit::Url which models a web based
         
     | 
| 
       32 
32 
     | 
    
         
             
                # HTTP URL.
         
     | 
| 
       33 
33 
     | 
    
         
             
                #
         
     | 
| 
       34 
34 
     | 
    
         
             
                # @param url_or_obj [String, Wgit::Url, #fetch#[]] Is either a String
         
     | 
| 
         @@ -99,10 +99,10 @@ module Wgit 
     | 
|
| 
       99 
99 
     | 
    
         
             
                # @param obj [Object] The object to parse, which #is_a?(String).
         
     | 
| 
       100 
100 
     | 
    
         
             
                # @raise [StandardError] If obj.is_a?(String) is false.
         
     | 
| 
       101 
101 
     | 
    
         
             
                # @return [Wgit::Url] A Wgit::Url instance or nil (if obj is invalid).
         
     | 
| 
       102 
     | 
    
         
            -
                def self. 
     | 
| 
      
 102 
     | 
    
         
            +
                def self.parse?(obj)
         
     | 
| 
       103 
103 
     | 
    
         
             
                  parse(obj)
         
     | 
| 
       104 
104 
     | 
    
         
             
                rescue Addressable::URI::InvalidURIError
         
     | 
| 
       105 
     | 
    
         
            -
                  Wgit.logger.debug("Wgit::Url. 
     | 
| 
      
 105 
     | 
    
         
            +
                  Wgit.logger.debug("Wgit::Url.parse?('#{obj}') exception: \
         
     | 
| 
       106 
106 
     | 
    
         
             
            Addressable::URI::InvalidURIError")
         
     | 
| 
       107 
107 
     | 
    
         
             
                  nil
         
     | 
| 
       108 
108 
     | 
    
         
             
                end
         
     | 
| 
         @@ -115,8 +115,6 @@ Addressable::URI::InvalidURIError") 
     | 
|
| 
       115 
115 
     | 
    
         
             
                def crawled=(bool)
         
     | 
| 
       116 
116 
     | 
    
         
             
                  @crawled      = bool
         
     | 
| 
       117 
117 
     | 
    
         
             
                  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
         
     | 
| 
       118 
     | 
    
         
            -
             
     | 
| 
       119 
     | 
    
         
            -
                  bool
         
     | 
| 
       120 
118 
     | 
    
         
             
                end
         
     | 
| 
       121 
119 
     | 
    
         | 
| 
       122 
120 
     | 
    
         
             
                # Overrides String#replace setting the new_url @uri and String value.
         
     | 
| 
         @@ -146,10 +144,10 @@ Addressable::URI::InvalidURIError") 
     | 
|
| 
       146 
144 
     | 
    
         
             
                # @param opts [Hash] The options with which to check relativity. Only one
         
     | 
| 
       147 
145 
     | 
    
         
             
                #   opts param should be provided. The provided opts param Url must be
         
     | 
| 
       148 
146 
     | 
    
         
             
                #   absolute and be prefixed with a scheme. Consider using the output of
         
     | 
| 
       149 
     | 
    
         
            -
                #   Wgit::Url# 
     | 
| 
       150 
     | 
    
         
            -
                # @option opts [Wgit::Url, String] : 
     | 
| 
       151 
     | 
    
         
            -
                #   http://www.google.com/how which gives a  
     | 
| 
       152 
     | 
    
         
            -
                #   'http://www.google.com'.
         
     | 
| 
      
 147 
     | 
    
         
            +
                #   Wgit::Url#to_origin which should work (unless it's nil).
         
     | 
| 
      
 148 
     | 
    
         
            +
                # @option opts [Wgit::Url, String] :origin The Url origin e.g.
         
     | 
| 
      
 149 
     | 
    
         
            +
                #   http://www.google.com:81/how which gives an origin of
         
     | 
| 
      
 150 
     | 
    
         
            +
                #   'http://www.google.com:81'.
         
     | 
| 
       153 
151 
     | 
    
         
             
                # @option opts [Wgit::Url, String] :host The Url host e.g.
         
     | 
| 
       154 
152 
     | 
    
         
             
                #   http://www.google.com/how which gives a host of 'www.google.com'.
         
     | 
| 
       155 
153 
     | 
    
         
             
                # @option opts [Wgit::Url, String] :domain The Url domain e.g.
         
     | 
| 
         @@ -160,7 +158,7 @@ Addressable::URI::InvalidURIError") 
     | 
|
| 
       160 
158 
     | 
    
         
             
                #   param has been provided.
         
     | 
| 
       161 
159 
     | 
    
         
             
                # @return [Boolean] True if relative, false if absolute.
         
     | 
| 
       162 
160 
     | 
    
         
             
                def relative?(opts = {})
         
     | 
| 
       163 
     | 
    
         
            -
                  defaults = {  
     | 
| 
      
 161 
     | 
    
         
            +
                  defaults = { origin: nil, host: nil, domain: nil, brand: nil }
         
     | 
| 
       164 
162 
     | 
    
         
             
                  opts = defaults.merge(opts)
         
     | 
| 
       165 
163 
     | 
    
         
             
                  raise 'Url (self) cannot be empty' if empty?
         
     | 
| 
       166 
164 
     | 
    
         | 
| 
         @@ -180,8 +178,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       180 
178 
     | 
    
         
             
                  end
         
     | 
| 
       181 
179 
     | 
    
         | 
| 
       182 
180 
     | 
    
         
             
                  case type
         
     | 
| 
       183 
     | 
    
         
            -
                  when : 
     | 
| 
       184 
     | 
    
         
            -
                     
     | 
| 
      
 181 
     | 
    
         
            +
                  when :origin # http://www.google.com:81
         
     | 
| 
      
 182 
     | 
    
         
            +
                    to_origin == url.to_origin
         
     | 
| 
       185 
183 
     | 
    
         
             
                  when :host   # www.google.com
         
     | 
| 
       186 
184 
     | 
    
         
             
                    to_host   == url.to_host
         
     | 
| 
       187 
185 
     | 
    
         
             
                  when :domain # google.com
         
     | 
| 
         @@ -206,8 +204,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       206 
204 
     | 
    
         
             
                # @return [Boolean] True if valid, absolute and crawlable, otherwise false.
         
     | 
| 
       207 
205 
     | 
    
         
             
                def valid?
         
     | 
| 
       208 
206 
     | 
    
         
             
                  return false if relative?
         
     | 
| 
       209 
     | 
    
         
            -
                  return false unless  
     | 
| 
       210 
     | 
    
         
            -
                  return false  
     | 
| 
      
 207 
     | 
    
         
            +
                  return false unless to_origin && to_domain
         
     | 
| 
      
 208 
     | 
    
         
            +
                  return false unless URI::DEFAULT_PARSER.make_regexp.match(normalize)
         
     | 
| 
       211 
209 
     | 
    
         | 
| 
       212 
210 
     | 
    
         
             
                  true
         
     | 
| 
       213 
211 
     | 
    
         
             
                end
         
     | 
| 
         @@ -238,7 +236,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       238 
236 
     | 
    
         
             
                  Wgit::Url.new(concatted)
         
     | 
| 
       239 
237 
     | 
    
         
             
                end
         
     | 
| 
       240 
238 
     | 
    
         | 
| 
       241 
     | 
    
         
            -
                #  
     | 
| 
      
 239 
     | 
    
         
            +
                # Normalizes/escapes self and returns a new Wgit::Url. Self isn't modified.
         
     | 
| 
      
 240 
     | 
    
         
            +
                # This should be used before GET'ing the url, in case it has IRI chars.
         
     | 
| 
       242 
241 
     | 
    
         
             
                #
         
     | 
| 
       243 
242 
     | 
    
         
             
                # @return [Wgit::Url] An escaped version of self.
         
     | 
| 
       244 
243 
     | 
    
         
             
                def normalize
         
     | 
| 
         @@ -249,8 +248,8 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       249 
248 
     | 
    
         
             
                # modify the receiver.
         
     | 
| 
       250 
249 
     | 
    
         
             
                #
         
     | 
| 
       251 
250 
     | 
    
         
             
                # If self is absolute then it's returned as is, making this method
         
     | 
| 
       252 
     | 
    
         
            -
                # idempotent. The doc's  
     | 
| 
       253 
     | 
    
         
            -
                # doc.url is used as the base; which is concatted with self.
         
     | 
| 
      
 251 
     | 
    
         
            +
                # idempotent. The doc's `<base>` element is used if present, otherwise
         
     | 
| 
      
 252 
     | 
    
         
            +
                # `doc.url` is used as the base; which is concatted with self.
         
     | 
| 
       254 
253 
     | 
    
         
             
                #
         
     | 
| 
       255 
254 
     | 
    
         
             
                # Typically used to build an absolute link obtained from a document.
         
     | 
| 
       256 
255 
     | 
    
         
             
                #
         
     | 
| 
         @@ -258,14 +257,14 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       258 
257 
     | 
    
         
             
                #   link = Wgit::Url.new('/favicon.png')
         
     | 
| 
       259 
258 
     | 
    
         
             
                #   doc  = Wgit::Document.new('http://example.com')
         
     | 
| 
       260 
259 
     | 
    
         
             
                #
         
     | 
| 
       261 
     | 
    
         
            -
                #   link. 
     | 
| 
      
 260 
     | 
    
         
            +
                #   link.make_absolute(doc) # => "http://example.com/favicon.png"
         
     | 
| 
       262 
261 
     | 
    
         
             
                #
         
     | 
| 
       263 
262 
     | 
    
         
             
                # @param doc [Wgit::Document] The doc whose base Url is concatted with
         
     | 
| 
       264 
263 
     | 
    
         
             
                #   self.
         
     | 
| 
       265 
264 
     | 
    
         
             
                # @raise [StandardError] If doc isn't a Wgit::Document or if `doc.base_url`
         
     | 
| 
       266 
265 
     | 
    
         
             
                #   raises an Exception.
         
     | 
| 
       267 
266 
     | 
    
         
             
                # @return [Wgit::Url] Self in absolute form.
         
     | 
| 
       268 
     | 
    
         
            -
                def  
     | 
| 
      
 267 
     | 
    
         
            +
                def make_absolute(doc)
         
     | 
| 
       269 
268 
     | 
    
         
             
                  assert_type(doc, Wgit::Document)
         
     | 
| 
       270 
269 
     | 
    
         | 
| 
       271 
270 
     | 
    
         
             
                  absolute? ? self : doc.base_url(link: self).concat(self)
         
     | 
| 
         @@ -294,8 +293,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       294 
293 
     | 
    
         
             
                #
         
     | 
| 
       295 
294 
     | 
    
         
             
                # @return [Hash] self's instance vars as a Hash.
         
     | 
| 
       296 
295 
     | 
    
         
             
                def to_h
         
     | 
| 
       297 
     | 
    
         
            -
                   
     | 
| 
       298 
     | 
    
         
            -
                  h = Wgit::Utils.to_h(self, ignore: ignore)
         
     | 
| 
      
 296 
     | 
    
         
            +
                  h = Wgit::Utils.to_h(self, ignore: ['@uri'])
         
     | 
| 
       299 
297 
     | 
    
         
             
                  Hash[h.to_a.insert(0, ['url', self])] # Insert url at position 0.
         
     | 
| 
       300 
298 
     | 
    
         
             
                end
         
     | 
| 
       301 
299 
     | 
    
         | 
| 
         @@ -338,6 +336,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       338 
336 
     | 
    
         
             
                  host ? Wgit::Url.new(host) : nil
         
     | 
| 
       339 
337 
     | 
    
         
             
                end
         
     | 
| 
       340 
338 
     | 
    
         | 
| 
      
 339 
     | 
    
         
            +
                # Returns a new Wgit::Url containing just the port of this URL e.g.
         
     | 
| 
      
 340 
     | 
    
         
            +
                # Given http://www.google.co.uk:443/about.html, '443' is returned.
         
     | 
| 
      
 341 
     | 
    
         
            +
                #
         
     | 
| 
      
 342 
     | 
    
         
            +
                # @return [Wgit::Url, nil] Containing just the port or nil.
         
     | 
| 
      
 343 
     | 
    
         
            +
                def to_port
         
     | 
| 
      
 344 
     | 
    
         
            +
                  port = @uri.port
         
     | 
| 
      
 345 
     | 
    
         
            +
             
     | 
| 
      
 346 
     | 
    
         
            +
                  # @uri.port defaults port to 80/443 if missing, so we check for :#{port}.
         
     | 
| 
      
 347 
     | 
    
         
            +
                  return nil unless port
         
     | 
| 
      
 348 
     | 
    
         
            +
                  return nil unless include?(":#{port}")
         
     | 
| 
      
 349 
     | 
    
         
            +
             
     | 
| 
      
 350 
     | 
    
         
            +
                  Wgit::Url.new(port.to_s)
         
     | 
| 
      
 351 
     | 
    
         
            +
                end
         
     | 
| 
      
 352 
     | 
    
         
            +
             
     | 
| 
       341 
353 
     | 
    
         
             
                # Returns a new Wgit::Url containing just the domain of this URL e.g.
         
     | 
| 
       342 
354 
     | 
    
         
             
                # Given http://www.google.co.uk/about.html, google.co.uk is returned.
         
     | 
| 
       343 
355 
     | 
    
         
             
                #
         
     | 
| 
         @@ -347,6 +359,20 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       347 
359 
     | 
    
         
             
                  domain ? Wgit::Url.new(domain) : nil
         
     | 
| 
       348 
360 
     | 
    
         
             
                end
         
     | 
| 
       349 
361 
     | 
    
         | 
| 
      
 362 
     | 
    
         
            +
                # Returns a new Wgit::Url containing just the sub domain of this URL e.g.
         
     | 
| 
      
 363 
     | 
    
         
            +
                # Given http://scripts.dev.google.com, scripts.dev is returned.
         
     | 
| 
      
 364 
     | 
    
         
            +
                #
         
     | 
| 
      
 365 
     | 
    
         
            +
                # @return [Wgit::Url, nil] Containing just the sub domain or nil.
         
     | 
| 
      
 366 
     | 
    
         
            +
                def to_sub_domain
         
     | 
| 
      
 367 
     | 
    
         
            +
                  return nil unless to_host
         
     | 
| 
      
 368 
     | 
    
         
            +
             
     | 
| 
      
 369 
     | 
    
         
            +
                  dot_domain = ".#{to_domain}"
         
     | 
| 
      
 370 
     | 
    
         
            +
                  return nil unless include?(dot_domain)
         
     | 
| 
      
 371 
     | 
    
         
            +
             
     | 
| 
      
 372 
     | 
    
         
            +
                  sub_domain = to_host.sub(dot_domain, '')
         
     | 
| 
      
 373 
     | 
    
         
            +
                  Wgit::Url.new(sub_domain)
         
     | 
| 
      
 374 
     | 
    
         
            +
                end
         
     | 
| 
      
 375 
     | 
    
         
            +
             
     | 
| 
       350 
376 
     | 
    
         
             
                # Returns a new Wgit::Url containing just the brand of this URL e.g.
         
     | 
| 
       351 
377 
     | 
    
         
             
                # Given http://www.google.co.uk/about.html, google is returned.
         
     | 
| 
       352 
378 
     | 
    
         
             
                #
         
     | 
| 
         @@ -362,12 +388,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       362 
388 
     | 
    
         
             
                # @return [Wgit::Url, nil] The base of self e.g. http://www.google.co.uk or
         
     | 
| 
       363 
389 
     | 
    
         
             
                #   nil.
         
     | 
| 
       364 
390 
     | 
    
         
             
                def to_base
         
     | 
| 
       365 
     | 
    
         
            -
                  return nil  
     | 
| 
      
 391 
     | 
    
         
            +
                  return nil unless @uri.scheme && @uri.host
         
     | 
| 
       366 
392 
     | 
    
         | 
| 
       367 
393 
     | 
    
         
             
                  base = "#{@uri.scheme}://#{@uri.host}"
         
     | 
| 
       368 
394 
     | 
    
         
             
                  Wgit::Url.new(base)
         
     | 
| 
       369 
395 
     | 
    
         
             
                end
         
     | 
| 
       370 
396 
     | 
    
         | 
| 
      
 397 
     | 
    
         
            +
                # Returns only the origin of this URL e.g. the protocol scheme, host and
         
     | 
| 
      
 398 
     | 
    
         
            +
                # port combined. For http://localhost:3000/api, http://localhost:3000 gets
         
     | 
| 
      
 399 
     | 
    
         
            +
                # returned. If there's no port present, then to_base is returned.
         
     | 
| 
      
 400 
     | 
    
         
            +
                #
         
     | 
| 
      
 401 
     | 
    
         
            +
                # @return [Wgit::Url, nil] The origin of self or nil.
         
     | 
| 
      
 402 
     | 
    
         
            +
                def to_origin
         
     | 
| 
      
 403 
     | 
    
         
            +
                  return nil unless to_base
         
     | 
| 
      
 404 
     | 
    
         
            +
                  return to_base unless to_port
         
     | 
| 
      
 405 
     | 
    
         
            +
             
     | 
| 
      
 406 
     | 
    
         
            +
                  Wgit::Url.new("#{to_base}:#{to_port}")
         
     | 
| 
      
 407 
     | 
    
         
            +
                end
         
     | 
| 
      
 408 
     | 
    
         
            +
             
     | 
| 
       371 
409 
     | 
    
         
             
                # Returns the path of this URL e.g. the bit after the host without slashes.
         
     | 
| 
       372 
410 
     | 
    
         
             
                # For example:
         
     | 
| 
       373 
411 
     | 
    
         
             
                # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
         
     | 
| 
         @@ -396,7 +434,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       396 
434 
     | 
    
         
             
                end
         
     | 
| 
       397 
435 
     | 
    
         | 
| 
       398 
436 
     | 
    
         
             
                # Returns a new Wgit::Url containing just the query string of this URL
         
     | 
| 
       399 
     | 
    
         
            -
                # e.g. Given http://google.com?q= 
     | 
| 
      
 437 
     | 
    
         
            +
                # e.g. Given http://google.com?q=ruby&bar=1, 'q=ruby&bar=1' is returned.
         
     | 
| 
       400 
438 
     | 
    
         
             
                #
         
     | 
| 
       401 
439 
     | 
    
         
             
                # @return [Wgit::Url, nil] Containing just the query string or nil.
         
     | 
| 
       402 
440 
     | 
    
         
             
                def to_query
         
     | 
| 
         @@ -404,6 +442,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       404 
442 
     | 
    
         
             
                  query ? Wgit::Url.new(query) : nil
         
     | 
| 
       405 
443 
     | 
    
         
             
                end
         
     | 
| 
       406 
444 
     | 
    
         | 
| 
      
 445 
     | 
    
         
            +
                # Returns a Hash containing just the query string parameters of this URL
         
     | 
| 
      
 446 
     | 
    
         
            +
                # e.g. Given http://google.com?q=ruby, "{ 'q' => 'ruby' }" is returned.
         
     | 
| 
      
 447 
     | 
    
         
            +
                #
         
     | 
| 
      
 448 
     | 
    
         
            +
                # @param symbolize_keys [Boolean] The returned Hash keys will be Symbols if
         
     | 
| 
      
 449 
     | 
    
         
            +
                #   true, Strings otherwise.
         
     | 
| 
      
 450 
     | 
    
         
            +
                # @return [Hash<String | Symbol, String>] Containing the query string
         
     | 
| 
      
 451 
     | 
    
         
            +
                #   params or empty if the URL doesn't contain any query parameters.
         
     | 
| 
      
 452 
     | 
    
         
            +
                def to_query_hash(symbolize_keys: false)
         
     | 
| 
      
 453 
     | 
    
         
            +
                  query_str = to_query
         
     | 
| 
      
 454 
     | 
    
         
            +
                  return {} unless query_str
         
     | 
| 
      
 455 
     | 
    
         
            +
             
     | 
| 
      
 456 
     | 
    
         
            +
                  query_str.split('&').each_with_object({}) do |param, hash|
         
     | 
| 
      
 457 
     | 
    
         
            +
                    k, v = param.split('=')
         
     | 
| 
      
 458 
     | 
    
         
            +
                    k = k.to_sym if symbolize_keys
         
     | 
| 
      
 459 
     | 
    
         
            +
                    hash[k] = v
         
     | 
| 
      
 460 
     | 
    
         
            +
                  end
         
     | 
| 
      
 461 
     | 
    
         
            +
                end
         
     | 
| 
      
 462 
     | 
    
         
            +
             
     | 
| 
       407 
463 
     | 
    
         
             
                # Returns a new Wgit::Url containing just the fragment string of this URL
         
     | 
| 
       408 
464 
     | 
    
         
             
                # e.g. Given http://google.com#about, #about is returned.
         
     | 
| 
       409 
465 
     | 
    
         
             
                #
         
     | 
| 
         @@ -425,6 +481,24 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       425 
481 
     | 
    
         
             
                  segs.length > 1 ? Wgit::Url.new(segs.last) : nil
         
     | 
| 
       426 
482 
     | 
    
         
             
                end
         
     | 
| 
       427 
483 
     | 
    
         | 
| 
      
 484 
     | 
    
         
            +
                # Returns a new Wgit::Url containing just the username string of this URL
         
     | 
| 
      
 485 
     | 
    
         
            +
                # e.g. Given http://me:pass1@example.com, me is returned.
         
     | 
| 
      
 486 
     | 
    
         
            +
                #
         
     | 
| 
      
 487 
     | 
    
         
            +
                # @return [Wgit::Url, nil] Containing just the user string or nil.
         
     | 
| 
      
 488 
     | 
    
         
            +
                def to_user
         
     | 
| 
      
 489 
     | 
    
         
            +
                  user = @uri.user
         
     | 
| 
      
 490 
     | 
    
         
            +
                  user ? Wgit::Url.new(user) : nil
         
     | 
| 
      
 491 
     | 
    
         
            +
                end
         
     | 
| 
      
 492 
     | 
    
         
            +
             
     | 
| 
      
 493 
     | 
    
         
            +
                # Returns a new Wgit::Url containing just the password string of this URL
         
     | 
| 
      
 494 
     | 
    
         
            +
                # e.g. Given http://me:pass1@example.com, pass1 is returned.
         
     | 
| 
      
 495 
     | 
    
         
            +
                #
         
     | 
| 
      
 496 
     | 
    
         
            +
                # @return [Wgit::Url, nil] Containing just the password string or nil.
         
     | 
| 
      
 497 
     | 
    
         
            +
                def to_password
         
     | 
| 
      
 498 
     | 
    
         
            +
                  password = @uri.password
         
     | 
| 
      
 499 
     | 
    
         
            +
                  password ? Wgit::Url.new(password) : nil
         
     | 
| 
      
 500 
     | 
    
         
            +
                end
         
     | 
| 
      
 501 
     | 
    
         
            +
             
     | 
| 
       428 
502 
     | 
    
         
             
                # Omits the given URL components from self and returns a new Wgit::Url.
         
     | 
| 
       429 
503 
     | 
    
         
             
                #
         
     | 
| 
       430 
504 
     | 
    
         
             
                # Calls Addressable::URI#omit underneath and creates a new Wgit::Url from
         
     | 
| 
         @@ -468,7 +542,7 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       468 
542 
     | 
    
         
             
                    .omit_trailing_slash
         
     | 
| 
       469 
543 
     | 
    
         
             
                end
         
     | 
| 
       470 
544 
     | 
    
         | 
| 
       471 
     | 
    
         
            -
                # Returns a new Wgit::Url with the base ( 
     | 
| 
      
 545 
     | 
    
         
            +
                # Returns a new Wgit::Url with the base (scheme and host) removed e.g. Given
         
     | 
| 
       472 
546 
     | 
    
         
             
                # http://google.com/search?q=something#about, search?q=something#about is
         
     | 
| 
       473 
547 
     | 
    
         
             
                # returned. If relative and base isn't present then self is returned.
         
     | 
| 
       474 
548 
     | 
    
         
             
                # Leading and trailing slashes are always stripped from the return value.
         
     | 
| 
         @@ -483,6 +557,21 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       483 
557 
     | 
    
         
             
                  Wgit::Url.new(omit_base).omit_slashes
         
     | 
| 
       484 
558 
     | 
    
         
             
                end
         
     | 
| 
       485 
559 
     | 
    
         | 
| 
      
 560 
     | 
    
         
            +
                # Returns a new Wgit::Url with the origin (base + port) removed e.g. Given
         
     | 
| 
      
 561 
     | 
    
         
            +
                # http://google.com:81/search?q=something#about, search?q=something#about is
         
     | 
| 
      
 562 
     | 
    
         
            +
                # returned. If relative and origin isn't present then self is returned.
         
     | 
| 
      
 563 
     | 
    
         
            +
                # Leading and trailing slashes are always stripped from the return value.
         
     | 
| 
      
 564 
     | 
    
         
            +
                #
         
     | 
| 
      
 565 
     | 
    
         
            +
                # @return [Wgit::Url] Self containing everything after the origin.
         
     | 
| 
      
 566 
     | 
    
         
            +
                def omit_origin
         
     | 
| 
      
 567 
     | 
    
         
            +
                  origin = to_origin
         
     | 
| 
      
 568 
     | 
    
         
            +
                  omit_origin = origin ? gsub(origin, '') : self
         
     | 
| 
      
 569 
     | 
    
         
            +
             
     | 
| 
      
 570 
     | 
    
         
            +
                  return self if ['', '/'].include?(omit_origin)
         
     | 
| 
      
 571 
     | 
    
         
            +
             
     | 
| 
      
 572 
     | 
    
         
            +
                  Wgit::Url.new(omit_origin).omit_slashes
         
     | 
| 
      
 573 
     | 
    
         
            +
                end
         
     | 
| 
      
 574 
     | 
    
         
            +
             
     | 
| 
       486 
575 
     | 
    
         
             
                # Returns a new Wgit::Url with the query string portion removed e.g. Given
         
     | 
| 
       487 
576 
     | 
    
         
             
                # http://google.com/search?q=hello, http://google.com/search is
         
     | 
| 
       488 
577 
     | 
    
         
             
                # returned. Self is returned as is if no query string is present. A URL
         
     | 
| 
         @@ -528,25 +617,38 @@ protocol scheme and domain (e.g. http://example.com): #{url}" 
     | 
|
| 
       528 
617 
     | 
    
         
             
                  start_with?('#')
         
     | 
| 
       529 
618 
     | 
    
         
             
                end
         
     | 
| 
       530 
619 
     | 
    
         | 
| 
      
 620 
     | 
    
         
            +
                # Returns true if self equals '/' a.k.a. index.
         
     | 
| 
      
 621 
     | 
    
         
            +
                #
         
     | 
| 
      
 622 
     | 
    
         
            +
                # @return [Boolean] True if self equals '/', false otherwise.
         
     | 
| 
      
 623 
     | 
    
         
            +
                def index?
         
     | 
| 
      
 624 
     | 
    
         
            +
                  self == '/'
         
     | 
| 
      
 625 
     | 
    
         
            +
                end
         
     | 
| 
      
 626 
     | 
    
         
            +
             
     | 
| 
       531 
627 
     | 
    
         
             
                alias +            concat
         
     | 
| 
       532 
628 
     | 
    
         
             
                alias crawled?     crawled
         
     | 
| 
       533 
     | 
    
         
            -
                alias normalise    normalize
         
     | 
| 
       534 
629 
     | 
    
         
             
                alias is_relative? relative?
         
     | 
| 
       535 
630 
     | 
    
         
             
                alias is_absolute? absolute?
         
     | 
| 
       536 
631 
     | 
    
         
             
                alias is_valid?    valid?
         
     | 
| 
       537 
632 
     | 
    
         
             
                alias is_query?    query?
         
     | 
| 
       538 
633 
     | 
    
         
             
                alias is_fragment? fragment?
         
     | 
| 
      
 634 
     | 
    
         
            +
                alias is_index?    index?
         
     | 
| 
       539 
635 
     | 
    
         
             
                alias uri          to_uri
         
     | 
| 
       540 
636 
     | 
    
         
             
                alias url          to_url
         
     | 
| 
       541 
637 
     | 
    
         
             
                alias scheme       to_scheme
         
     | 
| 
       542 
638 
     | 
    
         
             
                alias host         to_host
         
     | 
| 
      
 639 
     | 
    
         
            +
                alias port         to_port
         
     | 
| 
       543 
640 
     | 
    
         
             
                alias domain       to_domain
         
     | 
| 
       544 
641 
     | 
    
         
             
                alias brand        to_brand
         
     | 
| 
       545 
642 
     | 
    
         
             
                alias base         to_base
         
     | 
| 
      
 643 
     | 
    
         
            +
                alias origin       to_origin
         
     | 
| 
       546 
644 
     | 
    
         
             
                alias path         to_path
         
     | 
| 
       547 
645 
     | 
    
         
             
                alias endpoint     to_endpoint
         
     | 
| 
       548 
646 
     | 
    
         
             
                alias query        to_query
         
     | 
| 
      
 647 
     | 
    
         
            +
                alias query_hash   to_query_hash
         
     | 
| 
       549 
648 
     | 
    
         
             
                alias fragment     to_fragment
         
     | 
| 
       550 
649 
     | 
    
         
             
                alias extension    to_extension
         
     | 
| 
      
 650 
     | 
    
         
            +
                alias user         to_user
         
     | 
| 
      
 651 
     | 
    
         
            +
                alias password     to_password
         
     | 
| 
      
 652 
     | 
    
         
            +
                alias sub_domain   to_sub_domain
         
     | 
| 
       551 
653 
     | 
    
         
             
              end
         
     | 
| 
       552 
654 
     | 
    
         
             
            end
         
     | 
    
        data/lib/wgit/utils.rb
    CHANGED
    
    | 
         @@ -145,7 +145,8 @@ module Wgit 
     | 
|
| 
       145 
145 
     | 
    
         
             
                # @param keyword_limit [Integer] The max amount of keywords to be
         
     | 
| 
       146 
146 
     | 
    
         
             
                #   outputted to the stream.
         
     | 
| 
       147 
147 
     | 
    
         
             
                # @param stream [#puts] Any object that respond_to?(:puts). It is used
         
     | 
| 
       148 
     | 
    
         
            -
                #   to output text somewhere e.g. a file or  
     | 
| 
      
 148 
     | 
    
         
            +
                #   to output text somewhere e.g. a file or STDERR.
         
     | 
| 
      
 149 
     | 
    
         
            +
                # @return [Integer] The number of results.
         
     | 
| 
       149 
150 
     | 
    
         
             
                def self.printf_search_results(results, keyword_limit: 5, stream: STDOUT)
         
     | 
| 
       150 
151 
     | 
    
         
             
                  raise 'stream must respond_to? :puts' unless stream.respond_to?(:puts)
         
     | 
| 
       151 
152 
     | 
    
         | 
| 
         @@ -162,18 +163,37 @@ module Wgit 
     | 
|
| 
       162 
163 
     | 
    
         
             
                    stream.puts
         
     | 
| 
       163 
164 
     | 
    
         
             
                  end
         
     | 
| 
       164 
165 
     | 
    
         | 
| 
       165 
     | 
    
         
            -
                   
     | 
| 
      
 166 
     | 
    
         
            +
                  results.size
         
     | 
| 
       166 
167 
     | 
    
         
             
                end
         
     | 
| 
       167 
168 
     | 
    
         | 
| 
       168 
     | 
    
         
            -
                #  
     | 
| 
      
 169 
     | 
    
         
            +
                # Sanitises the obj to make it uniform by calling the correct sanitize_*
         
     | 
| 
      
 170 
     | 
    
         
            +
                # method for its type e.g. if obj.is_a? String then sanitize_str(obj). Any type
         
     | 
| 
      
 171 
     | 
    
         
            +
                # not in the case statement will be ignored and returned as is.
         
     | 
| 
      
 172 
     | 
    
         
            +
                #
         
     | 
| 
      
 173 
     | 
    
         
            +
                # @param obj [Object] The object to be sanitized.
         
     | 
| 
      
 174 
     | 
    
         
            +
                # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
         
     | 
| 
      
 175 
     | 
    
         
            +
                #   invalid characters.
         
     | 
| 
      
 176 
     | 
    
         
            +
                # @return [Object] The sanitized obj is both modified and then returned.
         
     | 
| 
      
 177 
     | 
    
         
            +
                def self.sanitize(obj, encode: true)
         
     | 
| 
      
 178 
     | 
    
         
            +
                  case obj
         
     | 
| 
      
 179 
     | 
    
         
            +
                  when String
         
     | 
| 
      
 180 
     | 
    
         
            +
                    sanitize_str(obj, encode: encode)
         
     | 
| 
      
 181 
     | 
    
         
            +
                  when Array
         
     | 
| 
      
 182 
     | 
    
         
            +
                    sanitize_arr(obj, encode: encode)
         
     | 
| 
      
 183 
     | 
    
         
            +
                  else
         
     | 
| 
      
 184 
     | 
    
         
            +
                    obj
         
     | 
| 
      
 185 
     | 
    
         
            +
                  end
         
     | 
| 
      
 186 
     | 
    
         
            +
                end
         
     | 
| 
      
 187 
     | 
    
         
            +
             
     | 
| 
      
 188 
     | 
    
         
            +
                # Sanitises a String to make it uniform. Strips any leading/trailing white
         
     | 
| 
       169 
189 
     | 
    
         
             
                # space. Also applies UTF-8 encoding (replacing invalid characters) if
         
     | 
| 
       170 
190 
     | 
    
         
             
                # `encode: true`.
         
     | 
| 
       171 
191 
     | 
    
         
             
                #
         
     | 
| 
       172 
     | 
    
         
            -
                # @param str [String] The String to  
     | 
| 
      
 192 
     | 
    
         
            +
                # @param str [String] The String to sanitize. str is modified.
         
     | 
| 
       173 
193 
     | 
    
         
             
                # @param encode [Boolean] Whether or not to encode to UTF-8 replacing
         
     | 
| 
       174 
194 
     | 
    
         
             
                #   invalid characters.
         
     | 
| 
       175 
     | 
    
         
            -
                # @return [String] The  
     | 
| 
       176 
     | 
    
         
            -
                def self. 
     | 
| 
      
 195 
     | 
    
         
            +
                # @return [String] The sanitized str is both modified and then returned.
         
     | 
| 
      
 196 
     | 
    
         
            +
                def self.sanitize_str(str, encode: true)
         
     | 
| 
       177 
197 
     | 
    
         
             
                  if str.is_a?(String)
         
     | 
| 
       178 
198 
     | 
    
         
             
                    str.encode!('UTF-8', undef: :replace, invalid: :replace) if encode
         
     | 
| 
       179 
199 
     | 
    
         
             
                    str.strip!
         
     | 
| 
         @@ -182,15 +202,15 @@ module Wgit 
     | 
|
| 
       182 
202 
     | 
    
         
             
                  str
         
     | 
| 
       183 
203 
     | 
    
         
             
                end
         
     | 
| 
       184 
204 
     | 
    
         | 
| 
       185 
     | 
    
         
            -
                #  
     | 
| 
       186 
     | 
    
         
            -
                # processes non empty Strings using Wgit::Utils. 
     | 
| 
      
 205 
     | 
    
         
            +
                # Sanitises an Array to make it uniform. Removes empty Strings and nils,
         
     | 
| 
      
 206 
     | 
    
         
            +
                # processes non empty Strings using Wgit::Utils.sanitize and removes
         
     | 
| 
       187 
207 
     | 
    
         
             
                # duplicates.
         
     | 
| 
       188 
208 
     | 
    
         
             
                #
         
     | 
| 
       189 
     | 
    
         
            -
                # @param arr [Enumerable] The Array to  
     | 
| 
       190 
     | 
    
         
            -
                # @return [Enumerable] The  
     | 
| 
       191 
     | 
    
         
            -
                def self. 
     | 
| 
      
 209 
     | 
    
         
            +
                # @param arr [Enumerable] The Array to sanitize. arr is modified.
         
     | 
| 
      
 210 
     | 
    
         
            +
                # @return [Enumerable] The sanitized arr is both modified and then returned.
         
     | 
| 
      
 211 
     | 
    
         
            +
                def self.sanitize_arr(arr, encode: true)
         
     | 
| 
       192 
212 
     | 
    
         
             
                  if arr.is_a?(Array)
         
     | 
| 
       193 
     | 
    
         
            -
                    arr.map! { |str|  
     | 
| 
      
 213 
     | 
    
         
            +
                    arr.map! { |str| sanitize(str, encode: encode) }
         
     | 
| 
       194 
214 
     | 
    
         
             
                    arr.reject! { |str| str.is_a?(String) ? str.empty? : false }
         
     | 
| 
       195 
215 
     | 
    
         
             
                    arr.compact!
         
     | 
| 
       196 
216 
     | 
    
         
             
                    arr.uniq!
         
     | 
| 
         @@ -198,13 +218,5 @@ module Wgit 
     | 
|
| 
       198 
218 
     | 
    
         | 
| 
       199 
219 
     | 
    
         
             
                  arr
         
     | 
| 
       200 
220 
     | 
    
         
             
                end
         
     | 
| 
       201 
     | 
    
         
            -
             
     | 
| 
       202 
     | 
    
         
            -
                # Returns the model having removed non bson types (for use with MongoDB).
         
     | 
| 
       203 
     | 
    
         
            -
                #
         
     | 
| 
       204 
     | 
    
         
            -
                # @param model_hash [Hash] The model Hash to process.
         
     | 
| 
       205 
     | 
    
         
            -
                # @return [Hash] The model Hash with non bson types removed.
         
     | 
| 
       206 
     | 
    
         
            -
                def self.remove_non_bson_types(model_hash)
         
     | 
| 
       207 
     | 
    
         
            -
                  model_hash.select { |_k, v| v.respond_to?(:bson_type) }
         
     | 
| 
       208 
     | 
    
         
            -
                end
         
     | 
| 
       209 
221 
     | 
    
         
             
              end
         
     | 
| 
       210 
222 
     | 
    
         
             
            end
         
     | 
    
        data/lib/wgit/version.rb
    CHANGED
    
    | 
         @@ -2,10 +2,11 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            # Wgit is a WWW indexer/scraper which crawls URLs and retrieves their page
         
     | 
| 
       4 
4 
     | 
    
         
             
            # contents for later use.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
       5 
6 
     | 
    
         
             
            # @author Michael Telford
         
     | 
| 
       6 
7 
     | 
    
         
             
            module Wgit
         
     | 
| 
       7 
8 
     | 
    
         
             
              # The current gem version of Wgit.
         
     | 
| 
       8 
     | 
    
         
            -
              VERSION = '0. 
     | 
| 
      
 9 
     | 
    
         
            +
              VERSION = '0.9.0'
         
     | 
| 
       9 
10 
     | 
    
         | 
| 
       10 
11 
     | 
    
         
             
              # Returns the current gem version of Wgit as a String.
         
     | 
| 
       11 
12 
     | 
    
         
             
              def self.version
         
     |