wgit 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - data/lib/wgit.rb +11 -0
 - data/lib/wgit/assertable.rb +69 -0
 - data/lib/wgit/core_ext.rb +40 -0
 - data/lib/wgit/crawler.rb +132 -0
 - data/lib/wgit/database/database.rb +269 -0
 - data/lib/wgit/database/model.rb +31 -0
 - data/lib/wgit/database/mongo_connection_details.rb +27 -0
 - data/lib/wgit/document.rb +293 -0
 - data/lib/wgit/url.rb +140 -0
 - data/lib/wgit/utils.rb +115 -0
 - data/lib/wgit/version.rb +3 -0
 - data/lib/wgit/web_crawler.rb +134 -0
 - metadata +62 -0
 
    
        checksums.yaml
    ADDED
    
    | 
         @@ -0,0 +1,7 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ---
         
     | 
| 
      
 2 
     | 
    
         
            +
            SHA1:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
         
     | 
| 
      
 5 
     | 
    
         
            +
            SHA512:
         
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
         
     | 
    
        data/lib/wgit.rb
    ADDED
    
    | 
         @@ -0,0 +1,11 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'wgit/version'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require_relative 'wgit/crawler'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative 'wgit/web_crawler'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative 'wgit/url'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require_relative 'wgit/document'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require_relative 'wgit/utils'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require_relative 'wgit/assertable'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require_relative 'wgit/database/database'
         
     | 
| 
      
 9 
     | 
    
         
            +
            require_relative 'wgit/database/model'
         
     | 
| 
      
 10 
     | 
    
         
            +
            require_relative 'wgit/database/mongo_connection_details'
         
     | 
| 
      
 11 
     | 
    
         
            +
            #require_relative 'wgit/core_ext'
         
     | 
| 
         @@ -0,0 +1,69 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            module Wgit

              # @author Michael Telford
              # Module containing assert methods including type checking which can be used
              # for asserting the integrity of method definitions etc.
              #
              # Every assertion raises a RuntimeError (plain `raise` with a String)
              # on failure and returns the checked object(s) on success.
              module Assertable
                DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s".freeze
                WRONG_METHOD_MSG = "arr must be Enumerable, use a different method".freeze
                DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s".freeze

                # obj.instance_of? must return true for one of the types listed in
                # type_or_types or an exception is thrown using msg if provided.
                # type_or_types can be a single Class or an Enumerable of Class objects,
                # Strings and Symbols will not work.
                # Note that instance_of? is an exact class match, instances of a
                # subclass do not pass.
                # Returns obj on successful assertion.
                def assert_types(obj, type_or_types, msg = nil)
                  match = if type_or_types.respond_to?(:any?)
                            type_or_types.any? { |type| obj.instance_of?(type) }
                          else
                            obj.instance_of?(type_or_types)
                          end
                  # The default message is only formatted when the assertion fails.
                  raise(msg || DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]) unless match
                  obj
                end

                # Each object within arr must match one of the types listed in
                # type_or_types or an exception is thrown using msg if provided.
                # type_or_types can be a single Class or an Enumerable of Class objects,
                # Strings and Symbols will not work.
                # Raises WRONG_METHOD_MSG if arr doesn't respond to each.
                # Returns the result of arr.each (arr itself for an Array).
                def assert_arr_types(arr, type_or_types, msg = nil)
                  raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
                  arr.each { |obj| assert_types(obj, type_or_types, msg) }
                end

                # The obj_or_objs must respond_to? all of the given methods or an
                # Exception is raised using msg or a default message.
                # methods can be a single method name or a collection of names.
                # If obj_or_objs itself responds to each then its elements are
                # checked rather than the collection object.
                # Returns obj_or_objs on successful assertion.
                def assert_respond_to(obj_or_objs, methods, msg = nil)
                  if obj_or_objs.respond_to?(:each)
                    obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
                  else
                    _assert_respond_to(obj_or_objs, methods, msg)
                  end
                  obj_or_objs
                end

                # Short hand aliases. They are declared before the private section
                # so it is obvious that they are part of the public interface.
                alias :assert_type :assert_types
                alias :type :assert_types
                alias :types :assert_types
                alias :assert_arr_type :assert_arr_types
                alias :arr_type :assert_arr_types
                alias :arr_types :assert_arr_types
                alias :respond_to :assert_respond_to

                private

                # Checks a single obj against methods, see assert_respond_to.
                def _assert_respond_to(obj, methods, msg = nil)
                  # Wrap a lone method name so that a Symbol/String can be passed
                  # directly instead of failing on the missing all? method.
                  names = methods.respond_to?(:all?) ? methods : [methods]
                  return obj if names.all? { |method| obj.respond_to?(method) }
                  raise(msg || DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods])
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,40 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'url'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # @author Michael Telford
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Script which extends Ruby's core functionality when parsed.
         
     | 
| 
      
 5 
     | 
    
         
            +
            # Needs to be required separately using `require 'wgit/core_ext'`. 
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            class String
              # Builds a Wgit::Url from this String.
              # Returns the new Wgit::Url instance, created by passing self to
              # Wgit::Url.new.
              def to_url
                Wgit::Url.new(self)
              end
            end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            module Enumerable
              # Passes every element through process_url_element and returns the
              # results as a new array, the receiver is left untouched.
              def to_urls
                map { |element| process_url_element(element) }
              end

              # In place variant of to_urls, each element is replaced by the result
              # of process_url_element and the receiver is returned.
              # Enumerable itself does not provide map!, so this only works on
              # receivers which implement it (e.g. Array). Any other receiver gets
              # a NoMethodError that names the actual problem.
              def to_urls!
                unless respond_to?(:map!)
                  raise NoMethodError,
                        "to_urls! needs map!, which #{self.class} does not implement; use to_urls instead"
                end
                map! { |element| process_url_element(element) }
              end
            end
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            private

            # Helper for the to_urls methods above.
            # Returns element.to_url when element is a String (or subclass),
            # any other element is handed back unchanged.
            def process_url_element(element)
              return element unless element.is_a?(String)
              element.to_url
            end
         
     | 
    
        data/lib/wgit/crawler.rb
    ADDED
    
    | 
         @@ -0,0 +1,132 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'url'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require_relative 'document'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative 'utils'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative 'assertable'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'net/http' # requires 'uri'
         
     | 
| 
      
 6 
     | 
    
         
            +
             
         
     | 
| 
      
 7 
     | 
    
         
            +
            module Wgit

              # @author Michael Telford
              # Crawler class provides a means of crawling web URL's.
              # Note that redirects are not followed while crawling.
              class Crawler
                include Assertable

                attr_reader :urls, :docs

                # Stores the given urls (see urls=) and starts with no crawled docs.
                def initialize(*urls)
                  self.urls = urls
                  @docs = []
                end

                # Replaces the current urls with the given url(s). Each one is added
                # through add_url, so anything which isn't a Url instance is wrapped
                # in a Wgit::Url.
                def urls=(urls)
                  @urls = []
                  Wgit::Utils.each(urls) { |url| add_url(url) }
                end

                # Replaces the current urls with the given ones, same as urls=.
                def [](*urls)
                  self.urls = urls
                end

                # Adds a single url to the urls to be crawled.
                def <<(url)
                  add_url(url)
                end

                # Crawls individual urls, not entire sites.
                # Returns the last crawled doc.
                # Yields each doc to the provided block or adds each doc to @docs
                # which can be accessed by Crawler#docs after the method returns.
                # Without a block, @docs receives one entry per url and that entry is
                # nil when crawl_url returned nil for the url.
                def crawl_urls(urls = @urls, &block)
                  raise "No urls to crawl" unless urls
                  @docs = []
                  doc = nil
                  Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
                  doc || @docs.last
                end

                # Crawl the url and return the response document or nil.
                # Also yield(doc) if a block is provided. The doc is passed to the block
                # regardless of the crawl success so the doc.url can be used if needed.
                # Raises if url isn't an instance of Url. The url is marked as crawled
                # whether or not any markup was fetched.
                def crawl_url(url = @urls.first, &block)
                  assert_type(url, Url)
                  markup = fetch(url)
                  url.crawled = true
                  doc = Wgit::Document.new(url, markup)
                  block.call(doc) if block_given?
                  doc.empty? ? nil : doc
                end

                # Crawls an entire site by recursively going through its internal_links.
                # Also yield(doc) for each crawled doc if a block is provided.
                # A block is the only way to interact with the crawled docs.
                # Returns a unique array of external urls collected from the site
                # or nil if the base_url could not be crawled successfully.
                def crawl_site(base_url = @urls.first, &block)
                  assert_type(base_url, Url)

                  doc = crawl_url(base_url, &block)
                  return nil if doc.nil?

                  crawled_urls = []
                  # Work on copies. The links of every other page get concat'ed onto
                  # these arrays below and the first doc has already been yielded to
                  # the block, so its own link arrays must not be modified in case
                  # the accessors hand out the doc's internal arrays.
                  external_urls = doc.external_links.dup
                  internal_urls = doc.internal_links.dup

                  return external_urls.uniq if internal_urls.empty?

                  loop do
                    # uniq! returns nil when there were no duplicates, which is fine
                    # here as the return value isn't used.
                    internal_urls.uniq!

                    links = internal_urls - crawled_urls
                    break if links.empty?

                    links.each do |link|
                      doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
                      # Recorded even when the crawl failed so the link isn't retried.
                      crawled_urls << link
                      next if doc.nil?
                      internal_urls.concat(doc.internal_links)
                      external_urls.concat(doc.external_links)
                    end
                  end

                  external_urls.uniq
                end

                # Declared before the private section, both aliases are public.
                alias :crawl :crawl_urls
                alias :crawl_r :crawl_site

              private

                # Add the document to the @docs array for later processing
                # or let the block process it here and now.
                # Returns the crawled doc (or nil) when a block is given, otherwise nil.
                def handle_crawl_block(url, &block)
                  if block_given?
                    crawl_url(url, &block)
                  else
                    @docs << crawl_url(url)
                    nil
                  end
                end

                # The fetch method performs a HTTP GET to obtain the HTML document.
                # Invalid urls or any HTTP response that doesn't return a HTML body
                # will be ignored and nil will be returned.  This means that redirects
                # etc. will not be followed.
                # Fetching is best effort, any StandardError raised along the way
                # results in nil rather than an exception.
                def fetch(url)
                  return nil unless url.respond_to?(:to_uri)
                  body = Net::HTTP.get_response(url.to_uri).body
                  body.nil? || body.empty? ? nil : body
                rescue StandardError
                  nil
                end

                # Appends url to @urls, wrapping it in a Wgit::Url unless it is
                # already an instance of Url. Returns @urls.
                def add_url(url)
                  @urls ||= []
                  @urls << (url.instance_of?(Url) ? url : Wgit::Url.new(url))
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,269 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative '../document'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require_relative '../url'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative '../utils'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative '../assertable'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require_relative 'mongo_connection_details'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require_relative 'model'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'mongo'
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            module Wgit
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
              # @author Michael Telford
         
     | 
| 
      
 12 
     | 
    
         
            +
              # Class modeling a DB connection and CRUD operations for the Url and 
         
     | 
| 
      
 13 
     | 
    
         
            +
              # Document collections.
         
     | 
| 
      
 14 
     | 
    
         
            +
              # The most common methods are: insert, update, urls, search, stats, size. 
         
     | 
| 
      
 15 
     | 
    
         
            +
              class Database
         
     | 
| 
      
 16 
     | 
    
         
            +
                include Assertable
         
     | 
| 
      
 17 
     | 
    
         
            +
              
         
     | 
| 
      
 18 
     | 
    
         
            +
                # Is relative to the root project folder, not this file. 
         
     | 
| 
      
 19 
     | 
    
         
            +
                LOG_FILE_PATH = "misc/mongo_log.txt"
         
     | 
| 
      
 20 
     | 
    
         
            +
              
         
     | 
| 
      
 21 
     | 
    
         
            +
                # Builds a Mongo::Client from Wgit::CONNECTION_DETAILS and stores it for
                # use by the other methods of this class. Driver output is logged to
                # LOG_FILE_PATH.
                #
                # @raise [RuntimeError] if Wgit::CONNECTION_DETAILS is empty.
                def initialize
                  conn_details = Wgit::CONNECTION_DETAILS
                  if conn_details.empty?
                    # Adjacent literals keep the message on a single line; a literal
                    # broken across source lines would embed a newline in the message.
                    raise "Wgit::CONNECTION_DETAILS must be defined and include :host, " \
                          ":port, :db, :uname, :pword for a database connection to be " \
                          "established."
                  end

                  logger = Logger.new(LOG_FILE_PATH)
                  address = "#{conn_details[:host]}:#{conn_details[:port]}"
                  # NOTE: @@client is a class variable, so the most recently created
                  # Database instance determines the client used by every instance.
                  @@client = Mongo::Client.new([address],
                                               :database => conn_details[:db],
                                               :user => conn_details[:uname],
                                               :password => conn_details[:pword],
                                               :logger => logger,
                                               :truncate_logs => false)
                end
         
     | 
| 
      
 37 
     | 
    
         
            +
              
         
     | 
| 
      
 38 
     | 
    
         
            +
                ### Create Data ###
         
     | 
| 
      
 39 
     | 
    
         
            +
              
         
     | 
| 
      
 40 
     | 
    
         
            +
                # Inserts a Url, a Document, or a collection of either by delegating to
                # insert_urls or insert_docs.
                #
                # @param data the record(s) to insert.
                # @return the value returned by insert_urls / insert_docs.
                # @raise [RuntimeError] if data is neither a Url, a Document nor
                #   something responding to :first.
                def insert(data)
                  return insert_urls(data) if data.is_a?(Url)
                  return insert_docs(data) if data.is_a?(Document)

                  unless data.respond_to?(:first)
                    raise "data is not in the correct format (all Url's or Document's)"
                  end

                  # Only the first element is inspected to choose the collection.
                  data.first.is_a?(Url) ? insert_urls(data) : insert_docs(data)
                end
         
     | 
| 
      
 55 
     | 
    
         
            +
              
         
     | 
| 
      
 56 
     | 
    
         
            +
                # Converts one Url, or a collection of Urls, to model data and inserts
                # it into the :urls collection.
                #
                # @param url_or_urls a Url or an object responding to :map whose
                #   elements are Urls.
                # @return the count reported by create.
                def insert_urls(url_or_urls)
                  if url_or_urls.respond_to?(:map)
                    assert_arr_types(url_or_urls, Url)
                    url_or_urls = url_or_urls.map { |url| Wgit::Model.url(url) }
                  else
                    assert_type(url_or_urls, Url)
                    url_or_urls = Wgit::Model.url(url_or_urls)
                  end

                  create(:urls, url_or_urls)
                end
         
     | 
| 
      
 68 
     | 
    
         
            +
              
         
     | 
| 
      
 69 
     | 
    
         
            +
                # Converts one Document/Hash, or a collection of them, to model data and
                # inserts it into the :documents collection.
                #
                # @param doc_or_docs a Document, a Hash, or an object responding to :map
                #   whose elements are Documents and/or Hashes.
                # @return the count reported by create.
                def insert_docs(doc_or_docs)
                  # A Hash responds to :map too, so it must be recognised explicitly as
                  # a single record rather than being treated as a collection.
                  if doc_or_docs.is_a?(Hash) || !doc_or_docs.respond_to?(:map)
                    assert_type(doc_or_docs, [Document, Hash])
                    unless doc_or_docs.is_a?(Hash)
                      doc_or_docs = Wgit::Model.document(doc_or_docs)
                    end
                  else
                    assert_arr_types(doc_or_docs, [Document, Hash])
                    doc_or_docs = doc_or_docs.map do |doc|
                      # Hashes are passed through unchanged (not dropped as nil).
                      doc.is_a?(Hash) ? doc : Wgit::Model.document(doc)
                    end
                  end

                  create(:documents, doc_or_docs)
                end
         
     | 
| 
      
 83 
     | 
    
         
            +
              
         
     | 
| 
      
 84 
     | 
    
         
            +
                ### Retrieve Data ###
         
     | 
| 
      
 85 
     | 
    
         
            +
              
         
     | 
| 
      
 86 
     | 
    
         
            +
                # A crawled parameter value of nil (the default) returns all urls.
         
     | 
| 
      
 87 
     | 
    
         
            +
                # A limit of 0 means all urls are returned.
         
     | 
| 
      
 88 
     | 
    
         
            +
                # All urls are sorted by date_added ascending, in other words the first 
         
     | 
| 
      
 89 
     | 
    
         
            +
                # url in the results is the first added. 
         
     | 
| 
      
 90 
     | 
    
         
            +
                def urls(crawled = nil, limit = 0, skip = 0, &block)
                  query = crawled.nil? ? {} : { :crawled => crawled }
                  oldest_first = { :date_added => 1 }

                  found = retrieve(:urls, query, oldest_first, {}, limit, skip)
                  return [] if found.count < 1

                  # The retrieved results do not respond to map!, so map into a new
                  # Array of Wgit::Url objects instead.
                  url_objs = found.map { |url_doc| Wgit::Url.new(url_doc) }
                  url_objs.each { |url| block.call(url) } if block_given?
                  url_objs
                end
         
     | 
| 
      
 102 
     | 
    
         
            +
              
         
     | 
| 
      
 103 
     | 
    
         
            +
                # Same as urls, restricted to records whose :crawled field is true.
                def crawled_urls(limit = 0, skip = 0, &block)
                  crawled = true
                  urls(crawled, limit, skip, &block)
                end
         
     | 
| 
      
 106 
     | 
    
         
            +
              
         
     | 
| 
      
 107 
     | 
    
         
            +
                # Same as urls, restricted to records whose :crawled field is false.
                def uncrawled_urls(limit = 0, skip = 0, &block)
                  crawled = false
                  urls(crawled, limit, skip, &block)
                end
         
     | 
| 
      
 110 
     | 
    
         
            +
             
     | 
| 
      
 111 
     | 
    
         
            +
                # Currently all searches are case insensitive.
         
     | 
| 
      
 112 
     | 
    
         
            +
                #
         
     | 
| 
      
 113 
     | 
    
         
            +
                # Searches against the indexed docs in the DB for the given text.
         
     | 
| 
      
 114 
     | 
    
         
            +
                # The searched fields are decided by the text index setup against the 
         
     | 
| 
      
 115 
     | 
    
         
            +
                # documents collection. Currently we search against the following fields:
         
     | 
| 
      
 116 
     | 
    
         
            +
                # "author", "keywords", "title" and "text".
         
     | 
| 
      
 117 
     | 
    
         
            +
                #
         
     | 
| 
      
 118 
     | 
    
         
            +
                # The MongoDB search ranks/sorts the results in order (highest first) based 
         
     | 
| 
      
 119 
     | 
    
         
            +
                # upon each document's textScore which records the number of text hits. We
         
     | 
| 
      
 120 
     | 
    
         
            +
                # then store this textScore in each Document object for use elsewhere if 
         
     | 
| 
      
 121 
     | 
    
         
            +
                # needed. 
         
     | 
| 
      
 122 
     | 
    
         
            +
                #
         
     | 
| 
      
 123 
     | 
    
         
            +
                # @param text [String] the value to search the data against.
         
     | 
| 
      
 124 
     | 
    
         
            +
                # @param whole_sentence [Boolean] whether the text should be searched for
                # as one exact phrase (true) instead of as separate words (false).
         
     | 
| 
      
 126 
     | 
    
         
            +
                # @param limit [Fixnum] the max length/count of the results array.
         
     | 
| 
      
 127 
     | 
    
         
            +
                # @param skip [Fixnum] the number of results to skip, starting with the 
         
     | 
| 
      
 128 
     | 
    
         
            +
                # most relevant based upon the textScore of the search. 
         
     | 
| 
      
 129 
     | 
    
         
            +
                # @param block [Block] a block which if provided is passed to each result. 
         
     | 
| 
      
 130 
     | 
    
         
            +
                # 
         
     | 
| 
      
 131 
     | 
    
         
            +
                # @return [Array] of Document objects representing the search results.
         
     | 
| 
      
 132 
     | 
    
         
            +
                def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
                  # Build the search term from a copy so that the caller's String (which
                  # may be frozen) is never modified.
                  term = text.strip
                  # Wrapping the term in double quotes requests a phrase search.
                  term = "\"#{term}\"" if whole_sentence

                  # The textScore sorts based on the most search hits.
                  # We use the textScore hash as a sort and a projection below.
                  # :$caseSensitive => case_sensitive, # 3.2+ only.
                  sort_proj = { :score => { :$meta => "textScore" } }
                  query = { :$text => { :$search => term } }
                  results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)

                  return [] if results.count < 1

                  # results.respond_to? :map! is false so we use map and overwrite the var.
                  results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
                  return results unless block_given?

                  # each returns its receiver, so the Array of Documents is returned.
                  results.each { |doc| block.call(doc) }
                end
         
     | 
| 
      
 149 
     | 
    
         
            +
             
     | 
| 
      
 150 
     | 
    
         
            +
                # Performs a search and pretty prints the results.
         
     | 
| 
      
 151 
     | 
    
         
            +
                def search_p(text, whole_sentence = false, limit = 10,
                             skip = 0, sentence_length = 80, &block)
                  found = search(text, whole_sentence, limit, skip, &block)
                  # NOTE(review): the meaning of the third argument (false) is defined
                  # by Wgit::Utils.printf_search_results, which is not visible here.
                  Wgit::Utils.printf_search_results(found, text, false, sentence_length)
                end
         
     | 
| 
      
 156 
     | 
    
         
            +
              
         
     | 
| 
      
 157 
     | 
    
         
            +
                # Returns a Mongo object which can be used like a Hash to retrieve values.
         
     | 
| 
      
 158 
     | 
    
         
            +
                def stats
                  # Issue the dbStats command and hand back the first reply document.
                  reply = @@client.command(:dbStats => 0)
                  reply.documents[0]
                end
         
     | 
| 
      
 161 
     | 
    
         
            +
              
         
     | 
| 
      
 162 
     | 
    
         
            +
                # Returns the :dataSize value taken from stats.
                def size
                  db_stats = stats
                  db_stats[:dataSize]
                end
         
     | 
| 
      
 165 
     | 
    
         
            +
              
         
     | 
| 
      
 166 
     | 
    
         
            +
                ### Update Data ###
         
     | 
| 
      
 167 
     | 
    
         
            +
              
         
     | 
| 
      
 168 
     | 
    
         
            +
                # Updates a single Url or Document by delegating to update_url or
                # update_doc.
                #
                # @param data [Url, Document] the record to update.
                # @return the value returned by update_url / update_doc.
                # @raise [RuntimeError] if data is neither a Url nor a Document.
                def update(data)
                  return update_url(data) if data.is_a?(Url)
                  return update_doc(data) if data.is_a?(Document)

                  raise "data is not in the correct format (all Url's or Document's)"
                end
         
     | 
| 
      
 177 
     | 
    
         
            +
              
         
     | 
| 
      
 178 
     | 
    
         
            +
                # Updates the :urls record whose :url field matches the given Url,
                # setting its model data plus the common update data.
                #
                # @param url [Url] the url to update.
                # @return the count reported by _update.
                def update_url(url)
                  assert_type(url, Url)
                  changes = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
                  _update(true, :urls, { :url => url }, { "$set" => changes })
                end
         
     | 
| 
      
 185 
     | 
    
         
            +
              
         
     | 
| 
      
 186 
     | 
    
         
            +
              # Updates the :documents record whose :url field matches doc.url,
              # setting its model data plus the common update data.
              #
              # @param doc [Document] the document to update.
              # @return the count reported by _update.
              def update_doc(doc)
                assert_type(doc, Document)
                changes = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
                _update(true, :documents, { :url => doc.url }, { "$set" => changes })
              end
         
     | 
| 
      
 193 
     | 
    
         
            +
              
         
     | 
| 
      
 194 
     | 
    
         
            +
            private
         
     | 
| 
      
 195 
     | 
    
         
            +
             
     | 
| 
      
 196 
     | 
    
         
            +
                # Decides whether a driver write result indicates success, based on the
                # name of the result's class.
                #
                # @param result the object returned by an insert or update call.
                # @param count the number of writes expected (bulk insert and multi
                #   update only).
                # @param multi whether an update result should be checked against count.
                # @return [Boolean] true if the write is considered successful.
                # @raise [RuntimeError] if the result class is not recognised.
                def write_succeeded?(result, count = 1, multi = false)
                  update_results = [
                    "Mongo::Operation::Write::Update::Result",      # MongoDB 3.0
                    "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
                  ]
                  klass = result.class.to_s

                  if klass == "Mongo::Operation::Write::Insert::Result"
                    # Single create result.
                    result.documents.first[:err].nil?
                  elsif klass == "Mongo::BulkWrite::Result"
                    # Multiple create result.
                    result.inserted_count == count
                  elsif update_results.include?(klass)
                    # Single and multiple update result.
                    multi ? result.n == count : result.documents.first[:err].nil?
                  else
                    raise "Result class not currently supported: #{result.class.to_s}"
                  end
                end
         
     | 
| 
      
 216 
     | 
    
         
            +
              
         
     | 
| 
      
 217 
     | 
    
         
            +
                # Inserts one Hash, or an Array of Hashes, into the given collection
                # after merging in Wgit::Model.common_insert_data.
                #
                # @param collection the collection name (converted with to_sym).
                # @param data [Hash, Array] the record(s) to insert. A Hash is merged in
                #   place; an Array has its elements replaced in place.
                # @return the number of records the driver reports as inserted.
                # @raise [RuntimeError] if the write is not reported as successful.
                def create(collection, data)
                  assert_type(data, [Hash, Array])

                  case data
                  when Hash
                    # Single doc.
                    data.merge!(Wgit::Model.common_insert_data)
                    outcome = @@client[collection.to_sym].insert_one(data)
                    raise "DB write (insert) failed" unless write_succeeded?(outcome)
                    outcome.n
                  when Array
                    # Multiple docs.
                    assert_arr_types(data, Hash)
                    data.map! { |data_hash| data_hash.merge(Wgit::Model.common_insert_data) }
                    outcome = @@client[collection.to_sym].insert_many(data)
                    raise "DB write(s) failed" unless write_succeeded?(outcome, data.length)
                    outcome.inserted_count
                  else
                    raise "data must be a Hash or an Array of Hash's"
                  end
                end
         
     | 
| 
      
 242 
     | 
    
         
            +
              
         
     | 
| 
      
 243 
     | 
    
         
            +
                # Runs a find against the given collection and returns the driver's
                # result with projection, skip, limit and sort applied in that order.
                #
                # @param collection the collection name (converted with to_sym).
                # @param query [Hash] the filter passed to find.
                # @param sort the sort specification.
                # @param projection the projection specification.
                # @param limit the value passed to limit.
                # @param skip the value passed to skip.
                def retrieve(collection, query, sort = {}, projection = {},
                             limit = 0, skip = 0)
                  assert_type(query, Hash)

                  view = @@client[collection.to_sym].find(query)
                  view = view.projection(projection).skip(skip).limit(limit)
                  view.sort(sort)
                end
         
     | 
| 
      
 249 
     | 
    
         
            +
              
         
     | 
| 
      
 250 
     | 
    
         
            +
                # NOTE: The Model.common_update_data should be merged in the calling 
         
     | 
| 
      
 251 
     | 
    
         
            +
                # method as the update param can be bespoke due to its nature.
         
     | 
| 
      
 252 
     | 
    
         
            +
                def _update(single, collection, selection, update)
                  assert_arr_types([selection, update], Hash)

                  target = @@client[collection.to_sym]
                  # single selects between update_one and update_many.
                  result = if single
                             target.update_one(selection, update)
                           else
                             target.update_many(selection, update)
                           end

                  unless write_succeeded?(result)
                    raise "DB write (update) failed"
                  end
                  result.n
                end
         
     | 
| 
      
 262 
     | 
    
         
            +
              
         
     | 
| 
      
 263 
     | 
    
         
            +
                # Alternative names for existing methods.
                alias count size
                alias length size
                alias insert_url insert_urls
                alias insert_doc insert_docs
                alias search_and_format search_p
         
     | 
| 
      
 268 
     | 
    
         
            +
              end
         
     | 
| 
      
 269 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,31 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative '../utils'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Wgit
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
              # @author Michael Telford
         
     | 
| 
      
 6 
     | 
    
         
            +
              # Module containing the DB data model structure.
         
     | 
| 
      
 7 
     | 
    
         
            +
              module Model
                # Returns the Hash form of a url for storage.
                #
                # @param url an object responding to to_h.
                # @return the result of url.to_h.
                # @raise [RuntimeError] if url does not respond to to_h.
                def self.url(url)
                  raise "url must respond to to_h" unless url.respond_to?(:to_h)
                  url.to_h
                end

                # Returns the Hash form of a document for storage.
                #
                # @param doc an object responding to to_h.
                # @return the result of doc.to_h(false).
                # @raise [RuntimeError] if doc does not respond to to_h.
                def self.document(doc)
                  raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
                  # NOTE(review): the meaning of the false argument is defined by the
                  # document's to_h, which is not visible here.
                  doc.to_h(false)
                end

                # Fields merged into every newly inserted record.
                #
                # @return [Hash] :date_added and :date_modified, both set to the same
                #   time stamp.
                def self.common_insert_data
                  # Read the clock once so both fields are guaranteed to match.
                  now = Wgit::Utils.time_stamp
                  {
                    :date_added     => now,
                    :date_modified  => now,
                  }
                end

                # Fields merged into every updated record.
                #
                # @return [Hash] :date_modified set to the current time stamp.
                def self.common_update_data
                  {
                    :date_modified  => Wgit::Utils.time_stamp,
                  }
                end
              end
         
     | 
| 
      
 31 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,27 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            # @author Michael Telford
         
     | 
| 
      
 3 
     | 
    
         
            +
            module Wgit
              # Selects which set of connection defaults below is used.
              # (Symbols are always frozen, so no explicit freeze is needed.)
              DB_PROVIDER = :MongoLabs

              # SECURITY(review): the defaults below embed a real-looking username
              # and password in source control. They are kept only so existing
              # behaviour is unchanged; the password should be rotated and supplied
              # through the environment variables read further down instead.
              provider_defaults =
                case DB_PROVIDER
                # OpenShift (MongoDB 2.4)
                when :OpenShift
                  {
                    :host           => "127.0.0.1",
                    :port           => "27017",
                    :db             => "admin",
                    :uname          => "admin",
                    :pword          => "R5jUKv1fessb"
                  }
                # MongoLabs (MongoDB 3.0)
                when :MongoLabs
                  {
                    :host           => "ds037205.mongolab.com",
                    :port           => "37205",
                    :db             => "crawler",
                    :uname          => "rubyapp",
                    :pword          => "R5jUKv1fessb",
                  }
                else
                  raise "Database provider '#{DB_PROVIDER}' is not recognized"
                end

              # Connection settings for the selected provider. Each value can be
              # overridden with the matching WGIT_DB_* environment variable; when a
              # variable is not set the provider default is used, so the constant
              # keeps the same keys and values as before.
              CONNECTION_DETAILS = {
                :host           => ENV.fetch("WGIT_DB_HOST", provider_defaults[:host]),
                :port           => ENV.fetch("WGIT_DB_PORT", provider_defaults[:port]),
                :db             => ENV.fetch("WGIT_DB_NAME", provider_defaults[:db]),
                :uname          => ENV.fetch("WGIT_DB_UNAME", provider_defaults[:uname]),
                :pword          => ENV.fetch("WGIT_DB_PWORD", provider_defaults[:pword]),
              }.freeze
            end
         
     | 
| 
         @@ -0,0 +1,293 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'url'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require_relative 'utils'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative 'assertable'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            module Wgit
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
              # @author Michael Telford
         
     | 
| 
      
 9 
     | 
    
         
            +
              # Class modeling a HTML web document. Also doubles as a search result.
         
     | 
| 
      
 10 
     | 
    
         
            +
              class Document
         
     | 
| 
      
 11 
     | 
    
         
            +
                include Assertable
         
     | 
| 
      
 12 
     | 
    
         
            +
              
         
     | 
| 
      
 13 
     | 
    
         
            +
                # Names of the HTML elements that text_elements_xpath turns into
                # "//<name>/text()" selectors. Frozen so the shared list cannot be
                # changed by accident at runtime.
                TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
                                 :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5].freeze
         
     | 
| 
      
 15 
     | 
    
         
            +
                
         
     | 
| 
      
 16 
     | 
    
         
            +
              	attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
         
     | 
| 
      
 17 
     | 
    
         
            +
            	
         
     | 
| 
      
 18 
     | 
    
         
            +
              	# Builds a Document either from a crawled url plus its html, or from
              	# a record previously stored in the database.
              	#
              	# @param url_or_doc [String, #[]] when a String it is checked with
              	#   assert_type against Url and the html is parsed; otherwise it is
              	#   read through the :url, :html, :title, :author, :keywords, :links,
              	#   :text and :score keys.
              	# @param html [String, nil] the page markup, nil is treated as "".
              	#   Only used when url_or_doc is a String.
              	#
              	# Note: @doc (the parsed Nokogiri document) is only assigned in the
              	# String branch, so #xpath cannot be used on a Document built from a
              	# database record.
              	def initialize(url_or_doc, html = nil)
                      if url_or_doc.is_a?(String)
                          assert_type(url_or_doc, Url)
                          html ||= ""

                          @url = url_or_doc
                          @html = html

                          @doc = Nokogiri::HTML(html) do |config|
                              # TODO: Remove #'s below when crawling in production.
                              #config.options = Nokogiri::XML::ParseOptions::STRICT |
                              #                 Nokogiri::XML::ParseOptions::NONET
                          end

                          init_title
                          init_author
                          init_keywords
                          init_links
                          init_text
                          @score = 0.0
                      else
                          # Init from a mongo collection document.
                          @url = Wgit::Url.new(url_or_doc[:url])
                          @html = url_or_doc[:html] || ""
                          @title = url_or_doc[:title]
                          @author = url_or_doc[:author]
                          @keywords = url_or_doc[:keywords] || []
                          # map rather than map!: the array belongs to the caller's
                          # record and must not be rewritten underneath it.
                          @links = (url_or_doc[:links] || []).map { |link| Wgit::Url.new(link) }
                          @text = url_or_doc[:text] || []
                          @score = url_or_doc[:score] || 0.0
                      end
              	end
         
     | 
| 
      
 51 
     | 
    
         
            +
            	
         
     | 
| 
      
 52 
     | 
    
         
            +
              	# Returns the entries of @links whose relative_link? is truthy.
              	# A link whose relative_link? call raises is left out of the result.
              	#
              	# @return [Array] a new array, empty when there are no links.
              	def internal_links
                      return [] if @links.empty?
                      @links.select do |link|
                          begin
                              link.relative_link?
                          rescue
                              false
                          end
                      end
              	end
         
     | 
| 
      
 62 
     | 
    
         
            +
                
         
     | 
| 
      
 63 
     | 
    
         
            +
                  # Returns the internal links as full urls by prefixing each one with
                  # this document's base url (@url.to_base). A "/" separator is added
                  # when the link does not already start with one.
                  #
                  # @return [Array<Wgit::Url>] newly built urls, empty when there are
                  #   no internal links.
                  def internal_full_links
                      internal_links.map do |link|
                          # Build the path separately: internal_links hands back the
                          # very objects stored in @links, so changing link in place
                          # would silently rewrite this document's links.
                          path = link.start_with?("/") ? link : "/" + link
                          Wgit::Url.new(@url.to_base + path)
                      end
                  end
         
     | 
| 
      
 70 
     | 
    
         
            +
            	
         
     | 
| 
      
 71 
     | 
    
         
            +
              	# Returns the entries of @links whose relative_link? is falsy.
              	# A link whose relative_link? call raises is left out of the result.
              	#
              	# @return [Array] a new array, empty when there are no links.
              	def external_links
                  return [] if @links.empty?
                  @links.select do |link|
                    begin
                      !link.relative_link?
                    rescue
                      false
                    end
                  end
              	end
         
     | 
| 
      
 81 
     | 
    
         
            +
                
         
     | 
| 
      
 82 
     | 
    
         
            +
                  # Summarises the length of each instance variable of this object.
                  #
                  # For @text two entries are written: :text_length (number of
                  # elements) and :text_bytes (sum of each element's #length). Every
                  # other variable that responds to #length is stored under its name
                  # without the leading "@"; variables without #length are skipped.
                  #
                  # @return [Hash] Symbol keys mapped to lengths.
                  def stats
                      instance_variables.each_with_object({}) do |ivar, summary|
                          if ivar == :@text
                              total = @text.inject(0) { |sum, entry| sum + entry.length }
                              summary[:text_length] = @text.length
                              summary[:text_bytes] = total
                              next
                          end

                          value = instance_variable_get(ivar)
                          summary[ivar[1..-1].to_sym] = value.length if value.respond_to?(:length)
                      end
                  end
         
     | 
| 
      
 100 
     | 
    
         
            +
                
         
     | 
| 
      
 101 
     | 
    
         
            +
                  # Returns the :html entry of #stats, i.e. the length of the html.
                  #
                  # @return the value stored under :html, or nil when absent.
                  def size
                      summary = stats
                      summary[:html]
                  end
         
     | 
| 
      
 104 
     | 
    
         
            +
                
         
     | 
| 
      
 105 
     | 
    
         
            +
                  # Converts this document to a Hash via Wgit::Utils.to_h, passing the
                  # list of instance variables to leave out.
                  #
                  # @param include_html [Boolean] when falsy :@html is left out too.
                  # @return the result of Wgit::Utils.to_h.
                  def to_h(include_html = false)
                      ignore = [:@doc] # Always ignore :@doc
                      ignore.unshift(:@html) unless include_html
                      Wgit::Utils.to_h(self, ignore)
                  end
         
     | 
| 
      
 110 
     | 
    
         
            +
                  
         
     | 
| 
      
 111 
     | 
    
         
            +
                  # Override of the default == method, is equal if url and html both match.
         
     | 
| 
      
 112 
     | 
    
         
            +
                  # Use doc.object_id == other_doc.object_id for exact object comparison. 
         
     | 
| 
      
 113 
     | 
    
         
            +
                  def ==(other_doc)
                    # Anything that is not a Document can never be equal.
                    return false unless other_doc.is_a?(Wgit::Document)
                    same_url = (url == other_doc.url)
                    same_url && html == other_doc.html
                  end
         
     | 
| 
      
 117 
     | 
    
         
            +
                  
         
     | 
| 
      
 118 
     | 
    
         
            +
                  # Shortcut for calling Document#html[range].
         
     | 
| 
      
 119 
     | 
    
         
            +
                  def [](range)
                    # Delegates straight to the html's own [] with the same argument.
                    markup = html
                    markup[range]
                  end
         
     | 
| 
      
 122 
     | 
    
         
            +
                
         
     | 
| 
      
 123 
     | 
    
         
            +
                  # True when the html is empty once surrounding whitespace is removed.
                  #
                  # @return [Boolean]
                  def empty?
                      trimmed = html.strip
                      trimmed.empty?
                  end
         
     | 
| 
      
 126 
     | 
    
         
            +
                
         
     | 
| 
      
 127 
     | 
    
         
            +
                  # Searches against the Document#text for the given search text.
         
     | 
| 
      
 128 
     | 
    
         
            +
                  # The number of search hits for each sentence is recorded internally
         
     | 
| 
      
 129 
     | 
    
         
            +
                  # and used to rank/sort the search results before being returned. Where 
         
     | 
| 
      
 130 
     | 
    
         
            +
                  # the Database#search method searches all documents for the most hits, this
         
     | 
| 
      
 131 
     | 
    
         
            +
                  # method searches each document's text for the most hits.
         
     | 
| 
      
 132 
     | 
    
         
            +
                  #
         
     | 
| 
      
 133 
     | 
    
         
            +
                  # Each search result comprises of a sentence of a given length. The length 
         
     | 
| 
      
 134 
     | 
    
         
            +
                  # will be based on the sentence_limit parameter or the full length of the 
         
     | 
| 
      
 135 
     | 
    
         
            +
                  # original sentence, which ever is less. The algorithm obviously ensures 
         
     | 
| 
      
 136 
     | 
    
         
            +
                  # that the search value is visible somewhere in the sentence.
         
     | 
| 
      
 137 
     | 
    
         
            +
                  #
         
     | 
| 
      
 138 
     | 
    
         
            +
                  # @param text [String] the value to search the document text against.
         
     | 
| 
      
 139 
     | 
    
         
            +
                  # @param sentence_limit [Fixnum] the length of each search result 
         
     | 
| 
      
 140 
     | 
    
         
            +
                  # sentence. 
         
     | 
| 
      
 141 
     | 
    
         
            +
                  # 
         
     | 
| 
      
 142 
     | 
    
         
            +
                  # @return [Array] of String objects representing the search results.
         
     | 
| 
      
 143 
     | 
    
         
            +
                  # Implementation notes: text is compiled with Regexp.new, so regular
                  # expression metacharacters in it are interpreted, and matching is
                  # case insensitive. Each matching sentence is copied before being
                  # trimmed, so this document's own @text is left unchanged.
                  #
                  # @raise [RuntimeError] if text is empty or sentence_limit is odd.
                  def search(text, sentence_limit = 80)
                      raise "A search value must be provided" if text.empty?
                      raise "The sentence length value must be even" if sentence_limit.odd?

                      regex = Regexp.new(text, Regexp::IGNORECASE)
                      results = {}

                      @text.each do |sentence|
                          hits = sentence.scan(regex).count
                          next unless hits > 0

                          # Work on a stripped copy; strip! on sentence itself would
                          # permanently alter the entry held in @text.
                          snippet = sentence.strip
                          index = snippet.index(regex)
                          # The return value is not used here, so the helper is relied
                          # upon to shorten snippet in place.
                          Wgit::Utils.format_sentence_length(snippet, index, sentence_limit)
                          results[snippet] = hits
                      end

                      return [] if results.empty?
                      # Sort ascending by hit count, then reverse so most hits come first.
                      results.sort_by { |_snippet, count| count }.map(&:first).reverse
                  end
         
     | 
| 
      
 164 
     | 
    
         
            +
                
         
     | 
| 
      
 165 
     | 
    
         
            +
                  # Performs a text search (see search for details) but assigns the results 
         
     | 
| 
      
 166 
     | 
    
         
            +
                  # to the @text instance variable. This can be used for sub search 
         
     | 
| 
      
 167 
     | 
    
         
            +
                  # functionality. Note that there is no way of getting the original text 
         
     | 
| 
      
 168 
     | 
    
         
            +
                  # back however. 
         
     | 
| 
      
 169 
     | 
    
         
            +
                  # @param text [String] the value to search the document text against.
                  # @param sentence_limit [Integer] forwarded to #search; defaults to
                  #   the same value #search uses, so existing calls are unaffected.
                  # @return [Array] the search results now stored in @text.
                  def search!(text, sentence_limit = 80)
                      @text = search(text, sentence_limit)
                  end
         
     | 
| 
      
 172 
     | 
    
         
            +
                
         
     | 
| 
      
 173 
     | 
    
         
            +
                  # Uses Nokogiri's xpath method to search the doc's html and return the 
         
     | 
| 
      
 174 
     | 
    
         
            +
                  # results. 
         
     | 
| 
      
 175 
     | 
    
         
            +
                  def xpath(xpath)
                      # @doc is the parsed document held by this instance; the query
                      # is passed through to it untouched.
                      parsed = @doc
                      parsed.xpath(xpath)
                  end
         
     | 
| 
      
 178 
     | 
    
         
            +
            	
         
     | 
| 
      
 179 
     | 
    
         
            +
              private
         
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
      
 181 
     | 
    
         
            +
                  # Cleans a String in place: re-encodes it as UTF-8 (asking for invalid
                  # bytes to be replaced) and strips surrounding whitespace.
                  #
                  # @param str [String] the string to clean; it is modified.
                  # @return [String] the same str object.
                  def process_str(str)
                      # tap hands back str itself; strip! alone returns nil when
                      # there is nothing to strip.
                      str.tap do |s|
                          s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
                          s.strip!
                      end
                  end
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
                  # Cleans an Array of Strings in place: every element goes through
                  # process_str, then empty strings and duplicates are removed.
                  # NOTE(review): assert_arr_types comes from Assertable (not shown
                  # here); presumably it raises unless every element is a String.
                  #
                  # @param array [Array<String>] the array to clean; it is modified.
                  # @return [Array<String>] the same array.
                  def process_arr(array)
                      assert_arr_types(array, String)
                      array.map! { |str| process_str(str) }
                      array.reject!(&:empty?)
                      array.uniq!
                      # uniq! returns nil when nothing was removed, so return the
                      # array explicitly to keep the return value consistent.
                      array
                  end
         
     | 
| 
      
 193 
     | 
    
         
            +
                
         
     | 
| 
      
 194 
     | 
    
         
            +
                  # Modifies internal links by removing this doc's base or host url if 
         
     | 
| 
      
 195 
     | 
    
         
            +
                  # present. http://www.google.co.uk/about.html (with or without the 
         
     | 
| 
      
 196 
     | 
    
         
            +
                  # protocol prefix) will become about.html meaning it'll appear within 
         
     | 
| 
      
 197 
     | 
    
         
            +
                  # internal_links.
         
     | 
| 
      
 198 
     | 
    
         
            +
                  def process_internal_links(links)
                      links.map! do |link|
                          # Links carrying a protocol are compared against the base
                          # url, all others against the host.
                          prefix = link.start_with?("http") ? url.base : url.host

                          if link.start_with?(prefix)
                              link.sub!(prefix, "")
                              # Drop a single leading slash left over from the cut.
                              link.slice!(0, 1) if link.start_with?("/")
                              link.strip!
                          end

                          link
                      end
                  end
         
     | 
| 
      
 213 
     | 
    
         
            +
                
         
     | 
| 
      
 214 
     | 
    
         
            +
                  # Builds one xpath expression selecting the text nodes of every
                  # element named in TEXT_ELEMENTS, e.g. "//dd/text() | //div/text()".
                  #
                  # @return [String] the combined expression, "" when TEXT_ELEMENTS
                  #   is empty.
                  def text_elements_xpath
                      selectors = TEXT_ELEMENTS.map { |el| "//%s/text()" % [el] }
                      selectors.join(" | ")
                  end
         
     | 
| 
      
 224 
     | 
    
         
            +
                
         
     | 
| 
      
 225 
     | 
    
         
            +
                  # Runs xpath against @doc and stores what it finds in the instance
                  # variable named by var. Nothing is assigned when there are no
                  # results.
                  #
                  # @param xpath [String] the query passed to @doc.xpath.
                  # @param var [Symbol] the instance variable name, e.g. :@title.
                  # @param first_result [Boolean] when truthy only the first result's
                  #   content is stored, otherwise an Array of every result's content.
                  # @return the stored value, or nil when nothing was found.
                  def init_var(xpath, var, first_result = true)
                      found = @doc.xpath(xpath)
                      return if found.nil? || found.empty?

                      value = first_result ? found.first.content : found.map(&:content)
                      instance_variable_set(var, value)
                  end
         
     | 
| 
      
 236 
     | 
    
         
            +
            	
         
     | 
| 
      
 237 
     | 
    
         
            +
              	# Resets @title, fills it from the first //title match via init_var and,
              	# when a title was found, passes it through process_str.
              	# Returns nil when no title was found.
              	def init_title
                  @title = nil
                  init_var("//title", :@title)
                  return if @title.nil?

                  process_str(@title)
              	end
         
     | 
| 
      
 243 
     | 
    
         
            +
            	
         
     | 
| 
      
 244 
     | 
    
         
            +
              	# Resets @author, fills it from the author meta tag's content attribute
              	# via init_var and, when a value was found, passes it through process_str.
              	# Returns nil when no author was found.
              	def init_author
                  @author = nil
                  init_var("//meta[@name='author']/@content", :@author)
                  return if @author.nil?

                  process_str(@author)
              	end
         
     | 
| 
      
 250 
     | 
    
         
            +
            	
         
     | 
| 
      
 251 
     | 
    
         
            +
              	# Resets @keywords and fills it from the keywords meta tag's content
              	# attribute via init_var. A found value is split on "," and the
              	# resulting Array is handed to process_arr; otherwise @keywords
              	# becomes an empty Array.
              	def init_keywords
                  @keywords = nil
                  init_var("//meta[@name='keywords']/@content", :@keywords)

                  if @keywords
                    @keywords = @keywords.split(",")
                    process_arr(@keywords)
                  else
                    @keywords = []
                  end
              	end
         
     | 
| 
      
 259 
     | 
    
         
            +
                
         
     | 
| 
      
 260 
     | 
    
         
            +
                # Resets @links and fills it with the href of every anchor via init_var.
                # The raw values go through process_arr, bare "/" links are dropped and
                # each remaining value is wrapped in a Wgit::Url; values that
                # Wgit::Url.new raises on are discarded. The final Array is handed to
                # process_internal_links. When no links are found @links becomes [].
                def init_links
                  @links = nil
                  init_var("//a/@href", :@links, false)
                  return @links = [] unless @links

                  process_arr(@links)
                  @links.delete_if { |href| href == "/" }

                  @links.map! do |href|
                    begin
                      Wgit::Url.new(href)
                    rescue StandardError
                      # Best effort: a link that cannot be turned into a Url is skipped.
                      nil
                    end
                  end
                  @links.compact!

                  process_internal_links(@links)
                end
         
     | 
| 
      
 277 
     | 
    
         
            +
              
         
     | 
| 
      
 278 
     | 
    
         
            +
                # Resets @text and fills it with the content of every node matched by
                # text_elements_xpath via init_var. A found Array is handed to
                # process_arr; otherwise @text becomes an empty Array.
                def init_text
                  @text = nil
                  init_var(text_elements_xpath, :@text, false)

                  if @text
                    process_arr(@text)
                  else
                    @text = []
                  end
                end
         
     | 
| 
      
 285 
     | 
    
         
            +
                
         
     | 
| 
      
 286 
     | 
    
         
            +
              	# Alternative names for the hash conversion and link accessors.
                alias to_hash to_h
                alias relative_links internal_links
                alias relative_urls internal_links
                alias relative_full_links internal_full_links
                alias relative_full_urls internal_full_links
                alias external_urls external_links
         
     | 
| 
      
 292 
     | 
    
         
            +
              end
         
     | 
| 
      
 293 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/wgit/url.rb
    ADDED
    
    | 
         @@ -0,0 +1,140 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'utils'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'uri'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            module Wgit

              # @author Michael Telford
              # Class modeling a web based URL.
              # Can be an internal (relative) link e.g. "about.html"
              # or a full URL e.g. "http://www.google.co.uk".
              #
              # A Url is a String, so every String method is available on it. The
              # value is also parsed with URI() on construction and the result is
              # cached in @uri (see #to_uri).
              class Url < String
                # crawled has a custom writer (see #crawled=), so only its reader is
                # generated here; this avoids defining the writer twice.
                attr_reader :crawled
                attr_accessor :date_crawled

                # Builds a Url from a String or from a hash-like record.
                #
                # @param url_or_doc [String, #[]] the url text, or a record (e.g. a
                #   mongo collection document) responding to [] with the keys :url,
                #   :crawled and :date_crawled. When a record is given its values
                #   take precedence over the other two parameters.
                # @param crawled [Boolean] whether the url has been crawled.
                # @param date_crawled [Object, nil] when the url was crawled.
                # @raise whatever URI() raises when it cannot parse the url text.
                def initialize(url_or_doc, crawled = false, date_crawled = nil)
                  if url_or_doc.is_a?(String)
                    url = url_or_doc
                  else
                    # Init from a mongo collection document.
                    url = url_or_doc[:url]
                    crawled = url_or_doc[:crawled] || false
                    date_crawled = url_or_doc[:date_crawled]
                  end
                  @uri = URI(url)
                  @crawled = crawled
                  @date_crawled = date_crawled
                  super(url)
                end

                # Raises a RuntimeError unless url is an absolute http(s) url.
                # Returns nil when the url passes every check.
                def self.validate(url)
                  if Wgit::Url.relative_link?(url)
                    raise "Invalid url (or a relative link): #{url}"
                  end
                  unless url.start_with?("http://", "https://")
                    raise "Invalid url (missing protocol prefix): #{url}"
                  end
                  if URI.regexp.match(url).nil?
                    raise "Invalid url: #{url}"
                  end
                end

                # Returns true when Url.validate accepts url, false when it raises.
                def self.valid?(url)
                  Wgit::Url.validate(url)
                  true
                rescue StandardError
                  false
                end

                # Modifies the given url (in place) by prefixing it with a protocol,
                # unless it already starts with "http://" or "https://".
                # Returns the url whether it has been modified or not.
                def self.prefix_protocol(url, https = false)
                  unless url.start_with?("http://", "https://")
                    scheme = https ? "https" : "http"
                    url.replace("#{scheme}://#{url}")
                  end
                  url
                end

                # URI.split("http://www.google.co.uk/about.html") returns the following:
                # array[2]: "www.google.co.uk", array[5]: "/about.html".
                # This means that all external links in a page are expected to have a
                # protocol prefix e.g. "http://", otherwise the link is treated as an
                # internal link (regardless of whether it is valid or not).
                # Raises a RuntimeError when the link has neither a host nor a path.
                def self.relative_link?(link)
                  segments = URI.split(link)
                  host = segments[2]
                  path = segments[5]
                  return false unless host.nil? || host.empty?
                  return true unless path.nil? || path.empty?

                  raise "Invalid link: #{link}"
                end

                # Joins host and link with exactly one "/" between them and returns
                # the result as a new Url. Neither argument is modified: one trailing
                # "/" of host and one leading "/" of link are dropped on copies.
                def self.concat(host, link)
                  Wgit::Url.new("#{host.chomp('/')}/#{link.sub(%r{\A/}, '')}")
                end

                # See Url.relative_link?
                def relative_link?
                  Wgit::Url.relative_link?(self)
                end

                # See Url.valid?
                def valid?
                  Wgit::Url.valid?(self)
                end

                # Returns a new Url made of this url and link; see Url.concat.
                # NOTE: this shadows String#concat, it does not append in place.
                def concat(link)
                  Wgit::Url.concat(self, link)
                end

                # Replaces the url text and re-parses the cached URI so that #to_uri
                # and #to_host agree with the new text (Url.prefix_protocol relies on
                # this). The text is left unchanged if URI() raises on new_url.
                # Other in-place String mutators do not refresh the cached URI.
                def replace(new_url)
                  @uri = URI(new_url)
                  super
                end

                # Sets the crawled flag and keeps date_crawled in step with it: a
                # truthy value stamps the current time, a falsy value clears it.
                def crawled=(bool)
                  @crawled = bool
                  @date_crawled = bool ? Wgit::Utils.time_stamp : nil
                end

                # Returns the URI object parsed from this url's text.
                def to_uri
                  @uri
                end

                # Returns self.
                def to_url
                  self
                end

                # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
                def to_host
                  Wgit::Url.new(@uri.host)
                end

                # URI.split("http://www.google.co.uk/about.html") returns the following:
                # array[0]: "http", array[2]: "www.google.co.uk".
                # Returns both joined by "://" e.g. http://www.google.co.uk.
                # Raises a RuntimeError for a relative link or a url lacking a
                # protocol or host.
                def to_base
                  if Wgit::Url.relative_link?(self)
                    raise "A relative link doesn't have a base URL: #{self}"
                  end
                  scheme, _userinfo, host = URI.split(self)
                  if scheme.nil? || host.nil? || host.empty?
                    raise "Both a protocol and host are needed: #{self}"
                  end
                  Wgit::Url.new("#{scheme}://#{host}")
                end

                # Returns a Hash of this url's state with :url as the first key,
                # followed by the instance variables (the cached URI is left out).
                def to_h
                  { url: self }.merge(Wgit::Utils.to_h(self, [:@uri]))
                end

                alias :to_hash :to_h
                alias :host :to_host
                alias :base :to_base
                alias :internal_link? :relative_link?
                alias :crawled? :crawled
              end
            end
         
     | 
    
        data/lib/wgit/utils.rb
    ADDED
    
    | 
         @@ -0,0 +1,115 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
             
     | 
| 
      
 2 
     | 
    
         
            +
            module Wgit
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
              # @author Michael Telford
         
     | 
| 
      
 5 
     | 
    
         
            +
              # Utility module containing generic methods.
         
     | 
| 
      
 6 
     | 
    
         
            +
              module Utils
         
     | 
| 
      
 7 
     | 
    
         
            +
                  def self.time_stamp
                    # The current time as a Time object.
                    Time.now
                  end
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                  # Returns a hash created from obj's instance vars and values.
         
     | 
| 
      
 12 
     | 
    
         
            +
                  def self.to_h(obj, ignore = [])
                    # Keys are the instance variable names without the leading "@",
                    # as Symbols; names listed in ignore (e.g. :@uri) are left out.
                    obj.instance_variables.each_with_object({}) do |ivar, result|
                      next if ignore.include?(ivar)

                      result[ivar.to_s[1..-1].to_sym] = obj.instance_variable_get(ivar)
                    end
                  end
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
                  # Improved each method which takes care of singleton and enumerable
         
     | 
| 
      
 22 
     | 
    
         
            +
                  # objects. Yields one or more objects.
         
     | 
| 
      
 23 
     | 
    
         
            +
                  def self.each(obj_or_objs)
                    # A single object (anything without #each) is yielded as is.
                    return yield(obj_or_objs) unless obj_or_objs.respond_to?(:each)

                    # Otherwise every element is yielded in turn.
                    obj_or_objs.each { |item| yield item }
                  end
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                  # Formats the sentence (modifies the receiver) and returns its value.
         
     | 
| 
      
 32 
     | 
    
         
            +
                  # The length will be based on the sentence_limit parameter or the full
         
     | 
| 
      
 33 
     | 
    
         
            +
                  # length of the original sentence, which ever is less. The full sentence
         
     | 
| 
      
 34 
     | 
    
         
            +
                  # is returned if the sentence_limit is 0. The algorithm obviously ensures 
         
     | 
| 
      
 35 
     | 
    
         
            +
                  # that the search value is visible somewhere in the sentence.
         
     | 
| 
      
 36 
     | 
    
         
            +
                  def self.format_sentence_length(sentence, index, sentence_limit)
                    raise "A sentence value must be provided" if sentence.empty?
                    raise "The sentence length value must be even" if sentence_limit.odd?
                    if index < 0 || index > sentence.length
                      raise "Incorrect index value: #{index}"
                    end
                    # A negative limit can never be satisfied; fail with a clear message
                    # instead of an anonymous error further down.
                    if sentence_limit < 0
                      raise "The sentence length value must not be negative"
                    end

                    # Nothing to trim: no limit was requested or the sentence already fits.
                    if sentence_limit == 0 || sentence.length <= sentence_limit
                      return sentence
                    end

                    # Centre a window of sentence_limit characters on index, then slide
                    # it back inside the sentence if it overhangs either end. The
                    # window always fits because the sentence is longer than the limit.
                    latest_start = sentence.length - sentence_limit
                    start = [[index - (sentence_limit / 2), 0].max, latest_start].min

                    # Modifies the receiver and returns it.
                    sentence.replace(sentence[start, sentence_limit])
                  end
         
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
      
 76 
     | 
    
         
            +
                  # Prints out the search results in a search engine page format.
         
     | 
| 
      
 77 
     | 
    
         
            +
                  # Most of the params are passed to Document#search - see class docs. 
         
     | 
| 
      
 78 
     | 
    
         
            +
                  # The stream param decides where the printf output is written to, and
         
     | 
| 
      
 79 
     | 
    
         
            +
                  # therefore must respond_to? :puts
         
     | 
| 
      
 80 
     | 
    
         
            +
                  # The format for each result is:
         
     | 
| 
      
 81 
     | 
    
         
            +
                  #
         
     | 
| 
      
 82 
     | 
    
         
            +
                  # Title
         
     | 
| 
      
 83 
     | 
    
         
            +
                  # Keywords (if there are some)
         
     | 
| 
      
 84 
     | 
    
         
            +
                  # Text Snippet (showing the searched for text if provided)
         
     | 
| 
      
 85 
     | 
    
         
            +
                  # Url
         
     | 
| 
      
 86 
     | 
    
         
            +
                  # <empty_line>
         
     | 
| 
      
 87 
     | 
    
         
            +
                  def self.printf_search_results(results, text = nil, case_sensitive = false,
                                                 sentence_length = 80, keyword_count = 5,
                                                 stream = Kernel)
                    # Params:
                    #   results         - enumerable of docs; each doc is sent #title,
                    #                     #keywords, #url and (when text is given) #search.
                    #   text            - the searched for text; when nil no snippet is printed.
                    #   case_sensitive  - accepted for interface compatibility; it is not
                    #                     read anywhere in this method.
                    #   sentence_length - passed as the 2nd argument to doc.search.
                    #   keyword_count   - max number of keywords printed per result; zero or
                    #                     a negative value prints no keywords line.
                    #   stream          - any object that publicly responds to :puts.
                    # Returns nil. Raises a RuntimeError if stream can't :puts.
                    raise "stream must respond_to? :puts" unless stream.respond_to? :puts

                    results.each do |doc|
                      # Only look up a snippet when there is something to search for.
                      sentence = nil
                      unless text.nil?
                        match = doc.search(text, sentence_length).first
                        # A missing or whitespace-only match is treated as "no snippet".
                        sentence = match unless match.nil? || match.strip.empty?
                      end

                      stream.puts doc.title
                      # Array#first(n) avoids the old `[0..(n - 1)]` slice which became
                      # `[0..-1]` (i.e. every keyword) when keyword_count was 0.
                      if keyword_count > 0 && !doc.keywords.empty?
                        stream.puts doc.keywords.first(keyword_count).join(", ")
                      end
                      stream.puts sentence unless sentence.nil?
                      stream.puts doc.url
                      stream.puts
                    end

                    nil
                  end
         
     | 
| 
      
 114 
     | 
    
         
            +
              end
         
     | 
| 
      
 115 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/wgit/version.rb
    ADDED
    
    
| 
         @@ -0,0 +1,134 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #!/usr/bin/env ruby
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative 'crawler'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require_relative 'database/database'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # @author Michael Telford
         
     | 
| 
      
 7 
     | 
    
         
            +
            module Wgit

              # Convenience method to crawl the World Wide Web.
              # Creates a Wgit::Database and a Wgit::WebCrawler and starts the crawl.
              #
              # max_sites_to_crawl - max number of sites to crawl; any negative value
              #                      (the default is -1) means unrestricted.
              # max_data_size      - crawling stops once db.size reaches this value.
              #                      The default is 1048576000 (roughly 1GB, presumably
              #                      in bytes - confirm against Database#size).
              def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1_048_576_000)
                db = Wgit::Database.new
                web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
                web_crawler.crawl_the_web
              end

              # Class which sets up a crawler and saves the indexed
              # docs to a database. Will crawl the web forever if you let it :-)
              class WebCrawler
                attr_accessor :max_sites_to_crawl, :max_data_size
                attr_reader :crawler, :db

                # database           - object used for all persistence; it is sent #size,
                #                      #uncrawled_urls, #update and #insert.
                # max_sites_to_crawl - negative means unrestricted.
                # max_data_size      - upper bound compared against database.size.
                def initialize(database,
                               max_sites_to_crawl = -1,
                               max_data_size = 1_048_576_000)
                  @crawler = Wgit::Crawler.new
                  @db = database
                  @max_sites_to_crawl = max_sites_to_crawl
                  @max_data_size = max_data_size
                end

                # Retrieves url's from the database and recursively crawls each site
                # storing their internal pages into the database and adding their external
                # url's to be crawled at a later date.
                #
                # Progress is reported via puts. Returns nil.
                # Raises a RuntimeError if a url can't be flagged as crawled in the
                # database (db.update must return 1).
                def crawl_the_web
                  if max_sites_to_crawl < 0
                    puts "Crawling until the database has been filled or it runs out of " \
                         "urls to crawl (which might be never)."
                  end
                  loop_count = 0

                  while keep_crawling?(loop_count)
                    puts "Current database size: #{db.size}"
                    crawler.urls = db.uncrawled_urls

                    if crawler.urls.empty?
                      puts "No urls to crawl, exiting."
                      break
                    end
                    puts "Starting crawl loop for: #{crawler.urls}"

                    docs_count = 0
                    urls_count = 0

                    crawler.urls.each do |url|
                      # Re-check the limits before every site, not just every batch.
                      unless keep_crawling?(loop_count)
                        puts "Reached max number of sites to crawl or database " \
                             "capacity, exiting."
                        return
                      end
                      loop_count += 1

                      # The url is flagged before the crawl starts, as it always has been.
                      mark_as_crawled(url)
                      site_docs, site_urls = crawl_and_store_site(url)
                      docs_count += site_docs
                      urls_count += site_urls
                    end

                    puts "Crawled and saved docs for #{docs_count} url(s) overall for " \
                         "this iteration."
                    puts "Found and saved #{urls_count} external url(s) for the next " \
                         "iteration."
                  end
                end

                private

                # Keep crawling or not based on DB size and current loop iteration.
                def keep_crawling?(loop_count)
                  return false if db.size >= max_data_size
                  # If max_sites_to_crawl is -1 for example then crawl away.
                  return true if max_sites_to_crawl < 0

                  loop_count < max_sites_to_crawl
                end

                # Flags the url as crawled and persists that flag.
                # Raises a RuntimeError (with a message, unlike the previous bare raise)
                # unless the database reports exactly one updated record.
                def mark_as_crawled(url)
                  url.crawled = true
                  return if db.update(url) == 1

                  raise "Failed to mark url as crawled in the database: #{url}"
                end

                # Crawls a single site, saving each non empty doc and every external link.
                # Returns [number of docs saved, number of external urls saved].
                def crawl_and_store_site(url)
                  saved_docs = 0
                  ext_links = crawler.crawl_site(url) do |doc|
                    saved_docs += 1 if !doc.empty? && write_doc_to_db(doc)
                  end

                  saved_urls = write_urls_to_db(ext_links)
                  puts "Crawled and saved #{saved_docs} docs for the " \
                       "site: #{url}"
                  [saved_docs, saved_urls]
                end

                # The unique url index on the documents collection prevents duplicate
                # inserts. Returns true if the doc was inserted, false if the insert
                # failed with Mongo::Error::OperationFailure.
                def write_doc_to_db(doc)
                  db.insert(doc)
                  puts "Saved document for url: #{doc.url}"
                  true
                rescue Mongo::Error::OperationFailure
                  puts "Document already exists: #{doc.url}"
                  false
                end

                # The unique url index on the urls collection prevents duplicate inserts.
                # Returns the number of urls actually inserted; anything which doesn't
                # respond to :each (e.g. nil) counts as zero urls.
                def write_urls_to_db(urls)
                  return 0 unless urls.respond_to?(:each)

                  count = 0
                  urls.each do |url|
                    begin
                      db.insert(url)
                      count += 1
                      puts "Inserted url: #{url}"
                    rescue Mongo::Error::OperationFailure
                      puts "Url already exists: #{url}"
                    end
                  end
                  count
                end
              end
            end
         
     | 
| 
      
 131 
     | 
    
         
            +
             
     | 
| 
      
 132 
     | 
    
         
            +
            # Start an unrestricted crawl when this file is executed directly
            # (as opposed to being required by another file).
            Wgit.crawl_the_web if $PROGRAM_NAME == __FILE__
         
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,62 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: wgit
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.1
         
     | 
| 
      
 5 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 6 
     | 
    
         
            +
            authors:
         
     | 
| 
      
 7 
     | 
    
         
            +
            - Michael Telford
         
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 9 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 10 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2016-03-07 00:00:00.000000000 Z
         
     | 
| 
      
 12 
     | 
    
         
            +
            dependencies: []
         
     | 
| 
      
 13 
     | 
    
         
            +
            description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves their
         
     | 
| 
      
 14 
     | 
    
         
            +
              page contents for later use. Also included in this package is a means to search
         
     | 
| 
      
 15 
     | 
    
         
            +
              indexed documents stored in a database. Therefore this library provides the main
         
     | 
| 
      
 16 
     | 
    
         
            +
              components of a WWW search engine. You can also use Wgit to copy entire website's
         
     | 
| 
      
 17 
     | 
    
         
            +
              HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
         
     | 
| 
      
 18 
     | 
    
         
            +
              you to easily pull out the parts of a webpage that are important to you, the CSS
         
     | 
| 
      
 19 
     | 
    
         
            +
              or JS links for example.
         
     | 
| 
      
 20 
     | 
    
         
            +
            email: michael.telford@live.com
         
     | 
| 
      
 21 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 22 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 23 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 24 
     | 
    
         
            +
            files:
         
     | 
| 
      
 25 
     | 
    
         
            +
            - "./lib/wgit.rb"
         
     | 
| 
      
 26 
     | 
    
         
            +
            - "./lib/wgit/assertable.rb"
         
     | 
| 
      
 27 
     | 
    
         
            +
            - "./lib/wgit/core_ext.rb"
         
     | 
| 
      
 28 
     | 
    
         
            +
            - "./lib/wgit/crawler.rb"
         
     | 
| 
      
 29 
     | 
    
         
            +
            - "./lib/wgit/database/database.rb"
         
     | 
| 
      
 30 
     | 
    
         
            +
            - "./lib/wgit/database/model.rb"
         
     | 
| 
      
 31 
     | 
    
         
            +
            - "./lib/wgit/database/mongo_connection_details.rb"
         
     | 
| 
      
 32 
     | 
    
         
            +
            - "./lib/wgit/document.rb"
         
     | 
| 
      
 33 
     | 
    
         
            +
            - "./lib/wgit/url.rb"
         
     | 
| 
      
 34 
     | 
    
         
            +
            - "./lib/wgit/utils.rb"
         
     | 
| 
      
 35 
     | 
    
         
            +
            - "./lib/wgit/version.rb"
         
     | 
| 
      
 36 
     | 
    
         
            +
            - "./lib/wgit/web_crawler.rb"
         
     | 
| 
      
 37 
     | 
    
         
            +
            homepage: http://rubygems.org/gems/wgit
         
     | 
| 
      
 38 
     | 
    
         
            +
            licenses:
         
     | 
| 
      
 39 
     | 
    
         
            +
            - MIT
         
     | 
| 
      
 40 
     | 
    
         
            +
            metadata:
         
     | 
| 
      
 41 
     | 
    
         
            +
              allowed_push_host: https://rubygems.org
         
     | 
| 
      
 42 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 43 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 44 
     | 
    
         
            +
            require_paths:
         
     | 
| 
      
 45 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 46 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 47 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 48 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 49 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 50 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 51 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 52 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 53 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 54 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 55 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 56 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 57 
     | 
    
         
            +
            rubyforge_project: 
         
     | 
| 
      
 58 
     | 
    
         
            +
            rubygems_version: 2.4.5
         
     | 
| 
      
 59 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 60 
     | 
    
         
            +
            specification_version: 4
         
     | 
| 
      
 61 
     | 
    
         
            +
            summary: Wgit is wget on steroids with an easy to use API.
         
     | 
| 
      
 62 
     | 
    
         
            +
            test_files: []
         
     |