wgit 0.0.1 → 0.0.2

@@ -0,0 +1,235 @@
+ require_relative 'crawler'
+ require_relative 'database/database'
+
+ module Wgit
+
+ # Convenience method to index the World Wide Web using
+ # Wgit::Indexer#index_the_web.
+ #
+ # Retrieves uncrawled URLs from the database and recursively crawls each
+ # site, storing their internal pages into the database and adding their
+ # external URLs to be crawled at a later date. Prints info about the crawl
+ # to STDOUT as it goes along.
+ #
+ # @param max_sites_to_crawl [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data_size [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def self.index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+ db = Wgit::Database.new
+ indexer = Wgit::Indexer.new(db)
+ indexer.index_the_web(max_sites_to_crawl, max_data_size)
+ end
+
+ # Convenience method to index a single website using
+ # Wgit::Indexer#index_this_site.
+ #
+ # Crawls a single website's pages and stores them in the database.
+ # There is no max download limit so be careful which sites you index.
+ #
+ # @param url [Wgit::Url, String] The base Url of the website to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external Urls into the database.
+ # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+ # is inserted into the database, allowing for prior manipulation.
+ # @return [Integer] The total number of pages crawled within the website.
+ def self.index_this_site(url, insert_externals = true, &block)
+ url = Wgit::Url.new url
+ db = Wgit::Database.new
+ indexer = Wgit::Indexer.new(db)
+ indexer.index_this_site(url, insert_externals, &block)
+ end
+
+ # Performs a search of the database's indexed documents and pretty prints
+ # the results. See Wgit::Database#search for details of the search.
+ #
+ # @param query [String] The text query to search with.
+ # @param whole_sentence [Boolean] Whether to search for the query as one
+ # whole sentence or for each word separately.
+ # @param limit [Integer] The max number of results to return.
+ # @param skip [Integer] The number of DB records to skip.
+ # @param sentence_length [Integer] The max length of each result's text
+ # snippet.
+ # @yield [doc] Given each search result (Wgit::Document).
+ def self.indexed_search(query, whole_sentence = false, limit = 10,
+ skip = 0, sentence_length = 80, &block)
+ db = Wgit::Database.new
+ results = db.search(query, whole_sentence, limit, skip, &block)
+ Wgit::Utils.printf_search_results(results, query, false, sentence_length)
+ end
+
+ # Class which sets up a crawler and saves the indexed docs to a database.
+ class Indexer
+
+ # The crawler used to scrape the WWW.
+ attr_reader :crawler
+
+ # The database instance used to store Urls and Documents.
+ attr_reader :db
+
+ # Initialize the Indexer.
+ #
+ # @param database [Wgit::Database] The database instance (already
+ # initialized with the correct connection details etc).
+ def initialize(database)
+ @crawler = Wgit::Crawler.new
+ @db = database
+ end
+
+ # Retrieves uncrawled URLs from the database and recursively crawls each
+ # site, storing their internal pages into the database and adding their
+ # external URLs to be crawled at a later date. Prints info about the crawl
+ # to STDOUT as it goes along.
+ #
+ # @param max_sites_to_crawl [Integer] The number of separate and whole
+ # websites to be crawled before the method exits. Defaults to -1 which
+ # means the crawl will occur until manually stopped (Ctrl+C etc).
+ # @param max_data_size [Integer] The maximum amount of bytes that will be
+ # scraped from the web (default is 1GB). Note that this value is used to
+ # determine when to stop crawling; it's not a guarantee of the max data
+ # that will be obtained.
+ def index_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+ if max_sites_to_crawl < 0
+ puts "Indexing until the database has been filled or it runs out of \
+ urls to crawl (which might be never)."
+ end
+ site_count = 0
+
+ while keep_crawling?(site_count, max_sites_to_crawl, max_data_size) do
+ puts "Current database size: #{@db.size}"
+ @crawler.urls = @db.uncrawled_urls
+
+ if @crawler.urls.empty?
+ puts "No urls to crawl, exiting."
+ return
+ end
+ puts "Starting crawl loop for: #{@crawler.urls}"
+
+ docs_count = 0
+ urls_count = 0
+
+ @crawler.urls.each do |url|
+ unless keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+ puts "Reached max number of sites to crawl or database \
+ capacity, exiting."
+ return
+ end
+ site_count += 1
+
+ url.crawled = true
+ raise unless @db.update(url) == 1
+
+ site_docs_count = 0
+ ext_links = @crawler.crawl_site(url) do |doc|
+ unless doc.empty?
+ if write_doc_to_db(doc)
+ docs_count += 1
+ site_docs_count += 1
+ end
+ end
+ end
+
+ urls_count += write_urls_to_db(ext_links)
+ puts "Crawled and saved #{site_docs_count} docs for the \
+ site: #{url}"
+ end
+
+ puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+ this iteration."
+ puts "Found and saved #{urls_count} external url(s) for the next \
+ iteration."
+ end
+ end
+
+ # Crawls a single website's pages and stores them in the database.
+ # There is no max download limit so be careful which sites you index.
+ # Prints info about the crawl to STDOUT as it goes along.
+ #
+ # @param url [Wgit::Url] The base Url of the website to crawl.
+ # @param insert_externals [Boolean] Whether or not to insert the website's
+ # external Urls into the database.
+ # @yield [doc] Given the Wgit::Document of each crawled web page, before it
+ # is inserted into the database, allowing for prior manipulation. Return
+ # nil or false from the block to prevent the document from being saved
+ # into the database.
+ # @return [Integer] The total number of webpages/documents indexed.
+ def index_this_site(url, insert_externals = true)
+ total_pages_indexed = 0
+
+ ext_urls = @crawler.crawl_site(url) do |doc|
+ result = true
+ if block_given?
+ result = yield(doc)
+ end
+
+ if result
+ if write_doc_to_db(doc)
+ total_pages_indexed += 1
+ puts "Crawled and saved internal page: #{doc.url}"
+ end
+ end
+ end
+
+ url.crawled = true
+ if !@db.url?(url)
+ @db.insert(url)
+ else
+ @db.update(url)
+ end
+
+ if insert_externals
+ write_urls_to_db(ext_urls)
+ puts "Found and saved #{ext_urls.length} external url(s)"
+ end
+
+ puts "Crawled and saved #{total_pages_indexed} docs for the \
+ site: #{url}"
+
+ total_pages_indexed
+ end
+
+ private
+
+ # Keep crawling or not based on DB size and current loop iteration.
+ def keep_crawling?(site_count, max_sites_to_crawl, max_data_size)
+ return false if @db.size >= max_data_size
+ # If max_sites_to_crawl is -1 for example then crawl away.
+ if max_sites_to_crawl < 0
+ true
+ else
+ site_count < max_sites_to_crawl
+ end
+ end
+
+ # The unique url index on the documents collection prevents duplicate
+ # inserts.
+ def write_doc_to_db(doc)
+ @db.insert(doc)
+ puts "Saved document for url: #{doc.url}"
+ true
+ rescue Mongo::Error::OperationFailure
+ puts "Document already exists: #{doc.url}"
+ false
+ end
+
+ # The unique url index on the urls collection prevents duplicate inserts.
+ def write_urls_to_db(urls)
+ count = 0
+ if urls.respond_to?(:each)
+ urls.each do |url|
+ begin
+ @db.insert(url)
+ count += 1
+ puts "Inserted url: #{url}"
+ rescue Mongo::Error::OperationFailure
+ puts "Url already exists: #{url}"
+ end
+ end
+ end
+ count
+ end
+ end
+ end
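
The new top-level indexing API above can be exercised with a few lines of Ruby. Below is a minimal sketch, assuming the gem is installed, `require 'wgit'` loads it, and Wgit::Database.new can reach a correctly configured MongoDB instance; the example URL and query are placeholders only. Note that the block given to index_this_site must return a truthy value, otherwise the yielded document is not saved (see Indexer#index_this_site above).

    require 'wgit'

    # Crawl and index a single site, inspecting each page before it is saved.
    Wgit.index_this_site("http://www.example.com") do |doc|
      puts "Indexing: #{doc.url}"
      true # A truthy return value allows the document to be written to the DB.
    end

    # Search the indexed documents and pretty print the matching results.
    Wgit.indexed_search("example query")

    # Alternatively, crawl the wider web until 5 whole sites have been indexed.
    Wgit.index_the_web(5)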
@@ -1,140 +1,218 @@
  require_relative 'utils'
+ require_relative 'assertable'
  require 'uri'

  module Wgit

- # @author Michael Telford
  # Class modeling a web based URL.
- # Can be an internal link e.g. "about.html"
- # or a full URL e.g. "http://www.google.co.uk".
+ # Can be an internal/relative link e.g. "about.html" or a full URL
+ # e.g. "http://www.google.co.uk". Is a subclass of String and uses 'uri'
+ # internally.
  class Url < String
- attr_accessor :crawled, :date_crawled
+ include Assertable

- def initialize(url_or_doc, crawled = false, date_crawled = nil)
- if (url_or_doc.is_a?(String))
- url = url_or_doc
- else
- # Init from a mongo collection document.
- url = url_or_doc[:url]
- crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
- date_crawled = url_or_doc[:date_crawled]
- end
- @uri = URI(url)
- @crawled = crawled
- @date_crawled = date_crawled
- super(url)
- end
-
- def self.validate(url)
- if Wgit::Url.relative_link?(url)
- raise "Invalid url (or a relative link): #{url}"
- end
- unless url.start_with?("http://") or url.start_with?("https://")
- raise "Invalid url (missing protocol prefix): #{url}"
- end
- if URI.regexp.match(url).nil?
- raise "Invalid url: #{url}"
- end
- end
-
- def self.valid?(url)
- Wgit::Url.validate(url)
- true
- rescue
- false
- end
-
- # Modifies the receiver url by prefixing it with a protocol.
- # Returns the url whether its been modified or not.
- def self.prefix_protocol(url, https = false)
- unless url.start_with?("http://") or url.start_with?("https://")
- if https
- url.replace("https://#{url}")
- else
- url.replace("http://#{url}")
- end
- end
- url
- end
-
- # URI.split("http://www.google.co.uk/about.html") returns the following:
- # array[2]: "www.google.co.uk", array[5]: "/about.html".
- # This means that all external links in a page are expected to have a
- # protocol prefix e.g. "http://", otherwise the link is treated as an
- # internal link (regardless of whether it is valid or not).
- def self.relative_link?(link)
- link_segs = URI.split(link)
- if not link_segs[2].nil? and not link_segs[2].empty?
- false
- elsif not link_segs[5].nil? and not link_segs[5].empty?
- true
- else
- raise "Invalid link: #{link}"
- end
- end
-
- def self.concat(host, link)
- url = host
- url.chop! if url.end_with?("/")
- link = link[1..-1] if link.start_with?("/")
- Wgit::Url.new(url + "/" + link)
- end
-
- def relative_link?
- Wgit::Url.relative_link?(self)
- end
-
- def valid?
- Wgit::Url.valid?(self)
+ # Whether or not the Url has been crawled.
+ attr_accessor :crawled
+
+ # The date on which the Url was crawled.
+ attr_accessor :date_crawled
+
+ # Initializes a new instance of Wgit::Url which represents a web based
+ # HTTP URL.
+ #
+ # @param url_or_obj [String, Object#fetch#[]] Either a String based
+ # URL or an object representing a Database record e.g. a MongoDB
+ # document/object.
+ # @param crawled [Boolean] Whether or not the HTML of the URL's web
+ # page has been scraped.
+ # @param date_crawled [Time] Should only be provided if crawled is
+ # true. A suitable object can be returned from
+ # Wgit::Utils.time_stamp.
+ # @raise [RuntimeError] If url_or_obj is an Object with missing methods.
+ def initialize(url_or_obj, crawled = false, date_crawled = nil)
+ # Init from a URL String.
+ if url_or_obj.is_a?(String)
+ url = url_or_obj.to_s
+ # Else init from a database object/document.
+ else
+ obj = url_or_obj
+ assert_respond_to(obj, [:fetch, :[]])
+
+ url = obj.fetch("url") # Should always be present.
+ crawled = obj.fetch("crawled", false)
+ date_crawled = obj["date_crawled"]
  end
-
- def concat(link)
- Wgit::Url.concat(self, link)
+
+ @uri = URI(url)
+ @crawled = crawled
+ @date_crawled = date_crawled
+
+ super(url)
+ end
+
+ # Raises an exception if url is not a valid HTTP URL.
+ #
+ # @param url [Wgit::Url, String] The Url to validate.
+ # @raise [RuntimeError] If url is invalid.
+ def self.validate(url)
+ if Wgit::Url.relative_link?(url)
+ raise "Invalid url (or a relative link): #{url}"
  end
-
- def crawled=(bool)
- @crawled = bool
- @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+ unless url.start_with?("http://") or url.start_with?("https://")
+ raise "Invalid url (missing protocol prefix): #{url}"
  end
-
- def to_uri
- @uri
+ if URI.regexp.match(url).nil?
+ raise "Invalid url: #{url}"
  end
-
- def to_url
- self
+ end
+
+ # Determines if the Url is valid or not.
+ #
+ # @param url [Wgit::Url, String] The Url to validate.
+ # @return [Boolean] True if valid, otherwise false.
+ def self.valid?(url)
+ Wgit::Url.validate(url)
+ true
+ rescue
+ false
+ end
+
+ # Modifies the receiver url by prefixing it with a protocol.
+ # Returns the url whether it's been modified or not.
+ # The default protocol prefix is http://.
+ #
+ # @param url [Wgit::Url, String] The url to be prefixed with a protocol.
+ # @param https [Boolean] Whether the protocol prefix is https or http.
+ # @return [Wgit::Url] The url with a protocol prefix.
+ def self.prefix_protocol(url, https = false)
+ unless url.start_with?("http://") or url.start_with?("https://")
+ if https
+ url.replace("https://#{url}")
+ else
+ url.replace("http://#{url}")
+ end
  end
-
- # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
- def to_host
- Wgit::Url.new(@uri.host)
+ url
+ end
+
+ # Returns whether the link is a relative or absolute Url. How it works:
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
+ # array[2]: "www.google.co.uk", array[5]: "/about.html".
+ # This means that all external links in a page are expected to have a
+ # protocol prefix e.g. "http://", otherwise the link is treated as an
+ # internal link (regardless of whether it is valid or not).
+ #
+ # @param link [Wgit::Url, String] The url to test if relative or not.
+ # @return [Boolean] True if relative, false if absolute.
+ # @raise [RuntimeError] If the link is invalid.
+ def self.relative_link?(link)
+ link_segs = URI.split(link)
+ if not link_segs[2].nil? and not link_segs[2].empty?
+ false
+ elsif not link_segs[5].nil? and not link_segs[5].empty?
+ true
+ else
+ raise "Invalid link: #{link}"
  end
+ end
+
+ # Concats the host and link Strings and returns the result.
+ #
+ # @param host [Wgit::Url, String] The Url host.
+ # @param link [Wgit::Url, String] The link to add to the host prefix.
+ # @return [Wgit::Url] host + "/" + link
+ def self.concat(host, link)
+ url = host
+ url.chop! if url.end_with?("/")
+ link = link[1..-1] if link.start_with?("/")
+ Wgit::Url.new(url + "/" + link)
+ end
+
+ # Returns whether self is a relative or absolute Url.
+ # @return [Boolean] True if relative, false if absolute.
+ # @raise [RuntimeError] If the link is invalid.
+ def relative_link?
+ Wgit::Url.relative_link?(self)
+ end
+
+ # Determines if self is a valid Url or not.
+ #
+ # @return [Boolean] True if valid, otherwise false.
+ def valid?
+ Wgit::Url.valid?(self)
+ end
+
+ # Concats self (Url) and the link.
+ #
+ # @param link [Wgit::Url, String] The link to concat with self.
+ # @return [Wgit::Url] self + "/" + link
+ def concat(link)
+ Wgit::Url.concat(self, link)
+ end
+
+ # Sets the @crawled instance var, also setting @date_crawled to the
+ # current time or nil (depending on the bool value).
+ #
+ # @param bool [Boolean] True if self has been crawled, false otherwise.
+ def crawled=(bool)
+ @crawled = bool
+ @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+ end
+
+ # Returns the @uri instance var of this URL.
+ #
+ # @return [URI::HTTP, URI::HTTPS] The URI object of self.
+ def to_uri
+ @uri
+ end

- # URI.split("http://www.google.co.uk/about.html") returns the following:
- # array[0]: "http://", array[2]: "www.google.co.uk".
- # Returns array[0] + array[2] e.g. http://www.google.co.uk.
- def to_base
- if Wgit::Url.relative_link?(self)
- raise "A relative link doesn't have a base URL: #{self}"
- end
- url_segs = URI.split(self)
- if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
- raise "Both a protocol and host are needed: #{self}"
- end
- base = "#{url_segs[0]}://#{url_segs[2]}"
- Wgit::Url.new(base)
+ # Returns self.
+ #
+ # @return [Wgit::Url] This (self) Url.
+ def to_url
+ self
+ end
+
+ # Returns a new Wgit::Url containing just the host of this URL e.g.
+ # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+ #
+ # @return [Wgit::Url] Containing just the host.
+ def to_host
+ Wgit::Url.new(@uri.host)
+ end
+
+ # Returns the base of this URL e.g. the protocol and host combined.
+ # How it works:
+ # URI.split("http://www.google.co.uk/about.html") returns the following:
+ # array[0]: "http://", array[2]: "www.google.co.uk", which we use.
+ #
+ # @return [Wgit::Url] Base of self (Url) e.g. http://www.google.co.uk.
+ def to_base
+ if Wgit::Url.relative_link?(self)
+ raise "A relative link doesn't have a base URL: #{self}"
  end
-
- def to_h
- ignore = [:@uri]
- h = Wgit::Utils.to_h(self, ignore)
- Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+ url_segs = URI.split(self)
+ if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+ raise "Both a protocol and host are needed: #{self}"
  end
-
- alias :to_hash :to_h
- alias :host :to_host
- alias :base :to_base
- alias :internal_link? :relative_link?
- alias :crawled? :crawled
+ base = "#{url_segs[0]}://#{url_segs[2]}"
+ Wgit::Url.new(base)
+ end
+
+ # Returns a Hash containing this Url's instance vars excluding @uri.
+ # Used when storing the URL in a Database e.g. MongoDB etc.
+ #
+ # @return [Hash] self's instance vars as a Hash.
+ def to_h
+ ignore = ["@uri"]
+ h = Wgit::Utils.to_h(self, ignore)
+ Hash[h.to_a.insert(0, ["url", self])] # Insert url at position 0.
+ end
+
+ alias :to_hash :to_h
+ alias :host :to_host
+ alias :base :to_base
+ alias :internal_link? :relative_link?
+ alias :crawled? :crawled
  end
  end
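
The reworked Wgit::Url is easiest to see in use. The following is a rough sketch based only on the methods shown in the hunk above (the google.co.uk address is purely illustrative, and `require 'wgit'` is assumed to load the library):

    require 'wgit'

    url = Wgit::Url.new("http://www.google.co.uk/about.html")

    url.relative_link?   # => false ("about.html" on its own would be true)
    url.valid?           # => true
    url.to_host          # => "www.google.co.uk"
    url.to_base          # => "http://www.google.co.uk"

    # Build a full Url from a host and a relative link.
    Wgit::Url.concat("http://www.google.co.uk", "about.html")
    # => "http://www.google.co.uk/about.html"

    # Marking the Url as crawled also timestamps it via Wgit::Utils.time_stamp.
    url.crawled = true
    url.crawled?         # => true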