wgit 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/crawler.rb CHANGED
@@ -1,188 +1,188 @@
-require_relative 'url'
-require_relative 'document'
-require_relative 'utils'
-require_relative 'assertable'
-require 'net/http' # requires 'uri'
-
-module Wgit
-
-  # The Crawler class provides a means of crawling web based URL's, turning
-  # their HTML into Wgit::Document's.
-  class Crawler
-    include Assertable
-
-    # The urls to crawl.
-    attr_reader :urls
-
-    # The docs of the crawled @urls.
-    attr_reader :docs
-
-    # Initializes the Crawler by setting the @urls and @docs.
-    #
-    # @param urls [*Wgit::Url] The URLs to crawl.
-    def initialize(*urls)
-      self.[](*urls)
-      @docs = []
-    end
-
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    def urls=(urls)
-      @urls = []
-      Wgit::Utils.each(urls) { |url| add_url(url) }
-    end
-
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [*Wgit::Url] The URLs to crawl.
-    def [](*urls)
-      # If urls is nil then add_url (when called later) will set @urls = []
-      # so we do nothing here.
-      if not urls.nil?
-        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
-        # outer array is bogus so we use the inner one only.
-        if urls.is_a?(Enumerable) &&
-           urls.length == 1 &&
-           urls.first.is_a?(Enumerable)
-          urls = urls.first
-        end
-
-        # Here we call urls= method using self because the param name is also
-        # urls which conflicts.
-        self.urls = urls
-      end
-    end
-
-    # Adds the url to this Crawler's @urls.
-    #
-    # @param url [Wgit::Url] A URL to crawl.
-    def <<(url)
-      add_url(url)
-    end
-
-    # Crawls individual urls, not entire sites.
-    #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    # @yield [doc] If provided, the block is given each crawled
-    # Document. Otherwise each doc is added to @docs which can be accessed
-    # by Crawler#docs after this method returns.
-    # @return [Wgit::Document] The last Document crawled.
-    def crawl_urls(urls = @urls, &block)
-      raise "No urls to crawl" unless urls
-      @docs = []
-      doc = nil
-      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
-      doc ? doc : @docs.last
-    end
-
-    # Crawl the url and return the response document or nil.
-    #
-    # @param url [Wgit::Document] The URL to crawl.
-    # @yield [doc] The crawled HTML Document regardless if the
-    # crawl was successful or not. Therefore, the Document#url can be used.
-    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
-    # crawl was unsuccessful.
-    def crawl_url(url = @urls.first)
-      assert_type(url, Wgit::Url)
-      markup = fetch(url)
-      url.crawled = true
-      doc = Wgit::Document.new(url, markup)
-      yield(doc) if block_given?
-      doc.empty? ? nil : doc
-    end
-
-    # Crawls an entire site by recursively going through its internal_links.
-    #
-    # @param base_url [Wgit::Url] The base URL of the website to be crawled.
-    # @yield [doc] Given each crawled Document/page of the site.
-    # A block is the only way to interact with each crawled Document.
-    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-    # from all of the site's pages or nil if the base_url could not be
-    # crawled successfully.
-    def crawl_site(base_url = @urls.first, &block)
-      assert_type(base_url, Wgit::Url)
-
-      doc = crawl_url(base_url, &block)
-      return nil if doc.nil?
-
-      path = base_url.path.empty? ? '/' : base_url.path
-      crawled_urls = [path]
-      external_urls = doc.external_links
-      internal_urls = doc.internal_links
-
-      return doc.external_links.uniq if internal_urls.empty?
-
-      loop do
-        internal_urls.uniq!
-
-        links = internal_urls - crawled_urls
-        break if links.empty?
-
-        links.each do |link|
-          doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
-          crawled_urls << link
-          next if doc.nil?
-          internal_urls.concat(doc.internal_links)
-          external_urls.concat(doc.external_links)
-        end
-      end
-
-      external_urls.uniq
-    end
-
-    private
-
-    # Add the document to the @docs array for later processing or let the block
-    # process it here and now.
-    def handle_crawl_block(url, &block)
-      if block_given?
-        crawl_url(url, &block)
-      else
-        @docs << crawl_url(url)
-        nil
-      end
-    end
-
-    # The fetch method performs a HTTP GET to obtain the HTML document.
-    # Invalid urls or any HTTP response that doesn't return a HTML body will be
-    # ignored and nil will be returned. Otherwise, the HTML is returned.
-    def fetch(url)
-      response = resolve(url)
-      response.body.empty? ? nil : response.body
-    rescue Exception => ex
-      Wgit.logger.debug(
-        "Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
-      )
-      nil
-    end
-
-    # The resolve method performs a HTTP GET to obtain the HTML document.
-    # A certain amount of redirects will be followed by default before raising
-    # an exception. Redirects can be disabled by setting `redirect_limit: 1`.
-    # The Net::HTTPResponse will be returned.
-    def resolve(url, redirect_limit: 5)
-      redirect_count = -1
-      begin
-        raise "Too many redirects" if redirect_count >= redirect_limit
-        redirect_count += 1
-
-        response = Net::HTTP.get_response(URI(url))
-        location = Wgit::Url.new(response.fetch('location', ''))
-        if not location.empty?
-          url = location.is_relative? ? url.to_base.concat(location) : location
-        end
-      end while response.is_a?(Net::HTTPRedirection)
-      response
-    end
-
-    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
-    def add_url(url)
-      @urls = [] if @urls.nil?
-      @urls << Wgit::Url.new(url)
-    end
-
-    alias :crawl :crawl_urls
-    alias :crawl_r :crawl_site
-  end
-end
+require_relative 'url'
+require_relative 'document'
+require_relative 'utils'
+require_relative 'assertable'
+require 'net/http' # requires 'uri'
+
+module Wgit
+
+  # The Crawler class provides a means of crawling web based URL's, turning
+  # their HTML into Wgit::Document's.
+  class Crawler
+    include Assertable
+
+    # The urls to crawl.
+    attr_reader :urls
+
+    # The docs of the crawled @urls.
+    attr_reader :docs
+
+    # Initializes the Crawler by setting the @urls and @docs.
+    #
+    # @param urls [*Wgit::Url] The URLs to crawl.
+    def initialize(*urls)
+      self.[](*urls)
+      @docs = []
+    end
+
+    # Sets this Crawler's @urls.
+    #
+    # @param urls [Array<Wgit::Url>] The URLs to crawl.
+    def urls=(urls)
+      @urls = []
+      Wgit::Utils.each(urls) { |url| add_url(url) }
+    end
+
+    # Sets this Crawler's @urls.
+    #
+    # @param urls [*Wgit::Url] The URLs to crawl.
+    def [](*urls)
+      # If urls is nil then add_url (when called later) will set @urls = []
+      # so we do nothing here.
+      if not urls.nil?
+        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
+        # outer array is bogus so we use the inner one only.
+        if urls.is_a?(Enumerable) &&
+           urls.length == 1 &&
+           urls.first.is_a?(Enumerable)
+          urls = urls.first
+        end
+
+        # Here we call urls= method using self because the param name is also
+        # urls which conflicts.
+        self.urls = urls
+      end
+    end
+
+    # Adds the url to this Crawler's @urls.
+    #
+    # @param url [Wgit::Url] A URL to crawl.
+    def <<(url)
+      add_url(url)
+    end
+
+    # Crawls individual urls, not entire sites.
+    #
+    # @param urls [Array<Wgit::Url>] The URLs to crawl.
+    # @yield [doc] If provided, the block is given each crawled
+    # Document. Otherwise each doc is added to @docs which can be accessed
+    # by Crawler#docs after this method returns.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl_urls(urls = @urls, &block)
+      raise "No urls to crawl" unless urls
+      @docs = []
+      doc = nil
+      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
+      doc ? doc : @docs.last
+    end
+
+    # Crawl the url and return the response document or nil.
+    #
+    # @param url [Wgit::Document] The URL to crawl.
+    # @yield [doc] The crawled HTML Document regardless if the
+    # crawl was successful or not. Therefore, the Document#url can be used.
+    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
+    # crawl was unsuccessful.
+    def crawl_url(url = @urls.first)
+      assert_type(url, Wgit::Url)
+      markup = fetch(url)
+      url.crawled = true
+      doc = Wgit::Document.new(url, markup)
+      yield(doc) if block_given?
+      doc.empty? ? nil : doc
+    end
+
+    # Crawls an entire site by recursively going through its internal_links.
+    #
+    # @param base_url [Wgit::Url] The base URL of the website to be crawled.
+    # @yield [doc] Given each crawled Document/page of the site.
+    # A block is the only way to interact with each crawled Document.
+    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
+    # from all of the site's pages or nil if the base_url could not be
+    # crawled successfully.
+    def crawl_site(base_url = @urls.first, &block)
+      assert_type(base_url, Wgit::Url)
+
+      doc = crawl_url(base_url, &block)
+      return nil if doc.nil?
+
+      path = base_url.path.nil? ? '/' : base_url.path
+      crawled_urls = [path]
+      external_urls = doc.external_links
+      internal_urls = doc.internal_links
+
+      return doc.external_links.uniq if internal_urls.empty?
+
+      loop do
+        internal_urls.uniq!
+
+        links = internal_urls - crawled_urls
+        break if links.empty?
+
+        links.each do |link|
+          doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
+          crawled_urls << link
+          next if doc.nil?
+          internal_urls.concat(doc.internal_links)
+          external_urls.concat(doc.external_links)
+        end
+      end
+
+      external_urls.uniq
+    end
+
+    private
+
+    # Add the document to the @docs array for later processing or let the block
+    # process it here and now.
+    def handle_crawl_block(url, &block)
+      if block_given?
+        crawl_url(url, &block)
+      else
+        @docs << crawl_url(url)
+        nil
+      end
+    end
+
+    # The fetch method performs a HTTP GET to obtain the HTML document.
+    # Invalid urls or any HTTP response that doesn't return a HTML body will be
+    # ignored and nil will be returned. Otherwise, the HTML is returned.
+    def fetch(url)
+      response = resolve(url)
+      response.body.empty? ? nil : response.body
+    rescue Exception => ex
+      Wgit.logger.debug(
+        "Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
+      )
+      nil
+    end
+
+    # The resolve method performs a HTTP GET to obtain the HTML document.
+    # A certain amount of redirects will be followed by default before raising
+    # an exception. Redirects can be disabled by setting `redirect_limit: 1`.
+    # The Net::HTTPResponse will be returned.
+    def resolve(url, redirect_limit: 5)
+      redirect_count = -1
+      begin
+        raise "Too many redirects" if redirect_count >= redirect_limit
+        redirect_count += 1
+
+        response = Net::HTTP.get_response(URI(url))
+        location = Wgit::Url.new(response.fetch('location', ''))
+        if not location.empty?
+          url = location.is_relative? ? url.to_base.concat(location) : location
+        end
+      end while response.is_a?(Net::HTTPRedirection)
+      response
+    end
+
+    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
+    def add_url(url)
+      @urls = [] if @urls.nil?
+      @urls << Wgit::Url.new(url)
+    end
+
+    alias :crawl :crawl_urls
+    alias :crawl_r :crawl_site
+  end
+end
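
The only functional change in crawler.rb is inside crawl_site: the guard on the seed path now tests base_url.path.nil? rather than base_url.path.empty?, which suggests Wgit::Url#path can now return nil (instead of an empty String) when the URL has no path component; every other line appears unchanged apart from whitespace. The snippet below is a minimal usage sketch of crawl_site around that code path; it is not taken from the gem's documentation and assumes the gem is installed and the example host is reachable.

    require 'wgit'

    crawler = Wgit::Crawler.new
    base = Wgit::Url.new('http://example.com') # a base URL with no path component

    # Crawl the whole site; each page is yielded as a Wgit::Document and the
    # unique external links found across all pages are returned (or nil if the
    # base URL itself could not be crawled).
    external_links = crawler.crawl_site(base) do |doc|
      puts "Crawled #{doc.url} (#{doc.internal_links.length} internal links)"
    end

    puts "Collected #{external_links ? external_links.length : 0} external links"

The hunks that follow are from the Wgit::Database class; apart from trailing-whitespace cleanup, the one functional addition is a new alias shown in the final hunk.
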
@@ -8,7 +8,7 @@ require 'mongo'
 
 module Wgit
 
-  # Class modeling a DB connection and CRUD operations for the Url and
+  # Class modeling a DB connection and CRUD operations for the Url and
   # Document collections.
   class Database
     include Assertable
@@ -19,7 +19,7 @@ module Wgit
     def initialize
       conn_details = Wgit::CONNECTION_DETAILS
       if conn_details.empty?
-        raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
+        raise "Wgit::CONNECTION_DETAILS must be defined and include :host,
 :port, :db, :uname, :pword for a database connection to be established."
       end
 
@@ -29,14 +29,14 @@ module Wgit
       Mongo::Logger.logger.level = Logger::ERROR
 
       address = "#{conn_details[:host]}:#{conn_details[:port]}"
-      @@client = Mongo::Client.new([address],
+      @@client = Mongo::Client.new([address],
                                    database: conn_details[:db],
                                    user: conn_details[:uname],
                                    password: conn_details[:pword])
     end
 
     ### Create Data ###
-
+
     # Insert one or more Url or Document objects into the DB.
     #
     # @param data [Hash, Enumerable<Hash>] Hash(es) returned from
@@ -57,9 +57,9 @@ module Wgit
         raise "data is not in the correct format (all Url's or Document's)"
       end
     end
-
+
     ### Retrieve Data ###
-
+
     # Returns Url records from the DB. All Urls are sorted by date_added
     # ascending, in other words the first url returned is the first one that
     # was inserted into the DB.
@@ -71,18 +71,18 @@ module Wgit
     # @return [Array<Wgit::Url>] The Urls obtained from the DB.
     def urls(crawled = nil, limit = 0, skip = 0)
       crawled.nil? ? query = {} : query = { crawled: crawled }
-
+
       sort = { date_added: 1 }
       results = retrieve(:urls, query, sort, {}, limit, skip)
       return [] if results.count < 1
-
+
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |url_doc| Wgit::Url.new(url_doc) }
       results.each { |url| yield(url) } if block_given?
-
+
       results
     end
-
+
     # Returns Url records that have been crawled.
     #
     # @param limit [Integer] The max number of Url's to return. 0 returns all.
@@ -127,20 +127,20 @@ module Wgit
     def search(query, whole_sentence = false, limit = 10, skip = 0)
       query.strip!
       query.replace("\"" + query + "\"") if whole_sentence
-
+
       # The sort_proj sorts based on the most search hits.
       # We use the sort_proj hash as both a sort and a projection below.
       # :$caseSensitive => case_sensitive, 3.2+ only.
       sort_proj = { score: { :$meta => "textScore" } }
       query = { :$text => { :$search => query } }
-
+
       results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
       return [] if results.count < 1 # respond_to? :empty? == false
-
+
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
       results.each { |doc| yield(doc) } if block_given?
-
+
       results
     end
 
@@ -150,7 +150,7 @@ module Wgit
     def stats
       @@client.command(dbStats: 0).documents[0]
     end
-
+
     # Returns the current size of the database.
     #
     # @return [Integer] The current size of the DB.
@@ -201,7 +201,7 @@ module Wgit
     end
 
     ### Update Data ###
-
+
     # Update a Url or Document object in the DB.
     #
     # @param data [Hash, Enumerable<Hash>] Hash(es) returned from
@@ -254,7 +254,7 @@ module Wgit
       end
       create(:urls, url_or_urls)
     end
-
+
     # Insert one or more Document objects into the DB.
     def insert_docs(doc_or_docs)
       unless doc_or_docs.respond_to?(:map)
@@ -270,7 +270,7 @@ module Wgit
       end
       create(:documents, doc_or_docs)
     end
-
+
     # Create/insert one or more Url or Document records into the DB.
     def create(collection, data)
       assert_type(data, [Hash, Array])
@@ -324,9 +324,9 @@ module Wgit
       update = { "$set" => doc_hash }
       _update(true, :documents, selection, update)
     end
-
+
     # Update one or more Url or Document records in the DB.
-    # NOTE: The Model.common_update_data should be merged in the calling
+    # NOTE: The Model.common_update_data should be merged in the calling
     # method as the update param can be bespoke due to its nature.
     def _update(single, collection, selection, update)
       assert_arr_types([selection, update], Hash)
@@ -338,12 +338,13 @@ module Wgit
       raise "DB write (update) failed" unless write_succeeded?(result)
       result.n
     end
-
+
     alias :count :size
     alias :length :size
     alias :num_documents :num_docs
     alias :document? :doc?
     alias :insert_url :insert_urls
     alias :insert_doc :insert_docs
+    alias :num_objects :num_records
   end
 end
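
The only non-whitespace change to the Database class is the new num_objects alias for the existing num_records method, added alongside the other aliases. A minimal sketch of how it might be called, assuming Wgit::CONNECTION_DETAILS has already been populated with the :host, :port, :db, :uname and :pword keys and a MongoDB instance is reachable:

    require 'wgit'

    db = Wgit::Database.new

    # num_objects is now interchangeable with num_records: the combined count
    # of url and document records stored in the database.
    puts db.num_records
    puts db.num_objects # same value via the new alias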