wgit 0.7.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
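
The headline change in the `indexer.rb` diff below is that the module-level convenience methods (`Wgit.index_www`, `Wgit.index_site`, `Wgit.index_page`, `Wgit.indexed_search`) are removed and `Wgit::Indexer.new` now defaults both its database and crawler arguments. A minimal migration sketch based only on the signatures shown in this diff; it assumes a reachable MongoDB instance configured via `ENV['WGIT_CONNECTION_STRING']` (the default documented for `Wgit::Database.new` in 0.7.0), and `example.com` is an illustrative URL:

```ruby
require 'wgit'

# 0.7.0 (removed in the indexer.rb diff below):
#   Wgit.index_site('https://example.com', insert_externals: true)

# 0.10.x equivalent, driven through an Indexer instance:
indexer = Wgit::Indexer.new # database defaults to Wgit::Database.new
indexer.index_site('https://example.com', insert_externals: true) do |doc|
  doc # return nil/false instead to skip saving this Wgit::Document
end
```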
data/lib/wgit/indexer.rb
CHANGED

```diff
@@ -4,129 +4,8 @@ require_relative 'crawler'
 require_relative 'database/database'
 
 module Wgit
-  #
-  # Wgit::
-  #
-  # Retrieves uncrawled url's from the database and recursively crawls each
-  # site storing their internal pages into the database and adding their
-  # external url's to be crawled later on. Logs info on the crawl
-  # using Wgit.logger as it goes along.
-  #
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param max_sites [Integer] The number of separate and whole
-  #   websites to be crawled before the method exits. Defaults to -1 which
-  #   means the crawl will occur until manually stopped (Ctrl+C etc).
-  # @param max_data [Integer] The maximum amount of bytes that will be
-  #   scraped from the web (default is 1GB). Note, that this value is used to
-  #   determine when to stop crawling; it's not a guarantee of the max data
-  #   that will be obtained.
-  def self.index_www(
-    connection_string: nil, max_sites: -1, max_data: 1_048_576_000
-  )
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_www(max_sites: max_sites, max_data: max_data)
-  end
-
-  # Convience method to index a single website using
-  # Wgit::Indexer#index_site.
-  #
-  # Crawls a single website's pages and stores them into the database.
-  # There is no max download limit so be careful which sites you index.
-  #
-  # @param url [Wgit::Url, String] The base Url of the website to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @param allow_paths [String, Array<String>] Filters links by selecting
-  #   them if their path `File.fnmatch?` one of allow_paths.
-  # @param disallow_paths [String, Array<String>] Filters links by rejecting
-  #   them if their path `File.fnmatch?` one of disallow_paths.
-  # @yield [doc] Given the Wgit::Document of each crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  # @return [Integer] The total number of pages crawled within the website.
-  def self.index_site(
-    url, connection_string: nil, insert_externals: true,
-    allow_paths: nil, disallow_paths: nil, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_site(
-      url, insert_externals: insert_externals,
-      allow_paths: allow_paths, disallow_paths: disallow_paths, &block
-    )
-  end
-
-  # Convience method to index a single webpage using
-  # Wgit::Indexer#index_page.
-  #
-  # Crawls a single webpage and stores it into the database.
-  # There is no max download limit so be careful of large pages.
-  #
-  # @param url [Wgit::Url, String] The Url of the webpage to crawl.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param insert_externals [Boolean] Whether or not to insert the website's
-  #   external Url's into the database.
-  # @yield [doc] Given the Wgit::Document of the crawled webpage, before it's
-  #   inserted into the database allowing for prior manipulation.
-  def self.index_page(
-    url, connection_string: nil, insert_externals: true, &block
-  )
-    url = Wgit::Url.parse(url)
-    db = Wgit::Database.new(connection_string)
-    indexer = Wgit::Indexer.new(db)
-    indexer.index_page(url, insert_externals: insert_externals, &block)
-  end
-
-  # Performs a search of the database's indexed documents and pretty prints
-  # the results. See Wgit::Database#search and Wgit::Document#search for
-  # details of how the search works.
-  #
-  # @param query [String] The text query to search with.
-  # @param connection_string [String] The database connection string. Set as
-  #   nil to use ENV['WGIT_CONNECTION_STRING'].
-  # @param case_sensitive [Boolean] Whether character case must match.
-  # @param whole_sentence [Boolean] Whether multiple words should be searched
-  #   for separately.
-  # @param limit [Integer] The max number of results to print.
-  # @param skip [Integer] The number of DB records to skip.
-  # @param sentence_limit [Integer] The max length of each result's text
-  #   snippet.
-  # @yield [doc] Given each search result (Wgit::Document) returned from the
-  #   database.
-  def self.indexed_search(
-    query, connection_string: nil,
-    case_sensitive: false, whole_sentence: true,
-    limit: 10, skip: 0, sentence_limit: 80, &block
-  )
-    db = Wgit::Database.new(connection_string)
-
-    results = db.search(
-      query,
-      case_sensitive: case_sensitive,
-      whole_sentence: whole_sentence,
-      limit: limit,
-      skip: skip,
-      &block
-    )
-
-    results.each do |doc|
-      doc.search!(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        sentence_limit: sentence_limit
-      )
-    end
-
-    Wgit::Utils.printf_search_results(results)
-  end
-
-  # Class which crawls and saves the indexed Documents to a database.
+  # Class which crawls and saves the Documents to a database. Can be thought of
+  # as a combination of Wgit::Crawler and Wgit::Database.
   class Indexer
     # The crawler used to index the WWW.
     attr_reader :crawler
@@ -139,7 +18,7 @@ module Wgit
     # @param database [Wgit::Database] The database instance (already
     #   initialized and connected) used to index.
     # @param crawler [Wgit::Crawler] The crawler instance used to index.
-    def initialize(database, crawler = Wgit::Crawler.new)
+    def initialize(database = Wgit::Database.new, crawler = Wgit::Crawler.new)
       @db = database
       @crawler = crawler
     end
@@ -189,7 +68,8 @@ database capacity, exiting.")
 
           site_docs_count = 0
           ext_links = @crawler.crawl_site(url) do |doc|
-
+            unless doc.empty?
+              write_doc_to_db(doc)
              docs_count += 1
              site_docs_count += 1
            end
@@ -198,12 +78,9 @@ database capacity, exiting.")
           raise 'Error updating url' unless @db.update(url) == 1
 
           urls_count += write_urls_to_db(ext_links)
-
-          Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
-site: #{url}")
         end
 
-        Wgit.logger.info("Crawled and
+        Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
 overall for this iteration.")
         Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
 the next iteration.")
@@ -219,66 +96,91 @@ the next iteration.")
     # @param url [Wgit::Url] The base Url of the website to crawl.
     # @param insert_externals [Boolean] Whether or not to insert the website's
     #   external Url's into the database.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given the Wgit::Document of each crawled web page before
     #   it's inserted into the database allowing for prior manipulation. Return
     #   nil or false from the block to prevent the document from being saved
     #   into the database.
     # @return [Integer] The total number of webpages/documents indexed.
     def index_site(
-      url, insert_externals:
+      url, insert_externals: false, follow: :default,
+      allow_paths: nil, disallow_paths: nil
     )
-      crawl_opts = {
+      crawl_opts = {
+        follow: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       total_pages_indexed = 0
 
-      ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
-        result = true
-        result = yield(doc) if block_given?
+      ext_urls = @crawler.crawl_site(url, **crawl_opts) do |doc|
+        result = block_given? ? yield(doc) : true
 
-        if result && !doc.empty?
+        if result && !doc.empty?
+          write_doc_to_db(doc)
           total_pages_indexed += 1
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
         end
       end
 
-      @db.
+      @db.upsert(url)
 
       if insert_externals && ext_urls
         num_inserted_urls = write_urls_to_db(ext_urls)
         Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
       end
 
-      Wgit.logger.info("Crawled and
-site: #{url}")
+      Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
+the site: #{url}")
 
       total_pages_indexed
     end
 
+    # Crawls one or more webpages and stores them into the database.
+    # There is no max download limit so be careful of large pages.
+    # Logs info on the crawl using Wgit.logger as it goes along.
+    #
+    # @param urls [*Wgit::Url] The webpage Url's to crawl.
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
+    #   external Url's into the database.
+    # @yield [doc] Given the Wgit::Document of the crawled webpage,
+    #   before it's inserted into the database allowing for prior
+    #   manipulation. Return nil or false from the block to prevent the
+    #   document from being saved into the database.
+    # @raise [StandardError] if no urls are provided.
+    def index_urls(*urls, insert_externals: false, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = { insert_externals: insert_externals }
+      Wgit::Utils.each(urls) { |url| index_url(url, **opts, &block) }
+
+      nil
+    end
+
     # Crawls a single webpage and stores it into the database.
     # There is no max download limit so be careful of large pages.
     # Logs info on the crawl using Wgit.logger as it goes along.
     #
     # @param url [Wgit::Url] The webpage Url to crawl.
-    # @param insert_externals [Boolean] Whether or not to insert the
+    # @param insert_externals [Boolean] Whether or not to insert the webpages
     #   external Url's into the database.
     # @yield [doc] Given the Wgit::Document of the crawled webpage,
     #   before it's inserted into the database allowing for prior
     #   manipulation. Return nil or false from the block to prevent the
    #   document from being saved into the database.
-    def
+    def index_url(url, insert_externals: false)
       document = @crawler.crawl_url(url) do |doc|
-        result = true
-
-
-        if result && !doc.empty? && write_doc_to_db(doc)
-          Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
-        end
+        result = block_given? ? yield(doc) : true
+        write_doc_to_db(doc) if result && !doc.empty?
       end
 
-      @db.
+      @db.upsert(url)
 
       ext_urls = document&.external_links
       if insert_externals && ext_urls
@@ -311,23 +213,19 @@ site: #{url}")
     # collection deliberately prevents duplicate inserts.
     #
     # @param doc [Wgit::Document] The document to write to the DB.
-    # @return [Boolean] True if the write was successful, false otherwise.
     def write_doc_to_db(doc)
-      @db.
-
-
-
-
-      Wgit.logger.info("Document already exists: #{doc.url}")
-
-      false
+      if @db.upsert(doc)
+        Wgit.logger.info("Saved document for url: #{doc.url}")
+      else
+        Wgit.logger.info("Updated document for url: #{doc.url}")
+      end
     end
 
     # Write the urls to the DB. Note that the unique url index on the urls
     # collection deliberately prevents duplicate inserts.
     #
     # @param urls [Array<Wgit::Url>] The urls to write to the DB.
-    # @return [
+    # @return [Integer] The number of inserted urls.
     def write_urls_to_db(urls)
       count = 0
 
@@ -341,6 +239,7 @@ site: #{url}")
 
         @db.insert(url)
         count += 1
+
         Wgit.logger.info("Inserted external url: #{url}")
       rescue Mongo::Error::OperationFailure
         Wgit.logger.info("External url already exists: #{url}")
@@ -348,5 +247,9 @@ site: #{url}")
 
       count
     end
+
+    alias database db
+    alias index index_urls
+    alias index_r index_site
   end
 end
```
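
The additions above introduce `index_urls` (aliased as `index`) alongside the reworked `index_site` (aliased as `index_r`), whose `follow:` keyword takes an xpath selecting the links to crawl. A rough usage sketch based on those signatures; the xpath, URLs and path pattern are illustrative only, and a database reachable via `ENV['WGIT_CONNECTION_STRING']` is assumed:

```ruby
require 'wgit'

indexer = Wgit::Indexer.new # defaults: Wgit::Database.new, Wgit::Crawler.new

# Index individual pages; external links are no longer inserted by default.
indexer.index_urls('https://example.com/about', 'https://example.com/contact')

# Crawl a whole site, but only follow blog links and skip the admin area.
indexer.index_site(
  'https://example.com',
  follow: "//a[starts-with(@href, '/blog')]/@href",
  disallow_paths: 'admin/*'
) do |doc|
  doc # return nil/false instead to skip saving this Wgit::Document
end
```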
data/lib/wgit/response.rb
CHANGED

```diff
@@ -1,5 +1,5 @@
 module Wgit
-  # Response class
+  # Response class modeling a generic HTTP GET response.
   class Response
     # The underlying HTTP adapter/library response object.
     attr_accessor :adapter_response
@@ -69,7 +69,10 @@ module Wgit
     # @param headers [Hash] The new response headers.
     # @return [Hash] @headers's new value.
     def headers=(headers)
-
+      unless headers
+        @headers = {}
+        return
+      end
 
       @headers = headers.map do |k, v|
         k = k.downcase.gsub('-', '_').to_sym
@@ -131,11 +134,11 @@ module Wgit
       @status.positive?
     end
 
-    alias code
-    alias content
-    alias
-    alias to_s
-    alias redirects
-    alias length
+    alias code status
+    alias content body
+    alias crawl_duration total_time
+    alias to_s body
+    alias redirects redirections
+    alias length size
   end
 end
```
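
A small sketch of the `Response` changes above: assigning `nil` headers now yields an empty Hash, header keys are normalised to underscored symbols by `headers=`, and the aliases map readers such as `code` and `content` onto `status` and `body`. It assumes `Wgit::Response.new` takes no arguments, which this diff doesn't show:

```ruby
require 'wgit'

res = Wgit::Response.new

res.headers = nil
res.headers      #=> {} (per the new guard clause in headers=)

res.headers = { 'Content-Type' => 'text/html' }
res.headers.keys #=> [:content_type] (downcased, '-' replaced with '_', symbolised)

res.code         # alias for res.status
res.content      # alias for res.body
```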