RubyGems - wgit - Versions diffs - 0.0.1 → 0.0.2 - Mend

wgit 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +5 -5
data/lib/wgit.rb +1 -1
data/lib/wgit/assertable.rb +72 -61
data/lib/wgit/core_ext.rb +11 -5
data/lib/wgit/crawler.rb +97 -57
data/lib/wgit/database/database.rb +247 -170
data/lib/wgit/database/model.rb +40 -24
data/lib/wgit/database/mongo_connection_details.rb +44 -23
data/lib/wgit/document.rb +534 -233
data/lib/wgit/indexer.rb +235 -0
data/lib/wgit/url.rb +199 -121
data/lib/wgit/utils.rb +143 -96
data/lib/wgit/version.rb +5 -1
metadata +10 -9
data/lib/wgit/web_crawler.rb +0 -134

data/lib/wgit/utils.rb CHANGED

@@ -1,115 +1,162 @@
 module Wgit
-  # @author Michael Telford
   # Utility module containing generic methods.
   module Utils
-      def self.time_stamp
-          Time.new
-      end
+    # Returns the current time stamp.
+    #
+    # @return [Time] The current time stamp.
+    def self.time_stamp
+      Time.new
+    end
-      # Returns a hash created from obj's instance vars and values.
-      def self.to_h(obj, ignore = [])
-          hash = {}
-          obj.instance_variables.each do |var|
-              next if ignore.include?(var)
-              hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
-          end
-          hash
+    # Returns a Hash created from obj's instance vars and values.
+    #
+    # @param obj [Object] The object to process.
+    # @param ignore [Array<String>] Attributes to ignore.
+    # @param use_strings_as_keys [Boolean] Whether or not to use strings as
+    #     the keys in the returned Hash. Symbols are used otherwise.
+    # @return [Hash] A Hash created from obj's instance vars and values.
+    def self.to_h(obj, ignore = [], use_strings_as_keys = true)
+      hash = {}
+      obj.instance_variables.each do |var|
+        next if ignore.include?(var.to_s)
+        key = var.to_s[1..-1]
+        key = key.to_sym unless use_strings_as_keys
+        hash[key] = obj.instance_variable_get(var)
       end
-      # Improved each method which takes care of singleton and enumerable
-      # objects. Yields one or more objects.
-      def self.each(obj_or_objs)
-          if obj_or_objs.respond_to?(:each)
-              obj_or_objs.each { |obj| yield obj }
-          else
-              yield obj_or_objs
-          end
+      hash
+    end
+    # Returns the model having removed non bson types (for use with MongoDB).
+    #
+    # @param model_hash [Hash] The model Hash to process.
+    # @return [Hash] The model Hash with non bson types removed.
+    def self.remove_non_bson_types(model_hash)
+      model_hash.reject do |k, v|
+        not v.respond_to? :bson_type
       end
+    end
-      # Formats the sentence (modifies the receiver) and returns its value.
-      # The length will be based on the sentence_limit parameter or the full
-      # length of the original sentence, which ever is less. The full sentence
-      # is returned if the sentence_limit is 0. The algorithm obviously ensures
-      # that the search value is visible somewhere in the sentence.
-      def self.format_sentence_length(sentence, index, sentence_limit)
-          raise "A sentence value must be provided" if sentence.empty?
-          raise "The sentence length value must be even" if sentence_limit.odd?
-          if index < 0 or index > sentence.length
-              raise "Incorrect index value: #{index}"
-          end
-          return sentence if sentence_limit == 0
+    # An improved :each method which accepts both singleton and Enumerable
+    # objects (as opposed to just an Enumerable object).
+    #
+    # @yield [el] Gives each element of obj_or_objs if it's Enumerable,
+    #     otherwise obj_or_objs itself is given.
+    def self.each(obj_or_objs)
+      if obj_or_objs.respond_to?(:each)
+        obj_or_objs.each { |obj| yield(obj) }
+      else
+        yield(obj_or_objs)
+      end
+    end
-          start = 0
-          finish = sentence.length
+    # Formats the sentence (modifies the receiver) and returns its value.
+    # The formatting is essentially to shorten the sentence and ensure that
+    # the index is present somewhere in the sentence. Used for search query
+    # results.
+    #
+    # @param sentence [String] The sentence to be formatted.
+    # @param index [Integer] The first index of a word in sentence. This is
+    #     usually a word in a search query.
+    # @param sentence_limit [Integer] The max length of the formatted sentence
+    #     being returned. The length will be based on the sentence_limit
+    #     parameter or the full length of the original sentence, which ever
+    #     is less. The full sentence is returned if the sentence_limit is 0.
+    # @return [String] The sentence once formatted.
+    def self.format_sentence_length(sentence, index, sentence_limit)
+      raise "A sentence value must be provided" if sentence.empty?
+      raise "The sentence length value must be even" if sentence_limit.odd?
+      if index < 0 or index > sentence.length
+        raise "Incorrect index value: #{index}"
+      end
+      return sentence if sentence_limit == 0
-          if sentence.length > sentence_limit
-              start = index - (sentence_limit / 2)
-              finish = index + (sentence_limit / 2)
+      start = 0
+      finish = sentence.length
-              if start < 0
-                  diff = 0 - start
-                  if (finish + diff) > sentence.length
-                      finish = sentence.length
-                  else
-                      finish += diff
-                  end
-                  start = 0
-              elsif finish > sentence.length
-                  diff = finish - sentence.length
-                  if (start - diff) < 0
-                      start = 0
-                  else
-                      start -= diff
-                  end
-                  finish = sentence.length
-              end
+      if sentence.length > sentence_limit
+        start = index - (sentence_limit / 2)
+        finish = index + (sentence_limit / 2)
-              raise if sentence[start..(finish - 1)].length != sentence_limit
+        if start < 0
+          diff = 0 - start
+          if (finish + diff) > sentence.length
+            finish = sentence.length
+          else
+            finish += diff
           end
+          start = 0
+        elsif finish > sentence.length
+          diff = finish - sentence.length
+          if (start - diff) < 0
+            start = 0
+          else
+            start -= diff
+          end
+          finish = sentence.length
+        end
-          sentence.replace(sentence[start..(finish - 1)])
+        raise if sentence[start..(finish - 1)].length != sentence_limit
       end
-      # Prints out the search results in a search engine page format.
-      # Most of the params are passed to Document#search - see class docs.
-      # The steam param decides where the printf output is written to, and
-      # therefore must respond_to? :puts
-      # The format for each result is:
-      #
-      # Title
-      # Keywords (if there are some)
-      # Text Snippet (showing the searched for text if provided)
-      # Url
-      # <empty_line>
-      def self.printf_search_results(results, text = nil, case_sensitive = false,
-                                     sentence_length = 80, keyword_count = 5,
-                                     stream = Kernel)
-          raise "stream must respond_to? :puts" unless stream.respond_to? :puts
-          keyword_count -= 1 # Because Array's are zero indexed.
-          results.each do |doc|
-              sentence = if text.nil?
-                            nil
-                         else
-                            sentence = doc.search(text, sentence_length).first
-                            if sentence.nil?
-                                nil
-                            else
-                                sentence.strip.empty? ? nil : sentence
-                            end
-                         end
-              stream.puts doc.title
-              unless doc.keywords.empty?
-                  stream.puts doc.keywords[0..keyword_count].join(", ")
-              end
-              stream.puts sentence unless sentence.nil?
-              stream.puts doc.url
-              stream.puts
-          end
-          nil
+      sentence.replace(sentence[start..(finish - 1)])
+    end
+    # Prints out the search results in a search engine like format.
+    # Most of the params are passed to Wgit::Document#search; see the docs.
+    # The format for each result looks like:
+    #
+    # Title
+    #
+    # Keywords (if there are some)
+    #
+    # Text Snippet (showing the searched for query if provided)
+    #
+    # URL
+    #
+    # <empty_line_separator>
+    #
+    # @param results [Array<Wgit::Document>] An Array whose
+    #     Wgit::Documents#text matches the query at least once.
+    # @param query [String] The text query to search for.
+    # @param case_sensitive [Boolean] Whether or not the search should be
+    #     case sensitive.
+    # @param sentence_length [Integer] The length of the matching text of the
+    #     search results to be outputted to the stream.
+    # @param keyword_count [Integer] The max amount of keywords to be
+    #     outputted to the stream.
+    # @param stream [#puts] Any object that respond_to? :puts. It is used
+    #     to output text somewhere e.g. STDOUT (the default).
+    # @return [nil]
+    def self.printf_search_results(results, query = nil, case_sensitive = false,
+                                   sentence_length = 80, keyword_count = 5,
+                                   stream = Kernel)
+      raise "stream must respond_to? :puts" unless stream.respond_to? :puts
+      keyword_count -= 1 # Because Array's are zero indexed.
+      results.each do |doc|
+        sentence =  if query.nil?
+                      nil
+                    else
+                      sentence = doc.search(query, sentence_length).first
+                      if sentence.nil?
+                        nil
+                      else
+                        sentence.strip.empty? ? nil : sentence
+                      end
+                    end
+        stream.puts doc.title
+        unless doc.keywords.nil? || doc.keywords.empty?
+          stream.puts doc.keywords[0..keyword_count].join(", ")
+        end
+        stream.puts sentence unless sentence.nil?
+        stream.puts doc.url
+        stream.puts
       end
+      nil
+    end
   end
 end

data/lib/wgit/version.rb CHANGED

@@ -1,3 +1,7 @@
+# Wgit is a WWW indexer/scraper which crawls URL's and retrieves their page
+# contents for later use.
+# @author Michael Telford
 module Wgit
-  VERSION = "0.0.1".freeze
+  # The current gem version of Wgit.
+  VERSION = "0.0.2".freeze
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wgit
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Michael Telford
@@ -15,8 +15,8 @@ description: Wgit is a WWW indexer/scraper which crawls URL's and retrieves thei
   indexed documents stored in a database. Therefore this library provides the main
   components of a WWW search engine. You can also use Wgit to copy entire website's
   HTML making it far more powerful than wget. The Wgit API is easily extendable allowing
-  you to easily pull out the parts of a webpage that are important to you, the CSS
-  or JS links for example.
+  you to easily pull out the parts of a webpage that are important to you, the external
+  links or keywords for example.
 email: michael.telford@live.com
 executables: []
 extensions: []
@@ -30,14 +30,15 @@ files:
 - "./lib/wgit/database/model.rb"
 - "./lib/wgit/database/mongo_connection_details.rb"
 - "./lib/wgit/document.rb"
+- "./lib/wgit/indexer.rb"
 - "./lib/wgit/url.rb"
 - "./lib/wgit/utils.rb"
 - "./lib/wgit/version.rb"
-- "./lib/wgit/web_crawler.rb"
-homepage: http://rubygems.org/gems/wgit
+homepage: https://github.com/michaeltelford/wgit
 licenses:
 - MIT
 metadata:
+  source_code_uri: https://github.com/michaeltelford/wgit
   allowed_push_host: https://rubygems.org
 post_install_message:
 rdoc_options: []
@@ -45,9 +46,9 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - "~>"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '2.5'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -55,8 +56,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 2.7.8
 signing_key:
 specification_version: 4
-summary: Wgit is wget on steroids with an easy to use API.
+summary: Wgit is wget on steroids with an easy to use API for web scraping and indexing.
 test_files: []

data/lib/wgit/web_crawler.rb DELETED

@@ -1,134 +0,0 @@
-#!/usr/bin/env ruby
-require_relative 'crawler'
-require_relative 'database/database'
-# @author Michael Telford
-module Wgit
-  # Convience method to crawl the World Wide Web.
-  # The default value (-1) for max_sites_to_crawl is unrestricted.
-  # The default max_data_size is 1GB.
-  def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
-    db = Wgit::Database.new
-    web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
-    web_crawler.crawl_the_web
-  end
-  # Class which sets up a crawler and saves the indexed
-  # docs to a database. Will crawl the web forever if you let it :-)
-  class WebCrawler
-    attr_accessor :max_sites_to_crawl, :max_data_size
-    attr_reader :crawler, :db
-    def initialize(database,
-                   max_sites_to_crawl = -1,
-                   max_data_size = 1048576000)
-      @crawler = Wgit::Crawler.new
-      @db = database
-      @max_sites_to_crawl = max_sites_to_crawl
-      @max_data_size = max_data_size
-    end
-    # Retrieves url's from the database and recursively crawls each site
-    # storing their internal pages into the database and adding their external
-    # url's to be crawled at a later date.
-    def crawl_the_web
-      if max_sites_to_crawl < 0
-        puts "Crawling until the database has been filled or it runs out of \
-urls to crawl (which might be never)."
-      end
-      loop_count = 0
-      while keep_crawling?(loop_count) do
-          puts "Current database size: #{db.size}"
-          crawler.urls = db.uncrawled_urls
-          if crawler.urls.empty?
-              puts "No urls to crawl, exiting."
-              break
-          end
-          puts "Starting crawl loop for: #{crawler.urls}"
-          docs_count = 0
-          urls_count = 0
-          crawler.urls.each do |url|
-            unless keep_crawling?(loop_count)
-              puts "Reached max number of sites to crawl or database \
-capacity, exiting."
-              return
-            end
-            loop_count += 1
-            url.crawled = true
-            raise unless db.update(url) == 1
-            site_docs_count = 0
-            ext_links = crawler.crawl_site(url) do |doc|
-                unless doc.empty?
-                    if write_doc_to_db(doc)
-                        docs_count += 1
-                        site_docs_count += 1
-                    end
-                end
-            end
-            urls_count += write_urls_to_db(ext_links)
-            puts "Crawled and saved #{site_docs_count} docs for the \
-site: #{url}"
-          end
-          puts "Crawled and saved docs for #{docs_count} url(s) overall for \
-this iteration."
-          puts "Found and saved #{urls_count} external url(s) for the next \
-iteration."
-      end
-    end
-    private
-    # Keep crawling or not based on DB size and current loop interation.
-    def keep_crawling?(loop_count)
-      return false if db.size >= max_data_size
-      # If max_sites_to_crawl is -1 for example then crawl away.
-      if max_sites_to_crawl < 0
-        true
-      else
-        loop_count < max_sites_to_crawl
-      end
-    end
-    # The unique url index on the documents collection prevents duplicate
-    # inserts.
-    def write_doc_to_db(doc)
-        db.insert(doc)
-        puts "Saved document for url: #{doc.url}"
-        true
-    rescue Mongo::Error::OperationFailure
-        puts "Document already exists: #{doc.url}"
-        false
-    end
-    # The unique url index on the urls collection prevents duplicate inserts.
-    def write_urls_to_db(urls)
-        count = 0
-        if urls.respond_to?(:each)
-            urls.each do |url|
-                begin
-                  db.insert(url)
-                  count += 1
-                  puts "Inserted url: #{url}"
-                rescue Mongo::Error::OperationFailure
-                  puts "Url already exists: #{url}"
-                end
-            end
-        end
-        count
-    end
-  end
-end
-if __FILE__ == $0
-    Wgit.crawl_the_web
-end