wgit 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 82f33e00a273c6cdeb3ba9c171110d849fff2428
+   data.tar.gz: 14c63f826d1d21811b14e9f3a2bca750b3f4afa3
+ SHA512:
+   metadata.gz: 7c42b925f72d9e7cceba79d9aee764f97b6537c0005038501a1f75c36b1bcd3b6036cfb9b62fcf01fd435e0348c1e8c00c445a291051c068fa58184de2c9590a
+   data.tar.gz: a2a756c3be7b9b214921bfdac5846a2250e452265285cb9c3b812d2eaefc2ab969b608cd1841f34507a6ef184f20ba7c98658daf0135fb85eead88de0356320f
data/lib/wgit.rb ADDED
@@ -0,0 +1,11 @@
+ require_relative 'wgit/version'
+ require_relative 'wgit/crawler'
+ require_relative 'wgit/web_crawler'
+ require_relative 'wgit/url'
+ require_relative 'wgit/document'
+ require_relative 'wgit/utils'
+ require_relative 'wgit/assertable'
+ require_relative 'wgit/database/database'
+ require_relative 'wgit/database/model'
+ require_relative 'wgit/database/mongo_connection_details'
+ #require_relative 'wgit/core_ext'
data/lib/wgit/assertable.rb ADDED
@@ -0,0 +1,69 @@
+
+ module Wgit
+
+   # @author Michael Telford
+   # Module containing assert methods, including type checking, which can be
+   # used for asserting the integrity of method parameters etc.
+   module Assertable
+     DEFAULT_TYPE_FAIL_MSG = "Expected: %s, Actual: %s"
+     WRONG_METHOD_MSG = "arr must be Enumerable, use a different method"
+     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
+
+     # obj.instance_of? must return true for one of the types listed in
+     # type_or_types or an exception is raised using msg, if provided.
+     # type_or_types can be a single Class or an Enumerable of Class objects;
+     # Strings and Symbols will not work.
+     def assert_types(obj, type_or_types, msg = nil)
+       msg ||= DEFAULT_TYPE_FAIL_MSG % [type_or_types, obj.class]
+       if type_or_types.respond_to?(:any?)
+         match = type_or_types.any? { |type| obj.instance_of?(type) }
+       else
+         match = obj.instance_of?(type_or_types)
+       end
+       raise msg unless match
+       obj
+     end
+
+     # Each object within arr must match one of the types listed in
+     # type_or_types or an exception is raised using msg, if provided.
+     # type_or_types can be a single Class or an Enumerable of Class objects;
+     # Strings and Symbols will not work.
+     def assert_arr_types(arr, type_or_types, msg = nil)
+       raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+       arr.each do |obj|
+         assert_types(obj, type_or_types, msg)
+       end
+     end
+
+     # The obj_or_objs must respond_to? all of the given methods or an
+     # exception is raised using msg or a default message.
+     # Returns obj_or_objs on successful assertion.
+     def assert_respond_to(obj_or_objs, methods, msg = nil)
+       if obj_or_objs.respond_to?(:each)
+         obj_or_objs.each do |obj|
+           _assert_respond_to(obj, methods, msg)
+         end
+       else
+         _assert_respond_to(obj_or_objs, methods, msg)
+       end
+       obj_or_objs
+     end
+
+     private
+
+     def _assert_respond_to(obj, methods, msg = nil)
+       msg ||= DEFAULT_DUCK_FAIL_MSG % ["#{obj.class} (#{obj})", methods]
+       match = methods.all? { |method| obj.respond_to?(method) }
+       raise msg unless match
+       obj
+     end
+
+     alias :assert_type :assert_types
+     alias :type :assert_types
+     alias :types :assert_types
+     alias :assert_arr_type :assert_arr_types
+     alias :arr_type :assert_arr_types
+     alias :arr_types :assert_arr_types
+     alias :respond_to :assert_respond_to
+   end
+ end
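A minimal usage sketch (editor's illustration, not part of the gem; the Widget class is hypothetical) showing the module guarding constructor arguments:

    require 'wgit'

    class Widget
      include Wgit::Assertable

      def initialize(name, tags)
        assert_type(name, String)                # Raises unless name is a String.
        assert_arr_types(tags, [String, Symbol]) # Each element must match one type.
        @name, @tags = name, tags
      end
    end

    Widget.new("w1", [:a, "b"]) # => #<Widget ...>
    Widget.new(123, [])         # => RuntimeError: Expected: String, Actual: ...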
data/lib/wgit/core_ext.rb ADDED
@@ -0,0 +1,40 @@
+ require_relative 'url'
+
+ # @author Michael Telford
+ # Script which extends Ruby's core functionality when parsed.
+ # Needs to be required separately using `require 'wgit/core_ext'`.
+
+ class String
+   # Converts a String into a Wgit::Url object.
+   def to_url
+     Wgit::Url.new(self)
+   end
+ end
+
+ module Enumerable
+   # Converts each String instance into a Wgit::Url object and returns the
+   # new Array.
+   def to_urls
+     map do |element|
+       process_url_element(element)
+     end
+   end
+
+   # Converts each String instance into a Wgit::Url object and returns the
+   # updated (mutated) Enumerable.
+   def to_urls!
+     map! do |element|
+       process_url_element(element)
+     end
+   end
+ end
+
+ private
+
+ # Top level helper (a private method on Object) used by the Enumerable
+ # extensions above; non String elements are returned unchanged.
+ def process_url_element(element)
+   if element.is_a? String
+     element.to_url
+   else
+     element
+   end
+ end
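A short sketch of the core extensions in use (editor's illustration):

    require 'wgit'
    require 'wgit/core_ext' # Not loaded by default.

    "http://www.google.co.uk".to_url.class # => Wgit::Url
    ["about.html", "http://example.com"].to_urls.map(&:class)
    # => [Wgit::Url, Wgit::Url]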
data/lib/wgit/crawler.rb ADDED
@@ -0,0 +1,132 @@
+ require_relative 'url'
+ require_relative 'document'
+ require_relative 'utils'
+ require_relative 'assertable'
+ require 'net/http' # Requires 'uri'.
+
+ module Wgit
+
+   # @author Michael Telford
+   # Crawler class provides a means of crawling web URLs.
+   # Note that redirects are not followed during any of the crawl
+   # functionality.
+   class Crawler
+     include Assertable
+
+     attr_reader :urls, :docs
+
+     def initialize(*urls)
+       self.urls = urls unless urls.nil?
+       @docs = []
+     end
+
+     def urls=(urls)
+       @urls = []
+       Wgit::Utils.each(urls) { |url| add_url(url) }
+     end
+
+     def [](*urls)
+       self.urls = urls unless urls.nil?
+     end
+
+     def <<(url)
+       add_url(url)
+     end
+
+     # Crawls individual urls, not entire sites.
+     # Returns the last crawled doc.
+     # Yields each doc to the provided block or adds each doc to @docs,
+     # which can be accessed by Crawler#docs after the method returns.
+     def crawl_urls(urls = @urls, &block)
+       raise "No urls to crawl" unless urls
+       @docs = []
+       doc = nil
+       Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
+       doc ? doc : @docs.last
+     end
+
+     # Crawls the url and returns the response document or nil.
+     # Also yields the doc if a block is provided. The doc is passed to the
+     # block regardless of the crawl's success, so the doc.url can be used
+     # if needed.
+     def crawl_url(url = @urls.first, &block)
+       assert_type(url, Url)
+       markup = fetch(url)
+       url.crawled = true
+       doc = Wgit::Document.new(url, markup)
+       block.call(doc) if block_given?
+       doc.empty? ? nil : doc
+     end
+
+     # Crawls an entire site by recursively following its internal links.
+     # Also yields each crawled doc if a block is provided.
+     # A block is the only way to interact with the crawled docs.
+     # Returns a unique array of external urls collected from the site,
+     # or nil if the base_url could not be crawled successfully.
+     def crawl_site(base_url = @urls.first, &block)
+       assert_type(base_url, Url)
+
+       doc = crawl_url(base_url, &block)
+       return nil if doc.nil?
+
+       crawled_urls  = []
+       external_urls = doc.external_links
+       internal_urls = doc.internal_links
+
+       return doc.external_links.uniq if internal_urls.empty?
+
+       loop do
+         internal_urls.uniq!
+
+         links = internal_urls - crawled_urls
+         break if links.empty?
+
+         links.each do |link|
+           doc = crawl_url(Wgit::Url.concat(base_url.to_base, link), &block)
+           crawled_urls << link
+           next if doc.nil?
+           internal_urls.concat(doc.internal_links)
+           external_urls.concat(doc.external_links)
+         end
+       end
+
+       external_urls.uniq
+     end
+
+     private
+
+     # Add the document to the @docs array for later processing
+     # or let the block process it here and now.
+     def handle_crawl_block(url, &block)
+       if block_given?
+         crawl_url(url, &block)
+       else
+         @docs << crawl_url(url)
+         nil
+       end
+     end
+
+     # The fetch method performs a HTTP GET to obtain the HTML document.
+     # Invalid urls, or any HTTP response that doesn't return a HTML body,
+     # will be ignored and nil returned. This means that redirects etc.
+     # will not be followed.
+     def fetch(url)
+       raise unless url.respond_to?(:to_uri)
+       res = Net::HTTP.get_response(url.to_uri)
+       res.body.empty? ? nil : res.body
+     rescue
+       nil
+     end
+
+     def add_url(url)
+       @urls = [] if @urls.nil?
+       if url.instance_of?(Url)
+         @urls << url
+       else
+         @urls << Wgit::Url.new(url)
+       end
+     end
+
+     alias :crawl :crawl_urls
+     alias :crawl_r :crawl_site
+   end
+ end
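A brief crawling sketch (editor's illustration; example.com stands in for any reachable site):

    require 'wgit'

    crawler = Wgit::Crawler.new Wgit::Url.new("http://example.com")

    # Single page; returns the Document or nil on failure.
    doc = crawler.crawl_url
    puts doc.title unless doc.nil?

    # Whole site; each crawled page is yielded, external links are returned.
    ext_links = crawler.crawl_site { |page| puts page.url }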
data/lib/wgit/database/database.rb ADDED
@@ -0,0 +1,269 @@
+ require_relative '../document'
+ require_relative '../url'
+ require_relative '../utils'
+ require_relative '../assertable'
+ require_relative 'mongo_connection_details'
+ require_relative 'model'
+ require 'mongo'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Class modeling a DB connection and CRUD operations for the Url and
+   # Document collections.
+   # The most common methods are: insert, update, urls, search, stats, size.
+   class Database
+     include Assertable
+
+     # Is relative to the root project folder, not this file.
+     LOG_FILE_PATH = "misc/mongo_log.txt"
+
+     def initialize
+       conn_details = Wgit::CONNECTION_DETAILS
+       if conn_details.empty?
+         raise "Wgit::CONNECTION_DETAILS must be defined and include :host, \
+ :port, :db, :uname, :pword for a database connection to be established."
+       end
+
+       logger = Logger.new(LOG_FILE_PATH)
+       address = "#{conn_details[:host]}:#{conn_details[:port]}"
+       @@client = Mongo::Client.new([address],
+                                    :database => conn_details[:db],
+                                    :user => conn_details[:uname],
+                                    :password => conn_details[:pword],
+                                    :logger => logger,
+                                    :truncate_logs => false)
+     end
+
+     ### Create Data ###
+
+     def insert(data)
+       if data.is_a?(Url)
+         insert_urls(data)
+       elsif data.is_a?(Document)
+         insert_docs(data)
+       elsif data.respond_to?(:first)
+         if data.first.is_a?(Url)
+           insert_urls(data)
+         else
+           insert_docs(data)
+         end
+       else
+         raise "data is not in the correct format (all Urls or Documents)"
+       end
+     end
+
+     def insert_urls(url_or_urls)
+       if url_or_urls.respond_to?(:map)
+         assert_arr_types(url_or_urls, Url)
+         url_or_urls = url_or_urls.map do |url|
+           Wgit::Model.url(url)
+         end
+       else
+         assert_type(url_or_urls, Url)
+         url_or_urls = Wgit::Model.url(url_or_urls)
+       end
+       create(:urls, url_or_urls)
+     end
+
+     def insert_docs(doc_or_docs)
+       if doc_or_docs.respond_to?(:map)
+         assert_arr_types(doc_or_docs, [Document, Hash])
+         doc_or_docs = doc_or_docs.map do |doc|
+           doc.is_a?(Hash) ? doc : Wgit::Model.document(doc)
+         end
+       else
+         assert_type(doc_or_docs, [Document, Hash])
+         unless doc_or_docs.is_a?(Hash)
+           doc_or_docs = Wgit::Model.document(doc_or_docs)
+         end
+       end
+       create(:documents, doc_or_docs)
+     end
+
+     ### Retrieve Data ###
+
+     # A crawled parameter value of nil (the default) returns all urls.
+     # A limit of 0 means all urls are returned.
+     # All urls are sorted by date_added ascending; in other words the first
+     # url in the results is the first added.
+     def urls(crawled = nil, limit = 0, skip = 0, &block)
+       query = crawled.nil? ? {} : { :crawled => crawled }
+
+       sort = { :date_added => 1 }
+       results = retrieve(:urls, query, sort, {}, limit, skip)
+       return [] if results.count < 1
+
+       # results.respond_to? :map! is false so we use map and overwrite the var.
+       results = results.map { |url_doc| Wgit::Url.new(url_doc) }
+       return results unless block_given?
+       results.each { |url| block.call(url) }
+     end
+
+     def crawled_urls(limit = 0, skip = 0, &block)
+       urls(true, limit, skip, &block)
+     end
+
+     def uncrawled_urls(limit = 0, skip = 0, &block)
+       urls(false, limit, skip, &block)
+     end
+
+     # Currently all searches are case insensitive.
+     #
+     # Searches against the indexed docs in the DB for the given text.
+     # The searched fields are decided by the text index setup against the
+     # documents collection. Currently we search against the following fields:
+     # "author", "keywords", "title" and "text".
+     #
+     # The MongoDB search ranks/sorts the results in order (highest first)
+     # based upon each document's textScore, which records the number of text
+     # hits. We then store this textScore in each Document object for use
+     # elsewhere if needed.
+     #
+     # @param text [String] the value to search the data against.
+     # @param whole_sentence [Boolean] whether to search for the exact phrase
+     # rather than matching each word separately.
+     # @param limit [Fixnum] the max length/count of the results array.
+     # @param skip [Fixnum] the number of results to skip, starting with the
+     # most relevant based upon the textScore of the search.
+     # @param block [Block] a block which, if provided, is passed each result.
+     #
+     # @return [Array] of Document objects representing the search results.
+     def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
+       text.strip!
+       text.replace("\"" + text + "\"") if whole_sentence
+
+       # The textScore sorts based on the most search hits.
+       # We use the textScore hash as a sort and a projection below.
+       # :$caseSensitive => case_sensitive, # 3.2+ only.
+       sort_proj = { :score => { :$meta => "textScore" } }
+       query = { :$text => { :$search => text } }
+       results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
+
+       return [] if results.count < 1
+       # results.respond_to? :map! is false so we use map and overwrite the var.
+       results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
+       return results unless block_given?
+       results.each { |doc| block.call(doc) }
+     end
+
+     # Performs a search and pretty prints the results.
+     def search_p(text, whole_sentence = false, limit = 10,
+                  skip = 0, sentence_length = 80, &block)
+       results = search(text, whole_sentence, limit, skip, &block)
+       Wgit::Utils.printf_search_results(results, text, false, sentence_length)
+     end
+
+     # Returns a Mongo object which can be used like a Hash to retrieve values.
+     def stats
+       @@client.command(:dbStats => 1).documents[0]
+     end
+
+     def size
+       stats[:dataSize]
+     end
+
+     ### Update Data ###
+
+     def update(data)
+       if data.is_a?(Url)
+         update_url(data)
+       elsif data.is_a?(Document)
+         update_doc(data)
+       else
+         raise "data is not in the correct format (all Urls or Documents)"
+       end
+     end
+
+     def update_url(url)
+       assert_type(url, Url)
+       selection = { :url => url }
+       url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
+       update = { "$set" => url_hash }
+       _update(true, :urls, selection, update)
+     end
+
+     def update_doc(doc)
+       assert_type(doc, Document)
+       selection = { :url => doc.url }
+       doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
+       update = { "$set" => doc_hash }
+       _update(true, :documents, selection, update)
+     end
+
+     private
+
+     def write_succeeded?(result, count = 1, multi = false)
+       case result.class.to_s
+       # Single create result.
+       when "Mongo::Operation::Write::Insert::Result"
+         result.documents.first[:err].nil?
+       # Multiple create result.
+       when "Mongo::BulkWrite::Result"
+         result.inserted_count == count
+       # Single and multiple update result.
+       when "Mongo::Operation::Write::Update::Result", # MongoDB 3.0
+            "Mongo::Operation::Write::Update::LegacyResult" # MongoDB 2.4
+         if multi
+           result.n == count
+         else
+           result.documents.first[:err].nil?
+         end
+       else
+         raise "Result class not currently supported: #{result.class.to_s}"
+       end
+     end
+
+     def create(collection, data)
+       assert_type(data, [Hash, Array])
+       # Single doc.
+       if data.is_a?(Hash)
+         data.merge!(Wgit::Model.common_insert_data)
+         result = @@client[collection.to_sym].insert_one(data)
+         unless write_succeeded?(result)
+           raise "DB write (insert) failed"
+         end
+         result.n
+       # Multiple docs.
+       elsif data.is_a?(Array)
+         assert_arr_types(data, Hash)
+         data.map! do |data_hash|
+           data_hash.merge(Wgit::Model.common_insert_data)
+         end
+         result = @@client[collection.to_sym].insert_many(data)
+         unless write_succeeded?(result, data.length)
+           raise "DB write(s) failed"
+         end
+         result.inserted_count
+       else
+         raise "data must be a Hash or an Array of Hashes"
+       end
+     end
+
+     def retrieve(collection, query, sort = {}, projection = {},
+                  limit = 0, skip = 0)
+       assert_type(query, Hash)
+       @@client[collection.to_sym].find(query).projection(projection)
+                                  .skip(skip).limit(limit).sort(sort)
+     end
+
+     # NOTE: The Model.common_update_data should be merged in the calling
+     # method as the update param can be bespoke due to its nature.
+     def _update(single, collection, selection, update)
+       assert_arr_types([selection, update], Hash)
+       if single
+         result = @@client[collection.to_sym].update_one(selection, update)
+       else
+         result = @@client[collection.to_sym].update_many(selection, update)
+       end
+       raise "DB write (update) failed" unless write_succeeded?(result)
+       result.n
+     end
+
+     alias :count :size
+     alias :length :size
+     alias :insert_url :insert_urls
+     alias :insert_doc :insert_docs
+     alias :search_and_format :search_p
+   end
+ end
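A usage sketch (editor's illustration; assumes Wgit::CONNECTION_DETAILS points at a reachable MongoDB instance with the text index described above):

    require 'wgit'

    db = Wgit::Database.new

    # Store a url, then list everything not yet crawled.
    db.insert Wgit::Url.new("http://example.com")
    db.uncrawled_urls { |url| puts url }

    # Full text search against the indexed documents.
    db.search("ruby crawler") { |doc| puts "#{doc.score} - #{doc.url}" }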
data/lib/wgit/database/model.rb ADDED
@@ -0,0 +1,31 @@
+ require_relative '../utils'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Module containing the DB data model structure.
+   module Model
+     def self.url(url)
+       raise "url must respond to to_h" unless url.respond_to?(:to_h)
+       url.to_h
+     end
+
+     def self.document(doc)
+       raise "doc must respond to to_h" unless doc.respond_to?(:to_h)
+       doc.to_h(false)
+     end
+
+     def self.common_insert_data
+       {
+         :date_added => Wgit::Utils.time_stamp,
+         :date_modified => Wgit::Utils.time_stamp,
+       }
+     end
+
+     def self.common_update_data
+       {
+         :date_modified => Wgit::Utils.time_stamp,
+       }
+     end
+   end
+ end
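For illustration (editor's sketch), the model methods simply reduce objects to Hashes, which Database#create then decorates with the timestamp data:

    url = Wgit::Url.new("http://example.com")
    Wgit::Model.url(url)
    # => { :url => "http://example.com", :crawled => false, :date_crawled => nil }

    Wgit::Model.common_insert_data.keys
    # => [:date_added, :date_modified]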
data/lib/wgit/database/mongo_connection_details.rb ADDED
@@ -0,0 +1,27 @@
+
+ # @author Michael Telford
+ module Wgit
+   DB_PROVIDER = :MongoLabs.freeze
+
+   # OpenShift (MongoDB 2.4)
+   if DB_PROVIDER == :OpenShift
+     CONNECTION_DETAILS = {
+       :host => "127.0.0.1",
+       :port => "27017",
+       :db => "admin",
+       :uname => "admin",
+       :pword => "R5jUKv1fessb"
+     }.freeze
+   # MongoLabs (MongoDB 3.0)
+   elsif DB_PROVIDER == :MongoLabs
+     CONNECTION_DETAILS = {
+       :host => "ds037205.mongolab.com",
+       :port => "37205",
+       :db => "crawler",
+       :uname => "rubyapp",
+       :pword => "R5jUKv1fessb",
+     }.freeze
+   else
+     raise "Database provider '#{DB_PROVIDER}' is not recognized"
+   end
+ end
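To point Wgit at a different deployment (editor's sketch; the :Local symbol and every value below are hypothetical), an extra branch following the same pattern could be added to the if/elsif chain above, before the else:

    # Local development (MongoDB 3.x)
    elsif DB_PROVIDER == :Local
      CONNECTION_DETAILS = {
        :host => "localhost",
        :port => "27017",
        :db => "wgit_dev",
        :uname => "dev",
        :pword => "changeme",
      }.freeze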
data/lib/wgit/document.rb ADDED
@@ -0,0 +1,293 @@
+ require_relative 'url'
+ require_relative 'utils'
+ require_relative 'assertable'
+ require 'nokogiri'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Class modeling a HTML web document. Also doubles as a search result.
+   class Document
+     include Assertable
+
+     TEXT_ELEMENTS = [:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li,
+                      :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
+
+     attr_reader :url, :html, :title, :author, :keywords, :links, :text, :score
+
+     def initialize(url_or_doc, html = nil)
+       if url_or_doc.is_a?(String)
+         assert_type(url_or_doc, Url)
+         html ||= ""
+
+         @url = url_or_doc
+         @html = html
+
+         @doc = Nokogiri::HTML(html) do |config|
+           # TODO: Remove #'s below when crawling in production.
+           #config.options = Nokogiri::XML::ParseOptions::STRICT |
+           #                 Nokogiri::XML::ParseOptions::NONET
+         end
+
+         init_title
+         init_author
+         init_keywords
+         init_links
+         init_text
+         @score = 0.0
+       else
+         # Init from a mongo collection document.
+         @url = Wgit::Url.new(url_or_doc[:url])
+         @html = url_or_doc[:html].nil? ? "" : url_or_doc[:html]
+         @title = url_or_doc[:title]
+         @author = url_or_doc[:author]
+         @keywords = url_or_doc[:keywords].nil? ? [] : url_or_doc[:keywords]
+         @links = url_or_doc[:links].nil? ? [] : url_or_doc[:links]
+         @links.map! { |link| Wgit::Url.new(link) }
+         @text = url_or_doc[:text].nil? ? [] : url_or_doc[:text]
+         @score = url_or_doc[:score].nil? ? 0.0 : url_or_doc[:score]
+       end
+     end
+
+     def internal_links
+       return [] if @links.empty?
+       @links.reject do |link|
+         begin
+           not link.relative_link?
+         rescue
+           true
+         end
+       end
+     end
+
+     def internal_full_links
+       return [] if internal_links.empty?
+       internal_links.map do |link|
+         link.replace("/" + link) unless link.start_with?("/")
+         Wgit::Url.new(@url.to_base + link)
+       end
+     end
+
+     def external_links
+       return [] if @links.empty?
+       @links.reject do |link|
+         begin
+           link.relative_link?
+         rescue
+           true
+         end
+       end
+     end
+
+     def stats
+       hash = {}
+       instance_variables.each do |var|
+         # Add up the total bytes of text as well as the length.
+         if var == :@text
+           count = 0
+           @text.each { |t| count += t.length }
+           hash[:text_length] = @text.length
+           hash[:text_bytes] = count
+         # Else take the #length method return value.
+         else
+           next unless instance_variable_get(var).respond_to?(:length)
+           hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
+         end
+       end
+       hash
+     end
+
+     def size
+       stats[:html]
+     end
+
+     def to_h(include_html = false)
+       ignore = include_html ? [] : [:@html]
+       ignore << :@doc # Always ignore :@doc.
+       Wgit::Utils.to_h(self, ignore)
+     end
+
+     # Override of the default == method; equal if url and html both match.
+     # Use doc.object_id == other_doc.object_id for exact object comparison.
+     def ==(other_doc)
+       return false unless other_doc.is_a? Wgit::Document
+       url == other_doc.url and html == other_doc.html
+     end
+
+     # Shortcut for calling Document#html[range].
+     def [](range)
+       html[range]
+     end
+
+     def empty?
+       html.strip.empty?
+     end
+
+     # Searches against the Document#text for the given search text.
+     # The number of search hits for each sentence is recorded internally
+     # and used to rank/sort the search results before they are returned.
+     # Where the Database#search method searches all documents for the most
+     # hits, this method searches each document's text for the most hits.
+     #
+     # Each search result comprises a sentence of a given length. The length
+     # will be based on the sentence_limit parameter or the full length of
+     # the original sentence, whichever is less. The algorithm ensures that
+     # the search value is visible somewhere in the sentence.
+     #
+     # @param text [String] the value to search the document text against.
+     # @param sentence_limit [Fixnum] the length of each search result
+     # sentence.
+     #
+     # @return [Array] of String objects representing the search results.
+     def search(text, sentence_limit = 80)
+       raise "A search value must be provided" if text.empty?
+       raise "The sentence length value must be even" if sentence_limit.odd?
+
+       results = {}
+       regex = Regexp.new(text, Regexp::IGNORECASE)
+
+       @text.each do |sentence|
+         hits = sentence.scan(regex).count
+         if hits > 0
+           sentence.strip!
+           index = sentence.index(regex)
+           Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
+           results[sentence] = hits
+         end
+       end
+
+       return [] if results.empty?
+       results = Hash[results.sort_by { |k, v| v }]
+       results.keys.reverse
+     end
+
+     # Performs a text search (see #search for details) but assigns the
+     # results to the @text instance variable. This can be used for sub
+     # search functionality. Note that there is no way of getting the
+     # original text back however.
+     def search!(text)
+       @text = search(text)
+     end
+
+     # Uses Nokogiri's xpath method to search the doc's html and return the
+     # results.
+     def xpath(xpath)
+       @doc.xpath(xpath)
+     end
+
+     private
+
+     def process_str(str)
+       str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+       str.strip!
+       str # This is required to return the str, do not remove.
+     end
+
+     def process_arr(array)
+       assert_arr_types(array, String)
+       array.map! { |str| process_str(str) }
+       array.reject! { |str| str.empty? }
+       array.uniq!
+     end
+
+     # Modifies internal links by removing this doc's base or host url if
+     # present. http://www.google.co.uk/about.html (with or without the
+     # protocol prefix) will become about.html, meaning it'll appear within
+     # internal_links.
+     def process_internal_links(links)
+       links.map! do |link|
+         host_or_base = if link.start_with?("http")
+                          url.base
+                        else
+                          url.host
+                        end
+         if link.start_with?(host_or_base)
+           link.sub!(host_or_base, "")
+           link.replace(link[1..-1]) if link.start_with?("/")
+           link.strip!
+         end
+         link
+       end
+     end
+
+     def text_elements_xpath
+       xpath = ""
+       return xpath if TEXT_ELEMENTS.empty?
+       el_xpath = "//%s/text()"
+       TEXT_ELEMENTS.each_with_index do |el, i|
+         xpath += " | " unless i == 0
+         xpath += el_xpath % [el]
+       end
+       xpath
+     end
+
+     def init_var(xpath, var, first_result = true)
+       results = @doc.xpath(xpath)
+       unless results.nil? || results.empty?
+         result = if first_result
+                    results.first.content
+                  else
+                    results.map { |res| res.content }
+                  end
+         instance_variable_set(var, result)
+       end
+     end
+
+     def init_title
+       @title = nil
+       xpath = "//title"
+       init_var(xpath, :@title)
+       process_str(@title) unless @title.nil?
+     end
+
+     def init_author
+       @author = nil
+       xpath = "//meta[@name='author']/@content"
+       init_var(xpath, :@author)
+       process_str(@author) unless @author.nil?
+     end
+
+     def init_keywords
+       @keywords = nil
+       xpath = "//meta[@name='keywords']/@content"
+       init_var(xpath, :@keywords)
+       return @keywords = [] unless @keywords
+       @keywords = @keywords.split(",")
+       process_arr(@keywords)
+     end
+
+     def init_links
+       @links = nil
+       xpath = "//a/@href"
+       init_var(xpath, :@links, false)
+       return @links = [] unless @links
+       process_arr(@links)
+       @links.reject! { |link| link == "/" }
+       @links.map! do |link|
+         begin
+           Wgit::Url.new(link)
+         rescue
+           nil
+         end
+       end
+       @links.reject! { |link| link.nil? }
+       process_internal_links(@links)
+     end
+
+     def init_text
+       @text = nil
+       xpath = text_elements_xpath
+       init_var(xpath, :@text, false)
+       return @text = [] unless @text
+       process_arr(@text)
+     end
+
+     alias :to_hash :to_h
+     alias :relative_links :internal_links
+     alias :relative_urls :internal_links
+     alias :relative_full_links :internal_full_links
+     alias :relative_full_urls :internal_full_links
+     alias :external_urls :external_links
+   end
+ end
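A parsing and search sketch (editor's illustration using inline HTML):

    require 'wgit'

    html = "<html><title>Hello</title><p>Ruby is great. I like Ruby.</p>" \
           "<a href='/about'>About</a></html>"
    doc = Wgit::Document.new(Wgit::Url.new("http://example.com"), html)

    doc.title          # => "Hello"
    doc.internal_links # => ["/about"]
    doc.search("ruby") # => ["Ruby is great. I like Ruby."]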
data/lib/wgit/url.rb ADDED
@@ -0,0 +1,140 @@
+ require_relative 'utils'
+ require 'uri'
+
+ module Wgit
+
+   # @author Michael Telford
+   # Class modeling a web based URL.
+   # Can be an internal link e.g. "about.html"
+   # or a full URL e.g. "http://www.google.co.uk".
+   class Url < String
+     attr_accessor :crawled, :date_crawled
+
+     def initialize(url_or_doc, crawled = false, date_crawled = nil)
+       if url_or_doc.is_a?(String)
+         url = url_or_doc
+       else
+         # Init from a mongo collection document.
+         url = url_or_doc[:url]
+         crawled = url_or_doc[:crawled].nil? ? false : url_or_doc[:crawled]
+         date_crawled = url_or_doc[:date_crawled]
+       end
+       @uri = URI(url)
+       @crawled = crawled
+       @date_crawled = date_crawled
+       super(url)
+     end
+
+     def self.validate(url)
+       if Wgit::Url.relative_link?(url)
+         raise "Invalid url (or a relative link): #{url}"
+       end
+       unless url.start_with?("http://") or url.start_with?("https://")
+         raise "Invalid url (missing protocol prefix): #{url}"
+       end
+       if URI.regexp.match(url).nil?
+         raise "Invalid url: #{url}"
+       end
+     end
+
+     def self.valid?(url)
+       Wgit::Url.validate(url)
+       true
+     rescue
+       false
+     end
+
+     # Modifies the receiver url by prefixing it with a protocol.
+     # Returns the url whether it's been modified or not.
+     def self.prefix_protocol(url, https = false)
+       unless url.start_with?("http://") or url.start_with?("https://")
+         if https
+           url.replace("https://#{url}")
+         else
+           url.replace("http://#{url}")
+         end
+       end
+       url
+     end
+
+     # URI.split("http://www.google.co.uk/about.html") returns the following:
+     # array[2]: "www.google.co.uk", array[5]: "/about.html".
+     # This means that all external links in a page are expected to have a
+     # protocol prefix e.g. "http://", otherwise the link is treated as an
+     # internal link (regardless of whether it is valid or not).
+     def self.relative_link?(link)
+       link_segs = URI.split(link)
+       if not link_segs[2].nil? and not link_segs[2].empty?
+         false
+       elsif not link_segs[5].nil? and not link_segs[5].empty?
+         true
+       else
+         raise "Invalid link: #{link}"
+       end
+     end
+
+     def self.concat(host, link)
+       url = host
+       url.chop! if url.end_with?("/")
+       link = link[1..-1] if link.start_with?("/")
+       Wgit::Url.new(url + "/" + link)
+     end
+
+     def relative_link?
+       Wgit::Url.relative_link?(self)
+     end
+
+     def valid?
+       Wgit::Url.valid?(self)
+     end
+
+     def concat(link)
+       Wgit::Url.concat(self, link)
+     end
+
+     def crawled=(bool)
+       @crawled = bool
+       @date_crawled = bool ? Wgit::Utils.time_stamp : nil
+     end
+
+     def to_uri
+       @uri
+     end
+
+     def to_url
+       self
+     end
+
+     # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
+     def to_host
+       Wgit::Url.new(@uri.host)
+     end
+
+     # URI.split("http://www.google.co.uk/about.html") returns the following:
+     # array[0]: "http", array[2]: "www.google.co.uk".
+     # Returns array[0] + "://" + array[2] e.g. http://www.google.co.uk.
+     def to_base
+       if Wgit::Url.relative_link?(self)
+         raise "A relative link doesn't have a base URL: #{self}"
+       end
+       url_segs = URI.split(self)
+       if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
+         raise "Both a protocol and host are needed: #{self}"
+       end
+       base = "#{url_segs[0]}://#{url_segs[2]}"
+       Wgit::Url.new(base)
+     end
+
+     def to_h
+       ignore = [:@uri]
+       h = Wgit::Utils.to_h(self, ignore)
+       Hash[h.to_a.insert(0, [:url, self])] # Insert url at position 0.
+     end
+
+     alias :to_hash :to_h
+     alias :host :to_host
+     alias :base :to_base
+     alias :internal_link? :relative_link?
+     alias :crawled? :crawled
+   end
+ end
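A short sketch of the Url helpers (editor's illustration):

    require 'wgit'

    url = Wgit::Url.new("http://www.google.co.uk/about.html")
    url.relative_link? # => false
    url.to_host        # => "www.google.co.uk"
    url.to_base        # => "http://www.google.co.uk"
    url.crawled = true # Also records @date_crawled.

    Wgit::Url.relative_link?("about.html") # => true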
data/lib/wgit/utils.rb ADDED
@@ -0,0 +1,115 @@
+
+ module Wgit
+
+   # @author Michael Telford
+   # Utility module containing generic methods.
+   module Utils
+     def self.time_stamp
+       Time.new
+     end
+
+     # Returns a hash created from obj's instance vars and values.
+     def self.to_h(obj, ignore = [])
+       hash = {}
+       obj.instance_variables.each do |var|
+         next if ignore.include?(var)
+         hash[var[1..-1].to_sym] = obj.instance_variable_get(var)
+       end
+       hash
+     end
+
+     # An each method which handles both singleton and Enumerable objects.
+     # Yields one or more objects.
+     def self.each(obj_or_objs)
+       if obj_or_objs.respond_to?(:each)
+         obj_or_objs.each { |obj| yield obj }
+       else
+         yield obj_or_objs
+       end
+     end
+
+     # Formats the sentence (modifies the receiver) and returns its value.
+     # The length will be based on the sentence_limit parameter or the full
+     # length of the original sentence, whichever is less. The full sentence
+     # is returned if the sentence_limit is 0. The algorithm ensures that
+     # the search value is visible somewhere in the sentence.
+     def self.format_sentence_length(sentence, index, sentence_limit)
+       raise "A sentence value must be provided" if sentence.empty?
+       raise "The sentence length value must be even" if sentence_limit.odd?
+       if index < 0 or index > sentence.length
+         raise "Incorrect index value: #{index}"
+       end
+
+       return sentence if sentence_limit == 0
+
+       start = 0
+       finish = sentence.length
+
+       if sentence.length > sentence_limit
+         start = index - (sentence_limit / 2)
+         finish = index + (sentence_limit / 2)
+
+         if start < 0
+           diff = 0 - start
+           if (finish + diff) > sentence.length
+             finish = sentence.length
+           else
+             finish += diff
+           end
+           start = 0
+         elsif finish > sentence.length
+           diff = finish - sentence.length
+           if (start - diff) < 0
+             start = 0
+           else
+             start -= diff
+           end
+           finish = sentence.length
+         end
+
+         raise if sentence[start..(finish - 1)].length != sentence_limit
+       end
+
+       sentence.replace(sentence[start..(finish - 1)])
+     end
+
+     # Prints out the search results in a search engine page format.
+     # Most of the params are passed to Document#search; see the class docs.
+     # The stream param decides where the printf output is written to, and
+     # therefore must respond_to? :puts
+     # The format for each result is:
+     #
+     # Title
+     # Keywords (if there are some)
+     # Text Snippet (showing the searched for text if provided)
+     # Url
+     # <empty_line>
+     def self.printf_search_results(results, text = nil, case_sensitive = false,
+                                    sentence_length = 80, keyword_count = 5,
+                                    stream = Kernel)
+       raise "stream must respond_to? :puts" unless stream.respond_to? :puts
+       keyword_count -= 1 # Because Arrays are zero indexed.
+
+       results.each do |doc|
+         sentence = if text.nil?
+                      nil
+                    else
+                      sentence = doc.search(text, sentence_length).first
+                      if sentence.nil?
+                        nil
+                      else
+                        sentence.strip.empty? ? nil : sentence
+                      end
+                    end
+         stream.puts doc.title
+         unless doc.keywords.empty?
+           stream.puts doc.keywords[0..keyword_count].join(", ")
+         end
+         stream.puts sentence unless sentence.nil?
+         stream.puts doc.url
+         stream.puts
+       end
+       nil
+     end
+   end
+ end
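A worked example of the sentence windowing (editor's illustration): when the sentence exceeds the limit, a window of exactly sentence_limit characters is centred on the match index and shifted back inside the string at either edge.

    sentence = ("a" * 50) + "ruby" + ("b" * 50) # 104 chars, match at index 50.
    Wgit::Utils.format_sentence_length(sentence, 50, 80)
    sentence.length # => 80, with "ruby" kept near the middle of the window.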
data/lib/wgit/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Wgit
+   VERSION = "0.0.1".freeze
+ end
data/lib/wgit/web_crawler.rb ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env ruby
+
+ require_relative 'crawler'
+ require_relative 'database/database'
+
+ # @author Michael Telford
+ module Wgit
+
+   # Convenience method to crawl the World Wide Web.
+   # The default value (-1) for max_sites_to_crawl is unrestricted.
+   # The default max_data_size is 1GB.
+   def self.crawl_the_web(max_sites_to_crawl = -1, max_data_size = 1048576000)
+     db = Wgit::Database.new
+     web_crawler = Wgit::WebCrawler.new(db, max_sites_to_crawl, max_data_size)
+     web_crawler.crawl_the_web
+   end
+
+   # Class which sets up a crawler and saves the indexed
+   # docs to a database. Will crawl the web forever if you let it :-)
+   class WebCrawler
+     attr_accessor :max_sites_to_crawl, :max_data_size
+     attr_reader :crawler, :db
+
+     def initialize(database,
+                    max_sites_to_crawl = -1,
+                    max_data_size = 1048576000)
+       @crawler = Wgit::Crawler.new
+       @db = database
+       @max_sites_to_crawl = max_sites_to_crawl
+       @max_data_size = max_data_size
+     end
+
+     # Retrieves urls from the database and recursively crawls each site,
+     # storing their internal pages into the database and adding their
+     # external urls to be crawled at a later date.
+     def crawl_the_web
+       if max_sites_to_crawl < 0
+         puts "Crawling until the database has been filled or it runs out of \
+ urls to crawl (which might be never)."
+       end
+       loop_count = 0
+
+       while keep_crawling?(loop_count) do
+         puts "Current database size: #{db.size}"
+         crawler.urls = db.uncrawled_urls
+
+         if crawler.urls.empty?
+           puts "No urls to crawl, exiting."
+           break
+         end
+         puts "Starting crawl loop for: #{crawler.urls}"
+
+         docs_count = 0
+         urls_count = 0
+
+         crawler.urls.each do |url|
+           unless keep_crawling?(loop_count)
+             puts "Reached max number of sites to crawl or database \
+ capacity, exiting."
+             return
+           end
+           loop_count += 1
+
+           url.crawled = true
+           raise unless db.update(url) == 1
+
+           site_docs_count = 0
+           ext_links = crawler.crawl_site(url) do |doc|
+             unless doc.empty?
+               if write_doc_to_db(doc)
+                 docs_count += 1
+                 site_docs_count += 1
+               end
+             end
+           end
+
+           urls_count += write_urls_to_db(ext_links)
+           puts "Crawled and saved #{site_docs_count} docs for the \
+ site: #{url}"
+         end
+
+         puts "Crawled and saved docs for #{docs_count} url(s) overall for \
+ this iteration."
+         puts "Found and saved #{urls_count} external url(s) for the next \
+ iteration."
+       end
+     end
+
+     private
+
+     # Keep crawling or not based on DB size and current loop iteration.
+     def keep_crawling?(loop_count)
+       return false if db.size >= max_data_size
+       # If max_sites_to_crawl is -1 for example then crawl away.
+       if max_sites_to_crawl < 0
+         true
+       else
+         loop_count < max_sites_to_crawl
+       end
+     end
+
+     # The unique url index on the documents collection prevents duplicate
+     # inserts.
+     def write_doc_to_db(doc)
+       db.insert(doc)
+       puts "Saved document for url: #{doc.url}"
+       true
+     rescue Mongo::Error::OperationFailure
+       puts "Document already exists: #{doc.url}"
+       false
+     end
+
+     # The unique url index on the urls collection prevents duplicate inserts.
+     def write_urls_to_db(urls)
+       count = 0
+       if urls.respond_to?(:each)
+         urls.each do |url|
+           begin
+             db.insert(url)
+             count += 1
+             puts "Inserted url: #{url}"
+           rescue Mongo::Error::OperationFailure
+             puts "Url already exists: #{url}"
+           end
+         end
+       end
+       count
+     end
+   end
+ end
+
+ if __FILE__ == $0
+   Wgit.crawl_the_web
+ end
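A kick-off sketch (editor's illustration; assumes valid connection details above and that the unique url indexes exist in the database):

    require 'wgit'

    # Seed the database, then crawl at most 10 sites or 500MB of data.
    db = Wgit::Database.new
    db.insert Wgit::Url.new("http://example.com")
    Wgit.crawl_the_web(10, 524288000)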
metadata ADDED
@@ -0,0 +1,62 @@
+ --- !ruby/object:Gem::Specification
+ name: wgit
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Michael Telford
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-03-07 00:00:00.000000000 Z
+ dependencies: []
+ description: Wgit is a WWW indexer/scraper which crawls URLs and retrieves their
+   page contents for later use. Also included in this package is a means to search
+   indexed documents stored in a database. Therefore this library provides the main
+   components of a WWW search engine. You can also use Wgit to copy entire websites'
+   HTML, making it far more powerful than wget. The Wgit API is easily extendable,
+   allowing you to pull out the parts of a webpage that are important to you, the
+   CSS or JS links for example.
+ email: michael.telford@live.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - "./lib/wgit.rb"
+ - "./lib/wgit/assertable.rb"
+ - "./lib/wgit/core_ext.rb"
+ - "./lib/wgit/crawler.rb"
+ - "./lib/wgit/database/database.rb"
+ - "./lib/wgit/database/model.rb"
+ - "./lib/wgit/database/mongo_connection_details.rb"
+ - "./lib/wgit/document.rb"
+ - "./lib/wgit/url.rb"
+ - "./lib/wgit/utils.rb"
+ - "./lib/wgit/version.rb"
+ - "./lib/wgit/web_crawler.rb"
+ homepage: http://rubygems.org/gems/wgit
+ licenses:
+ - MIT
+ metadata:
+   allowed_push_host: https://rubygems.org
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5
+ signing_key:
+ specification_version: 4
+ summary: Wgit is wget on steroids with an easy to use API.
+ test_files: []