rdig 0.1.0

data/lib/rdig.rb ADDED
#!/usr/bin/env ruby

#--
# Copyright (c) 2006 Jens Kraemer
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++
#
RDIGVERSION = '0.1.0'

require 'thread'
require 'thwait'
require 'singleton'
require 'monitor'
require 'ostruct'
require 'uri'
require 'cgi'
require 'set'
require 'net/http'
require 'getoptlong'

begin
  require 'rubyful_soup'
  require 'ferret'
rescue LoadError
  require 'rubygems'
  require 'rubyful_soup'
  require 'ferret'
end

require 'htmlentities/htmlentities'

require 'rdig/http_client'
require 'rdig/content_extractors'
require 'rdig/url_filters'
require 'rdig/search'
require 'rdig/index'
require 'rdig/crawler'

$KCODE = 'u'
require 'jcode'
# See README for basic usage information
module RDig

  class << self

    # the filter chain each URL has to run through before being crawled.
    def filter_chain
      @filter_chain ||= [
        { :maximum_redirect_filter => :max_redirects },
        :fix_relative_uri,
        :normalize_uri,
        { :hostname_filter => :include_hosts },
        { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
        { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
        RDig::UrlFilters::VisitedUrlFilter
      ]
    end

    def application
      @application ||= Application.new
    end

    def searcher
      @searcher ||= Search::Searcher.new(config.ferret)
    end

    # RDig configuration
    #
    # may be used with a block:
    #   RDig.configuration do |config| ...
    #
    # see doc/examples/config.rb for a commented example configuration
    def configuration
      if block_given?
        yield configuration
      else
        @config ||= OpenStruct.new(
          :crawler => OpenStruct.new(
            :start_urls => [ "http://localhost:3000/" ],
            :include_hosts => [ "localhost" ],
            :include_documents => nil,
            :exclude_documents => nil,
            :index_document => nil,
            :num_threads => 2,
            :max_redirects => 5,
            :wait_before_leave => 10
          ),
          :content_extraction => OpenStruct.new(
            # settings for html content extraction
            :html => OpenStruct.new(
              # select the html element that contains the content to index
              # by default, we index all inside the body tag:
              :content_tag_selector => lambda { |tagsoup|
                tagsoup.html.body
              },
              # select the html element containing the title
              :title_tag_selector => lambda { |tagsoup|
                tagsoup.html.head.title
              }
            )
          ),
          :ferret => OpenStruct.new(
            :path => "index/",
            :create => true,
            :handle_parse_errors => true,
            :analyzer => Ferret::Analysis::StandardAnalyzer.new,
            :occur_default => Ferret::Search::BooleanClause::Occur::MUST
          )
        )
      end
    end
    alias config configuration

  end
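  # Example: a configuration block as described above, overriding the
  # defaults before a crawl. A minimal sketch; host and path values are
  # placeholders.
  #
  #   RDig.configuration do |config|
  #     config.crawler.start_urls = [ 'http://www.example.com/' ]
  #     config.crawler.include_hosts = [ 'www.example.com' ]
  #     config.ferret.path = 'example_index/'
  #   end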

  class Application

    OPTIONS = [
      ['--config', '-c', GetoptLong::REQUIRED_ARGUMENT,
       "Read application configuration from CONFIG."],
      ['--help', '-h', GetoptLong::NO_ARGUMENT,
       "Display this help message."],
      ['--query', '-q', GetoptLong::REQUIRED_ARGUMENT,
       "Execute QUERY."],
      ['--version', '-v', GetoptLong::NO_ARGUMENT,
       "Display the program version."],
    ]

    # Application options from the command line
    def options
      @options ||= OpenStruct.new
    end

    # Display the program usage line.
    def usage
      puts "rdig -c configfile {options}"
    end

    # Display the rdig command line help.
    def help
      usage
      puts
      puts "Options are ..."
      puts
      OPTIONS.sort.each do |long, short, mode, desc|
        if mode == GetoptLong::REQUIRED_ARGUMENT
          if desc =~ /\b([A-Z]{2,})\b/
            long = long + "=#{$1}"
          end
        end
        printf " %-20s (%s)\n", long, short
        printf " %s\n", desc
      end
    end

    # Return a list of the command line options supported by the
    # program.
    def command_line_options
      OPTIONS.collect { |lst| lst[0..-2] }
    end

    # Handle the option given by +opt+ and +value+.
    def do_option(opt, value)
      case opt
      when '--help'
        help
        exit
      when '--config'
        options.config_file = value
      when '--query'
        options.query = value
      when '--version'
        puts "rdig, version #{RDIGVERSION}"
        exit
      else
        fail "Unknown option: #{opt}"
      end
    end

    # Read and handle the command line options.
    def handle_options
      opts = GetoptLong.new(*command_line_options)
      opts.each { |opt, value| do_option(opt, value) }
    end

    # Load the configuration file given on the command line.
    def load_configfile
      load File.expand_path(options.config_file)
    end

    # Run the +rdig+ application.
    def run
      handle_options
      begin
        load_configfile
      rescue
        puts $!.backtrace
        fail "No config file found!\n#{$!}"
      end

      if options.query
        # query the index
        puts "executing query >#{options.query}<"
        results = RDig.searcher.search(options.query)
        puts "total results: #{results[:hitcount]}"
        results[:list].each { |result|
          puts <<-EOF
  #{result[:url]}
  #{result[:title]}
  #{result[:extract]}

          EOF
        }
      else
        # rebuild the index
        @crawler = Crawler.new
        @crawler.run
      end
    end
  end
end
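Given the options table above, typical invocations look like this (the configuration file name is a placeholder; see doc/examples/config.rb for its contents):

  rdig -c config.rb                  # crawl and (re)build the index
  rdig -c config.rb -q 'some query'  # search the index built with that config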
data/lib/rdig/content_extractors.rb ADDED
# Override some methods concerned with entity resolving
# to convert entities to strings.
class BeautifulStoneSoup
  # resolve unknown html entities using the htmlentities lib
  alias :orig_unknown_entityref :unknown_entityref
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # resolve numeric entities to utf8. +ref+ arrives without the leading
  # '&#' and trailing ';', i.e. as a decimal number or as 'x' followed
  # by hex digits.
  def handle_charref(ref)
    handle_data(
      if ref =~ /\Ax([0-9a-f]{1,6})\z/i
        [$1.to_i(16)].pack('U')  # hexadecimal character reference
      else
        [ref.to_i].pack('U')     # decimal character reference
      end
    )
  end
end
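# The Array#pack('U') idiom used above turns a Unicode codepoint into
# its UTF-8 string, which is what makes the entity resolution work.
# For example:
#
#   [233].pack('U')   # => "é"  (decimal reference &#233;)
#   [0xe9].pack('U')  # => "é"  (hex reference &#xe9;)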

module RDig

  # Contains classes which are used for extracting content and meta data
  # from various content types.
  #
  # TODO: support at least pdf, too.
  module ContentExtractors

    # process the given +content+ depending on its +content_type+.
    def ContentExtractors.process(content, content_type)
      case content_type
      when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
        return HtmlContentExtractor.process(content)
      else
        puts "unable to handle content type #{content_type}"
      end
      return nil
    end
    # extracts title, content and links from html documents
    class HtmlContentExtractor

      # returns:
      # { :content => 'extracted clear text',
      #   :title => 'Title',
      #   :links => [array of urls] }
      def self.process(content)
        result = { }
        tag_soup = BeautifulSoup.new(content)
        result[:title] = extract_title(tag_soup)
        result[:links] = extract_links(tag_soup)
        result[:content] = extract_content(tag_soup)
        return result
      end
      # Extracts textual content from the HTML tree.
      #
      # - First, the root element to use is determined using the
      #   +content_element+ method, which itself uses the content_tag_selector
      #   from RDig.configuration.
      # - Then, this element is processed by +extract_text+, which will give
      #   all textual content contained in the root element and all its
      #   children.
      def self.extract_content(tag_soup)
        content = ''
        content_element(tag_soup).children { |child|
          extract_text(child, content)
        }
        return content.strip
      end
      # extracts the href attributes of all a tags, except
      # internal links like <a href="#top">
      def self.extract_links(tagsoup)
        tagsoup.find_all('a').map { |link|
          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
        }.compact
      end
      # Extracts the title from the given html tree
      def self.extract_title(tagsoup)
        the_title_tag = title_tag(tagsoup)
        if the_title_tag.is_a? String
          the_title_tag
        else
          extract_text(the_title_tag).strip if the_title_tag
        end
      end
      # Recursively extracts all text contained in the given element,
      # and appends it to content.
      def self.extract_text(element, content='')
        if element.is_a? NavigableString
          value = strip_comments(element)
          value.strip!
          unless value.empty?
            content << value
            content << ' '
          end
        elsif element.string # it's a Tag, and it has some content string
          value = element.string.strip
          unless value.empty?
            content << value
            content << ' '
          end
        else
          element.children { |child|
            extract_text(child, content)
          }
        end
      end
      # Returns the element to extract the title from.
      #
      # This may also return a string, e.g. an attribute value selected
      # from a meta tag.
      def self.title_tag(tagsoup)
        if RDig.config.content_extraction.html.title_tag_selector
          RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
        else
          tagsoup.html.head.title
        end
      end

      # Retrieve the root element to extract document content from
      def self.content_element(tagsoup)
        if RDig.config.content_extraction.html.content_tag_selector
          RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
        else
          tagsoup.html.body
        end
      end

      # Return the given string minus all html comments
      def self.strip_comments(string)
        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
      end
    end

  end
end
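Fed a small document, the extractor returns the hash documented on HtmlContentExtractor.process. A sketch, assuming RDig is loaded so the default tag selectors from RDig.configuration apply:

  html = '<html><head><title>Hello</title></head>' +
         '<body><p>Hello <a href="/world.html">world</a>!</p></body></html>'

  result = RDig::ContentExtractors.process(html, 'text/html')
  result[:title]    # => "Hello"
  result[:links]    # => ["/world.html"]
  result[:content]  # => "Hello world !" (roughly; text nodes joined with spaces)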
data/lib/rdig/crawler.rb ADDED
module RDig

  class Crawler

    def initialize
      @documents = Queue.new
      @etag_filter = ETagFilter.new
    end

    def run
      @indexer = Index::Indexer.new(RDig.config.ferret)
      filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
      RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }

      num_threads = RDig.config.crawler.num_threads
      group = ThreadsWait.new
      num_threads.times { |i|
        group.join_nowait Thread.new("fetcher #{i}") {
          filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
          while (doc = @documents.pop) != :exit
            process_document doc, filterchain
          end
        }
      }
      # Dilemma: suppose we have one start url and two threads t1 and t2:
      # t1 pops the start url from the queue, which now is empty.
      # As the queue is empty now, t2 blocks until t1 adds the links
      # retrieved from its document.
      #
      # But we need the 'queue empty' condition as a sign for us to stop
      # waiting for new entries, too.

      # check every now and then for an empty queue
      sleep_interval = RDig.config.crawler.wait_before_leave
      begin
        sleep sleep_interval
      end until @documents.empty?
      # nothing to do any more, tell the threads to exit
      num_threads.times { @documents << :exit }

      puts "waiting for threads to finish..."
      group.all_waits
    ensure
      @indexer.close if @indexer
    end
    def process_document(doc, filterchain)
      doc.fetch
      return unless @etag_filter.apply(doc)
      case doc.status
      when :success
        if doc.content
          # add links from this document to the queue
          if doc.content[:links]
            doc.content[:links].each { |url| add_url(url, filterchain, doc) }
          end
          @indexer << doc
        #else
          #puts "success but no content: #{doc.uri.to_s}"
        end
      when :redirect
        # links contains the url we were redirected to
        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
      end
    rescue
      puts "error processing document #{doc.uri.to_s}: #{$!}"
    end

    # Pipes a new document pointing to url through the filter chain;
    # if it survives that, it gets added to the documents queue for
    # further processing.
    def add_url(url, filterchain, referring_document = nil)
      return if url.nil? || url.empty?
      if referring_document
        doc = Document.new(url, referring_document.uri)
        # keep redirect count
        if referring_document.status == :redirect
          doc.redirections = referring_document.redirections + 1
        end
      else
        doc = Document.new(url)
      end

      doc = filterchain.apply(doc)

      if doc
        puts "added url #{url}"
      #else
        #puts "skipping url #{url}"
      end
      @documents << doc if doc
    end

  end
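  # Example: Application#run drives a full crawl with just two lines; the
  # same works from any script once a configuration has been loaded
  # (a sketch, the start url is a placeholder):
  #
  #   RDig.configuration { |c| c.crawler.start_urls = [ 'http://example.com/' ] }
  #   RDig::Crawler.new.run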

  class Document
    include HttpClient

    attr_reader :content
    attr_reader :content_type
    attr_reader :uri
    attr_reader :referring_uri
    attr_reader :status
    attr_reader :etag
    attr_accessor :redirections

    # url: url of this document, may be relative to the referring doc or host.
    # referrer: uri of the document we retrieved this link from
    def initialize(url, referrer = nil)
      @redirections = 0
      begin
        @uri = URI.parse(url)
      rescue URI::InvalidURIError
        raise "Cannot create document using invalid URL: #{url}"
      end
      @referring_uri = referrer
    end

    def has_content?
      !self.content.nil?
    end

    def title; @content[:title] end
    def body; @content[:content] end
    def url; @uri.to_s end
    def fetch
      puts "fetching #{@uri.to_s}"
      response = do_get(@uri)
      case response
      when Net::HTTPSuccess
        @content_type = response['content-type']
        @raw_body = response.body
        @etag = response['etag']
        # TODO: externalize this (another chain?)
        @content = ContentExtractors.process(@raw_body, @content_type)
        @status = :success
      when Net::HTTPRedirection
        @status = :redirect
        @content = { :links => [ response['location'] ] }
      else
        puts "don't know what to do with response: #{response}"
      end
    end

  end
  # Checks fetched documents' ETag headers against the list of ETags
  # of the documents already indexed.
  # This is supposed to help against double-indexing documents which can
  # be reached via different URLs (think http://host.com/ and
  # http://host.com/index.html).
  # Documents without an ETag are allowed to pass through.
  class ETagFilter
    include MonitorMixin

    def initialize
      @etags = Set.new
      super
    end

    def apply(document)
      return document unless document.etag
      synchronize do
        @etags.add?(document.etag) ? document : nil
      end
    end
  end

end
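The deduplication in ETagFilter#apply hinges on Set#add?, which returns nil when the element is already in the set. A sketch with hypothetical OpenStruct stand-ins for documents, assuming RDig is loaded (only the etag reader matters here):

  filter = RDig::ETagFilter.new
  doc_a = OpenStruct.new(:etag => '"abc123"')
  doc_b = OpenStruct.new(:etag => '"abc123"')  # same ETag, reached via another URL

  filter.apply(doc_a)  # => doc_a (first time this ETag is seen)
  filter.apply(doc_b)  # => nil   (duplicate; the crawler skips it)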