rdig 0.2.1 → 0.3.0
- data/CHANGES +14 -0
- data/doc/examples/config.rb +43 -7
- data/lib/rdig.rb +35 -20
- data/lib/rdig/content_extractors.rb +75 -37
- data/lib/rdig/crawler.rb +18 -91
- data/lib/rdig/documents.rb +133 -0
- data/lib/rdig/file.rb +18 -0
- data/lib/rdig/index.rb +6 -4
- data/lib/rdig/url_filters.rb +42 -9
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/unit/crawler_fs_test.rb +32 -0
- data/test/unit/file_document_test.rb +34 -0
- data/test/unit/html_content_extractor_test.rb +14 -24
- data/test/unit/pdf_content_extractor_test.rb +3 -3
- data/test/unit/url_filters_test.rb +38 -38
- data/test/unit/word_content_extractor_test.rb +1 -1
- metadata +8 -4
- data/lib/rdig/http_client.rb +0 -22
data/CHANGES
CHANGED
@@ -1,3 +1,17 @@
+0.3.0
+- file system crawling
+- optional url rewriting before indexing, e.g. for linking to results
+  via http and building the index directly from the file system
+- PDF title extraction with pdfinfo
+- removed dependency on mkmf which doesn't seem to exist in Ruby 1.8.2
+- made content extractors more flexible - instances now use a given
+  configuration instead of the global one. This allows the
+  WordContentExtractor to use an HtmlContentExtractor with it's own
+  configuration that is independent of the global config.
+
+0.2.1
+- Bugfix release
+
 0.2.0
 - add pdf and Word content extraction capabilities using the tools
   from the xpdf-utils and wv packages
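
Taken together, the new features let 0.3.0 crawl a directory tree and still store web-reachable URLs in the index. A minimal sketch combining options from the example config further down in this diff; the local path and host name are placeholders:

    RDig.configuration do |cfg|
      # crawl a local directory tree instead of a web site
      cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
      # a single fetcher thread is the suggested setting for file system crawling
      cfg.crawler.num_threads = 1
      cfg.index.path = '/path/to/index'
      # store http URLs instead of file:// uris, so search results can link
      # to the same documents served by a web server
      cfg.index.rewrite_uri = lambda { |uri|
        uri.path.gsub!(/^\/home\/bob\/documents\//, '/docs/')
        uri.scheme = 'http'
        uri.host = 'www.example.com'
      }
    end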
data/doc/examples/config.rb
CHANGED
@@ -1,25 +1,36 @@
 RDig.configuration do |cfg|
 
   ##################################################################
-  # options you should
+  # options you really should set
 
   # provide one or more URLs for the crawler to start from
   cfg.crawler.start_urls = [ 'http://www.example.com/' ]
 
+  # use something like this for crawling a file system:
+  # cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
+  # beware, mixing file and http crawling is not possible and might result in
+  # unpredictable results.
+
   # limit the crawl to these hosts. The crawler will never
   # follow any links pointing to hosts other than those given here.
+  # ignored for file system crawling
   cfg.crawler.include_hosts = [ 'www.example.com' ]
 
   # this is the path where the index will be stored
   # caution, existing contents of this directory will be deleted!
-  cfg.
+  cfg.indexer.path = '/path/to/index'
 
   ##################################################################
   # options you might want to set, the given values are the defaults
+
+  # set to true to get stack traces on errors
+  # cfg.verbose = false
 
   # content extraction options
 
-  # provide a method that
+  # provide a method that returns the title of an html document
+  # this method may either return a tag to extract the title from,
+  # or a ready-to-index string.
   # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
 
   # provide a method that selects the tag containing the page content you
@@ -29,8 +40,12 @@ RDig.configuration do |cfg|
 
   # crawler options
 
-  #
-  #
+  # Notice: for file system crawling the include/exclude_document patterns are
+  # applied to the full path of _files_ only (like /home/bob/test.pdf),
+  # for http to full URIs (like http://example.com/index.html).
+
+  # nil (include all documents) or an array of Regexps
+  # matching the URLs you want to index.
   # cfg.crawler.include_documents = nil
 
   # nil (no documents excluded) or an array of Regexps
@@ -40,14 +55,35 @@ RDig.configuration do |cfg|
   # included by the inclusion patterns.
   # cfg.crawler.exclude_documents = nil
 
-  # number of
+  # number of document fetching threads to use. Should be raised only if
+  # your CPU has idle time when indexing.
   # cfg.crawler.num_threads = 2
+  # suggested setting for file system crawling:
+  # cfg.crawler.num_threads = 1
 
   # maximum number of http redirections to follow
   # cfg.crawler.max_redirects = 5
 
   # number of seconds to wait with an empty url queue before
-  # finishing the crawl. Set to a higher number
+  # finishing the crawl. Set to a higher number when experiencing incomplete
+  # crawls on slow sites. Don't set to 0, even when crawling a local fs.
   # cfg.crawler.wait_before_leave = 10
+
+  # indexer options
+
+  # create a new index on each run. Will append to the index if false. Use when
+  # building a single index from multiple runs, e.g. one across a website and the
+  # other a tree in a local file system
+  # config.index.create = true
+
+  # rewrite document uris before indexing them. This is useful if you're
+  # indexing on disk, but the documents should be accessible via http, e.g. from
+  # a web based search application. By default, no rewriting takes place.
+  # example:
+  # cfg.index.rewrite_uri = lambda { |uri|
+  #   uri.path.gsub!(/^\/base\//, '/virtual_dir/')
+  #   uri.scheme = 'http'
+  #   uri.host = 'www.mydomain.com'
+  # }
 
 end
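
The rewrite_uri lambda mutates the document's URI in place before indexing (the Indexer calls it in the index.rb hunk further down). A stand-alone illustration of the example lambda above, using only the stdlib URI class:

    require 'uri'

    rewrite = lambda { |uri|
      uri.path.gsub!(/^\/base\//, '/virtual_dir/')
      uri.scheme = 'http'
      uri.host = 'www.mydomain.com'
    }

    uri = URI.parse('file:///base/docs/readme.html')
    rewrite.call(uri)
    uri.to_s  # => "http://www.mydomain.com/virtual_dir/docs/readme.html"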
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.
+RDIGVERSION = '0.3.0'
 
 
 require 'thread'
@@ -38,28 +38,28 @@ require 'set'
 require 'net/http'
 require 'getoptlong'
 require 'tempfile'
-
-# programs:
-require 'mkmf'
+require 'open-uri'
 
 begin
-  require 'rubyful_soup'
   require 'ferret'
+  require 'rubyful_soup'
rescue LoadError
   require 'rubygems'
-  require 'rubyful_soup'
   require 'ferret'
+  require 'rubyful_soup'
 end
 
 require 'htmlentities/htmlentities'
-
-require 'rdig/http_client'
+
 require 'rdig/content_extractors'
 require 'rdig/url_filters'
 require 'rdig/search'
 require 'rdig/index'
+require 'rdig/file'
+require 'rdig/documents'
 require 'rdig/crawler'
 
+
 $KCODE = 'u'
 require 'jcode'
 
@@ -68,17 +68,30 @@ module RDig
 
   class << self
 
-    # the filter
+    # the filter chains are for limiting the set of indexed documents.
+    # there are two chain types - one for http, and one for file system
+    # crawling.
+    # a document has to survive all filters in the chain to get indexed.
     def filter_chain
-      @filter_chain ||=
-
-        :
-
-
-
-
-
-
+      @filter_chain ||= {
+        # filter chain for http crawling
+        :http => [
+          :scheme_filter_http,
+          :fix_relative_uri,
+          :normalize_uri,
+          { :hostname_filter => :include_hosts },
+          { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
+          { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
+          RDig::UrlFilters::VisitedUrlFilter
+        ],
+        # filter chain for file system crawling
+        :file => [
+          :scheme_filter_file,
+          { RDig::UrlFilters::PathInclusionFilter => :include_documents },
+          { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
+        ]
+      }
+
     end
 
     def application
@@ -86,7 +99,7 @@ module RDig
     end
 
     def searcher
-      @searcher ||= Search::Searcher.new(config.
+      @searcher ||= Search::Searcher.new(config.index)
     end
 
     # RDig configuration
@@ -124,7 +137,7 @@ module RDig
         }
       )
     ),
-    :
+    :index => OpenStruct.new(
      :path => "index/",
      :create => true,
      :handle_parse_errors => true,
@@ -224,6 +237,8 @@ module RDig
 
     end
 
+    puts "using Ferret #{Ferret::VERSION}"
+
     if options.query
       # query the index
       puts "executing query >#{options.query}<"
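
Each chain entry is one of three forms: a Symbol naming a filter function in RDig::UrlFilters, a one-entry Hash pairing a filter (symbol or class) with the crawler config option that parameterizes it, or a bare filter class. The FilterChain implementation itself is not part of this diff, so the following interpreter is purely hypothetical and only illustrates how the structure above can be resolved:

    # Illustration only, not the actual FilterChain code.
    def build_filters(chain_description)
      chain_description.map do |entry|
        case entry
        when Symbol
          # e.g. :scheme_filter_http => UrlFilters.scheme_filter_http(doc)
          lambda { |doc| RDig::UrlFilters.send(entry, doc) }
        when Hash
          filter, option = entry.to_a.first
          args = RDig.config.crawler.send(option)
          if filter.is_a?(Class)
            # e.g. UrlInclusionFilter.new(cfg.crawler.include_documents)
            instance = filter.new(args)
            lambda { |doc| instance.apply(doc) }
          else
            # e.g. UrlFilters.hostname_filter(doc, cfg.crawler.include_hosts)
            lambda { |doc| RDig::UrlFilters.send(filter, doc, args) }
          end
        when Class
          # e.g. VisitedUrlFilter, instantiated without arguments
          instance = entry.new
          lambda { |doc| instance.apply(doc) }
        end
      end
    end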
data/lib/rdig/content_extractors.rb
CHANGED
@@ -54,7 +54,9 @@ module RDig
 
     def self.extractors; @@extractors ||= [] end
     def self.extractor_instances
-      @@extractor_instances ||= extractors.map { |ex_class|
+      @@extractor_instances ||= extractors.map { |ex_class|
+        ex_class.new(RDig.configuration.content_extraction)
+      }
     end
 
     def self.process(content, content_type)
@@ -65,6 +67,10 @@ module RDig
       nil
     end
 
+    def initialize(config)
+      @config = config
+    end
+
     def can_do(content_type)
       content_type =~ @pattern
     end
@@ -91,60 +97,88 @@ module RDig
         file.delete
       end
 
-
-
-      @available = !find_executable(@executable).nil?
-    end
-    @available
-  end
-
+    # setting @available according to presence of external executables
+    # in initializer of ContentExtractor is needed to make this work
     def can_do(content_type)
-      available and super(content_type)
+      @available and super(content_type)
     end
   end
 
   # Extract text from pdf content.
   #
-  # Requires the pdftotext
+  # Requires the pdftotext and pdfinfo utilities from the
+  # xpdf-utils package
   # (on debian and friends do 'apt-get install xpdf-utils')
   #
-  # TODO: use pdfinfo to get title from document
   class PdfContentExtractor < ContentExtractor
     include ExternalAppHelper
 
-    def initialize
-
+    def initialize(config)
+      super(config)
       @pattern = /^application\/pdf/
+      @pdftotext = 'pdftotext'
+      @pdfinfo = 'pdfinfo'
+      @available = true
+      [ @pdftotext, @pdfinfo].each { |program|
+        unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+          @available = false
+          break
+        end
+      }
     end
-
+
+    def process(content)
+      result = {}
+      as_file(content) do |file|
+        result[:content] = get_content(file.path).strip
+        result[:title] = get_title(file.path)
+      end
+      result
+    end
+
     def get_content(path_to_tempfile)
-      %x{#{@
+      %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+    end
+
+    # extracts the title from pdf meta data
+    # needs pdfinfo
+    # returns the title or nil if no title was found
+    def get_title(path_to_tempfile)
+      %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+    rescue
     end
   end
 
   # Extract text from word documents
   #
-  # Requires the
-  # (on debian and friends do 'apt-get install
+  # Requires the wvHtml utility
+  # (on debian and friends do 'apt-get install wv')
   class WordContentExtractor < ContentExtractor
     include ExternalAppHelper
 
-    def initialize
-
+    def initialize(config)
+      super(config)
+      @wvhtml = 'wvHtml'
       @pattern = /^application\/msword/
-
+      # html extractor for parsing wvHtml output
+      @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
+        :html => OpenStruct.new(
+          :content_tag_selector => lambda { |tagsoup|
+            tagsoup.html.body
+          },
+          :title_tag_selector => lambda { |tagsoup|
+            tagsoup.html.head.title
+          }
+      )))
+
+      # TODO: besser: if $?.exitstatus == 127 (not found)
+      @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
     end
 
     def process(content)
       result = {}
-      as_file(content) do |
-
-        outfile.close
-        %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
-        File.open(outfile.path) do |html|
-          result = @html_extractor.process(html.read)
-        end
-        outfile.delete
+      as_file(content) do |file|
+        result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
       end
       return result || {}
     end
@@ -154,7 +188,8 @@ module RDig
   # extracts title, content and links from html documents
   class HtmlContentExtractor < ContentExtractor
 
-    def initialize
+    def initialize(config)
+      super(config)
       @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
     end
 
@@ -181,9 +216,10 @@ module RDig
     # children.
     def extract_content(tag_soup)
       content = ''
-      content_element(tag_soup)
+      ce = content_element(tag_soup)
+      ce.children { |child|
         extract_text(child, content)
-      }
+      } unless ce.nil?
       return content.strip
     end
 
@@ -197,18 +233,20 @@ module RDig
 
     # Extracts the title from the given html tree
     def extract_title(tagsoup)
-      title = ''
       the_title_tag = title_tag(tagsoup)
       if the_title_tag.is_a? String
         the_title_tag
       else
-
+        title = ''
+        extract_text(the_title_tag, title)
+        title.strip
       end
     end
 
     # Recursively extracts all text contained in the given element,
     # and appends it to content.
     def extract_text(element, content='')
+      return nil if element.nil?
       if element.is_a? NavigableString
         value = strip_comments(element)
         value.strip!
@@ -234,8 +272,8 @@ module RDig
     # This may return a string, e.g. an attribute value selected from a meta
     # tag, too.
     def title_tag(tagsoup)
-      if
-
+      if @config.html.title_tag_selector
+        @config.html.title_tag_selector.call(tagsoup)
       else
         tagsoup.html.head.title
       end
@@ -243,8 +281,8 @@ module RDig
 
     # Retrieve the root element to extract document content from
     def content_element(tagsoup)
-      if
-
+      if @config.html.content_tag_selector
+        @config.html.content_tag_selector.call(tagsoup)
       else
         tagsoup.html.body
       end
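
Because every extractor now receives its configuration at construction time, one can be used stand-alone without touching global state; the updated unit tests below build them the same way. A short sketch, reading the simple.pdf fixture shipped in this release:

    extraction_cfg = RDig.configuration.content_extraction
    extractor = RDig::ContentExtractors::PdfContentExtractor.new(extraction_cfg)
    if extractor.can_do('application/pdf')
      result = extractor.process(File.read('test/fixtures/pdf/simple.pdf'))
      result[:title]    # from the pdf meta data, extracted with pdfinfo
      result[:content]  # plain text, extracted with pdftotext
    end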
data/lib/rdig/crawler.rb
CHANGED
@@ -9,30 +9,28 @@ module RDig
     end
 
     def run
-
-
+      raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
+      @indexer = Index::Indexer.new(RDig.config.index)
+
+      # check whether we are indexing on-disk or via http
+      url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
+      chain_config = RDig.filter_chain[url_type]
+
+      filterchain = UrlFilters::FilterChain.new(chain_config)
       RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
 
       num_threads = RDig.config.crawler.num_threads
       group = ThreadsWait.new
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {
-          filterchain = UrlFilters::FilterChain.new(
+          filterchain = UrlFilters::FilterChain.new(chain_config)
           while (doc = @documents.pop) != :exit
             process_document doc, filterchain
           end
         }
       }
 
-      #
-      # t1 pops the start url from the queue which now is empty
-      # as the queue is empty now, t2 blocks until t1 adds the links
-      # retrieved from his document.
-      #
-      # But we need the 'queue empty' condition as a sign for us to stop
-      # waiting for new entries, too.
-
-      # check every now and then for an empty queue
+      # check for an empty queue every now and then
       sleep_interval = RDig.config.crawler.wait_before_leave
       begin
         sleep sleep_interval
@@ -54,22 +52,10 @@ module RDig
       } unless doc.content[:links].nil?
 
       return unless @etag_filter.apply(doc)
-
-      when :success
-        if doc.content
-          if doc.content[:links]
-            doc.content[:links].each { |url| add_url(url, filterchain, doc) }
-          end
-          @indexer << doc
-        #else
-          #puts "success but no content: #{doc.uri.to_s}"
-        end
-      when :redirect
-        # links contains the url we were redirected to
-        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
-      end
+      @indexer << doc if doc.needs_indexing?
     rescue
       puts "error processing document #{doc.uri.to_s}: #{$!}"
+      puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
     end
 
 
@@ -78,82 +64,23 @@ module RDig
     # processing
     def add_url(url, filterchain, referring_document = nil)
       return if url.nil? || url.empty?
-      if referring_document
-        doc = Document.
-        # keep redirect count
-        if referring_document.status == :redirect
-          doc.redirections = referring_document.redirections + 1
-        end
+      if referring_document and referring_document.uri.scheme =~ /^https?/i
+        doc = Document.create(url, referring_document.uri)
       else
-        doc = Document.
+        doc = Document.create(url)
       end
 
       doc = filterchain.apply(doc)
 
       if doc
-
-        #
-        #puts "skipping url #{url}"
+        @documents << doc
+        puts "added url #{url}" if RDig::config.verbose
       end
-      @documents << doc if doc
     end
 
   end
 
 
-  class Document
-    include HttpClient
-
-    attr_reader :content
-    attr_reader :content_type
-    attr_reader :uri
-    attr_reader :referring_uri
-    attr_reader :status
-    attr_reader :etag
-    attr_accessor :redirections
-
-    # url: url of this document, may be relative to the referring doc or host.
-    # referrer: uri of the document we retrieved this link from
-    def initialize(url, referrer = nil)
-      @redirections = 0
-      begin
-        @uri = URI.parse(url)
-      rescue URI::InvalidURIError
-        raise "Cannot create document using invalid URL: #{url}"
-      end
-      @referring_uri = referrer
-    end
-
-    def has_content?
-      !self.content.nil?
-    end
-
-    def title; @content[:title] end
-    def body; @content[:content] end
-    def url; @uri.to_s end
-
-    def fetch
-      puts "fetching #{@uri.to_s}"
-      response = do_get(@uri)
-      case response
-      when Net::HTTPSuccess
-        @content_type = response['content-type']
-        @raw_body = response.body
-        @etag = response['etag']
-        # todo externalize this (another chain ?)
-        @content = ContentExtractors.process(@raw_body, @content_type)
-        @status = :success
-      when Net::HTTPRedirection
-        @status = :redirect
-        @content = { :links => [ response['location'] ] }
-      else
-        puts "don't know what to do with response: #{response}"
-      end
-
-    end
-
-  end
-
   # checks fetched documents' E-Tag headers against the list of E-Tags
   # of the documents already indexed.
   # This is supposed to help against double-indexing documents which can
@@ -169,7 +96,7 @@ module RDig
     end
 
     def apply(document)
-      return document unless document.etag
+      return document unless (document.respond_to?(:etag) && document.etag)
       synchronize do
         @etags.add?(document.etag) ? document : nil
       end
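
With fetching moved into the Document classes (next file), running a crawl comes down to configuring and calling run, just as the new crawler_fs_test.rb below does; the start url is a placeholder:

    RDig.configuration do |cfg|
      cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
      cfg.crawler.num_threads = 1
      cfg.index.path = 'tmp/test-index'
    end
    crawler = RDig::Crawler.new
    crawler.run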
data/lib/rdig/documents.rb
ADDED
@@ -0,0 +1,133 @@
+module RDig
+
+  #
+  # Document base class
+  #
+  class Document
+
+    attr_reader :uri
+    attr_reader :content
+    attr_reader :content_type
+
+    def self.create(url, referrer_uri = nil)
+      # a referrer is a clear enough hint to create an HttpDocument
+      if referrer_uri && referrer_uri.scheme =~ /^https?$/i
+        return HttpDocument.new(:url => url, :referrer => referrer_uri)
+      end
+
+      case url
+      when /^https?:\/\//i
+        HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
+      when /^file:\/\//i
+        # files don't have referrers - the check for nil prevents us from being
+        # tricked into indexing local files by file:// links in the web site
+        # we index.
+        FileDocument.new(:url => url) if referrer_uri.nil?
+      end
+    end
+
+    # url: url of this document, may be relative to the referring doc or host.
+    # referrer: uri of the document we retrieved this link from
+    def initialize(args)
+      begin
+        @uri = URI.parse(args[:url])
+      rescue URI::InvalidURIError
+        raise "Cannot create document using invalid URL: #{url}"
+      end
+    end
+
+    def title; @content[:title] end
+    def body; @content[:content] end
+    def links; @content[:links] end
+
+    def needs_indexing?
+      has_content? && (title || body)
+    end
+
+    def has_content?
+      !self.content.nil?
+    end
+
+  end
+
+
+  #
+  # Document in a File system
+  #
+  class FileDocument < Document
+    def initialize(args={})
+      super(args)
+    end
+
+    def self.find_files(path)
+      links = []
+      Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+        # Skip files not matching known mime types
+        pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
+        if File.directory?(filename) || filename =~ pattern
+          links << "file://#{filename}"
+        end
+      end
+      links
+    end
+
+    def file?
+      File.file? @uri.path
+    end
+
+    def fetch
+      if File.directory? @uri.path
+        # directories are treated like a link collection
+        @content = { :links => self.class.find_files(@uri.path) }
+      else
+        # process this file's contents
+        open(@uri.path) do |file|
+          @content = ContentExtractors.process(file.read, file.content_type)
+          @content[:links] = nil if @content # don't follow links inside files
+        end
+      end
+      @content ||= {}
+    end
+
+  end
+
+
+  #
+  # Remote Document to be retrieved by HTTP
+  #
+  class HttpDocument < Document
+
+    attr_reader :referring_uri
+    attr_reader :status
+    attr_reader :etag
+
+    # url: url of this document, may be relative to the referring doc or host.
+    # referrer: uri of the document we retrieved this link from
+    def initialize(args={})
+      super(args)
+      @referring_uri = args[:referrer]
+    end
+
+    def fetch
+      puts "fetching #{@uri.to_s}" if RDig::config.verbose
+      open(@uri.to_s) do |doc|
+        case doc.status.first.to_i
+        when 200
+          @etag = doc.meta['etag']
+          # puts "etag: #{@etag}"
+          @content = ContentExtractors.process(doc.read, doc.content_type)
+          @status = :success
+        when 404
+          puts "got 404 for #{url}"
+        else
+          puts "don't know what to do with response: #{doc.status.join(' : ')}"
+        end
+      end
+    rescue
+      puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
+    ensure
+      @content ||= {}
+    end
+
+  end
+end
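
Document.create is the factory the crawler's add_url relies on; the URL scheme, or the presence of an http referrer, decides which subclass gets built:

    RDig::Document.create('file:///home/bob/documents/readme.pdf').class
    # => RDig::FileDocument

    RDig::Document.create('http://www.example.com/index.html').class
    # => RDig::HttpDocument

    # a link found on a crawled page, possibly relative: an http referrer
    # always yields an HttpDocument
    referrer = URI.parse('http://www.example.com/')
    RDig::Document.create('about.html', referrer).class
    # => RDig::HttpDocument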
data/lib/rdig/file.rb
ADDED
@@ -0,0 +1,18 @@
+# Extend class File with a content_type method
+class File
+
+  # mime types and file extensions
+  FILE_EXTENSION_MIME_TYPES = {
+    'doc' => 'application/msword',
+    'html' => 'text/html',
+    'htm' => 'text/html',
+    #'.odt' => 'application/vnd.oasis.opendocument.text',
+    'pdf' => 'application/pdf',
+    'txt' => 'text/plain',
+  }
+
+  def content_type
+    FILE_EXTENSION_MIME_TYPES[File.extname(self.path).downcase.gsub(/^\./,'')] || 'application/octet-stream'
+  end
+
+end
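
This gives every File handle a content type derived from its extension, falling back to application/octet-stream for anything unknown; the paths are placeholders:

    File.open('/home/bob/documents/report.pdf') { |f| f.content_type }
    # => "application/pdf"
    File.open('/home/bob/documents/archive.zip') { |f| f.content_type }
    # => "application/octet-stream"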
data/lib/rdig/index.rb
CHANGED
@@ -6,7 +6,7 @@ module RDig
     include MonitorMixin, Ferret::Index, Ferret::Document
 
     def initialize(settings)
-
+      @config = settings
       @index_writer = IndexWriter.new(settings.path,
                                       :create => settings.create,
                                       :analyzer => settings.analyzer)
@@ -14,10 +14,12 @@ module RDig
     end
 
     def add_to_index(document)
-      puts "add to index: #{document.uri.to_s}"
+      puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
       doc = Ferret::Document::Document.new
-
-
+      @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
+
+      doc << Field.new("url", document.uri.to_s,
+                       Field::Store::YES, Field::Index::TOKENIZED)
       doc << Field.new("title", document.title,
                        Field::Store::YES, Field::Index::TOKENIZED)
       doc << Field.new("data", document.body,
data/lib/rdig/url_filters.rb
CHANGED
@@ -82,7 +82,7 @@ module RDig
 
 
     # base class for url inclusion / exclusion filters
-    class
+    class PatternFilter
       # takes an Array of Regexps, or nil to disable the filter
       def initialize(args=nil)
         unless args.nil?
@@ -98,8 +98,8 @@ module RDig
         end
       end
     end
-    class UrlExclusionFilter <
-      # returns nil if any of the patterns matches it's
+    class UrlExclusionFilter < PatternFilter
+      # returns nil if any of the patterns matches it's URI,
       # the document itself otherwise
       def apply(document)
         return document unless @patterns
@@ -109,9 +109,9 @@ module RDig
         return document
       end
     end
-    class UrlInclusionFilter <
-      # returns
-      #
+    class UrlInclusionFilter < PatternFilter
+      # returns the document if any of the patterns matches it's URI,
+      # nil otherwise
       def apply(document)
         return document unless @patterns
         @patterns.each { |p|
@@ -121,21 +121,42 @@ module RDig
         end
       end
     end
-
+    # returns nil if any of the patterns matches it's path,
+    # the document itself otherwise. Applied to real files only.
+    class PathExclusionFilter < PatternFilter
+      def apply(document)
+        return document unless (@patterns && document.file?)
+        @patterns.each { |p|
+          return nil if document.uri.path =~ p
+        }
+        return document
+      end
+    end
+    # returns the document if any of the patterns matches it's path,
+    # nil otherwise. Applied to real files only
+    class PathInclusionFilter < PatternFilter
+      def apply(document)
+        return document unless (@patterns && document.file?)
+        @patterns.each { |p|
+          return document if document.uri.path =~ p
+        }
+        return nil
+      end
+    end
 
 
     # checks redirect count of the given document
     # takes it out of the chain if number of redirections exceeds the
     # max_redirects setting
     def UrlFilters.maximum_redirect_filter(document, max_redirects)
-      return nil if document.redirections > max_redirects
+      return nil if document.respond_to?(:redirections) && document.redirections > max_redirects
       return document
     end
 
     # expands both href="/path/xyz.html" and href="affe.html"
     # to full urls
     def UrlFilters.fix_relative_uri(document)
-      return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^
+      #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
       ref = document.referring_uri
       return document unless ref
       uri = document.uri
@@ -150,6 +171,9 @@ module RDig
         uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
       end
       return document
+    rescue
+      p document
+      p document.uri
     end
 
     def UrlFilters.hostname_filter(document, include_hosts)
@@ -167,5 +191,14 @@ module RDig
       return document
     end
 
+    def UrlFilters.scheme_filter_file(document)
+      return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^file$/i)
+      nil
+    end
+    def UrlFilters.scheme_filter_http(document)
+      return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^https?$/i)
+      nil
+    end
+
   end
 end
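
The new path filters mirror the URL filters but match against uri.path and only apply to real files; directories always pass so the crawler can still descend into them. A brief sketch, assuming the file and directory actually exist on disk:

    f = RDig::UrlFilters::PathExclusionFilter.new([ /~$/ ])
    f.apply(RDig::Document.create('file:///home/bob/documents/draft.txt~'))
    # => nil, backup files are dropped
    f.apply(RDig::Document.create('file:///home/bob/documents/'))
    # => the document itself, directories are not filtered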
data/test/fixtures/pdf/simple.pdf
CHANGED
Binary file
data/test/unit/crawler_fs_test.rb
ADDED
@@ -0,0 +1,32 @@
+require 'test_helper'
+class CrawlerFsTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+    index_dir = 'tmp/test-index'
+    Dir.mkdir index_dir unless File.directory? index_dir
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+      cfg.crawler.num_threads = 1
+      cfg.crawler.wait_before_leave = 1
+      cfg.index.path = index_dir
+      cfg.verbose = true
+    end
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_crawl
+    crawler = Crawler.new
+    crawler.run
+  end
+
+end
+
+
data/test/unit/file_document_test.rb
ADDED
@@ -0,0 +1,34 @@
+require 'test_helper'
+class FileDocumentTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+  end
+
+  def test_find_files
+    links = FileDocument.find_files(@fixture_path)
+    assert_equal 3, links.size
+    links = FileDocument.find_files("#{@fixture_path}/html")
+    assert_equal 3, links.size
+  end
+
+  def test_fetch_directory
+    dir = Document.create("file://#{@fixture_path}")
+    dir.fetch
+    assert_equal 3, dir.links.size
+    dir = Document.create("file://#{@fixture_path}/pdf")
+    dir.fetch
+    assert_equal 1, dir.links.size
+  end
+
+  def test_fetch_content
+    file = Document.create("file://#{@fixture_path}/pdf/simple.pdf")
+    file.fetch
+    assert file.needs_indexing?
+    assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', file.body
+  end
+
+end
+
+
data/test/unit/html_content_extractor_test.rb
CHANGED
@@ -3,13 +3,9 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @
+    @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
+    @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
     @nbsp = [160].pack('U') # non breaking space
-    @config_backup = RDig.config.content_extraction.html.clone
-  end
-
-  def teardown
-    RDig.config.content_extraction.html = @config_backup
   end
 
   def test_can_do
@@ -41,13 +37,11 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
 
   def test_custom_content_element
-
-
-
-
-
-      tagsoup.find('div', :attrs => { 'id', 'content' })
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      tagsoup.find('h1', :attrs => { 'class', 'title' })
+    end
+    @config.html.content_tag_selector = lambda do |tagsoup|
+      tagsoup.find('div', :attrs => { 'id', 'content' })
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Sample Title in h1', result[:title]
@@ -61,23 +55,19 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
 
 
   def test_title_from_dcmeta
-
-
-      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Title from DC meta data', result[:title]
   end
 
   def test_preprocessed_title
-
-
-
-
-
-      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
+      # use only a portion of the title tag's contents if it matches our
+      # regexp:
+      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Title from DC', result[:title]
data/test/unit/pdf_content_extractor_test.rb
CHANGED
@@ -3,7 +3,7 @@ class PdfContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @ce = ContentExtractors::PdfContentExtractor.new
+    @ce = ContentExtractors::PdfContentExtractor.new(RDig.configuration.content_extraction)
   end
 
   def test_can_do
@@ -23,10 +23,10 @@ class PdfContentExtractorTest < Test::Unit::TestCase
   private
   def check_content(result)
     assert_not_nil result
-
+    assert_equal 'PDF Test', result[:title]
     assert_nil result[:links]
     assert_not_nil result[:content]
-    assert_equal 'This
+    assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', result[:content]
   end
 
 end
data/test/unit/url_filters_test.rb
CHANGED
@@ -13,17 +13,17 @@ class UrlFilterTest < Test::Unit::TestCase
     ]
     chain = UrlFilters::FilterChain.new(cfg)
 
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
+    assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
   end
 
   # test default chain config
   def test_default_filterchain
-    chain = UrlFilters::FilterChain.new(RDig.filter_chain)
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    chain = UrlFilters::FilterChain.new(RDig.filter_chain[:http])
+    assert_nil chain.apply(Document.create("http://www.example.com/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://localhost:3000/affe.html"))
+    assert_nil chain.apply(Document.create("http://localhost.com/affe.html"))
   end
 
   # check lookup of chain parameters from config
@@ -38,59 +38,59 @@ class UrlFilterTest < Test::Unit::TestCase
     ]
     chain = UrlFilters::FilterChain.new(cfg)
 
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
+    assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
   end
 
   def test_urlpattern_filter
     f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.html"))
     f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
-    assert_not_nil f.apply(Document.
-    assert_nil f.apply(Document.
-    assert_nil f.apply(Document.
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_nil f.apply(Document.create("http://test.host/affe.html"))
+    assert_nil f.apply(Document.create("http://test.host/affe.aspx"))
     f = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
     f = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
-    assert_nil f.apply(Document.
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
+    assert_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
   end
 
   def test_hostname_filter
     include_hosts = [ 'test.host', 'localhost' ]
-    assert_nil UrlFilters.hostname_filter(Document.
-    assert_not_nil UrlFilters.hostname_filter(Document.
-    assert_not_nil UrlFilters.hostname_filter(Document.
+    assert_nil UrlFilters.hostname_filter(Document.create('http://google.com/'), include_hosts)
+    assert_not_nil UrlFilters.hostname_filter(Document.create('http://test.host/file.html'), include_hosts)
+    assert_not_nil UrlFilters.hostname_filter(Document.create('http://localhost/file.html'), include_hosts)
   end
 
   def test_fix_relative_uri
-    doc = Document.
+    doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/../another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
     assert_equal('HTTP://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+      UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host/dir/')
     assert_equal('https://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host/')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
   end
 end
metadata
CHANGED
@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
+rubygems_version: 0.8.11.15
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.
-date: 2006-04-
+  version: 0.3.0
+date: 2006-04-26 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
 platform: ruby
 signing_key:
 cert_chain:
+post_install_message:
 authors:
 - Jens Kraemer
 files:
@@ -32,13 +33,14 @@ files:
 - lib/rdig
 - lib/htmlentities
 - lib/rdig.rb
-- lib/rdig/http_client.rb
 - lib/rdig/crawler.rb
 - lib/rdig/search.rb
 - lib/rdig/highlight.rb
 - lib/rdig/index.rb
 - lib/rdig/url_filters.rb
 - lib/rdig/content_extractors.rb
+- lib/rdig/documents.rb
+- lib/rdig/file.rb
 - lib/htmlentities/CHANGES
 - lib/htmlentities/COPYING
 - lib/htmlentities/README
@@ -51,6 +53,8 @@ files:
 - test/unit/html_content_extractor_test.rb
 - test/unit/pdf_content_extractor_test.rb
 - test/unit/word_content_extractor_test.rb
+- test/unit/file_document_test.rb
+- test/unit/crawler_fs_test.rb
 - test/fixtures/html
 - test/fixtures/pdf
 - test/fixtures/word
data/lib/rdig/http_client.rb
DELETED
@@ -1,22 +0,0 @@
-module RDig
-
-  module HttpClient
-    def do_get(uri, user_agent='RDig crawler')
-      # Set up the appropriate http headers
-      headers = { "User-Agent" => user_agent }
-      result = {}
-
-      begin
-        Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
-          final_uri = uri.path
-          final_uri += ('?' + uri.query) if uri.query
-          return http.get(final_uri, headers)
-        }
-      rescue => error
-        puts error
-      end
-    end
-  end
-
-end