RubyGems - rdig - Versions diffs - 0.3.2 → 0.3.3 - Mend

rdig 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

data/doc/examples/config.rb +33 -0
data/lib/rdig.rb +35 -23
data/lib/rdig/content_extractors.rb +10 -225
data/lib/rdig/content_extractors/doc.rb +41 -0
data/lib/rdig/content_extractors/hpricot.rb +99 -0
data/lib/rdig/content_extractors/pdf.rb +49 -0
data/lib/rdig/content_extractors/rubyful_soup.rb +147 -0
data/lib/rdig/search.rb +14 -14
data/rakefile +5 -1
data/test/unit/hpricot_content_extractor_test.rb +77 -0
data/test/unit/{html_content_extractor_test.rb → rubyful_soup_content_extractor_test.rb} +13 -7
data/test/unit/searcher_test.rb +35 -0
metadata +11 -13

data/doc/examples/config.rb CHANGED

@@ -27,7 +27,40 @@ RDig.configuration do |cfg|
   # cfg.verbose = false
   # content extraction options
+  cfg.content_extraction = OpenStruct.new(
+  # HPRICOT configuration
+  # this is the html parser used by default from RDig 0.3.3 upwards.
+  # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
+  # it comes to selection of portions of the html documents.
+    :hpricot      => OpenStruct.new(
+      # css selector for the element containing the page title
+      :title_tag_selector => 'title',
+      # might also be a proc returning either an element or a string:
+      # :title_tag_selector => lambda { |hpricot_doc| ... }
+      :content_tag_selector => 'body'
+      # might also be a proc returning either an element or a string:
+      # :content_tag_selector => lambda { |hpricot_doc| ... }
+    )
+  # RUBYFUL SOUP
+  # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
+  # RDig's default html parser up to version 0.3.2. To use it, comment out the
+  # hpricot config above, and uncomment the following:
+  #
+  #  :rubyful_soup => OpenStruct.new(
+  #    # select the html element that contains the content to index
+  #    # by default, we index all inside the body tag:
+  #    :content_tag_selector => lambda { |tagsoup|
+  #      tagsoup.html.body
+  #    },
+  #    # select the html element containing the title
+  #    :title_tag_selector         => lambda { |tagsoup|
+  #      tagsoup.html.head.title
+  #    }
+  #  )
+  )
   # provide a method that returns the title of an html document
   # this method may either return a tag to extract the title from,
   # or a ready-to-index string.

data/lib/rdig.rb CHANGED

@@ -24,7 +24,7 @@
 #++
 #
-RDIGVERSION = '0.3.2'
+RDIGVERSION = '0.3.3'
 require 'thread'
@@ -42,22 +42,12 @@ require 'open-uri'
 begin
   require 'ferret'
-  require 'rubyful_soup'
 rescue LoadError
   require 'rubygems'
   require 'ferret'
-  require 'rubyful_soup'
 end
 require 'htmlentities/htmlentities'
-require 'rdig/content_extractors'
-require 'rdig/url_filters'
-require 'rdig/search'
-require 'rdig/index'
-require 'rdig/file'
-require 'rdig/documents'
-require 'rdig/crawler'
 $KCODE = 'u'
@@ -124,25 +114,37 @@ module RDig
             :wait_before_leave => 10
           ),
           :content_extraction  => OpenStruct.new(
-            # settings for html content extraction
-            :html => OpenStruct.new(
-              # select the html element that contains the content to index
-              # by default, we index all inside the body tag:
-              :content_tag_selector => lambda { |tagsoup|
-                tagsoup.html.body
-              },
-              # select the html element containing the title
-              :title_tag_selector         => lambda { |tagsoup|
-                tagsoup.html.head.title
-              }
+            # settings for html content extraction (hpricot)
+            :hpricot      => OpenStruct.new(
+              # css selector for the element containing the page title
+              :title_tag_selector => 'title',
+              # might also be a proc returning either an element or a string:
+              # :title_tag_selector => lambda { |hpricot_doc| ... }
+              :content_tag_selector => 'body'
+              # might also be a proc returning either an element or a string:
+              # :content_tag_selector => lambda { |hpricot_doc| ... }
             )
+            #,
+            # # settings for html content extraction (RubyfulSoup)
+            # :rubyful_soup => OpenStruct.new(
+            #  # select the html element that contains the content to index
+            #  # by default, we index all inside the body tag:
+            #  :content_tag_selector => lambda { |tagsoup|
+            #    tagsoup.html.body
+            #  },
+            #  # select the html element containing the title
+            #  :title_tag_selector         => lambda { |tagsoup|
+            #    tagsoup.html.head.title
+            #  }
+            # )
           ),
           :index                 => OpenStruct.new(
             :path                => "index/",
             :create              => true,
             :handle_parse_errors => true,
             :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
-            :occur_default       => :must
+            :occur_default       => :must,
+            :default_field       => '*'
           )
         )
       end
@@ -261,3 +263,13 @@ module RDig
     end
   end
 end
+require 'rdig/content_extractors'
+require 'rdig/url_filters'
+require 'rdig/search'
+require 'rdig/index'
+require 'rdig/file'
+require 'rdig/documents'
+require 'rdig/crawler'

data/lib/rdig/content_extractors.rb CHANGED

@@ -1,26 +1,3 @@
-# override some methods concered with entity resolving
-# to convert them to strings
-class BeautifulStoneSoup
-  # resolve unknown html entities using the htmlentities lib
-  alias :orig_unknown_entityref :unknown_entityref
-  def unknown_entityref(ref)
-    if HTMLEntities::MAP.has_key?(ref)
-      handle_data [HTMLEntities::MAP[ref]].pack('U')
-    else
-      orig_unknown_entityref ref
-    end
-  end
-  # resolve numeric entities to utf8
-  def handle_charref(ref)
-    handle_data( ref.gsub(/([0-9]{1,7})/) {
-                            [$1.to_i].pack('U')
-                    }.gsub(/x([0-9a-f]{1,6})/i) {
-                            [$1.to_i(16)].pack('U')
-                    } )
-  end
-end
 module RDig
   # Contains classes which are used for extracting content and meta data from
@@ -30,15 +7,6 @@ module RDig
     # process the given +content+ depending on its +content_type+.
     def self.process(content, content_type)
       ContentExtractor.process(content, content_type)
-      #      case content_type
-      #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-      #  return HtmlContentExtractor.process(content)
-      #when /^application\/.+pdf/
-      #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
-      #else
-      #  puts "unable to handle content type #{content_type}"
-      #end
-      #return nil
     end
     # Base class for Content Extractors.
@@ -48,7 +16,7 @@ module RDig
       def self.inherited(extractor)
         super(extractor)
-        puts("discovered content extractor class: #{extractor}")
+        puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
         self.extractors << extractor
       end
@@ -72,7 +40,7 @@ module RDig
       end
       def can_do(content_type)
-        content_type =~ @pattern
+        @pattern && content_type =~ @pattern
       end
     end
@@ -104,197 +72,14 @@ module RDig
       end
     end
-    # Extract text from pdf content.
-    #
-    # Requires the pdftotext and pdfinfo utilities from the
-    # xpdf-utils package
-    # (on debian and friends do 'apt-get install xpdf-utils')
-    #
-    class PdfContentExtractor < ContentExtractor
-      include ExternalAppHelper
-      def initialize(config)
-        super(config)
-        @pattern = /^application\/pdf/
-        @pdftotext = 'pdftotext'
-        @pdfinfo = 'pdfinfo'
-        @available = true
-        [ @pdftotext, @pdfinfo].each { |program|
-          unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
-            @available = false
-            break
-          end
-        }
-      end
-      def process(content)
-        result = {}
-        as_file(content) do |file|
-          result[:content] = get_content(file.path).strip
-          result[:title] = get_title(file.path)
-        end
-        result
-      end
-      def get_content(path_to_tempfile)
-        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
-      end
-      # extracts the title from pdf meta data
-      # needs pdfinfo
-      # returns the title or nil if no title was found
-      def get_title(path_to_tempfile)
-        %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
-      rescue
-      end
-    end
-    # Extract text from word documents
-    #
-    # Requires the wvHtml utility
-    # (on debian and friends do 'apt-get install wv')
-    class WordContentExtractor < ContentExtractor
-      include ExternalAppHelper
-      def initialize(config)
-        super(config)
-        @wvhtml = 'wvHtml'
-        @pattern = /^application\/msword/
-        # html extractor for parsing wvHtml output
-        @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
-            :html => OpenStruct.new(
-              :content_tag_selector => lambda { |tagsoup|
-                tagsoup.html.body
-              },
-              :title_tag_selector         => lambda { |tagsoup|
-                tagsoup.html.head.title
-              }
-            )))
-        # TODO: besser: if $?.exitstatus == 127 (not found)
-        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
-      end
-      def process(content)
-        result = {}
-        as_file(content) do |file|
-          result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
-        end
-        return result || {}
-      end
-    end
-    # extracts title, content and links from html documents
-    class HtmlContentExtractor < ContentExtractor
-      def initialize(config)
-        super(config)
-        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-      end
-      # returns:
-      # { :content => 'extracted clear text',
-      #   :meta => { :title => 'Title' },
-      #   :links => [array of urls] }
-      def process(content)
-        result = { }
-        tag_soup = BeautifulSoup.new(content)
-        result[:title] = extract_title(tag_soup)
-        result[:links] = extract_links(tag_soup)
-        result[:content] = extract_content(tag_soup)
-        return result
-      end
-      # Extracts textual content from the HTML tree.
-      #
-      # - First, the root element to use is determined using the
-      # +content_element+ method, which itself uses the content_tag_selector
-      # from RDig.configuration.
-      # - Then, this element is processed by +extract_text+, which will give
-      # all textual content contained in the root element and all it's
-      # children.
-      def extract_content(tag_soup)
-        content = ''
-        ce = content_element(tag_soup)
-        ce.children { |child|
-          extract_text(child, content)
-        } unless ce.nil?
-        return content.strip
-      end
-      # extracts the href attributes of all a tags, except
-      # internal links like <a href="#top">
-      def extract_links(tagsoup)
-        tagsoup.find_all('a').map { |link|
-          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-        }.compact
-      end
-      # Extracts the title from the given html tree
-      def extract_title(tagsoup)
-        the_title_tag = title_tag(tagsoup)
-        if the_title_tag.is_a? String
-          the_title_tag
-        else
-          title = ''
-          extract_text(the_title_tag, title)
-          title.strip
-        end
-      end
-      # Recursively extracts all text contained in the given element,
-      # and appends it to content.
-      def extract_text(element, content='')
-        return nil if element.nil?
-        if element.is_a? NavigableString
-          value = strip_comments(element)
-          value.strip!
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        elsif element.string  # it's a Tag, and it has some content string
-          # skip inline scripts and styles
-          return nil if element.name =~ /^(script|style)$/i
-          value = element.string.strip
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        else
-          element.children { |child|
-            extract_text(child, content)
-          }
-        end
-      end
-      # Returns the element to extract the title from.
-      #
-      # This may return a string, e.g. an attribute value selected from a meta
-      # tag, too.
-      def title_tag(tagsoup)
-        if @config.html.title_tag_selector
-          @config.html.title_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.head.title
-        end
-      end
-      # Retrieve the root element to extract document content from
-      def content_element(tagsoup)
-        if @config.html.content_tag_selector
-          @config.html.content_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.body
-        end
-      end
-      # Return the given string minus all html comments
-      def strip_comments(string)
-        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-      end
-    end
+  end
+end
+# load content extractors
+Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
+  begin
+    require f
+  rescue
+    puts "error loading #{f}: #{$!}"
   end
 end

data/lib/rdig/content_extractors/doc.rb ADDED

@@ -0,0 +1,41 @@
+module RDig
+  module ContentExtractors
+    # Extract text from word documents
+    #
+    # Requires the wvHtml utility
+    # (on debian and friends do 'apt-get install wv')
+    class WordContentExtractor < ContentExtractor
+      include ExternalAppHelper
+      def initialize(config)
+        super(config)
+        @wvhtml = 'wvHtml'
+        @pattern = /^application\/msword/
+        # html extractor for parsing wvHtml output
+        @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
+            :rubyful_soup => OpenStruct.new(
+              :content_tag_selector => lambda { |tagsoup|
+                tagsoup.html.body
+              },
+              :title_tag_selector         => lambda { |tagsoup|
+                tagsoup.html.head.title
+              }
+            )))
+        # TODO: better: if $?.exitstatus == 127 (not found)
+        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
+      end
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
+        end
+        return result || {}
+      end
+    end
+  end
+end

data/lib/rdig/content_extractors/hpricot.rb ADDED

@@ -0,0 +1,99 @@
+begin
+  require 'hpricot'
+rescue LoadError
+  require 'rubygems'
+  require 'hpricot'
+end
+module RDig
+  module ContentExtractors
+    # extracts title, content and links from html documents using the hpricot library
+    class HpricotContentExtractor < ContentExtractor
+      def initialize(config)
+        super(config.hpricot)
+        # if not configured, refuse to handle any content:
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.hpricot
+      end
+      # returns:
+      # { :content => 'extracted clear text',
+      #   :title => 'Title',
+      #   :links => [array of urls] }
+      def process(content)
+        doc = Hpricot(content)
+        {
+          :title => extract_title(doc).decode_entities,
+          :links => extract_links(doc),
+          :content => extract_content(doc).decode_entities
+        }
+      end
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the
+      # +content_element+ method, which itself uses the content_tag_selector
+      # from RDig.configuration.
+      # - Then, html comments and tags are stripped from this element's inner
+      # html (see +strip_comments+ and +strip_tags+), leaving the textual
+      # content of the root element and all its children.
+      def extract_content(doc)
+        content = ''
+        ce = content_element(doc)
+        content = strip_tags(strip_comments(ce.inner_html)) if ce
+#          (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
+#          extract_text child, content
+        return content.strip
+      end
+      # extracts the href attributes of all a tags, except
+      # internal links like <a href="#top">
+      def extract_links(doc)
+        (doc/'a').map { |link|
+          href = link['href']
+          CGI.unescapeHTML(href) if href && href !~ /^#/
+        }.compact
+      end
+      # Extracts the title from the given html tree
+      def extract_title(doc)
+        the_title_tag = title_tag(doc)
+        return the_title_tag unless the_title_tag.respond_to? :inner_html
+        strip_tags(the_title_tag.inner_html)
+      end
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
+      def title_tag(doc)
+        tag_from_config(doc, :title_tag_selector) || doc.at('title')
+      end
+      # Retrieve the root element to extract document content from
+      def content_element(doc)
+        tag_from_config(doc, :content_tag_selector) || doc.at('body')
+      end
+      def tag_from_config(doc, config_key)
+        cfg = @config.send(config_key)
+        cfg.is_a?(String) ? doc/cfg : cfg.call(doc) if cfg
+      end
+      # Return the given string minus all html comments
+      def strip_comments(string)
+        string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
+      end
+      def strip_tags(string)
+        string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
+                               Regexp::MULTILINE, 'u'), ''
+        string.gsub! Regexp.new('<.+?>',
+                               Regexp::MULTILINE, 'u'), ''
+        string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      end
+    end
+  end
+end

data/lib/rdig/content_extractors/pdf.rb ADDED

@@ -0,0 +1,49 @@
+module RDig
+  module ContentExtractors
+    # Extract text from pdf content.
+    #
+    # Requires the pdftotext and pdfinfo utilities from the
+    # xpdf-utils package
+    # (on debian and friends do 'apt-get install xpdf-utils')
+    #
+    class PdfContentExtractor < ContentExtractor
+      include ExternalAppHelper
+      def initialize(config)
+        super(config)
+        @pattern = /^application\/pdf/
+        @pdftotext = 'pdftotext'
+        @pdfinfo = 'pdfinfo'
+        @available = true
+        [ @pdftotext, @pdfinfo].each { |program|
+          unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+            @available = false
+            break
+          end
+        }
+      end
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result[:content] = get_content(file.path).strip
+          result[:title] = get_title(file.path)
+        end
+        result
+      end
+      def get_content(path_to_tempfile)
+        %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+      end
+      # extracts the title from pdf meta data
+      # needs pdfinfo
+      # returns the title or nil if no title was found
+      def get_title(path_to_tempfile)
+        %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+      rescue
+      end
+    end
+  end
+end

data/lib/rdig/content_extractors/rubyful_soup.rb ADDED

@@ -0,0 +1,147 @@
+begin
+  require 'rubyful_soup'
+rescue LoadError
+  require 'rubygems'
+  require 'rubyful_soup'
+end
+# override some methods concerned with entity resolving
+# to convert them to strings
+class BeautifulStoneSoup
+  # resolve unknown html entities using the htmlentities lib
+  alias :orig_unknown_entityref :unknown_entityref
+  def unknown_entityref(ref)
+    if HTMLEntities::MAP.has_key?(ref)
+      handle_data [HTMLEntities::MAP[ref]].pack('U')
+    else
+      orig_unknown_entityref ref
+    end
+  end
+  # resolve numeric entities to utf8
+  def handle_charref(ref)
+    handle_data( ref.gsub(/([0-9]{1,7})/) {
+                            [$1.to_i].pack('U')
+                    }.gsub(/x([0-9a-f]{1,6})/i) {
+                            [$1.to_i(16)].pack('U')
+                    } )
+  end
+end
+module RDig
+  module ContentExtractors
+    # extracts title, content and links from html documents
+    class RubyfulSoupContentExtractor < ContentExtractor
+      def initialize(config)
+        super(config.rubyful_soup)
+        # if not configured, refuse to handle any content:
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
+      end
+      # returns:
+      # { :content => 'extracted clear text',
+      #   :title => 'Title',
+      #   :links => [array of urls] }
+      def process(content)
+        result = { }
+        tag_soup = BeautifulSoup.new(content)
+        result[:title] = extract_title(tag_soup)
+        result[:links] = extract_links(tag_soup)
+        result[:content] = extract_content(tag_soup)
+        return result
+      end
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the
+      # +content_element+ method, which itself uses the content_tag_selector
+      # from RDig.configuration.
+      # - Then, this element is processed by +extract_text+, which will give
+      # all textual content contained in the root element and all its
+      # children.
+      def extract_content(tag_soup)
+        content = ''
+        ce = content_element(tag_soup)
+        ce.children { |child|
+          extract_text(child, content)
+        } unless ce.nil?
+        return content.strip
+      end
+      # extracts the href attributes of all a tags, except
+      # internal links like <a href="#top">
+      def extract_links(tagsoup)
+        tagsoup.find_all('a').map { |link|
+          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
+        }.compact
+      end
+      # Extracts the title from the given html tree
+      def extract_title(tagsoup)
+        the_title_tag = title_tag(tagsoup)
+        if the_title_tag.is_a? String
+          the_title_tag
+        else
+          title = ''
+          extract_text(the_title_tag, title)
+          title.strip
+        end
+      end
+      # Recursively extracts all text contained in the given element,
+      # and appends it to content.
+      def extract_text(element, content='')
+        return nil if element.nil?
+        if element.is_a? NavigableString
+          value = strip_comments(element)
+          value.strip!
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        elsif element.string  # it's a Tag, and it has some content string
+          # skip inline scripts and styles
+          return nil if element.name =~ /^(script|style)$/i
+          value = element.string.strip
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        else
+          element.children { |child|
+            extract_text(child, content)
+          }
+        end
+      end
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
+      def title_tag(tagsoup)
+        if @config.title_tag_selector
+          @config.title_tag_selector.call(tagsoup)
+        else
+          tagsoup.html.head.title
+        end
+      end
+      # Retrieve the root element to extract document content from
+      def content_element(tagsoup)
+        if @config.content_tag_selector
+          @config.content_tag_selector.call(tagsoup)
+        else
+          tagsoup.html.body
+        end
+      end
+      # Return the given string minus all html comments
+      def strip_comments(string)
+        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
+      end
+    end
+  end
+end

data/lib/rdig/search.rb CHANGED

@@ -12,7 +12,7 @@ module RDig
       # takes the ferret section of the rdig configuration as a parameter.
       def initialize(settings)
         @ferret_config = settings
-        @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
+        @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
         ferret_searcher
       end
@@ -24,8 +24,8 @@ module RDig
           @ferret_searcher = nil
         end
         unless @ferret_searcher
-          @ferret_searcher = IndexSearcher.new(@ferret_config.path)
-          @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
+          @ferret_searcher = Ferret::Search::Searcher.new(@ferret_config.path)
+          @query_parser.fields = @ferret_searcher.reader.field_names.to_a
         end
         @ferret_searcher
       end
@@ -36,23 +36,23 @@ module RDig
       # for more information on queries.
       # A Ferret::Search::Query instance may be given, too.
       #
-      # Otions are:
-      # first_doc:: first document in result list to retrieve (0-based). The default is 0.
-      # num_docs:: number of documents to retrieve. The default is 10.
+      # Some of the more commonly used options are:
+      # offset:: first document in result list to retrieve (0-based). The default is 0.
+      # limit:: number of documents to retrieve. The default is 10.
+      # Please see the Ferret::Search::Searcher API for more options.
       def search(query, options={})
         result = {}
         query = query_parser.parse(query) if query.is_a?(String)
         puts "Query: #{query}"
-        hits = ferret_searcher.search(query, options)
-        result[:hitcount] = hits.total_hits
         results = []
-        hits.each { |doc_id,score|
-          doc = ferret_searcher.reader.get_document doc_id
+        searcher = ferret_searcher
+        result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
+          doc = searcher[doc_id]
           results << { :score => score,
-                      :title => doc['title'],
-                      :url => doc['url'],
-                      :extract => build_extract(doc['data']) }
-        }
+                       :title => doc[:title],
+                       :url => doc[:url],
+                       :extract => build_extract(doc[:data]) }
+        end
         result[:list] = results
         result
       end

data/rakefile CHANGED

@@ -125,12 +125,16 @@ else
     to help building a site search for web sites or intranets. Internally,
     Ferret is used for the full text indexing. After creating a config file
     for your site, the index can be built with a single call to rdig.
+    For HTML page crawling, hpricot and rubyful_soup are supported.
     EOF
     #### Dependencies and requirements.
     s.add_dependency('ferret', '>= 0.10.0')
-    s.add_dependency('rubyful_soup', '>= 1.0.4')
+    # TODO: check if there is anything like 'suggested' instead of required, or
+    # ORed dependencies...
+    #s.add_dependency('rubyful_soup', '>= 1.0.4')
+    #s.add_dependency('hpricot', '>= 0.4')
     #s.requirements << ""
     #### Which files are to be included in this gem?  Everything!  (Except CVS directories.)

data/test/unit/hpricot_content_extractor_test.rb ADDED

@@ -0,0 +1,77 @@
+require 'test_helper'
+class HpricotContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+  def setup
+    @config = RDig.config.content_extraction.hpricot.clone
+    @extractor = ContentExtractors::HpricotContentExtractor.new(OpenStruct.new(:hpricot => @config))
+    @nbsp = [160].pack('U') # non breaking space
+  end
+  def test_can_do
+    assert !@extractor.can_do('application/pdf')
+    assert !@extractor.can_do('application/msword')
+    assert @extractor.can_do('text/html')
+    assert @extractor.can_do('text/xml')
+    assert @extractor.can_do('application/xml')
+    assert @extractor.can_do('application/xhtml+xml')
+  end
+  def test_simple
+    result = ContentExtractors.process(html_doc('simple'), 'text/html')
+    assert_not_nil result
+    assert_equal 'Sample Title', result[:title]
+    assert_not_nil result[:content]
+    assert_not_nil result[:links]
+    assert_equal 1, result[:links].size
+    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
+    assert_equal 'http://test.host/affe.html', result[:links].first
+  end
+  def test_entities
+    result = @extractor.process(html_doc('entities'))
+    assert_equal 'Sample & Title', result[:title]
+    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
+    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
+    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
+  end
+  def test_custom_content_element
+    @config.title_tag_selector = lambda do |doc|
+      doc.at("h1[@class='title']")
+    end
+    @config.content_tag_selector = lambda do |doc|
+      doc.at("div[@id='content']")
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Sample Title in h1', result[:title]
+    assert_equal 'Affe Real content is here.', result[:content]
+    # check if links are collected outside the content tag, too:
+    assert_equal 3, result[:links].size
+    assert_equal 'http://test.host/outside.html', result[:links].first
+    assert_equal '/inside.html', result[:links][1]
+    assert_equal '/footer.html', result[:links][2]
+  end
+  def test_title_from_dcmeta
+    @config.title_tag_selector = lambda do |doc|
+      doc.at("meta[@name='DC.title']")['content']
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Title from DC meta data', result[:title]
+  end
+  def test_preprocessed_title
+    @config.title_tag_selector = lambda do |doc|
+      title = doc.at("meta[@name='DC.title']")['content']
+      # use only a portion of the title tag's contents if it matches our
+      # regexp:
+      (title =~ /^(.*)meta data$/ ? $1 : title).strip
+    end
+    result = @extractor.process(html_doc('custom_tag_selectors'))
+    assert_equal 'Title from DC', result[:title]
+  end
+end

data/test/unit/{html_content_extractor_test.rb → rubyful_soup_content_extractor_test.rb} RENAMED

@@ -1,10 +1,16 @@
 require 'test_helper'
-class HtmlContentExtractorTest < Test::Unit::TestCase
+class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
   include TestHelper
   def setup
-    @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
-    @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
+    @config =  OpenStruct.new(
+              :content_tag_selector => lambda { |tagsoup|
+                tagsoup.html.body
+              },
+              :title_tag_selector         => lambda { |tagsoup|
+                tagsoup.html.head.title
+              })
+    @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
     @nbsp = [160].pack('U') # non breaking space
   end
@@ -37,10 +43,10 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
   def test_custom_content_element
-    @config.html.title_tag_selector = lambda do |tagsoup|
+    @config.title_tag_selector = lambda do |tagsoup|
       tagsoup.find('h1', :attrs => { 'class', 'title' })
     end
-    @config.html.content_tag_selector = lambda do |tagsoup|
+    @config.content_tag_selector = lambda do |tagsoup|
       tagsoup.find('div', :attrs => { 'id', 'content' })
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -55,7 +61,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   def test_title_from_dcmeta
-    @config.html.title_tag_selector = lambda do |tagsoup|
+    @config.title_tag_selector = lambda do |tagsoup|
       tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -63,7 +69,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
   def test_preprocessed_title
-    @config.html.title_tag_selector = lambda do |tagsoup|
+    @config.title_tag_selector = lambda do |tagsoup|
       title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
       # use only a portion of the title tag's contents if it matches our
       # regexp:

data/test/unit/searcher_test.rb ADDED

@@ -0,0 +1,35 @@
+require 'fileutils'
+require 'test_helper'
+class SearcherTest < Test::Unit::TestCase
+  include TestHelper
+  def setup
+    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+    index_dir = 'tmp/test-index'
+    Dir.mkdir index_dir unless File.directory? index_dir
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+      cfg.crawler.num_threads = 1
+      cfg.crawler.wait_before_leave = 1
+      cfg.index.path = index_dir
+      cfg.verbose = true
+    end
+    crawler = Crawler.new
+    crawler.run
+  end
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+  def test_search
+    result = RDig.searcher.search 'some sample text'
+    assert_equal 3, result[:hitcount]
+    assert_equal 3, result[:list].size
+  end
+end

metadata CHANGED

@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.2
-date: 2006-10-09 00:00:00 +02:00
+  version: 0.3.3
+date: 2006-10-23 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
 email: jk@jkraemer.net
 homepage: http://rdig.rubyforge.org/
 rubyforge_project: rdig
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file  for your site, the index can be built with a single call to rdig.
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file  for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
 autorequire:
 default_executable: rdig
 bindir: bin
@@ -33,6 +33,7 @@ files:
 - lib/rdig
 - lib/htmlentities
 - lib/rdig.rb
+- lib/rdig/content_extractors
 - lib/rdig/crawler.rb
 - lib/rdig/search.rb
 - lib/rdig/highlight.rb
@@ -41,6 +42,10 @@ files:
 - lib/rdig/content_extractors.rb
 - lib/rdig/documents.rb
 - lib/rdig/file.rb
+- lib/rdig/content_extractors/rubyful_soup.rb
+- lib/rdig/content_extractors/doc.rb
+- lib/rdig/content_extractors/hpricot.rb
+- lib/rdig/content_extractors/pdf.rb
 - lib/htmlentities/CHANGES
 - lib/htmlentities/COPYING
 - lib/htmlentities/README
@@ -50,8 +55,10 @@ files:
 - test/test_helper.rb
 - test/unit/etag_filter_test.rb
 - test/unit/url_filters_test.rb
-- test/unit/html_content_extractor_test.rb
+- test/unit/searcher_test.rb
+- test/unit/rubyful_soup_content_extractor_test.rb
 - test/unit/pdf_content_extractor_test.rb
+- test/unit/hpricot_content_extractor_test.rb
 - test/unit/word_content_extractor_test.rb
 - test/unit/file_document_test.rb
 - test/unit/crawler_fs_test.rb
@@ -100,12 +107,3 @@ dependencies:
       - !ruby/object:Gem::Version
         version: 0.10.0
     version:
-- !ruby/object:Gem::Dependency
-  name: rubyful_soup
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.0.4
-    version: