rdig 0.3.2 → 0.3.3

@@ -27,7 +27,40 @@ RDig.configuration do |cfg|
  # cfg.verbose = false

  # content extraction options
+ cfg.content_extraction = OpenStruct.new(

+   # HPRICOT configuration
+   # this is the html parser used by default from RDig 0.3.3 upwards.
+   # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
+   # it comes to selection of portions of the html documents.
+   :hpricot => OpenStruct.new(
+     # css selector for the element containing the page title
+     :title_tag_selector => 'title',
+     # might also be a proc returning either an element or a string:
+     # :title_tag_selector => lambda { |hpricot_doc| ... }
+     :content_tag_selector => 'body'
+     # might also be a proc returning either an element or a string:
+     # :content_tag_selector => lambda { |hpricot_doc| ... }
+   )
+
+   # RUBYFUL SOUP
+   # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
+   # RDig's default html parser up to version 0.3.2. To use it, comment the
+   # hpricot config above, and uncomment the following:
+   #
+   # :rubyful_soup => OpenStruct.new(
+   #   # select the html element that contains the content to index
+   #   # by default, we index all inside the body tag:
+   #   :content_tag_selector => lambda { |tagsoup|
+   #     tagsoup.html.body
+   #   },
+   #   # select the html element containing the title
+   #   :title_tag_selector => lambda { |tagsoup|
+   #     tagsoup.html.head.title
+   #   }
+   # )
+ )
+
  # provide a method that returns the title of an html document
  # this method may either return a tag to extract the title from,
  # or a ready-to-index string.
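
Both selectors accept either a plain CSS selector string or a proc that receives the parsed Hpricot document, so a site-specific config can mix the two. A minimal sketch of a 0.3.3-style configuration along the lines of the options above — the URL, index path and the div[@id='main'] selector are made-up examples, not part of the gem:

    RDig.configuration do |cfg|
      cfg.crawler.start_urls = [ 'http://www.example.com/' ]   # hypothetical site
      cfg.index.path         = '/tmp/example-index'            # hypothetical index location

      cfg.content_extraction = OpenStruct.new(
        :hpricot => OpenStruct.new(
          # a plain CSS selector string ...
          :title_tag_selector   => 'title',
          # ... or a proc that receives the Hpricot document and returns an element:
          :content_tag_selector => lambda { |doc| doc.at("div[@id='main']") }
        )
      )
    end
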
@@ -24,7 +24,7 @@
  #++
  #

- RDIGVERSION = '0.3.2'
+ RDIGVERSION = '0.3.3'


  require 'thread'
@@ -42,22 +42,12 @@ require 'open-uri'

  begin
    require 'ferret'
-   require 'rubyful_soup'
  rescue LoadError
    require 'rubygems'
    require 'ferret'
-   require 'rubyful_soup'
  end

  require 'htmlentities/htmlentities'
-
- require 'rdig/content_extractors'
- require 'rdig/url_filters'
- require 'rdig/search'
- require 'rdig/index'
- require 'rdig/file'
- require 'rdig/documents'
- require 'rdig/crawler'


  $KCODE = 'u'
@@ -124,25 +114,37 @@ module RDig
        :wait_before_leave => 10
      ),
      :content_extraction => OpenStruct.new(
-       # settings for html content extraction
-       :html => OpenStruct.new(
-         # select the html element that contains the content to index
-         # by default, we index all inside the body tag:
-         :content_tag_selector => lambda { |tagsoup|
-           tagsoup.html.body
-         },
-         # select the html element containing the title
-         :title_tag_selector => lambda { |tagsoup|
-           tagsoup.html.head.title
-         }
+       # settings for html content extraction (hpricot)
+       :hpricot => OpenStruct.new(
+         # css selector for the element containing the page title
+         :title_tag_selector => 'title',
+         # might also be a proc returning either an element or a string:
+         # :title_tag_selector => lambda { |hpricot_doc| ... }
+         :content_tag_selector => 'body'
+         # might also be a proc returning either an element or a string:
+         # :content_tag_selector => lambda { |hpricot_doc| ... }
        )
+       #,
+       # # settings for html content extraction (RubyfulSoup)
+       # :rubyful_soup => OpenStruct.new(
+       #   # select the html element that contains the content to index
+       #   # by default, we index all inside the body tag:
+       #   :content_tag_selector => lambda { |tagsoup|
+       #     tagsoup.html.body
+       #   },
+       #   # select the html element containing the title
+       #   :title_tag_selector => lambda { |tagsoup|
+       #     tagsoup.html.head.title
+       #   }
+       # )
      ),
      :index => OpenStruct.new(
        :path => "index/",
        :create => true,
        :handle_parse_errors => true,
        :analyzer => Ferret::Analysis::StandardAnalyzer.new,
-       :occur_default => :must
+       :occur_default => :must,
+       :default_field => '*'
      )
    )
  end
@@ -261,3 +263,13 @@ module RDig
      end
    end
  end
+
+ require 'rdig/content_extractors'
+ require 'rdig/url_filters'
+ require 'rdig/search'
+ require 'rdig/index'
+ require 'rdig/file'
+ require 'rdig/documents'
+ require 'rdig/crawler'
+
+
@@ -1,26 +1,3 @@
- # override some methods concered with entity resolving
- # to convert them to strings
- class BeautifulStoneSoup
-   # resolve unknown html entities using the htmlentities lib
-   alias :orig_unknown_entityref :unknown_entityref
-   def unknown_entityref(ref)
-     if HTMLEntities::MAP.has_key?(ref)
-       handle_data [HTMLEntities::MAP[ref]].pack('U')
-     else
-       orig_unknown_entityref ref
-     end
-   end
-
-   # resolve numeric entities to utf8
-   def handle_charref(ref)
-     handle_data( ref.gsub(/([0-9]{1,7})/) {
-       [$1.to_i].pack('U')
-     }.gsub(/x([0-9a-f]{1,6})/i) {
-       [$1.to_i(16)].pack('U')
-     } )
-   end
- end
-
  module RDig

    # Contains classes which are used for extracting content and meta data from
@@ -30,15 +7,6 @@ module RDig
      # process the given +content+ depending on it's +content_type+.
      def self.process(content, content_type)
        ContentExtractor.process(content, content_type)
-       # case content_type
-       #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-       #  return HtmlContentExtractor.process(content)
-       #when /^application\/.+pdf/
-       #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
-       #else
-       #  puts "unable to handle content type #{content_type}"
-       #end
-       #return nil
      end

      # Base class for Content Extractors.
@@ -48,7 +16,7 @@ module RDig

        def self.inherited(extractor)
          super(extractor)
-         puts("discovered content extractor class: #{extractor}")
+         puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
          self.extractors << extractor
        end

@@ -72,7 +40,7 @@ module RDig
        end

        def can_do(content_type)
-         content_type =~ @pattern
+         @pattern && content_type =~ @pattern
        end
      end

@@ -104,197 +72,14 @@ module RDig
        end
      end

-     # Extract text from pdf content.
-     #
-     # Requires the pdftotext and pdfinfo utilities from the
-     # xpdf-utils package
-     # (on debian and friends do 'apt-get install xpdf-utils')
-     #
-     class PdfContentExtractor < ContentExtractor
-       include ExternalAppHelper
-
-       def initialize(config)
-         super(config)
-         @pattern = /^application\/pdf/
-         @pdftotext = 'pdftotext'
-         @pdfinfo = 'pdfinfo'
-         @available = true
-         [ @pdftotext, @pdfinfo].each { |program|
-           unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
-             @available = false
-             break
-           end
-         }
-       end
-
-       def process(content)
-         result = {}
-         as_file(content) do |file|
-           result[:content] = get_content(file.path).strip
-           result[:title] = get_title(file.path)
-         end
-         result
-       end
-
-       def get_content(path_to_tempfile)
-         %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
-       end
-
-       # extracts the title from pdf meta data
-       # needs pdfinfo
-       # returns the title or nil if no title was found
-       def get_title(path_to_tempfile)
-         %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
-       rescue
-       end
-     end
-
-     # Extract text from word documents
-     #
-     # Requires the wvHtml utility
-     # (on debian and friends do 'apt-get install wv')
-     class WordContentExtractor < ContentExtractor
-       include ExternalAppHelper
-
-       def initialize(config)
-         super(config)
-         @wvhtml = 'wvHtml'
-         @pattern = /^application\/msword/
-         # html extractor for parsing wvHtml output
-         @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
-           :html => OpenStruct.new(
-             :content_tag_selector => lambda { |tagsoup|
-               tagsoup.html.body
-             },
-             :title_tag_selector => lambda { |tagsoup|
-               tagsoup.html.head.title
-             }
-         )))
-
-         # TODO: besser: if $?.exitstatus == 127 (not found)
-         @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
-       end
-
-       def process(content)
-         result = {}
-         as_file(content) do |file|
-           result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
-         end
-         return result || {}
-       end
-
-     end
-
-     # extracts title, content and links from html documents
-     class HtmlContentExtractor < ContentExtractor
-
-       def initialize(config)
-         super(config)
-         @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-       end
-
-       # returns:
-       # { :content => 'extracted clear text',
-       #   :meta => { :title => 'Title' },
-       #   :links => [array of urls] }
-       def process(content)
-         result = { }
-         tag_soup = BeautifulSoup.new(content)
-         result[:title] = extract_title(tag_soup)
-         result[:links] = extract_links(tag_soup)
-         result[:content] = extract_content(tag_soup)
-         return result
-       end
-
-       # Extracts textual content from the HTML tree.
-       #
-       # - First, the root element to use is determined using the
-       #   +content_element+ method, which itself uses the content_tag_selector
-       #   from RDig.configuration.
-       # - Then, this element is processed by +extract_text+, which will give
-       #   all textual content contained in the root element and all it's
-       #   children.
-       def extract_content(tag_soup)
-         content = ''
-         ce = content_element(tag_soup)
-         ce.children { |child|
-           extract_text(child, content)
-         } unless ce.nil?
-         return content.strip
-       end
-
-       # extracts the href attributes of all a tags, except
-       # internal links like <a href="#top">
-       def extract_links(tagsoup)
-         tagsoup.find_all('a').map { |link|
-           CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-         }.compact
-       end
-
-       # Extracts the title from the given html tree
-       def extract_title(tagsoup)
-         the_title_tag = title_tag(tagsoup)
-         if the_title_tag.is_a? String
-           the_title_tag
-         else
-           title = ''
-           extract_text(the_title_tag, title)
-           title.strip
-         end
-       end
-
-       # Recursively extracts all text contained in the given element,
-       # and appends it to content.
-       def extract_text(element, content='')
-         return nil if element.nil?
-         if element.is_a? NavigableString
-           value = strip_comments(element)
-           value.strip!
-           unless value.empty?
-             content << value
-             content << ' '
-           end
-         elsif element.string # it's a Tag, and it has some content string
-           # skip inline scripts and styles
-           return nil if element.name =~ /^(script|style)$/i
-           value = element.string.strip
-           unless value.empty?
-             content << value
-             content << ' '
-           end
-         else
-           element.children { |child|
-             extract_text(child, content)
-           }
-         end
-       end
-
-       # Returns the element to extract the title from.
-       #
-       # This may return a string, e.g. an attribute value selected from a meta
-       # tag, too.
-       def title_tag(tagsoup)
-         if @config.html.title_tag_selector
-           @config.html.title_tag_selector.call(tagsoup)
-         else
-           tagsoup.html.head.title
-         end
-       end
-
-       # Retrieve the root element to extract document content from
-       def content_element(tagsoup)
-         if @config.html.content_tag_selector
-           @config.html.content_tag_selector.call(tagsoup)
-         else
-           tagsoup.html.body
-         end
-       end
-
-       # Return the given string minus all html comments
-       def strip_comments(string)
-         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-       end
-     end
+   end
+ end

+ # load content extractors
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
+   begin
+     require f
+   rescue
+     puts "error loading #{f}: #{$!}"
    end
  end
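
Because ContentExtractor.inherited registers every subclass and the Dir glob above pulls in everything under content_extractors/, supporting another format only takes a new file in that directory. A hypothetical plain-text extractor, modelled on the extractors that follow and assuming the dispatcher hands each discovered class its section of RDig's content_extraction config:

    # hypothetical file: lib/rdig/content_extractors/plain_text.rb
    module RDig
      module ContentExtractors

        # discovered via ContentExtractor.inherited as soon as the file is required
        class PlainTextContentExtractor < ContentExtractor
          def initialize(config)
            super(config)
            # can_do matches document content types against @pattern
            @pattern = /^text\/plain/
          end

          def process(content)
            { :content => content.strip, :title => nil, :links => [] }
          end
        end

      end
    end
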
@@ -0,0 +1,41 @@
+ module RDig
+   module ContentExtractors
+
+     # Extract text from word documents
+     #
+     # Requires the wvHtml utility
+     # (on debian and friends do 'apt-get install wv')
+     class WordContentExtractor < ContentExtractor
+       include ExternalAppHelper
+
+       def initialize(config)
+         super(config)
+         @wvhtml = 'wvHtml'
+         @pattern = /^application\/msword/
+         # html extractor for parsing wvHtml output
+         @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
+           :rubyful_soup => OpenStruct.new(
+             :content_tag_selector => lambda { |tagsoup|
+               tagsoup.html.body
+             },
+             :title_tag_selector => lambda { |tagsoup|
+               tagsoup.html.head.title
+             }
+         )))
+
+         # TODO: better: if $?.exitstatus == 127 (not found)
+         @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
+       end
+
+       def process(content)
+         result = {}
+         as_file(content) do |file|
+           result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
+         end
+         return result || {}
+       end
+
+     end
+
+   end
+ end
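
Callers normally go through the generic dispatcher instead of instantiating extractors directly; for a Word document that looks roughly like the sketch below (the file path is made up, and wvHtml has to be installed, otherwise the extractor marks itself unavailable):

    require 'rdig'

    data   = File.read('fixtures/sample.doc')   # hypothetical .doc file
    result = RDig::ContentExtractors.process(data, 'application/msword')
    # expected shape: { :title => ..., :content => ..., :links => [...] };
    # may come back nil/empty when no extractor felt responsible
    puts result[:title] if result
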
@@ -0,0 +1,99 @@
+ begin
+   require 'hpricot'
+ rescue LoadError
+   require 'rubygems'
+   require 'hpricot'
+ end
+
+ module RDig
+   module ContentExtractors
+
+     # extracts title, content and links from html documents using the hpricot library
+     class HpricotContentExtractor < ContentExtractor
+
+       def initialize(config)
+         super(config.hpricot)
+         # if not configured, refuse to handle any content:
+         @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.hpricot
+       end
+
+       # returns:
+       # { :content => 'extracted clear text',
+       #   :title => 'Title',
+       #   :links => [array of urls] }
+       def process(content)
+         doc = Hpricot(content)
+         {
+           :title => extract_title(doc).decode_entities,
+           :links => extract_links(doc),
+           :content => extract_content(doc).decode_entities
+         }
+       end
+
+       # Extracts textual content from the HTML tree.
+       #
+       # - First, the root element to use is determined using the
+       #   +content_element+ method, which itself uses the content_tag_selector
+       #   from RDig.configuration.
+       # - Then, this element is processed by +extract_text+, which will give
+       #   all textual content contained in the root element and all it's
+       #   children.
+       def extract_content(doc)
+         content = ''
+         ce = content_element(doc)
+         content = strip_tags(strip_comments(ce.inner_html)) if ce
+         # (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
+         #   extract_text child, content
+         return content.strip
+       end
+
+       # extracts the href attributes of all a tags, except
+       # internal links like <a href="#top">
+       def extract_links(doc)
+         (doc/'a').map { |link|
+           href = link['href']
+           CGI.unescapeHTML(href) if href && href !~ /^#/
+         }.compact
+       end
+
+       # Extracts the title from the given html tree
+       def extract_title(doc)
+         the_title_tag = title_tag(doc)
+         return the_title_tag unless the_title_tag.respond_to? :inner_html
+         strip_tags(the_title_tag.inner_html)
+       end
+
+       # Returns the element to extract the title from.
+       #
+       # This may return a string, e.g. an attribute value selected from a meta
+       # tag, too.
+       def title_tag(doc)
+         tag_from_config(doc, :title_tag_selector) || doc.at('title')
+       end
+
+       # Retrieve the root element to extract document content from
+       def content_element(doc)
+         tag_from_config(doc, :content_tag_selector) || doc.at('body')
+       end
+
+       def tag_from_config(doc, config_key)
+         cfg = @config.send(config_key)
+         cfg.is_a?(String) ? doc/cfg : cfg.call(doc) if cfg
+       end
+
+       # Return the given string minus all html comments
+       def strip_comments(string)
+         string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
+       end
+       def strip_tags(string)
+         string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
+                                 Regexp::MULTILINE, 'u'), ''
+         string.gsub! Regexp.new('<.+?>',
+                                 Regexp::MULTILINE, 'u'), ''
+         string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+       end
+
+     end
+
+   end
+ end
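
tag_from_config is what makes the string-vs-proc configuration work: a String is run through Hpricot's doc/selector search, anything else is treated as a callable. Driving the extractor standalone, the way the unit tests do, looks roughly like this — selectors and markup are examples only:

    require 'rdig'

    config = OpenStruct.new(:hpricot => OpenStruct.new(
      :title_tag_selector   => 'title',                                        # String => doc/'title'
      :content_tag_selector => lambda { |doc| doc.at("div[@id='content']") }   # proc   => called with the doc
    ))
    extractor = RDig::ContentExtractors::HpricotContentExtractor.new(config)

    html   = '<html><head><title>Hi</title></head><body><div id="content">Hello <b>world</b></div></body></html>'
    result = extractor.process(html)
    # expected: result[:title] == 'Hi', result[:content] == 'Hello world', result[:links] == []
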
@@ -0,0 +1,49 @@
+ module RDig
+   module ContentExtractors
+     # Extract text from pdf content.
+     #
+     # Requires the pdftotext and pdfinfo utilities from the
+     # xpdf-utils package
+     # (on debian and friends do 'apt-get install xpdf-utils')
+     #
+     class PdfContentExtractor < ContentExtractor
+       include ExternalAppHelper
+
+       def initialize(config)
+         super(config)
+         @pattern = /^application\/pdf/
+         @pdftotext = 'pdftotext'
+         @pdfinfo = 'pdfinfo'
+         @available = true
+         [ @pdftotext, @pdfinfo].each { |program|
+           unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+             @available = false
+             break
+           end
+         }
+       end
+
+       def process(content)
+         result = {}
+         as_file(content) do |file|
+           result[:content] = get_content(file.path).strip
+           result[:title] = get_title(file.path)
+         end
+         result
+       end
+
+       def get_content(path_to_tempfile)
+         %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+       end
+
+       # extracts the title from pdf meta data
+       # needs pdfinfo
+       # returns the title or nil if no title was found
+       def get_title(path_to_tempfile)
+         %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+       rescue
+       end
+     end
+
+   end
+ end
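
Availability is probed by running both tools with -h and checking the banner; per document the extractor then does roughly the equivalent of the following shell calls (the path is a stand-in for the temporary file yielded by as_file):

    path  = '/tmp/rdig-doc.pdf'                     # illustrative temp file path
    text  = `pdftotext -enc UTF-8 '#{path}' -`      # full text on stdout
    info  = `pdfinfo -enc UTF-8 '#{path}'`
    title = info =~ /title:\s+(.*)$/i ? $1.strip : nil
    { :content => text.strip, :title => title }
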
@@ -0,0 +1,147 @@
+ begin
+   require 'rubyful_soup'
+ rescue LoadError
+   require 'rubygems'
+   require 'rubyful_soup'
+ end
+
+ # override some methods concerned with entity resolving
+ # to convert them to strings
+ class BeautifulStoneSoup
+   # resolve unknown html entities using the htmlentities lib
+   alias :orig_unknown_entityref :unknown_entityref
+   def unknown_entityref(ref)
+     if HTMLEntities::MAP.has_key?(ref)
+       handle_data [HTMLEntities::MAP[ref]].pack('U')
+     else
+       orig_unknown_entityref ref
+     end
+   end
+
+   # resolve numeric entities to utf8
+   def handle_charref(ref)
+     handle_data( ref.gsub(/([0-9]{1,7})/) {
+       [$1.to_i].pack('U')
+     }.gsub(/x([0-9a-f]{1,6})/i) {
+       [$1.to_i(16)].pack('U')
+     } )
+   end
+ end
+
+ module RDig
+   module ContentExtractors
+
+     # extracts title, content and links from html documents
+     class RubyfulSoupContentExtractor < ContentExtractor
+
+       def initialize(config)
+         super(config.rubyful_soup)
+         # if not configured, refuse to handle any content:
+         @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
+       end
+
+       # returns:
+       # { :content => 'extracted clear text',
+       #   :meta => { :title => 'Title' },
+       #   :links => [array of urls] }
+       def process(content)
+         result = { }
+         tag_soup = BeautifulSoup.new(content)
+         result[:title] = extract_title(tag_soup)
+         result[:links] = extract_links(tag_soup)
+         result[:content] = extract_content(tag_soup)
+         return result
+       end
+
+       # Extracts textual content from the HTML tree.
+       #
+       # - First, the root element to use is determined using the
+       #   +content_element+ method, which itself uses the content_tag_selector
+       #   from RDig.configuration.
+       # - Then, this element is processed by +extract_text+, which will give
+       #   all textual content contained in the root element and all it's
+       #   children.
+       def extract_content(tag_soup)
+         content = ''
+         ce = content_element(tag_soup)
+         ce.children { |child|
+           extract_text(child, content)
+         } unless ce.nil?
+         return content.strip
+       end
+
+       # extracts the href attributes of all a tags, except
+       # internal links like <a href="#top">
+       def extract_links(tagsoup)
+         tagsoup.find_all('a').map { |link|
+           CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
+         }.compact
+       end
+
+       # Extracts the title from the given html tree
+       def extract_title(tagsoup)
+         the_title_tag = title_tag(tagsoup)
+         if the_title_tag.is_a? String
+           the_title_tag
+         else
+           title = ''
+           extract_text(the_title_tag, title)
+           title.strip
+         end
+       end
+
+       # Recursively extracts all text contained in the given element,
+       # and appends it to content.
+       def extract_text(element, content='')
+         return nil if element.nil?
+         if element.is_a? NavigableString
+           value = strip_comments(element)
+           value.strip!
+           unless value.empty?
+             content << value
+             content << ' '
+           end
+         elsif element.string # it's a Tag, and it has some content string
+           # skip inline scripts and styles
+           return nil if element.name =~ /^(script|style)$/i
+           value = element.string.strip
+           unless value.empty?
+             content << value
+             content << ' '
+           end
+         else
+           element.children { |child|
+             extract_text(child, content)
+           }
+         end
+       end
+
+       # Returns the element to extract the title from.
+       #
+       # This may return a string, e.g. an attribute value selected from a meta
+       # tag, too.
+       def title_tag(tagsoup)
+         if @config.title_tag_selector
+           @config.title_tag_selector.call(tagsoup)
+         else
+           tagsoup.html.head.title
+         end
+       end
+
+       # Retrieve the root element to extract document content from
+       def content_element(tagsoup)
+         if @config.content_tag_selector
+           @config.content_tag_selector.call(tagsoup)
+         else
+           tagsoup.html.body
+         end
+       end
+
+       # Return the given string minus all html comments
+       def strip_comments(string)
+         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
+       end
+     end
+
+   end
+ end
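
The BeautifulStoneSoup patch is carried over from content_extractors.rb: named entities are looked up in the bundled HTMLEntities::MAP, and numeric references are packed straight into UTF-8, e.g.:

    # what handle_charref does with "&#8364;" and "&#x20ac;":
    [8364].pack('U')    # => "€"
    [0x20ac].pack('U')  # => "€"
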
@@ -12,7 +12,7 @@ module RDig
    # takes the ferret section of the rdig configuration as a parameter.
    def initialize(settings)
      @ferret_config = settings
-     @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
+     @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
      ferret_searcher
    end

@@ -24,8 +24,8 @@ module RDig
        @ferret_searcher = nil
      end
      unless @ferret_searcher
-       @ferret_searcher = IndexSearcher.new(@ferret_config.path)
-       @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
+       @ferret_searcher = Ferret::Search::Searcher.new(@ferret_config.path)
+       @query_parser.fields = @ferret_searcher.reader.field_names.to_a
      end
      @ferret_searcher
    end
@@ -36,23 +36,23 @@ module RDig
    # for more information on queries.
    # A Ferret::Search::Query instance may be given, too.
    #
-   # Otions are:
-   # first_doc:: first document in result list to retrieve (0-based). The default is 0.
-   # num_docs:: number of documents to retrieve. The default is 10.
+   # Some of the more often used options are:
+   # offset:: first document in result list to retrieve (0-based). The default is 0.
+   # limit:: number of documents to retrieve. The default is 10.
+   # Please see the Ferret::Search::Searcher API for more options.
    def search(query, options={})
      result = {}
      query = query_parser.parse(query) if query.is_a?(String)
      puts "Query: #{query}"
-     hits = ferret_searcher.search(query, options)
-     result[:hitcount] = hits.total_hits
      results = []
-     hits.each { |doc_id,score|
-       doc = ferret_searcher.reader.get_document doc_id
+     searcher = ferret_searcher
+     result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
+       doc = searcher[doc_id]
        results << { :score => score,
-                    :title => doc['title'],
-                    :url => doc['url'],
-                    :extract => build_extract(doc['data']) }
-     }
+                    :title => doc[:title],
+                    :url => doc[:url],
+                    :extract => build_extract(doc[:data]) }
+     end
      result[:list] = results
      result
    end
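
With the switch to Ferret::Search::Searcher#search_each the pagination options follow Ferret's naming, and stored fields are now read back with symbol keys. A usage sketch — the query string and option values are just examples:

    require 'rdig'
    # assumes an index has already been built for the configured site
    result = RDig.searcher.search('ruby indexing', :offset => 10, :limit => 10)

    puts "#{result[:hitcount]} documents found"
    result[:list].each do |hit|
      puts "#{hit[:score]}  #{hit[:title]}  #{hit[:url]}"
    end
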
data/rakefile CHANGED
@@ -125,12 +125,16 @@ else
    to help building a site search for web sites or intranets. Internally,
    Ferret is used for the full text indexing. After creating a config file
    for your site, the index can be built with a single call to rdig.
+   For HTML page crawling, hpricot and rubyful_soup are supported.
  EOF

  #### Dependencies and requirements.

  s.add_dependency('ferret', '>= 0.10.0')
- s.add_dependency('rubyful_soup', '>= 1.0.4')
+ # TODO: check if there is anything like 'suggested' instead of required, or
+ # ORed dependencies...
+ #s.add_dependency('rubyful_soup', '>= 1.0.4')
+ #s.add_dependency('hpricot', '>= 0.4')
  #s.requirements << ""

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -0,0 +1,77 @@
+ require 'test_helper'
+ class HpricotContentExtractorTest < Test::Unit::TestCase
+   include TestHelper
+
+   def setup
+     @config = RDig.config.content_extraction.hpricot.clone
+     @extractor = ContentExtractors::HpricotContentExtractor.new(OpenStruct.new(:hpricot => @config))
+     @nbsp = [160].pack('U') # non breaking space
+   end
+
+   def test_can_do
+     assert !@extractor.can_do('application/pdf')
+     assert !@extractor.can_do('application/msword')
+     assert @extractor.can_do('text/html')
+     assert @extractor.can_do('text/xml')
+     assert @extractor.can_do('application/xml')
+     assert @extractor.can_do('application/xhtml+xml')
+   end
+
+   def test_simple
+     result = ContentExtractors.process(html_doc('simple'), 'text/html')
+     assert_not_nil result
+     assert_equal 'Sample Title', result[:title]
+     assert_not_nil result[:content]
+     assert_not_nil result[:links]
+     assert_equal 1, result[:links].size
+     assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
+     assert_equal 'http://test.host/affe.html', result[:links].first
+   end
+
+   def test_entities
+     result = @extractor.process(html_doc('entities'))
+     assert_equal 'Sample & Title', result[:title]
+     assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
+     assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
+     assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
+   end
+
+   def test_custom_content_element
+     @config.title_tag_selector = lambda do |doc|
+       doc.at("h1[@class='title']")
+     end
+     @config.content_tag_selector = lambda do |doc|
+       doc.at("div[@id='content']")
+     end
+     result = @extractor.process(html_doc('custom_tag_selectors'))
+     assert_equal 'Sample Title in h1', result[:title]
+     assert_equal 'Affe Real content is here.', result[:content]
+     # check if links are collected outside the content tag, too:
+     assert_equal 3, result[:links].size
+     assert_equal 'http://test.host/outside.html', result[:links].first
+     assert_equal '/inside.html', result[:links][1]
+     assert_equal '/footer.html', result[:links][2]
+   end
+
+
+   def test_title_from_dcmeta
+     @config.title_tag_selector = lambda do |doc|
+       doc.at("meta[@name='DC.title']")['content']
+     end
+     result = @extractor.process(html_doc('custom_tag_selectors'))
+     assert_equal 'Title from DC meta data', result[:title]
+   end
+
+   def test_preprocessed_title
+     @config.title_tag_selector = lambda do |doc|
+       title = doc.at("meta[@name='DC.title']")['content']
+       # use only a portion of the title tag's contents if it matches our
+       # regexp:
+       (title =~ /^(.*)meta data$/ ? $1 : title).strip
+     end
+     result = @extractor.process(html_doc('custom_tag_selectors'))
+     assert_equal 'Title from DC', result[:title]
+   end
+
+ end
+
@@ -1,10 +1,16 @@
  require 'test_helper'
- class HtmlContentExtractorTest < Test::Unit::TestCase
+ class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
    include TestHelper

    def setup
-     @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
-     @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
+     @config = OpenStruct.new(
+       :content_tag_selector => lambda { |tagsoup|
+         tagsoup.html.body
+       },
+       :title_tag_selector => lambda { |tagsoup|
+         tagsoup.html.head.title
+       })
+     @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
      @nbsp = [160].pack('U') # non breaking space
    end

@@ -37,10 +43,10 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
    end

    def test_custom_content_element
-     @config.html.title_tag_selector = lambda do |tagsoup|
+     @config.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('h1', :attrs => { 'class', 'title' })
      end
-     @config.html.content_tag_selector = lambda do |tagsoup|
+     @config.content_tag_selector = lambda do |tagsoup|
        tagsoup.find('div', :attrs => { 'id', 'content' })
      end
      result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -55,7 +61,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase


    def test_title_from_dcmeta
-     @config.html.title_tag_selector = lambda do |tagsoup|
+     @config.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
      end
      result = @extractor.process(html_doc('custom_tag_selectors'))
@@ -63,7 +69,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
    end

    def test_preprocessed_title
-     @config.html.title_tag_selector = lambda do |tagsoup|
+     @config.title_tag_selector = lambda do |tagsoup|
        title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
        # use only a portion of the title tag's contents if it matches our
        # regexp:
@@ -0,0 +1,35 @@
+ require 'test_helper'
+ class SearcherTest < Test::Unit::TestCase
+   include TestHelper
+
+   def setup
+     @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+     index_dir = 'tmp/test-index'
+     Dir.mkdir index_dir unless File.directory? index_dir
+     RDig.configuration do |cfg|
+       @old_crawler_cfg = cfg.crawler.clone
+       cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+       cfg.crawler.num_threads = 1
+       cfg.crawler.wait_before_leave = 1
+       cfg.index.path = index_dir
+       cfg.verbose = true
+     end
+     crawler = Crawler.new
+     crawler.run
+   end
+
+   def teardown
+     RDig.configuration do |cfg|
+       cfg.crawler = @old_crawler_cfg
+     end
+   end
+
+   def test_search
+     result = RDig.searcher.search 'some sample text'
+     assert_equal 3, result[:hitcount]
+     assert_equal 3, result[:list].size
+   end
+
+ end
+
+
metadata CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
  specification_version: 1
  name: rdig
  version: !ruby/object:Gem::Version
-   version: 0.3.2
- date: 2006-10-09 00:00:00 +02:00
+   version: 0.3.3
+ date: 2006-10-23 00:00:00 +02:00
  summary: Ruby based web site indexing and searching library.
  require_paths:
  - lib
  email: jk@jkraemer.net
  homepage: http://rdig.rubyforge.org/
  rubyforge_project: rdig
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
  autorequire:
  default_executable: rdig
  bindir: bin
@@ -33,6 +33,7 @@ files:
  - lib/rdig
  - lib/htmlentities
  - lib/rdig.rb
+ - lib/rdig/content_extractors
  - lib/rdig/crawler.rb
  - lib/rdig/search.rb
  - lib/rdig/highlight.rb
@@ -41,6 +42,10 @@ files:
  - lib/rdig/content_extractors.rb
  - lib/rdig/documents.rb
  - lib/rdig/file.rb
+ - lib/rdig/content_extractors/rubyful_soup.rb
+ - lib/rdig/content_extractors/doc.rb
+ - lib/rdig/content_extractors/hpricot.rb
+ - lib/rdig/content_extractors/pdf.rb
  - lib/htmlentities/CHANGES
  - lib/htmlentities/COPYING
  - lib/htmlentities/README
@@ -50,8 +55,10 @@ files:
  - test/test_helper.rb
  - test/unit/etag_filter_test.rb
  - test/unit/url_filters_test.rb
- - test/unit/html_content_extractor_test.rb
+ - test/unit/searcher_test.rb
+ - test/unit/rubyful_soup_content_extractor_test.rb
  - test/unit/pdf_content_extractor_test.rb
+ - test/unit/hpricot_content_extractor_test.rb
  - test/unit/word_content_extractor_test.rb
  - test/unit/file_document_test.rb
  - test/unit/crawler_fs_test.rb
@@ -100,12 +107,3 @@ dependencies:
      - !ruby/object:Gem::Version
        version: 0.10.0
    version:
- - !ruby/object:Gem::Dependency
-   name: rubyful_soup
-   version_requirement:
-   version_requirements: !ruby/object:Gem::Version::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: 1.0.4
-   version: