RubyGems - rdig - Versions diffs - 0.1.0 → 0.2.0 - Mend

rdig 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/CHANGES +6 -0
data/bin/rdig +0 -17
data/lib/rdig.rb +5 -1
data/lib/rdig/content_extractors.rb +133 -18
data/rakefile +6 -1
data/test/fixtures/pdf/simple.pdf +0 -0
data/test/fixtures/word/simple.doc +0 -0
data/test/test_helper.rb +6 -0
data/test/unit/html_content_extractor_test.rb +11 -2
data/test/unit/pdf_content_extractor_test.rb +33 -0
data/test/unit/word_content_extractor_test.rb +34 -0
metadata +81 -76

data/CHANGES CHANGED Viewed

@@ -1,2 +1,8 @@
+0.2.0
+- add pdf and Word content extraction capabilities using the tools
+  from the xpdf-utils and wv packages
+- additional content extractors may be plugged in by extending
+  the ContentExtractor class
 0.1.0
 initial release

data/bin/rdig CHANGED Viewed

@@ -13,20 +13,3 @@ end
 RDig.application.run
-#$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
-#$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
-#require 'init'
-#if ARGV[0]
-#  require ARGV[0]
-#else
-#  require 'config'
-#end
-#include SiteSearch
-#puts "creating new index in #{SiteSearch.settings[:index_dir]}"
-#crawler = Crawler.new
-#crawler.run

data/lib/rdig.rb CHANGED Viewed

@@ -24,7 +24,7 @@
 #++
 #
-RDIGVERSION = '0.1.0'
+RDIGVERSION = '0.2.0'
 require 'thread'
@@ -37,6 +37,10 @@ require 'cgi'
 require 'set'
 require 'net/http'
 require 'getoptlong'
+require 'tempfile'
+# mkmf gives us the handy find_executable method used to check for helper
+# programs:
+require 'mkmf'
 begin
   require 'rubyful_soup'

data/lib/rdig/content_extractors.rb CHANGED Viewed

@@ -23,31 +23,146 @@ end
 module RDig
-  # Contains Classes which are used for extracting content and meta data from
+  # Contains classes which are used for extracting content and meta data from
   # various content types.
-  #
-  # TODO: support at least pdf, too.
   module ContentExtractors
     # process the given +content+ depending on it's +content_type+.
-    def ContentExtractors.process(content, content_type)
-      case content_type
-      when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-        return HtmlContentExtractor.process(content)
-      else
+    def self.process(content, content_type)
+      ContentExtractor.process(content, content_type)
+      #      case content_type
+      #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
+      #  return HtmlContentExtractor.process(content)
+      #when /^application\/.+pdf/
+      #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
+      #else
+      #  puts "unable to handle content type #{content_type}"
+      #end
+      #return nil
+    end
+    # Base class for Content Extractors.
+    # Extractors inheriting from this class will be auto-discovered and used
+    # when can_do returns true
+    class ContentExtractor
+      def self.inherited(extractor)
+        super(extractor)
+        puts("discovered content extractor class: #{extractor}")
+        self.extractors << extractor
+      end
+      def self.extractors; @@extractors ||= [] end
+      def self.extractor_instances
+        @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
+      end
+      def self.process(content, content_type)
+        self.extractor_instances.each { |extractor|
+          return extractor.process(content) if extractor.can_do(content_type)
+        }
         puts "unable to handle content type #{content_type}"
+        nil
+      end
+      def can_do(content_type)
+        content_type =~ @pattern
       end
-      return nil
+    end
+    # to be used by concrete implementations having a get_content class method
+    # that takes a path to a file and return the textual content extracted from
+    # that file.
+    module ExternalAppHelper
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result[:content] = get_content(file.path).strip
+        end
+        result
+      end
+      def as_file(content)
+        file = Tempfile.new('rdig')
+        file << content
+        file.close
+        yield file
+        file.delete
+      end
+      def available
+        if @available.nil?
+          @available = !find_executable(@executable).nil?
+        end
+        @available
+      end
+      def can_do(content_type)
+        available and super(content_type)
+      end
+    end
+    # Extract text from pdf content.
+    #
+    # Requires the pdftotext utility from the xpdf-utils package
+    # (on debian and friends do 'apt-get install xpdf-utils')
+    #
+    # TODO: use pdfinfo to get title from document
+    class PdfContentExtractor < ContentExtractor
+      include ExternalAppHelper
+      def initialize
+        @executable = 'pdftotext'
+        @pattern = /^application\/pdf/
+      end
+      def get_content(path_to_tempfile)
+        %x{#{@executable} '#{path_to_tempfile}' -}
+      end
+    end
+    # Extract text from word documents
+    #
+    # Requires the antiword utility
+    # (on debian and friends do 'apt-get install antiword')
+    class WordContentExtractor < ContentExtractor
+      include ExternalAppHelper
+      def initialize
+        @executable = 'wvHtml'
+        @pattern = /^application\/msword/
+        @html_extractor = HtmlContentExtractor.new
+      end
+      def process(content)
+        result = {}
+        as_file(content) do |infile|
+          outfile = Tempfile.new('rdig')
+          outfile.close
+          %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
+          File.open(outfile.path) do |html|
+            result = @html_extractor.process(html.read)
+          end
+          outfile.delete
+        end
+        return result || {}
+      end
     end
     # extracts title, content and links from html documents
-    class HtmlContentExtractor
+    class HtmlContentExtractor < ContentExtractor
+      def initialize
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
+      end
       # returns:
       # { :content => 'extracted clear text',
       #   :meta => { :title => 'Title' },
       #   :links => [array of urls] }
-      def self.process(content)
+      def process(content)
         result = { }
         tag_soup = BeautifulSoup.new(content)
         result[:title] = extract_title(tag_soup)
@@ -64,7 +179,7 @@ module RDig
       # - Then, this element is processed by +extract_text+, which will give
       # all textual content contained in the root element and all it's
       # children.
-      def self.extract_content(tag_soup)
+      def extract_content(tag_soup)
         content = ''
         content_element(tag_soup).children { |child|
           extract_text(child, content)
@@ -74,14 +189,14 @@ module RDig
       # extracts the href attributes of all a tags, except
       # internal links like <a href="#top">
-      def self.extract_links(tagsoup)
+      def extract_links(tagsoup)
         tagsoup.find_all('a').map { |link|
           CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
         }.compact
       end
       # Extracts the title from the given html tree
-      def self.extract_title(tagsoup)
+      def extract_title(tagsoup)
         title = ''
         the_title_tag = title_tag(tagsoup)
         if the_title_tag.is_a? String
@@ -93,7 +208,7 @@ module RDig
       # Recursively extracts all text contained in the given element,
       # and appends it to content.
-      def self.extract_text(element, content='')
+      def extract_text(element, content='')
         if element.is_a? NavigableString
           value = strip_comments(element)
           value.strip!
@@ -118,7 +233,7 @@ module RDig
       #
       # This may return a string, e.g. an attribute value selected from a meta
       # tag, too.
-      def self.title_tag(tagsoup)
+      def title_tag(tagsoup)
         if RDig.config.content_extraction.html.title_tag_selector
           RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
         else
@@ -127,7 +242,7 @@ module RDig
       end
       # Retrieve the root element to extract document content from
-      def self.content_element(tagsoup)
+      def content_element(tagsoup)
         if RDig.config.content_extraction.html.content_tag_selector
           RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
         else
@@ -136,7 +251,7 @@ module RDig
       end
       # Return the given string minus all html comments
-      def self.strip_comments(string)
+      def strip_comments(string)
         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
       end
     end

data/rakefile CHANGED Viewed

@@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t|
 # Generate the RDoc documentation ----------------------------------------
 rd = Rake::RDocTask.new { |rdoc|
-  rdoc.rdoc_dir = 'doc/html'
+  rdoc.rdoc_dir = 'html'
   rdoc.title    = "RDig - Ferret based full text search for web sites"
   rdoc.options << '--line-numbers' << '--inline-source'
   rdoc.options << '--main' << 'README'
@@ -323,3 +323,8 @@ task :tag => [:prerelease] do
   end
 end
+# Publish RDocs ------------------------------------------------------
+desc "Publish the API documentation"
+task :pdoc => [:rdoc] do
+  Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
+end

data/test/fixtures/pdf/simple.pdf ADDED Viewed

Binary file

data/test/fixtures/word/simple.doc ADDED Viewed

Binary file

data/test/test_helper.rb CHANGED Viewed

@@ -12,6 +12,12 @@ module TestHelper
     }
   end
+  def word_doc(name)
+    read_fixture("word/#{name}.doc")
+  end
+  def pdf_doc(name)
+    read_fixture("pdf/#{name}.pdf")
+  end
   def html_doc(name)
     read_fixture("html/#{name}.html")
   end

data/test/unit/html_content_extractor_test.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   include TestHelper
   def setup
-    @extractor = ContentExtractors::HtmlContentExtractor
+    @extractor = ContentExtractors::HtmlContentExtractor.new
     @nbsp = [160].pack('U') # non breaking space
     @config_backup = RDig.config.content_extraction.html.clone
   end
@@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
     RDig.config.content_extraction.html = @config_backup
   end
+  def test_can_do
+    assert !@extractor.can_do('application/pdf')
+    assert !@extractor.can_do('application/msword')
+    assert @extractor.can_do('text/html')
+    assert @extractor.can_do('text/xml')
+    assert @extractor.can_do('application/xml')
+    assert @extractor.can_do('application/xhtml+xml')
+  end
   def test_simple
-    result = @extractor.process(html_doc('simple'))
+    result = ContentExtractors.process(html_doc('simple'), 'text/html')
     assert_not_nil result
     assert_equal 'Sample Title', result[:title]
     assert_not_nil result[:content]

data/test/unit/pdf_content_extractor_test.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require 'test_helper'
+class PdfContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+  def setup
+    @ce = ContentExtractors::PdfContentExtractor.new
+  end
+  def test_can_do
+    assert @ce.can_do('application/pdf')
+    assert !@ce.can_do('application/msword')
+  end
+  def test_simple_with_ctype
+    result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
+    check_content(result)
+  end
+  def test_simple
+    result = @ce.process(pdf_doc('simple'))
+    check_content(result)
+  end
+  private
+  def check_content(result)
+    assert_not_nil result
+    assert_nil result[:title]
+    assert_nil result[:links]
+    assert_not_nil result[:content]
+    assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
+  end
+end

data/test/unit/word_content_extractor_test.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'test_helper'
+class WordContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+  def setup
+    @ce = ContentExtractors::WordContentExtractor.new
+  end
+  def test_can_do
+    assert !@ce.can_do('application/pdf')
+    assert @ce.can_do('application/msword')
+  end
+  def test_simple_with_ctype
+    result = ContentExtractors.process(word_doc('simple'), 'application/msword')
+    check_content(result)
+  end
+  def test_simple
+    result = @ce.process(word_doc('simple'))
+    check_content(result)
+  end
+  private
+  def check_content(result)
+    assert_not_nil result
+    assert_equal [], result[:links]
+    assert_not_nil result[:title]
+    assert_equal 'Untitled', result[:title]
+    assert_not_nil result[:content]
+    assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
+  end
+end

metadata CHANGED Viewed

@@ -1,102 +1,107 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.10
+rubygems_version: 0.8.11
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2006-03-25
+  version: 0.2.0
+date: 2006-04-19 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
-  - lib
+- lib
 email: jk@jkraemer.net
 homepage: http://rdig.rubyforge.org/
 rubyforge_project: rdig
-description: "RDig provides an HTTP crawler and content extraction utilities to help building
-  a site search for web sites or intranets. Internally, Ferret is used for the
-  full text indexing. After creating a config file  for your site, the index can
-  be built with a single call to rdig."
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file  for your site, the index can be built with a single call to rdig.
 autorequire:
 default_executable: rdig
 bindir: bin
 has_rdoc: true
 required_ruby_version: !ruby/object:Gem::Version::Requirement
   requirements:
-    -
-      - ">"
-      - !ruby/object:Gem::Version
-        version: 0.0.0
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
   version:
 platform: ruby
+signing_key:
+cert_chain:
 authors:
-  - Jens Kraemer
+- Jens Kraemer
 files:
-  - bin/rdig
-  - lib/rdig
-  - lib/htmlentities
-  - lib/rdig.rb
-  - lib/rdig/http_client.rb
-  - lib/rdig/crawler.rb
-  - lib/rdig/search.rb
-  - lib/rdig/highlight.rb
-  - lib/rdig/index.rb
-  - lib/rdig/url_filters.rb
-  - lib/rdig/content_extractors.rb
-  - lib/htmlentities/CHANGES
-  - lib/htmlentities/COPYING
-  - lib/htmlentities/README
-  - lib/htmlentities/htmlentities.rb
-  - test/unit
-  - test/fixtures
-  - test/test_helper.rb
-  - test/unit/etag_filter_test.rb
-  - test/unit/url_filters_test.rb
-  - test/unit/html_content_extractor_test.rb
-  - test/fixtures/html
-  - test/fixtures/html/entities.html
-  - test/fixtures/html/simple.html
-  - test/fixtures/html/custom_tag_selectors.html
-  - doc/examples
-  - doc/examples/config.rb
-  - LICENSE
-  - TODO
-  - CHANGES
-  - README
-  - install.rb
-  - rakefile
+- bin/rdig
+- lib/rdig
+- lib/htmlentities
+- lib/rdig.rb
+- lib/rdig/http_client.rb
+- lib/rdig/crawler.rb
+- lib/rdig/search.rb
+- lib/rdig/highlight.rb
+- lib/rdig/index.rb
+- lib/rdig/url_filters.rb
+- lib/rdig/content_extractors.rb
+- lib/htmlentities/CHANGES
+- lib/htmlentities/COPYING
+- lib/htmlentities/README
+- lib/htmlentities/htmlentities.rb
+- test/unit
+- test/fixtures
+- test/test_helper.rb
+- test/unit/etag_filter_test.rb
+- test/unit/url_filters_test.rb
+- test/unit/html_content_extractor_test.rb
+- test/unit/pdf_content_extractor_test.rb
+- test/unit/word_content_extractor_test.rb
+- test/fixtures/html
+- test/fixtures/pdf
+- test/fixtures/word
+- test/fixtures/html/entities.html
+- test/fixtures/html/simple.html
+- test/fixtures/html/custom_tag_selectors.html
+- test/fixtures/pdf/simple.pdf
+- test/fixtures/word/simple.doc
+- doc/examples
+- doc/examples/config.rb
+- LICENSE
+- TODO
+- CHANGES
+- README
+- install.rb
+- rakefile
 test_files: []
 rdoc_options:
-  - "--title"
-  - "Rake -- Ruby Make"
-  - "--main"
-  - README
-  - "--line-numbers"
+- --title
+- Rake -- Ruby Make
+- --main
+- README
+- --line-numbers
 extra_rdoc_files:
-  - README
-  - CHANGES
-  - LICENSE
-  - TODO
+- README
+- CHANGES
+- LICENSE
+- TODO
 executables:
-  - rdig
+- rdig
 extensions: []
 requirements: []
 dependencies:
-  - !ruby/object:Gem::Dependency
-    name: ferret
-    version_requirement:
-    version_requirements: !ruby/object:Gem::Version::Requirement
-      requirements:
-        -
-          - ">="
-          - !ruby/object:Gem::Version
-            version: 0.3.2
-      version:
-  - !ruby/object:Gem::Dependency
-    name: rubyful_soup
-    version_requirement:
-    version_requirements: !ruby/object:Gem::Version::Requirement
-      requirements:
-        -
-          - ">="
-          - !ruby/object:Gem::Version
-            version: 1.0.4
-      version:
+- !ruby/object:Gem::Dependency
+  name: ferret
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.3.2
+    version:
+- !ruby/object:Gem::Dependency
+  name: rubyful_soup
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.0.4
+    version: