RubyGems - rdig - Versions diffs - 0.1.0 → 0.2.0 - Mend

rdig 0.1.0 → 0.2.0

Files changed (12) hide show

data/CHANGES +6 -0
data/bin/rdig +0 -17
data/lib/rdig.rb +5 -1
data/lib/rdig/content_extractors.rb +133 -18
data/rakefile +6 -1
data/test/fixtures/pdf/simple.pdf +0 -0
data/test/fixtures/word/simple.doc +0 -0
data/test/test_helper.rb +6 -0
data/test/unit/html_content_extractor_test.rb +11 -2
data/test/unit/pdf_content_extractor_test.rb +33 -0
data/test/unit/word_content_extractor_test.rb +34 -0
metadata +81 -76

data/CHANGES CHANGED Viewed

@@ -1,2 +1,8 @@
+0.2.0
+- add pdf and Word content extraction capabilities using the tools
+  from the xpdf-utils and wv packages
+- additional content extractors may be plugged in by extending
+  the ContentExtractor class
 0.1.0
 initial release

data/bin/rdig CHANGED Viewed

@@ -13,20 +13,3 @@ end
 RDig.application.run
-#$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
-#$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
-#require 'init'
-#if ARGV[0]
-#  require ARGV[0]
-#else
-#  require 'config'
-#end
-#include SiteSearch
-#puts "creating new index in #{SiteSearch.settings[:index_dir]}"
-#crawler = Crawler.new
-#crawler.run

data/lib/rdig.rb CHANGED Viewed

@@ -24,7 +24,7 @@
 #++
 #
-RDIGVERSION = '0.1.0'
+RDIGVERSION = '0.2.0'
 require 'thread'
@@ -37,6 +37,10 @@ require 'cgi'
 require 'set'
 require 'net/http'
 require 'getoptlong'
+require 'tempfile'
+# mkmf gives us the handy find_executable method used to check for helper
+# programs:
+require 'mkmf'
 begin
   require 'rubyful_soup'

data/lib/rdig/content_extractors.rb CHANGED Viewed

@@ -23,31 +23,146 @@ end
 module RDig
-  # Contains Classes which are used for extracting content and meta data from
+  # Contains classes which are used for extracting content and meta data from
   # various content types.
-  #
-  # TODO: support at least pdf, too.
   module ContentExtractors
     # process the given +content+ depending on its +content_type+.
-    def ContentExtractors.process(content, content_type)
-      case content_type
-      when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
-        return HtmlContentExtractor.process(content)
-      else
+    def self.process(content, content_type)
+      ContentExtractor.process(content, content_type)
+      #      case content_type
+      #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
+      #  return HtmlContentExtractor.process(content)
+      #when /^application\/.+pdf/
+      #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
+      #else
+      #  puts "unable to handle content type #{content_type}"
+      #end
+      #return nil
+    end
+    # Base class for Content Extractors.
+    # Extractors inheriting from this class will be auto-discovered and used
+    # when can_do returns true
+    class ContentExtractor
+      def self.inherited(extractor)
+        super(extractor)
+        puts("discovered content extractor class: #{extractor}")
+        self.extractors << extractor
+      end
+      def self.extractors; @@extractors ||= [] end
+      def self.extractor_instances
+        @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
+      end
+      def self.process(content, content_type)
+        self.extractor_instances.each { |extractor|
+          return extractor.process(content) if extractor.can_do(content_type)
+        }
         puts "unable to handle content type #{content_type}"
+        nil
+      end
+      def can_do(content_type)
+        content_type =~ @pattern
       end
-      return nil
+    end
+    # to be used by concrete implementations having a get_content instance
+    # method that takes a path to a file and returns the textual content
+    # extracted from that file.
+    module ExternalAppHelper
+      def process(content)
+        result = {}
+        as_file(content) do |file|
+          result[:content] = get_content(file.path).strip
+        end
+        result
+      end
+      def as_file(content)
+        file = Tempfile.new('rdig')
+        file << content
+        file.close
+        yield file
+        file.delete
+      end
+      def available
+        if @available.nil?
+          @available = !find_executable(@executable).nil?
+        end
+        @available
+      end
+      def can_do(content_type)
+        available and super(content_type)
+      end
+    end
+    # Extract text from pdf content.
+    #
+    # Requires the pdftotext utility from the xpdf-utils package
+    # (on debian and friends do 'apt-get install xpdf-utils')
+    #
+    # TODO: use pdfinfo to get title from document
+    class PdfContentExtractor < ContentExtractor
+      include ExternalAppHelper
+      def initialize
+        @executable = 'pdftotext'
+        @pattern = /^application\/pdf/
+      end
+      def get_content(path_to_tempfile)
+        %x{#{@executable} '#{path_to_tempfile}' -}
+      end
+    end
+    # Extract text from word documents
+    #
+    # Requires the wvHtml utility from the wv package
+    # (on debian and friends do 'apt-get install wv')
+    class WordContentExtractor < ContentExtractor
+      include ExternalAppHelper
+      def initialize
+        @executable = 'wvHtml'
+        @pattern = /^application\/msword/
+        @html_extractor = HtmlContentExtractor.new
+      end
+      def process(content)
+        result = {}
+        as_file(content) do |infile|
+          outfile = Tempfile.new('rdig')
+          outfile.close
+          %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
+          File.open(outfile.path) do |html|
+            result = @html_extractor.process(html.read)
+          end
+          outfile.delete
+        end
+        return result || {}
+      end
     end
     # extracts title, content and links from html documents
-    class HtmlContentExtractor
+    class HtmlContentExtractor < ContentExtractor
+      def initialize
+        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
+      end
       # returns:
       # { :content => 'extracted clear text',
       #   :meta => { :title => 'Title' },
       #   :links => [array of urls] }
-      def self.process(content)
+      def process(content)
         result = { }
         tag_soup = BeautifulSoup.new(content)
         result[:title] = extract_title(tag_soup)
@@ -64,7 +179,7 @@ module RDig
       # - Then, this element is processed by +extract_text+, which will give
       # all textual content contained in the root element and all its
       # children.
-      def self.extract_content(tag_soup)
+      def extract_content(tag_soup)
         content = ''
         content_element(tag_soup).children { |child|
           extract_text(child, content)
@@ -74,14 +189,14 @@ module RDig
       # extracts the href attributes of all a tags, except
       # internal links like <a href="#top">
-      def self.extract_links(tagsoup)
+      def extract_links(tagsoup)
         tagsoup.find_all('a').map { |link|
           CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
         }.compact
       end
       # Extracts the title from the given html tree
-      def self.extract_title(tagsoup)
+      def extract_title(tagsoup)
         title = ''
         the_title_tag = title_tag(tagsoup)
         if the_title_tag.is_a? String
@@ -93,7 +208,7 @@ module RDig
       # Recursively extracts all text contained in the given element,
       # and appends it to content.
-      def self.extract_text(element, content='')
+      def extract_text(element, content='')
         if element.is_a? NavigableString
           value = strip_comments(element)
           value.strip!
@@ -118,7 +233,7 @@ module RDig
       #
       # This may return a string, e.g. an attribute value selected from a meta
       # tag, too.
-      def self.title_tag(tagsoup)
+      def title_tag(tagsoup)
         if RDig.config.content_extraction.html.title_tag_selector
           RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
         else
@@ -127,7 +242,7 @@ module RDig
       end
       # Retrieve the root element to extract document content from
-      def self.content_element(tagsoup)
+      def content_element(tagsoup)
         if RDig.config.content_extraction.html.content_tag_selector
           RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
         else
@@ -136,7 +251,7 @@ module RDig
       end
       # Return the given string minus all html comments
-      def self.strip_comments(string)
+      def strip_comments(string)
         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
       end
     end

data/rakefile CHANGED Viewed

@@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t|
 # Generate the RDoc documentation ----------------------------------------
 rd = Rake::RDocTask.new { |rdoc|
-  rdoc.rdoc_dir = 'doc/html'
+  rdoc.rdoc_dir = 'html'
   rdoc.title    = "RDig - Ferret based full text search for web sites"
   rdoc.options << '--line-numbers' << '--inline-source'
   rdoc.options << '--main' << 'README'
@@ -323,3 +323,8 @@ task :tag => [:prerelease] do
   end
 end
+# Publish RDocs ------------------------------------------------------
+desc "Publish the API documentation"
+task :pdoc => [:rdoc] do
+  Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
+end

data/test/fixtures/pdf/simple.pdf ADDED Viewed

Binary file

data/test/fixtures/word/simple.doc ADDED Viewed

Binary file

data/test/test_helper.rb CHANGED Viewed

@@ -12,6 +12,12 @@ module TestHelper
     }
   end
+  def word_doc(name)
+    read_fixture("word/#{name}.doc")
+  end
+  def pdf_doc(name)
+    read_fixture("pdf/#{name}.pdf")
+  end
   def html_doc(name)
     read_fixture("html/#{name}.html")
   end

data/test/unit/html_content_extractor_test.rb CHANGED Viewed

@@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   include TestHelper
   def setup
-    @extractor = ContentExtractors::HtmlContentExtractor
+    @extractor = ContentExtractors::HtmlContentExtractor.new
     @nbsp = [160].pack('U') # non breaking space
     @config_backup = RDig.config.content_extraction.html.clone
   end
@@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
     RDig.config.content_extraction.html = @config_backup
   end
+  def test_can_do
+    assert !@extractor.can_do('application/pdf')
+    assert !@extractor.can_do('application/msword')
+    assert @extractor.can_do('text/html')
+    assert @extractor.can_do('text/xml')
+    assert @extractor.can_do('application/xml')
+    assert @extractor.can_do('application/xhtml+xml')
+  end
   def test_simple
-    result = @extractor.process(html_doc('simple'))
+    result = ContentExtractors.process(html_doc('simple'), 'text/html')
     assert_not_nil result
     assert_equal 'Sample Title', result[:title]
     assert_not_nil result[:content]

data/test/unit/pdf_content_extractor_test.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require 'test_helper'
+class PdfContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+  def setup
+    @ce = ContentExtractors::PdfContentExtractor.new
+  end
+  def test_can_do
+    assert @ce.can_do('application/pdf')
+    assert !@ce.can_do('application/msword')
+  end
+  def test_simple_with_ctype
+    result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
+    check_content(result)
+  end
+  def test_simple
+    result = @ce.process(pdf_doc('simple'))
+    check_content(result)
+  end
+  private
+  def check_content(result)
+    assert_not_nil result
+    assert_nil result[:title]
+    assert_nil result[:links]
+    assert_not_nil result[:content]
+    assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
+  end
+end

data/test/unit/word_content_extractor_test.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'test_helper'
+class WordContentExtractorTest < Test::Unit::TestCase
+  include TestHelper
+  def setup
+    @ce = ContentExtractors::WordContentExtractor.new
+  end
+  def test_can_do
+    assert !@ce.can_do('application/pdf')
+    assert @ce.can_do('application/msword')
+  end
+  def test_simple_with_ctype
+    result = ContentExtractors.process(word_doc('simple'), 'application/msword')
+    check_content(result)
+  end
+  def test_simple
+    result = @ce.process(word_doc('simple'))
+    check_content(result)
+  end
+  private
+  def check_content(result)
+    assert_not_nil result
+    assert_equal [], result[:links]
+    assert_not_nil result[:title]
+    assert_equal 'Untitled', result[:title]
+    assert_not_nil result[:content]
+    assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
+  end
+end

metadata CHANGED Viewed

@@ -1,102 +1,107 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.10
+rubygems_version: 0.8.11
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.1.0
-date: 2006-03-25
+  version: 0.2.0
+date: 2006-04-19 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
-  - lib
+- lib
 email: jk@jkraemer.net
 homepage: http://rdig.rubyforge.org/
 rubyforge_project: rdig
-description: "RDig provides an HTTP crawler and content extraction utilities to help building
-  a site search for web sites or intranets. Internally, Ferret is used for the
-  full text indexing. After creating a config file  for your site, the index can
-  be built with a single call to rdig."
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file  for your site, the index can be built with a single call to rdig.
 autorequire:
 default_executable: rdig
 bindir: bin
 has_rdoc: true
 required_ruby_version: !ruby/object:Gem::Version::Requirement
   requirements:
-    -
-      - ">"
-      - !ruby/object:Gem::Version
-        version: 0.0.0
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
   version:
 platform: ruby
+signing_key:
+cert_chain:
 authors:
-  - Jens Kraemer
+- Jens Kraemer
 files:
-  - bin/rdig
-  - lib/rdig
-  - lib/htmlentities
-  - lib/rdig.rb
-  - lib/rdig/http_client.rb
-  - lib/rdig/crawler.rb
-  - lib/rdig/search.rb
-  - lib/rdig/highlight.rb
-  - lib/rdig/index.rb
-  - lib/rdig/url_filters.rb
-  - lib/rdig/content_extractors.rb
-  - lib/htmlentities/CHANGES
-  - lib/htmlentities/COPYING
-  - lib/htmlentities/README
-  - lib/htmlentities/htmlentities.rb
-  - test/unit
-  - test/fixtures
-  - test/test_helper.rb
-  - test/unit/etag_filter_test.rb
-  - test/unit/url_filters_test.rb
-  - test/unit/html_content_extractor_test.rb
-  - test/fixtures/html
-  - test/fixtures/html/entities.html
-  - test/fixtures/html/simple.html
-  - test/fixtures/html/custom_tag_selectors.html
-  - doc/examples
-  - doc/examples/config.rb
-  - LICENSE
-  - TODO
-  - CHANGES
-  - README
-  - install.rb
-  - rakefile
+- bin/rdig
+- lib/rdig
+- lib/htmlentities
+- lib/rdig.rb
+- lib/rdig/http_client.rb
+- lib/rdig/crawler.rb
+- lib/rdig/search.rb
+- lib/rdig/highlight.rb
+- lib/rdig/index.rb
+- lib/rdig/url_filters.rb
+- lib/rdig/content_extractors.rb
+- lib/htmlentities/CHANGES
+- lib/htmlentities/COPYING
+- lib/htmlentities/README
+- lib/htmlentities/htmlentities.rb
+- test/unit
+- test/fixtures
+- test/test_helper.rb
+- test/unit/etag_filter_test.rb
+- test/unit/url_filters_test.rb
+- test/unit/html_content_extractor_test.rb
+- test/unit/pdf_content_extractor_test.rb
+- test/unit/word_content_extractor_test.rb
+- test/fixtures/html
+- test/fixtures/pdf
+- test/fixtures/word
+- test/fixtures/html/entities.html
+- test/fixtures/html/simple.html
+- test/fixtures/html/custom_tag_selectors.html
+- test/fixtures/pdf/simple.pdf
+- test/fixtures/word/simple.doc
+- doc/examples
+- doc/examples/config.rb
+- LICENSE
+- TODO
+- CHANGES
+- README
+- install.rb
+- rakefile
 test_files: []
 rdoc_options:
-  - "--title"
-  - "Rake -- Ruby Make"
-  - "--main"
-  - README
-  - "--line-numbers"
+- --title
+- Rake -- Ruby Make
+- --main
+- README
+- --line-numbers
 extra_rdoc_files:
-  - README
-  - CHANGES
-  - LICENSE
-  - TODO
+- README
+- CHANGES
+- LICENSE
+- TODO
 executables:
-  - rdig
+- rdig
 extensions: []
 requirements: []
 dependencies:
-  - !ruby/object:Gem::Dependency
-    name: ferret
-    version_requirement:
-    version_requirements: !ruby/object:Gem::Version::Requirement
-      requirements:
-        -
-          - ">="
-          - !ruby/object:Gem::Version
-            version: 0.3.2
-      version:
-  - !ruby/object:Gem::Dependency
-    name: rubyful_soup
-    version_requirement:
-    version_requirements: !ruby/object:Gem::Version::Requirement
-      requirements:
-        -
-          - ">="
-          - !ruby/object:Gem::Version
-            version: 1.0.4
-      version:
+- !ruby/object:Gem::Dependency
+  name: ferret
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.3.2
+    version:
+- !ruby/object:Gem::Dependency
+  name: rubyful_soup
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.0.4
+    version: