RubyGems - gabrielg-indico - Versions diffs - 0.1.0 - Mend

gabrielg-indico 0.1.0

Files changed (29) hide show

data/CHANGELOG +1 -0
data/LICENSE +20 -0
data/Manifest +27 -0
data/README.markdown +7 -0
data/Rakefile +10 -0
data/indico.gemspec +42 -0
data/lib/indico.rb +4 -0
data/lib/thumblemonks/indico/extracted_document.rb +29 -0
data/lib/thumblemonks/indico/io_helpers.rb +31 -0
data/lib/thumblemonks/indico/text_extractor.rb +43 -0
data/lib/thumblemonks/indico/text_extractor/handlers/html_document.rb +34 -0
data/lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb +27 -0
data/lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb +30 -0
data/lib/thumblemonks/indico/text_extractor/handlers/text_document.rb +14 -0
data/lib/thumblemonks/indico/text_extractor/handlers/word_document.rb +30 -0
data/test/extractors/html_extraction_test.rb +28 -0
data/test/extractors/pdf_extraction_test.rb +21 -0
data/test/extractors/rtf_extraction_test.rb +21 -0
data/test/extractors/text_extraction_test.rb +20 -0
data/test/extractors/word_extraction_test.rb +21 -0
data/test/fixtures/resume_aardvark.doc +0 -0
data/test/fixtures/resume_detritus.rtf +11 -0
data/test/fixtures/resume_extirpate.html +34 -0
data/test/fixtures/resume_moribund.txt +5 -0
data/test/fixtures/resume_voracity.pdf +0 -0
data/test/indico_test.rb +43 -0
data/test/indico_test_fixture.rb +69 -0
data/test/test_helper.rb +9 -0
metadata +128 -0

data/CHANGELOG ADDED

@@ -0,0 +1 @@
+v0.1.0. Initial release.

data/LICENSE ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2008 [Gabriel Gironda]
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Manifest ADDED

@@ -0,0 +1,27 @@
+CHANGELOG
+lib/indico.rb
+lib/thumblemonks/indico/extracted_document.rb
+lib/thumblemonks/indico/io_helpers.rb
+lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
+lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
+lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
+lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
+lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
+lib/thumblemonks/indico/text_extractor.rb
+LICENSE
+Manifest
+Rakefile
+README.markdown
+test/extractors/html_extraction_test.rb
+test/extractors/pdf_extraction_test.rb
+test/extractors/rtf_extraction_test.rb
+test/extractors/text_extraction_test.rb
+test/extractors/word_extraction_test.rb
+test/fixtures/resume_aardvark.doc
+test/fixtures/resume_detritus.rtf
+test/fixtures/resume_extirpate.html
+test/fixtures/resume_moribund.txt
+test/fixtures/resume_voracity.pdf
+test/indico_test.rb
+test/indico_test_fixture.rb
+test/test_helper.rb

data/README.markdown ADDED

@@ -0,0 +1,7 @@
+# Indico
+This Gem has a whole bunch of external binary dependencies. You'll want:
+* pstotext installed for PDF support
+* perl and the CPAN module RTF-Parser installed for RTF support
+* antiword for Word support

data/Rakefile ADDED

@@ -0,0 +1,10 @@
+require 'echoe'
+Echoe.new('indico') do |p|
+  p.author = "Gabriel Gironda"
+  p.summary = "Gem for converting documents to plain text for indexing."
+  p.url = "http://github.com/gabrielg/indico"
+  p.install_message = "GROOVY GUYS CHOOSE THUMBLE MONKS"
+  p.runtime_dependencies = ["hpricot", "open4"]
+  # This horks my gems and I've got no idea why.
+  # p.development_dependencies = ["thoughtbot-shoulda >= 2.0.6"]
+end

data/indico.gemspec ADDED

@@ -0,0 +1,42 @@
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = %q{indico}
+  s.version = "0.1.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Gabriel Gironda"]
+  s.date = %q{2009-01-11}
+  s.description = %q{Gem for converting documents to plain text for indexing.}
+  s.email = %q{}
+  s.extra_rdoc_files = ["CHANGELOG", "lib/indico.rb", "lib/thumblemonks/indico/extracted_document.rb", "lib/thumblemonks/indico/io_helpers.rb", "lib/thumblemonks/indico/text_extractor/handlers/html_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/text_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/word_document.rb", "lib/thumblemonks/indico/text_extractor.rb", "LICENSE", "README.markdown"]
+  s.files = ["CHANGELOG", "lib/indico.rb", "lib/thumblemonks/indico/extracted_document.rb", "lib/thumblemonks/indico/io_helpers.rb", "lib/thumblemonks/indico/text_extractor/handlers/html_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/text_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/word_document.rb", "lib/thumblemonks/indico/text_extractor.rb", "LICENSE", "Manifest", "Rakefile", "README.markdown", "test/extractors/html_extraction_test.rb", "test/extractors/pdf_extraction_test.rb", "test/extractors/rtf_extraction_test.rb", "test/extractors/text_extraction_test.rb", "test/extractors/word_extraction_test.rb", "test/fixtures/resume_aardvark.doc", "test/fixtures/resume_detritus.rtf", "test/fixtures/resume_extirpate.html", "test/fixtures/resume_moribund.txt", "test/fixtures/resume_voracity.pdf", "test/indico_test.rb", "test/indico_test_fixture.rb", "test/test_helper.rb", "indico.gemspec"]
+  s.has_rdoc = true
+  s.homepage = %q{http://github.com/gabrielg/indico}
+  s.post_install_message = %q{GROOVY GUYS CHOOSE THUMBLE MONKS}
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Indico", "--main", "README.markdown"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = %q{indico}
+  s.rubygems_version = %q{1.3.1}
+  s.summary = %q{Gem for converting documents to plain text for indexing.}
+  s.test_files = ["test/extractors/html_extraction_test.rb", "test/extractors/pdf_extraction_test.rb", "test/extractors/rtf_extraction_test.rb", "test/extractors/text_extraction_test.rb", "test/extractors/word_extraction_test.rb", "test/indico_test.rb", "test/test_helper.rb"]
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 2
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<hpricot>, [">= 0"])
+      s.add_runtime_dependency(%q<open4>, [">= 0"])
+      s.add_development_dependency(%q<echoe>, [">= 0"])
+    else
+      s.add_dependency(%q<hpricot>, [">= 0"])
+      s.add_dependency(%q<open4>, [">= 0"])
+      s.add_dependency(%q<echoe>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<hpricot>, [">= 0"])
+    s.add_dependency(%q<open4>, [">= 0"])
+    s.add_dependency(%q<echoe>, [">= 0"])
+  end
+end

data/lib/indico.rb ADDED

@@ -0,0 +1,4 @@
+require 'thumblemonks/indico/text_extractor.rb'
+require 'thumblemonks/indico/extracted_document.rb'
+require 'thumblemonks/indico/io_helpers'
+Thumblemonks::Indico.load_handlers!

data/lib/thumblemonks/indico/extracted_document.rb ADDED

@@ -0,0 +1,29 @@
+module Thumblemonks
+  module Indico
+    class ExtractedDocument
+      class ExtractionError < ArgumentError; end
+      attr_reader :content
+      def initialize(content)
+        @content = content
+      end
+      def extract_text
+        @extracted_text ||= extract_text!
+      end
+      def extract_text!
+        raise NotImplementedError
+      end
+    private
+      def self.register(options)
+        raise(RuntimeError, "Register should only be called from subclasses") if self == ExtractedDocument
+        Indico.add_handler(self, options)
+      end
+    end # ExtractedDocument
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/io_helpers.rb ADDED

@@ -0,0 +1,31 @@
+require 'open4'
+require 'tempfile'
+module Thumblemonks
+  module Indico
+    module IOHelpers
+      def open_or_raise(command, exception = nil)
+        exception ||= RuntimeError
+        return_val = nil
+        status = Open4.popen4(command) do |pid,stdin,stdout,stderr|
+          return_val = yield(pid, stdin, stdout, stderr)
+        end
+        raise(exception) unless status.exitstatus.zero?
+        return_val
+      end
+      def open_with_tempfile_or_raise(content, exception = RuntimeError)
+        return_val = nil
+        Tempfile.open(self.class.name.gsub(/[^a-z]/i, '_').downcase) do |tf|
+          tf << content
+          tf.flush
+          cmd = yield(tf)
+          open_or_raise(cmd, exception) { |pid,stdin,stdout,stderr| return_val = stdout.read }
+        end
+        return_val
+      end
+    end # IOHelpers
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/text_extractor.rb ADDED

@@ -0,0 +1,43 @@
+module Thumblemonks
+  module Indico
+    class UnknownTypeError < ArgumentError; end
+    class << self
+      attr_accessor :known_types, :document_handlers
+    end
+    self.known_types, self.document_handlers = {}, {}
+    def self.type_known?(mime_type)
+      known_types.has_key?(mime_type)
+    end
+    def self.build_document(options)
+      pick_document_class(options[:type]).new(options[:content])
+    end
+    # Requires every handlers/*_document.rb file under text_extractor; invoked from lib/indico.rb when the gem is loaded.
+    def self.load_handlers!
+      Dir["#{handler_dir}/handlers/*_document.rb"].each {|handler| require(handler)}
+    end
+    def self.add_handler(handler_class, options)
+      document_handlers.merge!(options[:type] => handler_class)
+      known_types.merge!(options[:type] => Array(options[:extensions]))
+    end
+  private
+    def self.handler_dir
+      File.join(File.dirname(__FILE__), 'text_extractor')
+    end
+    def self.pick_document_class(mime_type)
+      raise UnknownTypeError unless type_known?(mime_type)
+      document_handler_for(mime_type)
+    end
+    def self.document_handler_for(mime_type)
+      document_handlers[mime_type]
+    end
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/text_extractor/handlers/html_document.rb ADDED

@@ -0,0 +1,34 @@
+require 'hpricot'
+module Thumblemonks
+  module Indico
+    class HTMLDocument < ExtractedDocument
+      def extract_text!
+        extract_from_html(content)
+      end
+    private
+      # FIXME - extract more than just the body text later
+      def extract_from_html(content)
+        body = Hpricot(content).search('body').first
+        strip_tags!(body)
+        body.search('*').inject('') do |content,element|
+          next(content) unless element.text?
+          content << element.to_plain_text
+        end
+      rescue => e
+        raise Thumblemonks::Indico::ExtractedDocument::ExtractionError
+      end
+      def strip_tags!(body_html)
+        bad_tags = %w[script style]
+        body_html.search(bad_tags.join(", ")).remove
+      end
+      register :type => 'text/html', :extensions => %w[html htm]
+      register :type => 'application/xhtml+xml', :extensions => %w[html htm xhtml htm]
+    end # HTMLDocument
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb ADDED

@@ -0,0 +1,27 @@
+module Thumblemonks
+  module Indico
+    class PDFDocument < ExtractedDocument
+      include IOHelpers
+      def extract_text!
+        extract_from_pdf(content)
+      end
+    private
+      def extract_from_pdf(content)
+        open_or_raise(pdf_converter, ExtractionError) do |pid,stdin,stdout,stderr|
+          stdin << content
+          stdin.close_write
+          stdout.read
+        end
+      end
+      def pdf_converter
+        'pstotext'
+      end
+      register :type => 'application/pdf', :extensions => %w[pdf]
+    end # PDFDocument
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb ADDED

@@ -0,0 +1,30 @@
+require 'pathname'
+require 'open4'
+module Thumblemonks
+  module Indico
+    class RTFDocument < ExtractedDocument
+      include IOHelpers
+      def extract_text!
+        extract_from_rtf(content)
+      end
+    private
+      def extract_from_rtf(content)
+        open_or_raise("perl #{rtf_converter}", ExtractionError) do |pid,stdin,stdout,stderr|
+          stdin << content
+          stdin.close_write
+          stdout.read
+        end
+      end
+      def rtf_converter
+        %q[-e 'use RTF::TEXT::Converter;RTF::TEXT::Converter->new(output => \*STDOUT)->parse_stream(\*STDIN);print "\n";']
+      end
+      register :type => 'application/rtf', :extensions => %w[rtf]
+    end # RTFDocument
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/text_extractor/handlers/text_document.rb ADDED

@@ -0,0 +1,14 @@
+module Thumblemonks
+  module Indico
+    class TextDocument < ExtractedDocument
+      # Plain text extractor really just passes through.
+      # TODO - make it convert the text to UTF-8?
+      def extract_text!
+        content.dup
+      end
+      register :type => 'text/plain', :extensions => %w[txt text]
+    end # TextDocument
+  end   # Indico
+end     # Thumblemonks

data/lib/thumblemonks/indico/text_extractor/handlers/word_document.rb ADDED

@@ -0,0 +1,30 @@
+require 'tempfile'
+module Thumblemonks
+  module Indico
+    class WordDocument < ExtractedDocument
+      include IOHelpers
+      def extract_text!
+        extract_from_doc(content)
+      end
+    private
+      # FIXME - I hate that I can't pass arguments when using IO.popen. Rather
+      # than pass a file path, do something with making a FIFO and reading/writing
+      # to it instead.
+      def extract_from_doc(content)
+        open_with_tempfile_or_raise(content, ExtractionError) do |tf|
+          %Q[#{word_converter} #{tf.path}]
+        end
+      end
+      def word_converter
+        'antiword'
+      end
+      register :type => 'application/msword', :extensions => %w[doc]
+    end # WordDocument
+  end   # Indico
+end     # Thumblemonks

data/test/extractors/html_extraction_test.rb ADDED

@@ -0,0 +1,28 @@
+require 'test_helper'
+class HTMLExtractionTest < Test::Unit::TestCase
+  include IndicoTestFixture
+  should_return_extractor 'HTMLDocument', :type => 'text/html'
+  should_return_extractor 'HTMLDocument', :type => 'application/xhtml+xml'
+  should "raise when extracting if Hpricot errs out" do
+    Hpricot.expects(:make).raises(RuntimeError)
+    assert_raise(Thumblemonks::Indico::ExtractedDocument::ExtractionError) do
+      Thumblemonks::Indico.build_document(:type => 'text/html', :content => 'foo').extract_text
+    end
+  end
+  context "with an extracted document" do
+    setup do
+      @extractor = extractor_for('text/html', 'resume_extirpate.html')
+    end
+    should_have_expected_content_with_word 'Extirpate'
+    should_cache_extracted_text
+    should_not_cache_extracted_text_when_using_bang_method
+  end # with an extracted document
+end

data/test/extractors/pdf_extraction_test.rb ADDED

@@ -0,0 +1,21 @@
+require 'test_helper'
+class PDFExtractionTest < Test::Unit::TestCase
+  include IndicoTestFixture
+  should_return_extractor 'PDFDocument', :type => 'application/pdf'
+  should_raise_when_extracting Thumblemonks::Indico::ExtractedDocument::ExtractionError, 'application/pdf', 'resume_aardvark.doc'
+  context "with an extracted document" do
+    setup do
+      @extractor = extractor_for('application/pdf', 'resume_voracity.pdf')
+    end
+    should_have_expected_content_with_word 'Voracity'
+    should_cache_extracted_text
+    should_not_cache_extracted_text_when_using_bang_method
+  end # with an extracted document
+end

data/test/extractors/rtf_extraction_test.rb ADDED

@@ -0,0 +1,21 @@
+require 'test_helper'
+class RTFExtractionTest < Test::Unit::TestCase
+  include IndicoTestFixture
+  should_return_extractor 'RTFDocument', :type => 'application/rtf'
+  should_raise_when_extracting Thumblemonks::Indico::ExtractedDocument::ExtractionError, 'application/rtf', 'resume_aardvark.doc'
+  context "with an extracted document" do
+    setup do
+      @extractor = extractor_for('application/rtf', 'resume_detritus.rtf')
+    end
+    should_have_expected_content_with_word 'Detritus'
+    should_cache_extracted_text
+    should_not_cache_extracted_text_when_using_bang_method
+  end # with an extracted document
+end

data/test/extractors/text_extraction_test.rb ADDED

@@ -0,0 +1,20 @@
+require 'test_helper'
+class TextExtractionTest < Test::Unit::TestCase
+  include IndicoTestFixture
+  should_return_extractor 'TextDocument', :type => 'text/plain'
+  context "with an extracted document" do
+    setup do
+      @extractor = extractor_for('text/plain', 'resume_moribund.txt')
+    end
+    should_have_expected_content_with_word 'Moribund'
+    should_cache_extracted_text
+    should_not_cache_extracted_text_when_using_bang_method
+  end # with an extracted document
+end

data/test/extractors/word_extraction_test.rb ADDED

@@ -0,0 +1,21 @@
+require 'test_helper'
+class WordExtractionTest < Test::Unit::TestCase
+  include IndicoTestFixture
+  should_return_extractor 'WordDocument', :type => 'application/msword'
+  should_raise_when_extracting Thumblemonks::Indico::ExtractedDocument::ExtractionError, 'application/msword', 'resume_voracity.pdf'
+  context "with an extracted document" do
+    setup do
+      @extractor = extractor_for('application/msword', 'resume_aardvark.doc')
+    end
+    should_have_expected_content_with_word 'Aardvark'
+    should_cache_extracted_text
+    should_not_cache_extracted_text_when_using_bang_method
+  end # with an extracted document
+end

data/test/fixtures/resume_aardvark.doc ADDED

Binary file

data/test/fixtures/resume_detritus.rtf ADDED

@@ -0,0 +1,11 @@
+{\rtf1\ansi\ansicpg1252\cocoartf949\cocoasubrtf350
+{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
+{\colortbl;\red255\green255\blue255;}
+\margl1440\margr1440\vieww9000\viewh8400\viewkind0
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\ql\qnatural\pardirnatural
+\f0\fs24 \cf0 Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\
+\
+Detritus\
+\
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.}

data/test/fixtures/resume_extirpate.html ADDED

@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+	"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+	<title>Lorem</title>
+</head>
+<body>
+  <script type="text/javascript">
+    var foo = "bar";
+  </script>
+  <style type="text/css">
+    form {
+      background-color: yellow;
+    }
+  </style>
+  <p>
+  Lorem ipsum do&#108;or sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est <em>laborum.</em>
+  </p>
+  <p>
+  Extirpate
+  </p>
+  <p>
+  Lorem ipsum dolor sit amet, <strong>consectetur</strong> adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+  </p>
+</body>
+</html>

data/test/fixtures/resume_moribund.txt ADDED

@@ -0,0 +1,5 @@
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+Moribund
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

data/test/fixtures/resume_voracity.pdf ADDED

Binary file

data/test/indico_test.rb ADDED

@@ -0,0 +1,43 @@
+require 'test_helper'
+class IndicoTest < Test::Unit::TestCase
+  context "type_known?()" do
+    should "return false if type not known" do
+      assert !Thumblemonks::Indico.type_known?('application/nope')
+    end
+    should "return true if type known" do
+      assert Thumblemonks::Indico.type_known?('application/rtf')
+    end
+  end # type_known?()
+  context "known_types()" do
+    should "return a hash of known_types keyed by mime_type" do
+      assert Thumblemonks::Indico.known_types.has_key?('application/rtf')
+    end
+    should "return a hash of known_types with values being an array of file extensions for the type" do
+      assert Thumblemonks::Indico.known_types['application/rtf'].include?('rtf')
+    end
+  end   # known_types()
+  context "build_document()" do
+    should "raise an exception when trying to build a document for an unknown type" do
+      assert_raise(Thumblemonks::Indico::UnknownTypeError) do
+        Thumblemonks::Indico.build_document(:type => 'application/foo', :content => 'what')
+      end
+    end
+    should "return a document when type is known" do
+      assert_not_nil Thumblemonks::Indico.build_document(:type => 'application/rtf', :content => 'what')
+    end
+  end # "build_document()"
+end

data/test/indico_test_fixture.rb ADDED

@@ -0,0 +1,69 @@
+module IndicoTestFixture
+  def self.included(base)
+    base.send(:include, InstanceMethods)
+    base.extend(ClassMethods)
+  end
+  module InstanceMethods
+    def extractor_for(mime_type, file_name)
+      content = File.read("#{FIXTURES_DIR}/#{file_name}")
+      Thumblemonks::Indico.build_document(:type => mime_type, :content => content)
+    end
+    def expected_words(unique_word)
+      expected = <<-EOE
+    Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    #{unique_word}
+    Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+  EOE
+      normalize_text(expected)
+    end
+    def normalize_text(string)
+      string.split(/\s+/m).reject{|p|p =~ /^\s*$/}.sort
+    end
+  end # InstanceMethods
+  module ClassMethods
+    def should_raise_when_extracting(exception, mime_type, unparseable_file)
+      should "raise '#{exception.name}' when trying to parse '#{unparseable_file}' as #{mime_type}" do
+        content = File.read("#{FIXTURES_DIR}/#{unparseable_file}")
+        extractor = Thumblemonks::Indico.build_document(:type => mime_type, :content => content)
+        assert_raise(exception) { extractor.extract_text }
+      end
+    end
+    def should_return_extractor(extractor_class_name, options)
+      should "return a '#{extractor_class_name}' for mime type '#{options[:type]}'" do
+        extractor = Thumblemonks::Indico.build_document(:type => options[:type], :content => 'foo')
+        assert_kind_of(Thumblemonks::Indico.const_get(extractor_class_name), extractor)
+      end
+    end
+    def should_have_expected_content_with_word(unique_word)
+      should "extract text with the word '#{unique_word}'" do
+        expected, extracted = expected_words(unique_word), normalize_text(@extractor.extract_text)
+        assert(expected == extracted, "Expected: #{expected.inspect}\nBut got: #{extracted.inspect}\nA difference of: #{(expected - extracted).inspect}\n, #{(extracted - expected).inspect}")
+      end
+    end
+    def should_cache_extracted_text
+      should "cache the extracted text when using a non bang method" do
+        assert_equal(@extractor.extract_text.object_id, @extractor.extract_text.object_id, "Expected same object")
+      end
+    end
+    def should_not_cache_extracted_text_when_using_bang_method
+      should "not cache the extracted text when using the bang method" do
+        assert_not_equal(@extractor.extract_text!.object_id, @extractor.extract_text!.object_id, "Did not expect same object")
+      end
+    end
+  end # ClassMethods
+end

data/test/test_helper.rb ADDED

@@ -0,0 +1,9 @@
+require 'rubygems'
+require 'test/unit'
+require 'shoulda'
+require 'mocha'
+$: << File.dirname(__FILE__) + "/.."
+$: << File.dirname(__FILE__) + "/../lib"
+require 'indico'
+require File.join(File.dirname(__FILE__), 'indico_test_fixture')
+FIXTURES_DIR = File.dirname(__FILE__) + "/fixtures"

metadata ADDED

@@ -0,0 +1,128 @@
+--- !ruby/object:Gem::Specification
+name: gabrielg-indico
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Gabriel Gironda
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-01-11 00:00:00 -08:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: open4
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: echoe
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description: Gem for converting documents to plain text for indexing.
+email: ""
+executables: []
+extensions: []
+extra_rdoc_files:
+- CHANGELOG
+- lib/indico.rb
+- lib/thumblemonks/indico/extracted_document.rb
+- lib/thumblemonks/indico/io_helpers.rb
+- lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
+- lib/thumblemonks/indico/text_extractor.rb
+- LICENSE
+- README.markdown
+files:
+- CHANGELOG
+- lib/indico.rb
+- lib/thumblemonks/indico/extracted_document.rb
+- lib/thumblemonks/indico/io_helpers.rb
+- lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
+- lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
+- lib/thumblemonks/indico/text_extractor.rb
+- LICENSE
+- Manifest
+- Rakefile
+- README.markdown
+- test/extractors/html_extraction_test.rb
+- test/extractors/pdf_extraction_test.rb
+- test/extractors/rtf_extraction_test.rb
+- test/extractors/text_extraction_test.rb
+- test/extractors/word_extraction_test.rb
+- test/fixtures/resume_aardvark.doc
+- test/fixtures/resume_detritus.rtf
+- test/fixtures/resume_extirpate.html
+- test/fixtures/resume_moribund.txt
+- test/fixtures/resume_voracity.pdf
+- test/indico_test.rb
+- test/indico_test_fixture.rb
+- test/test_helper.rb
+- indico.gemspec
+has_rdoc: true
+homepage: http://github.com/gabrielg/indico
+post_install_message: GROOVY GUYS CHOOSE THUMBLE MONKS
+rdoc_options:
+- --line-numbers
+- --inline-source
+- --title
+- Indico
+- --main
+- README.markdown
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "1.2"
+  version:
+requirements: []
+rubyforge_project: indico
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 2
+summary: Gem for converting documents to plain text for indexing.
+test_files:
+- test/extractors/html_extraction_test.rb
+- test/extractors/pdf_extraction_test.rb
+- test/extractors/rtf_extraction_test.rb
+- test/extractors/text_extraction_test.rb
+- test/extractors/word_extraction_test.rb
+- test/indico_test.rb
+- test/test_helper.rb