gabrielg-indico 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ v0.1.0. Initial release.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 [Gabriel Gironda]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,27 @@
1
+ CHANGELOG
2
+ lib/indico.rb
3
+ lib/thumblemonks/indico/extracted_document.rb
4
+ lib/thumblemonks/indico/io_helpers.rb
5
+ lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
6
+ lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
7
+ lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
8
+ lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
9
+ lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
10
+ lib/thumblemonks/indico/text_extractor.rb
11
+ LICENSE
12
+ Manifest
13
+ Rakefile
14
+ README.markdown
15
+ test/extractors/html_extraction_test.rb
16
+ test/extractors/pdf_extraction_test.rb
17
+ test/extractors/rtf_extraction_test.rb
18
+ test/extractors/text_extraction_test.rb
19
+ test/extractors/word_extraction_test.rb
20
+ test/fixtures/resume_aardvark.doc
21
+ test/fixtures/resume_detritus.rtf
22
+ test/fixtures/resume_extirpate.html
23
+ test/fixtures/resume_moribund.txt
24
+ test/fixtures/resume_voracity.pdf
25
+ test/indico_test.rb
26
+ test/indico_test_fixture.rb
27
+ test/test_helper.rb
@@ -0,0 +1,7 @@
1
+ # Indico
2
+
3
+ This gem shells out to several external programs, which must be installed separately. You'll want:
4
+
5
+ * pstotext installed for PDF support
6
+ * perl and the CPAN module RTF-Parser installed for RTF support
7
+ * antiword for Word support
@@ -0,0 +1,10 @@
1
+ require 'echoe'
2
# Packaging settings for the 'indico' gem, handed to Echoe.
Echoe.new('indico') do |gem|
  gem.author               = "Gabriel Gironda"
  gem.summary              = "Gem for converting documents to plain text for indexing."
  gem.url                  = "http://github.com/gabrielg/indico"
  gem.install_message      = "GROOVY GUYS CHOOSE THUMBLE MONKS"
  gem.runtime_dependencies = %w[hpricot open4]
  # This horks my gems and I've got no idea why.
  # gem.development_dependencies = ["thoughtbot-shoulda >= 2.0.6"]
end
@@ -0,0 +1,42 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{indico}
5
+ s.version = "0.1.0"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["Gabriel Gironda"]
9
+ s.date = %q{2009-01-11}
10
+ s.description = %q{Gem for converting documents to plain text for indexing.}
11
+ s.email = %q{}
12
+ s.extra_rdoc_files = ["CHANGELOG", "lib/indico.rb", "lib/thumblemonks/indico/extracted_document.rb", "lib/thumblemonks/indico/io_helpers.rb", "lib/thumblemonks/indico/text_extractor/handlers/html_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/text_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/word_document.rb", "lib/thumblemonks/indico/text_extractor.rb", "LICENSE", "README.markdown"]
13
+ s.files = ["CHANGELOG", "lib/indico.rb", "lib/thumblemonks/indico/extracted_document.rb", "lib/thumblemonks/indico/io_helpers.rb", "lib/thumblemonks/indico/text_extractor/handlers/html_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/text_document.rb", "lib/thumblemonks/indico/text_extractor/handlers/word_document.rb", "lib/thumblemonks/indico/text_extractor.rb", "LICENSE", "Manifest", "Rakefile", "README.markdown", "test/extractors/html_extraction_test.rb", "test/extractors/pdf_extraction_test.rb", "test/extractors/rtf_extraction_test.rb", "test/extractors/text_extraction_test.rb", "test/extractors/word_extraction_test.rb", "test/fixtures/resume_aardvark.doc", "test/fixtures/resume_detritus.rtf", "test/fixtures/resume_extirpate.html", "test/fixtures/resume_moribund.txt", "test/fixtures/resume_voracity.pdf", "test/indico_test.rb", "test/indico_test_fixture.rb", "test/test_helper.rb", "indico.gemspec"]
14
+ s.has_rdoc = true
15
+ s.homepage = %q{http://github.com/gabrielg/indico}
16
+ s.post_install_message = %q{GROOVY GUYS CHOOSE THUMBLE MONKS}
17
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Indico", "--main", "README.markdown"]
18
+ s.require_paths = ["lib"]
19
+ s.rubyforge_project = %q{indico}
20
+ s.rubygems_version = %q{1.3.1}
21
+ s.summary = %q{Gem for converting documents to plain text for indexing.}
22
+ s.test_files = ["test/extractors/html_extraction_test.rb", "test/extractors/pdf_extraction_test.rb", "test/extractors/rtf_extraction_test.rb", "test/extractors/text_extraction_test.rb", "test/extractors/word_extraction_test.rb", "test/indico_test.rb", "test/test_helper.rb"]
23
+
24
+ if s.respond_to? :specification_version then
25
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26
+ s.specification_version = 2
27
+
28
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
29
+ s.add_runtime_dependency(%q<hpricot>, [">= 0"])
30
+ s.add_runtime_dependency(%q<open4>, [">= 0"])
31
+ s.add_development_dependency(%q<echoe>, [">= 0"])
32
+ else
33
+ s.add_dependency(%q<hpricot>, [">= 0"])
34
+ s.add_dependency(%q<open4>, [">= 0"])
35
+ s.add_dependency(%q<echoe>, [">= 0"])
36
+ end
37
+ else
38
+ s.add_dependency(%q<hpricot>, [">= 0"])
39
+ s.add_dependency(%q<open4>, [">= 0"])
40
+ s.add_dependency(%q<echoe>, [">= 0"])
41
+ end
42
+ end
@@ -0,0 +1,4 @@
1
# Entry point of the gem: loads the handler registry, the handler base class
# and the subprocess helpers, then pulls in every bundled document handler.
require 'thumblemonks/indico/text_extractor'
require 'thumblemonks/indico/extracted_document'
require 'thumblemonks/indico/io_helpers'

# Handlers subclass ExtractedDocument and may include IOHelpers, so they can
# only be loaded once the three requires above have run.
Thumblemonks::Indico.load_handlers!
@@ -0,0 +1,29 @@
1
module Thumblemonks
  module Indico
    # Base class for document handlers. A handler subclasses this, implements
    # #extract_text! and announces the MIME type it serves with .register.
    class ExtractedDocument
      # Raised when raw content cannot be converted to plain text.
      class ExtractionError < ArgumentError; end

      # The raw document content this instance was built with.
      attr_reader :content

      # content - the raw document body to extract text from.
      def initialize(content)
        @content = content
      end

      # Returns the extracted text, running the extraction on first use and
      # reusing that result afterwards. A nil/false result is not cached, so
      # the extraction is retried on the next call.
      def extract_text
        @extracted_text ||= extract_text!
      end

      # Performs the extraction without caching. Subclasses must override this.
      #
      # Raises NotImplementedError when called on a class that does not.
      def extract_text!
        raise NotImplementedError, "#{self.class.name} must implement extract_text!"
      end

      # Registers the calling subclass with Indico as the handler described by
      # options (forwarded unchanged to Indico.add_handler).
      #
      # This is a public class method: a bare `private` section does not affect
      # methods defined with `def self.`, so none is used here.
      #
      # Raises RuntimeError when invoked on ExtractedDocument itself.
      def self.register(options)
        if self == ExtractedDocument
          raise(RuntimeError, "Register should only be called from subclasses")
        end
        Indico.add_handler(self, options)
      end

    end # ExtractedDocument
  end # Indico
end # Thumblemonks
@@ -0,0 +1,31 @@
1
+ require 'open4'
2
+ require 'tempfile'
3
+
4
module Thumblemonks
  module Indico
    # Helpers for running an external converter through Open4 and turning a
    # failed run into an exception.
    module IOHelpers

      # Runs command via Open4.popen4 and yields (pid, stdin, stdout, stderr)
      # to the given block.
      #
      # command   - the command line handed to Open4.popen4.
      # exception - what to raise on failure; nil means RuntimeError.
      #
      # Returns the value of the block.
      # Raises exception when the command did not finish successfully.
      def open_or_raise(command, exception = nil)
        exception ||= RuntimeError
        block_value = nil
        status = Open4.popen4(command) do |pid, stdin, stdout, stderr|
          block_value = yield(pid, stdin, stdout, stderr)
        end
        # success? is nil (not true) when the child was killed by a signal; in
        # that case exitstatus is nil as well, so it must not be sent zero?.
        raise(exception) unless status.success?
        block_value
      end

      # Writes content to a temporary file, yields that file so the block can
      # build a command line around its path, then runs the command.
      #
      # content   - raw document bytes to hand to the converter.
      # exception - what to raise when the command fails.
      #
      # Returns everything the command wrote to stdout.
      def open_with_tempfile_or_raise(content, exception = RuntimeError)
        tempfile = Tempfile.new(self.class.name.gsub(/[^a-z]/i, '_').downcase)
        begin
          # Documents are binary data; avoid newline/encoding translation.
          tempfile.binmode
          tempfile << content
          # Make sure the bytes are on disk before the converter opens the path.
          tempfile.flush
          command = yield(tempfile)
          open_or_raise(command, exception) { |_pid, _stdin, stdout, _stderr| stdout.read }
        ensure
          # Close and delete right away instead of waiting for finalization.
          tempfile.close!
        end
      end

    end # IOHelpers
  end # Indico
end # Thumblemonks
@@ -0,0 +1,43 @@
1
module Thumblemonks
  # Registry that maps MIME types to the document handler classes able to
  # extract text from them.
  module Indico
    # Raised when a document is requested for a MIME type with no handler.
    class UnknownTypeError < ArgumentError; end

    class << self
      # known_types       - Hash of MIME type => Array of file extensions.
      # document_handlers - Hash of MIME type => handler class.
      attr_accessor :known_types, :document_handlers
    end
    self.known_types = {}
    self.document_handlers = {}

    # Returns true when a handler has been registered for mime_type.
    def self.type_known?(mime_type)
      known_types.key?(mime_type)
    end

    # Instantiates the handler registered for options[:type], passing it
    # options[:content].
    #
    # Raises UnknownTypeError when no handler is registered for the type.
    def self.build_document(options)
      pick_document_class(options[:type]).new(options[:content])
    end

    # Requires every bundled handler file (handlers/*_document.rb under the
    # text_extractor directory). lib/indico.rb calls this once its other
    # requires are done. Files are loaded in sorted order so the load order
    # does not depend on the platform's directory listing order.
    def self.load_handlers!
      Dir["#{handler_dir}/handlers/*_document.rb"].sort.each { |handler| require(handler) }
    end

    # Records handler_class as the handler for options[:type], together with
    # the file extension(s) given in options[:extensions].
    def self.add_handler(handler_class, options)
      document_handlers.merge!(options[:type] => handler_class)
      known_types.merge!(options[:type] => Array(options[:extensions]))
    end

    # The methods below are internal helpers. They are ordinary public module
    # methods: a bare `private` does not apply to `def self.` definitions, so
    # the ineffective keyword is not used here.

    # Directory that contains the bundled handlers/ directory.
    def self.handler_dir
      File.join(File.dirname(__FILE__), 'text_extractor')
    end

    # Returns the handler class for mime_type.
    #
    # Raises UnknownTypeError when the type has not been registered.
    def self.pick_document_class(mime_type)
      unless type_known?(mime_type)
        raise UnknownTypeError, "no document handler registered for type #{mime_type.inspect}"
      end
      document_handler_for(mime_type)
    end

    # Plain lookup of the handler class; nil when nothing is registered.
    def self.document_handler_for(mime_type)
      document_handlers[mime_type]
    end
  end # Indico
end # Thumblemonks
@@ -0,0 +1,34 @@
1
+ require 'hpricot'
2
module Thumblemonks
  module Indico
    # Handler for HTML and XHTML content, parsed with Hpricot.
    class HTMLDocument < ExtractedDocument

      # Returns the text found inside the document's <body>.
      #
      # Raises ExtractionError when the content cannot be processed.
      def extract_text!
        extract_from_html(content)
      end

      private

      # FIXME - extract more than just the body text later
      #
      # Concatenates the plain text of every text node under <body>, after
      # script and style elements have been removed. Any failure along the way
      # is re-raised as ExtractionError carrying the original error's message.
      def extract_from_html(html)
        body = Hpricot(html).search('body').first
        raise ExtractionError, "document has no <body> element" if body.nil?
        strip_tags!(body)
        body.search('*').inject('') do |text, element|
          next(text) unless element.text?
          text << element.to_plain_text
        end
      rescue => e
        raise ExtractionError, e.message
      end

      # Removes elements whose contents are not readable text.
      def strip_tags!(body_html)
        bad_tags = %w[script style]
        body_html.search(bad_tags.join(", ")).remove
      end

      register :type => 'text/html', :extensions => %w[html htm]
      register :type => 'application/xhtml+xml', :extensions => %w[html htm xhtml]

    end # HTMLDocument
  end # Indico
end # Thumblemonks
@@ -0,0 +1,27 @@
1
module Thumblemonks
  module Indico
    # Handler for PDF content. Conversion is delegated to the external
    # pstotext program, fed on stdin and read back from stdout.
    class PDFDocument < ExtractedDocument
      include IOHelpers

      # Returns the converter's output for the document content.
      #
      # Raises ExtractionError when the converter fails.
      def extract_text!
        extract_from_pdf(content)
      end

      private

      # Streams content into the converter while reading its output.
      #
      # The write runs on its own thread: writing the whole document before
      # reading anything can block forever once both pipe buffers are full.
      def extract_from_pdf(content)
        open_or_raise(pdf_converter, ExtractionError) do |_pid, stdin, stdout, _stderr|
          feeder = Thread.new do
            begin
              stdin << content
            rescue Errno::EPIPE
              # The converter stopped reading early; its exit status decides
              # whether this counts as a failure.
            ensure
              stdin.close_write
            end
          end
          text = stdout.read
          feeder.join
          text
        end
      end

      # Command used to convert a PDF to text.
      def pdf_converter
        'pstotext'
      end

      register :type => 'application/pdf', :extensions => %w[pdf]
    end # PDFDocument
  end # Indico
end # Thumblemonks
@@ -0,0 +1,30 @@
1
+ require 'pathname'
2
+ require 'open4'
3
+
4
module Thumblemonks
  module Indico
    # Handler for RTF content. Conversion is delegated to a perl one-liner
    # that uses RTF::TEXT::Converter, fed on stdin and read back from stdout.
    class RTFDocument < ExtractedDocument
      include IOHelpers

      # Returns the converter's output for the document content.
      #
      # Raises ExtractionError when the converter fails.
      def extract_text!
        extract_from_rtf(content)
      end

      private

      # Streams content into perl while reading its output.
      #
      # The write runs on its own thread: writing the whole document before
      # reading anything can block forever once both pipe buffers are full.
      def extract_from_rtf(content)
        open_or_raise("perl #{rtf_converter}", ExtractionError) do |_pid, stdin, stdout, _stderr|
          feeder = Thread.new do
            begin
              stdin << content
            rescue Errno::EPIPE
              # perl stopped reading early; its exit status decides whether
              # this counts as a failure.
            ensure
              stdin.close_write
            end
          end
          text = stdout.read
          feeder.join
          text
        end
      end

      # Arguments for perl: parse RTF from STDIN, write text to STDOUT and
      # finish with a newline.
      def rtf_converter
        %q[-e 'use RTF::TEXT::Converter;RTF::TEXT::Converter->new(output => \*STDOUT)->parse_stream(\*STDIN);print "\n";']
      end

      register :type => 'application/rtf', :extensions => %w[rtf]
    end # RTFDocument
  end # Indico
end # Thumblemonks
@@ -0,0 +1,14 @@
1
module Thumblemonks
  module Indico
    # Handler for plain text, which needs no conversion.
    class TextDocument < ExtractedDocument
      register :type => 'text/plain', :extensions => %w[txt text]

      # Hands back a copy of the content, so each call returns a new object.
      # TODO - make it convert the text to UTF-8?
      def extract_text!
        content.dup
      end

    end # TextDocument
  end # Indico
end # Thumblemonks
@@ -0,0 +1,30 @@
1
+ require 'tempfile'
2
+
3
require 'shellwords'

module Thumblemonks
  module Indico
    # Handler for Word (.doc) content. Conversion is delegated to the external
    # antiword program, which is given the path of a temporary copy.
    class WordDocument < ExtractedDocument
      include IOHelpers

      # Returns the converter's output for the document content.
      #
      # Raises ExtractionError when the converter fails.
      def extract_text!
        extract_from_doc(content)
      end

      private

      # FIXME - the document is handed over as a file path because arguments
      # cannot be passed when using IO.popen. Rather than pass a file path, do
      # something with making a FIFO and reading/writing to it instead.
      #
      # The path is shell-escaped so a temp directory containing spaces or
      # other shell metacharacters cannot break the command line.
      def extract_from_doc(content)
        open_with_tempfile_or_raise(content, ExtractionError) do |tf|
          "#{word_converter} #{Shellwords.escape(tf.path)}"
        end
      end

      # Command used to convert a Word document to text.
      def word_converter
        'antiword'
      end

      register :type => 'application/msword', :extensions => %w[doc]
    end # WordDocument
  end # Indico
end # Thumblemonks
@@ -0,0 +1,28 @@
1
+ require 'test_helper'
2
+
3
# Covers the HTML handler: lookup for both registered MIME types, error
# translation, and extraction of the HTML fixture.
class HTMLExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  %w[text/html application/xhtml+xml].each do |mime_type|
    should_return_extractor 'HTMLDocument', :type => mime_type
  end

  should "raise when extracting if Hpricot errs out" do
    # Force the parser to blow up and check the error is translated.
    Hpricot.expects(:make).raises(RuntimeError)
    document = Thumblemonks::Indico.build_document(:type => 'text/html', :content => 'foo')
    assert_raise(Thumblemonks::Indico::ExtractedDocument::ExtractionError) { document.extract_text }
  end

  context "with an extracted document" do
    setup { @extractor = extractor_for('text/html', 'resume_extirpate.html') }

    should_have_expected_content_with_word 'Extirpate'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document

end
@@ -0,0 +1,21 @@
1
+ require 'test_helper'
2
+
3
# Covers the PDF handler: lookup by MIME type, failure on a non-PDF fixture,
# and extraction of the PDF fixture.
class PDFExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'PDFDocument', :type => 'application/pdf'

  # The Word fixture is fed in as if it were a PDF; extraction must fail.
  should_raise_when_extracting Thumblemonks::Indico::ExtractedDocument::ExtractionError,
                               'application/pdf',
                               'resume_aardvark.doc'

  context "with an extracted document" do
    setup { @extractor = extractor_for('application/pdf', 'resume_voracity.pdf') }

    should_have_expected_content_with_word 'Voracity'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document

end
@@ -0,0 +1,21 @@
1
+ require 'test_helper'
2
+
3
# Covers the RTF handler: lookup by MIME type, failure on a non-RTF fixture,
# and extraction of the RTF fixture.
class RTFExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'RTFDocument', :type => 'application/rtf'

  # The Word fixture is fed in as if it were RTF; extraction must fail.
  should_raise_when_extracting Thumblemonks::Indico::ExtractedDocument::ExtractionError,
                               'application/rtf',
                               'resume_aardvark.doc'

  context "with an extracted document" do
    setup { @extractor = extractor_for('application/rtf', 'resume_detritus.rtf') }

    should_have_expected_content_with_word 'Detritus'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document

end
@@ -0,0 +1,20 @@
1
+ require 'test_helper'
2
+
3
# Covers the plain-text handler: lookup by MIME type and extraction of the
# text fixture.
class TextExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'TextDocument', :type => 'text/plain'

  context "with an extracted document" do
    setup { @extractor = extractor_for('text/plain', 'resume_moribund.txt') }

    should_have_expected_content_with_word 'Moribund'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document

end
@@ -0,0 +1,21 @@
1
+ require 'test_helper'
2
+
3
# Covers the Word handler: lookup by MIME type, failure on a non-Word fixture,
# and extraction of the Word fixture.
class WordExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'WordDocument', :type => 'application/msword'

  # The PDF fixture is fed in as if it were a Word file; extraction must fail.
  should_raise_when_extracting Thumblemonks::Indico::ExtractedDocument::ExtractionError,
                               'application/msword',
                               'resume_voracity.pdf'

  context "with an extracted document" do
    setup { @extractor = extractor_for('application/msword', 'resume_aardvark.doc') }

    should_have_expected_content_with_word 'Aardvark'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document

end
@@ -0,0 +1,11 @@
1
+ {\rtf1\ansi\ansicpg1252\cocoartf949\cocoasubrtf350
2
+ {\fonttbl\f0\fswiss\fcharset0 Helvetica;}
3
+ {\colortbl;\red255\green255\blue255;}
4
+ \margl1440\margr1440\vieww9000\viewh8400\viewkind0
5
+ \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\ql\qnatural\pardirnatural
6
+
7
+ \f0\fs24 \cf0 Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\
8
+ \
9
+ Detritus\
10
+ \
11
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.}
@@ -0,0 +1,34 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
3
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
4
+
5
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
6
+ <head>
7
+ <title>Lorem</title>
8
+
9
+ </head>
10
+
11
+ <body>
12
+ <script type="text/javascript">
13
+ var foo = "bar";
14
+ </script>
15
+ <style type="text/css">
16
+ form {
17
+ background-color: yellow;
18
+ }
19
+ </style>
20
+
21
+ <p>
22
+ Lorem ipsum do&#108;or sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est <em>laborum.</em>
23
+ </p>
24
+
25
+ <p>
26
+ Extirpate
27
+ </p>
28
+
29
+ <p>
30
+ Lorem ipsum dolor sit amet, <strong>consectetur</strong> adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
31
+ </p>
32
+
33
+ </body>
34
+ </html>
@@ -0,0 +1,5 @@
1
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
2
+
3
+ Moribund
4
+
5
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
@@ -0,0 +1,43 @@
1
+ require 'test_helper'
2
+
3
# Tests the registry API on Thumblemonks::Indico itself, using the RTF type
# as the example of a registered handler.
class IndicoTest < Test::Unit::TestCase

  context "type_known?()" do
    should "return false if type not known" do
      known = Thumblemonks::Indico.type_known?('application/nope')
      assert !known
    end

    should "return true if type known" do
      known = Thumblemonks::Indico.type_known?('application/rtf')
      assert known
    end
  end # type_known?()

  context "known_types()" do
    should "return a hash of known_types keyed by mime_type" do
      types = Thumblemonks::Indico.known_types
      assert types.has_key?('application/rtf')
    end

    should "return a hash of known_types with values being an array of file extensions for the type" do
      extensions = Thumblemonks::Indico.known_types['application/rtf']
      assert extensions.include?('rtf')
    end
  end # known_types()

  context "build_document()" do
    should "raise an exception when trying to build a document for an unknown type" do
      assert_raise(Thumblemonks::Indico::UnknownTypeError) do
        Thumblemonks::Indico.build_document(:type => 'application/foo', :content => 'what')
      end
    end

    should "return a document when type is known" do
      document = Thumblemonks::Indico.build_document(:type => 'application/rtf', :content => 'what')
      assert_not_nil document
    end
  end # "build_document()"

end
@@ -0,0 +1,69 @@
1
# Shared helpers and shoulda-style macros for the extractor test cases.
# Including the module adds the instance helpers and exposes the should_*
# macros on the including class.
module IndicoTestFixture

  def self.included(base)
    base.extend(ClassMethods)
    base.send(:include, InstanceMethods)
  end

  module InstanceMethods

    # Builds a document of mime_type from the named file in FIXTURES_DIR.
    def extractor_for(mime_type, file_name)
      fixture = File.read("#{FIXTURES_DIR}/#{file_name}")
      Thumblemonks::Indico.build_document(:type => mime_type, :content => fixture)
    end

    # The normalized words every fixture is expected to contain: the same
    # paragraph twice, with unique_word on its own line in between.
    def expected_words(unique_word)
      paragraph = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
      normalize_text([paragraph, unique_word.to_s, paragraph].join("\n\n"))
    end

    # Splits on whitespace, drops empty pieces and sorts, so comparisons
    # ignore layout and word order.
    def normalize_text(string)
      words = string.split(/\s+/)
      words.reject { |word| word.empty? }.sort
    end

  end # InstanceMethods

  module ClassMethods

    # Defines a test asserting that extracting unparseable_file as mime_type
    # raises exception.
    def should_raise_when_extracting(exception, mime_type, unparseable_file)
      should "raise '#{exception.name}' when trying to parse '#{unparseable_file}' as #{mime_type}" do
        raw = File.read("#{FIXTURES_DIR}/#{unparseable_file}")
        document = Thumblemonks::Indico.build_document(:type => mime_type, :content => raw)
        assert_raise(exception) { document.extract_text }
      end
    end

    # Defines a test asserting that options[:type] is served by the named
    # handler class.
    def should_return_extractor(extractor_class_name, options)
      should "return a '#{extractor_class_name}' for mime type '#{options[:type]}'" do
        document = Thumblemonks::Indico.build_document(:type => options[:type], :content => 'foo')
        expected_class = Thumblemonks::Indico.const_get(extractor_class_name)
        assert_kind_of(expected_class, document)
      end
    end

    # Defines a test comparing the normalized extracted text of @extractor
    # with the expected words; the failure message lists both differences.
    def should_have_expected_content_with_word(unique_word)
      should "extract text with the word '#{unique_word}'" do
        expected = expected_words(unique_word)
        extracted = normalize_text(@extractor.extract_text)
        assert(expected == extracted, "Expected: #{expected.inspect}\nBut got: #{extracted.inspect}\nA difference of: #{(expected - extracted).inspect}\n, #{(extracted - expected).inspect}")
      end
    end

    # Defines a test asserting that extract_text returns the same object on
    # repeated calls.
    def should_cache_extracted_text
      should "cache the extracted text when using a non bang method" do
        first_id = @extractor.extract_text.object_id
        second_id = @extractor.extract_text.object_id
        assert_equal(first_id, second_id, "Expected same object")
      end
    end

    # Defines a test asserting that extract_text! returns a new object on
    # every call.
    def should_not_cache_extracted_text_when_using_bang_method
      should "not cache the extracted text when using the bang method" do
        first_id = @extractor.extract_text!.object_id
        second_id = @extractor.extract_text!.object_id
        assert_not_equal(first_id, second_id, "Did not expect same object")
      end
    end

  end # ClassMethods
end
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'mocha'
5
+ $: << File.dirname(__FILE__) + "/.."
6
+ $: << File.dirname(__FILE__) + "/../lib"
7
+ require 'indico'
8
+ require File.join(File.dirname(__FILE__), 'indico_test_fixture')
9
+ FIXTURES_DIR = File.dirname(__FILE__) + "/fixtures"
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gabrielg-indico
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Gabriel Gironda
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-01-11 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: open4
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: "0"
32
+ version:
33
+ - !ruby/object:Gem::Dependency
34
+ name: echoe
35
+ version_requirement:
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: "0"
41
+ version:
42
+ description: Gem for converting documents to plain text for indexing.
43
+ email: ""
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ extra_rdoc_files:
49
+ - CHANGELOG
50
+ - lib/indico.rb
51
+ - lib/thumblemonks/indico/extracted_document.rb
52
+ - lib/thumblemonks/indico/io_helpers.rb
53
+ - lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
54
+ - lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
55
+ - lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
56
+ - lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
57
+ - lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
58
+ - lib/thumblemonks/indico/text_extractor.rb
59
+ - LICENSE
60
+ - README.markdown
61
+ files:
62
+ - CHANGELOG
63
+ - lib/indico.rb
64
+ - lib/thumblemonks/indico/extracted_document.rb
65
+ - lib/thumblemonks/indico/io_helpers.rb
66
+ - lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
67
+ - lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
68
+ - lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
69
+ - lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
70
+ - lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
71
+ - lib/thumblemonks/indico/text_extractor.rb
72
+ - LICENSE
73
+ - Manifest
74
+ - Rakefile
75
+ - README.markdown
76
+ - test/extractors/html_extraction_test.rb
77
+ - test/extractors/pdf_extraction_test.rb
78
+ - test/extractors/rtf_extraction_test.rb
79
+ - test/extractors/text_extraction_test.rb
80
+ - test/extractors/word_extraction_test.rb
81
+ - test/fixtures/resume_aardvark.doc
82
+ - test/fixtures/resume_detritus.rtf
83
+ - test/fixtures/resume_extirpate.html
84
+ - test/fixtures/resume_moribund.txt
85
+ - test/fixtures/resume_voracity.pdf
86
+ - test/indico_test.rb
87
+ - test/indico_test_fixture.rb
88
+ - test/test_helper.rb
89
+ - indico.gemspec
90
+ has_rdoc: true
91
+ homepage: http://github.com/gabrielg/indico
92
+ post_install_message: GROOVY GUYS CHOOSE THUMBLE MONKS
93
+ rdoc_options:
94
+ - --line-numbers
95
+ - --inline-source
96
+ - --title
97
+ - Indico
98
+ - --main
99
+ - README.markdown
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: "0"
107
+ version:
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: "1.2"
113
+ version:
114
+ requirements: []
115
+
116
+ rubyforge_project: indico
117
+ rubygems_version: 1.2.0
118
+ signing_key:
119
+ specification_version: 2
120
+ summary: Gem for converting documents to plain text for indexing.
121
+ test_files:
122
+ - test/extractors/html_extraction_test.rb
123
+ - test/extractors/pdf_extraction_test.rb
124
+ - test/extractors/rtf_extraction_test.rb
125
+ - test/extractors/text_extraction_test.rb
126
+ - test/extractors/word_extraction_test.rb
127
+ - test/indico_test.rb
128
+ - test/test_helper.rb