gabrielg-indico 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/LICENSE +20 -0
- data/Manifest +27 -0
- data/README.markdown +7 -0
- data/Rakefile +10 -0
- data/indico.gemspec +42 -0
- data/lib/indico.rb +4 -0
- data/lib/thumblemonks/indico/extracted_document.rb +29 -0
- data/lib/thumblemonks/indico/io_helpers.rb +31 -0
- data/lib/thumblemonks/indico/text_extractor.rb +43 -0
- data/lib/thumblemonks/indico/text_extractor/handlers/html_document.rb +34 -0
- data/lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb +27 -0
- data/lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb +30 -0
- data/lib/thumblemonks/indico/text_extractor/handlers/text_document.rb +14 -0
- data/lib/thumblemonks/indico/text_extractor/handlers/word_document.rb +30 -0
- data/test/extractors/html_extraction_test.rb +28 -0
- data/test/extractors/pdf_extraction_test.rb +21 -0
- data/test/extractors/rtf_extraction_test.rb +21 -0
- data/test/extractors/text_extraction_test.rb +20 -0
- data/test/extractors/word_extraction_test.rb +21 -0
- data/test/fixtures/resume_aardvark.doc +0 -0
- data/test/fixtures/resume_detritus.rtf +11 -0
- data/test/fixtures/resume_extirpate.html +34 -0
- data/test/fixtures/resume_moribund.txt +5 -0
- data/test/fixtures/resume_voracity.pdf +0 -0
- data/test/indico_test.rb +43 -0
- data/test/indico_test_fixture.rb +69 -0
- data/test/test_helper.rb +9 -0
- metadata +128 -0
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.1.0. Initial release.
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 [Gabriel Gironda]
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Manifest
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
CHANGELOG
|
2
|
+
lib/indico.rb
|
3
|
+
lib/thumblemonks/indico/extracted_document.rb
|
4
|
+
lib/thumblemonks/indico/io_helpers.rb
|
5
|
+
lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
|
6
|
+
lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
|
7
|
+
lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
|
8
|
+
lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
|
9
|
+
lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
|
10
|
+
lib/thumblemonks/indico/text_extractor.rb
|
11
|
+
LICENSE
|
12
|
+
Manifest
|
13
|
+
Rakefile
|
14
|
+
README.markdown
|
15
|
+
test/extractors/html_extraction_test.rb
|
16
|
+
test/extractors/pdf_extraction_test.rb
|
17
|
+
test/extractors/rtf_extraction_test.rb
|
18
|
+
test/extractors/text_extraction_test.rb
|
19
|
+
test/extractors/word_extraction_test.rb
|
20
|
+
test/fixtures/resume_aardvark.doc
|
21
|
+
test/fixtures/resume_detritus.rtf
|
22
|
+
test/fixtures/resume_extirpate.html
|
23
|
+
test/fixtures/resume_moribund.txt
|
24
|
+
test/fixtures/resume_voracity.pdf
|
25
|
+
test/indico_test.rb
|
26
|
+
test/indico_test_fixture.rb
|
27
|
+
test/test_helper.rb
|
data/README.markdown
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'echoe'
|
2
|
+
# Packaging configuration for the indico gem, expressed through Echoe.
Echoe.new('indico') do |gem|
  gem.author               = "Gabriel Gironda"
  gem.summary              = "Gem for converting documents to plain text for indexing."
  gem.url                  = "http://github.com/gabrielg/indico"
  gem.install_message      = "GROOVY GUYS CHOOSE THUMBLE MONKS"
  gem.runtime_dependencies = %w[hpricot open4]
  # Left disabled: declaring the development dependency broke the author's
  # gem setup for a reason that was never tracked down.
  # gem.development_dependencies = ["thoughtbot-shoulda >= 2.0.6"]
end
|
data/indico.gemspec
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for indico 0.1.0.
#
# NOTE(review): this looks like generated Echoe output (see the Rakefile), so
# regenerating it may overwrite hand edits - confirm before relying on them.
Gem::Specification.new do |s|
  # Library sources, in the order they are listed in the Manifest.
  lib_files = %w[
    lib/indico.rb
    lib/thumblemonks/indico/extracted_document.rb
    lib/thumblemonks/indico/io_helpers.rb
    lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
    lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
    lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
    lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
    lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
    lib/thumblemonks/indico/text_extractor.rb
  ]
  extractor_tests = %w[
    test/extractors/html_extraction_test.rb
    test/extractors/pdf_extraction_test.rb
    test/extractors/rtf_extraction_test.rb
    test/extractors/text_extraction_test.rb
    test/extractors/word_extraction_test.rb
  ]
  fixtures = %w[
    test/fixtures/resume_aardvark.doc
    test/fixtures/resume_detritus.rtf
    test/fixtures/resume_extirpate.html
    test/fixtures/resume_moribund.txt
    test/fixtures/resume_voracity.pdf
  ]

  s.name = %q{indico}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Gabriel Gironda"]
  s.date = %q{2009-01-11}
  s.description = %q{Gem for converting documents to plain text for indexing.}
  s.email = %q{}
  s.extra_rdoc_files = ["CHANGELOG"] + lib_files + ["LICENSE", "README.markdown"]
  s.files = ["CHANGELOG"] + lib_files +
            ["LICENSE", "Manifest", "Rakefile", "README.markdown"] +
            extractor_tests + fixtures +
            ["test/indico_test.rb", "test/indico_test_fixture.rb", "test/test_helper.rb", "indico.gemspec"]
  # has_rdoc= and rubyforge_project= are deprecated setters; only call them
  # where the running RubyGems still provides them.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.homepage = %q{http://github.com/gabrielg/indico}
  s.post_install_message = %q{GROOVY GUYS CHOOSE THUMBLE MONKS}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Indico", "--main", "README.markdown"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{indico} if s.respond_to?(:rubyforge_project=)
  s.rubygems_version = %q{1.3.1}
  s.summary = %q{Gem for converting documents to plain text for indexing.}
  s.test_files = extractor_tests + ["test/indico_test.rb", "test/test_helper.rb"]

  # Typed (runtime/development) dependencies need a specification version and
  # RubyGems >= 1.2.0. Gem::RubyGemsVersion is gone from current RubyGems, so
  # prefer Gem::VERSION and only fall back to the legacy constant.
  typed_dependencies = false
  if s.respond_to? :specification_version
    s.specification_version = 2
    running_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion
    typed_dependencies = Gem::Version.new(running_rubygems) >= Gem::Version.new('1.2.0')
  end

  if typed_dependencies
    s.add_runtime_dependency(%q<hpricot>, [">= 0"])
    s.add_runtime_dependency(%q<open4>, [">= 0"])
    s.add_development_dependency(%q<echoe>, [">= 0"])
  else
    s.add_dependency(%q<hpricot>, [">= 0"])
    s.add_dependency(%q<open4>, [">= 0"])
    s.add_dependency(%q<echoe>, [">= 0"])
  end
end
|
data/lib/indico.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module Thumblemonks
  module Indico
    # Abstract base class for a document whose plain text can be extracted.
    #
    # Subclasses implement #extract_text! and announce the MIME type they
    # handle by calling +register+ from their class body.
    class ExtractedDocument
      # Raised by handlers when the conversion to text fails.
      class ExtractionError < ArgumentError; end

      # The raw document content given to the constructor.
      attr_reader :content

      # content - the raw document data to extract text from.
      def initialize(content)
        @content = content
      end

      # Returns the extracted text, running the extraction only on the first
      # call and handing back the memoized result afterwards.
      def extract_text
        @extracted_text ||= extract_text!
      end

      # Runs the extraction without caching. Must be overridden by subclasses;
      # the base implementation raises NotImplementedError.
      def extract_text!
        raise NotImplementedError, "#{self.class.name} must implement extract_text!"
      end

      # Registers the calling subclass as the handler for options[:type]
      # (with options[:extensions]) via Indico.add_handler.
      #
      # Meant to be called from subclass bodies only; calling it on
      # ExtractedDocument itself raises RuntimeError. It is a public class
      # method: the bare `private` that used to precede it never applied to
      # `def self.` methods, so it has been dropped rather than left to
      # suggest a restriction that does not exist.
      def self.register(options)
        raise(RuntimeError, "Register should only be called from subclasses") if self == ExtractedDocument
        Indico.add_handler(self, options)
      end

    end # ExtractedDocument
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'open4'
|
2
|
+
require 'tempfile'
|
3
|
+
|
4
|
+
module Thumblemonks
  module Indico
    # Helpers for running external converter commands through Open4.
    module IOHelpers

      # Runs +command+ via Open4.popen4 and yields (pid, stdin, stdout, stderr)
      # to the given block.
      #
      # command   - command line string to execute.
      # exception - exception class to raise on failure (RuntimeError if nil).
      #
      # Returns whatever the block returned.
      # Raises +exception+ when the command did not finish successfully.
      def open_or_raise(command, exception = nil)
        exception ||= RuntimeError
        return_val = nil
        status = Open4.popen4(command) do |pid, stdin, stdout, stderr|
          return_val = yield(pid, stdin, stdout, stderr)
        end
        # success? is true only for a zero exit status and nil when the child
        # never exited normally (e.g. it was killed by a signal); in that case
        # exitstatus is nil, so calling zero? on it would blow up instead of
        # raising the requested exception.
        raise(exception) unless status.success?
        return_val
      end

      # Writes +content+ to a temporary file, yields the Tempfile so the block
      # can build a command line around its path, runs that command and
      # returns everything it wrote to stdout.
      #
      # content   - data to place in the temporary file.
      # exception - exception class to raise when the command fails.
      #
      # The temporary file is removed once the command has run, whether or
      # not it succeeded.
      def open_with_tempfile_or_raise(content, exception = RuntimeError)
        # name may be nil for anonymous classes, hence to_s.
        basename = self.class.name.to_s.gsub(/[^a-z]/i, '_').downcase
        tempfile = Tempfile.new(basename)
        begin
          tempfile.binmode # documents such as .doc are binary data
          tempfile << content
          tempfile.flush   # make sure the child process sees the full content
          command = yield(tempfile)
          open_or_raise(command, exception) { |_pid, _stdin, stdout, _stderr| stdout.read }
        ensure
          tempfile.close! # close and unlink so no temp files pile up
        end
      end

    end # IOHelpers
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Thumblemonks
  # Registry that maps MIME types to document handler classes and builds
  # handler instances for a given type.
  module Indico
    # Raised when a document is requested for an unregistered MIME type.
    class UnknownTypeError < ArgumentError; end

    class << self
      # known_types       - Hash of MIME type => Array of file extensions.
      # document_handlers - Hash of MIME type => handler class.
      attr_accessor :known_types, :document_handlers
    end
    # ||= keeps already-registered handlers if this file is loaded again.
    self.known_types ||= {}
    self.document_handlers ||= {}

    # Returns true when a handler has been registered for +mime_type+.
    def self.type_known?(mime_type)
      known_types.has_key?(mime_type)
    end

    # Builds a handler instance for options[:type] wrapping options[:content].
    #
    # Raises UnknownTypeError when no handler is registered for the type.
    def self.build_document(options)
      pick_document_class(options[:type]).new(options[:content])
    end

    # Called from an initializer in config/initializers
    #
    # Requires every *_document.rb file in the handlers directory; sorted so
    # the load order does not depend on the filesystem.
    def self.load_handlers!
      Dir["#{handler_dir}/handlers/*_document.rb"].sort.each { |handler| require(handler) }
    end

    # Registers +handler_class+ for options[:type] and records
    # options[:extensions] (a single value or an Array) as its extensions.
    def self.add_handler(handler_class, options)
      document_handlers.merge!(options[:type] => handler_class)
      known_types.merge!(options[:type] => Array(options[:extensions]))
    end

    # Internal helpers. They are public class methods: the bare `private`
    # that used to sit here has no effect on `def self.` definitions.

    # Directory containing the handlers/ folder, relative to this file.
    def self.handler_dir
      File.join(File.dirname(__FILE__), 'text_extractor')
    end

    # Returns the handler class for +mime_type+ or raises UnknownTypeError.
    def self.pick_document_class(mime_type)
      unless type_known?(mime_type)
        raise UnknownTypeError, "no document handler registered for type #{mime_type.inspect}"
      end
      document_handler_for(mime_type)
    end

    # Plain lookup of the handler class registered for +mime_type+.
    def self.document_handler_for(mime_type)
      document_handlers[mime_type]
    end
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'hpricot'
|
2
|
+
module Thumblemonks
  module Indico
    # Handler that extracts the text of an HTML/XHTML document's <body>
    # using Hpricot.
    class HTMLDocument < ExtractedDocument

      # Extracts the body text from the document content (uncached).
      #
      # Raises ExtractionError when the content cannot be processed.
      def extract_text!
        extract_from_html(content)
      end

      private

      # FIXME - extract more than just the body text later
      #
      # Concatenates the plain text of every text node under <body>, after
      # script and style elements have been removed.
      def extract_from_html(html)
        body = Hpricot(html).search('body').first
        raise ExtractionError, 'HTML document has no <body> element' if body.nil?
        strip_tags!(body)
        body.search('*').inject('') do |text, element|
          next(text) unless element.text?
          text << element.to_plain_text
        end
      rescue ExtractionError
        raise
      rescue => e
        # Keep the underlying reason instead of discarding it.
        raise ExtractionError, "could not extract text from HTML: #{e.message}"
      end

      # Removes elements whose content is not document text.
      def strip_tags!(body_html)
        bad_tags = %w[script style]
        body_html.search(bad_tags.join(", ")).remove
      end

      register :type => 'text/html', :extensions => %w[html htm]
      register :type => 'application/xhtml+xml', :extensions => %w[html htm xhtml]

    end # HTMLDocument
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Thumblemonks
  module Indico
    # Handler for PDF content: pipes it through the external converter
    # command and uses that command's standard output as the text.
    class PDFDocument < ExtractedDocument
      include IOHelpers

      # Runs the conversion on the document content (uncached).
      def extract_text!
        extract_from_pdf(content)
      end

      private

      # Feeds the PDF data to the converter's stdin, closes it to signal end
      # of input, then returns everything the converter printed.
      # ExtractionError is raised by open_or_raise if the command fails.
      def extract_from_pdf(pdf_data)
        open_or_raise(pdf_converter, ExtractionError) do |_pid, child_in, child_out, _child_err|
          child_in.write(pdf_data)
          child_in.close_write
          child_out.read
        end
      end

      # Name of the external command used for the conversion.
      def pdf_converter
        'pstotext'
      end

      register :type => 'application/pdf', :extensions => %w[pdf]
    end # PDFDocument
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'open4'
|
3
|
+
|
4
|
+
module Thumblemonks
  module Indico
    # Handler for RTF content: pipes it through a Perl one-liner and uses the
    # one-liner's standard output as the text.
    class RTFDocument < ExtractedDocument
      include IOHelpers

      # Runs the conversion on the document content (uncached).
      def extract_text!
        extract_from_rtf(content)
      end

      private

      # Writes the RTF data to the perl process' stdin, closes it to signal
      # end of input, then returns everything the process printed.
      # ExtractionError is raised by open_or_raise if the command fails.
      def extract_from_rtf(rtf_data)
        command = "perl #{rtf_converter}"
        open_or_raise(command, ExtractionError) do |_pid, child_in, child_out, _child_err|
          child_in.write(rtf_data)
          child_in.close_write
          child_out.read
        end
      end

      # Arguments handed to perl: an inline script that parses STDIN with
      # RTF::TEXT::Converter and writes the result to STDOUT.
      def rtf_converter
        %q[-e 'use RTF::TEXT::Converter;RTF::TEXT::Converter->new(output => \*STDOUT)->parse_stream(\*STDIN);print "\n";']
      end

      register :type => 'application/rtf', :extensions => %w[rtf]
    end # RTFDocument
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Thumblemonks
  module Indico
    # Handler for plain text: the document content already is the text.
    class TextDocument < ExtractedDocument
      register :type => 'text/plain', :extensions => %w[txt text]

      # Hands back a copy of the content, so each call yields a new object
      # and the stored content is not shared with the caller.
      # TODO - make it convert the text to UTF-8?
      def extract_text!
        content.dup
      end
    end # TextDocument
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'shellwords'
require 'tempfile'
|
2
|
+
|
3
|
+
module Thumblemonks
  module Indico
    # Handler for Word (.doc) content: writes it to a temporary file and runs
    # the external converter command on that file.
    class WordDocument < ExtractedDocument
      include IOHelpers

      # Runs the conversion on the document content (uncached).
      def extract_text!
        extract_from_doc(content)
      end

      private

      # FIXME - i hate that i cant pass arguments when using IO.popen. Rather
      # than pass a file path, do something with making a FIFO and reading/writing
      # to it instead.
      #
      # Returns the converter's standard output; ExtractionError is raised by
      # the IOHelpers method if the command fails.
      def extract_from_doc(content)
        open_with_tempfile_or_raise(content, ExtractionError) do |tf|
          # The path ends up in a command line string, so escape it: a temp
          # directory containing spaces or shell metacharacters would
          # otherwise split or alter the command.
          "#{word_converter} #{Shellwords.escape(tf.path)}"
        end
      end

      # Name of the external command used for the conversion.
      def word_converter
        'antiword'
      end

      register :type => 'application/msword', :extensions => %w[doc]
    end # WordDocument
  end # Indico
end # Thumblemonks
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises the HTML handler: handler lookup, error wrapping and extraction
# of the HTML fixture.
class HTMLExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  # Both registered MIME types must resolve to the HTML handler.
  %w[text/html application/xhtml+xml].each do |mime_type|
    should_return_extractor 'HTMLDocument', :type => mime_type
  end

  should "raise when extracting if Hpricot errs out" do
    Hpricot.expects(:make).raises(RuntimeError)
    assert_raise(Thumblemonks::Indico::ExtractedDocument::ExtractionError) do
      document = Thumblemonks::Indico.build_document(:type => 'text/html', :content => 'foo')
      document.extract_text
    end
  end

  context "with an extracted document" do
    setup { @extractor = extractor_for('text/html', 'resume_extirpate.html') }

    should_have_expected_content_with_word 'Extirpate'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises the PDF handler: handler lookup, failure on a non-PDF fixture and
# extraction of the PDF fixture.
class PDFExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'PDFDocument', :type => 'application/pdf'
  # A Word document is not parseable as PDF, so extraction must fail.
  should_raise_when_extracting(
    Thumblemonks::Indico::ExtractedDocument::ExtractionError,
    'application/pdf',
    'resume_aardvark.doc'
  )

  context "with an extracted document" do
    setup { @extractor = extractor_for('application/pdf', 'resume_voracity.pdf') }

    should_have_expected_content_with_word 'Voracity'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises the RTF handler: handler lookup, failure on a non-RTF fixture and
# extraction of the RTF fixture.
class RTFExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'RTFDocument', :type => 'application/rtf'
  # A Word document is not parseable as RTF, so extraction must fail.
  should_raise_when_extracting(
    Thumblemonks::Indico::ExtractedDocument::ExtractionError,
    'application/rtf',
    'resume_aardvark.doc'
  )

  context "with an extracted document" do
    setup { @extractor = extractor_for('application/rtf', 'resume_detritus.rtf') }

    should_have_expected_content_with_word 'Detritus'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises the plain-text handler: handler lookup and extraction of the
# text fixture.
class TextExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'TextDocument', :type => 'text/plain'

  context "with an extracted document" do
    setup { @extractor = extractor_for('text/plain', 'resume_moribund.txt') }

    should_have_expected_content_with_word 'Moribund'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises the Word handler: handler lookup, failure on a non-Word fixture
# and extraction of the Word fixture.
class WordExtractionTest < Test::Unit::TestCase
  include IndicoTestFixture

  should_return_extractor 'WordDocument', :type => 'application/msword'
  # A PDF is not parseable as a Word document, so extraction must fail.
  should_raise_when_extracting(
    Thumblemonks::Indico::ExtractedDocument::ExtractionError,
    'application/msword',
    'resume_voracity.pdf'
  )

  context "with an extracted document" do
    setup { @extractor = extractor_for('application/msword', 'resume_aardvark.doc') }

    should_have_expected_content_with_word 'Aardvark'
    should_cache_extracted_text
    should_not_cache_extracted_text_when_using_bang_method
  end # with an extracted document
end
|
Binary file
|
@@ -0,0 +1,11 @@
|
|
1
|
+
{\rtf1\ansi\ansicpg1252\cocoartf949\cocoasubrtf350
|
2
|
+
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
|
3
|
+
{\colortbl;\red255\green255\blue255;}
|
4
|
+
\margl1440\margr1440\vieww9000\viewh8400\viewkind0
|
5
|
+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\ql\qnatural\pardirnatural
|
6
|
+
|
7
|
+
\f0\fs24 \cf0 Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\
|
8
|
+
\
|
9
|
+
Detritus\
|
10
|
+
\
|
11
|
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
6
|
+
<head>
|
7
|
+
<title>Lorem</title>
|
8
|
+
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
<script type="text/javascript">
|
13
|
+
var foo = "bar";
|
14
|
+
</script>
|
15
|
+
<style type="text/css">
|
16
|
+
form {
|
17
|
+
background-color: yellow;
|
18
|
+
}
|
19
|
+
</style>
|
20
|
+
|
21
|
+
<p>
|
22
|
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est <em>laborum.</em>
|
23
|
+
</p>
|
24
|
+
|
25
|
+
<p>
|
26
|
+
Extirpate
|
27
|
+
</p>
|
28
|
+
|
29
|
+
<p>
|
30
|
+
Lorem ipsum dolor sit amet, <strong>consectetur</strong> adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
31
|
+
</p>
|
32
|
+
|
33
|
+
</body>
|
34
|
+
</html>
|
@@ -0,0 +1,5 @@
|
|
1
|
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
2
|
+
|
3
|
+
Moribund
|
4
|
+
|
5
|
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
Binary file
|
data/test/indico_test.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Tests for the Thumblemonks::Indico registry itself (type lookup, the
# known_types table and document construction).
class IndicoTest < Test::Unit::TestCase
  # Shorthand captured by the test blocks below.
  indico = Thumblemonks::Indico

  context "type_known?()" do
    should "return false if type not known" do
      assert !indico.type_known?('application/nope')
    end

    should "return true if type known" do
      assert indico.type_known?('application/rtf')
    end
  end # type_known?()

  context "known_types()" do
    should "return a hash of known_types keyed by mime_type" do
      types = indico.known_types
      assert types.has_key?('application/rtf')
    end

    should "return a hash of known_types with values being an array of file extensions for the type" do
      rtf_extensions = indico.known_types['application/rtf']
      assert rtf_extensions.include?('rtf')
    end
  end # known_types()

  context "build_document()" do
    should "raise an exception when trying to build a document for an unknown type" do
      assert_raise(Thumblemonks::Indico::UnknownTypeError) do
        indico.build_document(:type => 'application/foo', :content => 'what')
      end
    end

    should "return a document when type is known" do
      document = indico.build_document(:type => 'application/rtf', :content => 'what')
      assert_not_nil document
    end
  end # "build_document()"
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Shared helpers and test macros for the extractor tests. Including this
# module adds InstanceMethods to the test case and extends it with the
# ClassMethods macros.
module IndicoTestFixture

  def self.included(base)
    base.send(:include, InstanceMethods)
    base.extend(ClassMethods)
  end

  module InstanceMethods

    # Builds a document of +mime_type+ from the named file in FIXTURES_DIR.
    # The fixture is read in binary mode because some fixtures (.doc, .pdf)
    # are binary files and must reach the handler byte-for-byte.
    def extractor_for(mime_type, file_name)
      content = File.open(File.join(FIXTURES_DIR, file_name), 'rb') { |fixture| fixture.read }
      Thumblemonks::Indico.build_document(:type => mime_type, :content => content)
    end

    # Returns the sorted list of words every fixture is expected to contain:
    # the filler paragraph, the fixture's unique word, the filler paragraph.
    def expected_words(unique_word)
      filler = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
      normalize_text([filler, unique_word, filler].join("\n\n"))
    end

    # Splits +string+ on whitespace and returns the non-empty words, sorted,
    # so texts can be compared regardless of layout and word order.
    def normalize_text(string)
      string.split(/\s+/m).reject { |word| word.empty? }.sort
    end

  end # InstanceMethods

  module ClassMethods

    # Defines a test asserting that extracting +unparseable_file+ as
    # +mime_type+ raises +exception+.
    def should_raise_when_extracting(exception, mime_type, unparseable_file)
      should "raise '#{exception.name}' when trying to parse '#{unparseable_file}' as #{mime_type}" do
        extractor = extractor_for(mime_type, unparseable_file)
        assert_raise(exception) { extractor.extract_text }
      end
    end

    # Defines a test asserting that options[:type] is handled by the class
    # named +extractor_class_name+ inside Thumblemonks::Indico.
    def should_return_extractor(extractor_class_name, options)
      should "return a '#{extractor_class_name}' for mime type '#{options[:type]}'" do
        extractor = Thumblemonks::Indico.build_document(:type => options[:type], :content => 'foo')
        assert_kind_of(Thumblemonks::Indico.const_get(extractor_class_name), extractor)
      end
    end

    # Defines a test comparing the words extracted by @extractor with the
    # expected fixture words for +unique_word+.
    def should_have_expected_content_with_word(unique_word)
      should "extract text with the word '#{unique_word}'" do
        expected, extracted = expected_words(unique_word), normalize_text(@extractor.extract_text)
        assert(expected == extracted, "Expected: #{expected.inspect}\nBut got: #{extracted.inspect}\nA difference of: #{(expected - extracted).inspect}\n, #{(extracted - expected).inspect}")
      end
    end

    # Defines a test asserting that extract_text returns the same object twice.
    def should_cache_extracted_text
      should "cache the extracted text when using a non bang method" do
        assert_equal(@extractor.extract_text.object_id, @extractor.extract_text.object_id, "Expected same object")
      end
    end

    # Defines a test asserting that extract_text! returns a new object each time.
    def should_not_cache_extracted_text_when_using_bang_method
      should "not cache the extracted text when using the bang method" do
        assert_not_equal(@extractor.extract_text!.object_id, @extractor.extract_text!.object_id, "Did not expect same object")
      end
    end

  end # ClassMethods
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'test/unit'
|
3
|
+
require 'shoulda'
|
4
|
+
require 'mocha'
|
5
|
+
$: << File.dirname(__FILE__) + "/.."
|
6
|
+
$: << File.dirname(__FILE__) + "/../lib"
|
7
|
+
require 'indico'
|
8
|
+
require File.join(File.dirname(__FILE__), 'indico_test_fixture')
|
9
|
+
FIXTURES_DIR = File.dirname(__FILE__) + "/fixtures"
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gabrielg-indico
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gabriel Gironda
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-01-11 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: open4
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: "0"
|
32
|
+
version:
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: echoe
|
35
|
+
version_requirement:
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: "0"
|
41
|
+
version:
|
42
|
+
description: Gem for converting documents to plain text for indexing.
|
43
|
+
email: ""
|
44
|
+
executables: []
|
45
|
+
|
46
|
+
extensions: []
|
47
|
+
|
48
|
+
extra_rdoc_files:
|
49
|
+
- CHANGELOG
|
50
|
+
- lib/indico.rb
|
51
|
+
- lib/thumblemonks/indico/extracted_document.rb
|
52
|
+
- lib/thumblemonks/indico/io_helpers.rb
|
53
|
+
- lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
|
54
|
+
- lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
|
55
|
+
- lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
|
56
|
+
- lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
|
57
|
+
- lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
|
58
|
+
- lib/thumblemonks/indico/text_extractor.rb
|
59
|
+
- LICENSE
|
60
|
+
- README.markdown
|
61
|
+
files:
|
62
|
+
- CHANGELOG
|
63
|
+
- lib/indico.rb
|
64
|
+
- lib/thumblemonks/indico/extracted_document.rb
|
65
|
+
- lib/thumblemonks/indico/io_helpers.rb
|
66
|
+
- lib/thumblemonks/indico/text_extractor/handlers/html_document.rb
|
67
|
+
- lib/thumblemonks/indico/text_extractor/handlers/pdf_document.rb
|
68
|
+
- lib/thumblemonks/indico/text_extractor/handlers/rtf_document.rb
|
69
|
+
- lib/thumblemonks/indico/text_extractor/handlers/text_document.rb
|
70
|
+
- lib/thumblemonks/indico/text_extractor/handlers/word_document.rb
|
71
|
+
- lib/thumblemonks/indico/text_extractor.rb
|
72
|
+
- LICENSE
|
73
|
+
- Manifest
|
74
|
+
- Rakefile
|
75
|
+
- README.markdown
|
76
|
+
- test/extractors/html_extraction_test.rb
|
77
|
+
- test/extractors/pdf_extraction_test.rb
|
78
|
+
- test/extractors/rtf_extraction_test.rb
|
79
|
+
- test/extractors/text_extraction_test.rb
|
80
|
+
- test/extractors/word_extraction_test.rb
|
81
|
+
- test/fixtures/resume_aardvark.doc
|
82
|
+
- test/fixtures/resume_detritus.rtf
|
83
|
+
- test/fixtures/resume_extirpate.html
|
84
|
+
- test/fixtures/resume_moribund.txt
|
85
|
+
- test/fixtures/resume_voracity.pdf
|
86
|
+
- test/indico_test.rb
|
87
|
+
- test/indico_test_fixture.rb
|
88
|
+
- test/test_helper.rb
|
89
|
+
- indico.gemspec
|
90
|
+
has_rdoc: true
|
91
|
+
homepage: http://github.com/gabrielg/indico
|
92
|
+
post_install_message: GROOVY GUYS CHOOSE THUMBLE MONKS
|
93
|
+
rdoc_options:
|
94
|
+
- --line-numbers
|
95
|
+
- --inline-source
|
96
|
+
- --title
|
97
|
+
- Indico
|
98
|
+
- --main
|
99
|
+
- README.markdown
|
100
|
+
require_paths:
|
101
|
+
- lib
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: "0"
|
107
|
+
version:
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: "1.2"
|
113
|
+
version:
|
114
|
+
requirements: []
|
115
|
+
|
116
|
+
rubyforge_project: indico
|
117
|
+
rubygems_version: 1.2.0
|
118
|
+
signing_key:
|
119
|
+
specification_version: 2
|
120
|
+
summary: Gem for converting documents to plain text for indexing.
|
121
|
+
test_files:
|
122
|
+
- test/extractors/html_extraction_test.rb
|
123
|
+
- test/extractors/pdf_extraction_test.rb
|
124
|
+
- test/extractors/rtf_extraction_test.rb
|
125
|
+
- test/extractors/text_extraction_test.rb
|
126
|
+
- test/extractors/word_extraction_test.rb
|
127
|
+
- test/indico_test.rb
|
128
|
+
- test/test_helper.rb
|