textractor 0.0.3 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +17 -0
- data/README.md +7 -7
- data/Rakefile +2 -21
- data/lib/textractor.rb +54 -8
- data/lib/textractor/extractors.rb +12 -0
- data/lib/textractor/extractors/doc_extractor.rb +29 -0
- data/lib/textractor/extractors/docx_extractor.rb +28 -0
- data/lib/textractor/extractors/pdf_extractor.rb +11 -0
- data/lib/textractor/extractors/text_extractor.rb +11 -0
- data/lib/textractor/extractors/word_extractor.rb +29 -0
- data/lib/textractor/version.rb +3 -0
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/document.txt +1 -1
- data/spec/fixtures/no_extension +0 -0
- data/spec/integration/textractor_spec.rb +58 -0
- data/spec/spec_helper.rb +10 -4
- data/spec/textractor_spec.rb +76 -14
- data/textractor.gemspec +18 -71
- metadata +45 -20
- data/VERSION +0 -1
- data/lib/textractor/document.rb +0 -66
- data/spec/document_spec.rb +0 -94
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/README.md
CHANGED
@@ -28,18 +28,18 @@ this may not work on all systems due to dependency issues.
|
|
28
28
|
Due to textractor's reliance on command-line tools, all the methods in
|
29
29
|
textractor work on paths, not File objects.
|
30
30
|
|
31
|
-
|
32
|
-
document.text # => "Ruby on rails developer"
|
33
|
-
|
34
|
-
There is also a convenience method on Textractor.
|
35
|
-
|
36
|
-
Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
|
31
|
+
Textractor.text_from_path(path_to_document) # => "Ruby on rails developer"
|
37
32
|
|
38
33
|
Textractor will attempt to guess what type of document you're trying
|
39
34
|
to extract text from. However, if you know the content type of your
|
40
35
|
document, you can provide it and Textractor won't guess.
|
41
36
|
|
42
|
-
Textractor.
|
37
|
+
Textractor.text_from_path(path_to_document, :content_type => "application/msword")
|
38
|
+
|
39
|
+
## TODO
|
40
|
+
|
41
|
+
* Remove vendored docx2txt perl script
|
42
|
+
* Replace as much as possible with pure ruby
|
43
43
|
|
44
44
|
## Note on Patches/Pull Requests
|
45
45
|
|
data/Rakefile
CHANGED
@@ -1,22 +1,5 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "textractor"
|
8
|
-
gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
|
9
|
-
gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
|
10
|
-
gem.email = "mguterl@gmail.com"
|
11
|
-
gem.homepage = "http://github.com/mguterl/textractor"
|
12
|
-
gem.authors = ["Michael Guterl"]
|
13
|
-
gem.add_development_dependency "rspec", ">= 1.3.0"
|
14
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
-
end
|
16
|
-
Jeweler::GemcutterTasks.new
|
17
|
-
rescue LoadError
|
18
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
19
|
-
end
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
20
3
|
|
21
4
|
require 'spec/rake/spectask'
|
22
5
|
Spec::Rake::SpecTask.new(:spec) do |spec|
|
@@ -30,8 +13,6 @@ Spec::Rake::SpecTask.new(:rcov) do |spec|
|
|
30
13
|
spec.rcov = true
|
31
14
|
end
|
32
15
|
|
33
|
-
task :spec => :check_dependencies
|
34
|
-
|
35
16
|
task :default => :spec
|
36
17
|
|
37
18
|
require 'rake/rdoctask'
|
data/lib/textractor.rb
CHANGED
@@ -1,18 +1,64 @@
|
|
1
1
|
module Textractor
|
2
|
-
autoload :Document, "textractor/document"
|
3
2
|
|
4
|
-
|
5
|
-
|
3
|
+
UnknownContentType = Class.new(StandardError)
|
4
|
+
FileNotFound = Class.new(StandardError)
|
5
|
+
ContentTypeAlreadyRegistered = Class.new(StandardError)
|
6
|
+
ContentTypeNotRegistered = Class.new(StandardError)
|
7
|
+
|
8
|
+
autoload :Extractors, "textractor/extractors"
|
9
|
+
|
10
|
+
def self.text_from_path(path, options = {})
|
11
|
+
raise FileNotFound unless File.exists?(path)
|
12
|
+
content_type = options.fetch(:content_type) { content_type_for_path(path) }
|
13
|
+
extractor_class = extractor_for_content_type(content_type)
|
14
|
+
extractor = extractor_class.new
|
15
|
+
|
16
|
+
extractor.text_from_path(path)
|
6
17
|
end
|
7
18
|
|
8
|
-
|
19
|
+
def self.content_type_for_path(path)
|
20
|
+
case File.extname(path)
|
21
|
+
when /\.pdf$/
|
22
|
+
'application/pdf'
|
23
|
+
when /\.doc$/
|
24
|
+
'application/msword'
|
25
|
+
when /\.docx$/
|
26
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
27
|
+
when /\.txt$/
|
28
|
+
'text/plain'
|
29
|
+
else
|
30
|
+
raise UnknownContentType, "unable to determine content type for #{path}"
|
31
|
+
end
|
32
|
+
end
|
9
33
|
|
10
|
-
def self.
|
11
|
-
|
34
|
+
def self.register_content_type(content_type, extractor)
|
35
|
+
raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
|
36
|
+
extractors[content_type] = extractor
|
12
37
|
end
|
13
38
|
|
14
|
-
def self.
|
15
|
-
|
39
|
+
def self.remove_content_type(content_type)
|
40
|
+
extractors.delete content_type
|
16
41
|
end
|
17
42
|
|
43
|
+
def self.extractor_for_content_type(content_type)
|
44
|
+
extractors[content_type] or raise ContentTypeNotRegistered, "#{content_type} is not registered with Textractor"
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.extractors
|
48
|
+
@extractors ||= {}
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.clear_registry
|
52
|
+
@extractors = {}
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.register_basic_types
|
56
|
+
register_content_type("application/pdf", Extractors::PDFExtractor)
|
57
|
+
register_content_type("application/msword", Extractors::DocExtractor)
|
58
|
+
register_content_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document", Extractors::DocxExtractor)
|
59
|
+
register_content_type("text/plain", Extractors::TextExtractor)
|
60
|
+
end
|
61
|
+
|
62
|
+
register_basic_types
|
63
|
+
|
18
64
|
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Textractor
|
2
|
+
|
3
|
+
module Extractors
|
4
|
+
|
5
|
+
autoload :PDFExtractor, 'textractor/extractors/pdf_extractor'
|
6
|
+
autoload :DocExtractor, 'textractor/extractors/doc_extractor'
|
7
|
+
autoload :DocxExtractor, 'textractor/extractors/docx_extractor'
|
8
|
+
autoload :TextExtractor, 'textractor/extractors/text_extractor'
|
9
|
+
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Textractor::Extractors
|
2
|
+
|
3
|
+
class DocExtractor
|
4
|
+
|
5
|
+
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_writer :wvText_path
|
9
|
+
|
10
|
+
def wvText_path
|
11
|
+
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def text_from_path(path)
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
17
|
+
puts command if $DEBUG
|
18
|
+
`#{command}`.strip
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def wvText_path
|
24
|
+
self.class.wvText_path
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Textractor::Extractors
|
2
|
+
|
3
|
+
class DocxExtractor
|
4
|
+
|
5
|
+
DEFAULT_DOCX2TXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../vendor/docx2txt/docx2txt.pl").freeze
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_writer :docx2txt_path
|
9
|
+
|
10
|
+
def docx2txt_path
|
11
|
+
@docx2txt_path || DEFAULT_DOCX2TXT_PATH
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
def text_from_path(path)
|
17
|
+
`#{docx2txt_path} #{path} -`.strip
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
def docx2txt_path
|
23
|
+
self.class.docx2txt_path
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Textractor::Extractors
|
2
|
+
|
3
|
+
class WordExtractor
|
4
|
+
|
5
|
+
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_writer :wvText_path
|
9
|
+
|
10
|
+
def wvText_path
|
11
|
+
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def text_from_path(path)
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
17
|
+
puts command if $DEBUG
|
18
|
+
`#{command}`.strip
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def wvText_path
|
24
|
+
self.class.wvText_path
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
data/spec/fixtures/document.doc
CHANGED
Binary file
|
data/spec/fixtures/document.docx
CHANGED
Binary file
|
data/spec/fixtures/document.pdf
CHANGED
Binary file
|
data/spec/fixtures/document.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
text
|
Binary file
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Textractor do
|
4
|
+
|
5
|
+
before do
|
6
|
+
Textractor.clear_registry
|
7
|
+
Textractor.register_basic_types
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'returns the contents of word (.doc) documents' do
|
11
|
+
Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'returns the contents of word (.docx) documents' do
|
15
|
+
Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'returns the contents of pdf documents' do
|
19
|
+
Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'returns the contents of text documents' do
|
23
|
+
Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'allows the user to specify content type to avoid internal resolution' do
|
27
|
+
Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'raises an exception when the content type is unable to be determined' do
|
31
|
+
expect {
|
32
|
+
Textractor.text_from_path(fixture_path("no_extension"))
|
33
|
+
}.to raise_error(Textractor::UnknownContentType)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'raises an exception when the path specified does not exist' do
|
37
|
+
expect {
|
38
|
+
Textractor.text_from_path('non-existant')
|
39
|
+
}.to raise_error(Textractor::FileNotFound)
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'raises an exception when there is no extractor defined for the content type' do
|
43
|
+
Textractor.clear_registry
|
44
|
+
|
45
|
+
expect {
|
46
|
+
Textractor.text_from_path(fixture_path('document.pdf'))
|
47
|
+
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'allows content type extractors to be removed' do
|
51
|
+
Textractor.remove_content_type("application/pdf")
|
52
|
+
|
53
|
+
expect {
|
54
|
+
Textractor.text_from_path(fixture_path('document.pdf'))
|
55
|
+
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,9 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'spec'
|
4
|
+
|
1
5
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
-
|
6
|
+
|
3
7
|
require 'textractor'
|
4
|
-
|
5
|
-
|
8
|
+
|
9
|
+
def fixture_path(path)
|
10
|
+
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
|
11
|
+
end
|
6
12
|
|
7
13
|
Spec::Runner.configure do |config|
|
8
|
-
|
14
|
+
|
9
15
|
end
|
data/spec/textractor_spec.rb
CHANGED
@@ -1,32 +1,94 @@
|
|
1
1
|
require 'spec/spec_helper'
|
2
2
|
|
3
|
+
class TestExtractor
|
4
|
+
|
5
|
+
def text_from_path(path)
|
6
|
+
path
|
7
|
+
end
|
8
|
+
|
9
|
+
end
|
10
|
+
|
3
11
|
describe Textractor do
|
4
12
|
|
5
|
-
|
13
|
+
before do
|
14
|
+
Textractor.clear_registry
|
15
|
+
end
|
16
|
+
|
17
|
+
describe ".text_from_path" do
|
18
|
+
before do
|
19
|
+
File.stub(:exists?).and_return(true)
|
20
|
+
Textractor.stub(:content_type_for_path).and_return('test')
|
21
|
+
Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'extracts the text from a given path' do
|
25
|
+
Textractor.text_from_path('document').should == 'document'
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'uses content_type_for_path to determine the content type' do
|
29
|
+
Textractor.should_receive(:content_type_for_path).with('document')
|
30
|
+
Textractor.text_from_path('document')
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'uses extractor_for_content_type to look up the correct extractor' do
|
34
|
+
Textractor.should_receive(:extractor_for_content_type).with('test')
|
35
|
+
Textractor.text_from_path('document')
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
describe ".register_content_type" do
|
41
|
+
|
42
|
+
it 'raises an exception if an extractor is already defined for that content type' do
|
43
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
44
|
+
|
45
|
+
expect {
|
46
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
47
|
+
}.to raise_error(Textractor::ContentTypeAlreadyRegistered)
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
6
51
|
|
7
|
-
|
8
|
-
|
52
|
+
describe ".extractor_for_content_type" do
|
53
|
+
before do
|
54
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
9
55
|
end
|
10
56
|
|
11
|
-
it '
|
12
|
-
Textractor.
|
13
|
-
Textractor.wvText_path.should == "foo.bar"
|
57
|
+
it 'returns the extractor for the content type' do
|
58
|
+
Textractor.extractor_for_content_type("text/plain").should == TestExtractor
|
14
59
|
end
|
15
60
|
|
61
|
+
it 'raises an exception when no extractor is defined for that content type' do
|
62
|
+
expect {
|
63
|
+
Textractor.extractor_for_content_type("unknown")
|
64
|
+
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
65
|
+
end
|
16
66
|
end
|
17
67
|
|
18
|
-
describe ".
|
68
|
+
describe ".content_type_for_path" do
|
19
69
|
|
20
|
-
it '
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
70
|
+
it 'returns the content type based on the file extension' do
|
71
|
+
Textractor.content_type_for_path("document.pdf").should == "application/pdf"
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'raises an exception if it cannot determine the content type' do
|
75
|
+
expect {
|
76
|
+
Textractor.content_type_for_path('unknown')
|
77
|
+
}.to raise_error(Textractor::UnknownContentType)
|
25
78
|
end
|
26
79
|
|
27
80
|
end
|
28
81
|
|
29
|
-
|
30
|
-
|
82
|
+
describe ".clear_registry" do
|
83
|
+
before do
|
84
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
85
|
+
end
|
86
|
+
|
87
|
+
it 'clears the registered content types and their respective extractors' do
|
88
|
+
Textractor.clear_registry
|
89
|
+
Textractor.extractors.should be_empty
|
90
|
+
end
|
91
|
+
|
31
92
|
end
|
93
|
+
|
32
94
|
end
|
data/textractor.gemspec
CHANGED
@@ -1,78 +1,25 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path("../lib/textractor/version", __FILE__)
|
5
3
|
|
6
4
|
Gem::Specification.new do |s|
|
7
|
-
s.name
|
8
|
-
s.version
|
5
|
+
s.name = "textractor"
|
6
|
+
s.version = Textractor::VERSION
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.authors = ['Michael Guterl']
|
9
|
+
s.email = ['michael@diminishing.org']
|
10
|
+
s.homepage = "http://github.com/mguterl/textractor"
|
11
|
+
s.summary = "simple wrapper around CLI for extracting text from PDF and Word documents"
|
12
|
+
s.description = "simple wrapper around CLI for extracting text from PDF and Word documents"
|
9
13
|
|
10
|
-
s.required_rubygems_version =
|
11
|
-
s.
|
12
|
-
s.date = %q{2010-07-27}
|
13
|
-
s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
|
14
|
-
s.email = %q{mguterl@gmail.com}
|
15
|
-
s.extra_rdoc_files = [
|
16
|
-
"LICENSE",
|
17
|
-
"README.md"
|
18
|
-
]
|
19
|
-
s.files = [
|
20
|
-
".document",
|
21
|
-
".gitignore",
|
22
|
-
"LICENSE",
|
23
|
-
"README.md",
|
24
|
-
"Rakefile",
|
25
|
-
"VERSION",
|
26
|
-
"lib/textractor.rb",
|
27
|
-
"lib/textractor/document.rb",
|
28
|
-
"spec/document_spec.rb",
|
29
|
-
"spec/fixtures/document.doc",
|
30
|
-
"spec/fixtures/document.docx",
|
31
|
-
"spec/fixtures/document.pdf",
|
32
|
-
"spec/fixtures/document.txt",
|
33
|
-
"spec/spec.opts",
|
34
|
-
"spec/spec_helper.rb",
|
35
|
-
"spec/textractor_spec.rb",
|
36
|
-
"support/wvText.xml",
|
37
|
-
"textractor.gemspec",
|
38
|
-
"vendor/docx2txt/AUTHORS",
|
39
|
-
"vendor/docx2txt/BSDmakefile",
|
40
|
-
"vendor/docx2txt/COPYING",
|
41
|
-
"vendor/docx2txt/ChangeLog",
|
42
|
-
"vendor/docx2txt/INSTALL",
|
43
|
-
"vendor/docx2txt/Makefile",
|
44
|
-
"vendor/docx2txt/README",
|
45
|
-
"vendor/docx2txt/ToDo",
|
46
|
-
"vendor/docx2txt/VERSION",
|
47
|
-
"vendor/docx2txt/WInstall.bat",
|
48
|
-
"vendor/docx2txt/docx2txt.bat",
|
49
|
-
"vendor/docx2txt/docx2txt.config",
|
50
|
-
"vendor/docx2txt/docx2txt.pl",
|
51
|
-
"vendor/docx2txt/docx2txt.sh",
|
52
|
-
"vendor/docx2txt/resume.docx"
|
53
|
-
]
|
54
|
-
s.homepage = %q{http://github.com/mguterl/textractor}
|
55
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
56
|
-
s.require_paths = ["lib"]
|
57
|
-
s.rubygems_version = %q{1.3.7}
|
58
|
-
s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
|
59
|
-
s.test_files = [
|
60
|
-
"spec/document_spec.rb",
|
61
|
-
"spec/spec_helper.rb",
|
62
|
-
"spec/textractor_spec.rb"
|
63
|
-
]
|
14
|
+
s.required_rubygems_version = ">= 1.3.6"
|
15
|
+
s.rubyforge_project = "textractor"
|
64
16
|
|
65
|
-
|
66
|
-
|
67
|
-
s.specification_version = 3
|
17
|
+
s.add_development_dependency "bundler", ">= 1.0.0"
|
18
|
+
s.add_development_dependency "rspec", "~> 1.3.0"
|
68
19
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
else
|
75
|
-
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
76
|
-
end
|
20
|
+
s.files = `git ls-files`.split("\n")
|
21
|
+
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
22
|
+
s.require_path = 'lib'
|
23
|
+
s.extra_rdoc_files = ["LICENSE", "README.md"]
|
24
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
77
25
|
end
|
78
|
-
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
8
|
+
- 1
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,16 +15,32 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-11-06 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
22
|
+
name: bundler
|
23
23
|
prerelease: false
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 0
|
34
|
+
version: 1.0.0
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rspec
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
28
44
|
- !ruby/object:Gem::Version
|
29
45
|
hash: 27
|
30
46
|
segments:
|
@@ -33,9 +49,10 @@ dependencies:
|
|
33
49
|
- 0
|
34
50
|
version: 1.3.0
|
35
51
|
type: :development
|
36
|
-
version_requirements: *
|
52
|
+
version_requirements: *id002
|
37
53
|
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
38
|
-
email:
|
54
|
+
email:
|
55
|
+
- michael@diminishing.org
|
39
56
|
executables: []
|
40
57
|
|
41
58
|
extensions: []
|
@@ -46,17 +63,25 @@ extra_rdoc_files:
|
|
46
63
|
files:
|
47
64
|
- .document
|
48
65
|
- .gitignore
|
66
|
+
- Gemfile
|
67
|
+
- Gemfile.lock
|
49
68
|
- LICENSE
|
50
69
|
- README.md
|
51
70
|
- Rakefile
|
52
|
-
- VERSION
|
53
71
|
- lib/textractor.rb
|
54
|
-
- lib/textractor/
|
55
|
-
-
|
72
|
+
- lib/textractor/extractors.rb
|
73
|
+
- lib/textractor/extractors/doc_extractor.rb
|
74
|
+
- lib/textractor/extractors/docx_extractor.rb
|
75
|
+
- lib/textractor/extractors/pdf_extractor.rb
|
76
|
+
- lib/textractor/extractors/text_extractor.rb
|
77
|
+
- lib/textractor/extractors/word_extractor.rb
|
78
|
+
- lib/textractor/version.rb
|
56
79
|
- spec/fixtures/document.doc
|
57
80
|
- spec/fixtures/document.docx
|
58
81
|
- spec/fixtures/document.pdf
|
59
82
|
- spec/fixtures/document.txt
|
83
|
+
- spec/fixtures/no_extension
|
84
|
+
- spec/integration/textractor_spec.rb
|
60
85
|
- spec/spec.opts
|
61
86
|
- spec/spec_helper.rb
|
62
87
|
- spec/textractor_spec.rb
|
@@ -100,18 +125,18 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
125
|
requirements:
|
101
126
|
- - ">="
|
102
127
|
- !ruby/object:Gem::Version
|
103
|
-
hash:
|
128
|
+
hash: 23
|
104
129
|
segments:
|
105
|
-
-
|
106
|
-
|
130
|
+
- 1
|
131
|
+
- 3
|
132
|
+
- 6
|
133
|
+
version: 1.3.6
|
107
134
|
requirements: []
|
108
135
|
|
109
|
-
rubyforge_project:
|
136
|
+
rubyforge_project: textractor
|
110
137
|
rubygems_version: 1.3.7
|
111
138
|
signing_key:
|
112
139
|
specification_version: 3
|
113
|
-
summary: simple wrapper around CLI
|
114
|
-
test_files:
|
115
|
-
|
116
|
-
- spec/spec_helper.rb
|
117
|
-
- spec/textractor_spec.rb
|
140
|
+
summary: simple wrapper around CLI for extracting text from PDF and Word documents
|
141
|
+
test_files: []
|
142
|
+
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.3
|
data/lib/textractor/document.rb
DELETED
@@ -1,66 +0,0 @@
|
|
1
|
-
module Textractor
|
2
|
-
|
3
|
-
class Document
|
4
|
-
|
5
|
-
CONTENT_TYPE_CONVERSIONS = {
|
6
|
-
'application/pdf' => :pdf,
|
7
|
-
'application/x-pdf' => :pdf,
|
8
|
-
'application/doc' => :doc,
|
9
|
-
'application/x-doc' => :doc,
|
10
|
-
'application/msword' => :doc,
|
11
|
-
'text/plain' => :txt,
|
12
|
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => :docx,
|
13
|
-
}
|
14
|
-
|
15
|
-
attr_reader :filename
|
16
|
-
|
17
|
-
def initialize(filename, options = {})
|
18
|
-
@filename = File.expand_path(filename)
|
19
|
-
@content_type = options[:content_type]
|
20
|
-
end
|
21
|
-
|
22
|
-
def text
|
23
|
-
send("extract_from_#{type}")
|
24
|
-
end
|
25
|
-
|
26
|
-
def type
|
27
|
-
return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
|
28
|
-
case File.extname(@filename)
|
29
|
-
when /pdf/
|
30
|
-
:pdf
|
31
|
-
when /docx/
|
32
|
-
:docx
|
33
|
-
when /doc/
|
34
|
-
:doc
|
35
|
-
when /txt/
|
36
|
-
:txt
|
37
|
-
else
|
38
|
-
nil
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def content_type
|
45
|
-
@content_type
|
46
|
-
end
|
47
|
-
|
48
|
-
def extract_from_pdf
|
49
|
-
`pdftotext #{filename} - 2>/dev/null`.strip
|
50
|
-
end
|
51
|
-
|
52
|
-
def extract_from_doc
|
53
|
-
`wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
|
54
|
-
end
|
55
|
-
|
56
|
-
def extract_from_docx
|
57
|
-
`#{File.dirname(__FILE__) + "/../../vendor/docx2txt/docx2txt.pl"} #{filename} -`.strip
|
58
|
-
end
|
59
|
-
|
60
|
-
def extract_from_txt
|
61
|
-
File.read(filename)
|
62
|
-
end
|
63
|
-
|
64
|
-
end
|
65
|
-
|
66
|
-
end
|
data/spec/document_spec.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
require 'spec/spec_helper'
|
2
|
-
|
3
|
-
describe Textractor::Document do
|
4
|
-
|
5
|
-
PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
|
6
|
-
DOC_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
|
7
|
-
TXT_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
|
8
|
-
DOCX_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.docx")
|
9
|
-
|
10
|
-
it 'should require a filename to create' do
|
11
|
-
expect { Textractor::Document.new }.to raise_error(ArgumentError)
|
12
|
-
Textractor::Document.new('filename').filename.should == File.expand_path('filename')
|
13
|
-
end
|
14
|
-
|
15
|
-
describe "#text" do
|
16
|
-
|
17
|
-
describe "with pdf document" do
|
18
|
-
|
19
|
-
it 'should extract the text from the document' do
|
20
|
-
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
21
|
-
@doc.text.should == "Ruby on rails developer"
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
describe "with doc document" do
|
27
|
-
|
28
|
-
it 'should extract the text from the document' do
|
29
|
-
@doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
|
30
|
-
@doc.text.should == "Ruby on rails developer"
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|
34
|
-
|
35
|
-
describe "with txt document" do
|
36
|
-
|
37
|
-
it 'should extract the text from the document' do
|
38
|
-
@doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
|
39
|
-
@doc.text.should == "Ruby on rails developer"
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
describe "with docx document" do
|
45
|
-
|
46
|
-
it 'should extract the text from the document' do
|
47
|
-
@doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
|
48
|
-
@doc.text.should == "Ruby on rails developer"
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
describe "#type" do
|
56
|
-
|
57
|
-
describe "with no content type provided" do
|
58
|
-
it 'should return :pdf for PDF documents' do
|
59
|
-
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
60
|
-
@doc.type.should == :pdf
|
61
|
-
end
|
62
|
-
|
63
|
-
it 'should return :doc for Word documents' do
|
64
|
-
@doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
|
65
|
-
@doc.type.should == :doc
|
66
|
-
end
|
67
|
-
|
68
|
-
it 'should return :docx for Word documents' do
|
69
|
-
@doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
|
70
|
-
@doc.type.should == :docx
|
71
|
-
end
|
72
|
-
|
73
|
-
it 'should return nil for unknown documents' do
|
74
|
-
@doc = Textractor::Document.new("foo.bar")
|
75
|
-
@doc.type.should == nil
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
describe "with a content type provided" do
|
80
|
-
|
81
|
-
it 'should ignore the extension of the file' do
|
82
|
-
[PDF_DOCUMENT_FIXTURE, DOC_DOCUMENT_FIXTURE, DOCX_DOCUMENT_FIXTURE].each do |filename|
|
83
|
-
Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
|
84
|
-
@doc = Textractor::Document.new(filename, :content_type => content_type)
|
85
|
-
@doc.type.should == type
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
93
|
-
|
94
|
-
end
|