textractor 0.0.3 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +17 -0
- data/README.md +7 -7
- data/Rakefile +2 -21
- data/lib/textractor.rb +54 -8
- data/lib/textractor/extractors.rb +12 -0
- data/lib/textractor/extractors/doc_extractor.rb +29 -0
- data/lib/textractor/extractors/docx_extractor.rb +28 -0
- data/lib/textractor/extractors/pdf_extractor.rb +11 -0
- data/lib/textractor/extractors/text_extractor.rb +11 -0
- data/lib/textractor/extractors/word_extractor.rb +29 -0
- data/lib/textractor/version.rb +3 -0
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/document.txt +1 -1
- data/spec/fixtures/no_extension +0 -0
- data/spec/integration/textractor_spec.rb +58 -0
- data/spec/spec_helper.rb +10 -4
- data/spec/textractor_spec.rb +76 -14
- data/textractor.gemspec +18 -71
- metadata +45 -20
- data/VERSION +0 -1
- data/lib/textractor/document.rb +0 -66
- data/spec/document_spec.rb +0 -94
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/README.md
CHANGED
@@ -28,18 +28,18 @@ this may not work on all systems due to dependency issues.
|
|
28
28
|
Due to textractor's reliance on command line tools, all the methods in
|
29
29
|
textractor work on paths, not File objects.
|
30
30
|
|
31
|
-
|
32
|
-
document.text # => "Ruby on rails developer"
|
33
|
-
|
34
|
-
There is also a convenience method on Textractor.
|
35
|
-
|
36
|
-
Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
|
31
|
+
Textractor.text_from_path(path_to_document) # => "Ruby on rails developer"
|
37
32
|
|
38
33
|
Textractor will attempt to guess what type of document you're trying
|
39
34
|
to extract text from. However, if you know the content type of your
|
40
35
|
document, you can provide it and Textractor won't guess.
|
41
36
|
|
42
|
-
Textractor.
|
37
|
+
Textractor.text_from_path(path_to_document, :content_type => "application/msword")
|
38
|
+
|
39
|
+
## TODO
|
40
|
+
|
41
|
+
* Remove vendored docx2txt perl script
|
42
|
+
* Replace as much as possible with pure ruby
|
43
43
|
|
44
44
|
## Note on Patches/Pull Requests
|
45
45
|
|
data/Rakefile
CHANGED
@@ -1,22 +1,5 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "textractor"
|
8
|
-
gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
|
9
|
-
gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
|
10
|
-
gem.email = "mguterl@gmail.com"
|
11
|
-
gem.homepage = "http://github.com/mguterl/textractor"
|
12
|
-
gem.authors = ["Michael Guterl"]
|
13
|
-
gem.add_development_dependency "rspec", ">= 1.3.0"
|
14
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
-
end
|
16
|
-
Jeweler::GemcutterTasks.new
|
17
|
-
rescue LoadError
|
18
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
19
|
-
end
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
20
3
|
|
21
4
|
require 'spec/rake/spectask'
|
22
5
|
Spec::Rake::SpecTask.new(:spec) do |spec|
|
@@ -30,8 +13,6 @@ Spec::Rake::SpecTask.new(:rcov) do |spec|
|
|
30
13
|
spec.rcov = true
|
31
14
|
end
|
32
15
|
|
33
|
-
task :spec => :check_dependencies
|
34
|
-
|
35
16
|
task :default => :spec
|
36
17
|
|
37
18
|
require 'rake/rdoctask'
|
data/lib/textractor.rb
CHANGED
@@ -1,18 +1,64 @@
|
|
1
1
|
module Textractor
|
2
|
-
autoload :Document, "textractor/document"
|
3
2
|
|
4
|
-
|
5
|
-
|
3
|
+
UnknownContentType = Class.new(StandardError)
|
4
|
+
FileNotFound = Class.new(StandardError)
|
5
|
+
ContentTypeAlreadyRegistered = Class.new(StandardError)
|
6
|
+
ContentTypeNotRegistered = Class.new(StandardError)
|
7
|
+
|
8
|
+
autoload :Extractors, "textractor/extractors"
|
9
|
+
|
10
|
+
def self.text_from_path(path, options = {})
|
11
|
+
raise FileNotFound unless File.exists?(path)
|
12
|
+
content_type = options.fetch(:content_type) { content_type_for_path(path) }
|
13
|
+
extractor_class = extractor_for_content_type(content_type)
|
14
|
+
extractor = extractor_class.new
|
15
|
+
|
16
|
+
extractor.text_from_path(path)
|
6
17
|
end
|
7
18
|
|
8
|
-
|
19
|
+
def self.content_type_for_path(path)
|
20
|
+
case File.extname(path)
|
21
|
+
when /\.pdf$/
|
22
|
+
'application/pdf'
|
23
|
+
when /\.doc$/
|
24
|
+
'application/msword'
|
25
|
+
when /\.docx$/
|
26
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
27
|
+
when /\.txt$/
|
28
|
+
'text/plain'
|
29
|
+
else
|
30
|
+
raise UnknownContentType, "unable to determine content type for #{path}"
|
31
|
+
end
|
32
|
+
end
|
9
33
|
|
10
|
-
def self.
|
11
|
-
|
34
|
+
def self.register_content_type(content_type, extractor)
|
35
|
+
raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
|
36
|
+
extractors[content_type] = extractor
|
12
37
|
end
|
13
38
|
|
14
|
-
def self.
|
15
|
-
|
39
|
+
def self.remove_content_type(content_type)
|
40
|
+
extractors.delete content_type
|
16
41
|
end
|
17
42
|
|
43
|
+
def self.extractor_for_content_type(content_type)
|
44
|
+
extractors[content_type] or raise ContentTypeNotRegistered, "#{content_type} is not registered with Textractor"
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.extractors
|
48
|
+
@extractors ||= {}
|
49
|
+
end
|
50
|
+
|
51
|
+
def self.clear_registry
|
52
|
+
@extractors = {}
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.register_basic_types
|
56
|
+
register_content_type("application/pdf", Extractors::PDFExtractor)
|
57
|
+
register_content_type("application/msword", Extractors::DocExtractor)
|
58
|
+
register_content_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document", Extractors::DocxExtractor)
|
59
|
+
register_content_type("text/plain", Extractors::TextExtractor)
|
60
|
+
end
|
61
|
+
|
62
|
+
register_basic_types
|
63
|
+
|
18
64
|
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Textractor
|
2
|
+
|
3
|
+
module Extractors
|
4
|
+
|
5
|
+
autoload :PDFExtractor, 'textractor/extractors/pdf_extractor'
|
6
|
+
autoload :DocExtractor, 'textractor/extractors/doc_extractor'
|
7
|
+
autoload :DocxExtractor, 'textractor/extractors/docx_extractor'
|
8
|
+
autoload :TextExtractor, 'textractor/extractors/text_extractor'
|
9
|
+
|
10
|
+
end
|
11
|
+
|
12
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Textractor::Extractors
|
2
|
+
|
3
|
+
class DocExtractor
|
4
|
+
|
5
|
+
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_writer :wvText_path
|
9
|
+
|
10
|
+
def wvText_path
|
11
|
+
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def text_from_path(path)
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
17
|
+
puts command if $DEBUG
|
18
|
+
`#{command}`.strip
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def wvText_path
|
24
|
+
self.class.wvText_path
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Textractor::Extractors
|
2
|
+
|
3
|
+
class DocxExtractor
|
4
|
+
|
5
|
+
DEFAULT_DOCX2TXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../vendor/docx2txt/docx2txt.pl").freeze
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_writer :docx2txt_path
|
9
|
+
|
10
|
+
def docx2txt_path
|
11
|
+
@docx2txt_path || DEFAULT_DOCX2TXT_PATH
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
def text_from_path(path)
|
17
|
+
`#{docx2txt_path} #{path} -`.strip
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
def docx2txt_path
|
23
|
+
self.class.docx2txt_path
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Textractor::Extractors
|
2
|
+
|
3
|
+
class WordExtractor
|
4
|
+
|
5
|
+
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_writer :wvText_path
|
9
|
+
|
10
|
+
def wvText_path
|
11
|
+
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def text_from_path(path)
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
17
|
+
puts command if $DEBUG
|
18
|
+
`#{command}`.strip
|
19
|
+
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def wvText_path
|
24
|
+
self.class.wvText_path
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
end
|
data/spec/fixtures/document.doc
CHANGED
Binary file
|
data/spec/fixtures/document.docx
CHANGED
Binary file
|
data/spec/fixtures/document.pdf
CHANGED
Binary file
|
data/spec/fixtures/document.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
text
|
Binary file
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Textractor do
|
4
|
+
|
5
|
+
before do
|
6
|
+
Textractor.clear_registry
|
7
|
+
Textractor.register_basic_types
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'returns the contents of word (.doc) documents' do
|
11
|
+
Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'returns the contents of word (.docx) documents' do
|
15
|
+
Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'returns the contents of pdf documents' do
|
19
|
+
Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'returns the contents of text documents' do
|
23
|
+
Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'allows the user to specify content type to avoid internal resolution' do
|
27
|
+
Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'raises an exception when the content type is unable to be determined' do
|
31
|
+
expect {
|
32
|
+
Textractor.text_from_path(fixture_path("no_extension"))
|
33
|
+
}.to raise_error(Textractor::UnknownContentType)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'raises an exception when the path specified does not exist' do
|
37
|
+
expect {
|
38
|
+
Textractor.text_from_path('non-existant')
|
39
|
+
}.to raise_error(Textractor::FileNotFound)
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'raises an exception when there is no extractor defined for the content type' do
|
43
|
+
Textractor.clear_registry
|
44
|
+
|
45
|
+
expect {
|
46
|
+
Textractor.text_from_path(fixture_path('document.pdf'))
|
47
|
+
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'allows content type extractors to be removed' do
|
51
|
+
Textractor.remove_content_type("application/pdf")
|
52
|
+
|
53
|
+
expect {
|
54
|
+
Textractor.text_from_path(fixture_path('document.pdf'))
|
55
|
+
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,9 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
3
|
+
require 'spec'
|
4
|
+
|
1
5
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
-
|
6
|
+
|
3
7
|
require 'textractor'
|
4
|
-
|
5
|
-
|
8
|
+
|
9
|
+
def fixture_path(path)
|
10
|
+
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
|
11
|
+
end
|
6
12
|
|
7
13
|
Spec::Runner.configure do |config|
|
8
|
-
|
14
|
+
|
9
15
|
end
|
data/spec/textractor_spec.rb
CHANGED
@@ -1,32 +1,94 @@
|
|
1
1
|
require 'spec/spec_helper'
|
2
2
|
|
3
|
+
class TestExtractor
|
4
|
+
|
5
|
+
def text_from_path(path)
|
6
|
+
path
|
7
|
+
end
|
8
|
+
|
9
|
+
end
|
10
|
+
|
3
11
|
describe Textractor do
|
4
12
|
|
5
|
-
|
13
|
+
before do
|
14
|
+
Textractor.clear_registry
|
15
|
+
end
|
16
|
+
|
17
|
+
describe ".text_from_path" do
|
18
|
+
before do
|
19
|
+
File.stub(:exists?).and_return(true)
|
20
|
+
Textractor.stub(:content_type_for_path).and_return('test')
|
21
|
+
Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'extracts the text from a given path' do
|
25
|
+
Textractor.text_from_path('document').should == 'document'
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'uses content_type_for_path to determine the content type' do
|
29
|
+
Textractor.should_receive(:content_type_for_path).with('document')
|
30
|
+
Textractor.text_from_path('document')
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'uses extractor_for_content_type to look up the correct extractor' do
|
34
|
+
Textractor.should_receive(:extractor_for_content_type).with('test')
|
35
|
+
Textractor.text_from_path('document')
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
describe ".register_content_type" do
|
41
|
+
|
42
|
+
it 'raises an exception if an extractor is already defined for that content type' do
|
43
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
44
|
+
|
45
|
+
expect {
|
46
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
47
|
+
}.to raise_error(Textractor::ContentTypeAlreadyRegistered)
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
6
51
|
|
7
|
-
|
8
|
-
|
52
|
+
describe ".extractor_for_content_type" do
|
53
|
+
before do
|
54
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
9
55
|
end
|
10
56
|
|
11
|
-
it '
|
12
|
-
Textractor.
|
13
|
-
Textractor.wvText_path.should == "foo.bar"
|
57
|
+
it 'returns the extractor for the content type' do
|
58
|
+
Textractor.extractor_for_content_type("text/plain").should == TestExtractor
|
14
59
|
end
|
15
60
|
|
61
|
+
it 'raises an exception when no extractor is defined for that content type' do
|
62
|
+
expect {
|
63
|
+
Textractor.extractor_for_content_type("unknown")
|
64
|
+
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
65
|
+
end
|
16
66
|
end
|
17
67
|
|
18
|
-
describe ".
|
68
|
+
describe ".content_type_for_path" do
|
19
69
|
|
20
|
-
it '
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
70
|
+
it 'returns the content type based on the file extension' do
|
71
|
+
Textractor.content_type_for_path("document.pdf").should == "application/pdf"
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'raises an exception if it cannot determine the content type' do
|
75
|
+
expect {
|
76
|
+
Textractor.content_type_for_path('unknown')
|
77
|
+
}.to raise_error(Textractor::UnknownContentType)
|
25
78
|
end
|
26
79
|
|
27
80
|
end
|
28
81
|
|
29
|
-
|
30
|
-
|
82
|
+
describe ".clear_registry" do
|
83
|
+
before do
|
84
|
+
Textractor.register_content_type("text/plain", TestExtractor)
|
85
|
+
end
|
86
|
+
|
87
|
+
it 'clears the registered content types and their respective extractors' do
|
88
|
+
Textractor.clear_registry
|
89
|
+
Textractor.extractors.should be_empty
|
90
|
+
end
|
91
|
+
|
31
92
|
end
|
93
|
+
|
32
94
|
end
|
data/textractor.gemspec
CHANGED
@@ -1,78 +1,25 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
1
|
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path("../lib/textractor/version", __FILE__)
|
5
3
|
|
6
4
|
Gem::Specification.new do |s|
|
7
|
-
s.name
|
8
|
-
s.version
|
5
|
+
s.name = "textractor"
|
6
|
+
s.version = Textractor::VERSION
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.authors = ['Michael Guterl']
|
9
|
+
s.email = ['michael@diminishing.org']
|
10
|
+
s.homepage = "http://github.com/mguterl/textractor"
|
11
|
+
s.summary = "simple wrapper around CLI for extracting text from PDF and Word documents"
|
12
|
+
s.description = "simple wrapper around CLI for extracting text from PDF and Word documents"
|
9
13
|
|
10
|
-
s.required_rubygems_version =
|
11
|
-
s.
|
12
|
-
s.date = %q{2010-07-27}
|
13
|
-
s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
|
14
|
-
s.email = %q{mguterl@gmail.com}
|
15
|
-
s.extra_rdoc_files = [
|
16
|
-
"LICENSE",
|
17
|
-
"README.md"
|
18
|
-
]
|
19
|
-
s.files = [
|
20
|
-
".document",
|
21
|
-
".gitignore",
|
22
|
-
"LICENSE",
|
23
|
-
"README.md",
|
24
|
-
"Rakefile",
|
25
|
-
"VERSION",
|
26
|
-
"lib/textractor.rb",
|
27
|
-
"lib/textractor/document.rb",
|
28
|
-
"spec/document_spec.rb",
|
29
|
-
"spec/fixtures/document.doc",
|
30
|
-
"spec/fixtures/document.docx",
|
31
|
-
"spec/fixtures/document.pdf",
|
32
|
-
"spec/fixtures/document.txt",
|
33
|
-
"spec/spec.opts",
|
34
|
-
"spec/spec_helper.rb",
|
35
|
-
"spec/textractor_spec.rb",
|
36
|
-
"support/wvText.xml",
|
37
|
-
"textractor.gemspec",
|
38
|
-
"vendor/docx2txt/AUTHORS",
|
39
|
-
"vendor/docx2txt/BSDmakefile",
|
40
|
-
"vendor/docx2txt/COPYING",
|
41
|
-
"vendor/docx2txt/ChangeLog",
|
42
|
-
"vendor/docx2txt/INSTALL",
|
43
|
-
"vendor/docx2txt/Makefile",
|
44
|
-
"vendor/docx2txt/README",
|
45
|
-
"vendor/docx2txt/ToDo",
|
46
|
-
"vendor/docx2txt/VERSION",
|
47
|
-
"vendor/docx2txt/WInstall.bat",
|
48
|
-
"vendor/docx2txt/docx2txt.bat",
|
49
|
-
"vendor/docx2txt/docx2txt.config",
|
50
|
-
"vendor/docx2txt/docx2txt.pl",
|
51
|
-
"vendor/docx2txt/docx2txt.sh",
|
52
|
-
"vendor/docx2txt/resume.docx"
|
53
|
-
]
|
54
|
-
s.homepage = %q{http://github.com/mguterl/textractor}
|
55
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
56
|
-
s.require_paths = ["lib"]
|
57
|
-
s.rubygems_version = %q{1.3.7}
|
58
|
-
s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
|
59
|
-
s.test_files = [
|
60
|
-
"spec/document_spec.rb",
|
61
|
-
"spec/spec_helper.rb",
|
62
|
-
"spec/textractor_spec.rb"
|
63
|
-
]
|
14
|
+
s.required_rubygems_version = ">= 1.3.6"
|
15
|
+
s.rubyforge_project = "textractor"
|
64
16
|
|
65
|
-
|
66
|
-
|
67
|
-
s.specification_version = 3
|
17
|
+
s.add_development_dependency "bundler", ">= 1.0.0"
|
18
|
+
s.add_development_dependency "rspec", "~> 1.3.0"
|
68
19
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
else
|
75
|
-
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
76
|
-
end
|
20
|
+
s.files = `git ls-files`.split("\n")
|
21
|
+
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
22
|
+
s.require_path = 'lib'
|
23
|
+
s.extra_rdoc_files = ["LICENSE", "README.md"]
|
24
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
77
25
|
end
|
78
|
-
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
8
|
+
- 1
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,16 +15,32 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-11-06 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
22
|
+
name: bundler
|
23
23
|
prerelease: false
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 0
|
34
|
+
version: 1.0.0
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rspec
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
28
44
|
- !ruby/object:Gem::Version
|
29
45
|
hash: 27
|
30
46
|
segments:
|
@@ -33,9 +49,10 @@ dependencies:
|
|
33
49
|
- 0
|
34
50
|
version: 1.3.0
|
35
51
|
type: :development
|
36
|
-
version_requirements: *
|
52
|
+
version_requirements: *id002
|
37
53
|
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
38
|
-
email:
|
54
|
+
email:
|
55
|
+
- michael@diminishing.org
|
39
56
|
executables: []
|
40
57
|
|
41
58
|
extensions: []
|
@@ -46,17 +63,25 @@ extra_rdoc_files:
|
|
46
63
|
files:
|
47
64
|
- .document
|
48
65
|
- .gitignore
|
66
|
+
- Gemfile
|
67
|
+
- Gemfile.lock
|
49
68
|
- LICENSE
|
50
69
|
- README.md
|
51
70
|
- Rakefile
|
52
|
-
- VERSION
|
53
71
|
- lib/textractor.rb
|
54
|
-
- lib/textractor/
|
55
|
-
-
|
72
|
+
- lib/textractor/extractors.rb
|
73
|
+
- lib/textractor/extractors/doc_extractor.rb
|
74
|
+
- lib/textractor/extractors/docx_extractor.rb
|
75
|
+
- lib/textractor/extractors/pdf_extractor.rb
|
76
|
+
- lib/textractor/extractors/text_extractor.rb
|
77
|
+
- lib/textractor/extractors/word_extractor.rb
|
78
|
+
- lib/textractor/version.rb
|
56
79
|
- spec/fixtures/document.doc
|
57
80
|
- spec/fixtures/document.docx
|
58
81
|
- spec/fixtures/document.pdf
|
59
82
|
- spec/fixtures/document.txt
|
83
|
+
- spec/fixtures/no_extension
|
84
|
+
- spec/integration/textractor_spec.rb
|
60
85
|
- spec/spec.opts
|
61
86
|
- spec/spec_helper.rb
|
62
87
|
- spec/textractor_spec.rb
|
@@ -100,18 +125,18 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
125
|
requirements:
|
101
126
|
- - ">="
|
102
127
|
- !ruby/object:Gem::Version
|
103
|
-
hash:
|
128
|
+
hash: 23
|
104
129
|
segments:
|
105
|
-
-
|
106
|
-
|
130
|
+
- 1
|
131
|
+
- 3
|
132
|
+
- 6
|
133
|
+
version: 1.3.6
|
107
134
|
requirements: []
|
108
135
|
|
109
|
-
rubyforge_project:
|
136
|
+
rubyforge_project: textractor
|
110
137
|
rubygems_version: 1.3.7
|
111
138
|
signing_key:
|
112
139
|
specification_version: 3
|
113
|
-
summary: simple wrapper around CLI
|
114
|
-
test_files:
|
115
|
-
|
116
|
-
- spec/spec_helper.rb
|
117
|
-
- spec/textractor_spec.rb
|
140
|
+
summary: simple wrapper around CLI for extracting text from PDF and Word documents
|
141
|
+
test_files: []
|
142
|
+
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.3
|
data/lib/textractor/document.rb
DELETED
@@ -1,66 +0,0 @@
|
|
1
|
-
module Textractor
|
2
|
-
|
3
|
-
class Document
|
4
|
-
|
5
|
-
CONTENT_TYPE_CONVERSIONS = {
|
6
|
-
'application/pdf' => :pdf,
|
7
|
-
'application/x-pdf' => :pdf,
|
8
|
-
'application/doc' => :doc,
|
9
|
-
'application/x-doc' => :doc,
|
10
|
-
'application/msword' => :doc,
|
11
|
-
'text/plain' => :txt,
|
12
|
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => :docx,
|
13
|
-
}
|
14
|
-
|
15
|
-
attr_reader :filename
|
16
|
-
|
17
|
-
def initialize(filename, options = {})
|
18
|
-
@filename = File.expand_path(filename)
|
19
|
-
@content_type = options[:content_type]
|
20
|
-
end
|
21
|
-
|
22
|
-
def text
|
23
|
-
send("extract_from_#{type}")
|
24
|
-
end
|
25
|
-
|
26
|
-
def type
|
27
|
-
return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
|
28
|
-
case File.extname(@filename)
|
29
|
-
when /pdf/
|
30
|
-
:pdf
|
31
|
-
when /docx/
|
32
|
-
:docx
|
33
|
-
when /doc/
|
34
|
-
:doc
|
35
|
-
when /txt/
|
36
|
-
:txt
|
37
|
-
else
|
38
|
-
nil
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def content_type
|
45
|
-
@content_type
|
46
|
-
end
|
47
|
-
|
48
|
-
def extract_from_pdf
|
49
|
-
`pdftotext #{filename} - 2>/dev/null`.strip
|
50
|
-
end
|
51
|
-
|
52
|
-
def extract_from_doc
|
53
|
-
`wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
|
54
|
-
end
|
55
|
-
|
56
|
-
def extract_from_docx
|
57
|
-
`#{File.dirname(__FILE__) + "/../../vendor/docx2txt/docx2txt.pl"} #{filename} -`.strip
|
58
|
-
end
|
59
|
-
|
60
|
-
def extract_from_txt
|
61
|
-
File.read(filename)
|
62
|
-
end
|
63
|
-
|
64
|
-
end
|
65
|
-
|
66
|
-
end
|
data/spec/document_spec.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
require 'spec/spec_helper'
|
2
|
-
|
3
|
-
describe Textractor::Document do
|
4
|
-
|
5
|
-
PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
|
6
|
-
DOC_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
|
7
|
-
TXT_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
|
8
|
-
DOCX_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.docx")
|
9
|
-
|
10
|
-
it 'should require a filename to create' do
|
11
|
-
expect { Textractor::Document.new }.to raise_error(ArgumentError)
|
12
|
-
Textractor::Document.new('filename').filename.should == File.expand_path('filename')
|
13
|
-
end
|
14
|
-
|
15
|
-
describe "#text" do
|
16
|
-
|
17
|
-
describe "with pdf document" do
|
18
|
-
|
19
|
-
it 'should extract the text from the document' do
|
20
|
-
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
21
|
-
@doc.text.should == "Ruby on rails developer"
|
22
|
-
end
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
describe "with doc document" do
|
27
|
-
|
28
|
-
it 'should extract the text from the document' do
|
29
|
-
@doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
|
30
|
-
@doc.text.should == "Ruby on rails developer"
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|
34
|
-
|
35
|
-
describe "with txt document" do
|
36
|
-
|
37
|
-
it 'should extract the text from the document' do
|
38
|
-
@doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
|
39
|
-
@doc.text.should == "Ruby on rails developer"
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
describe "with docx document" do
|
45
|
-
|
46
|
-
it 'should extract the text from the document' do
|
47
|
-
@doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
|
48
|
-
@doc.text.should == "Ruby on rails developer"
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
describe "#type" do
|
56
|
-
|
57
|
-
describe "with no content type provided" do
|
58
|
-
it 'should return :pdf for PDF documents' do
|
59
|
-
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
60
|
-
@doc.type.should == :pdf
|
61
|
-
end
|
62
|
-
|
63
|
-
it 'should return :doc for Word documents' do
|
64
|
-
@doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
|
65
|
-
@doc.type.should == :doc
|
66
|
-
end
|
67
|
-
|
68
|
-
it 'should return :docx for Word documents' do
|
69
|
-
@doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
|
70
|
-
@doc.type.should == :docx
|
71
|
-
end
|
72
|
-
|
73
|
-
it 'should return nil for unknown documents' do
|
74
|
-
@doc = Textractor::Document.new("foo.bar")
|
75
|
-
@doc.type.should == nil
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
describe "with a content type provided" do
|
80
|
-
|
81
|
-
it 'should ignore the extension of the file' do
|
82
|
-
[PDF_DOCUMENT_FIXTURE, DOC_DOCUMENT_FIXTURE, DOCX_DOCUMENT_FIXTURE].each do |filename|
|
83
|
-
Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
|
84
|
-
@doc = Textractor::Document.new(filename, :content_type => content_type)
|
85
|
-
@doc.type.should == type
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
93
|
-
|
94
|
-
end
|