textractor 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +3 -1
- data/Rakefile +1 -1
- data/lib/textractor.rb +1 -0
- data/lib/textractor/extractors/doc_extractor.rb +1 -1
- data/lib/textractor/extractors/docx_extractor.rb +1 -1
- data/lib/textractor/extractors/pdf_extractor.rb +1 -1
- data/lib/textractor/version.rb +1 -1
- data/textractor.gemspec +3 -1
- metadata +20 -17
- data/spec/content_type_detector/simple_spec.rb +0 -30
- data/spec/fixtures/document .doc +0 -0
- data/spec/fixtures/document .docx +0 -0
- data/spec/fixtures/document .pdf +0 -0
- data/spec/fixtures/document .txt +0 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/document.txt +0 -1
- data/spec/fixtures/no_extension +0 -0
- data/spec/integration/textractor_spec.rb +0 -74
- data/spec/spec_helper.rb +0 -14
- data/spec/textractor_spec.rb +0 -104
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
textractor (0.1.
|
4
|
+
textractor (0.1.6)
|
5
|
+
escape (>= 0.0.4)
|
5
6
|
|
6
7
|
GEM
|
7
8
|
remote: http://rubygems.org/
|
8
9
|
specs:
|
9
10
|
diff-lcs (1.1.2)
|
11
|
+
escape (0.0.4)
|
10
12
|
rspec (2.1.0)
|
11
13
|
rspec-core (~> 2.1.0)
|
12
14
|
rspec-expectations (~> 2.1.0)
|
data/Rakefile
CHANGED
data/lib/textractor.rb
CHANGED
@@ -13,7 +13,7 @@ module Textractor::Extractors
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def text_from_path(path)
|
16
|
-
command = "wvWare -c utf-8 --nographics -x #{wvText_path}
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{Escape.shell_single_word(path)}"
|
17
17
|
puts command if $DEBUG
|
18
18
|
`#{command}`.strip
|
19
19
|
end
|
data/lib/textractor/version.rb
CHANGED
data/textractor.gemspec
CHANGED
@@ -16,7 +16,9 @@ Gem::Specification.new do |s|
|
|
16
16
|
s.add_development_dependency "bundler", ">= 1.0.0"
|
17
17
|
s.add_development_dependency "rspec", "~> 2.1.0"
|
18
18
|
|
19
|
-
s.files = `git ls-files`.split("\n")
|
19
|
+
s.add_runtime_dependency "escape", ">=0.0.4"
|
20
|
+
|
21
|
+
s.files = `git ls-files`.split("\n").reject{|f| f.gsub(/"/, "") =~ /^spec/}
|
20
22
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
21
23
|
s.require_path = 'lib'
|
22
24
|
s.extra_rdoc_files = ["LICENSE", "README.md"]
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 1
|
9
|
-
- 6
|
10
|
-
version: 0.1.6
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-07-
|
18
|
+
date: 2011-07-29 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,6 +50,22 @@ dependencies:
|
|
50
50
|
version: 2.1.0
|
51
51
|
type: :development
|
52
52
|
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: escape
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 23
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
- 0
|
65
|
+
- 4
|
66
|
+
version: 0.0.4
|
67
|
+
type: :runtime
|
68
|
+
version_requirements: *id003
|
53
69
|
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
54
70
|
email:
|
55
71
|
- michael@diminishing.org
|
@@ -81,19 +97,6 @@ files:
|
|
81
97
|
- lib/textractor/extractors/pdf_extractor.rb
|
82
98
|
- lib/textractor/extractors/text_extractor.rb
|
83
99
|
- lib/textractor/version.rb
|
84
|
-
- spec/content_type_detector/simple_spec.rb
|
85
|
-
- spec/fixtures/document .doc
|
86
|
-
- spec/fixtures/document .docx
|
87
|
-
- spec/fixtures/document .pdf
|
88
|
-
- spec/fixtures/document .txt
|
89
|
-
- spec/fixtures/document.doc
|
90
|
-
- spec/fixtures/document.docx
|
91
|
-
- spec/fixtures/document.pdf
|
92
|
-
- spec/fixtures/document.txt
|
93
|
-
- spec/fixtures/no_extension
|
94
|
-
- spec/integration/textractor_spec.rb
|
95
|
-
- spec/spec_helper.rb
|
96
|
-
- spec/textractor_spec.rb
|
97
100
|
- support/wvText.xml
|
98
101
|
- textractor.gemspec
|
99
102
|
- vendor/docx2txt/AUTHORS
|
@@ -1,30 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Textractor::ContentTypeDetector::Simple do
|
4
|
-
|
5
|
-
FILENAMES = [
|
6
|
-
[
|
7
|
-
"foo.pdf", "application/pdf",
|
8
|
-
"foo.doc", "application/msword",
|
9
|
-
"foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
10
|
-
"foo.txt", "text/plain",
|
11
|
-
]
|
12
|
-
]
|
13
|
-
|
14
|
-
describe '.content_type_for_path' do
|
15
|
-
FILENAMES.each do |(filename, content_type)|
|
16
|
-
context "given #{filename}" do
|
17
|
-
it "returns #{content_type}" do
|
18
|
-
Textractor::ContentTypeDetector::Simple.content_type_for_path(filename).should == content_type
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
context "given #{filename}" do
|
23
|
-
it "returns #{content_type}" do
|
24
|
-
Textractor::ContentTypeDetector::Simple.content_type_for_path(filename.upcase).should == content_type
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
end
|
data/spec/fixtures/document .doc
DELETED
Binary file
|
Binary file
|
data/spec/fixtures/document .pdf
DELETED
Binary file
|
data/spec/fixtures/document .txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
text
|
data/spec/fixtures/document.doc
DELETED
Binary file
|
data/spec/fixtures/document.docx
DELETED
Binary file
|
data/spec/fixtures/document.pdf
DELETED
Binary file
|
data/spec/fixtures/document.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
text
|
data/spec/fixtures/no_extension
DELETED
Binary file
|
@@ -1,74 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Textractor do
|
4
|
-
|
5
|
-
before do
|
6
|
-
Textractor.clear_registry
|
7
|
-
Textractor.register_basic_types
|
8
|
-
end
|
9
|
-
|
10
|
-
it 'returns the contents of word (.doc) documents' do
|
11
|
-
Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
|
12
|
-
end
|
13
|
-
|
14
|
-
it 'returns the contents of word (.docx) documents' do
|
15
|
-
Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
|
16
|
-
end
|
17
|
-
|
18
|
-
it 'returns the contents of pdf documents' do
|
19
|
-
Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
|
20
|
-
end
|
21
|
-
|
22
|
-
it 'returns the contents of text documents' do
|
23
|
-
Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
|
24
|
-
end
|
25
|
-
|
26
|
-
it 'allows the user to specify content type to avoid internal resolution' do
|
27
|
-
Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
|
28
|
-
end
|
29
|
-
|
30
|
-
it 'raises an exception when the content type is unable to be determined' do
|
31
|
-
expect {
|
32
|
-
Textractor.text_from_path(fixture_path("no_extension"))
|
33
|
-
}.to raise_error(Textractor::UnknownContentType)
|
34
|
-
end
|
35
|
-
|
36
|
-
it 'raises an exception when the path specified does not exist' do
|
37
|
-
expect {
|
38
|
-
Textractor.text_from_path('non-existant')
|
39
|
-
}.to raise_error(Textractor::FileNotFound)
|
40
|
-
end
|
41
|
-
|
42
|
-
it 'raises an exception when there is no extractor defined for the content type' do
|
43
|
-
Textractor.clear_registry
|
44
|
-
|
45
|
-
expect {
|
46
|
-
Textractor.text_from_path(fixture_path('document.pdf'))
|
47
|
-
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
48
|
-
end
|
49
|
-
|
50
|
-
it 'allows content type extractors to be removed' do
|
51
|
-
Textractor.remove_content_type("application/pdf")
|
52
|
-
|
53
|
-
expect {
|
54
|
-
Textractor.text_from_path(fixture_path('document.pdf'))
|
55
|
-
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
56
|
-
end
|
57
|
-
|
58
|
-
it 'returns the contents of doc files with a space in the path' do
|
59
|
-
Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
|
60
|
-
end
|
61
|
-
|
62
|
-
it 'returns the contents of docx files with a space in the path' do
|
63
|
-
Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
|
64
|
-
end
|
65
|
-
|
66
|
-
it 'returns the contents of pdf files with a space in the path' do
|
67
|
-
Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
|
68
|
-
end
|
69
|
-
|
70
|
-
it 'returns the contents of txt files with a space in the path' do
|
71
|
-
Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
|
72
|
-
end
|
73
|
-
|
74
|
-
end
|
data/spec/spec_helper.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler/setup'
|
5
|
-
require 'rspec'
|
6
|
-
require 'textractor'
|
7
|
-
|
8
|
-
def fixture_path(path)
|
9
|
-
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
|
10
|
-
end
|
11
|
-
|
12
|
-
RSpec.configure do |config|
|
13
|
-
|
14
|
-
end
|
data/spec/textractor_spec.rb
DELETED
@@ -1,104 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
class TestExtractor
|
4
|
-
|
5
|
-
def text_from_path(path)
|
6
|
-
path
|
7
|
-
end
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
describe Textractor do
|
12
|
-
|
13
|
-
before do
|
14
|
-
Textractor.clear_registry
|
15
|
-
end
|
16
|
-
|
17
|
-
describe ".text_from_path" do
|
18
|
-
before do
|
19
|
-
File.stub(:exists?).and_return(true)
|
20
|
-
Textractor.stub(:content_type_for_path).and_return('test')
|
21
|
-
Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
|
22
|
-
end
|
23
|
-
|
24
|
-
it 'extracts the text from a given path' do
|
25
|
-
Textractor.text_from_path('document').should == 'document'
|
26
|
-
end
|
27
|
-
|
28
|
-
it 'uses content_type_for_path to determine the content type' do
|
29
|
-
Textractor.should_receive(:content_type_for_path).with('document')
|
30
|
-
Textractor.text_from_path('document')
|
31
|
-
end
|
32
|
-
|
33
|
-
it 'uses extractor_for_content_type to look up the correct extractor' do
|
34
|
-
Textractor.should_receive(:extractor_for_content_type).with('test')
|
35
|
-
Textractor.text_from_path('document')
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|
39
|
-
|
40
|
-
describe ".register_content_type" do
|
41
|
-
|
42
|
-
it 'raises an exception if an extractor is already defined for that content type' do
|
43
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
44
|
-
|
45
|
-
expect {
|
46
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
47
|
-
}.to raise_error(Textractor::ContentTypeAlreadyRegistered)
|
48
|
-
end
|
49
|
-
|
50
|
-
it 'takes a block for simple cases' do
|
51
|
-
File.stub(:exists?).and_return(true)
|
52
|
-
Textractor.stub(:content_type_for_path).and_return('test')
|
53
|
-
Textractor.register_content_type('test') do |path|
|
54
|
-
path
|
55
|
-
end
|
56
|
-
|
57
|
-
Textractor.text_from_path('document').should == 'document'
|
58
|
-
end
|
59
|
-
|
60
|
-
end
|
61
|
-
|
62
|
-
describe ".extractor_for_content_type" do
|
63
|
-
before do
|
64
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
65
|
-
end
|
66
|
-
|
67
|
-
it 'returns the extractor for the content type' do
|
68
|
-
Textractor.extractor_for_content_type("text/plain").should == TestExtractor
|
69
|
-
end
|
70
|
-
|
71
|
-
it 'raises an exception when no extractor is defined for that content type' do
|
72
|
-
expect {
|
73
|
-
Textractor.extractor_for_content_type("unknown")
|
74
|
-
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
describe ".content_type_for_path" do
|
79
|
-
|
80
|
-
it 'returns the content type based on the file extension' do
|
81
|
-
Textractor.content_type_for_path("document.pdf").should == "application/pdf"
|
82
|
-
end
|
83
|
-
|
84
|
-
it 'raises an exception if it cannot determine the content type' do
|
85
|
-
expect {
|
86
|
-
Textractor.content_type_for_path('unknown')
|
87
|
-
}.to raise_error(Textractor::UnknownContentType)
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
describe ".clear_registry" do
|
93
|
-
before do
|
94
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
95
|
-
end
|
96
|
-
|
97
|
-
it 'clears the registered content types and their respective extractors' do
|
98
|
-
Textractor.clear_registry
|
99
|
-
Textractor.extractors.should be_empty
|
100
|
-
end
|
101
|
-
|
102
|
-
end
|
103
|
-
|
104
|
-
end
|