textractor 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +3 -1
- data/Rakefile +1 -1
- data/lib/textractor.rb +1 -0
- data/lib/textractor/extractors/doc_extractor.rb +1 -1
- data/lib/textractor/extractors/docx_extractor.rb +1 -1
- data/lib/textractor/extractors/pdf_extractor.rb +1 -1
- data/lib/textractor/version.rb +1 -1
- data/textractor.gemspec +3 -1
- metadata +20 -17
- data/spec/content_type_detector/simple_spec.rb +0 -30
- data/spec/fixtures/document .doc +0 -0
- data/spec/fixtures/document .docx +0 -0
- data/spec/fixtures/document .pdf +0 -0
- data/spec/fixtures/document .txt +0 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/document.txt +0 -1
- data/spec/fixtures/no_extension +0 -0
- data/spec/integration/textractor_spec.rb +0 -74
- data/spec/spec_helper.rb +0 -14
- data/spec/textractor_spec.rb +0 -104
data/Gemfile.lock
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
textractor (0.1.
|
4
|
+
textractor (0.1.6)
|
5
|
+
escape (>= 0.0.4)
|
5
6
|
|
6
7
|
GEM
|
7
8
|
remote: http://rubygems.org/
|
8
9
|
specs:
|
9
10
|
diff-lcs (1.1.2)
|
11
|
+
escape (0.0.4)
|
10
12
|
rspec (2.1.0)
|
11
13
|
rspec-core (~> 2.1.0)
|
12
14
|
rspec-expectations (~> 2.1.0)
|
data/Rakefile
CHANGED
data/lib/textractor.rb
CHANGED
@@ -13,7 +13,7 @@ module Textractor::Extractors
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def text_from_path(path)
|
16
|
-
command = "wvWare -c utf-8 --nographics -x #{wvText_path}
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{Escape.shell_single_word(path)}"
|
17
17
|
puts command if $DEBUG
|
18
18
|
`#{command}`.strip
|
19
19
|
end
|
data/lib/textractor/version.rb
CHANGED
data/textractor.gemspec
CHANGED
@@ -16,7 +16,9 @@ Gem::Specification.new do |s|
|
|
16
16
|
s.add_development_dependency "bundler", ">= 1.0.0"
|
17
17
|
s.add_development_dependency "rspec", "~> 2.1.0"
|
18
18
|
|
19
|
-
s.
|
19
|
+
s.add_runtime_dependency "escape", ">=0.0.4"
|
20
|
+
|
21
|
+
s.files = `git ls-files`.split("\n").reject{|f| f.gsub(/"/, "") =~ /^spec/}
|
20
22
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
21
23
|
s.require_path = 'lib'
|
22
24
|
s.extra_rdoc_files = ["LICENSE", "README.md"]
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-07-
|
18
|
+
date: 2011-07-29 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,6 +50,22 @@ dependencies:
|
|
50
50
|
version: 2.1.0
|
51
51
|
type: :development
|
52
52
|
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: escape
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 23
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
- 0
|
65
|
+
- 4
|
66
|
+
version: 0.0.4
|
67
|
+
type: :runtime
|
68
|
+
version_requirements: *id003
|
53
69
|
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
54
70
|
email:
|
55
71
|
- michael@diminishing.org
|
@@ -81,19 +97,6 @@ files:
|
|
81
97
|
- lib/textractor/extractors/pdf_extractor.rb
|
82
98
|
- lib/textractor/extractors/text_extractor.rb
|
83
99
|
- lib/textractor/version.rb
|
84
|
-
- spec/content_type_detector/simple_spec.rb
|
85
|
-
- spec/fixtures/document .doc
|
86
|
-
- spec/fixtures/document .docx
|
87
|
-
- spec/fixtures/document .pdf
|
88
|
-
- spec/fixtures/document .txt
|
89
|
-
- spec/fixtures/document.doc
|
90
|
-
- spec/fixtures/document.docx
|
91
|
-
- spec/fixtures/document.pdf
|
92
|
-
- spec/fixtures/document.txt
|
93
|
-
- spec/fixtures/no_extension
|
94
|
-
- spec/integration/textractor_spec.rb
|
95
|
-
- spec/spec_helper.rb
|
96
|
-
- spec/textractor_spec.rb
|
97
100
|
- support/wvText.xml
|
98
101
|
- textractor.gemspec
|
99
102
|
- vendor/docx2txt/AUTHORS
|
@@ -1,30 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Textractor::ContentTypeDetector::Simple do
|
4
|
-
|
5
|
-
FILENAMES = [
|
6
|
-
[
|
7
|
-
"foo.pdf", "application/pdf",
|
8
|
-
"foo.doc", "application/msword",
|
9
|
-
"foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
10
|
-
"foo.txt", "text/plain",
|
11
|
-
]
|
12
|
-
]
|
13
|
-
|
14
|
-
describe '.content_type_for_path' do
|
15
|
-
FILENAMES.each do |(filename, content_type)|
|
16
|
-
context "given #{filename}" do
|
17
|
-
it "returns #{content_type}" do
|
18
|
-
Textractor::ContentTypeDetector::Simple.content_type_for_path(filename).should == content_type
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
context "given #{filename}" do
|
23
|
-
it "returns #{content_type}" do
|
24
|
-
Textractor::ContentTypeDetector::Simple.content_type_for_path(filename.upcase).should == content_type
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
end
|
data/spec/fixtures/document .doc
DELETED
Binary file
|
Binary file
|
data/spec/fixtures/document .pdf
DELETED
Binary file
|
data/spec/fixtures/document .txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
text
|
data/spec/fixtures/document.doc
DELETED
Binary file
|
data/spec/fixtures/document.docx
DELETED
Binary file
|
data/spec/fixtures/document.pdf
DELETED
Binary file
|
data/spec/fixtures/document.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
text
|
data/spec/fixtures/no_extension
DELETED
Binary file
|
@@ -1,74 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Textractor do
|
4
|
-
|
5
|
-
before do
|
6
|
-
Textractor.clear_registry
|
7
|
-
Textractor.register_basic_types
|
8
|
-
end
|
9
|
-
|
10
|
-
it 'returns the contents of word (.doc) documents' do
|
11
|
-
Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
|
12
|
-
end
|
13
|
-
|
14
|
-
it 'returns the contents of word (.docx) documents' do
|
15
|
-
Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
|
16
|
-
end
|
17
|
-
|
18
|
-
it 'returns the contents of pdf documents' do
|
19
|
-
Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
|
20
|
-
end
|
21
|
-
|
22
|
-
it 'returns the contents of text documents' do
|
23
|
-
Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
|
24
|
-
end
|
25
|
-
|
26
|
-
it 'allows the user to specify content type to avoid internal resolution' do
|
27
|
-
Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
|
28
|
-
end
|
29
|
-
|
30
|
-
it 'raises an exception when the content type is unable to be determined' do
|
31
|
-
expect {
|
32
|
-
Textractor.text_from_path(fixture_path("no_extension"))
|
33
|
-
}.to raise_error(Textractor::UnknownContentType)
|
34
|
-
end
|
35
|
-
|
36
|
-
it 'raises an exception when the path specified does not exist' do
|
37
|
-
expect {
|
38
|
-
Textractor.text_from_path('non-existant')
|
39
|
-
}.to raise_error(Textractor::FileNotFound)
|
40
|
-
end
|
41
|
-
|
42
|
-
it 'raises an exception when there is no extractor defined for the content type' do
|
43
|
-
Textractor.clear_registry
|
44
|
-
|
45
|
-
expect {
|
46
|
-
Textractor.text_from_path(fixture_path('document.pdf'))
|
47
|
-
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
48
|
-
end
|
49
|
-
|
50
|
-
it 'allows content type extractors to be removed' do
|
51
|
-
Textractor.remove_content_type("application/pdf")
|
52
|
-
|
53
|
-
expect {
|
54
|
-
Textractor.text_from_path(fixture_path('document.pdf'))
|
55
|
-
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
56
|
-
end
|
57
|
-
|
58
|
-
it 'returns the contents of doc files with a space in the path' do
|
59
|
-
Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
|
60
|
-
end
|
61
|
-
|
62
|
-
it 'returns the contents of docx files with a space in the path' do
|
63
|
-
Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
|
64
|
-
end
|
65
|
-
|
66
|
-
it 'returns the contents of pdf files with a space in the path' do
|
67
|
-
Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
|
68
|
-
end
|
69
|
-
|
70
|
-
it 'returns the contents of txt files with a space in the path' do
|
71
|
-
Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
|
72
|
-
end
|
73
|
-
|
74
|
-
end
|
data/spec/spec_helper.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'bundler/setup'
|
5
|
-
require 'rspec'
|
6
|
-
require 'textractor'
|
7
|
-
|
8
|
-
def fixture_path(path)
|
9
|
-
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
|
10
|
-
end
|
11
|
-
|
12
|
-
RSpec.configure do |config|
|
13
|
-
|
14
|
-
end
|
data/spec/textractor_spec.rb
DELETED
@@ -1,104 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
class TestExtractor
|
4
|
-
|
5
|
-
def text_from_path(path)
|
6
|
-
path
|
7
|
-
end
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
describe Textractor do
|
12
|
-
|
13
|
-
before do
|
14
|
-
Textractor.clear_registry
|
15
|
-
end
|
16
|
-
|
17
|
-
describe ".text_from_path" do
|
18
|
-
before do
|
19
|
-
File.stub(:exists?).and_return(true)
|
20
|
-
Textractor.stub(:content_type_for_path).and_return('test')
|
21
|
-
Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
|
22
|
-
end
|
23
|
-
|
24
|
-
it 'extracts the text from a given path' do
|
25
|
-
Textractor.text_from_path('document').should == 'document'
|
26
|
-
end
|
27
|
-
|
28
|
-
it 'uses content_type_for_path to determine the content type' do
|
29
|
-
Textractor.should_receive(:content_type_for_path).with('document')
|
30
|
-
Textractor.text_from_path('document')
|
31
|
-
end
|
32
|
-
|
33
|
-
it 'uses extractor_for_content_type to look up the correct extractor' do
|
34
|
-
Textractor.should_receive(:extractor_for_content_type).with('test')
|
35
|
-
Textractor.text_from_path('document')
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|
39
|
-
|
40
|
-
describe ".register_content_type" do
|
41
|
-
|
42
|
-
it 'raises an exception if an extractor is already defined for that content type' do
|
43
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
44
|
-
|
45
|
-
expect {
|
46
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
47
|
-
}.to raise_error(Textractor::ContentTypeAlreadyRegistered)
|
48
|
-
end
|
49
|
-
|
50
|
-
it 'takes a block for simple cases' do
|
51
|
-
File.stub(:exists?).and_return(true)
|
52
|
-
Textractor.stub(:content_type_for_path).and_return('test')
|
53
|
-
Textractor.register_content_type('test') do |path|
|
54
|
-
path
|
55
|
-
end
|
56
|
-
|
57
|
-
Textractor.text_from_path('document').should == 'document'
|
58
|
-
end
|
59
|
-
|
60
|
-
end
|
61
|
-
|
62
|
-
describe ".extractor_for_content_type" do
|
63
|
-
before do
|
64
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
65
|
-
end
|
66
|
-
|
67
|
-
it 'returns the extractor for the content type' do
|
68
|
-
Textractor.extractor_for_content_type("text/plain").should == TestExtractor
|
69
|
-
end
|
70
|
-
|
71
|
-
it 'raises an exception when no extractor is defined for that content type' do
|
72
|
-
expect {
|
73
|
-
Textractor.extractor_for_content_type("unknown")
|
74
|
-
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
describe ".content_type_for_path" do
|
79
|
-
|
80
|
-
it 'returns the content type based on the file extension' do
|
81
|
-
Textractor.content_type_for_path("document.pdf").should == "application/pdf"
|
82
|
-
end
|
83
|
-
|
84
|
-
it 'raises an exception if it cannot determine the content type' do
|
85
|
-
expect {
|
86
|
-
Textractor.content_type_for_path('unknown')
|
87
|
-
}.to raise_error(Textractor::UnknownContentType)
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
describe ".clear_registry" do
|
93
|
-
before do
|
94
|
-
Textractor.register_content_type("text/plain", TestExtractor)
|
95
|
-
end
|
96
|
-
|
97
|
-
it 'clears the registered content types and their respective extractors' do
|
98
|
-
Textractor.clear_registry
|
99
|
-
Textractor.extractors.should be_empty
|
100
|
-
end
|
101
|
-
|
102
|
-
end
|
103
|
-
|
104
|
-
end
|