textractor 0.1.6 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- textractor (0.1.4)
4
+ textractor (0.1.6)
5
+ escape (>= 0.0.4)
5
6
 
6
7
  GEM
7
8
  remote: http://rubygems.org/
8
9
  specs:
9
10
  diff-lcs (1.1.2)
11
+ escape (0.0.4)
10
12
  rspec (2.1.0)
11
13
  rspec-core (~> 2.1.0)
12
14
  rspec-expectations (~> 2.1.0)
data/Rakefile CHANGED
@@ -15,7 +15,7 @@ end
15
15
 
16
16
  task :default => :spec
17
17
 
18
- require 'rake/rdoctask'
18
+ require 'rdoc/task'
19
19
  Rake::RDocTask.new do |rdoc|
20
20
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
21
21
 
@@ -64,3 +64,4 @@ module Textractor
64
64
  end
65
65
 
66
66
  require 'textractor/content_type_detector'
67
+ require 'escape'
@@ -13,7 +13,7 @@ module Textractor::Extractors
13
13
  end
14
14
 
15
15
  def text_from_path(path)
16
- command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
16
+ command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{Escape.shell_single_word(path)}"
17
17
  puts command if $DEBUG
18
18
  `#{command}`.strip
19
19
  end
@@ -14,7 +14,7 @@ module Textractor::Extractors
14
14
 
15
15
 
16
16
  def text_from_path(path)
17
- `#{docx2txt_path} '#{path}' -`.strip
17
+ `#{docx2txt_path} #{Escape.shell_single_word(path)} -`.strip
18
18
  end
19
19
 
20
20
  private
@@ -3,7 +3,7 @@ module Textractor::Extractors
3
3
  class PDFExtractor
4
4
 
5
5
  def text_from_path(path)
6
- `pdftotext '#{path}' - 2>/dev/null`.strip
6
+ `pdftotext #{Escape.shell_single_word(path)} - 2>/dev/null`.strip
7
7
  end
8
8
 
9
9
  end
@@ -1,3 +1,3 @@
1
1
  module Textractor
2
- VERSION = '0.1.6'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -16,7 +16,9 @@ Gem::Specification.new do |s|
16
16
  s.add_development_dependency "bundler", ">= 1.0.0"
17
17
  s.add_development_dependency "rspec", "~> 2.1.0"
18
18
 
19
- s.files = `git ls-files`.split("\n")
19
+ s.add_runtime_dependency "escape", ">=0.0.4"
20
+
21
+ s.files = `git ls-files`.split("\n").reject{|f| f.gsub(/"/, "") =~ /^spec/}
20
22
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
21
23
  s.require_path = 'lib'
22
24
  s.extra_rdoc_files = ["LICENSE", "README.md"]
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 1
9
- - 6
10
- version: 0.1.6
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-07-22 00:00:00 -04:00
18
+ date: 2011-07-29 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,22 @@ dependencies:
50
50
  version: 2.1.0
51
51
  type: :development
52
52
  version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: escape
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 23
62
+ segments:
63
+ - 0
64
+ - 0
65
+ - 4
66
+ version: 0.0.4
67
+ type: :runtime
68
+ version_requirements: *id003
53
69
  description: simple wrapper around CLI for extracting text from PDF and Word documents
54
70
  email:
55
71
  - michael@diminishing.org
@@ -81,19 +97,6 @@ files:
81
97
  - lib/textractor/extractors/pdf_extractor.rb
82
98
  - lib/textractor/extractors/text_extractor.rb
83
99
  - lib/textractor/version.rb
84
- - spec/content_type_detector/simple_spec.rb
85
- - spec/fixtures/document .doc
86
- - spec/fixtures/document .docx
87
- - spec/fixtures/document .pdf
88
- - spec/fixtures/document .txt
89
- - spec/fixtures/document.doc
90
- - spec/fixtures/document.docx
91
- - spec/fixtures/document.pdf
92
- - spec/fixtures/document.txt
93
- - spec/fixtures/no_extension
94
- - spec/integration/textractor_spec.rb
95
- - spec/spec_helper.rb
96
- - spec/textractor_spec.rb
97
100
  - support/wvText.xml
98
101
  - textractor.gemspec
99
102
  - vendor/docx2txt/AUTHORS
@@ -1,30 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Textractor::ContentTypeDetector::Simple do
4
-
5
- FILENAMES = [
6
- [
7
- "foo.pdf", "application/pdf",
8
- "foo.doc", "application/msword",
9
- "foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
10
- "foo.txt", "text/plain",
11
- ]
12
- ]
13
-
14
- describe '.content_type_for_path' do
15
- FILENAMES.each do |(filename, content_type)|
16
- context "given #{filename}" do
17
- it "returns #{content_type}" do
18
- Textractor::ContentTypeDetector::Simple.content_type_for_path(filename).should == content_type
19
- end
20
- end
21
-
22
- context "given #{filename}" do
23
- it "returns #{content_type}" do
24
- Textractor::ContentTypeDetector::Simple.content_type_for_path(filename.upcase).should == content_type
25
- end
26
- end
27
- end
28
- end
29
-
30
- end
Binary file
Binary file
Binary file
@@ -1 +0,0 @@
1
- text
Binary file
Binary file
Binary file
@@ -1 +0,0 @@
1
- text
Binary file
@@ -1,74 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Textractor do
4
-
5
- before do
6
- Textractor.clear_registry
7
- Textractor.register_basic_types
8
- end
9
-
10
- it 'returns the contents of word (.doc) documents' do
11
- Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
12
- end
13
-
14
- it 'returns the contents of word (.docx) documents' do
15
- Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
16
- end
17
-
18
- it 'returns the contents of pdf documents' do
19
- Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
20
- end
21
-
22
- it 'returns the contents of text documents' do
23
- Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
24
- end
25
-
26
- it 'allows the user to specify content type to avoid internal resolution' do
27
- Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
28
- end
29
-
30
- it 'raises an exception when the content type is unable to be determined' do
31
- expect {
32
- Textractor.text_from_path(fixture_path("no_extension"))
33
- }.to raise_error(Textractor::UnknownContentType)
34
- end
35
-
36
- it 'raises an exception when the path specified does not exist' do
37
- expect {
38
- Textractor.text_from_path('non-existant')
39
- }.to raise_error(Textractor::FileNotFound)
40
- end
41
-
42
- it 'raises an exception when there is no extractor defined for the content type' do
43
- Textractor.clear_registry
44
-
45
- expect {
46
- Textractor.text_from_path(fixture_path('document.pdf'))
47
- }.to raise_error(Textractor::ContentTypeNotRegistered)
48
- end
49
-
50
- it 'allows content type extractors to be removed' do
51
- Textractor.remove_content_type("application/pdf")
52
-
53
- expect {
54
- Textractor.text_from_path(fixture_path('document.pdf'))
55
- }.to raise_error(Textractor::ContentTypeNotRegistered)
56
- end
57
-
58
- it 'returns the contents of doc files with a space in the path' do
59
- Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
60
- end
61
-
62
- it 'returns the contents of docx files with a space in the path' do
63
- Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
64
- end
65
-
66
- it 'returns the contents of pdf files with a space in the path' do
67
- Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
68
- end
69
-
70
- it 'returns the contents of txt files with a space in the path' do
71
- Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
72
- end
73
-
74
- end
@@ -1,14 +0,0 @@
1
- $LOAD_PATH.unshift(File.dirname(__FILE__))
2
-
3
- require 'rubygems'
4
- require 'bundler/setup'
5
- require 'rspec'
6
- require 'textractor'
7
-
8
- def fixture_path(path)
9
- File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
10
- end
11
-
12
- RSpec.configure do |config|
13
-
14
- end
@@ -1,104 +0,0 @@
1
- require 'spec_helper'
2
-
3
- class TestExtractor
4
-
5
- def text_from_path(path)
6
- path
7
- end
8
-
9
- end
10
-
11
- describe Textractor do
12
-
13
- before do
14
- Textractor.clear_registry
15
- end
16
-
17
- describe ".text_from_path" do
18
- before do
19
- File.stub(:exists?).and_return(true)
20
- Textractor.stub(:content_type_for_path).and_return('test')
21
- Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
22
- end
23
-
24
- it 'extracts the text from a given path' do
25
- Textractor.text_from_path('document').should == 'document'
26
- end
27
-
28
- it 'uses content_type_for_path to determine the content type' do
29
- Textractor.should_receive(:content_type_for_path).with('document')
30
- Textractor.text_from_path('document')
31
- end
32
-
33
- it 'uses extractor_for_content_type to look up the correct extractor' do
34
- Textractor.should_receive(:extractor_for_content_type).with('test')
35
- Textractor.text_from_path('document')
36
- end
37
-
38
- end
39
-
40
- describe ".register_content_type" do
41
-
42
- it 'raises an exception if an extractor is already defined for that content type' do
43
- Textractor.register_content_type("text/plain", TestExtractor)
44
-
45
- expect {
46
- Textractor.register_content_type("text/plain", TestExtractor)
47
- }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
48
- end
49
-
50
- it 'takes a block for simple cases' do
51
- File.stub(:exists?).and_return(true)
52
- Textractor.stub(:content_type_for_path).and_return('test')
53
- Textractor.register_content_type('test') do |path|
54
- path
55
- end
56
-
57
- Textractor.text_from_path('document').should == 'document'
58
- end
59
-
60
- end
61
-
62
- describe ".extractor_for_content_type" do
63
- before do
64
- Textractor.register_content_type("text/plain", TestExtractor)
65
- end
66
-
67
- it 'returns the extractor for the content type' do
68
- Textractor.extractor_for_content_type("text/plain").should == TestExtractor
69
- end
70
-
71
- it 'raises an exception when no extractor is defined for that content type' do
72
- expect {
73
- Textractor.extractor_for_content_type("unknown")
74
- }.to raise_error(Textractor::ContentTypeNotRegistered)
75
- end
76
- end
77
-
78
- describe ".content_type_for_path" do
79
-
80
- it 'returns the content type based on the file extension' do
81
- Textractor.content_type_for_path("document.pdf").should == "application/pdf"
82
- end
83
-
84
- it 'raises an exception if it cannot determine the content type' do
85
- expect {
86
- Textractor.content_type_for_path('unknown')
87
- }.to raise_error(Textractor::UnknownContentType)
88
- end
89
-
90
- end
91
-
92
- describe ".clear_registry" do
93
- before do
94
- Textractor.register_content_type("text/plain", TestExtractor)
95
- end
96
-
97
- it 'clears the registered content types and their respective extractors' do
98
- Textractor.clear_registry
99
- Textractor.extractors.should be_empty
100
- end
101
-
102
- end
103
-
104
- end