textractor 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- textractor (0.1.4)
4
+ textractor (0.1.6)
5
+ escape (>= 0.0.4)
5
6
 
6
7
  GEM
7
8
  remote: http://rubygems.org/
8
9
  specs:
9
10
  diff-lcs (1.1.2)
11
+ escape (0.0.4)
10
12
  rspec (2.1.0)
11
13
  rspec-core (~> 2.1.0)
12
14
  rspec-expectations (~> 2.1.0)
data/Rakefile CHANGED
@@ -15,7 +15,7 @@ end
15
15
 
16
16
  task :default => :spec
17
17
 
18
- require 'rake/rdoctask'
18
+ require 'rdoc/task'
19
19
  Rake::RDocTask.new do |rdoc|
20
20
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
21
21
 
@@ -64,3 +64,4 @@ module Textractor
64
64
  end
65
65
 
66
66
  require 'textractor/content_type_detector'
67
+ require 'escape'
@@ -13,7 +13,7 @@ module Textractor::Extractors
13
13
  end
14
14
 
15
15
  def text_from_path(path)
16
- command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
16
+ command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{Escape.shell_single_word(path)}"
17
17
  puts command if $DEBUG
18
18
  `#{command}`.strip
19
19
  end
@@ -14,7 +14,7 @@ module Textractor::Extractors
14
14
 
15
15
 
16
16
  def text_from_path(path)
17
- `#{docx2txt_path} '#{path}' -`.strip
17
+ `#{docx2txt_path} #{Escape.shell_single_word(path)} -`.strip
18
18
  end
19
19
 
20
20
  private
@@ -3,7 +3,7 @@ module Textractor::Extractors
3
3
  class PDFExtractor
4
4
 
5
5
  def text_from_path(path)
6
- `pdftotext '#{path}' - 2>/dev/null`.strip
6
+ `pdftotext #{Escape.shell_single_word(path)} - 2>/dev/null`.strip
7
7
  end
8
8
 
9
9
  end
@@ -1,3 +1,3 @@
1
1
  module Textractor
2
- VERSION = '0.1.6'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -16,7 +16,9 @@ Gem::Specification.new do |s|
16
16
  s.add_development_dependency "bundler", ">= 1.0.0"
17
17
  s.add_development_dependency "rspec", "~> 2.1.0"
18
18
 
19
- s.files = `git ls-files`.split("\n")
19
+ s.add_runtime_dependency "escape", ">=0.0.4"
20
+
21
+ s.files = `git ls-files`.split("\n").reject{|f| f.gsub(/"/, "") =~ /^spec/}
20
22
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
21
23
  s.require_path = 'lib'
22
24
  s.extra_rdoc_files = ["LICENSE", "README.md"]
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 1
9
- - 6
10
- version: 0.1.6
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-07-22 00:00:00 -04:00
18
+ date: 2011-07-29 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,22 @@ dependencies:
50
50
  version: 2.1.0
51
51
  type: :development
52
52
  version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: escape
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 23
62
+ segments:
63
+ - 0
64
+ - 0
65
+ - 4
66
+ version: 0.0.4
67
+ type: :runtime
68
+ version_requirements: *id003
53
69
  description: simple wrapper around CLI for extracting text from PDF and Word documents
54
70
  email:
55
71
  - michael@diminishing.org
@@ -81,19 +97,6 @@ files:
81
97
  - lib/textractor/extractors/pdf_extractor.rb
82
98
  - lib/textractor/extractors/text_extractor.rb
83
99
  - lib/textractor/version.rb
84
- - spec/content_type_detector/simple_spec.rb
85
- - spec/fixtures/document .doc
86
- - spec/fixtures/document .docx
87
- - spec/fixtures/document .pdf
88
- - spec/fixtures/document .txt
89
- - spec/fixtures/document.doc
90
- - spec/fixtures/document.docx
91
- - spec/fixtures/document.pdf
92
- - spec/fixtures/document.txt
93
- - spec/fixtures/no_extension
94
- - spec/integration/textractor_spec.rb
95
- - spec/spec_helper.rb
96
- - spec/textractor_spec.rb
97
100
  - support/wvText.xml
98
101
  - textractor.gemspec
99
102
  - vendor/docx2txt/AUTHORS
@@ -1,30 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Textractor::ContentTypeDetector::Simple do
4
-
5
- FILENAMES = [
6
- [
7
- "foo.pdf", "application/pdf",
8
- "foo.doc", "application/msword",
9
- "foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
10
- "foo.txt", "text/plain",
11
- ]
12
- ]
13
-
14
- describe '.content_type_for_path' do
15
- FILENAMES.each do |(filename, content_type)|
16
- context "given #{filename}" do
17
- it "returns #{content_type}" do
18
- Textractor::ContentTypeDetector::Simple.content_type_for_path(filename).should == content_type
19
- end
20
- end
21
-
22
- context "given #{filename}" do
23
- it "returns #{content_type}" do
24
- Textractor::ContentTypeDetector::Simple.content_type_for_path(filename.upcase).should == content_type
25
- end
26
- end
27
- end
28
- end
29
-
30
- end
Binary file
Binary file
Binary file
@@ -1 +0,0 @@
1
- text
Binary file
Binary file
Binary file
@@ -1 +0,0 @@
1
- text
Binary file
@@ -1,74 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Textractor do
4
-
5
- before do
6
- Textractor.clear_registry
7
- Textractor.register_basic_types
8
- end
9
-
10
- it 'returns the contents of word (.doc) documents' do
11
- Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
12
- end
13
-
14
- it 'returns the contents of word (.docx) documents' do
15
- Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
16
- end
17
-
18
- it 'returns the contents of pdf documents' do
19
- Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
20
- end
21
-
22
- it 'returns the contents of text documents' do
23
- Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
24
- end
25
-
26
- it 'allows the user to specify content type to avoid internal resolution' do
27
- Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
28
- end
29
-
30
- it 'raises an exception when the content type is unable to be determined' do
31
- expect {
32
- Textractor.text_from_path(fixture_path("no_extension"))
33
- }.to raise_error(Textractor::UnknownContentType)
34
- end
35
-
36
- it 'raises an exception when the path specified does not exist' do
37
- expect {
38
- Textractor.text_from_path('non-existant')
39
- }.to raise_error(Textractor::FileNotFound)
40
- end
41
-
42
- it 'raises an exception when there is no extractor defined for the content type' do
43
- Textractor.clear_registry
44
-
45
- expect {
46
- Textractor.text_from_path(fixture_path('document.pdf'))
47
- }.to raise_error(Textractor::ContentTypeNotRegistered)
48
- end
49
-
50
- it 'allows content type extractors to be removed' do
51
- Textractor.remove_content_type("application/pdf")
52
-
53
- expect {
54
- Textractor.text_from_path(fixture_path('document.pdf'))
55
- }.to raise_error(Textractor::ContentTypeNotRegistered)
56
- end
57
-
58
- it 'returns the contents of doc files with a space in the path' do
59
- Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
60
- end
61
-
62
- it 'returns the contents of docx files with a space in the path' do
63
- Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
64
- end
65
-
66
- it 'returns the contents of pdf files with a space in the path' do
67
- Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
68
- end
69
-
70
- it 'returns the contents of txt files with a space in the path' do
71
- Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
72
- end
73
-
74
- end
@@ -1,14 +0,0 @@
1
- $LOAD_PATH.unshift(File.dirname(__FILE__))
2
-
3
- require 'rubygems'
4
- require 'bundler/setup'
5
- require 'rspec'
6
- require 'textractor'
7
-
8
- def fixture_path(path)
9
- File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
10
- end
11
-
12
- RSpec.configure do |config|
13
-
14
- end
@@ -1,104 +0,0 @@
1
- require 'spec_helper'
2
-
3
- class TestExtractor
4
-
5
- def text_from_path(path)
6
- path
7
- end
8
-
9
- end
10
-
11
- describe Textractor do
12
-
13
- before do
14
- Textractor.clear_registry
15
- end
16
-
17
- describe ".text_from_path" do
18
- before do
19
- File.stub(:exists?).and_return(true)
20
- Textractor.stub(:content_type_for_path).and_return('test')
21
- Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
22
- end
23
-
24
- it 'extracts the text from a given path' do
25
- Textractor.text_from_path('document').should == 'document'
26
- end
27
-
28
- it 'uses content_type_for_path to determine the content type' do
29
- Textractor.should_receive(:content_type_for_path).with('document')
30
- Textractor.text_from_path('document')
31
- end
32
-
33
- it 'uses extractor_for_content_type to look up the correct extractor' do
34
- Textractor.should_receive(:extractor_for_content_type).with('test')
35
- Textractor.text_from_path('document')
36
- end
37
-
38
- end
39
-
40
- describe ".register_content_type" do
41
-
42
- it 'raises an exception if an extractor is already defined for that content type' do
43
- Textractor.register_content_type("text/plain", TestExtractor)
44
-
45
- expect {
46
- Textractor.register_content_type("text/plain", TestExtractor)
47
- }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
48
- end
49
-
50
- it 'takes a block for simple cases' do
51
- File.stub(:exists?).and_return(true)
52
- Textractor.stub(:content_type_for_path).and_return('test')
53
- Textractor.register_content_type('test') do |path|
54
- path
55
- end
56
-
57
- Textractor.text_from_path('document').should == 'document'
58
- end
59
-
60
- end
61
-
62
- describe ".extractor_for_content_type" do
63
- before do
64
- Textractor.register_content_type("text/plain", TestExtractor)
65
- end
66
-
67
- it 'returns the extractor for the content type' do
68
- Textractor.extractor_for_content_type("text/plain").should == TestExtractor
69
- end
70
-
71
- it 'raises an exception when no extractor is defined for that content type' do
72
- expect {
73
- Textractor.extractor_for_content_type("unknown")
74
- }.to raise_error(Textractor::ContentTypeNotRegistered)
75
- end
76
- end
77
-
78
- describe ".content_type_for_path" do
79
-
80
- it 'returns the content type based on the file extension' do
81
- Textractor.content_type_for_path("document.pdf").should == "application/pdf"
82
- end
83
-
84
- it 'raises an exception if it cannot determine the content type' do
85
- expect {
86
- Textractor.content_type_for_path('unknown')
87
- }.to raise_error(Textractor::UnknownContentType)
88
- end
89
-
90
- end
91
-
92
- describe ".clear_registry" do
93
- before do
94
- Textractor.register_content_type("text/plain", TestExtractor)
95
- end
96
-
97
- it 'clears the registered content types and their respective extractors' do
98
- Textractor.clear_registry
99
- Textractor.extractors.should be_empty
100
- end
101
-
102
- end
103
-
104
- end