textractor 0.0.3 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -17,5 +17,7 @@ tmtags
17
17
  coverage
18
18
  rdoc
19
19
  pkg
20
+ *.gem
21
+ .bundle
20
22
 
21
23
  ## PROJECT::SPECIFIC
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source :gemcutter
2
+
3
+ # Specify your gem's dependencies in textractor.gemspec
4
+ gemspec
@@ -0,0 +1,17 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ textractor (0.1.2)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ rspec (1.3.0)
10
+
11
+ PLATFORMS
12
+ ruby
13
+
14
+ DEPENDENCIES
15
+ bundler (>= 1.0.0)
16
+ rspec (~> 1.3.0)
17
+ textractor!
data/README.md CHANGED
@@ -28,18 +28,18 @@ this may not work on all systems due to dependency issues.
28
28
  Due to textractor's reliance on command line tools all the methods in
29
29
  textractor work on paths not File objects.
30
30
 
31
- document = Textractor::Document.new(path_to_document)
32
- document.text # => "Ruby on rails developer"
33
-
34
- There is also a convenience method on Textractor.
35
-
36
- Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
31
+ Textractor.text_from_path(path_to_document) # => "Ruby on rails developer"
37
32
 
38
33
  Textractor will attempt to guess what type of document you're trying
39
34
  to extract text from. However, if you know the content type of your
40
35
  document, you can provide it and Textractor won't guess.
41
36
 
42
- Textractor.text_from_file(path_to_document, :content_type => "application/doc")
37
+ Textractor.text_from_path(path_to_document, :content_type => "application/msword")
38
+
39
+ ## TODO
40
+
41
+ * Remove vendored docx2txt perl script
42
+ * Replace as much as possible with pure ruby
43
43
 
44
44
  ## Note on Patches/Pull Requests
45
45
 
data/Rakefile CHANGED
@@ -1,22 +1,5 @@
1
- require 'rubygems'
2
- require 'rake'
3
-
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "textractor"
8
- gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
9
- gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
10
- gem.email = "mguterl@gmail.com"
11
- gem.homepage = "http://github.com/mguterl/textractor"
12
- gem.authors = ["Michael Guterl"]
13
- gem.add_development_dependency "rspec", ">= 1.3.0"
14
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
- end
16
- Jeweler::GemcutterTasks.new
17
- rescue LoadError
18
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19
- end
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
20
3
 
21
4
  require 'spec/rake/spectask'
22
5
  Spec::Rake::SpecTask.new(:spec) do |spec|
@@ -30,8 +13,6 @@ Spec::Rake::SpecTask.new(:rcov) do |spec|
30
13
  spec.rcov = true
31
14
  end
32
15
 
33
- task :spec => :check_dependencies
34
-
35
16
  task :default => :spec
36
17
 
37
18
  require 'rake/rdoctask'
@@ -1,18 +1,64 @@
1
1
  module Textractor
2
- autoload :Document, "textractor/document"
3
2
 
4
- def self.text_from_file(filename, options = {})
5
- Textractor::Document.new(filename, options).text
3
+ UnknownContentType = Class.new(StandardError)
4
+ FileNotFound = Class.new(StandardError)
5
+ ContentTypeAlreadyRegistered = Class.new(StandardError)
6
+ ContentTypeNotRegistered = Class.new(StandardError)
7
+
8
+ autoload :Extractors, "textractor/extractors"
9
+
10
# Extracts the text of the document stored at +path+.
#
# @param path [String] location of the document on disk
# @param options [Hash] optional settings; +:content_type+ skips the
#   extension-based guess and selects the extractor directly
# @return [String] whatever the selected extractor returns for +path+
# @raise [FileNotFound] when nothing exists at +path+
# @raise [UnknownContentType] when no content type is given and none can be guessed
# @raise [ContentTypeNotRegistered] when no extractor handles the content type
def self.text_from_path(path, options = {})
  # File.exists? was removed in Ruby 3.2. The legacy name is still preferred
  # where it is available so code that stubs it keeps behaving as before.
  path_exists = File.respond_to?(:exists?) ? File.exists?(path) : File.exist?(path)
  raise FileNotFound, "#{path} does not exist" unless path_exists

  # The block form of fetch only guesses when the caller gave no content type.
  content_type = options.fetch(:content_type) { content_type_for_path(path) }
  extractor_class = extractor_for_content_type(content_type)

  extractor_class.new.text_from_path(path)
end
7
18
 
8
- DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../support/wvText.xml")
19
# Guesses the content type of a document from its file extension.
#
# The extension is compared case-insensitively, so "resume.PDF" and
# "resume.pdf" resolve to the same content type.
#
# @param path [String] path (or bare file name) of the document
# @return [String] the content type associated with the extension
# @raise [UnknownContentType] when the extension is missing or not recognised
def self.content_type_for_path(path)
  case File.extname(path).downcase
  when '.pdf'
    'application/pdf'
  when '.doc'
    'application/msword'
  when '.docx'
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  when '.txt'
    'text/plain'
  else
    raise UnknownContentType, "unable to determine content type for #{path}"
  end
end
9
33
 
10
- def self.wvText_path
11
- @wvText_path || DEFAULT_WV_TEXT_PATH
34
# Associates +extractor+ with +content_type+ in the registry.
#
# @param content_type [String] content type the extractor handles
# @param extractor [Class] class whose instances respond to +text_from_path+
# @return [Class] the extractor that was stored
# @raise [ContentTypeAlreadyRegistered] when the content type already has an extractor
def self.register_content_type(content_type, extractor)
  if extractors[content_type]
    raise ContentTypeAlreadyRegistered, "#{content_type} is already registered"
  end

  extractors[content_type] = extractor
end
13
38
 
14
- def self.wvText_path=(path)
15
- @wvText_path = path
39
# Drops the extractor registered for +content_type+, if any.
#
# @param content_type [String] content type to unregister
# @return [Class, nil] the extractor that was removed, or nil when none was registered
def self.remove_content_type(content_type)
  registry = extractors
  registry.delete(content_type)
end
17
42
 
43
# Looks up the extractor registered for +content_type+.
#
# @param content_type [String] content type to resolve
# @return [Class] the registered extractor
# @raise [ContentTypeNotRegistered] when nothing is registered for the content type
def self.extractor_for_content_type(content_type)
  extractor = extractors[content_type]
  return extractor if extractor

  raise ContentTypeNotRegistered, "#{content_type} is not registered with Textractor"
end
46
+
47
# The registry mapping content types to extractors, created empty on first use.
#
# @return [Hash] the same Hash instance on every call until the registry is reset
def self.extractors
  @extractors = {} unless @extractors
  @extractors
end
50
+
51
# Replaces the registry with a fresh, empty Hash, discarding every registration.
#
# @return [Hash] the new empty registry
def self.clear_registry
  @extractors = Hash.new
end
54
+
55
# Registers the extractors shipped with the library, in this order:
# PDF, Word (.doc), Word (.docx), plain text.
#
# @return [Object] the result of the last registration
# @raise [ContentTypeAlreadyRegistered] when one of the types is already registered
def self.register_basic_types
  basic_types = [
    ["application/pdf", :PDFExtractor],
    ["application/msword", :DocExtractor],
    ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", :DocxExtractor],
    ["text/plain", :TextExtractor]
  ]

  # Each extractor constant is resolved only when its turn comes, one
  # registration at a time.
  registered = basic_types.map do |content_type, extractor_name|
    register_content_type(content_type, Extractors.const_get(extractor_name))
  end
  registered.last
end
61
+
62
+ register_basic_types
63
+
18
64
  end
@@ -0,0 +1,12 @@
1
module Textractor

  # Namespace for the per-format extractor classes. Each constant is declared
  # with autoload, so its file is only required when the constant is first used.
  module Extractors

    [
      [:PDFExtractor,  'pdf_extractor'],
      [:DocExtractor,  'doc_extractor'],
      [:DocxExtractor, 'docx_extractor'],
      [:TextExtractor, 'text_extractor']
    ].each do |constant_name, file_name|
      autoload constant_name, "textractor/extractors/#{file_name}"
    end

  end

end
@@ -0,0 +1,29 @@
1
+ module Textractor::Extractors
2
+
3
+ class DocExtractor
4
+
5
+ DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
6
+
7
+ class << self
8
+ attr_writer :wvText_path
9
+
10
+ def wvText_path
11
+ @wvText_path || DEFAULT_WV_TEXT_PATH
12
+ end
13
+ end
14
+
15
+ def text_from_path(path)
16
+ command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
17
+ puts command if $DEBUG
18
+ `#{command}`.strip
19
+ end
20
+
21
+ private
22
+
23
+ def wvText_path
24
+ self.class.wvText_path
25
+ end
26
+
27
+ end
28
+
29
+ end
@@ -0,0 +1,28 @@
1
require 'shellwords'

module Textractor
  module Extractors

    # Extracts text from Word (.docx) documents by running the docx2txt
    # script and returning its stripped standard output.
    class DocxExtractor

      # Script used unless docx2txt_path is overridden.
      DEFAULT_DOCX2TXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../vendor/docx2txt/docx2txt.pl").freeze

      class << self
        # Overrides the script location; assign nil to fall back to the default.
        attr_writer :docx2txt_path

        # @return [String] the configured script, or DEFAULT_DOCX2TXT_PATH when unset
        def docx2txt_path
          @docx2txt_path || DEFAULT_DOCX2TXT_PATH
        end
      end

      # Runs docx2txt against the document and returns what it prints.
      #
      # The document path is shell-escaped so that spaces or shell
      # metacharacters in it are passed through as one literal argument.
      # docx2txt_path is interpolated verbatim, exactly as before, so an
      # already-configured value keeps its meaning.
      #
      # @param path [String] location of the .docx file
      # @return [String] the script's output with surrounding whitespace removed
      def text_from_path(path)
        # The trailing "-" is the script's second argument (presumably "write
        # to stdout" -- confirm against the docx2txt documentation).
        `#{docx2txt_path} #{Shellwords.escape(path.to_s)} -`.strip
      end

      private

      # Instance-level shortcut to the class-level setting.
      def docx2txt_path
        self.class.docx2txt_path
      end

    end

  end
end
@@ -0,0 +1,11 @@
1
require 'shellwords'

module Textractor
  module Extractors

    # Extracts text from PDF documents by running the pdftotext command-line
    # tool and returning its stripped standard output.
    class PDFExtractor

      # Runs pdftotext against the document and returns what it prints.
      #
      # The path is shell-escaped so that spaces or shell metacharacters in it
      # are passed to pdftotext as one literal argument. Anything the tool
      # writes to stderr is discarded (2>/dev/null).
      #
      # @param path [String] location of the PDF file
      # @return [String] pdftotext's output with surrounding whitespace removed
      def text_from_path(path)
        `pdftotext #{Shellwords.escape(path.to_s)} - 2>/dev/null`.strip
      end

    end

  end
end
@@ -0,0 +1,11 @@
1
module Textractor
  module Extractors

    # Extractor for plain-text documents: the file's contents are the text.
    class TextExtractor

      # @param path [String] location of the text file
      # @return [String] the complete, unmodified contents of the file
      def text_from_path(path)
        File.open(path) { |file| file.read }
      end

    end

  end
end
@@ -0,0 +1,29 @@
1
require 'shellwords'

module Textractor
  module Extractors

    # Extracts text from Word documents by running the wvWare command-line
    # tool and returning its stripped standard output.
    class WordExtractor

      # Config file handed to wvWare via -x unless wvText_path is overridden.
      DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze

      class << self
        # Overrides the config file path; assign nil to fall back to the default.
        attr_writer :wvText_path

        # @return [String] the configured path, or DEFAULT_WV_TEXT_PATH when unset
        def wvText_path
          @wvText_path || DEFAULT_WV_TEXT_PATH
        end
      end

      # Runs wvWare against the document and returns what it prints.
      #
      # Both paths are shell-escaped before interpolation, so paths containing
      # spaces or shell metacharacters reach wvWare as single literal arguments.
      #
      # @param path [String] location of the Word file
      # @return [String] wvWare's output with surrounding whitespace removed
      def text_from_path(path)
        config_arg = Shellwords.escape(wvText_path.to_s)
        document_arg = Shellwords.escape(path.to_s)
        command = "wvWare -c utf-8 --nographics -x #{config_arg} #{document_arg}"
        puts command if $DEBUG # echo the exact command when Ruby runs in debug mode
        `#{command}`.strip
      end

      private

      # Instance-level shortcut to the class-level setting.
      def wvText_path
        self.class.wvText_path
      end

    end

  end
end
@@ -0,0 +1,3 @@
1
# Release version of the gem, kept in its own file.
module Textractor
  VERSION = '0.1.2'
end
Binary file
Binary file
Binary file
@@ -1 +1 @@
1
- Ruby on rails developer
1
+ text
@@ -0,0 +1,58 @@
1
+ require 'spec_helper'
2
+
3
+ describe Textractor do
4
+
5
+ before do
6
+ Textractor.clear_registry
7
+ Textractor.register_basic_types
8
+ end
9
+
10
+ it 'returns the contents of word (.doc) documents' do
11
+ Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
12
+ end
13
+
14
+ it 'returns the contents of word (.docx) documents' do
15
+ Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
16
+ end
17
+
18
+ it 'returns the contents of pdf documents' do
19
+ Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
20
+ end
21
+
22
+ it 'returns the contents of text documents' do
23
+ Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
24
+ end
25
+
26
+ it 'allows the user to specify content type to avoid internal resolution' do
27
+ Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
28
+ end
29
+
30
+ it 'raises an exception when the content type is unable to be determined' do
31
+ expect {
32
+ Textractor.text_from_path(fixture_path("no_extension"))
33
+ }.to raise_error(Textractor::UnknownContentType)
34
+ end
35
+
36
+ it 'raises an exception when the path specified does not exist' do
37
+ expect {
38
+ Textractor.text_from_path('non-existant')
39
+ }.to raise_error(Textractor::FileNotFound)
40
+ end
41
+
42
+ it 'raises an exception when there is no extractor defined for the content type' do
43
+ Textractor.clear_registry
44
+
45
+ expect {
46
+ Textractor.text_from_path(fixture_path('document.pdf'))
47
+ }.to raise_error(Textractor::ContentTypeNotRegistered)
48
+ end
49
+
50
+ it 'allows content type extractors to be removed' do
51
+ Textractor.remove_content_type("application/pdf")
52
+
53
+ expect {
54
+ Textractor.text_from_path(fixture_path('document.pdf'))
55
+ }.to raise_error(Textractor::ContentTypeNotRegistered)
56
+ end
57
+
58
+ end
@@ -1,9 +1,15 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'spec'
4
+
1
5
  $LOAD_PATH.unshift(File.dirname(__FILE__))
2
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+
3
7
  require 'textractor'
4
- require 'spec'
5
- require 'spec/autorun'
8
+
9
# Absolute path of a file inside the "fixtures" directory that sits next to
# this helper file.
#
# @param path [String] file name relative to the fixtures directory
# @return [String] the expanded, absolute path
def fixture_path(path)
  helper_dir = File.dirname(__FILE__)
  File.expand_path(File.join('fixtures', path), helper_dir)
end
6
12
 
7
13
  Spec::Runner.configure do |config|
8
-
14
+
9
15
  end
@@ -1,32 +1,94 @@
1
1
  require 'spec/spec_helper'
2
2
 
3
# Stand-in extractor for the specs: it never touches the filesystem and simply
# hands back the path it was given, which makes the result easy to assert on.
TestExtractor = Class.new do

  # @param path [Object] any value
  # @return [Object] +path+, unchanged
  def text_from_path(path)
    path
  end

end
10
+
3
11
  describe Textractor do
4
12
 
5
- describe ".wvText_path" do
13
+ before do
14
+ Textractor.clear_registry
15
+ end
16
+
17
+ describe ".text_from_path" do
18
+ before do
19
+ File.stub(:exists?).and_return(true)
20
+ Textractor.stub(:content_type_for_path).and_return('test')
21
+ Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
22
+ end
23
+
24
+ it 'extracts the text from a given path' do
25
+ Textractor.text_from_path('document').should == 'document'
26
+ end
27
+
28
+ it 'uses content_type_for_path to determine the content type' do
29
+ Textractor.should_receive(:content_type_for_path).with('document')
30
+ Textractor.text_from_path('document')
31
+ end
32
+
33
+ it 'uses extractor_for_content_type to look up the correct extractor' do
34
+ Textractor.should_receive(:extractor_for_content_type).with('test')
35
+ Textractor.text_from_path('document')
36
+ end
37
+
38
+ end
39
+
40
+ describe ".register_content_type" do
41
+
42
+ it 'raises an exception if an extractor is already defined for that content type' do
43
+ Textractor.register_content_type("text/plain", TestExtractor)
44
+
45
+ expect {
46
+ Textractor.register_content_type("text/plain", TestExtractor)
47
+ }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
48
+ end
49
+
50
+ end
6
51
 
7
- it 'should default to the file provided with the gem' do
8
- Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
52
+ describe ".extractor_for_content_type" do
53
+ before do
54
+ Textractor.register_content_type("text/plain", TestExtractor)
9
55
  end
10
56
 
11
- it 'should use the new wvText_path if provided' do
12
- Textractor.wvText_path = "foo.bar"
13
- Textractor.wvText_path.should == "foo.bar"
57
+ it 'returns the extractor for the content type' do
58
+ Textractor.extractor_for_content_type("text/plain").should == TestExtractor
14
59
  end
15
60
 
61
+ it 'raises an exception when no extractor is defined for that content type' do
62
+ expect {
63
+ Textractor.extractor_for_content_type("unknown")
64
+ }.to raise_error(Textractor::ContentTypeNotRegistered)
65
+ end
16
66
  end
17
67
 
18
- describe ".text_from_file" do
68
+ describe ".content_type_for_path" do
19
69
 
20
- it 'should return the extracted text from the file' do
21
- document_path = 'word.doc'
22
- document = mock("Textractor::Document", :text => "Ruby on Rails developer")
23
- Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
24
- Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
70
+ it 'returns the content type based on the file extension' do
71
+ Textractor.content_type_for_path("document.pdf").should == "application/pdf"
72
+ end
73
+
74
+ it 'raises an exception if it cannot determine the content type' do
75
+ expect {
76
+ Textractor.content_type_for_path('unknown')
77
+ }.to raise_error(Textractor::UnknownContentType)
25
78
  end
26
79
 
27
80
  end
28
81
 
29
- after(:all) do
30
- Textractor.instance_variable_set(:"@wvText_path", nil)
82
+ describe ".clear_registry" do
83
+ before do
84
+ Textractor.register_content_type("text/plain", TestExtractor)
85
+ end
86
+
87
+ it 'clears the registered content types and their respective extractors' do
88
+ Textractor.clear_registry
89
+ Textractor.extractors.should be_empty
90
+ end
91
+
31
92
  end
93
+
32
94
  end
@@ -1,78 +1,25 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
1
  # -*- encoding: utf-8 -*-
2
+ require File.expand_path("../lib/textractor/version", __FILE__)
5
3
 
6
4
  Gem::Specification.new do |s|
7
- s.name = %q{textractor}
8
- s.version = "0.0.3"
5
+ s.name = "textractor"
6
+ s.version = Textractor::VERSION
7
+ s.platform = Gem::Platform::RUBY
8
+ s.authors = ['Michael Guterl']
9
+ s.email = ['michael@diminishing.org']
10
+ s.homepage = "http://github.com/mguterl/textractor"
11
+ s.summary = "simple wrapper around CLI for extracting text from PDF and Word documents"
12
+ s.description = "simple wrapper around CLI for extracting text from PDF and Word documents"
9
13
 
10
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Michael Guterl"]
12
- s.date = %q{2010-07-27}
13
- s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
14
- s.email = %q{mguterl@gmail.com}
15
- s.extra_rdoc_files = [
16
- "LICENSE",
17
- "README.md"
18
- ]
19
- s.files = [
20
- ".document",
21
- ".gitignore",
22
- "LICENSE",
23
- "README.md",
24
- "Rakefile",
25
- "VERSION",
26
- "lib/textractor.rb",
27
- "lib/textractor/document.rb",
28
- "spec/document_spec.rb",
29
- "spec/fixtures/document.doc",
30
- "spec/fixtures/document.docx",
31
- "spec/fixtures/document.pdf",
32
- "spec/fixtures/document.txt",
33
- "spec/spec.opts",
34
- "spec/spec_helper.rb",
35
- "spec/textractor_spec.rb",
36
- "support/wvText.xml",
37
- "textractor.gemspec",
38
- "vendor/docx2txt/AUTHORS",
39
- "vendor/docx2txt/BSDmakefile",
40
- "vendor/docx2txt/COPYING",
41
- "vendor/docx2txt/ChangeLog",
42
- "vendor/docx2txt/INSTALL",
43
- "vendor/docx2txt/Makefile",
44
- "vendor/docx2txt/README",
45
- "vendor/docx2txt/ToDo",
46
- "vendor/docx2txt/VERSION",
47
- "vendor/docx2txt/WInstall.bat",
48
- "vendor/docx2txt/docx2txt.bat",
49
- "vendor/docx2txt/docx2txt.config",
50
- "vendor/docx2txt/docx2txt.pl",
51
- "vendor/docx2txt/docx2txt.sh",
52
- "vendor/docx2txt/resume.docx"
53
- ]
54
- s.homepage = %q{http://github.com/mguterl/textractor}
55
- s.rdoc_options = ["--charset=UTF-8"]
56
- s.require_paths = ["lib"]
57
- s.rubygems_version = %q{1.3.7}
58
- s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
59
- s.test_files = [
60
- "spec/document_spec.rb",
61
- "spec/spec_helper.rb",
62
- "spec/textractor_spec.rb"
63
- ]
14
+ s.required_rubygems_version = ">= 1.3.6"
15
+ s.rubyforge_project = "textractor"
64
16
 
65
- if s.respond_to? :specification_version then
66
- current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
67
- s.specification_version = 3
17
+ s.add_development_dependency "bundler", ">= 1.0.0"
18
+ s.add_development_dependency "rspec", "~> 1.3.0"
68
19
 
69
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
70
- s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
71
- else
72
- s.add_dependency(%q<rspec>, [">= 1.3.0"])
73
- end
74
- else
75
- s.add_dependency(%q<rspec>, [">= 1.3.0"])
76
- end
20
+ s.files = `git ls-files`.split("\n")
21
+ s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
22
+ s.require_path = 'lib'
23
+ s.extra_rdoc_files = ["LICENSE", "README.md"]
24
+ s.rdoc_options = ["--charset=UTF-8"]
77
25
  end
78
-
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 0
9
- - 3
10
- version: 0.0.3
8
+ - 1
9
+ - 2
10
+ version: 0.1.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,16 +15,32 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-27 00:00:00 -04:00
18
+ date: 2010-11-06 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
22
- name: rspec
22
+ name: bundler
23
23
  prerelease: false
24
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
27
  - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 0
34
+ version: 1.0.0
35
+ type: :development
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: rspec
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
28
44
  - !ruby/object:Gem::Version
29
45
  hash: 27
30
46
  segments:
@@ -33,9 +49,10 @@ dependencies:
33
49
  - 0
34
50
  version: 1.3.0
35
51
  type: :development
36
- version_requirements: *id001
52
+ version_requirements: *id002
37
53
  description: simple wrapper around CLI for extracting text from PDF and Word documents
38
- email: mguterl@gmail.com
54
+ email:
55
+ - michael@diminishing.org
39
56
  executables: []
40
57
 
41
58
  extensions: []
@@ -46,17 +63,25 @@ extra_rdoc_files:
46
63
  files:
47
64
  - .document
48
65
  - .gitignore
66
+ - Gemfile
67
+ - Gemfile.lock
49
68
  - LICENSE
50
69
  - README.md
51
70
  - Rakefile
52
- - VERSION
53
71
  - lib/textractor.rb
54
- - lib/textractor/document.rb
55
- - spec/document_spec.rb
72
+ - lib/textractor/extractors.rb
73
+ - lib/textractor/extractors/doc_extractor.rb
74
+ - lib/textractor/extractors/docx_extractor.rb
75
+ - lib/textractor/extractors/pdf_extractor.rb
76
+ - lib/textractor/extractors/text_extractor.rb
77
+ - lib/textractor/extractors/word_extractor.rb
78
+ - lib/textractor/version.rb
56
79
  - spec/fixtures/document.doc
57
80
  - spec/fixtures/document.docx
58
81
  - spec/fixtures/document.pdf
59
82
  - spec/fixtures/document.txt
83
+ - spec/fixtures/no_extension
84
+ - spec/integration/textractor_spec.rb
60
85
  - spec/spec.opts
61
86
  - spec/spec_helper.rb
62
87
  - spec/textractor_spec.rb
@@ -100,18 +125,18 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
125
  requirements:
101
126
  - - ">="
102
127
  - !ruby/object:Gem::Version
103
- hash: 3
128
+ hash: 23
104
129
  segments:
105
- - 0
106
- version: "0"
130
+ - 1
131
+ - 3
132
+ - 6
133
+ version: 1.3.6
107
134
  requirements: []
108
135
 
109
- rubyforge_project:
136
+ rubyforge_project: textractor
110
137
  rubygems_version: 1.3.7
111
138
  signing_key:
112
139
  specification_version: 3
113
- summary: simple wrapper around CLI tools for extracting text from PDF and Word documents
114
- test_files:
115
- - spec/document_spec.rb
116
- - spec/spec_helper.rb
117
- - spec/textractor_spec.rb
140
+ summary: simple wrapper around CLI for extracting text from PDF and Word documents
141
+ test_files: []
142
+
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.0.3
@@ -1,66 +0,0 @@
1
- module Textractor
2
-
3
- class Document
4
-
5
- CONTENT_TYPE_CONVERSIONS = {
6
- 'application/pdf' => :pdf,
7
- 'application/x-pdf' => :pdf,
8
- 'application/doc' => :doc,
9
- 'application/x-doc' => :doc,
10
- 'application/msword' => :doc,
11
- 'text/plain' => :txt,
12
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => :docx,
13
- }
14
-
15
- attr_reader :filename
16
-
17
- def initialize(filename, options = {})
18
- @filename = File.expand_path(filename)
19
- @content_type = options[:content_type]
20
- end
21
-
22
- def text
23
- send("extract_from_#{type}")
24
- end
25
-
26
- def type
27
- return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
28
- case File.extname(@filename)
29
- when /pdf/
30
- :pdf
31
- when /docx/
32
- :docx
33
- when /doc/
34
- :doc
35
- when /txt/
36
- :txt
37
- else
38
- nil
39
- end
40
- end
41
-
42
- private
43
-
44
- def content_type
45
- @content_type
46
- end
47
-
48
- def extract_from_pdf
49
- `pdftotext #{filename} - 2>/dev/null`.strip
50
- end
51
-
52
- def extract_from_doc
53
- `wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
54
- end
55
-
56
- def extract_from_docx
57
- `#{File.dirname(__FILE__) + "/../../vendor/docx2txt/docx2txt.pl"} #{filename} -`.strip
58
- end
59
-
60
- def extract_from_txt
61
- File.read(filename)
62
- end
63
-
64
- end
65
-
66
- end
@@ -1,94 +0,0 @@
1
- require 'spec/spec_helper'
2
-
3
- describe Textractor::Document do
4
-
5
- PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
6
- DOC_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
7
- TXT_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
8
- DOCX_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.docx")
9
-
10
- it 'should require a filename to create' do
11
- expect { Textractor::Document.new }.to raise_error(ArgumentError)
12
- Textractor::Document.new('filename').filename.should == File.expand_path('filename')
13
- end
14
-
15
- describe "#text" do
16
-
17
- describe "with pdf document" do
18
-
19
- it 'should extract the text from the document' do
20
- @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
21
- @doc.text.should == "Ruby on rails developer"
22
- end
23
-
24
- end
25
-
26
- describe "with doc document" do
27
-
28
- it 'should extract the text from the document' do
29
- @doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
30
- @doc.text.should == "Ruby on rails developer"
31
- end
32
-
33
- end
34
-
35
- describe "with txt document" do
36
-
37
- it 'should extract the text from the document' do
38
- @doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
39
- @doc.text.should == "Ruby on rails developer"
40
- end
41
-
42
- end
43
-
44
- describe "with docx document" do
45
-
46
- it 'should extract the text from the document' do
47
- @doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
48
- @doc.text.should == "Ruby on rails developer"
49
- end
50
-
51
- end
52
-
53
- end
54
-
55
- describe "#type" do
56
-
57
- describe "with no content type provided" do
58
- it 'should return :pdf for PDF documents' do
59
- @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
60
- @doc.type.should == :pdf
61
- end
62
-
63
- it 'should return :doc for Word documents' do
64
- @doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
65
- @doc.type.should == :doc
66
- end
67
-
68
- it 'should return :docx for Word documents' do
69
- @doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
70
- @doc.type.should == :docx
71
- end
72
-
73
- it 'should return nil for unknown documents' do
74
- @doc = Textractor::Document.new("foo.bar")
75
- @doc.type.should == nil
76
- end
77
- end
78
-
79
- describe "with a content type provided" do
80
-
81
- it 'should ignore the extension of the file' do
82
- [PDF_DOCUMENT_FIXTURE, DOC_DOCUMENT_FIXTURE, DOCX_DOCUMENT_FIXTURE].each do |filename|
83
- Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
84
- @doc = Textractor::Document.new(filename, :content_type => content_type)
85
- @doc.type.should == type
86
- end
87
- end
88
- end
89
-
90
- end
91
-
92
- end
93
-
94
- end