RubyGems - textractor - Versions diffs - 0.0.3 → 0.1.2 - Mend

textractor 0.0.3 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

data/.gitignore +2 -0
data/Gemfile +4 -0
data/Gemfile.lock +17 -0
data/README.md +7 -7
data/Rakefile +2 -21
data/lib/textractor.rb +54 -8
data/lib/textractor/extractors.rb +12 -0
data/lib/textractor/extractors/doc_extractor.rb +29 -0
data/lib/textractor/extractors/docx_extractor.rb +28 -0
data/lib/textractor/extractors/pdf_extractor.rb +11 -0
data/lib/textractor/extractors/text_extractor.rb +11 -0
data/lib/textractor/extractors/word_extractor.rb +29 -0
data/lib/textractor/version.rb +3 -0
data/spec/fixtures/document.doc +0 -0
data/spec/fixtures/document.docx +0 -0
data/spec/fixtures/document.pdf +0 -0
data/spec/fixtures/document.txt +1 -1
data/spec/fixtures/no_extension +0 -0
data/spec/integration/textractor_spec.rb +58 -0
data/spec/spec_helper.rb +10 -4
data/spec/textractor_spec.rb +76 -14
data/textractor.gemspec +18 -71
metadata +45 -20
data/VERSION +0 -1
data/lib/textractor/document.rb +0 -66
data/spec/document_spec.rb +0 -94

data/.gitignore CHANGED

@@ -17,5 +17,7 @@ tmtags
 coverage
 rdoc
 pkg
+*.gem
+.bundle
 ## PROJECT::SPECIFIC

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source :gemcutter
+# Specify your gem's dependencies in textractor.gemspec
+gemspec

data/Gemfile.lock ADDED

@@ -0,0 +1,17 @@
+PATH
+  remote: .
+  specs:
+    textractor (0.1.2)
+GEM
+  remote: http://rubygems.org/
+  specs:
+    rspec (1.3.0)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (>= 1.0.0)
+  rspec (~> 1.3.0)
+  textractor!

data/README.md CHANGED

@@ -28,18 +28,18 @@ this may not work on all systems due to dependency issues.
 Due to textractor's reliance on command line tools all the methods in
 textractor work on paths not File objects.
-    document = Textractor::Document.new(path_to_document)
-    document.text # => "Ruby on rails developer"
-There is also a convenience method on Textractor.
-    Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
+    Textractor.text_from_path(path_to_document) # => "Ruby on rails developer"
 Textractor will attempt to guess what type of document you're trying
 to extract text from.  However, if you know the content type of your
 document, you can provide it and Textractor won't guess.
-    Textractor.text_from_file(path_to_document, :content_type => "application/doc")
+    Textractor.text_from_path(path_to_document, :content_type => "application/doc")
+## TODO
+* Remove vendored docx2txt perl script
+* Replace as much as possible with pure ruby
 ## Note on Patches/Pull Requests

data/Rakefile CHANGED

@@ -1,22 +1,5 @@
-require 'rubygems'
-require 'rake'
-begin
-  require 'jeweler'
-  Jeweler::Tasks.new do |gem|
-    gem.name = "textractor"
-    gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
-    gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
-    gem.email = "mguterl@gmail.com"
-    gem.homepage = "http://github.com/mguterl/textractor"
-    gem.authors = ["Michael Guterl"]
-    gem.add_development_dependency "rspec", ">= 1.3.0"
-    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
-  end
-  Jeweler::GemcutterTasks.new
-rescue LoadError
-  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
-end
+require 'bundler'
+Bundler::GemHelper.install_tasks
 require 'spec/rake/spectask'
 Spec::Rake::SpecTask.new(:spec) do |spec|
@@ -30,8 +13,6 @@ Spec::Rake::SpecTask.new(:rcov) do |spec|
   spec.rcov = true
 end
-task :spec => :check_dependencies
 task :default => :spec
 require 'rake/rdoctask'

data/lib/textractor.rb CHANGED

@@ -1,18 +1,64 @@
 module Textractor
-  autoload :Document, "textractor/document"
-  def self.text_from_file(filename, options = {})
-    Textractor::Document.new(filename, options).text
+  UnknownContentType           = Class.new(StandardError)
+  FileNotFound                 = Class.new(StandardError)
+  ContentTypeAlreadyRegistered = Class.new(StandardError)
+  ContentTypeNotRegistered     = Class.new(StandardError)
+  autoload :Extractors, "textractor/extractors"
+  def self.text_from_path(path, options = {})
+    raise FileNotFound unless File.exists?(path)
+    content_type    = options.fetch(:content_type) { content_type_for_path(path) }
+    extractor_class = extractor_for_content_type(content_type)
+    extractor       = extractor_class.new
+    extractor.text_from_path(path)
   end
-  DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../support/wvText.xml")
+  def self.content_type_for_path(path)
+    case File.extname(path)
+    when /\.pdf$/
+      'application/pdf'
+    when /\.doc$/
+      'application/msword'
+    when /\.docx$/
+      'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+    when /\.txt$/
+      'text/plain'
+    else
+      raise UnknownContentType, "unable to determine content type for #{path}"
+    end
+  end
-  def self.wvText_path
-    @wvText_path || DEFAULT_WV_TEXT_PATH
+  def self.register_content_type(content_type, extractor)
+    raise ContentTypeAlreadyRegistered, "#{content_type} is already registered" if extractors[content_type]
+    extractors[content_type] = extractor
   end
-  def self.wvText_path=(path)
-    @wvText_path = path
+  def self.remove_content_type(content_type)
+    extractors.delete content_type
   end
+  def self.extractor_for_content_type(content_type)
+    extractors[content_type] or raise ContentTypeNotRegistered, "#{content_type} is not registered with Textractor"
+  end
+  def self.extractors
+    @extractors ||= {}
+  end
+  def self.clear_registry
+    @extractors = {}
+  end
+  def self.register_basic_types
+    register_content_type("application/pdf", Extractors::PDFExtractor)
+    register_content_type("application/msword", Extractors::DocExtractor)
+    register_content_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document", Extractors::DocxExtractor)
+    register_content_type("text/plain", Extractors::TextExtractor)
+  end
+  register_basic_types
 end

data/lib/textractor/extractors.rb ADDED

@@ -0,0 +1,12 @@
+module Textractor
+  module Extractors
+    autoload :PDFExtractor,  'textractor/extractors/pdf_extractor'
+    autoload :DocExtractor,  'textractor/extractors/doc_extractor'
+    autoload :DocxExtractor, 'textractor/extractors/docx_extractor'
+    autoload :TextExtractor, 'textractor/extractors/text_extractor'
+  end
+end

data/lib/textractor/extractors/doc_extractor.rb ADDED

@@ -0,0 +1,29 @@
+module Textractor::Extractors
+  class DocExtractor
+    DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
+    class << self
+      attr_writer :wvText_path
+      def wvText_path
+        @wvText_path || DEFAULT_WV_TEXT_PATH
+      end
+    end
+    def text_from_path(path)
+      command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
+      puts command if $DEBUG
+      `#{command}`.strip
+    end
+    private
+    def wvText_path
+      self.class.wvText_path
+    end
+  end
+end

data/lib/textractor/extractors/docx_extractor.rb ADDED

@@ -0,0 +1,28 @@
+module Textractor::Extractors
+  class DocxExtractor
+    DEFAULT_DOCX2TXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../vendor/docx2txt/docx2txt.pl").freeze
+    class << self
+      attr_writer :docx2txt_path
+      def docx2txt_path
+        @docx2txt_path || DEFAULT_DOCX2TXT_PATH
+      end
+    end
+    def text_from_path(path)
+      `#{docx2txt_path} #{path} -`.strip
+    end
+    private
+    def docx2txt_path
+      self.class.docx2txt_path
+    end
+  end
+end

data/lib/textractor/extractors/pdf_extractor.rb ADDED

@@ -0,0 +1,11 @@
+module Textractor::Extractors
+  class PDFExtractor
+    def text_from_path(path)
+      `pdftotext #{path} - 2>/dev/null`.strip
+    end
+  end
+end

data/lib/textractor/extractors/text_extractor.rb ADDED

@@ -0,0 +1,11 @@
+module Textractor::Extractors
+  class TextExtractor
+    def text_from_path(path)
+      File.read(path)
+    end
+  end
+end

data/lib/textractor/extractors/word_extractor.rb ADDED

@@ -0,0 +1,29 @@
+module Textractor::Extractors
+  class WordExtractor
+    DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../../../support/wvText.xml").freeze
+    class << self
+      attr_writer :wvText_path
+      def wvText_path
+        @wvText_path || DEFAULT_WV_TEXT_PATH
+      end
+    end
+    def text_from_path(path)
+      command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
+      puts command if $DEBUG
+      `#{command}`.strip
+    end
+    private
+    def wvText_path
+      self.class.wvText_path
+    end
+  end
+end

data/lib/textractor/version.rb ADDED

@@ -0,0 +1,3 @@
+module Textractor
+  VERSION = '0.1.2'
+end

data/spec/fixtures/document.doc CHANGED

Binary file

data/spec/fixtures/document.docx CHANGED

Binary file

data/spec/fixtures/document.pdf CHANGED

Binary file

data/spec/fixtures/document.txt CHANGED

@@ -1 +1 @@
-Ruby on rails developer
+text

data/spec/fixtures/no_extension ADDED

Binary file

data/spec/integration/textractor_spec.rb ADDED

@@ -0,0 +1,58 @@
+require 'spec_helper'
+describe Textractor do
+  before do
+    Textractor.clear_registry
+    Textractor.register_basic_types
+  end
+  it 'returns the contents of word (.doc) documents' do
+    Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
+  end
+  it 'returns the contents of word (.docx) documents' do
+    Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
+  end
+  it 'returns the contents of pdf documents' do
+    Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
+  end
+  it 'returns the contents of text documents' do
+    Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
+  end
+  it 'allows the user to specify content type to avoid internal resolution' do
+    Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
+  end
+  it 'raises an exception when the content type is unable to be determined' do
+    expect {
+      Textractor.text_from_path(fixture_path("no_extension"))
+    }.to raise_error(Textractor::UnknownContentType)
+  end
+  it 'raises an exception when the path specified does not exist' do
+    expect {
+      Textractor.text_from_path('non-existant')
+    }.to raise_error(Textractor::FileNotFound)
+  end
+  it 'raises an exception when there is no extractor defined for the content type' do
+    Textractor.clear_registry
+    expect {
+      Textractor.text_from_path(fixture_path('document.pdf'))
+    }.to raise_error(Textractor::ContentTypeNotRegistered)
+  end
+  it 'allows content type extractors to be removed' do
+    Textractor.remove_content_type("application/pdf")
+    expect {
+      Textractor.text_from_path(fixture_path('document.pdf'))
+    }.to raise_error(Textractor::ContentTypeNotRegistered)
+  end
+end

data/spec/spec_helper.rb CHANGED

@@ -1,9 +1,15 @@
+require 'rubygems'
+require 'bundler/setup'
+require 'spec'
 $LOAD_PATH.unshift(File.dirname(__FILE__))
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 require 'textractor'
-require 'spec'
-require 'spec/autorun'
+def fixture_path(path)
+  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
+end
 Spec::Runner.configure do |config|
 end

data/spec/textractor_spec.rb CHANGED

@@ -1,32 +1,94 @@
 require 'spec/spec_helper'
+class TestExtractor
+  def text_from_path(path)
+    path
+  end
+end
 describe Textractor do
-  describe ".wvText_path" do
+  before do
+    Textractor.clear_registry
+  end
+  describe ".text_from_path" do
+    before do
+      File.stub(:exists?).and_return(true)
+      Textractor.stub(:content_type_for_path).and_return('test')
+      Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
+    end
+    it 'extracts the text from a given path' do
+      Textractor.text_from_path('document').should == 'document'
+    end
+    it 'uses content_type_for_path to determine the content type' do
+      Textractor.should_receive(:content_type_for_path).with('document')
+      Textractor.text_from_path('document')
+    end
+    it 'uses extractor_for_content_type to look up the correct extractor' do
+      Textractor.should_receive(:extractor_for_content_type).with('test')
+      Textractor.text_from_path('document')
+    end
+  end
+  describe ".register_content_type" do
+    it 'raises an exception if an extractor is already defined for that content type' do
+      Textractor.register_content_type("text/plain", TestExtractor)
+      expect {
+        Textractor.register_content_type("text/plain", TestExtractor)
+      }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
+    end
+  end
-    it 'should default to the file provided with the gem' do
-      Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
+  describe ".extractor_for_content_type" do
+    before do
+      Textractor.register_content_type("text/plain", TestExtractor)
     end
-    it 'should use the new wvText_path if provided' do
-      Textractor.wvText_path = "foo.bar"
-      Textractor.wvText_path.should == "foo.bar"
+    it 'returns the extractor for the content type' do
+      Textractor.extractor_for_content_type("text/plain").should == TestExtractor
     end
+    it 'raises an exception when no extractor is defined for that content type' do
+      expect {
+        Textractor.extractor_for_content_type("unknown")
+      }.to raise_error(Textractor::ContentTypeNotRegistered)
+    end
   end
-  describe ".text_from_file" do
+  describe ".content_type_for_path" do
-    it 'should return the extracted text from the file' do
-      document_path = 'word.doc'
-      document = mock("Textractor::Document", :text => "Ruby on Rails developer")
-      Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
-      Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
+    it 'returns the content type based on the file extension' do
+      Textractor.content_type_for_path("document.pdf").should == "application/pdf"
+    end
+    it 'raises an exception if it cannot determine the content type' do
+      expect {
+        Textractor.content_type_for_path('unknown')
+      }.to raise_error(Textractor::UnknownContentType)
     end
   end
-  after(:all) do
-    Textractor.instance_variable_set(:"@wvText_path", nil)
+  describe ".clear_registry" do
+    before do
+      Textractor.register_content_type("text/plain", TestExtractor)
+    end
+    it 'clears the registered content types and their respective extractors' do
+      Textractor.clear_registry
+      Textractor.extractors.should be_empty
+    end
   end
 end

data/textractor.gemspec CHANGED

@@ -1,78 +1,25 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
 # -*- encoding: utf-8 -*-
+require File.expand_path("../lib/textractor/version", __FILE__)
 Gem::Specification.new do |s|
-  s.name = %q{textractor}
-  s.version = "0.0.3"
+  s.name        = "textractor"
+  s.version     = Textractor::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ['Michael Guterl']
+  s.email       = ['michael@diminishing.org']
+  s.homepage    = "http://github.com/mguterl/textractor"
+  s.summary     = "simple wrapper around CLI for extracting text from PDF and Word documents"
+  s.description = "simple wrapper around CLI for extracting text from PDF and Word documents"
-  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
-  s.authors = ["Michael Guterl"]
-  s.date = %q{2010-07-27}
-  s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
-  s.email = %q{mguterl@gmail.com}
-  s.extra_rdoc_files = [
-    "LICENSE",
-     "README.md"
-  ]
-  s.files = [
-    ".document",
-     ".gitignore",
-     "LICENSE",
-     "README.md",
-     "Rakefile",
-     "VERSION",
-     "lib/textractor.rb",
-     "lib/textractor/document.rb",
-     "spec/document_spec.rb",
-     "spec/fixtures/document.doc",
-     "spec/fixtures/document.docx",
-     "spec/fixtures/document.pdf",
-     "spec/fixtures/document.txt",
-     "spec/spec.opts",
-     "spec/spec_helper.rb",
-     "spec/textractor_spec.rb",
-     "support/wvText.xml",
-     "textractor.gemspec",
-     "vendor/docx2txt/AUTHORS",
-     "vendor/docx2txt/BSDmakefile",
-     "vendor/docx2txt/COPYING",
-     "vendor/docx2txt/ChangeLog",
-     "vendor/docx2txt/INSTALL",
-     "vendor/docx2txt/Makefile",
-     "vendor/docx2txt/README",
-     "vendor/docx2txt/ToDo",
-     "vendor/docx2txt/VERSION",
-     "vendor/docx2txt/WInstall.bat",
-     "vendor/docx2txt/docx2txt.bat",
-     "vendor/docx2txt/docx2txt.config",
-     "vendor/docx2txt/docx2txt.pl",
-     "vendor/docx2txt/docx2txt.sh",
-     "vendor/docx2txt/resume.docx"
-  ]
-  s.homepage = %q{http://github.com/mguterl/textractor}
-  s.rdoc_options = ["--charset=UTF-8"]
-  s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.3.7}
-  s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
-  s.test_files = [
-    "spec/document_spec.rb",
-     "spec/spec_helper.rb",
-     "spec/textractor_spec.rb"
-  ]
+  s.required_rubygems_version = ">= 1.3.6"
+  s.rubyforge_project         = "textractor"
-  if s.respond_to? :specification_version then
-    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
-    s.specification_version = 3
+  s.add_development_dependency "bundler", ">= 1.0.0"
+  s.add_development_dependency "rspec",   "~> 1.3.0"
-    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
-    else
-      s.add_dependency(%q<rspec>, [">= 1.3.0"])
-    end
-  else
-    s.add_dependency(%q<rspec>, [">= 1.3.0"])
-  end
+  s.files        = `git ls-files`.split("\n")
+  s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
+  s.require_path = 'lib'
+  s.extra_rdoc_files = ["LICENSE", "README.md"]
+  s.rdoc_options = ["--charset=UTF-8"]
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: textractor
 version: !ruby/object:Gem::Version
-  hash: 25
+  hash: 31
   prerelease: false
   segments:
   - 0
-  - 0
-  - 3
-  version: 0.0.3
+  - 1
+  - 2
+  version: 0.1.2
 platform: ruby
 authors:
 - Michael Guterl
@@ -15,16 +15,32 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-07-27 00:00:00 -04:00
+date: 2010-11-06 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: bundler
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  type: :development
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
       - !ruby/object:Gem::Version
         hash: 27
         segments:
@@ -33,9 +49,10 @@ dependencies:
         - 0
         version: 1.3.0
   type: :development
-  version_requirements: *id001
+  version_requirements: *id002
 description: simple wrapper around CLI for extracting text from PDF and Word documents
-email: mguterl@gmail.com
+email:
+- michael@diminishing.org
 executables: []
 extensions: []
@@ -46,17 +63,25 @@ extra_rdoc_files:
 files:
 - .document
 - .gitignore
+- Gemfile
+- Gemfile.lock
 - LICENSE
 - README.md
 - Rakefile
-- VERSION
 - lib/textractor.rb
-- lib/textractor/document.rb
-- spec/document_spec.rb
+- lib/textractor/extractors.rb
+- lib/textractor/extractors/doc_extractor.rb
+- lib/textractor/extractors/docx_extractor.rb
+- lib/textractor/extractors/pdf_extractor.rb
+- lib/textractor/extractors/text_extractor.rb
+- lib/textractor/extractors/word_extractor.rb
+- lib/textractor/version.rb
 - spec/fixtures/document.doc
 - spec/fixtures/document.docx
 - spec/fixtures/document.pdf
 - spec/fixtures/document.txt
+- spec/fixtures/no_extension
+- spec/integration/textractor_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
 - spec/textractor_spec.rb
@@ -100,18 +125,18 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
+      hash: 23
       segments:
-      - 0
-      version: "0"
+      - 1
+      - 3
+      - 6
+      version: 1.3.6
 requirements: []
-rubyforge_project:
+rubyforge_project: textractor
 rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
-summary: simple wrapper around CLI tools for extracting text from PDF and Word documents
-test_files:
-- spec/document_spec.rb
-- spec/spec_helper.rb
-- spec/textractor_spec.rb
+summary: simple wrapper around CLI for extracting text from PDF and Word documents
+test_files: []

data/VERSION DELETED

@@ -1 +0,0 @@
-0.0.3

data/lib/textractor/document.rb DELETED

@@ -1,66 +0,0 @@
-module Textractor
-  class Document
-    CONTENT_TYPE_CONVERSIONS = {
-      'application/pdf'    => :pdf,
-      'application/x-pdf'  => :pdf,
-      'application/doc'    => :doc,
-      'application/x-doc'  => :doc,
-      'application/msword' => :doc,
-      'text/plain'         => :txt,
-      'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => :docx,
-    }
-    attr_reader :filename
-    def initialize(filename, options = {})
-      @filename = File.expand_path(filename)
-      @content_type = options[:content_type]
-    end
-    def text
-      send("extract_from_#{type}")
-    end
-    def type
-      return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
-      case File.extname(@filename)
-      when /pdf/
-        :pdf
-      when /docx/
-        :docx
-      when /doc/
-        :doc
-      when /txt/
-        :txt
-      else
-        nil
-      end
-    end
-    private
-    def content_type
-      @content_type
-    end
-    def extract_from_pdf
-      `pdftotext #{filename} - 2>/dev/null`.strip
-    end
-    def extract_from_doc
-      `wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
-    end
-    def extract_from_docx
-      `#{File.dirname(__FILE__) + "/../../vendor/docx2txt/docx2txt.pl"} #{filename} -`.strip
-    end
-    def extract_from_txt
-      File.read(filename)
-    end
-  end
-end

data/spec/document_spec.rb DELETED

@@ -1,94 +0,0 @@
-require 'spec/spec_helper'
-describe Textractor::Document do
-  PDF_DOCUMENT_FIXTURE  = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
-  DOC_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
-  TXT_DOCUMENT_FIXTURE  = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
-  DOCX_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.docx")
-  it 'should require a filename to create' do
-    expect { Textractor::Document.new }.to raise_error(ArgumentError)
-    Textractor::Document.new('filename').filename.should == File.expand_path('filename')
-  end
-  describe "#text" do
-    describe "with pdf document" do
-      it 'should extract the text from the document' do
-        @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
-        @doc.text.should == "Ruby on rails developer"
-      end
-    end
-    describe "with doc document" do
-      it 'should extract the text from the document' do
-        @doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
-        @doc.text.should == "Ruby on rails developer"
-      end
-    end
-    describe "with txt document" do
-      it 'should extract the text from the document' do
-        @doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
-        @doc.text.should == "Ruby on rails developer"
-      end
-    end
-    describe "with docx document" do
-      it 'should extract the text from the document' do
-        @doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
-        @doc.text.should == "Ruby on rails developer"
-      end
-    end
-  end
-  describe "#type" do
-    describe "with no content type provided" do
-      it 'should return :pdf for PDF documents' do
-        @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
-        @doc.type.should == :pdf
-      end
-      it 'should return :doc for Word documents' do
-        @doc = Textractor::Document.new(DOC_DOCUMENT_FIXTURE)
-        @doc.type.should == :doc
-      end
-      it 'should return :docx for Word documents' do
-        @doc = Textractor::Document.new(DOCX_DOCUMENT_FIXTURE)
-        @doc.type.should == :docx
-      end
-      it 'should return nil for unknown documents' do
-        @doc = Textractor::Document.new("foo.bar")
-        @doc.type.should == nil
-      end
-    end
-    describe "with a content type provided" do
-      it 'should ignore the extension of the file' do
-        [PDF_DOCUMENT_FIXTURE, DOC_DOCUMENT_FIXTURE, DOCX_DOCUMENT_FIXTURE].each do |filename|
-          Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
-            @doc = Textractor::Document.new(filename, :content_type => content_type)
-            @doc.type.should == type
-          end
-        end
-      end
-    end
-  end
-end