RubyGems - textractor - Versions diffs - 0.1.6 → 0.2.0 - Mend

textractor 0.1.6 → 0.2.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22)

data/Gemfile.lock +3 -1
data/Rakefile +1 -1
data/lib/textractor.rb +1 -0
data/lib/textractor/extractors/doc_extractor.rb +1 -1
data/lib/textractor/extractors/docx_extractor.rb +1 -1
data/lib/textractor/extractors/pdf_extractor.rb +1 -1
data/lib/textractor/version.rb +1 -1
data/textractor.gemspec +3 -1
metadata +20 -17
data/spec/content_type_detector/simple_spec.rb +0 -30
data/spec/fixtures/document .doc +0 -0
data/spec/fixtures/document .docx +0 -0
data/spec/fixtures/document .pdf +0 -0
data/spec/fixtures/document .txt +0 -1
data/spec/fixtures/document.doc +0 -0
data/spec/fixtures/document.docx +0 -0
data/spec/fixtures/document.pdf +0 -0
data/spec/fixtures/document.txt +0 -1
data/spec/fixtures/no_extension +0 -0
data/spec/integration/textractor_spec.rb +0 -74
data/spec/spec_helper.rb +0 -14
data/spec/textractor_spec.rb +0 -104

data/Gemfile.lock CHANGED

@@ -1,12 +1,14 @@
 PATH
   remote: .
   specs:
-    textractor (0.1.4)
+    textractor (0.1.6)
+      escape (>= 0.0.4)
 GEM
   remote: http://rubygems.org/
   specs:
     diff-lcs (1.1.2)
+    escape (0.0.4)
     rspec (2.1.0)
       rspec-core (~> 2.1.0)
       rspec-expectations (~> 2.1.0)

data/Rakefile CHANGED

@@ -15,7 +15,7 @@ end
 task :default => :spec
-require 'rake/rdoctask'
+require 'rdoc/task'
 Rake::RDocTask.new do |rdoc|
   version = File.exist?('VERSION') ? File.read('VERSION') : ""

data/lib/textractor.rb CHANGED

@@ -64,3 +64,4 @@ module Textractor
 end
 require 'textractor/content_type_detector'
+require 'escape'

data/lib/textractor/extractors/doc_extractor.rb CHANGED

@@ -13,7 +13,7 @@ module Textractor::Extractors
     end
     def text_from_path(path)
-      command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
+      command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{Escape.shell_single_word(path)}"
       puts command if $DEBUG
       `#{command}`.strip
     end

data/lib/textractor/extractors/docx_extractor.rb CHANGED

@@ -14,7 +14,7 @@ module Textractor::Extractors
     def text_from_path(path)
-      `#{docx2txt_path} '#{path}' -`.strip
+      `#{docx2txt_path} #{Escape.shell_single_word(path)} -`.strip
     end
     private

data/lib/textractor/extractors/pdf_extractor.rb CHANGED

@@ -3,7 +3,7 @@ module Textractor::Extractors
   class PDFExtractor
     def text_from_path(path)
-      `pdftotext '#{path}' - 2>/dev/null`.strip
+      `pdftotext #{Escape.shell_single_word(path)} - 2>/dev/null`.strip
     end
   end

data/lib/textractor/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Textractor
-  VERSION = '0.1.6'
+  VERSION = '0.2.0'
 end

data/textractor.gemspec CHANGED

@@ -16,7 +16,9 @@ Gem::Specification.new do |s|
   s.add_development_dependency "bundler", ">= 1.0.0"
   s.add_development_dependency "rspec",   "~> 2.1.0"
-  s.files        = `git ls-files`.split("\n")
+  s.add_runtime_dependency "escape", ">=0.0.4"
+  s.files        = `git ls-files`.split("\n").reject{|f| f.gsub(/"/, "") =~ /^spec/}
   s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
   s.require_path = 'lib'
   s.extra_rdoc_files = ["LICENSE", "README.md"]

metadata CHANGED

@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
   prerelease:
   segments:
   - 0
-  - 1
-  - 6
-  version: 0.1.6
+  - 2
+  - 0
+  version: 0.2.0
 platform: ruby
 authors:
 - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-07-22 00:00:00 -04:00
+date: 2011-07-29 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -50,6 +50,22 @@ dependencies:
         version: 2.1.0
   type: :development
   version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: escape
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 0
+        - 0
+        - 4
+        version: 0.0.4
+  type: :runtime
+  version_requirements: *id003
 description: simple wrapper around CLI for extracting text from PDF and Word documents
 email:
 - michael@diminishing.org
@@ -81,19 +97,6 @@ files:
 - lib/textractor/extractors/pdf_extractor.rb
 - lib/textractor/extractors/text_extractor.rb
 - lib/textractor/version.rb
-- spec/content_type_detector/simple_spec.rb
-- spec/fixtures/document .doc
-- spec/fixtures/document .docx
-- spec/fixtures/document .pdf
-- spec/fixtures/document .txt
-- spec/fixtures/document.doc
-- spec/fixtures/document.docx
-- spec/fixtures/document.pdf
-- spec/fixtures/document.txt
-- spec/fixtures/no_extension
-- spec/integration/textractor_spec.rb
-- spec/spec_helper.rb
-- spec/textractor_spec.rb
 - support/wvText.xml
 - textractor.gemspec
 - vendor/docx2txt/AUTHORS

data/spec/content_type_detector/simple_spec.rb DELETED

@@ -1,30 +0,0 @@
-require 'spec_helper'
-describe Textractor::ContentTypeDetector::Simple do
-  FILENAMES = [
-    [
-      "foo.pdf", "application/pdf",
-      "foo.doc", "application/msword",
-      "foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-      "foo.txt", "text/plain",
-    ]
-  ]
-  describe '.content_type_for_path' do
-    FILENAMES.each do |(filename, content_type)|
-      context "given #{filename}" do
-        it "returns #{content_type}" do
-          Textractor::ContentTypeDetector::Simple.content_type_for_path(filename).should == content_type
-        end
-      end
-      context "given #{filename}" do
-        it "returns #{content_type}" do
-          Textractor::ContentTypeDetector::Simple.content_type_for_path(filename.upcase).should == content_type
-        end
-      end
-    end
-  end
-end

data/spec/fixtures/document .doc DELETED

Binary file

data/spec/fixtures/document .docx DELETED

Binary file

data/spec/fixtures/document .pdf DELETED

Binary file

data/spec/fixtures/document .txt DELETED

@@ -1 +0,0 @@
- text

data/spec/fixtures/document.doc DELETED

Binary file

data/spec/fixtures/document.docx DELETED

Binary file

data/spec/fixtures/document.pdf DELETED

Binary file

data/spec/fixtures/document.txt DELETED

@@ -1 +0,0 @@
- text

data/spec/fixtures/no_extension DELETED

Binary file

data/spec/integration/textractor_spec.rb DELETED

@@ -1,74 +0,0 @@
-require 'spec_helper'
-describe Textractor do
-  before do
-    Textractor.clear_registry
-    Textractor.register_basic_types
-  end
-  it 'returns the contents of word (.doc) documents' do
-    Textractor.text_from_path(fixture_path("document.doc")).should == 'text'
-  end
-  it 'returns the contents of word (.docx) documents' do
-    Textractor.text_from_path(fixture_path("document.docx")).should == 'text'
-  end
-  it 'returns the contents of pdf documents' do
-    Textractor.text_from_path(fixture_path("document.pdf")).should == 'text'
-  end
-  it 'returns the contents of text documents' do
-    Textractor.text_from_path(fixture_path("document.txt")).should == 'text'
-  end
-  it 'allows the user to specify content type to avoid internal resolution' do
-    Textractor.text_from_path(fixture_path("no_extension"), :content_type => "application/pdf").should == 'text'
-  end
-  it 'raises an exception when the content type is unable to be determined' do
-    expect {
-      Textractor.text_from_path(fixture_path("no_extension"))
-    }.to raise_error(Textractor::UnknownContentType)
-  end
-  it 'raises an exception when the path specified does not exist' do
-    expect {
-      Textractor.text_from_path('non-existant')
-    }.to raise_error(Textractor::FileNotFound)
-  end
-  it 'raises an exception when there is no extractor defined for the content type' do
-    Textractor.clear_registry
-    expect {
-      Textractor.text_from_path(fixture_path('document.pdf'))
-    }.to raise_error(Textractor::ContentTypeNotRegistered)
-  end
-  it 'allows content type extractors to be removed' do
-    Textractor.remove_content_type("application/pdf")
-    expect {
-      Textractor.text_from_path(fixture_path('document.pdf'))
-    }.to raise_error(Textractor::ContentTypeNotRegistered)
-  end
-  it 'returns the contents of doc files with a space in the path' do
-    Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
-  end
-  it 'returns the contents of docx files with a space in the path' do
-    Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
-  end
-  it 'returns the contents of pdf files with a space in the path' do
-    Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
-  end
-  it 'returns the contents of txt files with a space in the path' do
-    Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
-  end
-end

data/spec/spec_helper.rb DELETED

@@ -1,14 +0,0 @@
-$LOAD_PATH.unshift(File.dirname(__FILE__))
-require 'rubygems'
-require 'bundler/setup'
-require 'rspec'
-require 'textractor'
-def fixture_path(path)
-  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
-end
-RSpec.configure do |config|
-end

data/spec/textractor_spec.rb DELETED

@@ -1,104 +0,0 @@
-require 'spec_helper'
-class TestExtractor
-  def text_from_path(path)
-    path
-  end
-end
-describe Textractor do
-  before do
-    Textractor.clear_registry
-  end
-  describe ".text_from_path" do
-    before do
-      File.stub(:exists?).and_return(true)
-      Textractor.stub(:content_type_for_path).and_return('test')
-      Textractor.stub(:extractor_for_content_type).and_return(TestExtractor)
-    end
-    it 'extracts the text from a given path' do
-      Textractor.text_from_path('document').should == 'document'
-    end
-    it 'uses content_type_for_path to determine the content type' do
-      Textractor.should_receive(:content_type_for_path).with('document')
-      Textractor.text_from_path('document')
-    end
-    it 'uses extractor_for_content_type to look up the correct extractor' do
-      Textractor.should_receive(:extractor_for_content_type).with('test')
-      Textractor.text_from_path('document')
-    end
-  end
-  describe ".register_content_type" do
-    it 'raises an exception if an extractor is already defined for that content type' do
-      Textractor.register_content_type("text/plain", TestExtractor)
-      expect {
-        Textractor.register_content_type("text/plain", TestExtractor)
-      }.to raise_error(Textractor::ContentTypeAlreadyRegistered)
-    end
-    it 'takes a block for simple cases' do
-      File.stub(:exists?).and_return(true)
-      Textractor.stub(:content_type_for_path).and_return('test')
-      Textractor.register_content_type('test') do |path|
-        path
-      end
-      Textractor.text_from_path('document').should == 'document'
-    end
-  end
-  describe ".extractor_for_content_type" do
-    before do
-      Textractor.register_content_type("text/plain", TestExtractor)
-    end
-    it 'returns the extractor for the content type' do
-      Textractor.extractor_for_content_type("text/plain").should == TestExtractor
-    end
-    it 'raises an exception when no extractor is defined for that content type' do
-      expect {
-        Textractor.extractor_for_content_type("unknown")
-      }.to raise_error(Textractor::ContentTypeNotRegistered)
-    end
-  end
-  describe ".content_type_for_path" do
-    it 'returns the content type based on the file extension' do
-      Textractor.content_type_for_path("document.pdf").should == "application/pdf"
-    end
-    it 'raises an exception if it cannot determine the content type' do
-      expect {
-        Textractor.content_type_for_path('unknown')
-      }.to raise_error(Textractor::UnknownContentType)
-    end
-  end
-  describe ".clear_registry" do
-    before do
-      Textractor.register_content_type("text/plain", TestExtractor)
-    end
-    it 'clears the registered content types and their respective extractors' do
-      Textractor.clear_registry
-      Textractor.extractors.should be_empty
-    end
-  end
-end