RubyGems - textractor - Versions diffs - 0.1.4 → 0.1.5 - Mend

textractor 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/.rspec +1 -0
data/Gemfile.lock +10 -2
data/README.md +12 -0
data/Rakefile +8 -8
data/autotest/discover.rb +1 -0
data/lib/textractor/extractors/doc_extractor.rb +1 -1
data/lib/textractor/extractors/docx_extractor.rb +1 -1
data/lib/textractor/extractors/pdf_extractor.rb +1 -1
data/lib/textractor/version.rb +1 -1
data/spec/fixtures/document .doc +0 -0
data/spec/fixtures/document .docx +0 -0
data/spec/fixtures/document .pdf +0 -0
data/spec/fixtures/document .txt +1 -0
data/spec/integration/textractor_spec.rb +16 -0
data/spec/spec_helper.rb +4 -5
data/spec/textractor_spec.rb +1 -1
data/textractor.gemspec +1 -2
data/vendor/docx2txt/docx2txt.pl +4 -2
metadata +15 -10
data/spec/spec.opts +0 -1

data/.rspec ADDED

	@@ -0,0 +1 @@
1	+ -c -f progress -r ./spec/spec_helper

data/Gemfile.lock CHANGED

@@ -6,12 +6,20 @@ PATH
 GEM
   remote: http://rubygems.org/
   specs:
-    rspec (1.3.0)
+    diff-lcs (1.1.2)
+    rspec (2.1.0)
+      rspec-core (~> 2.1.0)
+      rspec-expectations (~> 2.1.0)
+      rspec-mocks (~> 2.1.0)
+    rspec-core (2.1.0)
+    rspec-expectations (2.1.0)
+      diff-lcs (~> 1.1.2)
+    rspec-mocks (2.1.0)
 PLATFORMS
   ruby
 DEPENDENCIES
   bundler (>= 1.0.0)
-  rspec (~> 1.3.0)
+  rspec (~> 2.1.0)
   textractor!

data/README.md CHANGED

@@ -10,6 +10,18 @@ In order to use textractor you have to install a few command line tools.
 ### OS X
+#### Homebrew
+    brew install xpdf links
+Download wv-1.2.4 and install from source: http://sourceforge.net/projects/wvware/files/wv/1.2.4/wv-1.2.4.tar.gz/download
+    ./configure
+    make
+    make install
+#### MacPorts
     port install wv xpdf links
 I also recommend passing +no_x11 to the install command, but this may not work on all systems due to dependency issues.

data/Rakefile CHANGED

@@ -1,16 +1,16 @@
 require 'bundler'
 Bundler::GemHelper.install_tasks
-require 'spec/rake/spectask'
-Spec::Rake::SpecTask.new(:spec) do |spec|
-  spec.libs << 'lib' << 'spec'
-  spec.spec_files = FileList['spec/**/*_spec.rb']
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |task|
+  task.rspec_opts = ["-c", "-f progress", "-r ./spec/spec_helper.rb"]
+  task.pattern = 'spec/**/*_spec.rb'
 end
-Spec::Rake::SpecTask.new(:rcov) do |spec|
-  spec.libs << 'lib' << 'spec'
-  spec.pattern = 'spec/**/*_spec.rb'
-  spec.rcov = true
+RSpec::Core::RakeTask.new(:rcov) do |task|
+  task.pattern = 'spec/**/*_spec.rb'
+  task.rcov = true
 end
 task :default => :spec

data/autotest/discover.rb ADDED

	@@ -0,0 +1 @@
1	+ Autotest.add_discovery { "rspec2" }

data/lib/textractor/extractors/doc_extractor.rb CHANGED

@@ -13,7 +13,7 @@ module Textractor::Extractors
     end
     def text_from_path(path)
-      command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
+      command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
       puts command if $DEBUG
       `#{command}`.strip
     end

data/lib/textractor/extractors/docx_extractor.rb CHANGED

@@ -14,7 +14,7 @@ module Textractor::Extractors
     def text_from_path(path)
-      `#{docx2txt_path} #{path} -`.strip
+      `#{docx2txt_path} '#{path}' -`.strip
     end
     private

data/lib/textractor/extractors/pdf_extractor.rb CHANGED

@@ -3,7 +3,7 @@ module Textractor::Extractors
   class PDFExtractor
     def text_from_path(path)
-      `pdftotext #{path} - 2>/dev/null`.strip
+      `pdftotext '#{path}' - 2>/dev/null`.strip
     end
   end

data/lib/textractor/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Textractor
-  VERSION = '0.1.4'
+  VERSION = '0.1.5'
 end

data/spec/fixtures/document .doc ADDED

Binary file

data/spec/fixtures/document .docx ADDED

Binary file

data/spec/fixtures/document .pdf ADDED

Binary file

data/spec/fixtures/document .txt ADDED

	@@ -0,0 +1 @@
1	+ text

data/spec/integration/textractor_spec.rb CHANGED

@@ -55,4 +55,20 @@ describe Textractor do
     }.to raise_error(Textractor::ContentTypeNotRegistered)
   end
+  it 'returns the contents of doc files with a space in the path' do
+    Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
+  end
+  it 'returns the contents of docx files with a space in the path' do
+    Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
+  end
+  it 'returns the contents of pdf files with a space in the path' do
+    Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
+  end
+  it 'returns the contents of txt files with a space in the path' do
+    Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
+  end
 end

data/spec/spec_helper.rb CHANGED

@@ -1,15 +1,14 @@
-require 'rubygems'
-require 'bundler/setup'
-require 'spec'
 $LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'rubygems'
+require 'bundler/setup'
+require 'rspec'
 require 'textractor'
 def fixture_path(path)
   File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
 end
-Spec::Runner.configure do |config|
+RSpec.configure do |config|
 end

data/spec/textractor_spec.rb CHANGED

@@ -1,4 +1,4 @@
-require 'spec/spec_helper'
+require 'spec_helper'
 class TestExtractor

data/textractor.gemspec CHANGED

@@ -1,4 +1,3 @@
-# -*- encoding: utf-8 -*-
 require File.expand_path("../lib/textractor/version", __FILE__)
 Gem::Specification.new do |s|
@@ -15,7 +14,7 @@ Gem::Specification.new do |s|
   s.rubyforge_project         = "textractor"
   s.add_development_dependency "bundler", ">= 1.0.0"
-  s.add_development_dependency "rspec",   "~> 1.3.0"
+  s.add_development_dependency "rspec",   "~> 2.1.0"
   s.files        = `git ls-files`.split("\n")
   s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact

data/vendor/docx2txt/docx2txt.pl CHANGED

@@ -228,7 +228,9 @@ if ($ENV{OS} =~ /^Windows/) {
 if ($inpIsDir eq 'y') {
     readFileInto("$ARGV[0]/word/document.xml", $content);
 } else {
-    $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
+    # Special fix for single quotes, they get un-escaped earlier
+    $ARGV[0] =~ s/\'/\'\\\'\'/g;
+    $content = `"$unzip" -p '$ARGV[0]' word/document.xml 2>$nulldevice`;
 }
 die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
@@ -260,7 +262,7 @@ binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
 if ($inpIsDir eq 'y') {
     readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
 } else {
-    $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
+    $_ = `"$unzip" -p '$ARGV[0]' word/_rels/document.xml.rels 2>$nulldevice`;
 }
 my %docurels;

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: textractor
 version: !ruby/object:Gem::Version
-  hash: 19
-  prerelease: false
+  hash: 17
+  prerelease:
   segments:
   - 0
   - 1
-  - 4
-  version: 0.1.4
+  - 5
+  version: 0.1.5
 platform: ruby
 authors:
 - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-11-06 00:00:00 -04:00
+date: 2011-07-20 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -42,12 +42,12 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        hash: 27
+        hash: 11
         segments:
+        - 2
         - 1
-        - 3
         - 0
-        version: 1.3.0
+        version: 2.1.0
   type: :development
   version_requirements: *id002
 description: simple wrapper around CLI for extracting text from PDF and Word documents
@@ -63,11 +63,13 @@ extra_rdoc_files:
 files:
 - .document
 - .gitignore
+- .rspec
 - Gemfile
 - Gemfile.lock
 - LICENSE
 - README.md
 - Rakefile
+- autotest/discover.rb
 - bin/textractor
 - lib/textractor.rb
 - lib/textractor/content_type_detector.rb
@@ -79,13 +81,16 @@ files:
 - lib/textractor/extractors/pdf_extractor.rb
 - lib/textractor/extractors/text_extractor.rb
 - lib/textractor/version.rb
+- spec/fixtures/document .doc
+- spec/fixtures/document .docx
+- spec/fixtures/document .pdf
+- spec/fixtures/document .txt
 - spec/fixtures/document.doc
 - spec/fixtures/document.docx
 - spec/fixtures/document.pdf
 - spec/fixtures/document.txt
 - spec/fixtures/no_extension
 - spec/integration/textractor_spec.rb
-- spec/spec.opts
 - spec/spec_helper.rb
 - spec/textractor_spec.rb
 - support/wvText.xml
@@ -137,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: textractor
-rubygems_version: 1.3.7
+rubygems_version: 1.6.2
 signing_key:
 specification_version: 3
 summary: simple wrapper around CLI for extracting text from PDF and Word documents

data/spec/spec.opts DELETED

	@@ -1 +0,0 @@
1	- --color