textractor 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ -c -f progress -r ./spec/spec_helper
@@ -6,12 +6,20 @@ PATH
6
6
  GEM
7
7
  remote: http://rubygems.org/
8
8
  specs:
9
- rspec (1.3.0)
9
+ diff-lcs (1.1.2)
10
+ rspec (2.1.0)
11
+ rspec-core (~> 2.1.0)
12
+ rspec-expectations (~> 2.1.0)
13
+ rspec-mocks (~> 2.1.0)
14
+ rspec-core (2.1.0)
15
+ rspec-expectations (2.1.0)
16
+ diff-lcs (~> 1.1.2)
17
+ rspec-mocks (2.1.0)
10
18
 
11
19
  PLATFORMS
12
20
  ruby
13
21
 
14
22
  DEPENDENCIES
15
23
  bundler (>= 1.0.0)
16
- rspec (~> 1.3.0)
24
+ rspec (~> 2.1.0)
17
25
  textractor!
data/README.md CHANGED
@@ -10,6 +10,18 @@ In order to use textractor you have to install a few command line tools.
10
10
 
11
11
  ### OS X
12
12
 
13
+ #### Homebrew
14
+
15
+ brew install xpdf links
16
+
17
+ Download wv-1.2.4 and install from source: http://sourceforge.net/projects/wvware/files/wv/1.2.4/wv-1.2.4.tar.gz/download
18
+
19
+ ./configure
20
+ make
21
+ make install
22
+
23
+ #### MacPorts
24
+
13
25
  port install wv xpdf links
14
26
 
15
27
  I also recommend passing +no_x11 to the install command, but this may not work on all systems due to dependency issues.
data/Rakefile CHANGED
@@ -1,16 +1,16 @@
1
1
  require 'bundler'
2
2
  Bundler::GemHelper.install_tasks
3
3
 
4
- require 'spec/rake/spectask'
5
- Spec::Rake::SpecTask.new(:spec) do |spec|
6
- spec.libs << 'lib' << 'spec'
7
- spec.spec_files = FileList['spec/**/*_spec.rb']
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec) do |task|
7
+ task.rspec_opts = ["-c", "-f progress", "-r ./spec/spec_helper.rb"]
8
+ task.pattern = 'spec/**/*_spec.rb'
8
9
  end
9
10
 
10
- Spec::Rake::SpecTask.new(:rcov) do |spec|
11
- spec.libs << 'lib' << 'spec'
12
- spec.pattern = 'spec/**/*_spec.rb'
13
- spec.rcov = true
11
+ RSpec::Core::RakeTask.new(:rcov) do |task|
12
+ task.pattern = 'spec/**/*_spec.rb'
13
+ task.rcov = true
14
14
  end
15
15
 
16
16
  task :default => :spec
@@ -0,0 +1 @@
1
+ Autotest.add_discovery { "rspec2" }
@@ -13,7 +13,7 @@ module Textractor::Extractors
13
13
  end
14
14
 
15
15
  def text_from_path(path)
16
- command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
16
+ command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
17
17
  puts command if $DEBUG
18
18
  `#{command}`.strip
19
19
  end
@@ -14,7 +14,7 @@ module Textractor::Extractors
14
14
 
15
15
 
16
16
  def text_from_path(path)
17
- `#{docx2txt_path} #{path} -`.strip
17
+ `#{docx2txt_path} '#{path}' -`.strip
18
18
  end
19
19
 
20
20
  private
@@ -3,7 +3,7 @@ module Textractor::Extractors
3
3
  class PDFExtractor
4
4
 
5
5
  def text_from_path(path)
6
- `pdftotext #{path} - 2>/dev/null`.strip
6
+ `pdftotext '#{path}' - 2>/dev/null`.strip
7
7
  end
8
8
 
9
9
  end
@@ -1,3 +1,3 @@
1
1
  module Textractor
2
- VERSION = '0.1.4'
2
+ VERSION = '0.1.5'
3
3
  end
@@ -0,0 +1 @@
1
+ text
@@ -55,4 +55,20 @@ describe Textractor do
55
55
  }.to raise_error(Textractor::ContentTypeNotRegistered)
56
56
  end
57
57
 
58
+ it 'returns the contents of doc files with a space in the path' do
59
+ Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
60
+ end
61
+
62
+ it 'returns the contents of docx files with a space in the path' do
63
+ Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
64
+ end
65
+
66
+ it 'returns the contents of pdf files with a space in the path' do
67
+ Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
68
+ end
69
+
70
+ it 'returns the contents of txt files with a space in the path' do
71
+ Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
72
+ end
73
+
58
74
  end
@@ -1,15 +1,14 @@
1
- require 'rubygems'
2
- require 'bundler/setup'
3
- require 'spec'
4
-
5
1
  $LOAD_PATH.unshift(File.dirname(__FILE__))
6
2
 
3
+ require 'rubygems'
4
+ require 'bundler/setup'
5
+ require 'rspec'
7
6
  require 'textractor'
8
7
 
9
8
  def fixture_path(path)
10
9
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
11
10
  end
12
11
 
13
- Spec::Runner.configure do |config|
12
+ RSpec.configure do |config|
14
13
 
15
14
  end
@@ -1,4 +1,4 @@
1
- require 'spec/spec_helper'
1
+ require 'spec_helper'
2
2
 
3
3
  class TestExtractor
4
4
 
@@ -1,4 +1,3 @@
1
- # -*- encoding: utf-8 -*-
2
1
  require File.expand_path("../lib/textractor/version", __FILE__)
3
2
 
4
3
  Gem::Specification.new do |s|
@@ -15,7 +14,7 @@ Gem::Specification.new do |s|
15
14
  s.rubyforge_project = "textractor"
16
15
 
17
16
  s.add_development_dependency "bundler", ">= 1.0.0"
18
- s.add_development_dependency "rspec", "~> 1.3.0"
17
+ s.add_development_dependency "rspec", "~> 2.1.0"
19
18
 
20
19
  s.files = `git ls-files`.split("\n")
21
20
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
@@ -228,7 +228,9 @@ if ($ENV{OS} =~ /^Windows/) {
228
228
  if ($inpIsDir eq 'y') {
229
229
  readFileInto("$ARGV[0]/word/document.xml", $content);
230
230
  } else {
231
- $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
231
+ # Special fix for single quotes, they get un-escaped earlier
232
+ $ARGV[0] =~ s/\'/\'\\\'\'/g;
233
+ $content = `"$unzip" -p '$ARGV[0]' word/document.xml 2>$nulldevice`;
232
234
  }
233
235
 
234
236
  die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
@@ -260,7 +262,7 @@ binmode $txtfile; # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
260
262
  if ($inpIsDir eq 'y') {
261
263
  readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
262
264
  } else {
263
- $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
265
+ $_ = `"$unzip" -p '$ARGV[0]' word/_rels/document.xml.rels 2>$nulldevice`;
264
266
  }
265
267
 
266
268
  my %docurels;
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease: false
4
+ hash: 17
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 4
10
- version: 0.1.4
9
+ - 5
10
+ version: 0.1.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-11-06 00:00:00 -04:00
18
+ date: 2011-07-20 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -42,12 +42,12 @@ dependencies:
42
42
  requirements:
43
43
  - - ~>
44
44
  - !ruby/object:Gem::Version
45
- hash: 27
45
+ hash: 11
46
46
  segments:
47
+ - 2
47
48
  - 1
48
- - 3
49
49
  - 0
50
- version: 1.3.0
50
+ version: 2.1.0
51
51
  type: :development
52
52
  version_requirements: *id002
53
53
  description: simple wrapper around CLI for extracting text from PDF and Word documents
@@ -63,11 +63,13 @@ extra_rdoc_files:
63
63
  files:
64
64
  - .document
65
65
  - .gitignore
66
+ - .rspec
66
67
  - Gemfile
67
68
  - Gemfile.lock
68
69
  - LICENSE
69
70
  - README.md
70
71
  - Rakefile
72
+ - autotest/discover.rb
71
73
  - bin/textractor
72
74
  - lib/textractor.rb
73
75
  - lib/textractor/content_type_detector.rb
@@ -79,13 +81,16 @@ files:
79
81
  - lib/textractor/extractors/pdf_extractor.rb
80
82
  - lib/textractor/extractors/text_extractor.rb
81
83
  - lib/textractor/version.rb
84
+ - spec/fixtures/document .doc
85
+ - spec/fixtures/document .docx
86
+ - spec/fixtures/document .pdf
87
+ - spec/fixtures/document .txt
82
88
  - spec/fixtures/document.doc
83
89
  - spec/fixtures/document.docx
84
90
  - spec/fixtures/document.pdf
85
91
  - spec/fixtures/document.txt
86
92
  - spec/fixtures/no_extension
87
93
  - spec/integration/textractor_spec.rb
88
- - spec/spec.opts
89
94
  - spec/spec_helper.rb
90
95
  - spec/textractor_spec.rb
91
96
  - support/wvText.xml
@@ -137,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
137
142
  requirements: []
138
143
 
139
144
  rubyforge_project: textractor
140
- rubygems_version: 1.3.7
145
+ rubygems_version: 1.6.2
141
146
  signing_key:
142
147
  specification_version: 3
143
148
  summary: simple wrapper around CLI for extracting text from PDF and Word documents
@@ -1 +0,0 @@
1
- --color