textractor 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ -c -f progress -r ./spec/spec_helper
@@ -6,12 +6,20 @@ PATH
6
6
  GEM
7
7
  remote: http://rubygems.org/
8
8
  specs:
9
- rspec (1.3.0)
9
+ diff-lcs (1.1.2)
10
+ rspec (2.1.0)
11
+ rspec-core (~> 2.1.0)
12
+ rspec-expectations (~> 2.1.0)
13
+ rspec-mocks (~> 2.1.0)
14
+ rspec-core (2.1.0)
15
+ rspec-expectations (2.1.0)
16
+ diff-lcs (~> 1.1.2)
17
+ rspec-mocks (2.1.0)
10
18
 
11
19
  PLATFORMS
12
20
  ruby
13
21
 
14
22
  DEPENDENCIES
15
23
  bundler (>= 1.0.0)
16
- rspec (~> 1.3.0)
24
+ rspec (~> 2.1.0)
17
25
  textractor!
data/README.md CHANGED
@@ -10,6 +10,18 @@ In order to use textractor you have to install a few command line tools.
10
10
 
11
11
  ### OS X
12
12
 
13
+ #### Homebrew
14
+
15
+ brew install xpdf links
16
+
17
+ Download wv-1.2.4 and install from source: http://sourceforge.net/projects/wvware/files/wv/1.2.4/wv-1.2.4.tar.gz/download
18
+
19
+ ./configure
20
+ make
21
+ make install
22
+
23
+ #### MacPorts
24
+
13
25
  port install wv xpdf links
14
26
 
15
27
  I also recommend passing +no_x11 to the install command, but this may not work on all systems due to dependency issues.
data/Rakefile CHANGED
@@ -1,16 +1,16 @@
1
1
  require 'bundler'
2
2
  Bundler::GemHelper.install_tasks
3
3
 
4
- require 'spec/rake/spectask'
5
- Spec::Rake::SpecTask.new(:spec) do |spec|
6
- spec.libs << 'lib' << 'spec'
7
- spec.spec_files = FileList['spec/**/*_spec.rb']
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec) do |task|
7
+ task.rspec_opts = ["-c", "-f progress", "-r ./spec/spec_helper.rb"]
8
+ task.pattern = 'spec/**/*_spec.rb'
8
9
  end
9
10
 
10
- Spec::Rake::SpecTask.new(:rcov) do |spec|
11
- spec.libs << 'lib' << 'spec'
12
- spec.pattern = 'spec/**/*_spec.rb'
13
- spec.rcov = true
11
+ RSpec::Core::RakeTask.new(:rcov) do |task|
12
+ task.pattern = 'spec/**/*_spec.rb'
13
+ task.rcov = true
14
14
  end
15
15
 
16
16
  task :default => :spec
@@ -0,0 +1 @@
1
+ Autotest.add_discovery { "rspec2" }
@@ -13,7 +13,7 @@ module Textractor::Extractors
13
13
  end
14
14
 
15
15
  def text_from_path(path)
16
- command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
16
+ command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
17
17
  puts command if $DEBUG
18
18
  `#{command}`.strip
19
19
  end
@@ -14,7 +14,7 @@ module Textractor::Extractors
14
14
 
15
15
 
16
16
  def text_from_path(path)
17
- `#{docx2txt_path} #{path} -`.strip
17
+ `#{docx2txt_path} '#{path}' -`.strip
18
18
  end
19
19
 
20
20
  private
@@ -3,7 +3,7 @@ module Textractor::Extractors
3
3
  class PDFExtractor
4
4
 
5
5
  def text_from_path(path)
6
- `pdftotext #{path} - 2>/dev/null`.strip
6
+ `pdftotext '#{path}' - 2>/dev/null`.strip
7
7
  end
8
8
 
9
9
  end
@@ -1,3 +1,3 @@
1
1
  module Textractor
2
- VERSION = '0.1.4'
2
+ VERSION = '0.1.5'
3
3
  end
@@ -0,0 +1 @@
1
+ text
@@ -55,4 +55,20 @@ describe Textractor do
55
55
  }.to raise_error(Textractor::ContentTypeNotRegistered)
56
56
  end
57
57
 
58
+ it 'returns the contents of doc files with a space in the path' do
59
+ Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
60
+ end
61
+
62
+ it 'returns the contents of docx files with a space in the path' do
63
+ Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
64
+ end
65
+
66
+ it 'returns the contents of pdf files with a space in the path' do
67
+ Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
68
+ end
69
+
70
+ it 'returns the contents of txt files with a space in the path' do
71
+ Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
72
+ end
73
+
58
74
  end
@@ -1,15 +1,14 @@
1
- require 'rubygems'
2
- require 'bundler/setup'
3
- require 'spec'
4
-
5
1
  $LOAD_PATH.unshift(File.dirname(__FILE__))
6
2
 
3
+ require 'rubygems'
4
+ require 'bundler/setup'
5
+ require 'rspec'
7
6
  require 'textractor'
8
7
 
9
8
  def fixture_path(path)
10
9
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
11
10
  end
12
11
 
13
- Spec::Runner.configure do |config|
12
+ RSpec.configure do |config|
14
13
 
15
14
  end
@@ -1,4 +1,4 @@
1
- require 'spec/spec_helper'
1
+ require 'spec_helper'
2
2
 
3
3
  class TestExtractor
4
4
 
@@ -1,4 +1,3 @@
1
- # -*- encoding: utf-8 -*-
2
1
  require File.expand_path("../lib/textractor/version", __FILE__)
3
2
 
4
3
  Gem::Specification.new do |s|
@@ -15,7 +14,7 @@ Gem::Specification.new do |s|
15
14
  s.rubyforge_project = "textractor"
16
15
 
17
16
  s.add_development_dependency "bundler", ">= 1.0.0"
18
- s.add_development_dependency "rspec", "~> 1.3.0"
17
+ s.add_development_dependency "rspec", "~> 2.1.0"
19
18
 
20
19
  s.files = `git ls-files`.split("\n")
21
20
  s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
@@ -228,7 +228,9 @@ if ($ENV{OS} =~ /^Windows/) {
228
228
  if ($inpIsDir eq 'y') {
229
229
  readFileInto("$ARGV[0]/word/document.xml", $content);
230
230
  } else {
231
- $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
231
+ # Special fix for single quotes, they get un-escaped earlier
232
+ $ARGV[0] =~ s/\'/\'\\\'\'/g;
233
+ $content = `"$unzip" -p '$ARGV[0]' word/document.xml 2>$nulldevice`;
232
234
  }
233
235
 
234
236
  die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
@@ -260,7 +262,7 @@ binmode $txtfile; # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
260
262
  if ($inpIsDir eq 'y') {
261
263
  readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
262
264
  } else {
263
- $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
265
+ $_ = `"$unzip" -p '$ARGV[0]' word/_rels/document.xml.rels 2>$nulldevice`;
264
266
  }
265
267
 
266
268
  my %docurels;
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textractor
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
- prerelease: false
4
+ hash: 17
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 4
10
- version: 0.1.4
9
+ - 5
10
+ version: 0.1.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Michael Guterl
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-11-06 00:00:00 -04:00
18
+ date: 2011-07-20 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -42,12 +42,12 @@ dependencies:
42
42
  requirements:
43
43
  - - ~>
44
44
  - !ruby/object:Gem::Version
45
- hash: 27
45
+ hash: 11
46
46
  segments:
47
+ - 2
47
48
  - 1
48
- - 3
49
49
  - 0
50
- version: 1.3.0
50
+ version: 2.1.0
51
51
  type: :development
52
52
  version_requirements: *id002
53
53
  description: simple wrapper around CLI for extracting text from PDF and Word documents
@@ -63,11 +63,13 @@ extra_rdoc_files:
63
63
  files:
64
64
  - .document
65
65
  - .gitignore
66
+ - .rspec
66
67
  - Gemfile
67
68
  - Gemfile.lock
68
69
  - LICENSE
69
70
  - README.md
70
71
  - Rakefile
72
+ - autotest/discover.rb
71
73
  - bin/textractor
72
74
  - lib/textractor.rb
73
75
  - lib/textractor/content_type_detector.rb
@@ -79,13 +81,16 @@ files:
79
81
  - lib/textractor/extractors/pdf_extractor.rb
80
82
  - lib/textractor/extractors/text_extractor.rb
81
83
  - lib/textractor/version.rb
84
+ - spec/fixtures/document .doc
85
+ - spec/fixtures/document .docx
86
+ - spec/fixtures/document .pdf
87
+ - spec/fixtures/document .txt
82
88
  - spec/fixtures/document.doc
83
89
  - spec/fixtures/document.docx
84
90
  - spec/fixtures/document.pdf
85
91
  - spec/fixtures/document.txt
86
92
  - spec/fixtures/no_extension
87
93
  - spec/integration/textractor_spec.rb
88
- - spec/spec.opts
89
94
  - spec/spec_helper.rb
90
95
  - spec/textractor_spec.rb
91
96
  - support/wvText.xml
@@ -137,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
137
142
  requirements: []
138
143
 
139
144
  rubyforge_project: textractor
140
- rubygems_version: 1.3.7
145
+ rubygems_version: 1.6.2
141
146
  signing_key:
142
147
  specification_version: 3
143
148
  summary: simple wrapper around CLI for extracting text from PDF and Word documents
@@ -1 +0,0 @@
1
- --color