textractor 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/Gemfile.lock +10 -2
- data/README.md +12 -0
- data/Rakefile +8 -8
- data/autotest/discover.rb +1 -0
- data/lib/textractor/extractors/doc_extractor.rb +1 -1
- data/lib/textractor/extractors/docx_extractor.rb +1 -1
- data/lib/textractor/extractors/pdf_extractor.rb +1 -1
- data/lib/textractor/version.rb +1 -1
- data/spec/fixtures/document .doc +0 -0
- data/spec/fixtures/document .docx +0 -0
- data/spec/fixtures/document .pdf +0 -0
- data/spec/fixtures/document .txt +1 -0
- data/spec/integration/textractor_spec.rb +16 -0
- data/spec/spec_helper.rb +4 -5
- data/spec/textractor_spec.rb +1 -1
- data/textractor.gemspec +1 -2
- data/vendor/docx2txt/docx2txt.pl +4 -2
- metadata +15 -10
- data/spec/spec.opts +0 -1
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
-c -f progress -r ./spec/spec_helper
|
data/Gemfile.lock
CHANGED
@@ -6,12 +6,20 @@ PATH
|
|
6
6
|
GEM
|
7
7
|
remote: http://rubygems.org/
|
8
8
|
specs:
|
9
|
-
|
9
|
+
diff-lcs (1.1.2)
|
10
|
+
rspec (2.1.0)
|
11
|
+
rspec-core (~> 2.1.0)
|
12
|
+
rspec-expectations (~> 2.1.0)
|
13
|
+
rspec-mocks (~> 2.1.0)
|
14
|
+
rspec-core (2.1.0)
|
15
|
+
rspec-expectations (2.1.0)
|
16
|
+
diff-lcs (~> 1.1.2)
|
17
|
+
rspec-mocks (2.1.0)
|
10
18
|
|
11
19
|
PLATFORMS
|
12
20
|
ruby
|
13
21
|
|
14
22
|
DEPENDENCIES
|
15
23
|
bundler (>= 1.0.0)
|
16
|
-
rspec (~> 1.3.0)
|
24
|
+
rspec (~> 2.1.0)
|
17
25
|
textractor!
|
data/README.md
CHANGED
@@ -10,6 +10,18 @@ In order to use textractor you have to install a few command line tools.
|
|
10
10
|
|
11
11
|
### OS X
|
12
12
|
|
13
|
+
#### Homebrew
|
14
|
+
|
15
|
+
brew install xpdf links
|
16
|
+
|
17
|
+
Download wv-1.2.4 and install from source: http://sourceforge.net/projects/wvware/files/wv/1.2.4/wv-1.2.4.tar.gz/download
|
18
|
+
|
19
|
+
./configure
|
20
|
+
make
|
21
|
+
make install
|
22
|
+
|
23
|
+
#### MacPorts
|
24
|
+
|
13
25
|
port install wv xpdf links
|
14
26
|
|
15
27
|
I also recommend passing +no_x11 to the install command, but this may not work on all systems due to dependency issues.
|
data/Rakefile
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
require 'bundler'
|
2
2
|
Bundler::GemHelper.install_tasks
|
3
3
|
|
4
|
-
require '
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
|
6
|
+
RSpec::Core::RakeTask.new(:spec) do |task|
|
7
|
+
task.rspec_opts = ["-c", "-f progress", "-r ./spec/spec_helper.rb"]
|
8
|
+
task.pattern = 'spec/**/*_spec.rb'
|
8
9
|
end
|
9
10
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
spec.rcov = true
|
11
|
+
RSpec::Core::RakeTask.new(:rcov) do |task|
|
12
|
+
task.pattern = 'spec/**/*_spec.rb'
|
13
|
+
task.rcov = true
|
14
14
|
end
|
15
15
|
|
16
16
|
task :default => :spec
|
@@ -0,0 +1 @@
|
|
1
|
+
Autotest.add_discovery { "rspec2" }
|
@@ -13,7 +13,7 @@ module Textractor::Extractors
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def text_from_path(path)
|
16
|
-
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
|
17
17
|
puts command if $DEBUG
|
18
18
|
`#{command}`.strip
|
19
19
|
end
|
data/lib/textractor/version.rb
CHANGED
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
text
|
@@ -55,4 +55,20 @@ describe Textractor do
|
|
55
55
|
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
56
56
|
end
|
57
57
|
|
58
|
+
it 'returns the contents of doc files with a space in the path' do
|
59
|
+
Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'returns the contents of docx files with a space in the path' do
|
63
|
+
Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'returns the contents of pdf files with a space in the path' do
|
67
|
+
Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'returns the contents of txt files with a space in the path' do
|
71
|
+
Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
|
72
|
+
end
|
73
|
+
|
58
74
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'bundler/setup'
|
3
|
-
require 'spec'
|
4
|
-
|
5
1
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
6
2
|
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'rspec'
|
7
6
|
require 'textractor'
|
8
7
|
|
9
8
|
def fixture_path(path)
|
10
9
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
|
11
10
|
end
|
12
11
|
|
13
|
-
|
12
|
+
RSpec.configure do |config|
|
14
13
|
|
15
14
|
end
|
data/spec/textractor_spec.rb
CHANGED
data/textractor.gemspec
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
require File.expand_path("../lib/textractor/version", __FILE__)
|
3
2
|
|
4
3
|
Gem::Specification.new do |s|
|
@@ -15,7 +14,7 @@ Gem::Specification.new do |s|
|
|
15
14
|
s.rubyforge_project = "textractor"
|
16
15
|
|
17
16
|
s.add_development_dependency "bundler", ">= 1.0.0"
|
18
|
-
s.add_development_dependency "rspec", "~> 1.3.0"
|
17
|
+
s.add_development_dependency "rspec", "~> 2.1.0"
|
19
18
|
|
20
19
|
s.files = `git ls-files`.split("\n")
|
21
20
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
data/vendor/docx2txt/docx2txt.pl
CHANGED
@@ -228,7 +228,9 @@ if ($ENV{OS} =~ /^Windows/) {
|
|
228
228
|
if ($inpIsDir eq 'y') {
|
229
229
|
readFileInto("$ARGV[0]/word/document.xml", $content);
|
230
230
|
} else {
|
231
|
-
|
231
|
+
# Special fix for single quotes, they get un-escaped earlier
|
232
|
+
$ARGV[0] =~ s/\'/\'\\\'\'/g;
|
233
|
+
$content = `"$unzip" -p '$ARGV[0]' word/document.xml 2>$nulldevice`;
|
232
234
|
}
|
233
235
|
|
234
236
|
die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
|
@@ -260,7 +262,7 @@ binmode $txtfile; # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
|
|
260
262
|
if ($inpIsDir eq 'y') {
|
261
263
|
readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
|
262
264
|
} else {
|
263
|
-
$_ = `"$unzip" -p
|
265
|
+
$_ = `"$unzip" -p '$ARGV[0]' word/_rels/document.xml.rels 2>$nulldevice`;
|
264
266
|
}
|
265
267
|
|
266
268
|
my %docurels;
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 17
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 4
|
10
|
-
version: 0.1.4
|
9
|
+
- 5
|
10
|
+
version: 0.1.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-07-20 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -42,12 +42,12 @@ dependencies:
|
|
42
42
|
requirements:
|
43
43
|
- - ~>
|
44
44
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
45
|
+
hash: 11
|
46
46
|
segments:
|
47
|
+
- 2
|
47
48
|
- 1
|
48
|
-
- 3
|
49
49
|
- 0
|
50
|
-
version: 1.3.0
|
50
|
+
version: 2.1.0
|
51
51
|
type: :development
|
52
52
|
version_requirements: *id002
|
53
53
|
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
@@ -63,11 +63,13 @@ extra_rdoc_files:
|
|
63
63
|
files:
|
64
64
|
- .document
|
65
65
|
- .gitignore
|
66
|
+
- .rspec
|
66
67
|
- Gemfile
|
67
68
|
- Gemfile.lock
|
68
69
|
- LICENSE
|
69
70
|
- README.md
|
70
71
|
- Rakefile
|
72
|
+
- autotest/discover.rb
|
71
73
|
- bin/textractor
|
72
74
|
- lib/textractor.rb
|
73
75
|
- lib/textractor/content_type_detector.rb
|
@@ -79,13 +81,16 @@ files:
|
|
79
81
|
- lib/textractor/extractors/pdf_extractor.rb
|
80
82
|
- lib/textractor/extractors/text_extractor.rb
|
81
83
|
- lib/textractor/version.rb
|
84
|
+
- spec/fixtures/document .doc
|
85
|
+
- spec/fixtures/document .docx
|
86
|
+
- spec/fixtures/document .pdf
|
87
|
+
- spec/fixtures/document .txt
|
82
88
|
- spec/fixtures/document.doc
|
83
89
|
- spec/fixtures/document.docx
|
84
90
|
- spec/fixtures/document.pdf
|
85
91
|
- spec/fixtures/document.txt
|
86
92
|
- spec/fixtures/no_extension
|
87
93
|
- spec/integration/textractor_spec.rb
|
88
|
-
- spec/spec.opts
|
89
94
|
- spec/spec_helper.rb
|
90
95
|
- spec/textractor_spec.rb
|
91
96
|
- support/wvText.xml
|
@@ -137,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
137
142
|
requirements: []
|
138
143
|
|
139
144
|
rubyforge_project: textractor
|
140
|
-
rubygems_version: 1.
|
145
|
+
rubygems_version: 1.6.2
|
141
146
|
signing_key:
|
142
147
|
specification_version: 3
|
143
148
|
summary: simple wrapper around CLI for extracting text from PDF and Word documents
|
data/spec/spec.opts
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
--color
|