textractor 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/Gemfile.lock +10 -2
- data/README.md +12 -0
- data/Rakefile +8 -8
- data/autotest/discover.rb +1 -0
- data/lib/textractor/extractors/doc_extractor.rb +1 -1
- data/lib/textractor/extractors/docx_extractor.rb +1 -1
- data/lib/textractor/extractors/pdf_extractor.rb +1 -1
- data/lib/textractor/version.rb +1 -1
- data/spec/fixtures/document .doc +0 -0
- data/spec/fixtures/document .docx +0 -0
- data/spec/fixtures/document .pdf +0 -0
- data/spec/fixtures/document .txt +1 -0
- data/spec/integration/textractor_spec.rb +16 -0
- data/spec/spec_helper.rb +4 -5
- data/spec/textractor_spec.rb +1 -1
- data/textractor.gemspec +1 -2
- data/vendor/docx2txt/docx2txt.pl +4 -2
- metadata +15 -10
- data/spec/spec.opts +0 -1
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
-c -f progress -r ./spec/spec_helper
|
data/Gemfile.lock
CHANGED
@@ -6,12 +6,20 @@ PATH
|
|
6
6
|
GEM
|
7
7
|
remote: http://rubygems.org/
|
8
8
|
specs:
|
9
|
-
|
9
|
+
diff-lcs (1.1.2)
|
10
|
+
rspec (2.1.0)
|
11
|
+
rspec-core (~> 2.1.0)
|
12
|
+
rspec-expectations (~> 2.1.0)
|
13
|
+
rspec-mocks (~> 2.1.0)
|
14
|
+
rspec-core (2.1.0)
|
15
|
+
rspec-expectations (2.1.0)
|
16
|
+
diff-lcs (~> 1.1.2)
|
17
|
+
rspec-mocks (2.1.0)
|
10
18
|
|
11
19
|
PLATFORMS
|
12
20
|
ruby
|
13
21
|
|
14
22
|
DEPENDENCIES
|
15
23
|
bundler (>= 1.0.0)
|
16
|
-
rspec (~> 1.3.0)
|
24
|
+
rspec (~> 2.1.0)
|
17
25
|
textractor!
|
data/README.md
CHANGED
@@ -10,6 +10,18 @@ In order to use textractor you have to install a few command line tools.
|
|
10
10
|
|
11
11
|
### OS X
|
12
12
|
|
13
|
+
#### Homebrew
|
14
|
+
|
15
|
+
brew install xpdf links
|
16
|
+
|
17
|
+
Download wv-1.2.4 and install from source: http://sourceforge.net/projects/wvware/files/wv/1.2.4/wv-1.2.4.tar.gz/download
|
18
|
+
|
19
|
+
./configure
|
20
|
+
make
|
21
|
+
make install
|
22
|
+
|
23
|
+
#### MacPorts
|
24
|
+
|
13
25
|
port install wv xpdf links
|
14
26
|
|
15
27
|
I recommend also passing +no_x11 to the install command, but this may not work on all systems due to dependency issues.
|
data/Rakefile
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
require 'bundler'
|
2
2
|
Bundler::GemHelper.install_tasks
|
3
3
|
|
4
|
-
require '
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
|
6
|
+
RSpec::Core::RakeTask.new(:spec) do |task|
|
7
|
+
task.rspec_opts = ["-c", "-f progress", "-r ./spec/spec_helper.rb"]
|
8
|
+
task.pattern = 'spec/**/*_spec.rb'
|
8
9
|
end
|
9
10
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
spec.rcov = true
|
11
|
+
RSpec::Core::RakeTask.new(:rcov) do |task|
|
12
|
+
task.pattern = 'spec/**/*_spec.rb'
|
13
|
+
task.rcov = true
|
14
14
|
end
|
15
15
|
|
16
16
|
task :default => :spec
|
@@ -0,0 +1 @@
|
|
1
|
+
Autotest.add_discovery { "rspec2" }
|
@@ -13,7 +13,7 @@ module Textractor::Extractors
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def text_from_path(path)
|
16
|
-
command = "wvWare -c utf-8 --nographics -x #{wvText_path} #{path}"
|
16
|
+
command = "wvWare -c utf-8 --nographics -x #{wvText_path} '#{path}'"
|
17
17
|
puts command if $DEBUG
|
18
18
|
`#{command}`.strip
|
19
19
|
end
|
data/lib/textractor/version.rb
CHANGED
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
text
|
@@ -55,4 +55,20 @@ describe Textractor do
|
|
55
55
|
}.to raise_error(Textractor::ContentTypeNotRegistered)
|
56
56
|
end
|
57
57
|
|
58
|
+
it 'returns the contents of doc files with a space in the path' do
|
59
|
+
Textractor.text_from_path(fixture_path("document .doc")).should == 'text'
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'returns the contents of docx files with a space in the path' do
|
63
|
+
Textractor.text_from_path(fixture_path("document .docx")).should == 'text'
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'returns the contents of pdf files with a space in the path' do
|
67
|
+
Textractor.text_from_path(fixture_path("document .pdf")).should == 'text'
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'returns the contents of txt files with a space in the path' do
|
71
|
+
Textractor.text_from_path(fixture_path("document .txt")).should == 'text'
|
72
|
+
end
|
73
|
+
|
58
74
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'bundler/setup'
|
3
|
-
require 'spec'
|
4
|
-
|
5
1
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
6
2
|
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'rspec'
|
7
6
|
require 'textractor'
|
8
7
|
|
9
8
|
def fixture_path(path)
|
10
9
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', path))
|
11
10
|
end
|
12
11
|
|
13
|
-
|
12
|
+
RSpec.configure do |config|
|
14
13
|
|
15
14
|
end
|
data/spec/textractor_spec.rb
CHANGED
data/textractor.gemspec
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
require File.expand_path("../lib/textractor/version", __FILE__)
|
3
2
|
|
4
3
|
Gem::Specification.new do |s|
|
@@ -15,7 +14,7 @@ Gem::Specification.new do |s|
|
|
15
14
|
s.rubyforge_project = "textractor"
|
16
15
|
|
17
16
|
s.add_development_dependency "bundler", ">= 1.0.0"
|
18
|
-
s.add_development_dependency "rspec", "~> 1.3.0"
|
17
|
+
s.add_development_dependency "rspec", "~> 2.1.0"
|
19
18
|
|
20
19
|
s.files = `git ls-files`.split("\n")
|
21
20
|
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
data/vendor/docx2txt/docx2txt.pl
CHANGED
@@ -228,7 +228,9 @@ if ($ENV{OS} =~ /^Windows/) {
|
|
228
228
|
if ($inpIsDir eq 'y') {
|
229
229
|
readFileInto("$ARGV[0]/word/document.xml", $content);
|
230
230
|
} else {
|
231
|
-
|
231
|
+
# Special fix for single quotes, they get un-escaped earlier
|
232
|
+
$ARGV[0] =~ s/\'/\'\\\'\'/g;
|
233
|
+
$content = `"$unzip" -p '$ARGV[0]' word/document.xml 2>$nulldevice`;
|
232
234
|
}
|
233
235
|
|
234
236
|
die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
|
@@ -260,7 +262,7 @@ binmode $txtfile; # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
|
|
260
262
|
if ($inpIsDir eq 'y') {
|
261
263
|
readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
|
262
264
|
} else {
|
263
|
-
$_ = `"$unzip" -p
|
265
|
+
$_ = `"$unzip" -p '$ARGV[0]' word/_rels/document.xml.rels 2>$nulldevice`;
|
264
266
|
}
|
265
267
|
|
266
268
|
my %docurels;
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 17
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 4
|
10
|
-
version: 0.1.4
|
9
|
+
- 5
|
10
|
+
version: 0.1.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Michael Guterl
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-07-20 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -42,12 +42,12 @@ dependencies:
|
|
42
42
|
requirements:
|
43
43
|
- - ~>
|
44
44
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
45
|
+
hash: 11
|
46
46
|
segments:
|
47
|
+
- 2
|
47
48
|
- 1
|
48
|
-
- 3
|
49
49
|
- 0
|
50
|
-
version: 1.3.0
|
50
|
+
version: 2.1.0
|
51
51
|
type: :development
|
52
52
|
version_requirements: *id002
|
53
53
|
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
@@ -63,11 +63,13 @@ extra_rdoc_files:
|
|
63
63
|
files:
|
64
64
|
- .document
|
65
65
|
- .gitignore
|
66
|
+
- .rspec
|
66
67
|
- Gemfile
|
67
68
|
- Gemfile.lock
|
68
69
|
- LICENSE
|
69
70
|
- README.md
|
70
71
|
- Rakefile
|
72
|
+
- autotest/discover.rb
|
71
73
|
- bin/textractor
|
72
74
|
- lib/textractor.rb
|
73
75
|
- lib/textractor/content_type_detector.rb
|
@@ -79,13 +81,16 @@ files:
|
|
79
81
|
- lib/textractor/extractors/pdf_extractor.rb
|
80
82
|
- lib/textractor/extractors/text_extractor.rb
|
81
83
|
- lib/textractor/version.rb
|
84
|
+
- spec/fixtures/document .doc
|
85
|
+
- spec/fixtures/document .docx
|
86
|
+
- spec/fixtures/document .pdf
|
87
|
+
- spec/fixtures/document .txt
|
82
88
|
- spec/fixtures/document.doc
|
83
89
|
- spec/fixtures/document.docx
|
84
90
|
- spec/fixtures/document.pdf
|
85
91
|
- spec/fixtures/document.txt
|
86
92
|
- spec/fixtures/no_extension
|
87
93
|
- spec/integration/textractor_spec.rb
|
88
|
-
- spec/spec.opts
|
89
94
|
- spec/spec_helper.rb
|
90
95
|
- spec/textractor_spec.rb
|
91
96
|
- support/wvText.xml
|
@@ -137,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
137
142
|
requirements: []
|
138
143
|
|
139
144
|
rubyforge_project: textractor
|
140
|
-
rubygems_version: 1.
|
145
|
+
rubygems_version: 1.6.2
|
141
146
|
signing_key:
|
142
147
|
specification_version: 3
|
143
148
|
summary: simple wrapper around CLI for extracting text from PDF and Word documents
|
data/spec/spec.opts
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
--color
|