textractor 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +6 -4
- data/Rakefile +3 -3
- data/VERSION +1 -1
- data/lib/textractor/document.rb +7 -0
- data/spec/document_spec.rb +10 -0
- data/spec/fixtures/document.txt +1 -0
- data/textractor.gemspec +62 -0
- metadata +18 -9
data/README.md
CHANGED
@@ -1,21 +1,23 @@
|
|
1
1
|
# textractor
|
2
2
|
|
3
|
-
textractor is a ruby library that provides a simple wrapper
|
4
|
-
extracting text from PDF and Word documents.
|
3
|
+
textractor is a ruby library that provides a simple wrapper around CLI
|
4
|
+
tools for extracting text from PDF and Word documents.
|
5
5
|
|
6
6
|
## Setup
|
7
7
|
|
8
|
+
gem install textractor
|
9
|
+
|
8
10
|
In order to use textractor you have to install a few command line
|
9
11
|
tools.
|
10
12
|
|
11
13
|
### OS X
|
12
14
|
|
13
|
-
port install wv
|
15
|
+
port install wv xpdf links
|
14
16
|
|
15
17
|
I also recommend passing +no_x11 to the install command, but
|
16
18
|
this may not work on all systems due to dependency issues.
|
17
19
|
|
18
|
-
port install wv
|
20
|
+
port install wv xpdf links +no_x11
|
19
21
|
|
20
22
|
### Ubuntu 8.04
|
21
23
|
|
data/Rakefile
CHANGED
@@ -5,12 +5,12 @@ begin
|
|
5
5
|
require 'jeweler'
|
6
6
|
Jeweler::Tasks.new do |gem|
|
7
7
|
gem.name = "textractor"
|
8
|
-
gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
|
9
|
-
gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
|
8
|
+
gem.summary = %Q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
|
9
|
+
gem.description = %Q{simple wrapper around CLI for extracting text from PDF and Word documents}
|
10
10
|
gem.email = "mguterl@gmail.com"
|
11
11
|
gem.homepage = "http://github.com/mguterl/textractor"
|
12
12
|
gem.authors = ["Michael Guterl"]
|
13
|
-
gem.add_development_dependency "rspec", ">= 1.
|
13
|
+
gem.add_development_dependency "rspec", ">= 1.3.0"
|
14
14
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
15
|
end
|
16
16
|
Jeweler::GemcutterTasks.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.1
|
1
|
+
0.0.2
|
data/lib/textractor/document.rb
CHANGED
@@ -7,6 +7,7 @@ module Textractor
|
|
7
7
|
'application/x-pdf' => :pdf,
|
8
8
|
'application/doc' => :word,
|
9
9
|
'application/x-doc' => :word,
|
10
|
+
'text/plain' => :txt
|
10
11
|
}
|
11
12
|
|
12
13
|
attr_reader :filename
|
@@ -27,6 +28,8 @@ module Textractor
|
|
27
28
|
:pdf
|
28
29
|
when /doc/
|
29
30
|
:word
|
31
|
+
when /txt/
|
32
|
+
:txt
|
30
33
|
else
|
31
34
|
nil
|
32
35
|
end
|
@@ -46,6 +49,10 @@ module Textractor
|
|
46
49
|
`wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
|
47
50
|
end
|
48
51
|
|
52
|
+
def extract_from_txt
|
53
|
+
File.read(filename)
|
54
|
+
end
|
55
|
+
|
49
56
|
end
|
50
57
|
|
51
58
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -4,6 +4,7 @@ describe Textractor::Document do
|
|
4
4
|
|
5
5
|
PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
|
6
6
|
WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
|
7
|
+
TXT_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.txt")
|
7
8
|
|
8
9
|
it 'should require a filename to create' do
|
9
10
|
expect { Textractor::Document.new }.to raise_error(ArgumentError)
|
@@ -30,6 +31,15 @@ describe Textractor::Document do
|
|
30
31
|
|
31
32
|
end
|
32
33
|
|
34
|
+
describe "with txt document" do
|
35
|
+
|
36
|
+
it 'should extract the text from the document' do
|
37
|
+
@doc = Textractor::Document.new(TXT_DOCUMENT_FIXTURE)
|
38
|
+
@doc.text.should == "Ruby on rails developer"
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
33
43
|
end
|
34
44
|
|
35
45
|
describe "#type" do
|
data/spec/fixtures/document.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Ruby on rails developer
|
data/textractor.gemspec
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{textractor}
|
8
|
+
s.version = "0.0.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Michael Guterl"]
|
12
|
+
s.date = %q{2010-07-26}
|
13
|
+
s.description = %q{simple wrapper around CLI for extracting text from PDF and Word documents}
|
14
|
+
s.email = %q{mguterl@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.md"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".gitignore",
|
22
|
+
"LICENSE",
|
23
|
+
"README.md",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"lib/textractor.rb",
|
27
|
+
"lib/textractor/document.rb",
|
28
|
+
"spec/document_spec.rb",
|
29
|
+
"spec/fixtures/document.doc",
|
30
|
+
"spec/fixtures/document.pdf",
|
31
|
+
"spec/fixtures/document.txt",
|
32
|
+
"spec/spec.opts",
|
33
|
+
"spec/spec_helper.rb",
|
34
|
+
"spec/textractor_spec.rb",
|
35
|
+
"support/wvText.xml",
|
36
|
+
"textractor.gemspec"
|
37
|
+
]
|
38
|
+
s.homepage = %q{http://github.com/mguterl/textractor}
|
39
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
40
|
+
s.require_paths = ["lib"]
|
41
|
+
s.rubygems_version = %q{1.3.7}
|
42
|
+
s.summary = %q{simple wrapper around CLI tools for extracting text from PDF and Word documents}
|
43
|
+
s.test_files = [
|
44
|
+
"spec/document_spec.rb",
|
45
|
+
"spec/spec_helper.rb",
|
46
|
+
"spec/textractor_spec.rb"
|
47
|
+
]
|
48
|
+
|
49
|
+
if s.respond_to? :specification_version then
|
50
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
51
|
+
s.specification_version = 3
|
52
|
+
|
53
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
54
|
+
s.add_development_dependency(%q<rspec>, [">= 1.3.0"])
|
55
|
+
else
|
56
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
57
|
+
end
|
58
|
+
else
|
59
|
+
s.add_dependency(%q<rspec>, [">= 1.3.0"])
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 0
|
8
|
-
- 1
|
9
|
-
version: 0.0.1
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Michael Guterl
|
@@ -14,24 +15,26 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-
|
18
|
+
date: 2010-07-26 00:00:00 -04:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
21
22
|
name: rspec
|
22
23
|
prerelease: false
|
23
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
24
26
|
requirements:
|
25
27
|
- - ">="
|
26
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 27
|
27
30
|
segments:
|
28
31
|
- 1
|
29
|
-
-
|
30
|
-
-
|
31
|
-
version: 1.
|
32
|
+
- 3
|
33
|
+
- 0
|
34
|
+
version: 1.3.0
|
32
35
|
type: :development
|
33
36
|
version_requirements: *id001
|
34
|
-
description: simple wrapper for extracting text from PDF and Word documents
|
37
|
+
description: simple wrapper around CLI for extracting text from PDF and Word documents
|
35
38
|
email: mguterl@gmail.com
|
36
39
|
executables: []
|
37
40
|
|
@@ -52,10 +55,12 @@ files:
|
|
52
55
|
- spec/document_spec.rb
|
53
56
|
- spec/fixtures/document.doc
|
54
57
|
- spec/fixtures/document.pdf
|
58
|
+
- spec/fixtures/document.txt
|
55
59
|
- spec/spec.opts
|
56
60
|
- spec/spec_helper.rb
|
57
61
|
- spec/textractor_spec.rb
|
58
62
|
- support/wvText.xml
|
63
|
+
- textractor.gemspec
|
59
64
|
has_rdoc: true
|
60
65
|
homepage: http://github.com/mguterl/textractor
|
61
66
|
licenses: []
|
@@ -66,26 +71,30 @@ rdoc_options:
|
|
66
71
|
require_paths:
|
67
72
|
- lib
|
68
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
69
75
|
requirements:
|
70
76
|
- - ">="
|
71
77
|
- !ruby/object:Gem::Version
|
78
|
+
hash: 3
|
72
79
|
segments:
|
73
80
|
- 0
|
74
81
|
version: "0"
|
75
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
76
84
|
requirements:
|
77
85
|
- - ">="
|
78
86
|
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
79
88
|
segments:
|
80
89
|
- 0
|
81
90
|
version: "0"
|
82
91
|
requirements: []
|
83
92
|
|
84
93
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.3.
|
94
|
+
rubygems_version: 1.3.7
|
86
95
|
signing_key:
|
87
96
|
specification_version: 3
|
88
|
-
summary: simple wrapper for extracting text from PDF and Word documents
|
97
|
+
summary: simple wrapper around CLI tools for extracting text from PDF and Word documents
|
89
98
|
test_files:
|
90
99
|
- spec/document_spec.rb
|
91
100
|
- spec/spec_helper.rb
|