textractor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +54 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/textractor.rb +18 -0
- data/lib/textractor/document.rb +51 -0
- data/spec/document_spec.rb +69 -0
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/textractor_spec.rb +32 -0
- data/support/wvText.xml +355 -0
- metadata +92 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Michael Guterl
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# textractor
|
2
|
+
|
3
|
+
textractor is a ruby library that provides a simple wrapper for
|
4
|
+
extracting text from PDF and Word documents.
|
5
|
+
|
6
|
+
## Setup
|
7
|
+
|
8
|
+
In order to use textractor you have to install a few command line
|
9
|
+
tools.
|
10
|
+
|
11
|
+
### OS X
|
12
|
+
|
13
|
+
port install wv pdftohtml links
|
14
|
+
|
15
|
+
I also recommend passing +no_x11 to the install command, but
|
16
|
+
this may not work on all systems due to dependency issues.
|
17
|
+
|
18
|
+
port install wv pdftohtml links +no_x11
|
19
|
+
|
20
|
+
### Ubuntu 8.04
|
21
|
+
|
22
|
+
apt-get install wv xpdf-utils links
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
Due to textractor's reliance on command line tools all the methods in
|
27
|
+
textractor work on paths, not File objects.
|
28
|
+
|
29
|
+
document = Textractor::Document.new(path_to_document)
|
30
|
+
document.text # => "Ruby on rails developer"
|
31
|
+
|
32
|
+
There is also a convenience method on Textractor.
|
33
|
+
|
34
|
+
Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
|
35
|
+
|
36
|
+
Textractor will attempt to guess what type of document you're trying
|
37
|
+
to extract text from. However, if you know the content type of your
|
38
|
+
document, you can provide it and Textractor won't guess.
|
39
|
+
|
40
|
+
Textractor.text_from_file(path_to_document, :content_type => "application/doc")
|
41
|
+
|
42
|
+
## Note on Patches/Pull Requests
|
43
|
+
|
44
|
+
* Fork the project.
|
45
|
+
* Make your feature addition or bug fix.
|
46
|
+
* Add tests for it. This is important so I don't break it in a
|
47
|
+
future version unintentionally.
|
48
|
+
* Commit, do not mess with rakefile, version, or history.
|
49
|
+
(if you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull)
|
50
|
+
* Send me a pull request. Bonus points for topic branches.
|
51
|
+
|
52
|
+
## Copyright
|
53
|
+
|
54
|
+
Copyright (c) 2010 Michael Guterl. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "textractor"
|
8
|
+
gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
|
9
|
+
gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
|
10
|
+
gem.email = "mguterl@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/mguterl/textractor"
|
12
|
+
gem.authors = ["Michael Guterl"]
|
13
|
+
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
+
end
|
16
|
+
Jeweler::GemcutterTasks.new
|
17
|
+
rescue LoadError
|
18
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
19
|
+
end
|
20
|
+
|
21
|
+
require 'spec/rake/spectask'
|
22
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
23
|
+
spec.libs << 'lib' << 'spec'
|
24
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
25
|
+
end
|
26
|
+
|
27
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
28
|
+
spec.libs << 'lib' << 'spec'
|
29
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
30
|
+
spec.rcov = true
|
31
|
+
end
|
32
|
+
|
33
|
+
task :spec => :check_dependencies
|
34
|
+
|
35
|
+
task :default => :spec
|
36
|
+
|
37
|
+
require 'rake/rdoctask'
|
38
|
+
Rake::RDocTask.new do |rdoc|
|
39
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
40
|
+
|
41
|
+
rdoc.rdoc_dir = 'rdoc'
|
42
|
+
rdoc.title = "textractor #{version}"
|
43
|
+
rdoc.rdoc_files.include('README*')
|
44
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
45
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/lib/textractor.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Textractor
|
2
|
+
autoload :Document, "textractor/document"
|
3
|
+
|
4
|
+
def self.text_from_file(filename, options = {})
|
5
|
+
Textractor::Document.new(filename, options).text
|
6
|
+
end
|
7
|
+
|
8
|
+
DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../support/wvText.xml")
|
9
|
+
|
10
|
+
def self.wvText_path
|
11
|
+
@wvText_path || DEFAULT_WV_TEXT_PATH
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.wvText_path=(path)
|
15
|
+
@wvText_path = path
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
data/lib/textractor/document.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module Textractor
|
2
|
+
|
3
|
+
class Document
|
4
|
+
|
5
|
+
CONTENT_TYPE_CONVERSIONS = {
|
6
|
+
'application/pdf' => :pdf,
|
7
|
+
'application/x-pdf' => :pdf,
|
8
|
+
'application/doc' => :word,
|
9
|
+
'application/x-doc' => :word,
|
10
|
+
}
|
11
|
+
|
12
|
+
attr_reader :filename
|
13
|
+
|
14
|
+
def initialize(filename, options = {})
|
15
|
+
@filename = File.expand_path(filename)
|
16
|
+
@content_type = options[:content_type]
|
17
|
+
end
|
18
|
+
|
19
|
+
def text
|
20
|
+
send("extract_from_#{type}")
|
21
|
+
end
|
22
|
+
|
23
|
+
def type
|
24
|
+
return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
|
25
|
+
case File.extname(@filename)
|
26
|
+
when /pdf/
|
27
|
+
:pdf
|
28
|
+
when /doc/
|
29
|
+
:word
|
30
|
+
else
|
31
|
+
nil
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def content_type
|
38
|
+
@content_type
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_from_pdf
|
42
|
+
`pdftotext #{filename} - 2>/dev/null`.strip
|
43
|
+
end
|
44
|
+
|
45
|
+
def extract_from_word
|
46
|
+
`wvWare -c utf-8 --nographics -x #{Textractor.wvText_path} #{filename} 2>/dev/null`.strip
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/spec/document_spec.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
describe Textractor::Document do
|
4
|
+
|
5
|
+
PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
|
6
|
+
WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
|
7
|
+
|
8
|
+
it 'should require a filename to create' do
|
9
|
+
expect { Textractor::Document.new }.to raise_error(ArgumentError)
|
10
|
+
Textractor::Document.new('filename').filename.should == File.expand_path('filename')
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#text" do
|
14
|
+
|
15
|
+
describe "with pdf document" do
|
16
|
+
|
17
|
+
it 'should extract the text from the document' do
|
18
|
+
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
19
|
+
@doc.text.should == "Ruby on rails developer"
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
describe "with word document" do
|
25
|
+
|
26
|
+
it 'should extract the text from the document' do
|
27
|
+
@doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
|
28
|
+
@doc.text.should == "Ruby on rails developer"
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "#type" do
|
36
|
+
|
37
|
+
describe "with no content type provided" do
|
38
|
+
it 'should return :pdf for PDF documents' do
|
39
|
+
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
40
|
+
@doc.type.should == :pdf
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should return :word for Word documents' do
|
44
|
+
@doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
|
45
|
+
@doc.type.should == :word
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should return nil for unknown documents' do
|
49
|
+
@doc = Textractor::Document.new("foo.bar")
|
50
|
+
@doc.type.should == nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
describe "with a content type provided" do
|
55
|
+
|
56
|
+
it 'should ignore the extension of the file' do
|
57
|
+
[PDF_DOCUMENT_FIXTURE, WORD_DOCUMENT_FIXTURE].each do |filename|
|
58
|
+
Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
|
59
|
+
@doc = Textractor::Document.new(filename, :content_type => content_type)
|
60
|
+
@doc.type.should == type
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
Binary file
|
Binary file
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
data/spec/textractor_spec.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
describe Textractor do
|
4
|
+
|
5
|
+
describe ".wvText_path" do
|
6
|
+
|
7
|
+
it 'should default to the file provided with the gem' do
|
8
|
+
Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should use the new wvText_path if provided' do
|
12
|
+
Textractor.wvText_path = "foo.bar"
|
13
|
+
Textractor.wvText_path.should == "foo.bar"
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
describe ".text_from_file" do
|
19
|
+
|
20
|
+
it 'should return the extracted text from the file' do
|
21
|
+
document_path = 'word.doc'
|
22
|
+
document = mock("Textractor::Document", :text => "Ruby on Rails developer")
|
23
|
+
Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
|
24
|
+
Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
after(:all) do
|
30
|
+
Textractor.instance_variable_set(:"@wvText_path", nil)
|
31
|
+
end
|
32
|
+
end
|
data/support/wvText.xml
ADDED
@@ -0,0 +1,355 @@
|
|
1
|
+
<main>
|
2
|
+
<charentity>
|
3
|
+
<begin>ABW</begin>
|
4
|
+
</charentity>
|
5
|
+
|
6
|
+
<document>
|
7
|
+
<begin>
|
8
|
+
</begin>
|
9
|
+
<end>
|
10
|
+
</end>
|
11
|
+
</document>
|
12
|
+
|
13
|
+
<section>
|
14
|
+
<begin>
|
15
|
+
</begin>
|
16
|
+
<end>
|
17
|
+
</end>
|
18
|
+
</section>
|
19
|
+
|
20
|
+
<justification>
|
21
|
+
<left></left>
|
22
|
+
<right></right>
|
23
|
+
<center></center>
|
24
|
+
<block></block>
|
25
|
+
<asian></asian>
|
26
|
+
</justification>
|
27
|
+
|
28
|
+
<numbering>
|
29
|
+
<Arabic>type="1"</Arabic>
|
30
|
+
<UpperRoman>type="I"</UpperRoman>
|
31
|
+
<LowerRoman>type="i"</LowerRoman>
|
32
|
+
<UpperCaseN>type="A"</UpperCaseN>
|
33
|
+
<LowerCaseN>type="a"</LowerCaseN>
|
34
|
+
</numbering>
|
35
|
+
|
36
|
+
<border>
|
37
|
+
<noned></noned>
|
38
|
+
<singled></singled>
|
39
|
+
<thickd></thickd>
|
40
|
+
<doubled></doubled>
|
41
|
+
<number4d></number4d>
|
42
|
+
<hairlined></hairlined>
|
43
|
+
<dotd></dotd>
|
44
|
+
<dashlargegapd></dashlargegapd>
|
45
|
+
<dotdashd></dotdashd>
|
46
|
+
<dotdotdashd></dotdotdashd>
|
47
|
+
<tripled></tripled>
|
48
|
+
<thin-thicksmallgapd></thin-thicksmallgapd>
|
49
|
+
<thick-thinsmallgapd></thick-thinsmallgapd>
|
50
|
+
<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
|
51
|
+
<thin-thickmediumgapd></thin-thickmediumgapd>
|
52
|
+
<thick-thinmediumgapd></thick-thinmediumgapd>
|
53
|
+
<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
|
54
|
+
<thin-thicklargegapd></thin-thicklargegapd>
|
55
|
+
<thick-thinlargegapd></thick-thinlargegapd>
|
56
|
+
<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
|
57
|
+
<waved></waved>
|
58
|
+
<doublewaved></doublewaved>
|
59
|
+
<dashsmallgapd></dashsmallgapd>
|
60
|
+
<dashdotstrokedd></dashdotstrokedd>
|
61
|
+
<emboss3Dd></emboss3Dd>
|
62
|
+
<engrave3Dd></engrave3Dd>
|
63
|
+
<defaultd></defaultd>
|
64
|
+
</border>
|
65
|
+
|
66
|
+
<olist>
|
67
|
+
<begin></begin>
|
68
|
+
<end></end>
|
69
|
+
</olist>
|
70
|
+
|
71
|
+
<ulist>
|
72
|
+
<begin></begin>
|
73
|
+
<end></end>
|
74
|
+
</ulist>
|
75
|
+
|
76
|
+
<entry>
|
77
|
+
<begin></begin>
|
78
|
+
<end></end>
|
79
|
+
</entry>
|
80
|
+
|
81
|
+
<!-- the only thing of significance -->
|
82
|
+
<text>
|
83
|
+
<begin></begin>
|
84
|
+
<end>
|
85
|
+
</end>
|
86
|
+
</text>
|
87
|
+
|
88
|
+
<!--
|
89
|
+
this tableoverride option can be used to turn off handling of
|
90
|
+
these tags in tables, which I find is necessary for at least netscape
|
91
|
+
-->
|
92
|
+
<tableoverrides>
|
93
|
+
<ParaBefore>0</ParaBefore>
|
94
|
+
<ParaRight>0</ParaRight>
|
95
|
+
<ParaAfter>0</ParaAfter>
|
96
|
+
<ParaLeft>0</ParaLeft>
|
97
|
+
<ParaLeft1>0</ParaLeft1>
|
98
|
+
<VertMergedCells>0</VertMergedCells>
|
99
|
+
</tableoverrides>
|
100
|
+
|
101
|
+
<table>
|
102
|
+
<begin></begin>
|
103
|
+
<end></end>
|
104
|
+
</table>
|
105
|
+
|
106
|
+
<row>
|
107
|
+
<begin></begin>
|
108
|
+
<end></end>
|
109
|
+
</row>
|
110
|
+
|
111
|
+
<cell>
|
112
|
+
<begin></begin>
|
113
|
+
<end></end>
|
114
|
+
</cell>
|
115
|
+
|
116
|
+
<paragraph>
|
117
|
+
<begin><text.begin/></begin>
|
118
|
+
<end><text.end/></end>
|
119
|
+
</paragraph>
|
120
|
+
|
121
|
+
<!-- these are all the character properties that can show up in word -->
|
122
|
+
<bold><begin></begin><end></end></bold>
|
123
|
+
<italic><begin></begin><end></end></italic>
|
124
|
+
|
125
|
+
<!--
|
126
|
+
text that has been deleted and will be displayed with strikethrough when
|
127
|
+
revision marked text is to be displayed
|
128
|
+
|
129
|
+
use either this line...
|
130
|
+
-->
|
131
|
+
<RMarkDel><begin></begin>
|
132
|
+
<end></end>
|
133
|
+
</RMarkDel>
|
134
|
+
|
135
|
+
<!--
|
136
|
+
or uncomment below to make deleted text disappear (well, become commented out)
|
137
|
+
-->
|
138
|
+
<!--
|
139
|
+
<RMarkDel><begin><!--</begin><end>--></end></RMarkDel>
|
140
|
+
-->
|
141
|
+
|
142
|
+
<!-- I don't even know what outline means -->
|
143
|
+
<outline><begin></begin><end></end></outline>
|
144
|
+
<smallcaps><begin></begin><end></end></smallcaps>
|
145
|
+
<caps><begin></begin><end></end></caps>
|
146
|
+
<vanish><begin></begin><end></end></vanish>
|
147
|
+
|
148
|
+
<!--If you uncomment this then the annotation text links will become commented out by html tags-->
|
149
|
+
<!--
|
150
|
+
<vanish><begin><!--</begin><end>--></end></vanish>
|
151
|
+
-->
|
152
|
+
|
153
|
+
<!--
|
154
|
+
text that has been newly typed since the last time revision marks have been accepted
|
155
|
+
and will be displayed with underline when revision marked text is to be displayed
|
156
|
+
|
157
|
+
use either this line...
|
158
|
+
-->
|
159
|
+
<RMark><begin></begin><end></end></RMark>
|
160
|
+
|
161
|
+
<!--
|
162
|
+
or uncomment below to make the underline disappear
|
163
|
+
-->
|
164
|
+
<!--
|
165
|
+
<RMark><begin></begin><end></end></RMark>
|
166
|
+
-->
|
167
|
+
|
168
|
+
|
169
|
+
<strike><begin></begin><end></end></strike>
|
170
|
+
<shadow><begin></begin><end></end></shadow>
|
171
|
+
<lowercase><begin></begin><end></end></lowercase>
|
172
|
+
<emboss><begin></begin><end></end></emboss>
|
173
|
+
<imprint><begin></begin><end></end></imprint>
|
174
|
+
<!--double strike-->
|
175
|
+
<dstrike><begin></begin><end></end></dstrike>
|
176
|
+
|
177
|
+
<!--
|
178
|
+
ftc's
|
179
|
+
&
|
180
|
+
hps
|
181
|
+
|
182
|
+
keep them for font face and do that later.
|
183
|
+
-->
|
184
|
+
|
185
|
+
<super><begin></begin><end></end></super>
|
186
|
+
<sub><begin></begin><end></end></sub>
|
187
|
+
|
188
|
+
<singleu><begin></begin><end></end></singleu>
|
189
|
+
<wordu><begin></begin><end></end></wordu>
|
190
|
+
<doubleu><begin></begin><end></end></doubleu>
|
191
|
+
<dottedu><begin></begin><end></end></dottedu>
|
192
|
+
<hiddenu><begin></begin><end></end></hiddenu>
|
193
|
+
<thicku><begin></begin><end></end></thicku>
|
194
|
+
<dashu><begin></begin><end></end></dashu>
|
195
|
+
<dotu><begin></begin><end></end></dotu>
|
196
|
+
<dotdashu><begin></begin><end></end></dotdashu>
|
197
|
+
<dotdotdashu><begin></begin><end></end></dotdotdashu>
|
198
|
+
<waveu><begin></begin><end></end></waveu>
|
199
|
+
|
200
|
+
<!--
|
201
|
+
text whose properties have been changed since the last time revision marks have been accepted
|
202
|
+
and will be displayed with a note showing the change points.
|
203
|
+
|
204
|
+
use either this line (which is admittedly a bit scary looking, but harmless)...
|
205
|
+
-->
|
206
|
+
<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
|
207
|
+
|
208
|
+
<!--
|
209
|
+
or uncomment below to make the notes disappear
|
210
|
+
-->
|
211
|
+
<!--
|
212
|
+
<PropRMark><begin></begin><end></end></PropRMark>
|
213
|
+
-->
|
214
|
+
|
215
|
+
<!--
|
216
|
+
<color>
|
217
|
+
-->
|
218
|
+
<Black><begin></begin><end></end></Black>
|
219
|
+
<Blue><begin></begin><end></end></Blue>
|
220
|
+
<Cyan><begin></begin><end></end></Cyan>
|
221
|
+
<Green><begin></begin><end></end></Green>
|
222
|
+
<Magenta><begin></begin><end></end></Magenta>
|
223
|
+
<Red><begin></begin><end></end></Red>
|
224
|
+
<Yellow><begin></begin><end></end></Yellow>
|
225
|
+
<White><begin></begin><end></end></White>
|
226
|
+
<DkBlue><begin></begin><end></end></DkBlue>
|
227
|
+
<DkCyan><begin></begin><end></end></DkCyan>
|
228
|
+
<DkGreen><begin></begin><end></end></DkGreen>
|
229
|
+
<DkMagenta><begin></begin><end></end></DkMagenta>
|
230
|
+
<DkRed><begin></begin><end></end></DkRed>
|
231
|
+
<DkYellow><begin></begin><end></end></DkYellow>
|
232
|
+
<DkGray><begin></begin><end></end></DkGray>
|
233
|
+
<LtGray><begin></begin><end></end></LtGray>
|
234
|
+
<!--
|
235
|
+
</color>
|
236
|
+
-->
|
237
|
+
|
238
|
+
<!--
|
239
|
+
<animation>
|
240
|
+
-->
|
241
|
+
<LasVegas><begin></begin><end></end></LasVegas>
|
242
|
+
<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
|
243
|
+
<SparkleText><begin></begin><end></end></SparkleText>
|
244
|
+
<MarchingAnts><begin></begin><end></end></MarchingAnts>
|
245
|
+
<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
|
246
|
+
<Shimmer><begin></begin><end></end></Shimmer>
|
247
|
+
<!--
|
248
|
+
</animation>
|
249
|
+
-->
|
250
|
+
|
251
|
+
<!--
|
252
|
+
I don't understand what this one is, and I've never come across it
|
253
|
+
|
254
|
+
use this sample line (which is admittedly a bit scary looking, but harmless)...
|
255
|
+
-->
|
256
|
+
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
257
|
+
|
258
|
+
<!--
|
259
|
+
or uncomment below to ignore it, the previous might even crash wv ?
|
260
|
+
-->
|
261
|
+
<!--
|
262
|
+
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
263
|
+
-->
|
264
|
+
|
265
|
+
<animation>
|
266
|
+
<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
|
267
|
+
<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
|
268
|
+
</animation>
|
269
|
+
|
270
|
+
<fontstr>
|
271
|
+
<begin></begin>
|
272
|
+
<end></end>
|
273
|
+
</fontstr>
|
274
|
+
|
275
|
+
<comment>
|
276
|
+
<begin>
|
277
|
+
</begin>
|
278
|
+
<end>
|
279
|
+
</end>
|
280
|
+
</comment>
|
281
|
+
|
282
|
+
<style name="Normal">
|
283
|
+
<character>
|
284
|
+
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
285
|
+
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
286
|
+
</character>
|
287
|
+
|
288
|
+
<!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
|
289
|
+
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
290
|
+
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
291
|
+
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
292
|
+
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
293
|
+
-->
|
294
|
+
|
295
|
+
|
296
|
+
<pmargin>
|
297
|
+
<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
|
298
|
+
</pmargin>
|
299
|
+
|
300
|
+
<pborder>
|
301
|
+
<begin>
|
302
|
+
<!--
|
303
|
+
border: thin <borderleftstyle/> <borderleftcolor/>;
|
304
|
+
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
305
|
+
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
306
|
+
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
307
|
+
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
308
|
+
-->
|
309
|
+
</begin>
|
310
|
+
</pborder>
|
311
|
+
|
312
|
+
<picture>
|
313
|
+
<begin>
|
314
|
+
</begin>
|
315
|
+
<!-- images are lacking for now -->
|
316
|
+
|
317
|
+
</picture>
|
318
|
+
|
319
|
+
</style>
|
320
|
+
|
321
|
+
<!--we need to be able to override the character properties-->
|
322
|
+
<!--
|
323
|
+
<style name="Normal">
|
324
|
+
<character>
|
325
|
+
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
326
|
+
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
327
|
+
</character>
|
328
|
+
|
329
|
+
<text>
|
330
|
+
<begin></begin>
|
331
|
+
<end>
|
332
|
+
</end>
|
333
|
+
</text>
|
334
|
+
|
335
|
+
</style>
|
336
|
+
|
337
|
+
<style name="Heading 1">
|
338
|
+
|
339
|
+
<character>
|
340
|
+
<begin></begin>
|
341
|
+
<end></end>
|
342
|
+
</character>
|
343
|
+
|
344
|
+
<text>
|
345
|
+
<begin></begin>
|
346
|
+
<end>
|
347
|
+
</end>
|
348
|
+
</text>
|
349
|
+
|
350
|
+
|
351
|
+
|
352
|
+
</style>
|
353
|
+
-->
|
354
|
+
|
355
|
+
</main>
|
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: textractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Michael Guterl
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-04-20 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 2
|
30
|
+
- 9
|
31
|
+
version: 1.2.9
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: simple wrapper for extracting text from PDF and Word documents
|
35
|
+
email: mguterl@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- LICENSE
|
42
|
+
- README.md
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.md
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- lib/textractor.rb
|
51
|
+
- lib/textractor/document.rb
|
52
|
+
- spec/document_spec.rb
|
53
|
+
- spec/fixtures/document.doc
|
54
|
+
- spec/fixtures/document.pdf
|
55
|
+
- spec/spec.opts
|
56
|
+
- spec/spec_helper.rb
|
57
|
+
- spec/textractor_spec.rb
|
58
|
+
- support/wvText.xml
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: http://github.com/mguterl/textractor
|
61
|
+
licenses: []
|
62
|
+
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options:
|
65
|
+
- --charset=UTF-8
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
segments:
|
80
|
+
- 0
|
81
|
+
version: "0"
|
82
|
+
requirements: []
|
83
|
+
|
84
|
+
rubyforge_project:
|
85
|
+
rubygems_version: 1.3.6
|
86
|
+
signing_key:
|
87
|
+
specification_version: 3
|
88
|
+
summary: simple wrapper for extracting text from PDF and Word documents
|
89
|
+
test_files:
|
90
|
+
- spec/document_spec.rb
|
91
|
+
- spec/spec_helper.rb
|
92
|
+
- spec/textractor_spec.rb
|