textractor 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +54 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/textractor.rb +18 -0
- data/lib/textractor/document.rb +51 -0
- data/spec/document_spec.rb +69 -0
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/textractor_spec.rb +32 -0
- data/support/wvText.xml +355 -0
- metadata +92 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Michael Guterl
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# textractor
|
2
|
+
|
3
|
+
textractor is a ruby library that provides a simple wrapper for
|
4
|
+
extracting text from PDF and Word documents.
|
5
|
+
|
6
|
+
## Setup
|
7
|
+
|
8
|
+
In order to use textractor you have to install a few command line
|
9
|
+
tools.
|
10
|
+
|
11
|
+
### OS X
|
12
|
+
|
13
|
+
port install wv pdftohtml links
|
14
|
+
|
15
|
+
I also recommend passing +no_x11 to the install command, but
|
16
|
+
this may not work on all systems due to dependency issues.
|
17
|
+
|
18
|
+
port install wv pdftohtml links +no_x11
|
19
|
+
|
20
|
+
### Ubuntu 8.04
|
21
|
+
|
22
|
+
apt-get install wv xpdf-utils links
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
Due to textractor's reliance on command line tools all the methods in
|
27
|
+
textractor work on paths not File objects.
|
28
|
+
|
29
|
+
document = Textractor::Document.new(path_to_document)
|
30
|
+
document.text # => "Ruby on rails developer"
|
31
|
+
|
32
|
+
There is also a convenience method on Textractor.
|
33
|
+
|
34
|
+
Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
|
35
|
+
|
36
|
+
Textractor will attempt to guess what type of document you're trying
|
37
|
+
to extract text from. However, if you know the content type of your
|
38
|
+
document, you can provide it and Textractor won't guess.
|
39
|
+
|
40
|
+
Textractor.text_from_file(path_to_document, :content_type => "application/doc")
|
41
|
+
|
42
|
+
## Note on Patches/Pull Requests
|
43
|
+
|
44
|
+
* Fork the project.
|
45
|
+
* Make your feature addition or bug fix.
|
46
|
+
* Add tests for it. This is important so I don't break it in a
|
47
|
+
future version unintentionally.
|
48
|
+
* Commit, do not mess with rakefile, version, or history.
|
49
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
50
|
+
* Send me a pull request. Bonus points for topic branches.
|
51
|
+
|
52
|
+
## Copyright
|
53
|
+
|
54
|
+
Copyright (c) 2010 Michael Guterl. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "textractor"
|
8
|
+
gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
|
9
|
+
gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
|
10
|
+
gem.email = "mguterl@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/mguterl/textractor"
|
12
|
+
gem.authors = ["Michael Guterl"]
|
13
|
+
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
+
end
|
16
|
+
Jeweler::GemcutterTasks.new
|
17
|
+
rescue LoadError
|
18
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
19
|
+
end
|
20
|
+
|
21
|
+
require 'spec/rake/spectask'
|
22
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
23
|
+
spec.libs << 'lib' << 'spec'
|
24
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
25
|
+
end
|
26
|
+
|
27
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
28
|
+
spec.libs << 'lib' << 'spec'
|
29
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
30
|
+
spec.rcov = true
|
31
|
+
end
|
32
|
+
|
33
|
+
task :spec => :check_dependencies
|
34
|
+
|
35
|
+
task :default => :spec
|
36
|
+
|
37
|
+
require 'rake/rdoctask'
|
38
|
+
Rake::RDocTask.new do |rdoc|
|
39
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
40
|
+
|
41
|
+
rdoc.rdoc_dir = 'rdoc'
|
42
|
+
rdoc.title = "textractor #{version}"
|
43
|
+
rdoc.rdoc_files.include('README*')
|
44
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
45
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/lib/textractor.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Top-level namespace for the textractor library: exposes a one-call text
# extraction helper and the configurable location of the wvWare config file.
module Textractor
  # Document is loaded lazily the first time the constant is referenced.
  autoload :Document, "textractor/document"

  # Absolute path of the wvText.xml file found in the support directory that
  # sits next to the directory containing this file.
  DEFAULT_WV_TEXT_PATH = File.expand_path("../support/wvText.xml", File.dirname(__FILE__))

  class << self
    # Overrides the wvText.xml path; assigning nil (or false) restores the
    # default because the reader falls back on any falsy value.
    attr_writer :wvText_path

    # Returns the configured wvText.xml path, or DEFAULT_WV_TEXT_PATH when no
    # override has been assigned.
    def wvText_path
      @wvText_path || DEFAULT_WV_TEXT_PATH
    end

    # Builds a Textractor::Document for +filename+ (forwarding +options+
    # untouched) and returns the result of its #text method.
    def text_from_file(filename, options = {})
      document = Textractor::Document.new(filename, options)
      document.text
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'shellwords'

module Textractor

  # A document on disk whose plain text is obtained by shelling out to an
  # external command line tool (pdftotext for PDFs, wvWare for Word files).
  class Document

    # Maps the content types a caller may pass via :content_type to the
    # internal document type used to pick an extraction method.
    CONTENT_TYPE_CONVERSIONS = {
      'application/pdf'   => :pdf,
      'application/x-pdf' => :pdf,
      'application/doc'   => :word,
      'application/x-doc' => :word,
    }.freeze

    # Absolute path of the document (expanded in #initialize).
    attr_reader :filename

    # filename - path to the document; stored expanded to an absolute path.
    # options  - Hash; :content_type (optional) is a key of
    #            CONTENT_TYPE_CONVERSIONS and disables extension sniffing.
    def initialize(filename, options = {})
      @filename     = File.expand_path(filename)
      @content_type = options[:content_type]
    end

    # Returns the extracted text with surrounding whitespace stripped.
    #
    # Dispatches to the private extract_from_<type> method. When #type is nil
    # no such method exists, so the call raises NoMethodError.
    def text
      send("extract_from_#{type}")
    end

    # Returns :pdf, :word or nil.
    #
    # An explicit content type always wins (unknown content types yield nil).
    # Otherwise the file extension is matched case-insensitively, so
    # "REPORT.PDF" is recognised just like "report.pdf". The match is a
    # substring match on the extension, so e.g. ".docx" is classed as :word.
    def type
      return CONTENT_TYPE_CONVERSIONS[content_type] if content_type

      case File.extname(@filename)
      when /pdf/i
        :pdf
      when /doc/i
        :word
      else
        nil
      end
    end

    private

    def content_type
      @content_type
    end

    # Runs pdftotext and returns its stdout, stripped.
    def extract_from_pdf
      `#{pdf_command}`.strip
    end

    # Runs wvWare and returns its stdout, stripped.
    def extract_from_word
      `#{word_command}`.strip
    end

    # Shell command that writes the PDF's text to stdout ("-") and discards
    # stderr. The filename is shell-escaped so spaces and shell
    # metacharacters in the path cannot split or alter the command.
    def pdf_command
      "pdftotext #{Shellwords.escape(filename)} - 2>/dev/null"
    end

    # Shell command that converts the Word file using the configured
    # wvText.xml and discards stderr. Both paths are shell-escaped.
    def word_command
      config = Shellwords.escape(Textractor.wvText_path)
      "wvWare -c utf-8 --nographics -x #{config} #{Shellwords.escape(filename)} 2>/dev/null"
    end

  end

end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
describe Textractor::Document do
|
4
|
+
|
5
|
+
PDF_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
|
6
|
+
WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")
|
7
|
+
|
8
|
+
it 'should require a filename to create' do
|
9
|
+
expect { Textractor::Document.new }.to raise_error(ArgumentError)
|
10
|
+
Textractor::Document.new('filename').filename.should == File.expand_path('filename')
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#text" do
|
14
|
+
|
15
|
+
describe "with pdf document" do
|
16
|
+
|
17
|
+
it 'should extract the text from the document' do
|
18
|
+
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
19
|
+
@doc.text.should == "Ruby on rails developer"
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
describe "with word document" do
|
25
|
+
|
26
|
+
it 'should extract the text from the document' do
|
27
|
+
@doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
|
28
|
+
@doc.text.should == "Ruby on rails developer"
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
describe "#type" do
|
36
|
+
|
37
|
+
describe "with no content type provided" do
|
38
|
+
it 'should return :pdf for PDF documents' do
|
39
|
+
@doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
|
40
|
+
@doc.type.should == :pdf
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should return :word for Word documents' do
|
44
|
+
@doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
|
45
|
+
@doc.type.should == :word
|
46
|
+
end
|
47
|
+
|
48
|
+
it 'should return nil for unknown documents' do
|
49
|
+
@doc = Textractor::Document.new("foo.bar")
|
50
|
+
@doc.type.should == nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
describe "with a content type provided" do
|
55
|
+
|
56
|
+
it 'should ignore the extension of the file' do
|
57
|
+
[PDF_DOCUMENT_FIXTURE, WORD_DOCUMENT_FIXTURE].each do |filename|
|
58
|
+
Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
|
59
|
+
@doc = Textractor::Document.new(filename, :content_type => content_type)
|
60
|
+
@doc.type.should == type
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
Binary file
|
Binary file
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'spec/spec_helper'
|
2
|
+
|
3
|
+
describe Textractor do
|
4
|
+
|
5
|
+
describe ".wvText_path" do
|
6
|
+
|
7
|
+
it 'should default to the file provided with the gem' do
|
8
|
+
Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'should use the new wvText_path if provided' do
|
12
|
+
Textractor.wvText_path = "foo.bar"
|
13
|
+
Textractor.wvText_path.should == "foo.bar"
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
describe ".text_from_file" do
|
19
|
+
|
20
|
+
it 'should return the extracted text from the file' do
|
21
|
+
document_path = 'word.doc'
|
22
|
+
document = mock("Textractor::Document", :text => "Ruby on Rails developer")
|
23
|
+
Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
|
24
|
+
Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
after(:all) do
|
30
|
+
Textractor.instance_variable_set(:"@wvText_path", nil)
|
31
|
+
end
|
32
|
+
end
|
data/support/wvText.xml
ADDED
@@ -0,0 +1,355 @@
|
|
1
|
+
<main>
|
2
|
+
<charentity>
|
3
|
+
<begin>ABW</begin>
|
4
|
+
</charentity>
|
5
|
+
|
6
|
+
<document>
|
7
|
+
<begin>
|
8
|
+
</begin>
|
9
|
+
<end>
|
10
|
+
</end>
|
11
|
+
</document>
|
12
|
+
|
13
|
+
<section>
|
14
|
+
<begin>
|
15
|
+
</begin>
|
16
|
+
<end>
|
17
|
+
</end>
|
18
|
+
</section>
|
19
|
+
|
20
|
+
<justification>
|
21
|
+
<left></left>
|
22
|
+
<right></right>
|
23
|
+
<center></center>
|
24
|
+
<block></block>
|
25
|
+
<asian></asian>
|
26
|
+
</justification>
|
27
|
+
|
28
|
+
<numbering>
|
29
|
+
<Arabic>type="1"</Arabic>
|
30
|
+
<UpperRoman>type="I"</UpperRoman>
|
31
|
+
<LowerRoman>type="i"</LowerRoman>
|
32
|
+
<UpperCaseN>type="A"</UpperCaseN>
|
33
|
+
<LowerCaseN>type="a"</LowerCaseN>
|
34
|
+
</numbering>
|
35
|
+
|
36
|
+
<border>
|
37
|
+
<noned></noned>
|
38
|
+
<singled></singled>
|
39
|
+
<thickd></thickd>
|
40
|
+
<doubled></doubled>
|
41
|
+
<number4d></number4d>
|
42
|
+
<hairlined></hairlined>
|
43
|
+
<dotd></dotd>
|
44
|
+
<dashlargegapd></dashlargegapd>
|
45
|
+
<dotdashd></dotdashd>
|
46
|
+
<dotdotdashd></dotdotdashd>
|
47
|
+
<tripled></tripled>
|
48
|
+
<thin-thicksmallgapd></thin-thicksmallgapd>
|
49
|
+
<thick-thinsmallgapd></thick-thinsmallgapd>
|
50
|
+
<thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
|
51
|
+
<thin-thickmediumgapd></thin-thickmediumgapd>
|
52
|
+
<thick-thinmediumgapd></thick-thinmediumgapd>
|
53
|
+
<thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
|
54
|
+
<thin-thicklargegapd></thin-thicklargegapd>
|
55
|
+
<thick-thinlargegapd></thick-thinlargegapd>
|
56
|
+
<thin-thick-thinlargegapd></thin-thick-thinlargegapd>
|
57
|
+
<waved></waved>
|
58
|
+
<doublewaved></doublewaved>
|
59
|
+
<dashsmallgapd></dashsmallgapd>
|
60
|
+
<dashdotstrokedd></dashdotstrokedd>
|
61
|
+
<emboss3Dd></emboss3Dd>
|
62
|
+
<engrave3Dd></engrave3Dd>
|
63
|
+
<defaultd></defaultd>
|
64
|
+
</border>
|
65
|
+
|
66
|
+
<olist>
|
67
|
+
<begin></begin>
|
68
|
+
<end></end>
|
69
|
+
</olist>
|
70
|
+
|
71
|
+
<ulist>
|
72
|
+
<begin></begin>
|
73
|
+
<end></end>
|
74
|
+
</ulist>
|
75
|
+
|
76
|
+
<entry>
|
77
|
+
<begin></begin>
|
78
|
+
<end></end>
|
79
|
+
</entry>
|
80
|
+
|
81
|
+
<!-- the only thing of significance -->
|
82
|
+
<text>
|
83
|
+
<begin></begin>
|
84
|
+
<end>
|
85
|
+
</end>
|
86
|
+
</text>
|
87
|
+
|
88
|
+
<!--
|
89
|
+
this tableoverride option can be used to turn off handling of
|
90
|
+
these tags in tables, which I find is necessary for at least netscape
|
91
|
+
-->
|
92
|
+
<tableoverrides>
|
93
|
+
<ParaBefore>0</ParaBefore>
|
94
|
+
<ParaRight>0</ParaRight>
|
95
|
+
<ParaAfter>0</ParaAfter>
|
96
|
+
<ParaLeft>0</ParaLeft>
|
97
|
+
<ParaLeft1>0</ParaLeft1>
|
98
|
+
<VertMergedCells>0</VertMergedCells>
|
99
|
+
</tableoverrides>
|
100
|
+
|
101
|
+
<table>
|
102
|
+
<begin></begin>
|
103
|
+
<end></end>
|
104
|
+
</table>
|
105
|
+
|
106
|
+
<row>
|
107
|
+
<begin></begin>
|
108
|
+
<end></end>
|
109
|
+
</row>
|
110
|
+
|
111
|
+
<cell>
|
112
|
+
<begin></begin>
|
113
|
+
<end></end>
|
114
|
+
</cell>
|
115
|
+
|
116
|
+
<paragraph>
|
117
|
+
<begin><text.begin/></begin>
|
118
|
+
<end><text.end/></end>
|
119
|
+
</paragraph>
|
120
|
+
|
121
|
+
<!-- these are all the character properties that can show up in word -->
|
122
|
+
<bold><begin></begin><end></end></bold>
|
123
|
+
<italic><begin></begin><end></end></italic>
|
124
|
+
|
125
|
+
<!--
|
126
|
+
text that has been deleted and will be displayed with strikethrough when
|
127
|
+
revision marked text is to be displayed
|
128
|
+
|
129
|
+
use either this line...
|
130
|
+
-->
|
131
|
+
<RMarkDel><begin></begin>
|
132
|
+
<end></end>
|
133
|
+
</RMarkDel>
|
134
|
+
|
135
|
+
<!--
|
136
|
+
or uncomment below to make deleted text disappear (well, become commented out)
|
137
|
+
-->
|
138
|
+
<!--
|
139
|
+
<RMarkDel><begin><!--</begin><end>--></end></RMarkDel>
|
140
|
+
-->
|
141
|
+
|
142
|
+
<!-- I don't even know what outline means -->
|
143
|
+
<outline><begin></begin><end></end></outline>
|
144
|
+
<smallcaps><begin></begin><end></end></smallcaps>
|
145
|
+
<caps><begin></begin><end></end></caps>
|
146
|
+
<vanish><begin></begin><end></end></vanish>
|
147
|
+
|
148
|
+
<!--If you uncomment this then the annotation text links will become commented out by html tags-->
|
149
|
+
<!--
|
150
|
+
<vanish><begin><!--</begin><end>--></end></vanish>
|
151
|
+
-->
|
152
|
+
|
153
|
+
<!--
|
154
|
+
text that has been newly typed since the last time revision marks have been accepted
|
155
|
+
and will be displayed with underline when revision marked text is to be displayed
|
156
|
+
|
157
|
+
use either this line...
|
158
|
+
-->
|
159
|
+
<RMark><begin></begin><end></end></RMark>
|
160
|
+
|
161
|
+
<!--
|
162
|
+
or uncomment below to make the underline disappear
|
163
|
+
-->
|
164
|
+
<!--
|
165
|
+
<RMark><begin></begin><end></end></RMark>
|
166
|
+
-->
|
167
|
+
|
168
|
+
|
169
|
+
<strike><begin></begin><end></end></strike>
|
170
|
+
<shadow><begin></begin><end></end></shadow>
|
171
|
+
<lowercase><begin></begin><end></end></lowercase>
|
172
|
+
<emboss><begin></begin><end></end></emboss>
|
173
|
+
<imprint><begin></begin><end></end></imprint>
|
174
|
+
<!--double strike-->
|
175
|
+
<dstrike><begin></begin><end></end></dstrike>
|
176
|
+
|
177
|
+
<!--
|
178
|
+
ftc's
|
179
|
+
&
|
180
|
+
hps
|
181
|
+
|
182
|
+
keep them for font face and do that later.
|
183
|
+
-->
|
184
|
+
|
185
|
+
<super><begin></begin><end></end></super>
|
186
|
+
<sub><begin></begin><end></end></sub>
|
187
|
+
|
188
|
+
<singleu><begin></begin><end></end></singleu>
|
189
|
+
<wordu><begin></begin><end></end></wordu>
|
190
|
+
<doubleu><begin></begin><end></end></doubleu>
|
191
|
+
<dottedu><begin></begin><end></end></dottedu>
|
192
|
+
<hiddenu><begin></begin><end></end></hiddenu>
|
193
|
+
<thicku><begin></begin><end></end></thicku>
|
194
|
+
<dashu><begin></begin><end></end></dashu>
|
195
|
+
<dotu><begin></begin><end></end></dotu>
|
196
|
+
<dotdashu><begin></begin><end></end></dotdashu>
|
197
|
+
<dotdotdashu><begin></begin><end></end></dotdotdashu>
|
198
|
+
<waveu><begin></begin><end></end></waveu>
|
199
|
+
|
200
|
+
<!--
|
201
|
+
text whose properties have been changed since the last time revision marks have been accepted
|
202
|
+
and will be displayed with a note showing the change points.
|
203
|
+
|
204
|
+
use either this line (which, admittedly, is a bit scary looking, but harmless)...
|
205
|
+
-->
|
206
|
+
<PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
|
207
|
+
|
208
|
+
<!--
|
209
|
+
or uncomment below to make the notes disappear
|
210
|
+
-->
|
211
|
+
<!--
|
212
|
+
<PropRMark><begin></begin><end></end></PropRMark>
|
213
|
+
-->
|
214
|
+
|
215
|
+
<!--
|
216
|
+
<color>
|
217
|
+
-->
|
218
|
+
<Black><begin></begin><end></end></Black>
|
219
|
+
<Blue><begin></begin><end></end></Blue>
|
220
|
+
<Cyan><begin></begin><end></end></Cyan>
|
221
|
+
<Green><begin></begin><end></end></Green>
|
222
|
+
<Magenta><begin></begin><end></end></Magenta>
|
223
|
+
<Red><begin></begin><end></end></Red>
|
224
|
+
<Yellow><begin></begin><end></end></Yellow>
|
225
|
+
<White><begin></begin><end></end></White>
|
226
|
+
<DkBlue><begin></begin><end></end></DkBlue>
|
227
|
+
<DkCyan><begin></begin><end></end></DkCyan>
|
228
|
+
<DkGreen><begin></begin><end></end></DkGreen>
|
229
|
+
<DkMagenta><begin></begin><end></end></DkMagenta>
|
230
|
+
<DkRed><begin></begin><end></end></DkRed>
|
231
|
+
<DkYellow><begin></begin><end></end></DkYellow>
|
232
|
+
<DkGray><begin></begin><end></end></DkGray>
|
233
|
+
<LtGray><begin></begin><end></end></LtGray>
|
234
|
+
<!--
|
235
|
+
</color>
|
236
|
+
-->
|
237
|
+
|
238
|
+
<!--
|
239
|
+
<animation>
|
240
|
+
-->
|
241
|
+
<LasVegas><begin></begin><end></end></LasVegas>
|
242
|
+
<BackgroundBlink><begin></begin><end></end></BackgroundBlink>
|
243
|
+
<SparkleText><begin></begin><end></end></SparkleText>
|
244
|
+
<MarchingAnts><begin></begin><end></end></MarchingAnts>
|
245
|
+
<MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
|
246
|
+
<Shimmer><begin></begin><end></end></Shimmer>
|
247
|
+
<!--
|
248
|
+
</animation>
|
249
|
+
-->
|
250
|
+
|
251
|
+
<!--
|
252
|
+
I don't understand what this one is, and I've never come across it
|
253
|
+
|
254
|
+
use this sample line (which, admittedly, is a bit scary looking, but harmless)...
|
255
|
+
-->
|
256
|
+
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
257
|
+
|
258
|
+
<!--
|
259
|
+
or uncomment below to ignore it, the previous might even crash wv ?
|
260
|
+
-->
|
261
|
+
<!--
|
262
|
+
<DispFldRMark><begin></begin><end></end></DispFldRMark>
|
263
|
+
-->
|
264
|
+
|
265
|
+
<animation>
|
266
|
+
<begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
|
267
|
+
<end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
|
268
|
+
</animation>
|
269
|
+
|
270
|
+
<fontstr>
|
271
|
+
<begin></begin>
|
272
|
+
<end></end>
|
273
|
+
</fontstr>
|
274
|
+
|
275
|
+
<comment>
|
276
|
+
<begin>
|
277
|
+
</begin>
|
278
|
+
<end>
|
279
|
+
</end>
|
280
|
+
</comment>
|
281
|
+
|
282
|
+
<style name="Normal">
|
283
|
+
<character>
|
284
|
+
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
285
|
+
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
286
|
+
</character>
|
287
|
+
|
288
|
+
<!-- Netscape does not handle this correctly yet, here is how each different side of the border should work.
|
289
|
+
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
290
|
+
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
291
|
+
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
292
|
+
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
293
|
+
-->
|
294
|
+
|
295
|
+
|
296
|
+
<pmargin>
|
297
|
+
<begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
|
298
|
+
</pmargin>
|
299
|
+
|
300
|
+
<pborder>
|
301
|
+
<begin>
|
302
|
+
<!--
|
303
|
+
border: thin <borderleftstyle/> <borderleftcolor/>;
|
304
|
+
border-top: thin <bordertopstyle/> <bordertopcolor/>;
|
305
|
+
border-left: thin <borderleftstyle/> <borderleftcolor/>;
|
306
|
+
border-right: thin <borderrightstyle/> <borderrightcolor/>;
|
307
|
+
border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
|
308
|
+
-->
|
309
|
+
</begin>
|
310
|
+
</pborder>
|
311
|
+
|
312
|
+
<picture>
|
313
|
+
<begin>
|
314
|
+
</begin>
|
315
|
+
<!-- images are lacking for now -->
|
316
|
+
|
317
|
+
</picture>
|
318
|
+
|
319
|
+
</style>
|
320
|
+
|
321
|
+
<!--we need to be able to override the character properties-->
|
322
|
+
<!--
|
323
|
+
<style name="Normal">
|
324
|
+
<character>
|
325
|
+
<begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
|
326
|
+
<end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
|
327
|
+
</character>
|
328
|
+
|
329
|
+
<text>
|
330
|
+
<begin></begin>
|
331
|
+
<end>
|
332
|
+
</end>
|
333
|
+
</text>
|
334
|
+
|
335
|
+
</style>
|
336
|
+
|
337
|
+
<style name="Heading 1">
|
338
|
+
|
339
|
+
<character>
|
340
|
+
<begin></begin>
|
341
|
+
<end></end>
|
342
|
+
</character>
|
343
|
+
|
344
|
+
<text>
|
345
|
+
<begin></begin>
|
346
|
+
<end>
|
347
|
+
</end>
|
348
|
+
</text>
|
349
|
+
|
350
|
+
|
351
|
+
|
352
|
+
</style>
|
353
|
+
-->
|
354
|
+
|
355
|
+
</main>
|
metadata
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: textractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Michael Guterl
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-04-20 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 2
|
30
|
+
- 9
|
31
|
+
version: 1.2.9
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: simple wrapper for extracting text from PDF and Word documents
|
35
|
+
email: mguterl@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- LICENSE
|
42
|
+
- README.md
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.md
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- lib/textractor.rb
|
51
|
+
- lib/textractor/document.rb
|
52
|
+
- spec/document_spec.rb
|
53
|
+
- spec/fixtures/document.doc
|
54
|
+
- spec/fixtures/document.pdf
|
55
|
+
- spec/spec.opts
|
56
|
+
- spec/spec_helper.rb
|
57
|
+
- spec/textractor_spec.rb
|
58
|
+
- support/wvText.xml
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: http://github.com/mguterl/textractor
|
61
|
+
licenses: []
|
62
|
+
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options:
|
65
|
+
- --charset=UTF-8
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
segments:
|
80
|
+
- 0
|
81
|
+
version: "0"
|
82
|
+
requirements: []
|
83
|
+
|
84
|
+
rubyforge_project:
|
85
|
+
rubygems_version: 1.3.6
|
86
|
+
signing_key:
|
87
|
+
specification_version: 3
|
88
|
+
summary: simple wrapper for extracting text from PDF and Word documents
|
89
|
+
test_files:
|
90
|
+
- spec/document_spec.rb
|
91
|
+
- spec/spec_helper.rb
|
92
|
+
- spec/textractor_spec.rb
|