textractor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Michael Guterl
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,54 @@
1
+ # textractor
2
+
3
+ textractor is a ruby library that provides a simple wrapper for
4
+ extracting text from PDF and Word documents.
5
+
6
+ ## Setup
7
+
8
+ In order to use textractor you have to install a few command line
9
+ tools.
10
+
11
+ ### OS X
12
+
13
+ port install wv pdftohtml links
14
+
15
+ I also recommend passing +no_x11 to the install command, but
16
+ this may not work on all systems due to dependency issues.
17
+
18
+ port install wv pdftohtml links +no_x11
19
+
20
+ ### Ubuntu 8.04
21
+
22
+ apt-get install wv xpdf-utils links
23
+
24
+ ## Usage
25
+
26
+ Due to textractor's reliance on command line tools all the methods in
27
+ textractor work on paths not File objects.
28
+
29
+ document = Textractor::Document.new(path_to_document)
30
+ document.text # => "Ruby on rails developer"
31
+
32
+ There is also a convenience method on Textractor.
33
+
34
+ Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
35
+
36
+ Textractor will attempt to guess what type of document you're trying
37
+ to extract text from. However, if you know the content type of your
38
+ document, you can provide it and Textractor won't guess.
39
+
40
+ Textractor.text_from_file(path_to_document, :content_type => "application/doc")
41
+
42
+ ## Note on Patches/Pull Requests
43
+
44
+ * Fork the project.
45
+ * Make your feature addition or bug fix.
46
+ * Add tests for it. This is important so I don't break it in a
47
+ future version unintentionally.
48
+ * Commit, do not mess with rakefile, version, or history.
49
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
50
+ * Send me a pull request. Bonus points for topic branches.
51
+
52
+ ## Copyright
53
+
54
+ Copyright (c) 2010 Michael Guterl. See LICENSE for details.
@@ -0,0 +1,45 @@
# Rakefile for the textractor gem: gem packaging (Jeweler), specs and coverage
# (RSpec 1.x rake tasks) and RDoc generation.
require 'rubygems'
require 'rake'

# Packaging tasks. Jeweler is treated as optional: when it cannot be loaded a
# hint is printed and the rest of the Rakefile is still evaluated.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "textractor"
    gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
    gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
    gem.email = "mguterl@gmail.com"
    gem.homepage = "http://github.com/mguterl/textractor"
    gem.authors = ["Michael Guterl"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake spec` runs every *_spec.rb file under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined in this file; it is expected to come from
# the Jeweler tasks above. Only hook it in when it actually exists, otherwise
# `rake spec` (and the default task) would abort with an unknown-task error
# whenever Jeweler is not installed, defeating the LoadError rescue above.
if Rake::Task.task_defined?('check_dependencies')
  task :spec => :check_dependencies
end

task :default => :spec

# `rake rdoc` builds the API documentation into ./rdoc.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in
  # the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "textractor #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,18 @@
# Namespace and convenience entry points for the textractor library.
module Textractor
  # Document is loaded lazily the first time the constant is referenced.
  autoload :Document, "textractor/document"

  # wvWare configuration file used when no custom path has been set.
  DEFAULT_WV_TEXT_PATH = File.expand_path(File.join(File.dirname(__FILE__), "..", "support", "wvText.xml"))

  class << self
    # Overrides the wvWare configuration file; assigning nil restores the default.
    attr_writer :wvText_path

    # Path of the wvWare configuration file: the custom one when set,
    # DEFAULT_WV_TEXT_PATH otherwise.
    def wvText_path
      @wvText_path || DEFAULT_WV_TEXT_PATH
    end

    # Builds a Textractor::Document for +filename+ (passing +options+ through
    # unchanged) and returns its extracted text.
    def text_from_file(filename, options = {})
      document = Textractor::Document.new(filename, options)
      document.text
    end
  end
end
@@ -0,0 +1,51 @@
require 'shellwords'

module Textractor

  # A document on disk whose plain text is extracted by shelling out to
  # command line tools: pdftotext for PDF files and wvWare for Word files.
  class Document

    # Raised by #text when neither the supplied content type nor the file
    # extension maps to a supported document type.
    class UnknownTypeError < StandardError; end

    # Maps supported MIME content types to the internal document type.
    CONTENT_TYPE_CONVERSIONS = {
      'application/pdf'    => :pdf,
      'application/x-pdf'  => :pdf,
      'application/doc'    => :word,
      'application/x-doc'  => :word,
      'application/msword' => :word,
    }.freeze

    # Absolute path of the document.
    attr_reader :filename

    # filename - path to the document; it is expanded to an absolute path.
    # options  - Hash; :content_type (String) skips extension-based guessing.
    def initialize(filename, options = {})
      @filename = File.expand_path(filename)
      @content_type = options[:content_type]
    end

    # Returns the extracted text with surrounding whitespace stripped.
    #
    # Raises UnknownTypeError when #type is nil, instead of failing with a
    # NoMethodError for a method literally named "extract_from_".
    def text
      kind = type
      if kind.nil?
        raise UnknownTypeError,
              "don't know how to extract text from #{filename.inspect} (content type: #{content_type.inspect})"
      end
      send("extract_from_#{kind}")
    end

    # Returns :pdf, :word or nil.
    #
    # A supplied content type always wins (an unrecognised one yields nil);
    # otherwise the type is guessed from the file extension, ignoring case.
    def type
      return CONTENT_TYPE_CONVERSIONS[content_type] if content_type
      # NOTE(review): the patterns are deliberately left unanchored as in the
      # original, so ".docx" still maps to :word -- confirm wvWare copes with it.
      case File.extname(@filename)
      when /pdf/i then :pdf
      when /doc/i then :word
      end
    end

    private

    def content_type
      @content_type
    end

    # Shell command that prints the PDF's text on stdout. The file name is
    # shell-escaped so spaces and metacharacters cannot break or hijack the
    # command; it is always absolute, so it cannot be mistaken for an option.
    def pdf_command
      "pdftotext #{Shellwords.escape(filename)} - 2>/dev/null"
    end

    # Shell command that prints the Word document's text on stdout. The
    # config path is expanded first so "~" and relative paths keep working
    # once the value is escaped.
    def word_command
      config = Shellwords.escape(File.expand_path(Textractor.wvText_path))
      "wvWare -c utf-8 --nographics -x #{config} #{Shellwords.escape(filename)} 2>/dev/null"
    end

    def extract_from_pdf
      `#{pdf_command}`.strip
    end

    def extract_from_word
      `#{word_command}`.strip
    end

  end

end
@@ -0,0 +1,69 @@
# Specs for Textractor::Document: construction, text extraction from the
# bundled fixtures and document type detection.
#
# spec_helper is required relative to this file so the specs also load when
# "." is not on $LOAD_PATH or when they are started from another directory.
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

describe Textractor::Document do

  PDF_DOCUMENT_FIXTURE  = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
  WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")

  it 'should require a filename to create' do
    expect { Textractor::Document.new }.to raise_error(ArgumentError)
    Textractor::Document.new('filename').filename.should == File.expand_path('filename')
  end

  # These examples shell out to pdftotext / wvWare, so both tools have to be
  # installed for them to pass.
  describe "#text" do

    describe "with pdf document" do

      it 'should extract the text from the document' do
        @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
        @doc.text.should == "Ruby on rails developer"
      end

    end

    describe "with word document" do

      it 'should extract the text from the document' do
        @doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
        @doc.text.should == "Ruby on rails developer"
      end

    end

  end

  describe "#type" do

    describe "with no content type provided" do
      it 'should return :pdf for PDF documents' do
        @doc = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
        @doc.type.should == :pdf
      end

      it 'should return :word for Word documents' do
        @doc = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
        @doc.type.should == :word
      end

      it 'should return nil for unknown documents' do
        @doc = Textractor::Document.new("foo.bar")
        @doc.type.should == nil
      end
    end

    describe "with a content type provided" do

      # Every known content type is tried against both fixtures: the result
      # must follow the content type, never the file extension.
      it 'should ignore the extension of the file' do
        [PDF_DOCUMENT_FIXTURE, WORD_DOCUMENT_FIXTURE].each do |filename|
          Textractor::Document::CONTENT_TYPE_CONVERSIONS.each do |content_type, type|
            @doc = Textractor::Document.new(filename, :content_type => content_type)
            @doc.type.should == type
          end
        end
      end

    end

  end

end
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
# Shared bootstrap for the spec suite: puts lib/ and spec/ (in that order) at
# the front of the load path, then loads the library and RSpec 1.x.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'), spec_dir)

require 'textractor'
require 'spec'
require 'spec/autorun'

# No suite-wide configuration yet; the block is kept as the place to add it.
Spec::Runner.configure { |config| }
@@ -0,0 +1,32 @@
# Specs for the module-level Textractor API (wvText_path and text_from_file).
#
# spec_helper is required relative to this file so the specs also load when
# "." is not on $LOAD_PATH or when they are started from another directory.
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

describe Textractor do

  # Restore the default after every example (not just once at the end) so an
  # overridden path can never leak into another example. Assigning nil through
  # the public setter makes wvText_path fall back to DEFAULT_WV_TEXT_PATH.
  after(:each) do
    Textractor.wvText_path = nil
  end

  describe ".wvText_path" do

    it 'should default to the file provided with the gem' do
      Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
    end

    it 'should use the new wvText_path if provided' do
      Textractor.wvText_path = "foo.bar"
      Textractor.wvText_path.should == "foo.bar"
    end

  end

  describe ".text_from_file" do

    # Document is replaced by a mock, so no external tool is invoked here.
    it 'should return the extracted text from the file' do
      document_path = 'word.doc'
      document = mock("Textractor::Document", :text => "Ruby on Rails developer")
      Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
      Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
    end

  end

end
@@ -0,0 +1,355 @@
1
+ <main>
2
+ <charentity>
3
+ <begin>ABW</begin>
4
+ </charentity>
5
+
6
+ <document>
7
+ <begin>
8
+ </begin>
9
+ <end>
10
+ </end>
11
+ </document>
12
+
13
+ <section>
14
+ <begin>
15
+ </begin>
16
+ <end>
17
+ </end>
18
+ </section>
19
+
20
+ <justification>
21
+ <left></left>
22
+ <right></right>
23
+ <center></center>
24
+ <block></block>
25
+ <asian></asian>
26
+ </justification>
27
+
28
+ <numbering>
29
+ <Arabic>type=&quot;1&quot;</Arabic>
30
+ <UpperRoman>type=&quot;I&quot;</UpperRoman>
31
+ <LowerRoman>type=&quot;i&quot;</LowerRoman>
32
+ <UpperCaseN>type=&quot;A&quot;</UpperCaseN>
33
+ <LowerCaseN>type=&quot;a&quot;</LowerCaseN>
34
+ </numbering>
35
+
36
+ <border>
37
+ <noned></noned>
38
+ <singled></singled>
39
+ <thickd></thickd>
40
+ <doubled></doubled>
41
+ <number4d></number4d>
42
+ <hairlined></hairlined>
43
+ <dotd></dotd>
44
+ <dashlargegapd></dashlargegapd>
45
+ <dotdashd></dotdashd>
46
+ <dotdotdashd></dotdotdashd>
47
+ <tripled></tripled>
48
+ <thin-thicksmallgapd></thin-thicksmallgapd>
49
+ <thick-thinsmallgapd></thick-thinsmallgapd>
50
+ <thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
51
+ <thin-thickmediumgapd></thin-thickmediumgapd>
52
+ <thick-thinmediumgapd></thick-thinmediumgapd>
53
+ <thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
54
+ <thin-thicklargegapd></thin-thicklargegapd>
55
+ <thick-thinlargegapd></thick-thinlargegapd>
56
+ <thin-thick-thinlargegapd></thin-thick-thinlargegapd>
57
+ <waved></waved>
58
+ <doublewaved></doublewaved>
59
+ <dashsmallgapd></dashsmallgapd>
60
+ <dashdotstrokedd></dashdotstrokedd>
61
+ <emboss3Dd></emboss3Dd>
62
+ <engrave3Dd></engrave3Dd>
63
+ <defaultd></defaultd>
64
+ </border>
65
+
66
+ <olist>
67
+ <begin></begin>
68
+ <end></end>
69
+ </olist>
70
+
71
+ <ulist>
72
+ <begin></begin>
73
+ <end></end>
74
+ </ulist>
75
+
76
+ <entry>
77
+ <begin></begin>
78
+ <end></end>
79
+ </entry>
80
+
81
+ <!-- the only thing of significance -->
82
+ <text>
83
+ <begin></begin>
84
+ <end>
85
+ </end>
86
+ </text>
87
+
88
+ <!--
89
+ this tableoverride option can be used to turn off handling of
90
+ these tags in tables, which I find is necessary for at least netscape
91
+ -->
92
+ <tableoverrides>
93
+ <ParaBefore>0</ParaBefore>
94
+ <ParaRight>0</ParaRight>
95
+ <ParaAfter>0</ParaAfter>
96
+ <ParaLeft>0</ParaLeft>
97
+ <ParaLeft1>0</ParaLeft1>
98
+ <VertMergedCells>0</VertMergedCells>
99
+ </tableoverrides>
100
+
101
+ <table>
102
+ <begin></begin>
103
+ <end></end>
104
+ </table>
105
+
106
+ <row>
107
+ <begin></begin>
108
+ <end></end>
109
+ </row>
110
+
111
+ <cell>
112
+ <begin></begin>
113
+ <end></end>
114
+ </cell>
115
+
116
+ <paragraph>
117
+ <begin><text.begin/></begin>
118
+ <end><text.end/></end>
119
+ </paragraph>
120
+
121
+ <!-- these are all the character properties that can show up in word -->
122
+ <bold><begin></begin><end></end></bold>
123
+ <italic><begin></begin><end></end></italic>
124
+
125
+ <!--
126
+ text that has been deleted and will be displayed with strikethrough when
127
+ revision marked text is to be displayed
128
+
129
+ use either this line...
130
+ -->
131
+ <RMarkDel><begin></begin>
132
+ <end></end>
133
+ </RMarkDel>
134
+
135
+ <!--
136
+ or uncomment below to make deleted text disappear (well, become commented out)
137
+ -->
138
+ <!--
139
+ <RMarkDel><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></RMarkDel>
140
+ -->
141
+
142
+ <!-- I don't even know what outline means -->
143
+ <outline><begin></begin><end></end></outline>
144
+ <smallcaps><begin></begin><end></end></smallcaps>
145
+ <caps><begin></begin><end></end></caps>
146
+ <vanish><begin></begin><end></end></vanish>
147
+
148
+ <!--If you uncomment this then the annotation text links will become commented out by html tags-->
149
+ <!--
150
+ <vanish><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></vanish>
151
+ -->
152
+
153
+ <!--
154
+ text that has been newly typed since the last time revision marks have been accepted
155
+ and will be displayed with underline when revision marked text is to be displayed
156
+
157
+ use either this line...
158
+ -->
159
+ <RMark><begin></begin><end></end></RMark>
160
+
161
+ <!--
162
+ or uncomment below to make the underline disappear
163
+ -->
164
+ <!--
165
+ <RMark><begin></begin><end></end></RMark>
166
+ -->
167
+
168
+
169
+ <strike><begin></begin><end></end></strike>
170
+ <shadow><begin></begin><end></end></shadow>
171
+ <lowercase><begin></begin><end></end></lowercase>
172
+ <emboss><begin></begin><end></end></emboss>
173
+ <imprint><begin></begin><end></end></imprint>
174
+ <!--double strike-->
175
+ <dstrike><begin></begin><end></end></dstrike>
176
+
177
+ <!--
178
+ ftc's
179
+ &
180
+ hps
181
+
182
+ keep them for font face and do that later.
183
+ -->
184
+
185
+ <super><begin></begin><end></end></super>
186
+ <sub><begin></begin><end></end></sub>
187
+
188
+ <singleu><begin></begin><end></end></singleu>
189
+ <wordu><begin></begin><end></end></wordu>
190
+ <doubleu><begin></begin><end></end></doubleu>
191
+ <dottedu><begin></begin><end></end></dottedu>
192
+ <hiddenu><begin></begin><end></end></hiddenu>
193
+ <thicku><begin></begin><end></end></thicku>
194
+ <dashu><begin></begin><end></end></dashu>
195
+ <dotu><begin></begin><end></end></dotu>
196
+ <dotdashu><begin></begin><end></end></dotdashu>
197
+ <dotdotdashu><begin></begin><end></end></dotdotdashu>
198
+ <waveu><begin></begin><end></end></waveu>
199
+
200
+ <!--
201
+ text whose properties have been changed since the last time revision marks have been accepted
202
+ and will be displayed with a note showing the change points.
203
+
204
+ use either this line (which is admittedly a bit scary looking, but harmless)...
205
+ -->
206
+ <PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
207
+
208
+ <!--
209
+ or uncomment below to make the notes disappear
210
+ -->
211
+ <!--
212
+ <PropRMark><begin></begin><end></end></PropRMark>
213
+ -->
214
+
215
+ <!--
216
+ <color>
217
+ -->
218
+ <Black><begin></begin><end></end></Black>
219
+ <Blue><begin></begin><end></end></Blue>
220
+ <Cyan><begin></begin><end></end></Cyan>
221
+ <Green><begin></begin><end></end></Green>
222
+ <Magenta><begin></begin><end></end></Magenta>
223
+ <Red><begin></begin><end></end></Red>
224
+ <Yellow><begin></begin><end></end></Yellow>
225
+ <White><begin></begin><end></end></White>
226
+ <DkBlue><begin></begin><end></end></DkBlue>
227
+ <DkCyan><begin></begin><end></end></DkCyan>
228
+ <DkGreen><begin></begin><end></end></DkGreen>
229
+ <DkMagenta><begin></begin><end></end></DkMagenta>
230
+ <DkRed><begin></begin><end></end></DkRed>
231
+ <DkYellow><begin></begin><end></end></DkYellow>
232
+ <DkGray><begin></begin><end></end></DkGray>
233
+ <LtGray><begin></begin><end></end></LtGray>
234
+ <!--
235
+ </color>
236
+ -->
237
+
238
+ <!--
239
+ <animation>
240
+ -->
241
+ <LasVegas><begin></begin><end></end></LasVegas>
242
+ <BackgroundBlink><begin></begin><end></end></BackgroundBlink>
243
+ <SparkleText><begin></begin><end></end></SparkleText>
244
+ <MarchingAnts><begin></begin><end></end></MarchingAnts>
245
+ <MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
246
+ <Shimmer><begin></begin><end></end></Shimmer>
247
+ <!--
248
+ </animation>
249
+ -->
250
+
251
+ <!--
252
+ I don't understand what this one is, and I've never come across it
253
+
254
+ use this sample line (which is admittedly a bit scary looking, but harmless)...
255
+ -->
256
+ <DispFldRMark><begin></begin><end></end></DispFldRMark>
257
+
258
+ <!--
259
+ or uncomment below to ignore it, the previous might even crash wv ?
260
+ -->
261
+ <!--
262
+ <DispFldRMark><begin></begin><end></end></DispFldRMark>
263
+ -->
264
+
265
+ <animation>
266
+ <begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
267
+ <end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
268
+ </animation>
269
+
270
+ <fontstr>
271
+ <begin></begin>
272
+ <end></end>
273
+ </fontstr>
274
+
275
+ <comment>
276
+ <begin>
277
+ </begin>
278
+ <end>
279
+ </end>
280
+ </comment>
281
+
282
+ <style name="Normal">
283
+ <character>
284
+ <begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
285
+ <end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
286
+ </character>
287
+
288
+ <!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
289
+ border-top: thin <bordertopstyle/> <bordertopcolor/>;
290
+ border-left: thin <borderleftstyle/> <borderleftcolor/>;
291
+ border-right: thin <borderrightstyle/> <borderrightcolor/>;
292
+ border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
293
+ -->
294
+
295
+
296
+ <pmargin>
297
+ <begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
298
+ </pmargin>
299
+
300
+ <pborder>
301
+ <begin>
302
+ <!--
303
+ border: thin <borderleftstyle/> <borderleftcolor/>;
304
+ border-top: thin <bordertopstyle/> <bordertopcolor/>;
305
+ border-left: thin <borderleftstyle/> <borderleftcolor/>;
306
+ border-right: thin <borderrightstyle/> <borderrightcolor/>;
307
+ border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
308
+ -->
309
+ </begin>
310
+ </pborder>
311
+
312
+ <picture>
313
+ <begin>
314
+ </begin>
315
+ <!-- images are lacking for now -->
316
+
317
+ </picture>
318
+
319
+ </style>
320
+
321
+ <!--we need to be able to override the character properties-->
322
+ <!--
323
+ <style name="Normal">
324
+ <character>
325
+ <begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
326
+ <end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
327
+ </character>
328
+
329
+ <text>
330
+ <begin></begin>
331
+ <end>
332
+ </end>
333
+ </text>
334
+
335
+ </style>
336
+
337
+ <style name="Heading 1">
338
+
339
+ <character>
340
+ <begin></begin>
341
+ <end></end>
342
+ </character>
343
+
344
+ <text>
345
+ <begin></begin>
346
+ <end>
347
+ </end>
348
+ </text>
349
+
350
+
351
+
352
+ </style>
353
+ -->
354
+
355
+ </main>
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: textractor
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Michael Guterl
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-04-20 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: simple wrapper for extracting text from PDF and Word documents
35
+ email: mguterl@gmail.com
36
+ executables: []
37
+
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - LICENSE
42
+ - README.md
43
+ files:
44
+ - .document
45
+ - .gitignore
46
+ - LICENSE
47
+ - README.md
48
+ - Rakefile
49
+ - VERSION
50
+ - lib/textractor.rb
51
+ - lib/textractor/document.rb
52
+ - spec/document_spec.rb
53
+ - spec/fixtures/document.doc
54
+ - spec/fixtures/document.pdf
55
+ - spec/spec.opts
56
+ - spec/spec_helper.rb
57
+ - spec/textractor_spec.rb
58
+ - support/wvText.xml
59
+ has_rdoc: true
60
+ homepage: http://github.com/mguterl/textractor
61
+ licenses: []
62
+
63
+ post_install_message:
64
+ rdoc_options:
65
+ - --charset=UTF-8
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ segments:
80
+ - 0
81
+ version: "0"
82
+ requirements: []
83
+
84
+ rubyforge_project:
85
+ rubygems_version: 1.3.6
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: simple wrapper for extracting text from PDF and Word documents
89
+ test_files:
90
+ - spec/document_spec.rb
91
+ - spec/spec_helper.rb
92
+ - spec/textractor_spec.rb