textractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.md
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Michael Guterl
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,54 @@
1
+ # textractor
2
+
3
+ textractor is a ruby library that provides a simple wrapper for
4
+ extracting text from PDF and Word documents.
5
+
6
+ ## Setup
7
+
8
+ In order to use textractor you have to install a few command line
9
+ tools.
10
+
11
+ ### OS X
12
+
13
+ port install wv pdftohtml links
14
+
15
+ I also recommend passing +no_x11 to the install command, but
16
+ this may not work on all systems due to dependency issues.
17
+
18
+ port install wv pdftohtml links +no_x11
19
+
20
+ ### Ubuntu 8.04
21
+
22
+ apt-get install wv xpdf-utils links
23
+
24
+ ## Usage
25
+
26
+ Due to textractor's reliance on command line tools, all the methods in
27
+ textractor work on paths, not File objects.
28
+
29
+ document = Textractor::Document.new(path_to_document)
30
+ document.text # => "Ruby on rails developer"
31
+
32
+ There is also a convenience method on Textractor.
33
+
34
+ Textractor.text_from_file(path_to_document) # => "Ruby on rails developer"
35
+
36
+ Textractor will attempt to guess what type of document you're trying
37
+ to extract text from. However, if you know the content type of your
38
+ document, you can provide it and Textractor won't guess.
39
+
40
+ Textractor.text_from_file(path_to_document, :content_type => "application/doc")
41
+
42
+ ## Note on Patches/Pull Requests
43
+
44
+ * Fork the project.
45
+ * Make your feature addition or bug fix.
46
+ * Add tests for it. This is important so I don't break it in a
47
+ future version unintentionally.
48
+ * Commit, do not mess with rakefile, version, or history.
49
+ (if you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull)
50
+ * Send me a pull request. Bonus points for topic branches.
51
+
52
+ ## Copyright
53
+
54
+ Copyright (c) 2010 Michael Guterl. See LICENSE for details.
@@ -0,0 +1,45 @@
1
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. Jeweler is optional: when it (or one of its
# dependencies) cannot be loaded, a hint is printed and the remaining tasks
# below are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "textractor"
    gem.summary = %Q{simple wrapper for extracting text from PDF and Word documents}
    gem.description = %Q{simple wrapper for extracting text from PDF and Word documents}
    gem.email = "mguterl@gmail.com"
    gem.homepage = "http://github.com/mguterl/textractor"
    gem.authors = ["Michael Guterl"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake spec` runs every *_spec.rb file under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined anywhere in this Rakefile (presumably it
# comes from the Jeweler tasks above). Only depend on it when it exists, so
# that `rake spec` keeps working after the LoadError branch was taken.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

# `rake rdoc` builds the API documentation into ./rdoc.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: keep a trailing newline in VERSION out of the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "textractor #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,18 @@
1
# Top-level namespace of the gem. Offers a one-call text extraction helper and
# holds the configurable location of the wvText.xml file that
# Textractor::Document passes to wvWare via -x.
module Textractor
  # Document is loaded lazily on first reference.
  autoload :Document, "textractor/document"

  # Location of the wvText.xml that sits in ../support relative to this file.
  DEFAULT_WV_TEXT_PATH = File.expand_path(File.dirname(__FILE__) + "/../support/wvText.xml")

  class << self
    # Overrides the wvText.xml location; assigning nil restores the default.
    attr_writer :wvText_path

    # Builds a Document for +filename+ (forwarding +options+, e.g.
    # :content_type) and returns its extracted text.
    def text_from_file(filename, options = {})
      Textractor::Document.new(filename, options).text
    end

    # Returns the configured wvText.xml path, or DEFAULT_WV_TEXT_PATH when
    # no override has been set.
    def wvText_path
      @wvText_path || DEFAULT_WV_TEXT_PATH
    end
  end
end
@@ -0,0 +1,51 @@
1
require 'shellwords'

module Textractor

  # A document on disk whose plain text is extracted by shelling out to an
  # external converter: pdftotext for PDF files, wvWare for Word files.
  class Document

    # Maps a caller-supplied content type to the internal document type
    # (:pdf or :word). Lookups are exact string matches.
    CONTENT_TYPE_CONVERSIONS = {
      'application/pdf'    => :pdf,
      'application/x-pdf'  => :pdf,
      'application/doc'    => :word,
      'application/x-doc'  => :word,
      'application/msword' => :word, # the registered media type for .doc files
    }

    # Absolute path of the document (expanded in #initialize).
    attr_reader :filename

    # filename - path to the document; expanded to an absolute path.
    # options  - Hash; :content_type (optional) selects the document type
    #            explicitly instead of guessing from the file extension.
    def initialize(filename, options = {})
      @filename     = File.expand_path(filename)
      @content_type = options[:content_type]
    end

    # Returns the extracted text with surrounding whitespace stripped.
    #
    # Dispatches to extract_from_<type>. When #type is nil there is no such
    # method, so a NoMethodError is raised.
    def text
      send("extract_from_#{type}")
    end

    # Returns :pdf, :word, or nil when the type cannot be determined.
    #
    # An explicit content type always wins, and yields nil when it is not in
    # CONTENT_TYPE_CONVERSIONS. Otherwise the file extension is matched
    # case-insensitively, so ".PDF" and ".DOC" are recognised as well. The
    # patterns are unanchored, so an extension that merely contains "doc"
    # (e.g. ".docx") also maps to :word.
    def type
      return CONTENT_TYPE_CONVERSIONS[content_type] if content_type

      case File.extname(@filename)
      when /pdf/i then :pdf
      when /doc/i then :word
      end
    end

    private

    def content_type
      @content_type
    end

    # Runs pdftotext and returns its stdout, stripped.
    def extract_from_pdf
      `#{pdf_command}`.strip
    end

    # Runs wvWare and returns its stdout, stripped.
    def extract_from_word
      `#{word_command}`.strip
    end

    # Shell command line for PDF extraction. The path is shell-escaped so
    # that spaces or shell metacharacters in it cannot split the argument or
    # run additional commands. "-" sends the text to stdout; stderr is dropped.
    def pdf_command
      "pdftotext #{Shellwords.escape(filename)} - 2>/dev/null"
    end

    # Shell command line for Word extraction. Both the wvText.xml path and
    # the document path are shell-escaped; stderr is dropped.
    def word_command
      "wvWare -c utf-8 --nographics -x #{Shellwords.escape(Textractor.wvText_path)} " \
        "#{Shellwords.escape(filename)} 2>/dev/null"
    end

  end

end
@@ -0,0 +1,69 @@
1
require 'spec/spec_helper'

# Specs for Textractor::Document, run against the PDF and Word fixtures stored
# in the fixtures directory next to this file.
describe Textractor::Document do

  PDF_DOCUMENT_FIXTURE  = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.pdf")
  WORD_DOCUMENT_FIXTURE = File.expand_path(File.dirname(__FILE__) + "/fixtures/document.doc")

  it 'should require a filename to create' do
    expect { Textractor::Document.new }.to raise_error(ArgumentError)

    document = Textractor::Document.new('filename')
    document.filename.should == File.expand_path('filename')
  end

  describe "#text" do

    describe "with pdf document" do
      it 'should extract the text from the document' do
        document = Textractor::Document.new(PDF_DOCUMENT_FIXTURE)
        document.text.should == "Ruby on rails developer"
      end
    end

    describe "with word document" do
      it 'should extract the text from the document' do
        document = Textractor::Document.new(WORD_DOCUMENT_FIXTURE)
        document.text.should == "Ruby on rails developer"
      end
    end

  end

  describe "#type" do

    describe "with no content type provided" do
      it 'should return :pdf for PDF documents' do
        Textractor::Document.new(PDF_DOCUMENT_FIXTURE).type.should == :pdf
      end

      it 'should return :word for Word documents' do
        Textractor::Document.new(WORD_DOCUMENT_FIXTURE).type.should == :word
      end

      it 'should return nil for unknown documents' do
        Textractor::Document.new("foo.bar").type.should == nil
      end
    end

    describe "with a content type provided" do
      # Every known content type is tried against both fixtures, so the
      # extension and the content type disagree in half of the combinations.
      it 'should ignore the extension of the file' do
        [PDF_DOCUMENT_FIXTURE, WORD_DOCUMENT_FIXTURE].each do |fixture|
          Textractor::Document::CONTENT_TYPE_CONVERSIONS.each_pair do |mime, expected_type|
            document = Textractor::Document.new(fixture, :content_type => mime)
            document.type.should == expected_type
          end
        end
      end
    end

  end

end
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
1
# Shared bootstrap for the spec suite: puts this directory and the sibling
# lib directory at the front of the load path, then loads the library and
# the spec runner.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(spec_dir)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))

%w[textractor spec spec/autorun].each { |library| require library }

Spec::Runner.configure do |config|
  # No configuration is set here.
end
@@ -0,0 +1,32 @@
1
require 'spec/spec_helper'

# Specs for the module-level helpers on Textractor.
describe Textractor do

  # Reset the override after every example (not once per group), so an
  # example that changes wvText_path cannot affect the examples that run
  # after it and the default-path example does not depend on run order.
  after(:each) do
    Textractor.wvText_path = nil
  end

  describe ".wvText_path" do

    it 'should default to the file provided with the gem' do
      Textractor.wvText_path.should == Textractor::DEFAULT_WV_TEXT_PATH
    end

    it 'should use the new wvText_path if provided' do
      Textractor.wvText_path = "foo.bar"
      Textractor.wvText_path.should == "foo.bar"
    end

  end

  describe ".text_from_file" do

    # Document is mocked, so no external converter is invoked here.
    it 'should return the extracted text from the file' do
      document_path = 'word.doc'
      document = mock("Textractor::Document", :text => "Ruby on Rails developer")
      Textractor::Document.should_receive(:new).with(document_path, :content_type => "application/doc").and_return(document)
      Textractor.text_from_file(document_path, :content_type => "application/doc").should == "Ruby on Rails developer"
    end

  end

end
@@ -0,0 +1,355 @@
1
+ <main>
2
+ <charentity>
3
+ <begin>ABW</begin>
4
+ </charentity>
5
+
6
+ <document>
7
+ <begin>
8
+ </begin>
9
+ <end>
10
+ </end>
11
+ </document>
12
+
13
+ <section>
14
+ <begin>
15
+ </begin>
16
+ <end>
17
+ </end>
18
+ </section>
19
+
20
+ <justification>
21
+ <left></left>
22
+ <right></right>
23
+ <center></center>
24
+ <block></block>
25
+ <asian></asian>
26
+ </justification>
27
+
28
+ <numbering>
29
+ <Arabic>type=&quot;1&quot;</Arabic>
30
+ <UpperRoman>type=&quot;I&quot;</UpperRoman>
31
+ <LowerRoman>type=&quot;i&quot;</LowerRoman>
32
+ <UpperCaseN>type=&quot;A&quot;</UpperCaseN>
33
+ <LowerCaseN>type=&quot;a&quot;</LowerCaseN>
34
+ </numbering>
35
+
36
+ <border>
37
+ <noned></noned>
38
+ <singled></singled>
39
+ <thickd></thickd>
40
+ <doubled></doubled>
41
+ <number4d></number4d>
42
+ <hairlined></hairlined>
43
+ <dotd></dotd>
44
+ <dashlargegapd></dashlargegapd>
45
+ <dotdashd></dotdashd>
46
+ <dotdotdashd></dotdotdashd>
47
+ <tripled></tripled>
48
+ <thin-thicksmallgapd></thin-thicksmallgapd>
49
+ <thick-thinsmallgapd></thick-thinsmallgapd>
50
+ <thin-thick-thinsmallgapd></thin-thick-thinsmallgapd>
51
+ <thin-thickmediumgapd></thin-thickmediumgapd>
52
+ <thick-thinmediumgapd></thick-thinmediumgapd>
53
+ <thin-thick-thinmediumgapd></thin-thick-thinmediumgapd>
54
+ <thin-thicklargegapd></thin-thicklargegapd>
55
+ <thick-thinlargegapd></thick-thinlargegapd>
56
+ <thin-thick-thinlargegapd></thin-thick-thinlargegapd>
57
+ <waved></waved>
58
+ <doublewaved></doublewaved>
59
+ <dashsmallgapd></dashsmallgapd>
60
+ <dashdotstrokedd></dashdotstrokedd>
61
+ <emboss3Dd></emboss3Dd>
62
+ <engrave3Dd></engrave3Dd>
63
+ <defaultd></defaultd>
64
+ </border>
65
+
66
+ <olist>
67
+ <begin></begin>
68
+ <end></end>
69
+ </olist>
70
+
71
+ <ulist>
72
+ <begin></begin>
73
+ <end></end>
74
+ </ulist>
75
+
76
+ <entry>
77
+ <begin></begin>
78
+ <end></end>
79
+ </entry>
80
+
81
+ <!-- the only thing of significance -->
82
+ <text>
83
+ <begin></begin>
84
+ <end>
85
+ </end>
86
+ </text>
87
+
88
+ <!--
89
+ this tableoverride option can be used to turn off handling of
90
+ these tags in tables, which I find is necessary for at least netscape
91
+ -->
92
+ <tableoverrides>
93
+ <ParaBefore>0</ParaBefore>
94
+ <ParaRight>0</ParaRight>
95
+ <ParaAfter>0</ParaAfter>
96
+ <ParaLeft>0</ParaLeft>
97
+ <ParaLeft1>0</ParaLeft1>
98
+ <VertMergedCells>0</VertMergedCells>
99
+ </tableoverrides>
100
+
101
+ <table>
102
+ <begin></begin>
103
+ <end></end>
104
+ </table>
105
+
106
+ <row>
107
+ <begin></begin>
108
+ <end></end>
109
+ </row>
110
+
111
+ <cell>
112
+ <begin></begin>
113
+ <end></end>
114
+ </cell>
115
+
116
+ <paragraph>
117
+ <begin><text.begin/></begin>
118
+ <end><text.end/></end>
119
+ </paragraph>
120
+
121
+ <!-- these are all the character properties that can show up in word -->
122
+ <bold><begin></begin><end></end></bold>
123
+ <italic><begin></begin><end></end></italic>
124
+
125
+ <!--
126
+ text that has been deleted and will be displayed with strikethrough when
127
+ revision marked text is to be displayed
128
+
129
+ use either this line...
130
+ -->
131
+ <RMarkDel><begin></begin>
132
+ <end></end>
133
+ </RMarkDel>
134
+
135
+ <!--
136
+ or uncomment below to make deleted text disappear (well, become commented out)
137
+ -->
138
+ <!--
139
+ <RMarkDel><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></RMarkDel>
140
+ -->
141
+
142
+ <!-- I don't even know what outline means -->
143
+ <outline><begin></begin><end></end></outline>
144
+ <smallcaps><begin></begin><end></end></smallcaps>
145
+ <caps><begin></begin><end></end></caps>
146
+ <vanish><begin></begin><end></end></vanish>
147
+
148
+ <!--If you uncomment this then the annotation text links will become commented out by html tags-->
149
+ <!--
150
+ <vanish><begin>&lt;!-&#45;</begin><end>-&#45;&gt;</end></vanish>
151
+ -->
152
+
153
+ <!--
154
+ text that has been newly typed since the last time revision marks have been accepted
155
+ and will be displayed with underline when revision marked text is to be displayed
156
+
157
+ use either this line...
158
+ -->
159
+ <RMark><begin></begin><end></end></RMark>
160
+
161
+ <!--
162
+ or uncomment below to make the underline disappear
163
+ -->
164
+ <!--
165
+ <RMark><begin></begin><end></end></RMark>
166
+ -->
167
+
168
+
169
+ <strike><begin></begin><end></end></strike>
170
+ <shadow><begin></begin><end></end></shadow>
171
+ <lowercase><begin></begin><end></end></lowercase>
172
+ <emboss><begin></begin><end></end></emboss>
173
+ <imprint><begin></begin><end></end></imprint>
174
+ <!--double strike-->
175
+ <dstrike><begin></begin><end></end></dstrike>
176
+
177
+ <!--
178
+ ftc's
179
+ &
180
+ hps
181
+
182
+ keep them for font face and do that later.
183
+ -->
184
+
185
+ <super><begin></begin><end></end></super>
186
+ <sub><begin></begin><end></end></sub>
187
+
188
+ <singleu><begin></begin><end></end></singleu>
189
+ <wordu><begin></begin><end></end></wordu>
190
+ <doubleu><begin></begin><end></end></doubleu>
191
+ <dottedu><begin></begin><end></end></dottedu>
192
+ <hiddenu><begin></begin><end></end></hiddenu>
193
+ <thicku><begin></begin><end></end></thicku>
194
+ <dashu><begin></begin><end></end></dashu>
195
+ <dotu><begin></begin><end></end></dotu>
196
+ <dotdashu><begin></begin><end></end></dotdashu>
197
+ <dotdotdashu><begin></begin><end></end></dotdotdashu>
198
+ <waveu><begin></begin><end></end></waveu>
199
+
200
+ <!--
201
+ text whose properties have been changed since the last time revision marks have been accepted
202
+ and will be displayed with a note showing the change points.
203
+
204
+ use either this line (which is admittedly a bit scary looking, but harmless)...
205
+ -->
206
+ <PropRMark><begin><ibstPropRMark/></begin><end></end></PropRMark>
207
+
208
+ <!--
209
+ or uncomment below to make the notes disappear
210
+ -->
211
+ <!--
212
+ <PropRMark><begin></begin><end></end></PropRMark>
213
+ -->
214
+
215
+ <!--
216
+ <color>
217
+ -->
218
+ <Black><begin></begin><end></end></Black>
219
+ <Blue><begin></begin><end></end></Blue>
220
+ <Cyan><begin></begin><end></end></Cyan>
221
+ <Green><begin></begin><end></end></Green>
222
+ <Magenta><begin></begin><end></end></Magenta>
223
+ <Red><begin></begin><end></end></Red>
224
+ <Yellow><begin></begin><end></end></Yellow>
225
+ <White><begin></begin><end></end></White>
226
+ <DkBlue><begin></begin><end></end></DkBlue>
227
+ <DkCyan><begin></begin><end></end></DkCyan>
228
+ <DkGreen><begin></begin><end></end></DkGreen>
229
+ <DkMagenta><begin></begin><end></end></DkMagenta>
230
+ <DkRed><begin></begin><end></end></DkRed>
231
+ <DkYellow><begin></begin><end></end></DkYellow>
232
+ <DkGray><begin></begin><end></end></DkGray>
233
+ <LtGray><begin></begin><end></end></LtGray>
234
+ <!--
235
+ </color>
236
+ -->
237
+
238
+ <!--
239
+ <animation>
240
+ -->
241
+ <LasVegas><begin></begin><end></end></LasVegas>
242
+ <BackgroundBlink><begin></begin><end></end></BackgroundBlink>
243
+ <SparkleText><begin></begin><end></end></SparkleText>
244
+ <MarchingAnts><begin></begin><end></end></MarchingAnts>
245
+ <MarchingRedAnts><begin></begin><end></end></MarchingRedAnts>
246
+ <Shimmer><begin></begin><end></end></Shimmer>
247
+ <!--
248
+ </animation>
249
+ -->
250
+
251
+ <!--
252
+ I don't understand what this one is, and I've never come across it
253
+
254
+ use this sample line (which is admittedly a bit scary looking, but harmless)...
255
+ -->
256
+ <DispFldRMark><begin></begin><end></end></DispFldRMark>
257
+
258
+ <!--
259
+ or uncomment below to ignore it, the previous might even crash wv ?
260
+ -->
261
+ <!--
262
+ <DispFldRMark><begin></begin><end></end></DispFldRMark>
263
+ -->
264
+
265
+ <animation>
266
+ <begin><LasVegas.begin/><BackgroundBlink.begin/><SparkleText.begin/><MarchingAnts.begin/><MarchingRedAnts.begin/><Shimmer.begin/></begin>
267
+ <end><Shimmer.end/><MarchingRedAnts.end/><MarchingAnts.end/><SparkleText.end/><BackgroundBlink.end/><LasVegas.end/></end>
268
+ </animation>
269
+
270
+ <fontstr>
271
+ <begin></begin>
272
+ <end></end>
273
+ </fontstr>
274
+
275
+ <comment>
276
+ <begin>
277
+ </begin>
278
+ <end>
279
+ </end>
280
+ </comment>
281
+
282
+ <style name="Normal">
283
+ <character>
284
+ <begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
285
+ <end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
286
+ </character>
287
+
288
+ <!-- Netscape does not handle this correctly yet; here is how each different side of the border should work.
289
+ border-top: thin <bordertopstyle/> <bordertopcolor/>;
290
+ border-left: thin <borderleftstyle/> <borderleftcolor/>;
291
+ border-right: thin <borderrightstyle/> <borderrightcolor/>;
292
+ border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
293
+ -->
294
+
295
+
296
+ <pmargin>
297
+ <begin><!-- <mmParaBefore/> <mmParaRight/> <mmParaAfter/> <mmParaLeft/>;--></begin>
298
+ </pmargin>
299
+
300
+ <pborder>
301
+ <begin>
302
+ <!--
303
+ border: thin <borderleftstyle/> <borderleftcolor/>;
304
+ border-top: thin <bordertopstyle/> <bordertopcolor/>;
305
+ border-left: thin <borderleftstyle/> <borderleftcolor/>;
306
+ border-right: thin <borderrightstyle/> <borderrightcolor/>;
307
+ border-bottom: thin <borderbottomstyle/> <borderbottomcolor/>
308
+ -->
309
+ </begin>
310
+ </pborder>
311
+
312
+ <picture>
313
+ <begin>
314
+ </begin>
315
+ <!-- images are lacking for now -->
316
+
317
+ </picture>
318
+
319
+ </style>
320
+
321
+ <!--we need to be able to override the character properties-->
322
+ <!--
323
+ <style name="Normal">
324
+ <character>
325
+ <begin><PropRMark.begin/><DispFldRMark.begin/><animation.begin/><fontstr.begin/><bold.begin/><italic.begin/><strike.begin/><RMarkDel.begin/><outline.begin/><smallcaps.begin/><caps.begin/><vanish.begin/><RMark.begin/><shadow.begin/><lowercase.begin/><emboss.begin/><imprint.begin/><dstrike.begin/><super.begin/><sub.begin/><singleu.begin/><wordu.begin/><doubleu.begin/><dottedu.begin/><hiddenu.begin/><thicku.begin/><dashu.begin/><dotu.begin/><dotdashu.begin/><dotdotdashu.begin/><waveu.begin/></begin>
326
+ <end><waveu.end/><dotdotdashu.end/><dotdashu.end/><dotu.end/><dashu.end/><thicku.end/><hiddenu.end/><dottedu.end/><doubleu.end/><wordu.end/><singleu.end/><sub.end/><super.end/><dstrike.end/><imprint.end/><emboss.end/><lowercase.end/><shadow.end/><RMark.end/><vanish.end/><caps.end/><smallcaps.end/><outline.end/><RMarkDel.end/><strike.end/><italic.end/><bold.end/><fontstr.end/><animation.end/><DispFldRMark.end/><PropRMark.end/></end>
327
+ </character>
328
+
329
+ <text>
330
+ <begin></begin>
331
+ <end>
332
+ </end>
333
+ </text>
334
+
335
+ </style>
336
+
337
+ <style name="Heading 1">
338
+
339
+ <character>
340
+ <begin></begin>
341
+ <end></end>
342
+ </character>
343
+
344
+ <text>
345
+ <begin></begin>
346
+ <end>
347
+ </end>
348
+ </text>
349
+
350
+
351
+
352
+ </style>
353
+ -->
354
+
355
+ </main>
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: textractor
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Michael Guterl
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-04-20 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: simple wrapper for extracting text from PDF and Word documents
35
+ email: mguterl@gmail.com
36
+ executables: []
37
+
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - LICENSE
42
+ - README.md
43
+ files:
44
+ - .document
45
+ - .gitignore
46
+ - LICENSE
47
+ - README.md
48
+ - Rakefile
49
+ - VERSION
50
+ - lib/textractor.rb
51
+ - lib/textractor/document.rb
52
+ - spec/document_spec.rb
53
+ - spec/fixtures/document.doc
54
+ - spec/fixtures/document.pdf
55
+ - spec/spec.opts
56
+ - spec/spec_helper.rb
57
+ - spec/textractor_spec.rb
58
+ - support/wvText.xml
59
+ has_rdoc: true
60
+ homepage: http://github.com/mguterl/textractor
61
+ licenses: []
62
+
63
+ post_install_message:
64
+ rdoc_options:
65
+ - --charset=UTF-8
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ segments:
80
+ - 0
81
+ version: "0"
82
+ requirements: []
83
+
84
+ rubyforge_project:
85
+ rubygems_version: 1.3.6
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: simple wrapper for extracting text from PDF and Word documents
89
+ test_files:
90
+ - spec/document_spec.rb
91
+ - spec/spec_helper.rb
92
+ - spec/textractor_spec.rb