uhferret 1.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/COPYING.txt +674 -0
- data/README.rdoc +79 -0
- data/bin/uhferret +129 -0
- data/bin/uhferret-server +68 -0
- data/ext/document.cpp +231 -0
- data/ext/document.h +89 -0
- data/ext/documentlist.cpp +229 -0
- data/ext/documentlist.h +80 -0
- data/ext/extconf.rb +2 -0
- data/ext/tokenreader.cpp +196 -0
- data/ext/tokenreader.h +85 -0
- data/ext/tokenset.cpp +111 -0
- data/ext/tokenset.h +73 -0
- data/ext/tupleset.cpp +150 -0
- data/ext/tupleset.h +92 -0
- data/ext/uhferret_lib_wrap.cxx +10726 -0
- data/lib/uhferret.rb +441 -0
- data/lib/utils.rb +93 -0
- data/lib/webferret.rb +246 -0
- metadata +71 -0
data/lib/uhferret.rb
ADDED
@@ -0,0 +1,441 @@
|
|
1
|
+
# This file is part of uhferret.
|
2
|
+
#
|
3
|
+
# Author:: Peter Lane
|
4
|
+
# Copyright:: Copyright 2011-2020, Peter Lane.
|
5
|
+
# License:: GPLv3
|
6
|
+
#
|
7
|
+
# uhferret is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU General Public License as published by
|
9
|
+
# the Free Software Foundation, either version 3 of the License, or
|
10
|
+
# (at your option) any later version.
|
11
|
+
#
|
12
|
+
# uhferret is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU General Public License
|
18
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
|
20
|
+
require 'erb'

require 'uhferret_lib'
require 'utils'
|
22
|
+
|
23
|
+
module UHFerret
|
24
|
+
|
25
|
+
# Constant to indicate document is a natural-language document.
|
26
|
+
TextDocument = Uhferret_lib::Document::TypeText
|
27
|
+
|
28
|
+
# Constant to indicate document is a computer program.
|
29
|
+
CodeDocument = Uhferret_lib::Document::TypeCode
|
30
|
+
|
31
|
+
# UHFerret::Ferret holds a reference to a list of documents, and
|
32
|
+
# provides methods to manage this list of documents, compute and
|
33
|
+
# retrieve similarities between documents.
|
34
|
+
class Ferret
|
35
|
+
|
36
|
+
# Constructs an instance of Ferret.
|
37
|
+
# block:: optional block is used to add files etc during construction.
|
38
|
+
def initialize(&block)
  # Start from a new native document list.
  @ferret = Uhferret_lib::DocumentList.new
  # The optional block is evaluated with this instance as self, so it can
  # call add / add_list_from_file directly.
  instance_eval(&block) if block
  # No comparison results exist until #run is called.
  @ferret_run = false
end
|
43
|
+
|
44
|
+
# Add given filename to list of documents.
|
45
|
+
# The type of document can be given as:
|
46
|
+
# * UHFerret::TextDocument, for natural language documents
|
47
|
+
# * UHFerret::CodeDocument, for c-style computer programs
|
48
|
+
# Optional third argument specifies the group_id for this document.
|
49
|
+
# The group_id can be used to suppress comparisons in some kinds
|
50
|
+
# of output.
|
51
|
+
# - If a pdf or word-processed document is added, it must first
|
52
|
+
# be converted to text. Ferret tries to do this, attaching .txt
|
53
|
+
# to the end of the filename.
|
54
|
+
def add(filename, type = TextDocument, id = 0)
  # Pdf and word-processed files are replaced by their text conversion.
  path =
    if Utils.is_pdf_document?(filename)
      Utils.convert_pdf_document(filename)
    elsif Utils.is_wp_document?(filename)
      Utils.convert_wp_document(filename)
    else
      filename
    end

  # A zero id means "no explicit group": ask the document list for a new one.
  group = id.zero? ? @ferret.GetNewGroupId : id
  @ferret.AddDocument(path, type, group)

  # Any earlier results no longer cover the whole document list.
  @ferret_run = false
end
|
63
|
+
|
64
|
+
# Add list of files specified in given filename
|
65
|
+
# The type of documents can be given as:
|
66
|
+
# * UHFerret::TextDocument, for natural language documents
|
67
|
+
# * UHFerret::CodeDocument, for c-style computer programs
|
68
|
+
def add_list_from_file(filename, type = TextDocument)
  # Reads +filename+ line by line. Each line is either a marker
  # ("START GROUP" / "END GROUP", any letter case, surrounding whitespace
  # ignored) or the path of a document to add with the given +type+.
  # Documents listed between the markers share one freshly allocated
  # group id; documents outside a group are added with id 0.
  within_group = false
  current_id = 0

  # File.foreach, unlike IO.foreach, never treats a name that starts with
  # "|" as a command to run.
  File.foreach(filename) do |raw_line|
    entry = raw_line.strip
    case entry.upcase
    when "START GROUP"
      within_group = true
      current_id = @ferret.GetNewGroupId
    when "END GROUP"
      within_group = false
    else
      # Lines that do not name a readable file (blank lines included) are
      # skipped silently.
      add(entry, type, (within_group ? current_id : 0)) if File.readable?(entry)
    end
  end

  @ferret_run = false
end
|
86
|
+
|
87
|
+
# Run ferret on the current document list.
|
88
|
+
# You must run ferret before retrieving measures of containment or resemblance.
|
89
|
+
#
|
90
|
+
# Raises an ArgumentError if there are not at least two documents in the document
|
91
|
+
# list.
|
92
|
+
def run
  # A comparison needs at least one pair of documents.
  if @ferret.Size < 2
    raise ArgumentError, "UHFerret needs at least two documents to run"
  end

  @ferret.RunFerret
  @ferret_run = true
  # Cache of index pairs used by each_pair; filled on first use.
  @sorted_pairs = []
end
|
101
|
+
|
102
|
+
# Return document in document list at given index position.
|
103
|
+
#
|
104
|
+
# Raises an IndexError if index is not valid.
|
105
|
+
def [](index)
  # Reject out-of-range positions before touching the native list.
  check_index(index)
  @ferret.getDocument(index)
end
|
110
|
+
|
111
|
+
# Apply provided block to each document in the document list.
|
112
|
+
def each
  # Yields every document in the list, in index order.
  # Without a block an Enumerator over the same documents is returned,
  # instead of failing with LocalJumpError at the first yield.
  return enum_for(:each) unless block_given?

  @ferret.Size.times do |i|
    yield @ferret.getDocument(i)
  end
end
|
117
|
+
|
118
|
+
# Return the number of documents in the document list.
|
119
|
+
def size
  # Document count as reported by the native document list.
  @ferret.Size
end
|
122
|
+
|
123
|
+
# Return the number of pairs of documents compared.
|
124
|
+
def num_pairs
  # Pair count as reported by the native document list.
  @ferret.NumberOfPairs
end
|
127
|
+
|
128
|
+
# Apply provided block to each pair of compared document indices,
|
129
|
+
# in descending order of resemblance.
|
130
|
+
#
|
131
|
+
# Raises an ArgumentError if ferret has not been 'run' before.
|
132
|
+
def each_pair
  # Yields (i, j) with i < j for every pair of document indices, highest
  # resemblance first. Pairs with equal resemblance keep their generation
  # order (0-1, 0-2, ..., 1-2, ...). Without a block an Enumerator is
  # returned. Raises ArgumentError if ferret has not been 'run'.
  check_ferret_has_run :each_pair
  return enum_for(:each_pair) unless block_given?

  if @sorted_pairs.empty?
    pairs = (0...@ferret.Size).to_a.combination(2).to_a
    # Score each pair exactly once; a sort comparator would call the native
    # ComputeResemblance twice per comparison.
    scored = pairs.each_with_index.map do |(i, j), position|
      [[-@ferret.ComputeResemblance(i, j), position], [i, j]]
    end
    @sorted_pairs = scored.sort_by(&:first).map(&:last)
  end

  @sorted_pairs.each do |i, j|
    yield(i, j)
  end
end
|
154
|
+
|
155
|
+
# Return the containment of doc_1 in doc_2.
|
156
|
+
#
|
157
|
+
# Raises an ArgumentError if ferret has not been 'run' before, and
|
158
|
+
# an IndexError if the document indices are not valid.
|
159
|
+
def containment(doc_1, doc_2)
  check_ferret_has_run :containment
  # Validate both indices, first argument first.
  [doc_1, doc_2].each { |index| check_index index }

  # Argument order matters here: it is passed through unchanged.
  @ferret.ComputeContainment(doc_1, doc_2)
end
|
166
|
+
|
167
|
+
# Return the resemblance of doc_1 and doc_2.
|
168
|
+
#
|
169
|
+
# Raises an ArgumentError if ferret has not been 'run' before, and
|
170
|
+
# an IndexError if the document indices are not valid.
|
171
|
+
def resemblance(doc_1, doc_2)
  check_ferret_has_run :resemblance
  [doc_1, doc_2].each { |index| check_index index }

  # A document compared with itself is reported as 1.0 without asking the
  # native list.
  return 1.0 if doc_1 == doc_2

  # The native call is always made with the smaller index first.
  lower, upper = [doc_1, doc_2].minmax
  @ferret.ComputeResemblance(lower, upper)
end
|
182
|
+
|
183
|
+
# Return the number of trigrams in given document index.
|
184
|
+
#
|
185
|
+
# Raises an ArgumentError if ferret has not been 'run' before, and
|
186
|
+
# an IndexError if the document index is not valid.
|
187
|
+
def trigram_count(index)
  check_ferret_has_run :trigram_count
  check_index(index)

  # Count comes straight from the native document list.
  @ferret.CountTrigrams(index)
end
|
193
|
+
|
194
|
+
# Return the total number of distinct trigrams in set of documents.
|
195
|
+
#
|
196
|
+
# Raises an ArgumentError if ferret has not been 'run' before calling.
|
197
|
+
def distinct_trigrams_count
  check_ferret_has_run(:distinct_trigrams_count)

  # Total comes straight from the native document list.
  @ferret.GetTotalTrigramCount
end
|
202
|
+
|
203
|
+
# Return the number of matching trigrams in given two document indices.
|
204
|
+
#
|
205
|
+
# Raises an ArgumentError if ferret has not been 'run' before, and
|
206
|
+
# an IndexError if the document indices are not valid.
|
207
|
+
def trigram_matches(doc_1, doc_2)
  check_ferret_has_run :trigram_matches
  [doc_1, doc_2].each { |index| check_index index }

  # Indices are passed through in the order given.
  @ferret.CountMatches(doc_1, doc_2)
end
|
214
|
+
|
215
|
+
# Write an XML report of the given two document indices into given filename.
|
216
|
+
#
|
217
|
+
# Raises an ArgumentError if ferret has not been 'run' before, and
|
218
|
+
# an IndexError if the document indices are not valid.
|
219
|
+
def xml_output(output_file, doc_1, doc_2)
  check_ferret_has_run :xml_output
  [doc_1, doc_2].each { |index| check_index index }

  File.open(output_file, "w") do |report|
    # Fixed prologue: XML declaration, stylesheet link, root element.
    report.puts '<?xml version="1.0" encoding="ISO-8859-1"?>',
                '<?xml-stylesheet type="text/xsl" href="uhferret.xsl" ?>',
                '<uhferret>'

    report.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
    report.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"

    # One <document> section per direction of the comparison.
    [[doc_1, doc_2], [doc_2, doc_1]].each do |source, other|
      write_xml_document(report, source, other)
    end

    report.puts "</uhferret>"
  end
end
|
237
|
+
|
238
|
+
# displays each pair of documents, sorted in order of similarity
|
239
|
+
def output_similarity_table(full_path = false)
  puts "Number of documents: #{size}"
  puts "Number of distinct trigrams: #{distinct_trigrams_count}"

  # Documents are labelled by full pathname or by bare filename.
  label = full_path ? :pathname : :filename

  each_pair do |i, j|
    # Pairs from the same group are not reported.
    next if self[i].group_id == self[j].group_id

    fields = [
      self[i].public_send(label),
      self[j].public_send(label),
      trigram_matches(i, j),
      trigram_count(i),
      trigram_count(j),
      resemblance(i, j)
    ]
    puts fields.join(" ; ")
  end
end
|
256
|
+
|
257
|
+
# outputs similarity table as a html page, sorted in order of similarity
|
258
|
+
def output_html_similarity_table
  # Prints an HTML page to standard output with one table row per pair of
  # documents from different groups, in each_pair order (most similar
  # first). At most MAX_TABLE_SIZE rows are written.
  # NOTE(review): MAX_TABLE_SIZE is not defined in this file - confirm it is
  # defined wherever this method is used.
  puts <<BODY
<html><body>
<h1>Ferret: Table of Comparisons</h1>
<p>Return to <a href="/ferret/home">Ferret home page</a>.</p>
<table border=1><tbody><tr><th>Index</th><th>Document 1</th><th>Document 2</th><th>Similarity</th><th>View</th></tr>
BODY
  idx = 0
  each_pair do |i, j|
    next if self[i].group_id == self[j].group_id

    idx += 1
    break if idx > MAX_TABLE_SIZE

    # Percent-encode every query value so that spaces, '&', '#', quotes etc.
    # in a path cannot break the link or the surrounding markup; the
    # parameter separator is written as &amp; because it sits inside an
    # HTML attribute.
    query = [["upload", Dir.pwd], ["file1", self[i].pathname], ["file2", self[j].pathname]]
              .map { |key, value| "#{key}=#{ERB::Util.url_encode(value)}" }
              .join("&amp;")

    puts <<ROW
<tr>
<td> #{idx} </td>
<td> #{format_file(self[i].pathname)} </td>
<td> #{format_file(self[j].pathname)} </td>
<td> #{format("%0.3f", resemblance(i, j))} </td>
<td><a href="/ferret/report?#{query}" target="_blank">View</a></td>
</tr>
ROW
  end
  # The table is not inside a paragraph, so no </p> is emitted here.
  puts "</tbody></table>"

  puts <<TAIL
<hr>
<p>Return to <a href="/ferret/home">Ferret home page.</a></p>
<hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>
</body></html>
TAIL
end
|
291
|
+
|
292
|
+
# outputs a list of trigrams with the document indices in which they
|
293
|
+
# appear, indices are space separated
|
294
|
+
def output_trigram_list
  # Prints one line per trigram in the tuple set:
  #   <trigram> FILES:[ <index> <index> ...  ]
  # Ordinary errors are reported on standard output rather than raised.
  begin
    tuples = @ferret.GetTupleSet
    tuples.Begin
    while tuples.HasMore
      print @ferret.MakeTrigramString(tuples.GetToken(0),
                                      tuples.GetToken(1),
                                      tuples.GetToken(2))
      print " FILES:[ "
      doc_indices = tuples.GetDocumentsForCurrentTuple
      doc_indices.size.times do |i|
        print "#{doc_indices[i]} "
      end
      print " ]"
      puts
      tuples.GetNext
    end
  rescue StandardError => ex
    # StandardError only: Interrupt (Ctrl-C), SystemExit and similar must
    # still be able to stop the program.
    puts "Error in writing trigram list: #{ex}"
  end
end
|
315
|
+
|
316
|
+
# outputs a table of all comparisons, suitable for loading into a spreadsheet
|
317
|
+
def output_all_comparisons
  indices = (0...size).to_a

  # Heading row: an empty first cell, then one filename per document.
  indices.each { |column| print ", #{self[column].filename}" }
  puts

  # One row per document: its filename, then its resemblance to every
  # document (itself included).
  indices.each do |row|
    print self[row].filename
    indices.each { |column| print ", #{resemblance(row, column)}" }
    puts
  end
end
|
332
|
+
|
333
|
+
private
|
334
|
+
def rm_cwd(dir)
  # Returns +dir+ relative to the current working directory:
  # * "" when +dir+ is the working directory itself (previously nil, which
  #   made callers that concatenate the result fail);
  # * the remainder when +dir+ lies below the working directory;
  # * +dir+ unchanged when it lies elsewhere (previously an arbitrary number
  #   of leading characters was cut off).
  cwd = Dir.pwd
  return "" if dir == cwd

  prefix = cwd.end_with?("/") ? cwd : "#{cwd}/"
  dir.start_with?(prefix) ? dir[prefix.length..-1] : dir
end
|
337
|
+
|
338
|
+
private
|
339
|
+
def format_file(file)
  # Returns an HTML fragment for a table cell: the file's directory as given
  # by rm_cwd, a slash, and the file's base name in bold.
  # Both parts are HTML-escaped so that characters such as '<' or '&' in a
  # file name are displayed literally instead of being read as markup.
  # to_s keeps a nil from rm_cwd from raising.
  directory = ERB::Util.html_escape(rm_cwd(File.dirname(file)).to_s)
  "#{directory}/<b>#{ERB::Util.html_escape(File.basename(file))}</b>"
end
|
342
|
+
|
343
|
+
|
344
|
+
private
|
345
|
+
def write_xml_document(out, doc_1, doc_2)
  # Writes one <document> element for doc_1 to +out+: its pathname, trigram
  # count and containment in doc_2, followed by its text split into CDATA
  # blocks marked "copied" (text covered by trigrams that
  # IsMatchingTrigram reports for doc_1/doc_2) or "normal".
  source_document = self[doc_1]

  # -- output header
  out.puts "<document>"
  # The pathname is escaped so that '&' or '<' in it keeps the XML well formed.
  out.puts "<source>#{ERB::Util.html_escape(source_document.pathname)}</source>"
  out.puts "<num-trigrams>#{trigram_count(doc_1)}</num-trigrams>"
  out.puts "<containment>#{containment(doc_1, doc_2)}</containment>"
  out.puts "<text>"

  # -- output document itself
  # File.read never treats a name starting with "|" as a command, unlike
  # IO.readlines.
  source_text = File.read(source_document.pathname)
  source_document.StartInput(@ferret.GetTokenSet)
  begin
    last_written = 0     # offset into source_text up to which text has been emitted
    inside_block = false # true while the open block is a "copied" block
    while source_document.ReadTrigram(@ferret.GetTokenSet)
      if @ferret.IsMatchingTrigram(
          source_document.GetToken(0),
          source_document.GetToken(1),
          source_document.GetToken(2),
          doc_1,
          doc_2
      )
        unless inside_block
          if last_written > 0
            out.print "]]></block>" # end the last block
          end
          out.print "<block text=\"copied\"><![CDATA[" # start copied block
          inside_block = true
        end
        out.print source_text[last_written, source_document.GetTrigramEnd - last_written]
        last_written = source_document.GetTrigramEnd
      else
        if last_written < source_document.GetTrigramStart(1)
          if inside_block || last_written.zero? # moving from inside block to not
            if last_written > 0
              out.print "]]></block>" # end the last block
            end
            out.print "<block text=\"normal\"><![CDATA[" # start normal block
            inside_block = false
          end
          out.print source_text[last_written, source_document.GetTrigramStart(1) - last_written]
          last_written = source_document.GetTrigramStart(1)
        end
      end
    end
    if last_written < source_text.length
      if inside_block
        out.print "]]></block>" # end the last block
        inside_block = false
        out.print "<block text=\"normal\"><![CDATA[" # start normal block
      end
      out.print source_text[last_written..-1] # finish printing whole of source
    end
    unless last_written.zero? # i.e. a block has been opened
      out.print "]]></block>" # end the last block
    end
    # -- output footer
    out.puts "</text>"
    out.puts "</document>"
  ensure
    # -- close up document, even when writing fails part-way through
    source_document.CloseInput
  end
end
|
406
|
+
|
407
|
+
private
|
408
|
+
def check_index(index)
  # Valid positions are 0 up to, but not including, the document count.
  return if index >= 0 && index < @ferret.Size

  raise IndexError, "Index #{index} not in range [0, #{@ferret.Size})"
end
|
413
|
+
|
414
|
+
def check_ferret_has_run(method)
  # Nothing to do once results are available; otherwise name the caller
  # in the error.
  return if @ferret_run

  raise ArgumentError, "UHFerret must be 'run' before #{method} can be calculated."
end
|
419
|
+
end
|
420
|
+
|
421
|
+
# Extend the native class with some convenience methods.
|
422
|
+
class Uhferret_lib::Document

  # Base name (directory part removed) of this document's pathname.
  def filename
    File.basename(pathname)
  end

  # Pathname of this document, as reported by GetPathname.
  def pathname
    GetPathname()
  end

  # Group id of this document, as reported by GetGroupId.
  def group_id
    GetGroupId()
  end
end
|
439
|
+
|
440
|
+
end
|
441
|
+
|
data/lib/utils.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#--
|
2
|
+
# This file is part of uhferret.
|
3
|
+
#
|
4
|
+
# Author:: Peter Lane
|
5
|
+
# Copyright:: Copyright 2012-20, Peter Lane.
|
6
|
+
# License:: GPLv3
|
7
|
+
#
|
8
|
+
# uhferret is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# uhferret is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
# TODO: Make the conversions etc work on Windows as well as Linux.
|
22
|
+
|
23
|
+
#
|
24
|
+
# A collection of methods to support checking and converting different
|
25
|
+
# document file types.
|
26
|
+
#
|
27
|
+
module Utils

  # Check if given command is present on the system.
  #
  # Looks for an executable file called +command+ in each directory of PATH
  # (also trying the suffixes listed in PATHEXT, when that variable is set).
  # No shell and no external `which` program is involved, so the argument
  # is never interpreted as a command line.
  def Utils.command_present?(command)
    suffixes = [""] + ENV.fetch("PATHEXT", "").split(";")
    directories = ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).reject(&:empty?)

    directories.any? do |directory|
      suffixes.any? do |suffix|
        candidate = File.join(directory, "#{command}#{suffix}")
        File.file?(candidate) && File.executable?(candidate)
      end
    end
  end

  # Create a list of permitted compressed file extensions
  # depending on the available commands
  CompressedFileExtensions = []
  [["unrar", ["rar"]],
   ["tar", ["tar.bz2", "tar.gz", "tbz2", "tgz"]],
   ["unzip", ["zip"]]].each do |command, extensions|
    CompressedFileExtensions.concat(extensions) if Utils.command_present?(command)
  end

  # Return true if the filename has a file ending for code
  # (.c, .h, .cpp or .java, in any letter case).
  def Utils.is_code?(filename)
    [".c", ".h", ".cpp", ".java"].include?(File.extname(filename).downcase)
  end

  # Return true if the filename has a valid extension: code, .txt, pdf or
  # a known word processor format (in any letter case).
  def Utils.valid_document?(filename)
    Utils.is_code?(filename) ||
      File.extname(filename).downcase == ".txt" ||
      Utils.is_pdf_document?(filename) ||
      Utils.is_wp_document?(filename)
  end

  # Return true if the filename ends with .pdf (in any letter case) and so
  # is a pdf document.
  def Utils.is_pdf_document?(filename)
    File.extname(filename).downcase == ".pdf"
  end

  # Return true if the filename ends with a known word processor extension
  # (.doc, .rtf, .docx or .abw, in any letter case).
  def Utils.is_wp_document?(filename)
    [".doc", ".rtf", ".docx", ".abw"].include?(File.extname(filename).downcase)
  end

  # Use pdftotext to convert the pdf file to text.
  # The output is the converted filename, obtained by adding .txt to
  # the given filename. If pdftotext is not available, the given filename
  # is returned unchanged. The result of the conversion itself is not checked.
  def Utils.convert_pdf_document(filename)
    return filename unless Utils.command_present?("pdftotext")

    output_filename = "#{filename}.txt"
    # Argument-list form of system: the file names are handed to pdftotext
    # as they are, without a shell, so spaces or shell metacharacters in a
    # name are harmless. Standard output of the tool is discarded.
    system("pdftotext", "-layout", "-enc", "Latin1", "-nopgbrk",
           filename, output_filename, out: File::NULL)
    output_filename
  end

  # Use abiword to convert the word-processed file to text.
  # The output is the converted filename, obtained by adding .txt to
  # the given filename. If abiword is not available, the given filename
  # is returned unchanged. The result of the conversion itself is not checked.
  def Utils.convert_wp_document(filename)
    return filename unless Utils.command_present?("abiword")

    output_filename = "#{filename}.txt"
    # See convert_pdf_document: no shell is involved.
    system("abiword", "--to=txt", filename, "-o", output_filename, out: File::NULL)
    output_filename
  end
end
|