uhferret 1.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,441 @@
1
+ # This file is part of uhferret.
2
+ #
3
+ # Author:: Peter Lane
4
+ # Copyright:: Copyright 2011-2020, Peter Lane.
5
+ # License:: GPLv3
6
+ #
7
+ # uhferret is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation, either version 3 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # uhferret is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
19
+
20
require 'uri'

require 'uhferret_lib'
require 'utils'
22
+
23
+ module UHFerret
24
+
25
# Document type marker for natural-language documents.
TextDocument = Uhferret_lib::Document::TypeText

# Document type marker for computer programs.
CodeDocument = Uhferret_lib::Document::TypeCode
30
+
31
# UHFerret::Ferret holds a reference to a list of documents, and
# provides methods to manage this list of documents, compute and
# retrieve similarities between documents.
class Ferret

  # Constructs an instance of Ferret.
  # block:: optional block, evaluated in the context of the new instance,
  #         used to add files etc during construction.
  #
  # The bookkeeping state is set up *before* the block is evaluated, so a
  # call to #run made inside the block is not discarded afterwards.
  def initialize(&block)
    @ferret = Uhferret_lib::DocumentList.new
    @ferret_run = false
    @sorted_pairs = []
    instance_eval(&block) if block
  end

  # Add given filename to list of documents.
  # The type of document can be given as:
  # * UHFerret::TextDocument, for natural language documents
  # * UHFerret::CodeDocument, for c-style computer programs
  # Optional third argument specifies the group_id for this document;
  # an id of 0 requests a fresh group id from the document list.
  # The group_id can be used to suppress comparisons in some kinds
  # of output.
  # - If a pdf or word-processed document is added, it must first
  #   be converted to text. Ferret tries to do this, attaching .txt
  #   to the end of the filename.
  def add(filename, type = TextDocument, id = 0)
    if Utils.is_pdf_document?(filename)
      filename = Utils.convert_pdf_document(filename)
    elsif Utils.is_wp_document?(filename)
      filename = Utils.convert_wp_document(filename)
    end
    group_id = id.zero? ? @ferret.GetNewGroupId : id
    @ferret.AddDocument(filename, type, group_id)
    @ferret_run = false
  end

  # Add list of files specified in given filename, one path per line.
  # Lines reading "START GROUP" / "END GROUP" (any case) bracket files
  # that share one group id; unreadable paths are skipped.
  # The type of documents can be given as:
  # * UHFerret::TextDocument, for natural language documents
  # * UHFerret::CodeDocument, for c-style computer programs
  def add_list_from_file(filename, type = TextDocument)
    group_id = 0 # 0 makes #add allocate a new group for each document

    # File.foreach (unlike IO.foreach) never treats a leading "|" as a command.
    File.foreach(filename) do |raw_line|
      entry = raw_line.strip
      case entry.upcase
      when 'START GROUP'
        group_id = @ferret.GetNewGroupId
      when 'END GROUP'
        group_id = 0
      else
        add(entry, type, group_id) if File.readable?(entry)
      end
    end

    @ferret_run = false
  end

  # Run ferret on the current document list.
  # You must run ferret before retrieving measures of containment or resemblance.
  #
  # Raises an ArgumentError if there are not at least two documents in the document
  # list.
  def run
    raise ArgumentError, 'UHFerret needs at least two documents to run' if @ferret.Size < 2

    @ferret.RunFerret
    @ferret_run = true
    @sorted_pairs = [] # drop any ordering cached from a previous run
  end

  # Return document in document list at given index position.
  #
  # Raises an IndexError if index is not valid.
  def [](index)
    check_index index

    @ferret.getDocument(index)
  end

  # Apply provided block to each document in the document list.
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    @ferret.Size.times { |i| yield @ferret.getDocument(i) }
  end

  # Return the number of documents in the document list.
  def size
    @ferret.Size
  end

  # Return the number of pairs of documents compared.
  def num_pairs
    @ferret.NumberOfPairs
  end

  # Apply provided block to each pair of compared document indices,
  # in descending order of resemblance. The ordering is computed once
  # per run and cached. Returns an Enumerator when called without a block.
  #
  # Raises an ArgumentError if ferret has not been 'run' before.
  def each_pair
    check_ferret_has_run :each_pair
    return enum_for(:each_pair) unless block_given?

    if @sorted_pairs.empty?
      # every pair [i, j] with i < j; each resemblance is computed once
      all_pairs = (0...@ferret.Size).to_a.combination(2)
      @sorted_pairs = all_pairs.sort_by do |i, j|
        score = @ferret.ComputeResemblance(i, j)
        -score
      end
    end

    @sorted_pairs.each { |i, j| yield(i, j) }
  end

  # Return the containment of doc_1 in doc_2.
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def containment(doc_1, doc_2)
    check_ferret_has_run :containment
    check_index doc_1
    check_index doc_2

    @ferret.ComputeContainment(doc_1, doc_2)
  end

  # Return the resemblance of doc_1 and doc_2.
  # A document always has resemblance 1.0 with itself; otherwise the
  # indices are passed to the document list in ascending order.
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def resemblance(doc_1, doc_2)
    check_ferret_has_run :resemblance
    check_index doc_1
    check_index doc_2
    return 1.0 if doc_1 == doc_2

    lower, upper = [doc_1, doc_2].minmax
    @ferret.ComputeResemblance(lower, upper)
  end

  # Return the number of trigrams in given document index.
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document index is not valid.
  def trigram_count(index)
    check_ferret_has_run :trigram_count
    check_index index

    @ferret.CountTrigrams(index)
  end

  # Return the total number of distinct trigrams in set of documents.
  #
  # Raises an ArgumentError if ferret has not been 'run' before calling.
  def distinct_trigrams_count
    check_ferret_has_run :distinct_trigrams_count

    @ferret.GetTotalTrigramCount
  end

  # Return the number of matching trigrams in given two document indices.
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def trigram_matches(doc_1, doc_2)
    check_ferret_has_run :trigram_matches
    check_index doc_1
    check_index doc_2

    @ferret.CountMatches(doc_1, doc_2)
  end

  # Write an XML report of the given two document indices into given filename.
  #
  # Raises an ArgumentError if ferret has not been 'run' before, and
  # an IndexError if the document indices are not valid.
  def xml_output(output_file, doc_1, doc_2)
    check_ferret_has_run :xml_output
    check_index doc_1
    check_index doc_2

    File.open(output_file, 'w') do |file|
      file.puts '<?xml version="1.0" encoding="ISO-8859-1"?>'
      file.puts '<?xml-stylesheet type="text/xsl" href="uhferret.xsl" ?>'
      file.puts '<uhferret>'

      file.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
      file.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"
      write_xml_document(file, doc_1, doc_2)
      write_xml_document(file, doc_2, doc_1)

      file.puts '</uhferret>'
    end
  end

  # Displays each pair of documents from different groups, sorted in
  # order of similarity, one " ; "-separated line per pair.
  # full_path:: when true, documents are named by pathname instead of filename.
  def output_similarity_table(full_path = false)
    puts "Number of documents: #{size}"
    puts "Number of distinct trigrams: #{distinct_trigrams_count}"
    name_reader = full_path ? :pathname : :filename

    each_pair do |i, j|
      next if self[i].group_id == self[j].group_id

      fields = [
        self[i].public_send(name_reader),
        self[j].public_send(name_reader),
        trigram_matches(i, j),
        trigram_count(i),
        trigram_count(j),
        resemblance(i, j)
      ]
      puts fields.join(' ; ')
    end
  end

  # Outputs similarity table as a html page, sorted in order of similarity.
  # Pairs of documents within the same group are left out.
  # max_rows:: optional limit on the number of rows. When omitted, a
  #            MAX_TABLE_SIZE constant is used if one is visible,
  #            otherwise the table is not truncated.
  #
  # File names are HTML-escaped and link parameters URL-encoded, as they
  # may come from uploaded files.
  def output_html_similarity_table(max_rows = nil)
    max_rows ||= default_table_size

    puts '<html><body>',
         '<h1>Ferret: Table of Comparisons</h1>',
         '<p>Return to <a href="/ferret/home">Ferret home page</a>.</p>',
         '<table border=1><tbody><tr><th>Index</th><th>Document 1</th>' \
         '<th>Document 2</th><th>Similarity</th><th>View</th></tr>'

    row = 0
    each_pair do |i, j|
      next if self[i].group_id == self[j].group_id

      row += 1
      break if row > max_rows

      first = self[i].pathname
      second = self[j].pathname
      link = "/ferret/report?upload=#{url_escape(Dir.pwd)}" \
             "&amp;file1=#{url_escape(first)}&amp;file2=#{url_escape(second)}"

      puts '<tr>',
           "<td> #{row} </td>",
           "<td> #{format_file(first)} </td>",
           "<td> #{format_file(second)} </td>",
           "<td> #{format('%0.3f', resemblance(i, j))} </td>",
           "<td><a href=\"#{link}\" target=\"_blank\">View</a></td>",
           '</tr>'
    end
    puts '</tbody></table>'

    puts '<hr>',
         '<p>Return to <a href="/ferret/home">Ferret home page.</a>',
         '<hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>',
         '</body></html>'
  end

  # Outputs a list of trigrams with the document indices in which they
  # appear, indices are space separated.
  # Errors raised while walking the tuple set are reported on standard
  # output rather than propagated.
  def output_trigram_list
    tuples = @ferret.GetTupleSet
    tuples.Begin
    while tuples.HasMore
      trigram = @ferret.MakeTrigramString(tuples.GetToken(0),
                                          tuples.GetToken(1),
                                          tuples.GetToken(2))
      doc_indices = tuples.GetDocumentsForCurrentTuple
      # indexed access only: the returned container comes from the native library
      listing = (0...doc_indices.size).map { |k| "#{doc_indices[k]} " }.join
      puts "#{trigram} FILES:[ #{listing} ]"
      tuples.GetNext
    end
  rescue StandardError => e # not Exception: let signals and exits through
    puts "Error in writing trigram list: #{e}"
  end

  # Outputs a table of all comparisons, suitable for loading into a spreadsheet.
  # The first line holds the file names; each following line holds one file
  # name and its resemblance to every document.
  def output_all_comparisons
    names = (0...size).map { |i| self[i].filename }

    # -- output headings
    puts names.map { |name| ", #{name}" }.join
    # -- output comparisons
    names.each_with_index do |name, i|
      scores = (0...size).map { |j| ", #{resemblance(i, j)}" }.join
      puts "#{name}#{scores}"
    end
  end

  private

  # Return dir relative to the current working directory: '' for the
  # working directory itself (or '.'), and dir unchanged when it does
  # not lie inside the working directory.
  def rm_cwd(dir)
    base = Dir.pwd.chomp(File::SEPARATOR)
    return '' if dir == '.' || dir == Dir.pwd || dir == base

    prefix = base + File::SEPARATOR
    dir.start_with?(prefix) ? dir[prefix.length..-1] : dir
  end

  # Return an HTML fragment for file: its directory relative to the
  # working directory followed by the base name in bold.
  def format_file(file)
    directory = rm_cwd(File.dirname(file))
    label = "<b>#{xml_escape(File.basename(file))}</b>"
    directory.empty? ? label : "#{xml_escape(directory)}/#{label}"
  end

  # Row limit for the html table when the caller gives none: a visible
  # MAX_TABLE_SIZE constant if there is one, otherwise unlimited.
  def default_table_size
    if defined?(MAX_TABLE_SIZE)
      MAX_TABLE_SIZE
    else
      Float::INFINITY
    end
  end

  # Escape the characters that are special in XML/HTML text and attributes.
  def xml_escape(text)
    text.to_s
        .gsub('&', '&amp;')
        .gsub('<', '&lt;')
        .gsub('>', '&gt;')
        .gsub('"', '&quot;')
        .gsub("'", '&#39;')
  end

  # Make text safe inside a CDATA section by splitting any "]]>" terminator.
  def cdata_escape(text)
    text.include?(']]>') ? text.gsub(']]>', ']]]]><![CDATA[>') : text
  end

  # Encode text for use as a URL query value.
  def url_escape(text)
    URI.encode_www_form_component(text.to_s)
  end

  # Write the <document> element describing doc_1, with its text split
  # into "copied" blocks (trigrams shared with doc_2) and "normal" blocks.
  def write_xml_document(out, doc_1, doc_2)
    source_document = self[doc_1]
    # -- output header
    out.puts '<document>'
    out.puts "<source>#{xml_escape(source_document.pathname)}</source>"
    out.puts "<num-trigrams>#{trigram_count(doc_1)}</num-trigrams>"
    out.puts "<containment>#{containment(doc_1, doc_2)}</containment>"
    out.puts '<text>'
    # -- output document itself
    text_blocks(source_document, doc_1, doc_2).each do |kind, text|
      out.print "<block text=\"#{kind}\"><![CDATA[#{cdata_escape(text)}]]></block>"
    end
    # -- output footer
    out.puts '</text>'
    out.puts '</document>'
  end

  # Split the text of source_document into consecutive [kind, text] pairs,
  # kind being "copied" or "normal". Together the pairs cover the whole file.
  #
  # NOTE(review): the trigram offsets come from the native library and are
  # used here as character indices, as before — confirm they are not byte
  # offsets when documents contain multi-byte characters.
  def text_blocks(source_document, doc_1, doc_2)
    source_text = File.read(source_document.pathname)
    blocks = []
    last_written = 0

    source_document.StartInput(@ferret.GetTokenSet)
    begin
      while source_document.ReadTrigram(@ferret.GetTokenSet)
        matching = @ferret.IsMatchingTrigram(
          source_document.GetToken(0),
          source_document.GetToken(1),
          source_document.GetToken(2),
          doc_1,
          doc_2
        )
        # a matching trigram is copied up to its end; otherwise only the
        # text before its second token is known to be unmatched
        upto = matching ? source_document.GetTrigramEnd : source_document.GetTrigramStart(1)
        next unless matching || last_written < upto

        append_block_text(blocks, matching ? 'copied' : 'normal',
                          source_text[last_written, upto - last_written])
        last_written = upto
      end
    ensure
      # -- close up document, even if reading failed
      source_document.CloseInput
    end

    # finish with whatever follows the last trigram; this also covers
    # documents too short to contain any trigram
    if last_written < source_text.length
      append_block_text(blocks, 'normal', source_text[last_written..-1])
    end
    blocks
  end

  # Append text to the last block if it has the given kind, otherwise
  # start a new block of that kind.
  def append_block_text(blocks, kind, text)
    blocks << [kind, ''.dup] if blocks.empty? || blocks.last.first != kind
    blocks.last.last << text.to_s
  end

  # Raise an IndexError unless index refers to a document in the list.
  def check_index(index)
    return if index >= 0 && index < @ferret.Size

    raise IndexError, "Index #{index} not in range [0, #{@ferret.Size})"
  end

  # Raise an ArgumentError naming method unless ferret has been run
  # since the document list last changed.
  def check_ferret_has_run(method)
    return if @ferret_run

    raise ArgumentError, "UHFerret must be 'run' before #{method} can be calculated."
  end
end
420
+
421
# Convenience readers added to the native document class.
class Uhferret_lib::Document

  # Return the filename for this document: the last component of its pathname.
  def filename
    File.basename(GetPathname())
  end

  # Return the full pathname for this document.
  def pathname
    GetPathname()
  end

  # Return the id for this document.
  def group_id
    GetGroupId()
  end
end
439
+
440
+ end
441
+
@@ -0,0 +1,93 @@
1
+ #--
2
+ # This file is part of uhferret.
3
+ #
4
+ # Author:: Peter Lane
5
+ # Copyright:: Copyright 2012-20, Peter Lane.
6
+ # License:: GPLv3
7
+ #
8
+ # uhferret is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # uhferret is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ # TODO: Make the conversions etc work on Windows as well as Linux.
22
+
23
#
# A collection of methods to support checking and converting different
# document file types.
#
module Utils

  # File extensions treated as program code.
  CODE_EXTENSIONS = %w[.c .h .cpp .java].freeze

  # File extensions treated as word-processed documents.
  WORD_PROCESSOR_EXTENSIONS = %w[.doc .rtf .docx .abw].freeze

  # Check if given command is present on the system.
  #
  # The directories of PATH are searched directly, so no shell is started
  # and the command name is never interpreted. A name containing a
  # directory separator is checked as a path in its own right.
  def self.command_present?(command)
    name = command.to_s
    return false if name.strip.empty?

    # PATHEXT lists executable suffixes on Windows; elsewhere only '' applies
    suffixes = [''] + ENV.fetch('PATHEXT', '').split(';')
    directories =
      if name.include?(File::SEPARATOR)
        [nil]
      else
        ENV.fetch('PATH', '').split(File::PATH_SEPARATOR)
      end

    directories.any? do |dir|
      suffixes.any? do |suffix|
        candidate = name + suffix
        # an empty PATH entry stands for the current directory
        candidate = File.join(dir.empty? ? '.' : dir, candidate) if dir
        File.file?(candidate) && File.executable?(candidate)
      end
    end
  end

  # Create a list of permitted compressed file extensions
  # depending on the available commands
  CompressedFileExtensions = [
    ['unrar', %w[rar]],
    ['tar', %w[tar.bz2 tar.gz tbz2 tgz]],
    ['unzip', %w[zip]]
  ].select { |command, _extensions| command_present?(command) }
   .flat_map { |_command, extensions| extensions }
   .freeze

  # Return true if the filename has a file ending for code
  # (compared without regard to case).
  def self.is_code?(filename)
    CODE_EXTENSIONS.include?(extension_of(filename))
  end

  # Return true if the filename has a valid extension:
  # code, .txt, pdf or a known word processor format.
  def self.valid_document?(filename)
    is_code?(filename) ||
      extension_of(filename) == '.txt' ||
      is_pdf_document?(filename) ||
      is_wp_document?(filename)
  end

  # Return true if the filename ends with .pdf and so is a pdf document
  # (compared without regard to case).
  def self.is_pdf_document?(filename)
    extension_of(filename) == '.pdf'
  end

  # Return true if the filename ends with a known word processor extension
  # (compared without regard to case).
  def self.is_wp_document?(filename)
    WORD_PROCESSOR_EXTENSIONS.include?(extension_of(filename))
  end

  # Use pdftotext to convert the pdf file to text
  # The output is the converted filename, obtained by adding .txt to
  # the given filename. If pdftotext is not available, the given
  # filename is returned unchanged.
  def self.convert_pdf_document(filename)
    run_converter('pdftotext', filename) do |output_filename|
      ['-layout', '-enc', 'Latin1', '-nopgbrk', filename, output_filename]
    end
  end

  # Use abiword to convert the word-processed file to text
  # The output is the converted filename, obtained by adding .txt to
  # the given filename. If abiword is not available, the given
  # filename is returned unchanged.
  def self.convert_wp_document(filename)
    run_converter('abiword', filename) do |output_filename|
      ['--to=txt', filename, '-o', output_filename]
    end
  end

  # Return the lower-cased extension of filename, including the dot.
  def self.extension_of(filename)
    File.extname(filename).downcase
  end
  private_class_method :extension_of

  # Run command, if present, with the arguments the block builds for the
  # output filename, and return that output filename; otherwise return
  # filename. Arguments are passed as a list, so file names containing
  # spaces or shell metacharacters are never interpreted by a shell.
  def self.run_converter(command, filename)
    return filename unless command_present?(command)

    output_filename = "#{filename}.txt"
    # standard output of the converter is read and discarded
    IO.popen([command, *yield(output_filename)], &:read)
    output_filename
  end
  private_class_method :run_converter
end