uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,441 @@
1
+ # This file is part of uhferret.
2
+ #
3
+ # Author:: Peter Lane
4
+ # Copyright:: Copyright 2011-2020, Peter Lane.
5
+ # License:: GPLv3
6
+ #
7
+ # uhferret is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU General Public License as published by
9
+ # the Free Software Foundation, either version 3 of the License, or
10
+ # (at your option) any later version.
11
+ #
12
+ # uhferret is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU General Public License
18
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
19
+
20
+ require 'uhferret_lib'
21
+ require 'utils'
22
+
23
+ module UHFerret
24
+
25
+ # Constant to indicate document is a natural-language document.
26
+ TextDocument = Uhferret_lib::Document::TypeText
27
+
28
+ # Constant to indicate document is a computer program.
29
+ CodeDocument = Uhferret_lib::Document::TypeCode
30
+
31
+ # UHFerret::Ferret holds a reference to a list of documents, and
32
+ # provides methods to manage this list of documents, compute and
33
+ # retrieve similarities between documents.
34
+ class Ferret
35
+
36
+ # Constructs an instance of Ferret.
37
+ # block:: optional block is used to add files etc during construction.
38
+ def initialize &block
39
+ @ferret = Uhferret_lib::DocumentList.new
40
+ self.instance_eval(&block) if block_given?
41
+ @ferret_run = false
42
+ end
43
+
44
+ # Add given filename to list of documents.
45
+ # The type of document can be given as:
46
+ # * UHFerret::TextDocument, for natural language documents
47
+ # * UHFerret::CodeDocument, for c-style computer programs
48
+ # Option third argument specifies the group_id for this document.
49
+ # The group_id can be used to suppress comparisons in some kinds
50
+ # of output.
51
+ # - If a pdf or word-processed document is added, it must first
52
+ # be converted to text. Ferret tries to do this, attaching .txt
53
+ # to the end of the filename.
54
+ def add(filename, type = TextDocument, id = 0)
55
+ if Utils.is_pdf_document?(filename)
56
+ filename = Utils.convert_pdf_document filename
57
+ elsif Utils.is_wp_document?(filename)
58
+ filename = Utils.convert_wp_document filename
59
+ end
60
+ @ferret.AddDocument(filename, type, (id.zero? ? @ferret.GetNewGroupId : id))
61
+ @ferret_run = false
62
+ end
63
+
64
+ # Add list of files specified in given filename
65
+ # The type of documents can be given as:
66
+ # * UHFerret::TextDocument, for natural language documents
67
+ # * UHFerret::CodeDocument, for c-style computer programs
68
+ def add_list_from_file(filename, type = TextDocument)
69
+ within_group = false
70
+ current_id = 0
71
+
72
+ IO.foreach(filename) do |line|
73
+ line.strip!
74
+ if line.upcase == "START GROUP"
75
+ within_group = true
76
+ current_id = @ferret.GetNewGroupId
77
+ elsif line.upcase == "END GROUP"
78
+ within_group = false
79
+ elsif File.readable? line
80
+ add(line, type, (within_group ? current_id : 0))
81
+ end
82
+ end
83
+
84
+ @ferret_run = false
85
+ end
86
+
87
+ # Run ferret on the current document list.
88
+ # You must run ferret before retrieving measures of containment or resemblance.
89
+ #
90
+ # Raises an ArgumentError if there are not at least two documents in the document
91
+ # list.
92
+ def run
93
+ if @ferret.Size >= 2
94
+ @ferret.RunFerret
95
+ @ferret_run = true
96
+ @sorted_pairs = []
97
+ else
98
+ raise ArgumentError.new("UHFerret needs at least two documents to run")
99
+ end
100
+ end
101
+
102
+ # Return document in document list at given index position.
103
+ #
104
+ # Raises an IndexError if index is not valid.
105
+ def [](index)
106
+ check_index index
107
+
108
+ @ferret.getDocument index
109
+ end
110
+
111
+ # Apply provided block to each document in the document list.
112
+ def each
113
+ @ferret.Size.times do |i|
114
+ yield @ferret.getDocument(i)
115
+ end
116
+ end
117
+
118
+ # Return the number of documents in the document list.
119
+ def size
120
+ @ferret.Size
121
+ end
122
+
123
+ # Return the number of pairs of documents compared.
124
+ def num_pairs
125
+ @ferret.NumberOfPairs
126
+ end
127
+
128
+ # Apply provided block to each pair of compared document indices,
129
+ # in descending order of resemblance.
130
+ #
131
+ # Raises an ArgumentError if ferret has not been 'run' before.
132
+ def each_pair
133
+ check_ferret_has_run :each_pair
134
+
135
+ if @sorted_pairs == []
136
+ # extract all valid document pairs
137
+ @ferret.Size.times do |i|
138
+ (i+1).upto(@ferret.Size-1) do |j|
139
+ @sorted_pairs << [i, j]
140
+ end
141
+ end
142
+ # sort into descending order of resemblance
143
+ @sorted_pairs.sort! do |pair_a, pair_b|
144
+ @ferret.ComputeResemblance(pair_b[0], pair_b[1]) <=>
145
+ @ferret.ComputeResemblance(pair_a[0], pair_a[1])
146
+ end
147
+ end
148
+
149
+ # apply block to each pair in sorted order
150
+ @sorted_pairs.each do |pair|
151
+ yield(pair[0], pair[1])
152
+ end
153
+ end
154
+
155
+ # Return the containment of doc_1 in doc_2.
156
+ #
157
+ # Raises an ArgumentError if ferret has not been 'run' before, and
158
+ # an IndexError if the document indices are not valid.
159
+ def containment(doc_1, doc_2)
160
+ check_ferret_has_run :containment
161
+ check_index doc_1
162
+ check_index doc_2
163
+
164
+ @ferret.ComputeContainment(doc_1, doc_2)
165
+ end
166
+
167
+ # Return the resemblance of doc_1 and doc_2.
168
+ #
169
+ # Raises an ArgumentError if ferret has not been 'run' before, and
170
+ # an IndexError if the document indices are not valid.
171
+ def resemblance(doc_1, doc_2)
172
+ check_ferret_has_run :resemblance
173
+ check_index doc_1
174
+ check_index doc_2
175
+
176
+ if doc_1 == doc_2
177
+ return 1.0
178
+ else
179
+ @ferret.ComputeResemblance([doc_1, doc_2].min, [doc_1, doc_2].max)
180
+ end
181
+ end
182
+
183
+ # Return the number of trigrams in given document index.
184
+ #
185
+ # Raises an ArgumentError if ferret has not been 'run' before, and
186
+ # an IndexError if the document index is not valid.
187
+ def trigram_count index
188
+ check_ferret_has_run :trigram_count
189
+ check_index index
190
+
191
+ @ferret.CountTrigrams index
192
+ end
193
+
194
+ # Return the total number of distinct trigrams in set of documents.
195
+ #
196
+ # Raises an ArgumentError if ferret has not been 'run' before calling.
197
+ def distinct_trigrams_count
198
+ check_ferret_has_run :distinct_trigrams_count
199
+
200
+ @ferret.GetTotalTrigramCount
201
+ end
202
+
203
+ # Return the number of matching trigrams in given two document indices.
204
+ #
205
+ # Raises an ArgumentError if ferret has not been 'run' before, and
206
+ # an IndexError if the document indices are not valid.
207
+ def trigram_matches(doc_1, doc_2)
208
+ check_ferret_has_run :trigram_matches
209
+ check_index doc_1
210
+ check_index doc_2
211
+
212
+ @ferret.CountMatches(doc_1, doc_2)
213
+ end
214
+
215
+ # Write an XML report of the given two document indices into given filename.
216
+ #
217
+ # Raises an ArgumentError if ferret has not been 'run' before, and
218
+ # an IndexError if the document indices are not valid.
219
+ def xml_output(output_file, doc_1, doc_2)
220
+ check_ferret_has_run :xml_output
221
+ check_index doc_1
222
+ check_index doc_2
223
+
224
+ File.open(output_file, "w") do |file|
225
+ file.puts "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
226
+ file.puts "<?xml-stylesheet type=\"text/xsl\" href=\"uhferret.xsl\" ?>"
227
+ file.puts "<uhferret>"
228
+
229
+ file.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
230
+ file.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"
231
+ write_xml_document(file, doc_1, doc_2)
232
+ write_xml_document(file, doc_2, doc_1)
233
+
234
+ file.puts "</uhferret>"
235
+ end
236
+ end
237
+
238
+ # displays each pair of documents, sorted in order of similarity
239
+ def output_similarity_table(full_path = false)
240
+ puts "Number of documents: #{size}"
241
+ puts "Number of distinct trigrams: #{distinct_trigrams_count}"
242
+ each_pair do |i, j|
243
+ unless self[i].group_id == self[j].group_id
244
+ if full_path
245
+ puts "#{self[i].pathname} ; #{self[j].pathname} ; \
246
+ #{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; \
247
+ #{resemblance(i, j)}"
248
+ else
249
+ puts "#{self[i].filename} ; #{self[j].filename} ; \
250
+ #{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; \
251
+ #{resemblance(i, j)}"
252
+ end
253
+ end
254
+ end
255
+ end
256
+
257
+ # outputs similarity table as a html page, sorted in order of similarity
258
+ def output_html_similarity_table
259
+ puts <<BODY
260
+ <html><body>
261
+ <h1>Ferret: Table of Comparisons</h1>
262
+ <p>Return to <a href="/ferret/home">Ferret home page</a>.</p>
263
+ <table border=1><tbody><tr><th>Index</th><th>Document 1</th><th>Document 2</th><th>Similarity</th><th>View</th></tr>
264
+ BODY
265
+ idx = 0
266
+ each_pair do |i, j|
267
+ unless self[i].group_id == self[j].group_id
268
+ idx += 1
269
+ break if idx > MAX_TABLE_SIZE
270
+
271
+ puts <<ROW
272
+ <tr>
273
+ <td> #{idx} </td>
274
+ <td> #{format_file(self[i].pathname)} </td>
275
+ <td> #{format_file(self[j].pathname)} </td>
276
+ <td> #{format("%0.3f", resemblance(i, j))} </td>
277
+ <td><a href="/ferret/report?upload=#{Dir.pwd}&file1=#{self[i].pathname}&file2=#{self[j].pathname}" target="_blank"\>View</a></td>
278
+ </tr>
279
+ ROW
280
+ end
281
+ end
282
+ puts "</tbody></table></p>"
283
+
284
+ puts <<TAIL
285
+ <hr>
286
+ <p>Return to <a href="/ferret/home">Ferret home page.</a>
287
+ <hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>
288
+ </body></html>
289
+ TAIL
290
+ end
291
+
292
+ # outputs a list of trigrams with the document indices in which they
293
+ # appear, indices are space separated
294
+ def output_trigram_list
295
+ begin
296
+ tuples = @ferret.GetTupleSet
297
+ tuples.Begin
298
+ while tuples.HasMore
299
+ print @ferret.MakeTrigramString(tuples.GetToken(0),
300
+ tuples.GetToken(1),
301
+ tuples.GetToken(2))
302
+ print " FILES:[ "
303
+ doc_indices = tuples.GetDocumentsForCurrentTuple
304
+ doc_indices.size.times do |i|
305
+ print "#{doc_indices[i]} "
306
+ end
307
+ print " ]"
308
+ puts
309
+ tuples.GetNext
310
+ end
311
+ rescue Exception => ex
312
+ puts "Error in writing trigram list: #{ex}"
313
+ end
314
+ end
315
+
316
+ # outputs a table of all comparisons, suitable for loading into a spreadsheet
317
+ def output_all_comparisons
318
+ # -- output headings
319
+ size.times do |i|
320
+ print ", #{self[i].filename}"
321
+ end
322
+ puts
323
+ # -- output comparisons
324
+ size.times do |i|
325
+ print self[i].filename
326
+ size.times do |j|
327
+ print ", #{resemblance(i, j)}"
328
+ end
329
+ puts
330
+ end
331
+ end
332
+
333
+ private
334
+ def rm_cwd dir
335
+ dir[(Dir.pwd.length+1)..-1]
336
+ end
337
+
338
+ private
339
+ def format_file file
340
+ rm_cwd(File.dirname(file)) + "/<b>" + File.basename(file) + "</b>"
341
+ end
342
+
343
+
344
+ private
345
+ def write_xml_document(out, doc_1, doc_2)
346
+ # -- output header
347
+ out.puts "<document>"
348
+ out.puts "<source>#{self[doc_1].pathname}</source>"
349
+ out.puts "<num-trigrams>#{self.trigram_count(doc_1)}</num-trigrams>"
350
+ out.puts "<containment>#{self.containment(doc_1, doc_2)}</containment>"
351
+ out.puts "<text>"
352
+ # -- output document itself
353
+ source_text = IO.readlines(self[doc_1].pathname).join
354
+ source_document = self[doc_1]
355
+ source_document.StartInput(@ferret.GetTokenSet)
356
+ last_written = 0
357
+ inside_block = false
358
+ while source_document.ReadTrigram(@ferret.GetTokenSet)
359
+ if @ferret.IsMatchingTrigram(
360
+ source_document.GetToken(0),
361
+ source_document.GetToken(1),
362
+ source_document.GetToken(2),
363
+ doc_1,
364
+ doc_2
365
+ )
366
+ unless inside_block
367
+ if last_written > 0
368
+ out.print "]]></block>" # end the last block
369
+ end
370
+ out.print "<block text=\"copied\"><![CDATA[" # start copied block
371
+ inside_block = true
372
+ end
373
+ out.print source_text[last_written, source_document.GetTrigramEnd - last_written]
374
+ last_written = source_document.GetTrigramEnd
375
+ else
376
+ if last_written < source_document.GetTrigramStart(1)
377
+ if inside_block or last_written.zero? # moving from inside block to not
378
+ if last_written > 0
379
+ out.print "]]></block>" # end the last block
380
+ end
381
+ out.print "<block text=\"normal\"><![CDATA[" # start normal block
382
+ inside_block = false
383
+ end
384
+ out.print source_text[last_written, source_document.GetTrigramStart(1) - last_written]
385
+ last_written = source_document.GetTrigramStart(1)
386
+ end
387
+ end
388
+ end
389
+ if last_written < source_text.length
390
+ if inside_block
391
+ out.print "]]></block>" # end the last block
392
+ inside_block = false
393
+ out.print "<block text=\"normal\"><![CDATA[" # start normal block
394
+ end
395
+ out.print source_text[last_written..-1] # finish printing whole of source
396
+ end
397
+ unless last_written.zero? # i.e. nothing has been written
398
+ out.print "]]></block>" # end the last block
399
+ end
400
+ # -- output footer
401
+ out.puts "</text>"
402
+ out.puts "</document>"
403
+ # -- close up document
404
+ source_document.CloseInput
405
+ end
406
+
407
+ private
408
+ def check_index index
409
+ unless index >= 0 and index < @ferret.Size
410
+ raise IndexError.new("Index #{index} not in range [0, #{@ferret.Size})")
411
+ end
412
+ end
413
+
414
+ def check_ferret_has_run method
415
+ unless @ferret_run
416
+ raise ArgumentError.new("UHFerret must be 'run' before #{method} can be calculated.")
417
+ end
418
+ end
419
+ end
420
+
421
+ # Extend the native class with some convenience methods.
422
+ class Uhferret_lib::Document
423
+
424
+ # Return the filename for this document.
425
+ def filename
426
+ File.basename(self.GetPathname)
427
+ end
428
+
429
+ # Return the full pathname for this document.
430
+ def pathname
431
+ self.GetPathname
432
+ end
433
+
434
+ # Return the id for this document.
435
+ def group_id
436
+ self.GetGroupId
437
+ end
438
+ end
439
+
440
+ end
441
+
@@ -0,0 +1,93 @@
1
+ #--
2
+ # This file is part of uhferret.
3
+ #
4
+ # Author:: Peter Lane
5
+ # Copyright:: Copyright 2012-20, Peter Lane.
6
+ # License:: GPLv3
7
+ #
8
+ # uhferret is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # uhferret is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
20
+
21
+ # TODO: Make the conversions etc work on Windows as well as Linux.
22
+
23
+ #
24
+ # A collection of methods to support checking and converting different
25
+ # document file types.
26
+ #
27
+ module Utils
28
+
29
+ # Check if given command is present on the system
30
+ def Utils.command_present? command
31
+ `which #{command}` != ""
32
+ end
33
+
34
+ # Create a list of permitted compressed file extensions
35
+ # depending on the available commands
36
+ CompressedFileExtensions = []
37
+ [["unrar", ["rar"]],
38
+ ["tar", ["tar.bz2", "tar.gz", "tbz2", "tgz"]],
39
+ ["unzip", ["zip"]]].each do |defn|
40
+ if Utils.command_present? defn[0]
41
+ CompressedFileExtensions.concat defn[1]
42
+ end
43
+ end
44
+
45
+ # Return true if the filename has a file ending for code
46
+ def Utils.is_code? filename
47
+ [".c", ".h", ".cpp", ".java"].include? File.extname(filename)
48
+ end
49
+
50
+ # Return true if the filename has a valid extension
51
+ def Utils.valid_document? filename
52
+ Utils.is_code? filename or
53
+ (".txt" == File.extname(filename)) or
54
+ Utils.is_pdf_document? filename or
55
+ Utils.is_wp_document? filename
56
+ end
57
+
58
+ # Return true if the filename ends with .pdf and so is a pdf document.
59
+ def Utils.is_pdf_document? filename
60
+ ".pdf" == File.extname(filename)
61
+ end
62
+
63
+ # Return true if the filename ends with a known word processor extension.
64
+ def Utils.is_wp_document? filename
65
+ [".doc", ".rtf", ".docx", ".abw"].include? File.extname(filename)
66
+ end
67
+
68
+ # Use pdf2txt to convert the pdf file to text
69
+ # The output is the converted filename, obtained by adding .txt to
70
+ # the given filename
71
+ def Utils.convert_pdf_document filename
72
+ if Utils.command_present?("pdftotext")
73
+ output_filename = "#{filename}.txt"
74
+ `pdftotext -layout -enc Latin1 -nopgbrk #{filename} #{output_filename}`
75
+ return output_filename
76
+ else
77
+ return filename
78
+ end
79
+ end
80
+
81
+ # Use abiword to convert the word-processed file to text
82
+ # The output is the converted filename, obtained by adding .txt to
83
+ # the given filename
84
+ def Utils.convert_wp_document filename
85
+ if Utils.command_present?("abiword")
86
+ output_filename = "#{filename}.txt"
87
+ `abiword --to=txt #{filename} -o #{output_filename}`
88
+ return output_filename
89
+ else
90
+ return filename
91
+ end
92
+ end
93
+ end