RubyGems - uhferret - Versions diffs - 1.3.7 - Mend

uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/COPYING.txt +674 -0
data/README.rdoc +79 -0
data/bin/uhferret +129 -0
data/bin/uhferret-server +68 -0
data/ext/document.cpp +231 -0
data/ext/document.h +89 -0
data/ext/documentlist.cpp +229 -0
data/ext/documentlist.h +80 -0
data/ext/extconf.rb +2 -0
data/ext/tokenreader.cpp +196 -0
data/ext/tokenreader.h +85 -0
data/ext/tokenset.cpp +111 -0
data/ext/tokenset.h +73 -0
data/ext/tupleset.cpp +150 -0
data/ext/tupleset.h +92 -0
data/ext/uhferret_lib_wrap.cxx +10726 -0
data/lib/uhferret.rb +441 -0
data/lib/utils.rb +93 -0
data/lib/webferret.rb +246 -0
metadata +71 -0

data/lib/uhferret.rb ADDED

@@ -0,0 +1,441 @@
+# This file is part of uhferret.
+#
+# Author::    Peter Lane
+# Copyright:: Copyright 2011-2020, Peter Lane.
+# License::   GPLv3
+#
+# uhferret is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# uhferret is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+require 'uhferret_lib'
+require 'utils'
+module UHFerret
+  # Constant to indicate document is a natural-language document.
+  TextDocument = Uhferret_lib::Document::TypeText
+  # Constant to indicate document is a computer program.
+  CodeDocument = Uhferret_lib::Document::TypeCode
+  # UHFerret::Ferret holds a reference to a list of documents, and
+  # provides methods to manage this list of documents, compute and
+  # retrieve similarities between documents.
+  class Ferret
+    # Constructs an instance of Ferret.
+    # block:: optional block is used to add files etc during construction.
+    def initialize &block
+      @ferret = Uhferret_lib::DocumentList.new
+      self.instance_eval(&block) if block_given?
+      @ferret_run = false
+    end
+    # Add given filename to list of documents.
+    # The type of document can be given as:
+    # * UHFerret::TextDocument, for natural language documents
+    # * UHFerret::CodeDocument, for c-style computer programs
+    # Option third argument specifies the group_id for this document.
+    # The group_id can be used to suppress comparisons in some kinds
+    # of output.
+    # - If a pdf or word-processed document is added, it must first
+    #   be converted to text.  Ferret tries to do this, attaching .txt
+    #   to the end of the filename.
+    def add(filename, type = TextDocument, id = 0)
+      if Utils.is_pdf_document?(filename)
+        filename = Utils.convert_pdf_document filename
+      elsif Utils.is_wp_document?(filename)
+        filename = Utils.convert_wp_document filename
+      end
+      @ferret.AddDocument(filename, type, (id.zero? ? @ferret.GetNewGroupId : id))
+      @ferret_run = false
+    end
+    # Add list of files specified in given filename
+    # The type of documents can be given as:
+    # * UHFerret::TextDocument, for natural language documents
+    # * UHFerret::CodeDocument, for c-style computer programs
+    def add_list_from_file(filename, type = TextDocument)
+      within_group = false
+      current_id = 0
+      IO.foreach(filename) do |line|
+        line.strip!
+        if line.upcase == "START GROUP"
+          within_group = true
+          current_id = @ferret.GetNewGroupId
+        elsif line.upcase == "END GROUP"
+          within_group = false
+        elsif File.readable? line
+          add(line, type, (within_group ? current_id : 0))
+        end
+      end
+      @ferret_run = false
+    end
+    # Run ferret on the current document list.
+    # You must run ferret before retrieving measures of containment or resemblance.
+    #
+    # Raises an ArgumentError if there are not at least two documents in the document
+    # list.
+    def run
+      if @ferret.Size >= 2
+        @ferret.RunFerret
+        @ferret_run = true
+        @sorted_pairs = []
+      else
+        raise ArgumentError.new("UHFerret needs at least two documents to run")
+      end
+    end
+    # Return document in document list at given index position.
+    #
+    # Raises an IndexError if index is not valid.
+    def [](index)
+      check_index index
+      @ferret.getDocument index
+    end
+    # Apply provided block to each document in the document list.
+    def each
+      @ferret.Size.times do |i|
+        yield @ferret.getDocument(i)
+      end
+    end
+    # Return the number of documents in the document list, as reported by
+    # the underlying native document list.
+    def size
+      @ferret.Size
+    end
+    # Return the number of pairs of documents compared, as reported by the
+    # underlying native document list.
+    def num_pairs
+      @ferret.NumberOfPairs
+    end
+    # Apply provided block to each pair of compared document indices,
+    # in descending order of resemblance.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before.
+    def each_pair
+      check_ferret_has_run :each_pair
+      if @sorted_pairs == []
+        # extract all valid document pairs
+        @ferret.Size.times do |i|
+          (i+1).upto(@ferret.Size-1) do |j|
+            @sorted_pairs << [i, j]
+          end
+        end
+        # sort into descending order of resemblance
+        @sorted_pairs.sort! do |pair_a, pair_b|
+          @ferret.ComputeResemblance(pair_b[0], pair_b[1]) <=>
+          @ferret.ComputeResemblance(pair_a[0], pair_a[1])
+        end
+      end
+      # apply block to each pair in sorted order
+      @sorted_pairs.each do |pair|
+        yield(pair[0], pair[1])
+      end
+    end
+    # Return the containment of doc_1 in doc_2.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before, and
+    # an IndexError if the document indices are not valid.
+    def containment(doc_1, doc_2)
+      check_ferret_has_run :containment
+      check_index doc_1
+      check_index doc_2
+      @ferret.ComputeContainment(doc_1, doc_2)
+    end
+    # Return the resemblance of doc_1 and doc_2.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before, and
+    # an IndexError if the document indices are not valid.
+    def resemblance(doc_1, doc_2)
+      check_ferret_has_run :resemblance
+      check_index doc_1
+      check_index doc_2
+      if doc_1 == doc_2
+        return 1.0
+      else
+        @ferret.ComputeResemblance([doc_1, doc_2].min, [doc_1, doc_2].max)
+      end
+    end
+    # Return the number of trigrams in given document index.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before, and
+    # an IndexError if the document index is not valid.
+    def trigram_count index
+      check_ferret_has_run :trigram_count
+      check_index index
+      @ferret.CountTrigrams index
+    end
+    # Return the total number of distinct trigrams in set of documents.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before calling.
+    def distinct_trigrams_count
+      check_ferret_has_run :distinct_trigrams_count
+      @ferret.GetTotalTrigramCount
+    end
+    # Return the number of matching trigrams in given two document indices.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before, and
+    # an IndexError if the document indices are not valid.
+    def trigram_matches(doc_1, doc_2)
+      check_ferret_has_run :trigram_matches
+      check_index doc_1
+      check_index doc_2
+      @ferret.CountMatches(doc_1, doc_2)
+    end
+    # Write an XML report of the given two document indices into given filename.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before, and
+    # an IndexError if the document indices are not valid.
+    def xml_output(output_file, doc_1, doc_2)
+      check_ferret_has_run :xml_output
+      check_index doc_1
+      check_index doc_2
+      File.open(output_file, "w") do |file|
+        file.puts "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>"
+        file.puts "<?xml-stylesheet type=\"text/xsl\" href=\"uhferret.xsl\" ?>"
+        file.puts "<uhferret>"
+        file.puts "<common-trigrams>#{trigram_matches(doc_1, doc_2)}</common-trigrams>"
+        file.puts "<similarity>#{resemblance(doc_1, doc_2)}</similarity>"
+        write_xml_document(file, doc_1, doc_2)
+        write_xml_document(file, doc_2, doc_1)
+        file.puts "</uhferret>"
+      end
+    end
+    # Print to standard output the number of documents, the number of
+    # distinct trigrams, and then one line per pair of documents in
+    # descending order of resemblance (the order given by #each_pair).
+    # Pairs whose documents share a group id are left out.
+    #
+    # full_path:: when true each document is shown by its full pathname,
+    #             otherwise by its filename only.
+    #
+    # Each line holds, separated by " ; ": the two document names, the number
+    # of matching trigrams, the trigram count of each document, and the
+    # resemblance.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before.
+    def output_similarity_table(full_path = false)
+      puts "Number of documents: #{size}"
+      puts "Number of distinct trigrams: #{distinct_trigrams_count}"
+      each_pair do |i, j|
+        unless self[i].group_id == self[j].group_id
+          if full_path
+            # The trailing backslashes continue the string literal, so the
+            # leading whitespace of the continuation lines is part of the
+            # printed text.
+            puts "#{self[i].pathname} ; #{self[j].pathname} ; \
+          #{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; \
+          #{resemblance(i, j)}"
+          else
+            # Same layout as above, with filenames instead of pathnames.
+            puts "#{self[i].filename} ; #{self[j].filename} ; \
+          #{trigram_matches(i, j)} ; #{trigram_count(i)} ; #{trigram_count(j)} ; \
+          #{resemblance(i, j)}"
+            end
+        end
+      end
+    end
+    # Print the similarity table to standard output as a html page, with one
+    # table row per pair of documents in descending order of resemblance
+    # (the order given by #each_pair).  Pairs whose documents share a group
+    # id are left out, and output stops once MAX_TABLE_SIZE rows have been
+    # written.
+    #
+    # NOTE(review): MAX_TABLE_SIZE is not defined in this file; it has to be
+    # supplied by whatever code loads this file -- confirm where it is set.
+    # NOTE(review): pathnames and Dir.pwd are interpolated into the markup
+    # and into the report link as-is, without html or url escaping.
+    #
+    # Raises an ArgumentError if ferret has not been 'run' before.
+    def output_html_similarity_table
+      # -- page header and start of table
+      puts <<BODY
+    <html><body>
+    <h1>Ferret: Table of Comparisons</h1>
+    <p>Return to <a href="/ferret/home">Ferret home page</a>.</p>
+    <table border=1><tbody><tr><th>Index</th><th>Document 1</th><th>Document 2</th><th>Similarity</th><th>View</th></tr>
+BODY
+      # idx counts the rows written so far; it is 1-based in the output
+      idx = 0
+      each_pair do |i, j|
+        unless self[i].group_id == self[j].group_id
+          idx += 1
+          break if idx > MAX_TABLE_SIZE
+          # -- one row: index, both documents, resemblance to 3 decimal
+          #    places, and a link to the pairwise report
+          puts <<ROW
+        <tr>
+        <td> #{idx} </td>
+        <td> #{format_file(self[i].pathname)} </td>
+        <td> #{format_file(self[j].pathname)} </td>
+        <td> #{format("%0.3f", resemblance(i, j))} </td>
+        <td><a href="/ferret/report?upload=#{Dir.pwd}&file1=#{self[i].pathname}&file2=#{self[j].pathname}" target="_blank"\>View</a></td>
+        </tr>
+ROW
+        end
+      end
+      # -- end of table
+      # NOTE(review): the "</p>" below has no matching "<p>" opened by this
+      # method.
+      puts "</tbody></table></p>"
+      # -- page footer
+      puts <<TAIL
+    <hr>
+    <p>Return to <a href="/ferret/home">Ferret home page.</a>
+    <hr><font size=-1>Generated by Ferret, Copyright 2012 University of Hertfordshire</font>
+    </body></html>
+TAIL
+    end
+    # Print to standard output one line per trigram in the tuple set: the
+    # trigram itself followed by the space-separated indices of the documents
+    # in which it appears.
+    #
+    # Errors raised while walking the tuple set are reported on standard
+    # output instead of being propagated.  Only StandardError is handled, so
+    # interrupts and exit requests still get through.
+    def output_trigram_list
+      tuples = @ferret.GetTupleSet
+      tuples.Begin
+      while tuples.HasMore
+        print @ferret.MakeTrigramString(tuples.GetToken(0),
+                                        tuples.GetToken(1),
+                                        tuples.GetToken(2))
+        print "  FILES:[ "
+        doc_indices = tuples.GetDocumentsForCurrentTuple
+        doc_indices.size.times do |i|
+          print "#{doc_indices[i]} "
+        end
+        print " ]"
+        puts
+        tuples.GetNext
+      end
+    rescue StandardError => ex
+      puts "Error in writing trigram list: #{ex}"
+    end
+    # outputs a table of all comparisons, suitable for loading into a spreadsheet
+    def output_all_comparisons
+      # -- output headings
+      size.times do |i|
+        print ", #{self[i].filename}"
+      end
+      puts
+      # -- output comparisons
+      size.times do |i|
+        print self[i].filename
+        size.times do |j|
+          print ", #{resemblance(i, j)}"
+        end
+        puts
+      end
+    end
+    private
+    def rm_cwd dir
+      dir[(Dir.pwd.length+1)..-1]
+    end
+    private
+    def format_file file
+      rm_cwd(File.dirname(file)) + "/<b>" + File.basename(file) + "</b>"
+    end
+    private
+    # Write the <document> section of the XML report for document index
+    # doc_1 onto +out+: its pathname, trigram count, containment in doc_2,
+    # and its text split into <block> elements.  Text covered by trigrams
+    # that also occur in doc_2 goes into blocks marked "copied", other text
+    # into blocks marked "normal"; block contents are wrapped in CDATA.
+    #
+    # out::   object the report is written to, using puts and print
+    # doc_1:: index of the document to write out
+    # doc_2:: index of the document it is compared against
+    #
+    # NOTE(review): the positions returned by GetTrigramStart/GetTrigramEnd
+    # are used as character indices into the text read back from the file;
+    # confirm they are not byte offsets when the text has multi-byte
+    # characters.
+    # NOTE(review): the text is copied into CDATA sections unchanged, so a
+    # "]]>" sequence in the source would end the section early.
+    def write_xml_document(out, doc_1, doc_2)
+      # -- output header
+      out.puts "<document>"
+      out.puts "<source>#{self[doc_1].pathname}</source>"
+      out.puts "<num-trigrams>#{self.trigram_count(doc_1)}</num-trigrams>"
+      out.puts "<containment>#{self.containment(doc_1, doc_2)}</containment>"
+      out.puts "<text>"
+      # -- output document itself
+      # whole text of the document, read back from its file
+      source_text = IO.readlines(self[doc_1].pathname).join
+      source_document = self[doc_1]
+      source_document.StartInput(@ferret.GetTokenSet)
+      # last_written:: position in source_text up to which text has been output
+      # inside_block:: true while the open block is a "copied" block
+      last_written = 0
+      inside_block = false
+      while source_document.ReadTrigram(@ferret.GetTokenSet)
+        if @ferret.IsMatchingTrigram(
+            source_document.GetToken(0),
+            source_document.GetToken(1),
+            source_document.GetToken(2),
+            doc_1,
+            doc_2
+        )
+          # trigram is shared with doc_2: output up to the end of the trigram
+          # inside a copied block
+          unless inside_block
+            if last_written > 0
+              out.print "]]></block>" # end the last block
+            end
+            out.print "<block text=\"copied\"><![CDATA[" # start copied block
+            inside_block = true
+          end
+          out.print source_text[last_written, source_document.GetTrigramEnd - last_written]
+          last_written = source_document.GetTrigramEnd
+        else
+          # trigram is not shared: output only the text before
+          # GetTrigramStart(1), and only if it has not been output already
+          if last_written < source_document.GetTrigramStart(1)
+            if inside_block or last_written.zero? # leaving a copied block, or nothing output yet
+              if last_written > 0
+                out.print "]]></block>" # end the last block
+              end
+              out.print "<block text=\"normal\"><![CDATA[" # start normal block
+              inside_block = false
+            end
+            out.print source_text[last_written, source_document.GetTrigramStart(1) - last_written]
+            last_written = source_document.GetTrigramStart(1)
+          end
+        end
+      end
+      # -- output any text left after the last trigram
+      # NOTE(review): if the loop above output nothing (last_written is still
+      # zero), the text below is printed without any <block>/CDATA wrapper.
+      if last_written < source_text.length
+        if inside_block
+          out.print "]]></block>" # end the last block
+          inside_block = false
+          out.print "<block text=\"normal\"><![CDATA[" # start normal block
+        end
+        out.print source_text[last_written..-1] # finish printing whole of source
+      end
+      unless last_written.zero? # i.e. a block was opened by the loop above
+        out.print "]]></block>" # end the last block
+      end
+      # -- output footer
+      out.puts "</text>"
+      out.puts "</document>"
+      # -- close up document
+      source_document.CloseInput
+    end
+    private
+    def check_index index
+      unless index >= 0 and index < @ferret.Size
+        raise IndexError.new("Index #{index} not in range [0, #{@ferret.Size})")
+      end
+    end
+    def check_ferret_has_run method
+      unless @ferret_run
+        raise ArgumentError.new("UHFerret must be 'run' before #{method} can be calculated.")
+      end
+    end
+  end
+  # Extend the native class with some convenience methods.
+  class Uhferret_lib::Document
+    # Return the filename for this document.
+    def filename
+      File.basename(self.GetPathname)
+    end
+    # Return the full pathname for this document.
+    def pathname
+      self.GetPathname
+    end
+    # Return the id for this document.
+    def group_id
+      self.GetGroupId
+    end
+  end
+end

data/lib/utils.rb ADDED

@@ -0,0 +1,93 @@
+#--
+# This file is part of uhferret.
+#
+# Author::    Peter Lane
+# Copyright:: Copyright 2012-20, Peter Lane.
+# License::   GPLv3
+#
+# uhferret is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# uhferret is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+# TODO: Make the conversions etc work on Windows as well as Linux.
+#
+# A collection of methods to support checking and converting different
+# document file types.
+#
+module Utils
+  # Check if given command is present on the system
+  def Utils.command_present? command
+    `which #{command}` != ""
+  end
+  # Create a list of permitted compressed file extensions
+  # depending on the available commands
+  CompressedFileExtensions = []
+  [["unrar", ["rar"]],
+    ["tar",  ["tar.bz2", "tar.gz", "tbz2", "tgz"]],
+    ["unzip", ["zip"]]].each do |defn|
+    if Utils.command_present? defn[0]
+      CompressedFileExtensions.concat defn[1]
+    end
+  end
+  # Return true if the filename has a file ending for code
+  def Utils.is_code? filename
+    [".c", ".h", ".cpp", ".java"].include? File.extname(filename)
+  end
+  # Return true if the filename has a valid extension
+  def Utils.valid_document? filename
+    Utils.is_code? filename or
+    (".txt" == File.extname(filename)) or
+    Utils.is_pdf_document? filename or
+    Utils.is_wp_document? filename
+  end
+  # Return true if the filename ends with .pdf and so is a pdf document.
+  def Utils.is_pdf_document? filename
+    ".pdf" == File.extname(filename)
+  end
+  # Return true if the filename ends with a known word processor extension.
+  def Utils.is_wp_document? filename
+    [".doc", ".rtf", ".docx", ".abw"].include? File.extname(filename)
+  end
+  # Use pdf2txt to convert the pdf file to text
+  # The output is the converted filename, obtained by adding .txt to
+  # the given filename
+  def Utils.convert_pdf_document filename
+    if Utils.command_present?("pdftotext")
+      output_filename = "#{filename}.txt"
+      `pdftotext -layout -enc Latin1 -nopgbrk #{filename} #{output_filename}`
+      return output_filename
+    else
+      return filename
+    end
+  end
+  # Use abiword to convert the word-processed file to text
+  # The output is the converted filename, obtained by adding .txt to
+  # the given filename
+  def Utils.convert_wp_document filename
+    if Utils.command_present?("abiword")
+      output_filename = "#{filename}.txt"
+      `abiword --to=txt #{filename} -o #{output_filename}`
+      return output_filename
+    else
+      return filename
+    end
+  end
+end