RubyGems - shalmaneser-fred - Versions diffs - 1.2.0.rc4 - Mend

shalmaneser-fred 1.2.0.rc4

Files changed (32) hide show

checksums.yaml +7 -0
data/.yardopts +10 -0
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +93 -0
data/bin/fred +16 -0
data/lib/fred/Baseline.rb +150 -0
data/lib/fred/FileZipped.rb +31 -0
data/lib/fred/FredBOWContext.rb +877 -0
data/lib/fred/FredConventions.rb +232 -0
data/lib/fred/FredDetermineTargets.rb +319 -0
data/lib/fred/FredEval.rb +312 -0
data/lib/fred/FredFeatureExtractors.rb +322 -0
data/lib/fred/FredFeatures.rb +1061 -0
data/lib/fred/FredFeaturize.rb +602 -0
data/lib/fred/FredNumTrainingSenses.rb +27 -0
data/lib/fred/FredParameters.rb +402 -0
data/lib/fred/FredSplit.rb +84 -0
data/lib/fred/FredSplitPkg.rb +180 -0
data/lib/fred/FredTest.rb +606 -0
data/lib/fred/FredTrain.rb +144 -0
data/lib/fred/PlotAndREval.rb +480 -0
data/lib/fred/fred.rb +47 -0
data/lib/fred/fred_config_data.rb +185 -0
data/lib/fred/md5.rb +23 -0
data/lib/fred/opt_parser.rb +250 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +58 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +99 -0
data/test/functional/test_rosy.rb +40 -0
metadata +99 -0

@@ -0,0 +1,312 @@
+# FredEval
+# Katrin Erk April 05
+#
+# Frame disambiguation system: evaluate classification results
+#
+# While the other main classes of Fred just provide a new() method
+# and a compute() method,
+# the FredEval class also provides access methods to all the
+# individual evaluation results and allows for a flag that
+# suppresses evaluation output to a file --
+# such that this package can also be used by external systems that
+# wish to evaluate Fred.
+#
+# Inherits from the Eval class that is not Fred-specific
+# Salsa packages
+require "common/Eval"
+require "common/ruby_class_extensions"
+# Fred packages
+require "fred/fred_config_data"
+require "fred/FredConventions"
+require "fred/FredFeatures"
+require "fred/FredDetermineTargets"
+# Evaluation of Fred classification results, on top of the generic Eval class.
+#
+# Supplies Eval with the groups to evaluate (one group per lemma, see
+# each_group) and with the [gold_class, assigned_class] pairs of each group
+# (see each_instance).
+class FredEval < Eval
+  ###
+  # new
+  #
+  # evaluate runtime options, initialize the Eval superclass
+  # and announce the task
+  #
+  # exp_obj: FredConfigData object
+  # options: hash: runtime option name (string) => value (string)
+  #          "--logID" sets the split ID, "--printLog" switches on the log file
+  #
+  # Exits with status 1 if the list of known targets cannot be read.
+  def initialize(exp_obj, options)
+    in_enduser_mode_unavailable()
+    @exp = exp_obj
+    ###
+    # evaluate runtime options
+    @split_id = nil
+    logfilename = nil
+    options.each_pair { |opt, arg|
+      case opt
+      when "--logID"
+        @split_id = arg
+      when "--printLog"
+        logfilename = fred_dirname(@exp, "eval", "log", "new") +
+                      "eval_logfile.txt"
+      else
+        # case of unknown arguments has been dealt with by fred.rb
+      end
+    }
+    ###
+    # make outfile name
+    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
+                  "eval.txt"
+    ###
+    # do we regard all senses as assigned,
+    # as long as they surpass some threshold?
+    # if we are doing multilabel evaluation, we need the full list of senses
+    @threshold = @exp.get("assignment_confidence_threshold")
+    @target_obj = Targets.new(@exp, nil, "r")
+    unless @target_obj.targets_okay
+      # error during initialization
+      $stderr.puts "Error: Could not read list of known targets, bailing out."
+      exit 1
+    end
+    @multiple_senses_assigned =
+      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false
+    ###
+    # initialize abstract class behind me
+    if @multiple_senses_assigned
+      # we are possibly assigning more than one sense: do precision/recall
+      # instead of accuracy:
+      # "true" is what "this sense has been assigned" is mapped to below.
+      super(outfilename, logfilename, "true")
+    else
+      super(outfilename, logfilename)
+    end
+    # what is being done with instances with multiple sense labels?
+    @handle_multilabel = @exp.get("handle_multilabel")
+    ###
+    # announce the task
+    $stderr.puts "---------"
+    $stderr.print "Fred  experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
+    # report the split only if one was requested via --logID
+    if @split_id
+      $stderr.puts " using split with ID #{@split_id}"
+    else
+      $stderr.puts
+    end
+    if @multiple_senses_assigned
+      $stderr.puts "Allowing for the assignment of multiple senses,"
+      $stderr.puts "computing precision and recall against the full sense list of a lemma."
+    end
+    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
+    $stderr.puts "---------"
+  end
+  #####
+  protected
+  ###
+  # each_group
+  #
+  # yield each group name in turn
+  # in our case, group names are lemmas (yielded in sorted order)
+  #
+  # also, set object-global variables in such a way
+  # that the elements of this group can be read:
+  # <@classfile> (nil if there are no classification results),
+  # <@goldreader> and <@all_senses>.
+  # They are valid only while the block for this group is running:
+  # the classifier output file is closed when the block returns.
+  def each_group()
+    # access to classifier output files
+    output_dir = fred_dirname(@exp, "output", "tab")
+    # iterate through instance files
+    @target_obj.get_lemmas().sort().each { |lemma|
+      # progress report
+      if @exp.get("verbose")
+        $stderr.puts "Evaluating " + lemma
+      end
+      # file with classification results
+      begin
+        @classfile = File.new(output_dir + fred_result_filename(lemma))
+      rescue StandardError
+        # no classification results
+        @classfile = nil
+      end
+      begin
+        # file with answers:
+        # maybe we need to apply a split first
+        if @split_id
+          @goldreader = AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
+        else
+          @goldreader = AnswerKeyAccess.new(@exp, "test", lemma, "r")
+        end
+        # doing multilabel evaluation?
+        # then we need a list of all senses
+        @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil
+        yield lemma
+      ensure
+        # this group is done: do not keep one open file handle per lemma
+        if @classfile
+          @classfile.close()
+          @classfile = nil
+        end
+      end
+    }
+  end
+  ###
+  # each_instance
+  #
+  # given a lemma name, yield each instance of this lemma in turn,
+  # or rather: yield pairs [gold_class(string), assigned_class(string)]
+  #
+  # relies on each_group() having set the appropriate readers
+  # <@goldreader> and <@classfile>
+  def each_instance(lemma) # string: lemma name
+    # watch out for repeated instances
+    # which may occur if handle_multilabel = repeat.
+    # Only yield them once to avoid re-evaluating multi-label instances
+    #
+    # instance_ids_seen: hash target_ids -> true/nil
+    instance_ids_seen = {}
+    # read gold file and classifier output file in parallel
+    @goldreader.each { |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|
+      # classline: format
+      # (label confidence)*
+      # such that the label with the highest confidence is first.
+      # The line is read even for repeated instances
+      # so that both files stay in step.
+      classline = (@classfile && @classfile.gets()) || ""
+      # have we done this same instance previously?
+      next if instance_ids_seen[target_ids]
+      # instance not seen previously, but mark as seen now.
+      instance_ids_seen[target_ids] = true
+      # senses assigned: list of pairs [senselist, confidence]
+      senses_assigned = parse_classline_senses(classline)
+      if @threshold
+        # multiple senses assigned, and
+        # regard as assigned everything above a given threshold
+        senses_assigned = senses_over_threshold(senses_assigned)
+        unless @all_senses
+          raise "Shouldn't be here"
+        end
+        # for each sense out of the list of all senses:
+        # yield a pair of [applies, has been assigned]
+        # both 'applies' and 'has been assigned' will be
+        # a string of either 'true' or 'false'
+        # assignment is accurate if both are the same
+        @all_senses.each { |sense_of_lemma|
+          gold_class = senses_gold.include?(sense_of_lemma).to_s()
+          assigned_class = senses_assigned.include?(sense_of_lemma).to_s()
+          yield [gold_class, assigned_class]
+        }
+      elsif !senses_assigned.empty?
+        # regard only one sense as assigned at a time
+        # count as correct if the list of gold classes
+        # contains the main assigned class
+        # (relatively lenient evaluation)
+        # actually assigned class: only the one with the
+        # maximum confidence
+        max_senselist = senses_assigned.max { |a, b|
+          a.last() <=> b.last()
+        }.first()
+        max_senselist.each { |single_sense|
+          gold_class = senses_gold.include?(single_sense).to_s()
+          yield [gold_class, "true"]
+        }
+      end
+    }
+  end
+  #####
+  private
+  ###
+  # parse_classline_senses
+  #
+  # split a classifier output line "(label confidence)*" into
+  # a list of pairs [senselist, confidence(float)]
+  # where senselist is an array of sense strings;
+  # a trailing label without a confidence is dropped
+  def parse_classline_senses(classline) # string: one line of classifier output
+    senses_assigned = []
+    current_sense = nil
+    classline.split().each_with_index { |entry, index|
+      if index % 2 == 0
+        # we have a sense label;
+        # for handle_multilabel = join, split up joined senses
+        current_sense = @handle_multilabel == "join" ? fred_split_sense(entry) : [entry]
+      else
+        # we have a confidence level
+        senses_assigned << [current_sense, entry.to_f()]
+      end
+    }
+    senses_assigned
+  end
+  ###
+  # senses_over_threshold
+  #
+  # given pairs [senselist, confidence], return the list of single senses
+  # whose best confidence is at least <@threshold>
+  #
+  # in the case of "join", one sense may have several confidence levels,
+  # one on its own and one in a joined sense: the maximum counts
+  def senses_over_threshold(senses_assigned) # array of [array:string, float]
+    best_confidence = {}
+    senses_assigned.each { |senses, confidence|
+      senses.each { |s|
+        # watch out: confidence may be smaller than zero,
+        # so test for presence rather than comparing against 0
+        if best_confidence[s]
+          best_confidence[s] = [best_confidence[s], confidence].max()
+        else
+          best_confidence[s] = confidence
+        end
+      }
+    }
+    # retain only the sense, not the confidence
+    best_confidence.to_a().select { |_sense, confidence|
+      confidence >= @threshold
+    }.map { |sense, _confidence| sense }
+  end
+end

data/lib/fred/FredFeatureExtractors.rb ADDED

@@ -0,0 +1,322 @@
+class FredFeatureInfo
+  ###
+  # class variable:
+  # list of all known extractors
+  # add to it using add_feature()
+  @@extractors = Array.new
+  # boolean. set to true after warning messages have been given once
+  @@warned = false
+  ###
+  # add interface/interpreter
+  def FredFeatureInfo.add_feature(class_name) # Class object
+    @@extractors << class_name
+  end
+  ###
+  def initialize(exp)
+    ##
+    # make list of extractors that are
+    # required by the user
+    @features = Array.new
+    @exp = exp
+    # user-chosen extractors:
+    # returns array of pairs [feature group designator(string), options(array:string)]
+    exp.get_lf("feature").each { |extractor_name, *options|
+      extractor = @@extractors.detect { |e| e.feature_name() == extractor_name }
+      unless extractor
+        # no extractor found matching the given designator
+        unless @@warned
+          $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
+        end
+        next
+      end
+      # no need to use the options here,
+      # the feature extractors can get their options themselves.
+      @features << extractor
+    }
+    # do not print warnings again if another RosyFeatureInfo object is made
+    @@warned = true
+  end
+  ###
+  # get_extractor_objects
+  #
+  # returns a list of feature extractor objects
+  def get_extractor_objects()
+    return @features.map{ |feature_class|
+      feature_class.new(@exp)
+    }
+  end
+end
+##################################
+class FredFeatureExtractor
+  ###
+  # feature name:
+  # name by which you choose this feature
+  # in the experiment file
+  def FredFeatureExtractor.feature_name()
+    raise "Overwrite me."
+  end
+  ###
+  # initialize with Fred experiment file object
+  def initialize(exp)
+    @exp = exp
+  end
+  ###
+  # compute features from meta-features
+  #
+  # argument: hash
+  # metafeature_label -> metafeatures
+  #  string -> array:string
+  #
+  # yields each feature as a string
+  def each_feature(feature_hash)
+    raise "overwrite me"
+  end
+  ######
+  protected
+  def FredFeatureExtractor.announce_me
+    # AB: In 1.9 constants are symbols.
+    if Module.constants.include?("FredFeatureInfo") or Module.constants.include?(:FredFeatureInfo)
+      # yup, we have a class to which we can announce ourselves
+      FredFeatureInfo.add_feature(eval(self.name))
+    else
+      # no interface collector class
+#      $stderr.puts "Feature #{self.name()} not announced: no RosyFeatureInfo."
+    end
+  end
+end
+#####
+# context feature
+class FredContextFeatureExtractor < FredFeatureExtractor
+  FredContextFeatureExtractor.announce_me()
+  def FredContextFeatureExtractor.feature_name()
+    return "context"
+  end
+  ###
+  def initialize(exp)
+    super(exp)
+    # cxsizes: list of context sizes chosen as features,
+    # encoded in metafeature labels
+    # written in a hash for fast access
+    @cxsizes = Hash.new()
+    @exp.get_lf("feature", "context").each { |cxsize|
+      @cxsizes[ "CX" + cxsize.to_s() ] = true
+    }
+  end
+  ###
+  def each_feature(feature_hash)
+    # grf#word#lemma#pos#ne
+    lemma_index = 2
+    feature_hash.each { |ftype, fvalues|
+      if @cxsizes[ftype]
+        # this is a context feature of a size chosen
+        # by the user for featurization
+        fvalues.each { |f|
+	next if f =~ /#####/;
+          yield ftype + f.split("#")[lemma_index]
+        }
+      end
+    }
+  end
+end
+#####
+# context feature: POS separately, small contexts only
+class FredContextPOSFeatureExtractor < FredFeatureExtractor
+  FredContextPOSFeatureExtractor.announce_me()
+  def FredContextPOSFeatureExtractor.feature_name()
+    return "context_pos"
+  end
+  ###
+  def initialize(exp)
+    super(exp)
+    # cxsizes: list of context sizes chosen as features,
+    # encoded in metafeature labels
+    # written in a hash for fast access
+    @cxsizes = Hash.new()
+    @exp.get_lf("feature", "context").each { |cxsize|
+      if cxsize <= 10
+        @cxsizes[ "CX" + cxsize.to_s() ] = true
+      end
+    }
+    if @cxsizes.empty?
+      $stderr.puts "context_pos feature warning: will not be computed"
+      $stderr.puts "as there is no context of size <= 10"
+    end
+  end
+  ###
+  def each_feature(feature_hash)
+    # word#lemma#pos#ne
+    pos_index = 2
+    feature_hash.each { |ftype, fvalues|
+      if @cxsizes[ftype]
+        # this is a context feature of a size chosen
+        # by the user for featurization
+        fvalues.each { |f|
+          yield "POS" + ftype + f.split("#")[pos_index]
+        }
+      end
+    }
+  end
+end
+#####
+# bigram/trigram feature
+class FredNgramFeatureExtractor < FredFeatureExtractor
+  FredNgramFeatureExtractor.announce_me()
+  def FredNgramFeatureExtractor.feature_name()
+    return "ngram"
+  end
+  ###
+  def initialize(exp)
+    super(exp)
+    # cxsize: context size from which the ngram feature will be computed
+    # encoded in metafeature labels
+    # written in a hash for fast access
+    @cxsize = @exp.get_lf("feature", "context").detect { |cxsize|
+      cxsize >= 2
+    }
+    unless @cxsize
+      $stderr.puts "Warning: no context of size >= 2, so"
+      $stderr.puts "no ngram feature computed."
+    end
+  end
+  ###
+  def each_feature(feature_hash)
+    # word#lemma#pos#ne
+    lemma_index = 1
+    pos_index = 2
+    feature_hash.each { |ftype, fvalues|
+      if ftype == "CX" + @cxsize.to_s()
+        # compute the ngram features from this context
+        # |fvalues| = 2*cxsize, that is, cxsize describes
+        # the length of a one-sided context window
+        # the bigram of features around the target
+        # concerns fvalues[cxsize-1] and fvalues[cxsize]
+        # the trigram of two words before, one word after includes
+        # fvalues[cxsize-2], fvalues[cxsize-1] and fvalues[cxsize]
+        [
+         [[-1, 0], "BLEM", lemma_index], # bigram of lemmas
+         [[-1, 0], "BPOS", pos_index],   # bigram of POSs
+         [[-2, -1, 0], "TLEM", lemma_index], # trigram of lemmas
+         [[-2, -1, 0], "TPOS", pos_index] # trigram of POSs
+        ].each { |f_indices, label, subindex|
+          fs = f_indices.map { |i| fvalues[@cxsize+i] }.compact()
+          if fs.length() == f_indices.length()
+            # we successfully extracted entries for all the given indices
+            yield label + fs.map { |f| f.split("#")[subindex] }.join()
+          end
+        }
+      end
+    }
+  end
+end
+#####
+# syntax feature
+class FredSynFeatureExtractor < FredFeatureExtractor
+  FredSynFeatureExtractor.announce_me()
+  def FredSynFeatureExtractor.feature_name()
+    return "syntax"
+  end
+  ###
+  def each_feature(feature_hash)
+    feature_hash.each { |ftype, fvalues|
+      case ftype
+       when "CH", "PA"
+        grf_index = 0
+        fvalues.each { |f|
+          yield ftype + f.split("#")[grf_index]
+        }
+      when "SI"
+        # parentlemma#grf#word#lemma#pos#ne
+        grf_index = 1
+        fvalues.each { |f|
+          yield ftype + f.split("#")[grf_index]
+        }
+      else
+        # not a syntactic metafeature
+      end
+    }
+  end
+end
+#####
+# syntax-plus-headword feature
+class FredSynsemFeatureExtractor < FredFeatureExtractor
+  FredSynsemFeatureExtractor.announce_me()
+  def FredSynsemFeatureExtractor.feature_name()
+    return "synsem"
+  end
+  ###
+  def each_feature(feature_hash)
+    feature_hash.each { |ftype, fvalues|
+      case ftype
+      when "CH", "PA"
+        # grf#word#lemma#pos#ne
+        fvalues.each { |f|
+          yield ftype + "SEM" + f
+        }
+      when "SI"
+        # parentlemma#grf#word#lemma#pos#ne
+        # remove parent lemma
+        fvalues.each { |f|
+          yield ftype + "SEM" + f.split("#")[1..-1].join("#")
+        }
+      else
+        # not a syntax feature
+      end
+    }
+  end
+end