RubyGems - shalmaneser - Versions diffs - 1.2.0.rc4 → 1.2.rc5 - Mend

shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115) hide show

checksums.yaml +4 -4
data/README.md +47 -18
data/bin/shalmaneser +8 -2
data/doc/index.md +1 -0
data/lib/shalmaneser/opt_parser.rb +68 -67
metadata +49 -119
data/bin/fred +0 -16
data/bin/frprep +0 -34
data/bin/rosy +0 -17
data/lib/common/AbstractSynInterface.rb +0 -1229
data/lib/common/Counter.rb +0 -18
data/lib/common/EnduserMode.rb +0 -27
data/lib/common/Eval.rb +0 -480
data/lib/common/FixSynSemMapping.rb +0 -196
data/lib/common/Graph.rb +0 -345
data/lib/common/ISO-8859-1.rb +0 -24
data/lib/common/ML.rb +0 -186
data/lib/common/Mallet.rb +0 -236
data/lib/common/Maxent.rb +0 -229
data/lib/common/Optimise.rb +0 -195
data/lib/common/Parser.rb +0 -213
data/lib/common/RegXML.rb +0 -269
data/lib/common/RosyConventions.rb +0 -171
data/lib/common/STXmlTerminalOrder.rb +0 -194
data/lib/common/SalsaTigerRegXML.rb +0 -2347
data/lib/common/SalsaTigerXMLHelper.rb +0 -99
data/lib/common/SynInterfaces.rb +0 -282
data/lib/common/TabFormat.rb +0 -721
data/lib/common/Tiger.rb +0 -1448
data/lib/common/Timbl.rb +0 -144
data/lib/common/Tree.rb +0 -61
data/lib/common/config_data.rb +0 -470
data/lib/common/config_format_element.rb +0 -220
data/lib/common/headz.rb +0 -338
data/lib/common/option_parser.rb +0 -13
data/lib/common/prep_config_data.rb +0 -62
data/lib/common/prep_helper.rb +0 -1330
data/lib/common/ruby_class_extensions.rb +0 -310
data/lib/db/db_interface.rb +0 -48
data/lib/db/db_mysql.rb +0 -145
data/lib/db/db_sqlite.rb +0 -280
data/lib/db/db_table.rb +0 -239
data/lib/db/db_wrapper.rb +0 -176
data/lib/db/sql_query.rb +0 -243
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/fred/Baseline.rb +0 -150
data/lib/fred/FileZipped.rb +0 -31
data/lib/fred/FredBOWContext.rb +0 -877
data/lib/fred/FredConventions.rb +0 -232
data/lib/fred/FredDetermineTargets.rb +0 -319
data/lib/fred/FredEval.rb +0 -312
data/lib/fred/FredFeatureExtractors.rb +0 -322
data/lib/fred/FredFeatures.rb +0 -1061
data/lib/fred/FredFeaturize.rb +0 -602
data/lib/fred/FredNumTrainingSenses.rb +0 -27
data/lib/fred/FredParameters.rb +0 -402
data/lib/fred/FredSplit.rb +0 -84
data/lib/fred/FredSplitPkg.rb +0 -180
data/lib/fred/FredTest.rb +0 -606
data/lib/fred/FredTrain.rb +0 -144
data/lib/fred/PlotAndREval.rb +0 -480
data/lib/fred/fred.rb +0 -47
data/lib/fred/fred_config_data.rb +0 -185
data/lib/fred/md5.rb +0 -23
data/lib/fred/opt_parser.rb +0 -250
data/lib/frprep/Ampersand.rb +0 -39
data/lib/frprep/CollinsInterface.rb +0 -1165
data/lib/frprep/Counter.rb +0 -18
data/lib/frprep/FNCorpusXML.rb +0 -643
data/lib/frprep/FNDatabase.rb +0 -144
data/lib/frprep/FrameXML.rb +0 -513
data/lib/frprep/Graph.rb +0 -345
data/lib/frprep/MiniparInterface.rb +0 -1388
data/lib/frprep/RegXML.rb +0 -269
data/lib/frprep/STXmlTerminalOrder.rb +0 -194
data/lib/frprep/SleepyInterface.rb +0 -384
data/lib/frprep/TntInterface.rb +0 -44
data/lib/frprep/TreetaggerInterface.rb +0 -327
data/lib/frprep/do_parses.rb +0 -143
data/lib/frprep/frprep.rb +0 -693
data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
data/lib/frprep/interfaces/stanford_interface.rb +0 -353
data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
data/lib/frprep/one_parsed_file.rb +0 -28
data/lib/frprep/opt_parser.rb +0 -94
data/lib/frprep/ruby_class_extensions.rb +0 -310
data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
data/lib/rosy/ExternalConfigData.rb +0 -58
data/lib/rosy/FailedParses.rb +0 -130
data/lib/rosy/FeatureInfo.rb +0 -242
data/lib/rosy/GfInduce.rb +0 -1115
data/lib/rosy/GfInduceFeature.rb +0 -148
data/lib/rosy/InputData.rb +0 -294
data/lib/rosy/RosyConfusability.rb +0 -338
data/lib/rosy/RosyEval.rb +0 -465
data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
data/lib/rosy/RosyFeaturize.rb +0 -281
data/lib/rosy/RosyInspect.rb +0 -336
data/lib/rosy/RosyIterator.rb +0 -478
data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
data/lib/rosy/RosyPruning.rb +0 -165
data/lib/rosy/RosyServices.rb +0 -744
data/lib/rosy/RosySplit.rb +0 -232
data/lib/rosy/RosyTask.rb +0 -19
data/lib/rosy/RosyTest.rb +0 -829
data/lib/rosy/RosyTrain.rb +0 -234
data/lib/rosy/RosyTrainingTestTable.rb +0 -787
data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
data/lib/rosy/View.rb +0 -418
data/lib/rosy/opt_parser.rb +0 -379
data/lib/rosy/rosy.rb +0 -78
data/lib/rosy/rosy_config_data.rb +0 -121
data/lib/shalmaneser/version.rb +0 -3

data/lib/rosy/GfInduceFeature.rb DELETED

@@ -1,148 +0,0 @@
-# GfInduceFeature
-# Katrin Erk Jan 06
-#
-# use result of GfInduce.rb as
-# feature for Rosy
-require "rosy/GfInduce"
-require "rosy/AbstractFeatureAndExternal"
-require "common/ruby_class_extensions"
-###
-# Build the path of the GfInduce pickle file.
-#
-# The file lives in the "directory" entry of the experiment file; if the
-# experiment file also has an "experiment_id", a subdirectory named after
-# that ID is used instead.
-#
-# File name pattern:
-#   Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
-# Both groups are sorted alphabetically. Note that when one group is
-# empty its joined string is "", so two dots end up next to each other.
-#
-# returns: output directory + file name (built with +)
-def filename_gfmap(exp,         # ExternalConfigData object
-                   interpreter) # SynInterpreter class
-  # File.new_dir is not core Ruby; presumably provided by
-  # common/ruby_class_extensions (required above)
-  output_dir = File.new_dir(exp.get("directory"))
-  output_dir = File.new_dir(output_dir, exp.get("experiment_id")) if exp.get("experiment_id")
-  # mandatory systems: "<service>=<system_name>"
-  required_part = interpreter.systems().to_a.map { |service, system_name|
-    "#{service}=#{system_name}"
-  }.sort.join(".")
-  # optional systems: "OPT<service>=<system_name>"
-  optional_part = interpreter.optional_systems().to_a.map { |service, system_name|
-    "OPT#{service}=#{system_name}"
-  }.sort.join(".")
-  return output_dir + "Gfmap." + required_part + "." + optional_part + ".pkl"
-end
-################################
-# Feature extractor that offers the grammatical function (GF) assigned
-# by a GfInduce object as the Rosy feature "gf_fn".
-#
-# All state is held in class variables and therefore shared by every
-# instance: the class-level set_sentence() prepares the node -> GF
-# mapping, the instance-level compute_features() reads it.
-class GfInduceFeatureExtractor < ExternalFeatureExtractor
-  GfInduceFeatureExtractor.announce_me()
-  @@okay = true  # external experiment file present?
-  @@gf_obj = nil # GfInduce object
-  @@node_to_gf = nil # Hash: SynNodes of a sentence -> Gf label
-  # name under which this extractor is addressed
-  def GfInduceFeatureExtractor.designator()
-    return "gf_fn"
-  end
-  # names of the features computed, in the order compute_features() returns them
-  def GfInduceFeatureExtractor.feature_names()
-    return ["gf_fn"]
-  end
-  # SQL column type used to store the feature
-  def GfInduceFeatureExtractor.sql_type()
-    return "VARCHAR(25)"
-  end
-  def GfInduceFeatureExtractor.feature_type()
-    return "syn"
-  end
-  def GfInduceFeatureExtractor.phase()
-    return "phase 1"
-  end
-  ###
-  # set sentence, set node, set other settings:
-  # this is done prior to
-  # feature computation using compute_features()
-  # such that computations that stay the same for
-  # several features can be done in advance
-  #
-  # This is just relevant for Phase 1
-  #
-  # returns: true. A missing external experiment file is not treated as
-  # a problem: in that case nothing is precomputed and compute_features()
-  # reports nil. (Previously the method returned nil in that case, which
-  # callers that skip a frame on a false/nil result interpreted as a
-  # failure, so no instance was ever produced.)
-  def GfInduceFeatureExtractor.set_sentence(sent,  # SalsaTigerSentence object
-                                            frame) # FrameNode object
-    # NOTE(review): the result of the superclass method is ignored, as it
-    # always has been here; confirm it cannot signal a failure.
-    super(sent, frame)
-    if @@okay
-      # we can actually compute something:
-      # let the GF object compute all subcat frames
-      # for the target of this frame
-      subcatframes_of_current_target = @@gf_obj.apply(frame.target.children())
-      if subcatframes_of_current_target.empty?
-        # no subcat frames returned
-        subcatframe = []
-      else
-        # subcatframes_of_current_target contains triples
-        # [frame, actual_subcatframe, frequency]:
-        # pick the most frequent triple, keep just its actual_subcatframe
-        subcatframe = subcatframes_of_current_target.max_by { |triple| triple.last }[1]
-      end
-      # change into a mapping node(SynNode) -> GF(string)
-      @@node_to_gf = Hash.new
-      subcatframe.each { |gf, prep, fe, synnodes|
-        synnodes.each { |node|
-          @@node_to_gf[node] = "#{gf} #{prep}"
-        }
-      }
-    end
-    return true
-  end
-  ###
-  # Initialize: read GFInduce pickle
-  #
-  # @exp_external and @@interpreter_class are not assigned in this class;
-  # presumably the superclass constructor sets them.
-  def initialize(exp,                  # experiment file object
-                 interpreter_class)    # SynInterpreter class
-    super(exp, interpreter_class)
-    if @exp_external
-      pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
-      @@gf_obj = GfInduce.from_file(pickle_filename)
-      @@okay = true
-    else
-      # signal that you cannot compute anything
-      @@okay = false
-    end
-  end
-  ###
-  # compute: compute features
-  #
-  # returns an array of features (strings), length the same as the
-  # length of feature_names()
-  #
-  # here: array of length one, content either a string or nil
-  def compute_features()
-    # nothing can be computed without the external experiment file
-    return [ nil ] unless @@okay
-    # current node: @@node (not assigned in this class).
-    # The lookup yields nil if the node has no slot in the subcat frame.
-    return [ @@node_to_gf[@@node] ]
-  end
-end

data/lib/rosy/InputData.rb DELETED

@@ -1,294 +0,0 @@
-###########
-#
-# ke / sp 12 04 05
-#
-# class for input data object
-# offers methods for preprocessing and
-# featurization
-# Salsa packages
-require "common/Parser"
-require "common/SalsaTigerRegXML"
-require "common/ruby_class_extensions"
-# Fred/Rosy packages
-require "rosy/FailedParses"
-require "common/RosyConventions"
-require "rosy/RosyFeatureExtractors"
-require "rosy/RosyPhase2FeatureExtractors"
-require "rosy/RosyPruning"
-require "rosy/GfInduceFeature"
-require "common/FixSynSemMapping"
-# Input data object: reads parsed sentences from a directory of XML files
-# and threads them through the phase 1 and phase 2 feature extractors.
-class InputData
-  ###
-  # Stores the settings, creates the failed-parse registry and fetches the
-  # phase 1 / phase 2 extractor objects from the FeatureInfo object.
-  #
-  # raises: RuntimeError if RosyFeatureExtractor.set() reports a problem
-  def initialize(exp_object,          # RosyConfigData object
-                 dataset,             # train/test
-                 feature_info_object, # FeatureInfo object
-                 interpreter_class,   # SynInterpreter class
-                 input_dir)           # Directory with input files
-    @exp = exp_object
-    @dataset = dataset
-    @interpreter_class = interpreter_class
-    @input_dir = input_dir
-    # store information about failed parses here
-    @failed_parses = FailedParses.new()
-    # phase 1 extractors: Rosy extractors and all others, kept apart
-    # because the Rosy ones are configured through RosyFeatureExtractor
-    @extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
-                                                                                          @interpreter_class)
-    # global settings
-    unless RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
-      raise "Some grave problem during feature extractor initialization"
-    end
-    # The non-Rosy phase 1 extractors have nothing to set for now,
-    # so their class-level set() is deliberately not called:
-#     @extractors_p1_other.each { |extractor_obj|
-#       unless extractor_obj.class.set()
-#         raise "Some grave problem during feature extractor initialization"
-#       end
-#     }
-    # phase 2 extractors: one flat array
-    extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
-                                                                                        @interpreter_class)
-    @feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
-  end
-  ###
-  # each_instance_phase1()
-  #
-  # reads the input data from all "*.xml" files found under @input_dir
-  # (@input_dir is used as a plain string prefix of the glob pattern),
-  # separates it into instances,
-  # threads it through all phase 1 feature extractors
-  # and yields one feature vector per instance
-  #
-  # Frames of sentences marked as "failed" are registered in the
-  # FailedParses object instead of being featurized.
-  #
-  # yields: arrays of pairs [feature_name(string), feature_value(object)]
-  def each_instance_phase1()
-    Dir[@input_dir + "*.xml"].each { |parsefilename|
-      xmlFile = FilePartsParser.new(parsefilename)
-      $stderr.puts "Processing #{parsefilename}"
-      xmlFile.scan_s { |sent_string|
-        sent = SalsaTigerSentence.new(sent_string)
-        # preprocessing: possibly change the SalsaTigerSentence object
-        # before featurization
-        preprocess(sent)
-        sent.each_frame { |frame|
-          # skip failed parses
-          if sent.get_attribute("failed")
-            handle_failed_parse(sent, frame)
-            next
-          end
-          # Tell feature extractors about the sentence and frame:
-          # first Rosy feature extractors, then the others.
-          # If any of them reports a problem, skip this frame
-          # (all? stops at the first extractor that fails).
-          next unless RosyFeatureExtractor.set_sentence(sent, frame)
-          next unless @extractors_p1_other.all? { |extractor_obj|
-            extractor_obj.class.set_sentence(sent, frame)
-          }
-          sent.each_syn_node { |syn_node|
-            # Tell feature extractors about the current node, same order;
-            # if there is a problem, skip this node
-            next unless RosyFeatureExtractor.set_node(syn_node)
-            next unless @extractors_p1_other.all? { |extractor_obj|
-              extractor_obj.class.set_node(syn_node)
-            }
-            # features: array of pairs: [feature_name(string), feature_value(object)]
-            features = []
-            (@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
-              feature_names = extractor.class.feature_names()
-              sql_type = extractor.class.sql_type()
-              # the i-th computed value belongs to the i-th feature name
-              extractor.compute_features().each_with_index { |feature_value, feature_index|
-                feature_name = feature_names[feature_index]
-                # sanity check: feature value longer than the allotted space in the DB?
-                check_feature_length(feature_name, feature_value, extractor)
-                features << [feature_name, nonnil_feature(feature_value, sql_type)]
-              }
-            }
-            yield features
-          } # each syn node
-        } # each frame
-      } # each sentence
-    }
-  end
-  ###
-  # each_phase2_column
-  #
-  # This method implements the application of the
-  # phase 2 extractors to data.
-  #
-  # Given a database view (of either training or test data),
-  # assign a new feature value to each instance
-  #
-  # yields pairs [feature_name(string), feature_values(array)]
-  # The feature_values array has as many lines as the view has instances
-  # so the yield of this method can be fed directly into view.update_column()
-  def each_phase2_column(view) # View object: training or test data
-    @feature_extractors_phase2.each { |extractor|
-      # apply the extractor: one column of values per feature name
-      feature_columns = extractor.compute_features_on_view(view)
-      feature_names = extractor.class.feature_names()
-      sql_type = extractor.class.sql_type
-      feature_columns.each_with_index { |feature_values, feature_index|
-        yield [
-          feature_names[feature_index],
-          feature_values.map { |feature_val| nonnil_feature(feature_val, sql_type) }
-        ]
-      }
-    }
-  end
-  ###
-  # get_failed_parses
-  #
-  # returns the FailedParses object in which the info about failed parses has been stored
-  def get_failed_parses()
-    return @failed_parses
-  end
-  #################################
-  private
-  ###
-  # Replace missing feature values by a value the database can store:
-  # - nil in a column whose SQL type contains "CHAR" -> the "noval" string
-  # - empty string                                   -> the "noval" string
-  # - nil in any other column                        -> 0
-  # Everything else is returned unchanged.
-  def nonnil_feature(feature_value,
-                     sql_type)
-    if feature_value.nil?
-      return (sql_type =~ /CHAR/) ? @exp.get("noval") : 0
-    end
-    if feature_value.is_a?(String) && feature_value.empty?
-      return @exp.get("noval")
-    end
-    return feature_value
-  end
-  ###
-  # preprocess: possibly change the given SalsaTigerSentence
-  # to enable better learning.
-  # Only done for training data, and only if one of the
-  # repair options is switched on in the experiment file.
-  def preprocess(sent)           # SalsaTigerSentence object
-    if @dataset == "train" &&
-        (@exp.get("fe_syn_repair") || @exp.get("fe_rel_repair"))
-      FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
-    end
-  end
-  ###
-  # register failed parses:
-  # records instance ID, frame name, target string, target POS
-  # and the names of the frame's children in @failed_parses
-  def handle_failed_parse(sent,  # SalsaTigerSentence object
-                          frame) # FrameNode
-    target = frame.target()
-    # target POS: category of the main node of the target, if there is one
-    main_target = nil
-    if target
-      main_target = @interpreter_class.main_node_of_expr(target.children(), "no_mwe")
-    end
-    target_pos = main_target ? @interpreter_class.category(main_target) : nil
-    # target string: lemmas of the target nodes, in order
-    target_str = ""
-    if target
-      target_str = target.yield_nodes_ordered().map { |t_node|
-        # not a syntactic node: maybe an unassigned target? contributes ""
-        t_node.is_syntactic? ? @interpreter_class.lemma_backoff(t_node) : ""
-      }.join(" ")
-    end
-    @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
-                            frame.name(),
-                            target_str,
-                            target_pos,
-                            frame.children.map { |fe| fe.name })
-  end
-  ###
-  # sanity check: feature value longer than the allotted space in the DB?
-  #
-  # The allowed length is the first number found in the extractor's SQL
-  # type (e.g. 25 for "VARCHAR(25)"); types without a number are not checked.
-  #
-  # Only an overlong "sentid" is fatal. For all other features nothing is
-  # reported (KE Feb 07: the warning was just too frequent).
-  #
-  # raises: RuntimeError if the instance ID does not fit into its column
-  def check_feature_length(feature_name,  # string
-                           feature_value, # object
-                           extractor_obj) # AbstractFeatureExtractor object
-    # just crudely compare the length stated in the SQL type to the feature length
-    return unless extractor_obj.class.sql_type() =~ /(\d+)/
-    length = Regexp.last_match(1).to_i
-    return unless feature_value.is_a?(String) && feature_value.length() > length
-    return unless feature_name == "sentid"
-    # if the sentence (instance) ID is too long, we cannot go on.
-    $stderr.puts "Error: Instance ID is longer than its DB column."
-    $stderr.puts "Instance ID '#{feature_value}' has #{feature_value.length()} characters, the column allows #{length}."
-    $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
-    raise "SQL entry length surpassed"
-  end
-end

data/lib/rosy/RosyConfusability.rb DELETED

@@ -1,338 +0,0 @@
-# RosyConfusability
-# KE May 05
-#
-# Access instance database created by the Rosy role assignment system
-# and compute the confusability of target categories
-# for the data in the (training) database there.
-#
-# We define confusability as follows:
-# Given a frame fr, let
-# - fes(fr) the FEs of fr (a set)
-# - gfs(fe) the grammatical functions realizing the FE fe in the data
-# - gfs(fr) = U_{fe \in fes(fr)} gfs(fe) the grammatical functions realizing roles of fr
-#
-# Then the entropy of a grammatical function gf within fr is
-#
-# gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log p(fe|gf)
-#
-# where p(fe|gf) = f(gf, fe) / f(gf)
-#
-# And the confusability of a frame element fe of fr is
-#
-# c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
-#
-# where p(gf|fe) = f(gf, fe) / f(fe)
-require "RosyConfigData"
-require "RosyIterator"
-require "RosyConventions"
-require "TargetsMostFrequentFrame"
-require "mysql"
-# Computes and stores the confusability of frame elements (FEs),
-# of frames and of the training data as a whole, as defined in the
-# header of this file.
-#
-# Results (available through the attribute readers):
-# - confusability:         Hash "<frame> <fe>" -> confusability of the FE
-#                          (FEs with fewer than 5 occurrences are removed)
-# - counts_fe_glob:        Hash "<frame> <fe>" -> FE frequency
-# - frame_confusability:   Hash frame -> confusability of the frame
-# - overall_confusability: Float
-class RosyConfusability
-  include TargetsMostFrequentSc
-  attr_reader :confusability, :counts_fe_glob, :frame_confusability, :overall_confusability
-  def initialize(exp) # RosyConfigData object
-    @exp = exp
-    @confusability = Hash.new(0.0)
-    @counts_fe_glob = Hash.new(0)
-    @counts_gffe_glob = Hash.new(0)
-    @frame_confusability = Hash.new(0.0)
-    @overall_confusability = 0.0
-    # frequent grammatical frames, each one stored as an array of GF names
-    @frequent_gframes = [
-      # NO DUPLICATES
-      "Ext_Comp", "Mod", "Comp", "Gen",
-      "Ext_Obj", "Ext", "Ext_Obj_Comp", "Head",
-      "Ext_Mod", "Gen_Mod", "Mod_Comp", "Comp_Ext",
-      "Gen_Comp", "Ext_Gen", "Ext_Mod_Comp", "Head_Comp",
-      "Obj_Comp", "Obj", "Mod_Head", "Ext_Comp_Obj",
-      "Gen_Head", "Ext_Gen_Mod"
-      # with duplicates
-#       "Ext_Comp", "Mod", "Comp", "Gen",
-#       "Ext_Obj", "Ext", "", "Ext_Obj_Comp",
-#       "Ext_Comp_Comp", "Head", "Mod_Mod", "Gen_Mod",
-#       "Ext_Mod", "Comp_Comp", "Mod_Comp", "Ext_Gen",
-#       "Gen_Comp", "Head_Head", "Ext_Comp_Comp_Comp", "Head_Comp",
-# # "Ext_Ext_Comp",
-# #       "Ext_Obj_Comp_Comp", "Obj_Comp",
-# #       "Ext_Mod_Mod", "Comp_Comp_Comp",
-# #       "Ext_Ext_Obj", "Ext_Mod_Comp", "Comp_Ext", "Obj",
-# #       "Ext_Ext", "Ext_Obj_Obj", "Mod_Mod_Mod", "Gen_Mod_Mod",
-# #       "Ext_Comp_Comp_Comp_Comp", "Gen_Head", "Mod_Head",
-# #       "Ext_Ext_Ext_Comp"
-    ].map { |string|
-      string.split("_")
-    }
-  end
-  ###
-  # Read the training instances frame by frame from the database and
-  # compute FE, frame and overall confusability.
-  #
-  # The "additionals" refine the GF used as conditioning event: for each
-  # entry, a suffix is appended to the GF string, in the order given.
-  #
-  # raises: RuntimeError for an unknown entry in additionals
-  def compute(splitID,     # string: split ID, may be nil
-              additionals) # array:string: "target", "target_pos", "gframe", "fgframe", "tmfframe"
-    ###
-    # open and initialize stuff:
-    # open database
-    # NOTE(review): the connection is never closed explicitly in this method.
-    database = Mysql.real_connect(@exp.get('host'), @exp.get('user'),
-                                  @exp.get('passwd'), @exp.get('dbname'))
-    # make an object that creates views.
-    # read one frame at a time.
-    iterator = RosyIterator.new(database, @exp, "train",
-                                "splitID" => splitID,
-                                "xwise" => "frame")
-    # get value for "no val"
-    noval = @exp.get("noval")
-    counts_frame = Hash.new(0)
-    # iterate through all frames and compute confusability of each FE
-    iterator.each_group { |group_descr_hash, frame|
-      $stderr.puts "Computing confusability for #{frame}"
-      # read all instances of the frame, columns: FE and GF
-      view = iterator.get_a_view_for_current_group(["sentid","gold", "fn_gf",
-                                                    "target","target_pos", "frame"])
-      if additionals.include? "tmfframe"
-        # find most frequent gframe for each target
-        tmfframe = determine_target_most_frequent_sc(view, noval)
-      end
-      # count occurrences
-      counts_gf = Hash.new(0)
-      counts_fe = Hash.new(0)
-      counts_gffe = Hash.new(0)
-      view.each_sentence { |sentence|
-        # collect all FN GFs of this sentence
-        allgfs = Array.new()
-        sentence.each { |inst|
-          if inst["fn_gf"] != noval
-            allgfs << inst["fn_gf"]
-          end
-        }
-        # assume uniqueness of GFs
-        # design decision, could also be done differently.
-        # rationale: if a GF occurs more than once,
-        # it's probable that this is because we get more than
-        # one constituent for this GF, not because
-        # it actually occurred more than once in the
-        # original FrameNet annotation.
-        allgfs.uniq!
-        # now count each instance
-        sentence.each { |row|
-          if row["gold"] == "target"
-            # don't count target among the FEs
-            next
-          end
-          if row["gold"] != noval
-            counts_fe[row["gold"]] += 1
-          end
-          if row["fn_gf"] != noval and row["fn_gf"] != "target"
-            # work on a copy: the suffixes below are appended in place,
-            # and the row's own string (which may also be an element of
-            # allgfs) must not be changed by that
-            gf = row["fn_gf"].dup
-            additionals.each { |additional|
-              case additional
-              when "target"
-                gf << "_" + row["target"]
-              when "target_pos"
-                gf << "_" + row["target_pos"]
-              when "gframe"
-                gf << "_" + allgfs.join("_")
-              when "fgframe"
-                # find the maximal frequent frame subsuming allgfs
-                maxfgf = nil
-                @frequent_gframes.each { |fgframe|
-                  if fgframe.subsumed_by?(allgfs)
-                    # fgframe is a subset of allgfs
-                    if maxfgf.nil? or fgframe.length() > maxfgf.length()
-                      maxfgf = fgframe
-                    end
-                  end
-                }
-                # if nothing fits, leave the GF as is
-                unless maxfgf.nil?
-                  gf << "_" + maxfgf.join("_")
-                end
-              when "tmfframe"
-                gf << "_" + tmfframe[tmf_target_key(row)]
-              else
-                raise "Don't know how to compute #{additional}"
-              end
-            }
-            counts_gf[gf] += 1
-          end
-          # gf is nil here if this row has no usable GF
-          if row["gold"] != noval and gf
-            counts_gffe[gf + " " + row["gold"]] += 1
-          end
-        } # each row of sentence
-      } # each sentence of view
-      # compute gf entropy
-      # gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log_2 p(fe|gf)
-      #
-      # where p(fe|gf) = f(gf, fe) / f(gf)
-      gf_entropy = Hash.new
-      counts_gf.keys.each { |gf|
-        gf_entropy[gf] = 0.0
-        counts_fe.keys.each { |fe|
-          if counts_gf[gf] > 0
-            p_gf_fe = counts_gffe[gf + " " + fe].to_f / counts_gf[gf].to_f
-            # terms with probability zero contribute nothing
-            if p_gf_fe > 0.0
-              gf_entropy[gf] -= p_gf_fe * Math.log2(p_gf_fe)
-            end
-          end
-        } # each FE for this GF
-      } # each GF (gf entropy)
-      # compute FE confusability
-      # c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
-      #
-      # where p(gf|fe) = f(gf, fe) / f(fe)
-      counts_fe.keys.each { |fe|
-        @confusability[frame + " " + fe] = 0.0
-        counts_gf.keys.each { |gf|
-          if counts_fe[fe] > 0
-            p_fe_gf = counts_gffe[gf + " " + fe].to_f / counts_fe[fe].to_f
-            @confusability[frame + " " + fe] += p_fe_gf * gf_entropy[gf]
-          end
-        } # each GF for this FE
-      } # each FE (fe confusability)
-      # remember counts for FEs and GF/FE pairs
-      counts_fe.keys.each { |fe|
-        @counts_fe_glob[frame + " " + fe] = counts_fe[fe]
-      }
-      counts_gffe.each_pair { |event, freq|
-        @counts_gffe_glob[frame + " " + event] = freq
-      }
-      # omit rare FEs:
-      # anything below 5 occurrences
-      counts_fe.each_key { |fe|
-        if counts_fe[fe] < 5
-          @confusability.delete(frame + " " + fe)
-        end
-      }
-      # compute overall frame confusability
-      # omitting rare FEs with below 5 occurrences:
-      #
-      # c(fr) = sum_{fe \in fes(fr)} f(fe)/f(fr) * c_{fr}(fe)
-      #       = \sum_{gf \in gfs(fr)} p(gf|fr) gfe_{fr}(gf)
-      #
-      # where p(gf|fr) = (sum_{fe\in fes(fr)} f(gf, fe)) / f(fr)
-      counts_frame[frame] = 0
-      counts_fe.each_value { |count|
-        if count >= 5
-          counts_frame[frame] += count
-        end
-      }
-      @frame_confusability[frame] = 0.0
-      counts_fe.each_pair { |fe, count|
-        if count >= 5
-          @frame_confusability[frame] += (count.to_f / counts_frame[frame].to_f) * @confusability[frame + " " + fe]
-        end
-      }
-    } # each frame
-    # compute overall confusability
-    # c = \sum{fr \in frames} f(fr)/N * c(fr)
-    #
-    # where N is the number of FE occurrences overall
-    # (counting only FEs with at least 5 occurrences)
-    counts_overall = 0
-    counts_frame.each_value { |count|
-      counts_overall += count
-    }
-    @overall_confusability = 0.0
-    # without any counted FE occurrence the weights would be 0.0/0.0 (NaN):
-    # leave the overall confusability at 0.0 in that case
-    if counts_overall > 0
-      counts_frame.each_pair { |frame, count|
-        @overall_confusability += (count.to_f / counts_overall.to_f) * @frame_confusability[frame]
-      }
-    end
-  end
-  # return a copy of @counts_fe_glob, from which all fes with less than 5 occurrences are deleted
-  def get_global_counts
-    global_counts = @counts_fe_glob.clone
-    global_counts.delete_if { |key, value| value < 5 }
-    return global_counts
-  end
-  ###
-  #
-  # compute sparseness statistics over the set of
-  # base events used for computing the confusability
-  # returns an array of length 4:
-  # - number of events with freq 1
-  # - number of events with freq 2
-  # - number of events with freq 3-5
-  # - number of events with freq > 5
-  #   (any frequency not covered by the first three cases is counted here)
-  def counts()
-    freq_bins = [0,0,0,0]
-    @counts_gffe_glob.each_value { |freq|
-      case freq
-      when 1
-        freq_bins[0] += 1
-      when 2
-        freq_bins[1] += 1
-      when 3..5
-        freq_bins[2] += 1
-      else
-        freq_bins[3] += 1
-      end
-    }
-    return freq_bins
-  end
-  ###
-  # Write all computed tables to a file, in Marshal format.
-  #
-  # raises: RuntimeError if the file cannot be opened for writing
-  def to_file(filename)
-    begin
-      # Marshal data is binary
-      file = File.new(filename, "wb")
-    rescue
-      raise "Couldn't open file #{filename} for writing."
-    end
-    begin
-      Marshal.dump({"confusability" => @confusability,
-                    "counts_fe_glob" => @counts_fe_glob,
-                    "counts_gffe_glob" => @counts_gffe_glob,
-                    "frame_confusability" => @frame_confusability,
-                    "overall_confusability" => @overall_confusability
-                   },
-                   file)
-    ensure
-      # make sure the data is flushed and the handle released
-      file.close
-    end
-  end
-  ###
-  # Replace all tables of this object by the ones stored
-  # in a file written by to_file().
-  #
-  # Marshal.load can instantiate arbitrary objects:
-  # only read files from a trusted source.
-  #
-  # raises: RuntimeError if the file cannot be opened for reading
-  def from_file(filename)
-    begin
-      file = File.new(filename, "rb")
-    rescue
-      raise "Couldn't open file #{filename} for reading."
-    end
-    begin
-      hash = Marshal.load(file)
-    ensure
-      file.close
-    end
-    @confusability = hash["confusability"]
-    @counts_fe_glob = hash["counts_fe_glob"]
-    @counts_gffe_glob = hash["counts_gffe_glob"]
-    @frame_confusability = hash["frame_confusability"]
-    @overall_confusability =  hash["overall_confusability"]
-  end
-end