shalmaneser-rosy 1.2.0.rc4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/bin/rosy +17 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +242 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +281 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +478 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +829 -0
- data/lib/rosy/RosyTrain.rb +234 -0
- data/lib/rosy/RosyTrainingTestTable.rb +787 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +78 -0
- data/lib/rosy/rosy_config_data.rb +121 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +105 -0
@@ -0,0 +1,148 @@
|
|
1
|
+
# GfInduceFeature
|
2
|
+
# Katrin Erk Jan 06
|
3
|
+
#
|
4
|
+
# use result of GfInduce.rb as
|
5
|
+
# feature for Rosy
|
6
|
+
|
7
|
+
require "rosy/GfInduce"
|
8
|
+
require "rosy/AbstractFeatureAndExternal"
|
9
|
+
require "common/ruby_class_extensions"
|
10
|
+
|
11
|
+
###
# Construct the filename of the GfInduce pickle file.
#
# exp:         ExternalConfigData object (external experiment file settings)
# interpreter: SynInterpreter class
#
# The file lives in the experiment's output directory -- inside a
# subdirectory named after the experiment ID, if one is configured --
# and is named:
#   Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
def filename_gfmap(exp,
                   interpreter)

  # output dir as given in the experiment file;
  # with an experiment ID, data goes into a subdirectory named after it
  dir = File.new_dir(exp.get("directory"))
  dir = File.new_dir(dir, exp.get("experiment_id")) if exp.get("experiment_id")

  # mandatory services: "<service>=<system_name>" fragments
  mandatory = interpreter.systems.to_a.map do |service, system|
    "#{service}=#{system}"
  end
  # optional services: same, but prefixed with "OPT"
  optional = interpreter.optional_systems.to_a.map do |service, system|
    "OPT#{service}=#{system}"
  end

  dir + "Gfmap." +
    mandatory.sort.join(".") + "." +
    optional.sort.join(".") + ".pkl"
end
|
35
|
+
|
36
|
+
################################
# Feature extractor "gf_fn": grammatical function labels induced by
# GfInduce. Reads a pickled GfInduce object and, per sentence/frame,
# maps syntax nodes to GF labels which are then emitted as features.
class GfInduceFeatureExtractor < ExternalFeatureExtractor
  GfInduceFeatureExtractor.announce_me()

  @@okay = true       # external experiment file present?
  @@gf_obj = nil      # GfInduce object
  @@node_to_gf = nil  # Hash: SynNodes of a sentence -> Gf label

  def self.designator
    "gf_fn"
  end

  def self.feature_names
    ["gf_fn"]
  end

  def self.sql_type
    "VARCHAR(25)"
  end

  def self.feature_type
    "syn"
  end

  def self.phase
    "phase 1"
  end

  ###
  # Set sentence, node and other settings prior to feature computation
  # with compute_features(), so work shared between several features is
  # done once in advance.
  #
  # Precomputes the SynNode -> GF-label mapping for the target of the
  # given frame. Phase 1 only.
  #
  # returns: false/nil if there was a problem
  def self.set_sentence(sent,   # SalsaTigerSentence object
                        frame)  # FrameNode object
    super(sent, frame)

    # without the external experiment file we cannot compute anything
    return unless @@okay

    # let the GF object propose all subcat frames for this frame's target
    candidates = @@gf_obj.apply(frame.target.children())

    # candidates holds triples [frame, actual_subcatframe, frequency];
    # keep just the actual_subcatframe of the most frequent triple,
    # or the empty list if there were no candidates at all
    best =
      if candidates.empty?
        []
      else
        candidates.max_by { |triple| triple.last }[1]
      end

    # turn the chosen subcat frame into a mapping node(SynNode) -> GF(string)
    @@node_to_gf = {}
    best.each do |gf, prep, _fe, synnodes|
      synnodes.each do |node|
        @@node_to_gf[node] = "#{gf} #{prep}"
      end
    end
  end

  ###
  # Initialize: read the GfInduce pickle, if the external experiment
  # file is available.
  def initialize(exp,                # experiment file object
                 interpreter_class)  # SynInterpreter class
    super(exp, interpreter_class)

    if @exp_external
      @@gf_obj = GfInduce.from_file(filename_gfmap(@exp_external, @@interpreter_class))
      @@okay = true
    else
      # signal that we cannot compute anything
      @@okay = false
    end
  end

  ###
  # compute features for the current node (@@node):
  # returns an array of the same length as feature_names() --
  # here length one, content either a string (the GF label assigned
  # to the node in the precomputed subcat frame) or nil
  def compute_features()
    @@okay ? [@@node_to_gf[@@node]] : [nil]
  end
end
|
@@ -0,0 +1,294 @@
|
|
1
|
+
###########
|
2
|
+
#
|
3
|
+
# ke / sp 12 04 05
|
4
|
+
#
|
5
|
+
# class for input data object
|
6
|
+
# offers methods for preprocessing and
|
7
|
+
# featurization
|
8
|
+
|
9
|
+
# Salsa packages
|
10
|
+
require "common/Parser"
|
11
|
+
require "common/SalsaTigerRegXML"
|
12
|
+
require "common/ruby_class_extensions"
|
13
|
+
|
14
|
+
# Fred/Rosy packages
|
15
|
+
require "rosy/FailedParses"
|
16
|
+
require "common/RosyConventions"
|
17
|
+
require "rosy/RosyFeatureExtractors"
|
18
|
+
require "rosy/RosyPhase2FeatureExtractors"
|
19
|
+
require "rosy/RosyPruning"
|
20
|
+
require "rosy/GfInduceFeature"
|
21
|
+
require "common/FixSynSemMapping"
|
22
|
+
|
23
|
+
class InputData

  ###
  # Set up the input-data object: remember experiment settings and
  # collaborators, create the FailedParses store, and instantiate the
  # phase 1 and phase 2 feature extractors via the FeatureInfo object.
  def initialize(exp_object,          # RosyConfigData object
                 dataset,             # train/test
                 feature_info_object, # FeatureInfo object
                 interpreter_class,   # SynInterpreter class
                 input_dir)           # Directory with input files

    @exp = exp_object
    @dataset = dataset
    @interpreter_class = interpreter_class
    @input_dir = input_dir
    # store information about failed parses here
    @failed_parses = FailedParses.new()

    # feature_extractors_phase1: array of AbstractFeatureExtractor objects
    # (Rosy's own extractors and externally contributed ones, separately)
    @extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
                                                                                          @interpreter_class)

    # global settings for the Rosy phase 1 extractors;
    # set() returning false/nil signals a setup failure we cannot recover from
    unless RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
      raise "Some grave problem during feature extractor initialization"
    end

    # # nothing to set here for now, so deactivated
    # @extractors_p1_other.each { |extractor_obj|
    #   unless extractor_obj.class.set()
    #     raise "Some grave problem during feature extractor initialization"
    #   end
    # }

    # feature_extractors_phase2: array of AbstractFeatureExtractor objects;
    # phase 2 extractors are used as one combined list
    extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
                                                                                        @interpreter_class)
    @feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
  end

  ###
  # each_instance_phase1()
  #
  # reads the input data from file(s), in the specific input format,
  # separates it into instances,
  # threads it through all phase 1 feature extractors
  # and yields one feature vector per instance
  #
  # yields: arrays of pairs [feature_name(string), feature_value(object)],
  # one array per syntactic node of each frame of each sentence
  #
  # NOTE(review): assumes @input_dir ends in a path separator
  # (the glob is built as @input_dir+"*.xml") -- confirm with callers.
  def each_instance_phase1()
    Dir[@input_dir + "*.xml"].each { |parsefilename|

      xmlFile = FilePartsParser.new(parsefilename)
      $stderr.puts "Processing #{parsefilename}"
      xmlFile.scan_s { |sent_string|
        sent = SalsaTigerSentence.new(sent_string)

        # preprocessing: possibly change the SalsaTigerSentence object
        # before featurization
        preprocess(sent)

        sent.each_frame { |frame|

          # failed parses are not featurized, only registered
          if sent.get_attribute("failed")
            handle_failed_parse(sent, frame)
            next
          end

          # Tell feature extractors about the sentence and frame:
          # first Rosy feature extractors, then the others.
          # If there is a problem, skip this frame.
          unless RosyFeatureExtractor.set_sentence(sent, frame)
            next
          end
          skip_frame = false
          @extractors_p1_other.each { |extractor_obj|
            unless extractor_obj.class.set_sentence(sent, frame)
              skip_frame = true
              break
            end
          }
          if skip_frame
            next
          end

          sent.each_syn_node { |syn_node|

            # Tell feature extractors about the current node:
            # first Rosy feature extractors, then the others.
            # If there is a problem, skip this node.
            unless RosyFeatureExtractor.set_node(syn_node)
              next
            end
            skip_node = false
            @extractors_p1_other.each { |extractor_obj|
              unless extractor_obj.class.set_node(syn_node)
                skip_node = true
                break
              end
            }
            if skip_node
              next
            end

            # features: array of pairs [feature_name(string), feature_value(object)];
            # each extractor contributes as many values as it has feature names
            features = Array.new
            (@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
              # compute features
              feature_names = extractor.class.feature_names()
              feature_index = 0

              # append new features to features array,
              # pairing each value with its name by position
              features.concat extractor.compute_features().map { |feature_value|
                feature_name = feature_names[feature_index]
                feature_index += 1

                # sanity check: feature value longer than the allotted space in the DB?
                check_feature_length(feature_name, feature_value, extractor)

                [feature_name, nonnil_feature(feature_value, extractor.class.sql_type())]
              }
            }
            yield features
          } # each syn node
        } # each frame
      } # each sentence
    }
  end

  ###
  # each_phase2_column
  #
  # This method implements the application of the
  # phase 2 extractors to data.
  #
  # Given a database view (of either training or test data),
  # assign a new feature value to each instance
  #
  # yields pairs [feature_name(string), feature_values(array)]
  # The feature_values array has as many lines as the view has instances
  # so the yield of this method can be fed directly into view.update_column()
  def each_phase2_column(view) # View object: training or test data

    @feature_extractors_phase2.each { |extractor|
      # apply the extractor; it returns one column per feature name
      feature_columns = extractor.compute_features_on_view(view)
      # interleave with feature names (by position) and yield
      feature_index = 0
      feature_names = extractor.class.feature_names()
      feature_columns.each { |feature_values|
        yield [
          feature_names[feature_index],
          feature_values.map { |feature_val| nonnil_feature(feature_val, extractor.class.sql_type) }
        ]
        feature_index += 1
      }
    }
  end

  ###
  # get_failed_parses
  #
  # returns the FailedParses object in which the info about failed parses has been stored
  def get_failed_parses()
    return @failed_parses
  end

  #################################
  private

  ###
  # Replace missing feature values by a usable stand-in:
  # nil for a CHAR-typed column and the empty string both become the
  # experiment's "noval" marker; nil for a numeric column becomes 0;
  # everything else is passed through unchanged.
  def nonnil_feature(feature_value,
                     sql_type)

    # feature value nil? then change to noval
    if feature_value.nil? and sql_type =~ /CHAR/
      return @exp.get("noval")
    elsif feature_value.class.to_s == "String" and feature_value.empty?
      return @exp.get("noval")
    elsif feature_value.nil?
      # nil in a non-CHAR (numeric) column
      return 0
    else
      return feature_value
    end
  end

  ###
  # preprocess: possibly change the given SalsaTigerSentence
  # to enable better learning.
  # Repairs of the syntax/semantics mapping are only applied to
  # training data, and only if the experiment file asks for them.
  def preprocess(sent) # SalsaTigerSentence object

    if @dataset == "train" and
       (@exp.get("fe_syn_repair") or @exp.get("fe_rel_repair"))
      FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
    end
  end

  ###
  # register failed parses:
  # collect what can still be said about the instance (target lemma
  # string, target POS, FE names) and record it in @failed_parses.
  def handle_failed_parse(sent,  # SalsaTigerSentence object
                          frame) # FrameNode

    # target POS, determined via the main node of the target expression
    if frame.target()
      main_target = @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
    else
      main_target = nil
    end
    if main_target
      target_pos = @interpreter_class.category(main_target)
    else
      target_pos = nil
    end

    # target string: lemmas of the target's yield nodes, space-separated
    if frame.target()
      target_str = frame.target().yield_nodes_ordered().map { |t_node|
        if t_node.is_syntactic?
          @interpreter_class.lemma_backoff(t_node)
        else
          # not a syntactic node: maybe an unassigned target?
          ""
        end
      }.join(" ")
    else
      target_str = ""
    end

    @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
                            frame.name(),
                            target_str,
                            target_pos,
                            frame.children.map { |fe| fe.name })

  end

  ###
  # sanity check: feature value longer than the allotted space in the DB?
  # For the instance ID this is fatal; for other features we only warn
  # (and only in verbose mode -- currently the warning itself is disabled).
  def check_feature_length(feature_name,  # string
                           feature_value, # object
                           extractor_obj) # AbstractFeatureExtractor object

    if extractor_obj.class.sql_type() =~ /(\d+)/
      # sql type contains some statement about the length.
      # just crudely compare to feature length
      length = $1.to_i
      if feature_value.class == String and
         feature_value.length() > length

        if feature_name == "sentid"
          print length;
          print feature_value;
          print feature_value.length();
          # if the sentence (instance) ID is too long, we cannot go on.
          $stderr.puts "Error: Instance ID is longer than its DB column."
          $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
          raise "SQL entry length surpassed"

        elsif @exp.get("verbose")
          # KE Feb 07: don't print warning,
          # this is just too frequent
          # for other features, we just issue a warning, and only if we are verbose

          # $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length()}): #{feature_value}"
        end # feature name check
      end # length surpassed
    end # length found in sql type

  end

end
|
@@ -0,0 +1,338 @@
|
|
1
|
+
# RosyConfusability
|
2
|
+
# KE May 05
|
3
|
+
#
|
4
|
+
# Access instance database created by the Rosy role assignment system
|
5
|
+
# and compute the confusability of target categories
|
6
|
+
# for the data in the (training) database there.
|
7
|
+
#
|
8
|
+
# We define confusability as follows:
|
9
|
+
# Given a frame fr, let
|
10
|
+
# - fes(fr) the FEs of fr (a set)
|
11
|
+
# - gfs(fe) the grammatical functions realizing the FE fe in the data
|
12
|
+
# - gfs(fr) = U_{fe \in fes(fr)} gfs(fe) the grammatical functions realizing roles of fr
|
13
|
+
#
|
14
|
+
# Then the entropy of a grammatical function gf within fr is
|
15
|
+
#
|
16
|
+
# gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log p(fe|gf)
|
17
|
+
#
|
18
|
+
# where p(fe|gf) = f(gf, fe) / f(gf)
|
19
|
+
#
|
20
|
+
# And the confusability of a frame element fe of fr is
|
21
|
+
#
|
22
|
+
# c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
|
23
|
+
#
|
24
|
+
# where p(gf|fe) = f(gf, fe) / f(fe)
|
25
|
+
|
26
|
+
require "RosyConfigData"
|
27
|
+
require "RosyIterator"
|
28
|
+
require "RosyConventions"
|
29
|
+
require "TargetsMostFrequentFrame"
|
30
|
+
|
31
|
+
require "mysql"
|
32
|
+
|
33
|
+
class RosyConfusability
  include TargetsMostFrequentSc

  attr_reader :confusability, :counts_fe_glob, :frame_confusability, :overall_confusability

  ###
  # exp: RosyConfigData object, the Rosy experiment file
  def initialize(exp)
    @exp = exp

    @confusability = Hash.new(0.0)       # "frame fe" -> FE confusability c_{fr}(fe)
    @counts_fe_glob = Hash.new(0)        # "frame fe" -> f(fe)
    @counts_gffe_glob = Hash.new(0)      # "frame gf fe" -> f(gf, fe)
    @frame_confusability = Hash.new(0.0) # frame -> c(fr)
    @overall_confusability = 0.0

    # frequent grammatical-function frames, each kept as an array of GF labels,
    # used for the "fgframe" additional in compute()
    @frequent_gframes = [
      # NO DUPLICATES
      "Ext_Comp", "Mod", "Comp", "Gen",
      "Ext_Obj", "Ext", "Ext_Obj_Comp", "Head",
      "Ext_Mod", "Gen_Mod", "Mod_Comp", "Comp_Ext",
      "Gen_Comp", "Ext_Gen", "Ext_Mod_Comp", "Head_Comp",
      "Obj_Comp", "Obj", "Mod_Head", "Ext_Comp_Obj",
      "Gen_Head", "Ext_Gen_Mod"
      # with duplicates
      # "Ext_Comp", "Mod", "Comp", "Gen",
      # "Ext_Obj", "Ext", "", "Ext_Obj_Comp",
      # "Ext_Comp_Comp", "Head", "Mod_Mod", "Gen_Mod",
      # "Ext_Mod", "Comp_Comp", "Mod_Comp", "Ext_Gen",
      # "Gen_Comp", "Head_Head", "Ext_Comp_Comp_Comp", "Head_Comp",
      # # "Ext_Ext_Comp",
      # # "Ext_Obj_Comp_Comp", "Obj_Comp",
      # # "Ext_Mod_Mod", "Comp_Comp_Comp",
      # # "Ext_Ext_Obj", "Ext_Mod_Comp", "Comp_Ext", "Obj",
      # # "Ext_Ext", "Ext_Obj_Obj", "Mod_Mod_Mod", "Gen_Mod_Mod",
      # # "Ext_Comp_Comp_Comp_Comp", "Gen_Head", "Mod_Head",
      # # "Ext_Ext_Ext_Comp"
    ].map { |string|
      string.split("_")
    }
  end

  ###
  # Read training instances from the Rosy database and fill
  # @confusability, @frame_confusability, @overall_confusability
  # (plus the global count hashes) as defined in the header comment
  # of this file.
  #
  # splitID:     string: split ID, may be nil
  # additionals: array:string: "target", "target_pos", "gframe",
  #              "fgframe", "tmfframe" -- extra information appended
  #              to each grammatical function label before counting
  def compute(splitID,
              additionals)
    ###
    # open and initialize stuff:

    # open database
    database = Mysql.real_connect(@exp.get('host'), @exp.get('user'),
                                  @exp.get('passwd'), @exp.get('dbname'))
    # make an object that creates views.
    # read one frame at a time.
    iterator = RosyIterator.new(database, @exp, "train",
                                "splitID" => splitID,
                                "xwise" => "frame")
    # get value for "no val"
    noval = @exp.get("noval")

    # frame -> number of (non-rare) FE occurrences, for the overall confusability
    counts_frame = Hash.new(0)

    # iterate through all frames and compute confusability of each FE
    iterator.each_group { |group_descr_hash, frame|

      $stderr.puts "Computing confusability for #{frame}"

      # read all instances of the frame, columns: FE and GF
      view = iterator.get_a_view_for_current_group(["sentid", "gold", "fn_gf",
                                                    "target", "target_pos", "frame"])

      if additionals.include? "tmfframe"
        # find most frequent gframe for each target
        tmfframe = determine_target_most_frequent_sc(view, noval)
      end

      # count occurrences within this frame:
      # f(gf), f(fe), f(gf, fe)
      counts_gf = Hash.new(0)
      counts_fe = Hash.new(0)
      counts_gffe = Hash.new(0)

      view.each_sentence { |sentence|

        # collect all FN GFs of this sentence
        allgfs = Array.new()
        sentence.each { |inst|
          if inst["fn_gf"] != noval
            allgfs << inst["fn_gf"]
          end
        }

        # assume uniqueness of GFs.
        # design decision, could also be done differently.
        # rationale: if a GF occurs more than once,
        # it's probable that this is because we get more than
        # one constituent for this GF, not because
        # it actually occurred more than once in the
        # original FrameNet annotation.
        allgfs.uniq!

        # now count each instance
        sentence.each { |row|
          if row["gold"] == "target"
            # don't count target among the FEs
            next
          end

          if row["gold"] != noval
            counts_fe[row["gold"]] += 1
          end
          if row["fn_gf"] != noval and row["fn_gf"] != "target"
            gf = row["fn_gf"]

            # refine the GF label with the requested additional information
            additionals.each { |additional|
              case additional
              when "target"
                gf << "_" + row["target"]
              when "target_pos"
                gf << "_" + row["target_pos"]
              when "gframe"
                gf << "_" + allgfs.join("_")

              when "fgframe"
                # find the maximal frequent frame subsuming allgfs
                maxfgf = nil
                @frequent_gframes.each { |fgframe|
                  if fgframe.subsumed_by?(allgfs)
                    # fgframe is a subset of allgfs
                    if maxfgf.nil? or fgframe.length() > maxfgf.length()
                      maxfgf = fgframe
                    end
                  end
                }
                if maxfgf.nil?
                  # nothing there that fits:
                  # leave GF as is
                else
                  gf << "_" + maxfgf.join("_")
                end

              when "tmfframe"
                gf << "_" + tmfframe[tmf_target_key(row)]

              else
                raise "Don't know how to compute #{additional}"
              end
            }

            counts_gf[gf] += 1
          end

          # count the (gf, fe) pair; gf is nil when this row had no usable GF
          if row["gold"] != noval and gf
            counts_gffe[gf + " " + row["gold"]] += 1
          end
        } # each row of sentence
      } # each sentence of view

      # compute gf entropy
      # gfe_{fr}(gf) = \sum_{fe \in fes(fr)} -p(fe|gf) log_2 p(fe|gf)
      #
      # where p(fe|gf) = f(gf, fe) / f(gf)
      gf_entropy = Hash.new

      counts_gf.keys.each { |gf|
        gf_entropy[gf] = 0.0

        counts_fe.keys.each { |fe|
          if counts_gf[gf] > 0
            p_gf_fe = counts_gffe[gf + " " + fe].to_f / counts_gf[gf].to_f

            # get log_2 via log_10
            if p_gf_fe > 0.0
              gf_entropy[gf] -= p_gf_fe * Math.log10(p_gf_fe) * 3.32193
            end
          end
        } # each FE for this GF
      } # each GF (gf entropy)

      # compute FE confusability
      # c_{fr}(fe) = \sum_{gf \in gfs(fr)} p(gf|fe) gfe_{fr}(gf)
      #
      # where p(gf|fe) = f(gf, fe) / f(fe)
      counts_fe.keys.each { |fe|
        @confusability[frame + " " + fe] = 0.0

        counts_gf.keys.each { |gf|
          if counts_fe[fe] > 0
            p_fe_gf = counts_gffe[gf + " " + fe].to_f / counts_fe[fe].to_f

            @confusability[frame + " " + fe] += p_fe_gf * gf_entropy[gf]
          end
        } # each GF for this FE
      } # each FE (fe confusability)

      # remember counts for FEs and GF/FE pairs
      counts_fe.keys.each { |fe|
        @counts_fe_glob[frame + " " + fe] = counts_fe[fe]
      }
      counts_gffe.each_pair { |event, freq|
        @counts_gffe_glob[frame + " " + event] = freq
      }

      # omit rare FEs:
      # anything below 5 occurrences
      counts_fe.each_key { |fe|
        if counts_fe[fe] < 5
          @confusability.delete(frame + " " + fe)
        end
      }

      # compute overall frame confusability
      # omitting rare FEs with below 5 occurrences:
      #
      # c(fr) = sum_{fe \in fes(fr)} f(fe)/f(fr) * c_{fr}(fe)
      #       = \sum_{gf \in gfs(fr)} p(gf|fr) gfe_{fr}(gf)
      #
      # where p(gf|fr) = (sum_{fe\in fes(fr)} f(gf, fe)) / f(fr)
      counts_frame[frame] = 0
      counts_fe.each_value { |count|
        if count >= 5
          counts_frame[frame] += count
        end
      }
      @frame_confusability[frame] = 0.0
      counts_fe.each_pair { |fe, count|
        if count >= 5
          @frame_confusability[frame] += (count.to_f / counts_frame[frame].to_f) * @confusability[frame + " " + fe]
        end
      }
    } # each frame

    # compute overall confusability
    # c = \sum{fr \in frames} f(fr)/N * c(fr)
    #
    # where N is the number of FE occurrences overall
    counts_overall = 0
    counts_frame.each_value { |count|
      counts_overall += count
    }
    @overall_confusability = 0.0
    counts_frame.each_pair { |frame, count|
      @overall_confusability += (count.to_f / counts_overall.to_f) * @frame_confusability[frame]
    }
  end

  ###
  # return a copy of @counts_fe_glob, from which all fes with less than 5 occurrences are deleted
  def get_global_counts
    global_counts = @counts_fe_glob.clone
    global_counts.delete_if { |key, value| value < 5 }
    return global_counts
  end

  ###
  # compute sparseness statistics over the set of
  # base events used for computing the confusability
  # returns an array of length 4:
  # - number of events with freq 1
  # - number of events with freq 2
  # - number of events with freq 3-5
  # - number of events with freq > 5
  def counts()
    counts = [0, 0, 0, 0]
    @counts_gffe_glob.each_value { |freq|
      case freq
      when 1
        counts[0] += 1
      when 2
        counts[1] += 1
      when 3..5
        counts[2] += 1
      else
        counts[3] += 1
      end
    }
    return counts
  end

  ###
  # Marshal the computed statistics to the given file.
  def to_file(filename)
    begin
      file = File.new(filename, "w")
    rescue
      # FIX: error message previously read "#(unknown)" -- the filename
      # interpolation had been lost
      raise "Couldn't open file #{filename} for writing."
    end
    Marshal.dump({ "confusability" => @confusability,
                   "counts_fe_glob" => @counts_fe_glob,
                   "counts_gffe_glob" => @counts_gffe_glob,
                   "frame_confusability" => @frame_confusability,
                   "overall_confusability" => @overall_confusability
                 },
                 file)
    # FIX: close the handle so the marshaled data is actually flushed
    file.close
  end

  ###
  # Restore statistics previously written by to_file().
  def from_file(filename)
    begin
      file = File.new(filename)
    rescue
      # FIX: error message previously read "#(unknown)" -- the filename
      # interpolation had been lost
      raise "Couldn't open file #{filename} for reading."
    end
    hash = Marshal.load(file)
    # FIX: don't leak the file handle
    file.close
    @confusability = hash["confusability"]
    @counts_fe_glob = hash["counts_fe_glob"]
    @counts_gffe_glob = hash["counts_gffe_glob"]
    @frame_confusability = hash["frame_confusability"]
    @overall_confusability = hash["overall_confusability"]
  end
end
|