RubyGems - shalmaneser - Versions diffs - 0.0.1.alpha - Mend

shalmaneser 0.0.1.alpha

Files changed (138) hide show

data/.yardopts +8 -0
data/CHANGELOG.rdoc +0 -0
data/LICENSE.rdoc +0 -0
data/README.rdoc +0 -0
data/lib/common/AbstractSynInterface.rb +1227 -0
data/lib/common/BerkeleyInterface.rb +375 -0
data/lib/common/CollinsInterface.rb +1165 -0
data/lib/common/ConfigData.rb +694 -0
data/lib/common/Counter.rb +18 -0
data/lib/common/DBInterface.rb +48 -0
data/lib/common/EnduserMode.rb +27 -0
data/lib/common/Eval.rb +480 -0
data/lib/common/FixSynSemMapping.rb +196 -0
data/lib/common/FrPrepConfigData.rb +66 -0
data/lib/common/FrprepHelper.rb +1324 -0
data/lib/common/Graph.rb +345 -0
data/lib/common/ISO-8859-1.rb +24 -0
data/lib/common/ML.rb +186 -0
data/lib/common/Maxent.rb +215 -0
data/lib/common/MiniparInterface.rb +1388 -0
data/lib/common/Optimise.rb +195 -0
data/lib/common/Parser.rb +213 -0
data/lib/common/RegXML.rb +269 -0
data/lib/common/RosyConventions.rb +171 -0
data/lib/common/SQLQuery.rb +243 -0
data/lib/common/STXmlTerminalOrder.rb +194 -0
data/lib/common/SalsaTigerRegXML.rb +2347 -0
data/lib/common/SalsaTigerXMLHelper.rb +99 -0
data/lib/common/SleepyInterface.rb +384 -0
data/lib/common/SynInterfaces.rb +275 -0
data/lib/common/TabFormat.rb +720 -0
data/lib/common/Tiger.rb +1448 -0
data/lib/common/TntInterface.rb +44 -0
data/lib/common/Tree.rb +61 -0
data/lib/common/TreetaggerInterface.rb +303 -0
data/lib/common/headz.rb +338 -0
data/lib/common/option_parser.rb +13 -0
data/lib/common/ruby_class_extensions.rb +310 -0
data/lib/fred/Baseline.rb +150 -0
data/lib/fred/FileZipped.rb +31 -0
data/lib/fred/FredBOWContext.rb +863 -0
data/lib/fred/FredConfigData.rb +182 -0
data/lib/fred/FredConventions.rb +232 -0
data/lib/fred/FredDetermineTargets.rb +324 -0
data/lib/fred/FredEval.rb +312 -0
data/lib/fred/FredFeatureExtractors.rb +321 -0
data/lib/fred/FredFeatures.rb +1061 -0
data/lib/fred/FredFeaturize.rb +596 -0
data/lib/fred/FredNumTrainingSenses.rb +27 -0
data/lib/fred/FredParameters.rb +402 -0
data/lib/fred/FredSplit.rb +84 -0
data/lib/fred/FredSplitPkg.rb +180 -0
data/lib/fred/FredTest.rb +607 -0
data/lib/fred/FredTrain.rb +144 -0
data/lib/fred/PlotAndREval.rb +480 -0
data/lib/fred/fred.rb +45 -0
data/lib/fred/md5.rb +23 -0
data/lib/fred/opt_parser.rb +250 -0
data/lib/frprep/AbstractSynInterface.rb +1227 -0
data/lib/frprep/Ampersand.rb +37 -0
data/lib/frprep/BerkeleyInterface.rb +375 -0
data/lib/frprep/CollinsInterface.rb +1165 -0
data/lib/frprep/ConfigData.rb +694 -0
data/lib/frprep/Counter.rb +18 -0
data/lib/frprep/FNCorpusXML.rb +643 -0
data/lib/frprep/FNDatabase.rb +144 -0
data/lib/frprep/FixSynSemMapping.rb +196 -0
data/lib/frprep/FrPrepConfigData.rb +66 -0
data/lib/frprep/FrameXML.rb +513 -0
data/lib/frprep/FrprepHelper.rb +1324 -0
data/lib/frprep/Graph.rb +345 -0
data/lib/frprep/ISO-8859-1.rb +24 -0
data/lib/frprep/MiniparInterface.rb +1388 -0
data/lib/frprep/Parser.rb +213 -0
data/lib/frprep/RegXML.rb +269 -0
data/lib/frprep/STXmlTerminalOrder.rb +194 -0
data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
data/lib/frprep/SleepyInterface.rb +384 -0
data/lib/frprep/SynInterfaces.rb +275 -0
data/lib/frprep/TabFormat.rb +720 -0
data/lib/frprep/Tiger.rb +1448 -0
data/lib/frprep/TntInterface.rb +44 -0
data/lib/frprep/Tree.rb +61 -0
data/lib/frprep/TreetaggerInterface.rb +303 -0
data/lib/frprep/do_parses.rb +142 -0
data/lib/frprep/frprep.rb +686 -0
data/lib/frprep/headz.rb +338 -0
data/lib/frprep/one_parsed_file.rb +28 -0
data/lib/frprep/opt_parser.rb +94 -0
data/lib/frprep/ruby_class_extensions.rb +310 -0
data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
data/lib/rosy/DBMySQL.rb +146 -0
data/lib/rosy/DBSQLite.rb +280 -0
data/lib/rosy/DBTable.rb +239 -0
data/lib/rosy/DBWrapper.rb +176 -0
data/lib/rosy/ExternalConfigData.rb +58 -0
data/lib/rosy/FailedParses.rb +130 -0
data/lib/rosy/FeatureInfo.rb +242 -0
data/lib/rosy/GfInduce.rb +1115 -0
data/lib/rosy/GfInduceFeature.rb +148 -0
data/lib/rosy/InputData.rb +294 -0
data/lib/rosy/RosyConfigData.rb +115 -0
data/lib/rosy/RosyConfusability.rb +338 -0
data/lib/rosy/RosyEval.rb +465 -0
data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
data/lib/rosy/RosyFeaturize.rb +280 -0
data/lib/rosy/RosyInspect.rb +336 -0
data/lib/rosy/RosyIterator.rb +477 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
data/lib/rosy/RosyPruning.rb +165 -0
data/lib/rosy/RosyServices.rb +744 -0
data/lib/rosy/RosySplit.rb +232 -0
data/lib/rosy/RosyTask.rb +19 -0
data/lib/rosy/RosyTest.rb +826 -0
data/lib/rosy/RosyTrain.rb +232 -0
data/lib/rosy/RosyTrainingTestTable.rb +786 -0
data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
data/lib/rosy/View.rb +418 -0
data/lib/rosy/opt_parser.rb +379 -0
data/lib/rosy/rosy.rb +77 -0
data/lib/shalmaneser/version.rb +3 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +40 -0
data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +52 -0
data/test/functional/test_rosy.rb +20 -0
metadata +284 -0

data/lib/common/option_parser.rb ADDED Viewed

@@ -0,0 +1,13 @@
# -*- encoding: us-ascii -*-
# AB, 2010-11-25
# General parent class for option parsing.
# It is currently empty; the actual work is done by three separate classes:
# FRPrepOptionParser, RosyOptionParser and FredOptionParser.
# All of these classes inherit from OptionParser.
#--
# TODO: move the shared functionality into this parent class.
class OptionParser; end

data/lib/common/ruby_class_extensions.rb ADDED Viewed

@@ -0,0 +1,310 @@
+# Katrin Erk Oct 05
+#
+# useful extensions to standard classes
+require 'fileutils'
class String
  ###
  # Prefix test.
  #
  # returns: true if this string begins with other_string, false otherwise.
  # The empty string counts as a prefix of every string (the former
  # slice-based implementation wrongly answered false for that case).
  def startswith(other_string)
    start_with?(other_string)
  end

  ###
  # Suffix test.
  #
  # returns: true if this string ends with other_string, false otherwise.
  # The empty string counts as a suffix of every string.
  def endswith(other_string)
    end_with?(other_string)
  end
end
class File
  ########
  # check whether a given path exists,
  # and if it doesn't, make sure it is created.
  #
  # piece together the strings in 'pieces' to make the path,
  # appending "/" to all strings if necessary
  #
  # Exits with status 1 (via File.existing_dir) if the directory
  # is missing or inaccessible afterwards.
  #
  # returns: the path pieced together
  def File.new_dir(*pieces) # strings, to be pieced together
    dir_path, _whole_path = File.make_path(pieces, true)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)
    return dir_path
  end

  ########
  # same as new_dir, but last piece is a filename:
  # only the directory part is created, the file itself is not touched
  #
  # returns: the whole path (directory part plus filename)
  def File.new_filename(*pieces)
    dir_path, whole_path = File.make_path(pieces, false)
    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
    # check that all went well in creating the directory
    File.existing_dir(dir_path)
    return whole_path
  end

  #####
  # check whether a given path exists,
  # and report failure if it does not exist.
  #
  # piece together the strings in 'pieces' to make the path,
  # appending "/" to all strings if necessary
  #
  # On failure an error message is written to $stderr and the
  # process exits with status 1.
  #
  # returns: the path pieced together
  def File.existing_dir(*pieces) # strings
    dir_path, _whole_path = File.make_path(pieces, true)
    # File.directory? is false for nonexistent paths, so no separate
    # existence test is needed
    unless File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting."
      exit(1)
    end
    # the "executable" bit on a directory is what permits entering it
    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end
    return dir_path
  end

  ####
  # like existing_dir, but last bit is filename:
  # only the directory part is checked, the file itself need not exist
  #
  # returns: the whole path (directory part plus filename)
  def File.existing_filename(*pieces) # strings
    dir_path, whole_path = File.make_path(pieces, false)
    unless File.directory?(dir_path)
      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
      exit(1)
    end
    unless File.executable?(dir_path)
      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
      exit(1)
    end
    return whole_path
  end

  ####
  # piece together the strings in 'pieces' to make a path,
  # appending "/" to all but the last string if necessary
  #
  # if 'pieces' is already a string, take that as a one-piece path
  #
  # if is_dir is true, also append "/" to the last piece of the string
  #
  # nil entries in the directory part are reported on $stderr and skipped
  #
  # the resulting path is expanded: For example, initial
  # ~ is expanded to the setting of $HOME
  #
  # returns: pair of strings (directory_part, whole_path)
  #
  def File.make_path(pieces,      # string or array:string
                     is_dir = false) # Boolean: is the path a directory?
    pieces = [pieces] if pieces.kind_of? String

    # all pieces belong to the directory part, except for a trailing filename
    dir_pieces = is_dir ? pieces : pieces[0..-2]

    # start from a fresh mutable string rather than appending to a literal
    dir = "".dup
    dir_pieces.each { |piece|
      if piece.nil?
        # whoops, nil entry in name of path!
        $stderr.puts "File.make_path ERROR: nil for piece of path name."
        next
      end
      dir << piece
      dir << "/" unless piece.end_with?("/")
    }

    dir = File.expand_path(dir)
    # expand_path removes the final "/" again
    dir += "/" unless dir.end_with?("/")

    if is_dir
      return [dir, dir]
    else
      return [dir, dir + pieces[-1]]
    end
  end
end
+#############################################
class Array
  ###
  # interleave N arrays:
  # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
  # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
  #
  # if one array is longer than the other,
  # e.g. [a1...an], [b1,...,bm] with n> m
  # the result is
  # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
  # and analogously for m>n
  #
  # called without arguments, every element is wrapped in a
  # one-element array
  def interleave(*arrays)
    # include our own length in the list so that an empty 'arrays'
    # does not leave us comparing against nil
    len = ([length] + arrays.map { |a| a.length }).max
    Array.new(len) { |ix|
      [at(ix)] + arrays.map { |a| a[ix] }
    }
  end

  ###
  # prepend: prepend element(s) to array
  # because I can never remember which is 'shift'
  # and which is 'unshift'
  #
  # accepts any number of elements, like the built-in Array#prepend
  #
  # returns: this array
  def prepend(*elements)
    unshift(*elements)
  end

  # NOTE: no 'count' is defined here any more. The built-in Array#count
  # already counts the occurrences of an element when given one argument;
  # redefining it with a mandatory argument broke the built-in zero-argument
  # and block forms for every user of Array.

  ###
  # count the number of occurrences of
  # elements from list in this array
  def counts(list)
    count { |my_element| list.include? my_element }
  end

  ###
  # draw a random sample of size N
  # from this array
  #
  # without an argument: a single random element (nil for an
  # empty array), like the built-in Array#sample
  #
  # returns: nil for negative sizes, [] for size 0,
  # a copy of this array (in original order) if size is at least
  # the length of the array, and otherwise an array of 'size'
  # entries drawn from distinct positions
  def sample(size = nil)
    if size.nil?
      return empty? ? nil : at(rand(length))
    elsif size < 0
      return nil
    elsif size == 0
      return []
    elsif size >= length
      return self.clone
    end

    # one random rank per position (not per value), so that
    # equal elements are drawn independently of each other
    return sort_by { rand }.first(size)
  end
end
class Float
  ###
  # round a float to the given number of decimal points
  #
  # NaN and +/-Infinity cannot be rounded (Float#round raises
  # FloatDomainError for them) and are returned unchanged.
  #
  # returns: a Float
  def round_to_decpts(n)
    return self unless finite?

    scale = 10**n
    return (self * scale).round.to_f / scale
  end
end
+################
module EnumerableBool
  ###
  # And_{x \in X} block(x)
  #
  # Without a block the elements themselves are tested for truth
  # (the same default that big_sum uses).
  #
  # returns: false as soon as the block yields a false value for
  # some element, true otherwise (in particular for an empty collection)
  def big_and(&block)
    block ||= Proc.new { |x| x }
    each { |x|
      return false unless block.call(x)
    }
    return true
  end

  ###
  # Or_{x \in X} block(x)
  #
  # Without a block the elements themselves are tested for truth.
  #
  # returns: true as soon as the block yields a true value for
  # some element, false otherwise (in particular for an empty collection)
  def big_or(&block)
    block ||= Proc.new { |x| x }
    each { |x|
      return true if block.call(x)
    }
    return false
  end

  ###
  # Sum_{x \in X} block(x)
  #
  # init is the start value of the sum; the block results are added
  # to it with +. Without a block the elements themselves are summed.
  #
  # returns: the accumulated sum
  def big_sum(init = 0, &block)
    block ||= Proc.new { |x| x }
    sum = init
    each { |x|
      sum += block.call(x)
    }
    return sum
  end
end
+################
# Split the items of an enumerable into two bins (arrays):
# the first bin receives every item for which the block returns a
# true value, the second bin receives all remaining items.
# Both bins keep the items in iteration order.
#
# returns: pair [items_accepted, items_rejected]
module EnumerableDistribute
  def distribute(&block)
    accepted = []
    rejected = []
    each do |item|
      bin = block.call(item) ? accepted : rejected
      bin << item
    end
    [accepted, rejected]
  end
end
+#####################
# map with index:
# like map, but the block receives each item together with its
# position in the iteration (starting at 0).
#
# returns: array of the block's results
module MapWithIndex
  def map_with_index(&block)
    each_with_index.map { |item, position| block.call(item, position) }
  end
end
# Mix the new modules into Array right away.
# For other classes, do this when requiring StandardPkgExtensions.
# The modules are included one at a time, in this order.
[EnumerableBool, EnumerableDistribute, MapWithIndex].each { |mixin|
  Array.send(:include, mixin)
}

data/lib/fred/Baseline.rb ADDED Viewed

@@ -0,0 +1,150 @@
+# Baseline
+# Katrin Erk April 05
+#
+# baseline for WSD:
+# always assign most frequent sense
+# The baseline doesn't do binary classifiers.
+require "fred/FredConventions"
+require "fred/FredSplitPkg"
+require "fred/FredFeatures"
+require "fred/FredDetermineTargets"
class Baseline
  ###
  # new
  #
  # get splitlog dir (if any) along with everything else
  # because we are only evaluating the training data
  # at test time
  #
  # For every lemma returned by the Targets object, the training
  # answer keys are read (the "train" part of split split_id if one
  # is given, the full training data otherwise) and the sense that
  # occurs most often is remembered.
  #
  # Writes an error to $stderr and exits with status 1 if the
  # list of known targets cannot be read.
  def initialize(exp, # FredConfigData object
                 split_id = nil) # string: split ID
    @exp = exp
    @split_id = split_id
    # lemma of the classifier file most recently passed to read()
    @lemma = nil

    # NOTE(review): the resulting object was never used. The call is kept
    # only in case FredSplitPkg.new has side effects -- confirm and remove.
    FredSplitPkg.new(@exp) if @split_id

    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # for each lemma: remember prevalent sense
    @lemma_to_sense = Hash.new()
    @target_obj.get_lemmas().each { |lemmapos|
      @lemma_to_sense[lemmapos] = most_frequent_sense(lemmapos)
    }
  end

  ###
  # no training here
  def train(infilename)
  end

  ###
  # no classifiers to write
  def write(classifier_file)
  end

  ###
  # the baseline needs no classifier file, so it always "exists"
  def exists?(classifier_file)
    return true
  end

  ###
  # determine the lemma from the name of the classifier file
  # and remember it for the next call to apply()
  #
  # returns: true if a lemma could be determined, false (after a
  # warning on $stderr) otherwise
  def read(classifier_file)
    values = deconstruct_fred_classifier_filename(File.basename(classifier_file))
    @lemma = values["lemma"]
    if @lemma
      return true
    else
      $stderr.puts "Warning: couldn't determine lemma name in #{classifier_file}, skipping"
      return false
    end
  end

  ###
  # read a result file written by apply():
  # one label per line
  #
  # returns: array with one entry per line, each entry a one-element
  # list of [label, confidence] pairs, with confidence always 1.0
  #
  # raises RuntimeError if the file cannot be read
  def read_resultfile(filename)
    begin
      # readlines closes the file again, unlike a bare File.new
      lines = File.readlines(filename)
    rescue StandardError
      raise "Could not read baseline result file #{filename}"
    end
    return lines.map { |line| [[ line.chomp(), 1.0 ]] }
  end

  ###
  # write one label per line of the feature file infilename
  # to outfilename: the most frequent training sense of the
  # current lemma, or, if there is none, the experiment file's
  # "negsense" setting, or "NONE" if that is not set either
  #
  # Exits with status 1 if either file cannot be opened.
  # Both files are closed on every way out of this method.
  #
  # returns: true on success, false if read() did not
  # determine a lemma beforehand
  def apply(infilename, outfilename)
    out_f = open_or_exit(outfilename, "w",
                         "Error: cannot write to classification output file #{outfilename}.")
    begin
      f = open_or_exit(infilename, "r",
                       "Error: cannot read feature file #{infilename}.")
      begin
        # something went wrong in read()
        return false unless @lemma

        sense = sense_for_current_lemma
        f.each { |_line|
          out_f.puts sense
        }
      ensure
        f.close()
      end
    ensure
      out_f.close()
    end
    return true
  end

  private

  ###
  # read the training answer keys for lemmapos and
  # return the sense assigned most often (nil if there are no senses)
  def most_frequent_sense(lemmapos)
    if @split_id
      # read training split of answer keys
      answer_obj = AnswerKeyAccess.new(@exp, "train", lemmapos, "r", @split_id, "train")
    else
      # read full answer key file of training data
      answer_obj = AnswerKeyAccess.new(@exp, "train", lemmapos, "r")
    end

    count_senses = Hash.new(0)
    answer_obj.each { |_lemma, _pos, _ids, _sid, _senses_all, senses_this|
      # senses_this may include more than one sense for multi-label assignment
      senses_this.each { |sense|
        count_senses[sense] += 1
      }
    }
    return count_senses.keys().max_by { |sense| count_senses[sense] }
  end

  ###
  # label to assign for the current lemma: its most frequent sense,
  # else the "negsense" setting of the experiment file, else "NONE"
  def sense_for_current_lemma
    return @lemma_to_sense[@lemma] || @exp.get("negsense") || "NONE"
  end

  ###
  # open a file in the given mode; on failure print
  # error_message to $stderr and exit with status 1
  def open_or_exit(path, mode, error_message)
    return File.new(path, mode)
  rescue StandardError
    $stderr.puts error_message
    exit 1
  end
end