RubyGems - frprep - Versions diffs - 0.0.1.prealpha - Mend

frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

data/.yardopts +8 -0
data/CHANGELOG.rdoc +0 -0
data/LICENSE.rdoc +0 -0
data/README.rdoc +0 -0
data/lib/common/AbstractSynInterface.rb +1227 -0
data/lib/common/BerkeleyInterface.rb +375 -0
data/lib/common/CollinsInterface.rb +1165 -0
data/lib/common/ConfigData.rb +694 -0
data/lib/common/Counter.rb +18 -0
data/lib/common/DBInterface.rb +48 -0
data/lib/common/EnduserMode.rb +27 -0
data/lib/common/Eval.rb +480 -0
data/lib/common/FixSynSemMapping.rb +196 -0
data/lib/common/FrPrepConfigData.rb +66 -0
data/lib/common/FrprepHelper.rb +1324 -0
data/lib/common/Graph.rb +345 -0
data/lib/common/ISO-8859-1.rb +24 -0
data/lib/common/ML.rb +186 -0
data/lib/common/Maxent.rb +215 -0
data/lib/common/MiniparInterface.rb +1388 -0
data/lib/common/Optimise.rb +195 -0
data/lib/common/Parser.rb +213 -0
data/lib/common/RegXML.rb +269 -0
data/lib/common/RosyConventions.rb +171 -0
data/lib/common/SQLQuery.rb +243 -0
data/lib/common/STXmlTerminalOrder.rb +194 -0
data/lib/common/SalsaTigerRegXML.rb +2347 -0
data/lib/common/SalsaTigerXMLHelper.rb +99 -0
data/lib/common/SleepyInterface.rb +384 -0
data/lib/common/SynInterfaces.rb +275 -0
data/lib/common/TabFormat.rb +720 -0
data/lib/common/Tiger.rb +1448 -0
data/lib/common/TntInterface.rb +44 -0
data/lib/common/Tree.rb +61 -0
data/lib/common/TreetaggerInterface.rb +303 -0
data/lib/common/headz.rb +338 -0
data/lib/common/option_parser.rb +13 -0
data/lib/common/ruby_class_extensions.rb +310 -0
data/lib/fred/Baseline.rb +150 -0
data/lib/fred/FileZipped.rb +31 -0
data/lib/fred/FredBOWContext.rb +863 -0
data/lib/fred/FredConfigData.rb +182 -0
data/lib/fred/FredConventions.rb +232 -0
data/lib/fred/FredDetermineTargets.rb +324 -0
data/lib/fred/FredEval.rb +312 -0
data/lib/fred/FredFeatureExtractors.rb +321 -0
data/lib/fred/FredFeatures.rb +1061 -0
data/lib/fred/FredFeaturize.rb +596 -0
data/lib/fred/FredNumTrainingSenses.rb +27 -0
data/lib/fred/FredParameters.rb +402 -0
data/lib/fred/FredSplit.rb +84 -0
data/lib/fred/FredSplitPkg.rb +180 -0
data/lib/fred/FredTest.rb +607 -0
data/lib/fred/FredTrain.rb +144 -0
data/lib/fred/PlotAndREval.rb +480 -0
data/lib/fred/fred.rb +45 -0
data/lib/fred/md5.rb +23 -0
data/lib/fred/opt_parser.rb +250 -0
data/lib/frprep/AbstractSynInterface.rb +1227 -0
data/lib/frprep/Ampersand.rb +37 -0
data/lib/frprep/BerkeleyInterface.rb +375 -0
data/lib/frprep/CollinsInterface.rb +1165 -0
data/lib/frprep/ConfigData.rb +694 -0
data/lib/frprep/Counter.rb +18 -0
data/lib/frprep/FNCorpusXML.rb +643 -0
data/lib/frprep/FNDatabase.rb +144 -0
data/lib/frprep/FixSynSemMapping.rb +196 -0
data/lib/frprep/FrPrepConfigData.rb +66 -0
data/lib/frprep/FrameXML.rb +513 -0
data/lib/frprep/FrprepHelper.rb +1324 -0
data/lib/frprep/Graph.rb +345 -0
data/lib/frprep/ISO-8859-1.rb +24 -0
data/lib/frprep/MiniparInterface.rb +1388 -0
data/lib/frprep/Parser.rb +213 -0
data/lib/frprep/RegXML.rb +269 -0
data/lib/frprep/STXmlTerminalOrder.rb +194 -0
data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
data/lib/frprep/SleepyInterface.rb +384 -0
data/lib/frprep/SynInterfaces.rb +275 -0
data/lib/frprep/TabFormat.rb +720 -0
data/lib/frprep/Tiger.rb +1448 -0
data/lib/frprep/TntInterface.rb +44 -0
data/lib/frprep/Tree.rb +61 -0
data/lib/frprep/TreetaggerInterface.rb +303 -0
data/lib/frprep/do_parses.rb +142 -0
data/lib/frprep/frprep.rb +686 -0
data/lib/frprep/headz.rb +338 -0
data/lib/frprep/one_parsed_file.rb +28 -0
data/lib/frprep/opt_parser.rb +94 -0
data/lib/frprep/ruby_class_extensions.rb +310 -0
data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
data/lib/rosy/DBMySQL.rb +146 -0
data/lib/rosy/DBSQLite.rb +280 -0
data/lib/rosy/DBTable.rb +239 -0
data/lib/rosy/DBWrapper.rb +176 -0
data/lib/rosy/ExternalConfigData.rb +58 -0
data/lib/rosy/FailedParses.rb +130 -0
data/lib/rosy/FeatureInfo.rb +242 -0
data/lib/rosy/GfInduce.rb +1115 -0
data/lib/rosy/GfInduceFeature.rb +148 -0
data/lib/rosy/InputData.rb +294 -0
data/lib/rosy/RosyConfigData.rb +115 -0
data/lib/rosy/RosyConfusability.rb +338 -0
data/lib/rosy/RosyEval.rb +465 -0
data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
data/lib/rosy/RosyFeaturize.rb +280 -0
data/lib/rosy/RosyInspect.rb +336 -0
data/lib/rosy/RosyIterator.rb +477 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
data/lib/rosy/RosyPruning.rb +165 -0
data/lib/rosy/RosyServices.rb +744 -0
data/lib/rosy/RosySplit.rb +232 -0
data/lib/rosy/RosyTask.rb +19 -0
data/lib/rosy/RosyTest.rb +826 -0
data/lib/rosy/RosyTrain.rb +232 -0
data/lib/rosy/RosyTrainingTestTable.rb +786 -0
data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
data/lib/rosy/View.rb +418 -0
data/lib/rosy/opt_parser.rb +379 -0
data/lib/rosy/rosy.rb +77 -0
data/lib/shalmaneser/version.rb +3 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +40 -0
data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +52 -0
data/test/functional/test_rosy.rb +20 -0
metadata +270 -0

data/lib/common/option_parser.rb ADDED

@@ -0,0 +1,13 @@
+# -*- encoding: us-ascii -*-
+# AB, 2010-11-25
+# It is a general class for parsing options.
+# It is now empty; we are implementing three different classes:
+# FRPrepOptionParser, RosyOptionParser and FredOptionParser.
+# All classes above inherit from OptionParser.
+#--
+# TODO: move the functionality to the parent class.
+# NOTE(review): Ruby's standard library (optparse) also defines a top-level
+# OptionParser class; if that library is loaded, this declaration reopens it
+# instead of creating a new class -- confirm that this is intended.
+class OptionParser
+end

data/lib/common/ruby_class_extensions.rb ADDED

@@ -0,0 +1,310 @@
+# Katrin Erk Oct 05
+#
+# useful extensions to standard classes
+require 'fileutils'
+class String
+  def startswith(other_string)
+    self[0..other_string.length() - 1] == other_string
+  end
+  def endswith(other_string)
+    not(other_string.length() > self.length()) and
+        self[self.length() - other_string.length()..-1] == other_string
+  end
+end
+class File
+  ########
+  # check whether a given directory exists,
+  # and if it doesn't, make sure it is created
+  # (FileUtils.mkdir_p also creates missing parent directories).
+  #
+  # piece together the strings in 'pieces' to make the path,
+  # appending "/" to all strings if necessary
+  #
+  # exits the process (via File.existing_dir) if the directory
+  # is still missing or cannot be accessed afterwards
+  #
+  # returns: the path pieced together
+  def File.new_dir(*pieces) # strings, to be pieced together
+    dir_path = File.make_path(pieces, true).first
+    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2
+    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
+    # check that all went well in creating the directory
+    File.existing_dir(dir_path)
+    return dir_path
+  end
+  ########
+  # same as new_dir, but the last piece is a filename:
+  # only the directory part is created, the file itself is not touched
+  #
+  # returns: the whole path (directory plus filename)
+  def File.new_filename(*pieces)
+    dir_path, whole_path = File.make_path(pieces, false)
+    FileUtils.mkdir_p(dir_path) unless File.exist?(dir_path)
+    # check that all went well in creating the directory
+    File.existing_dir(dir_path)
+    return whole_path
+  end
+  #####
+  # check whether a given directory exists,
+  # and report failure if it does not exist.
+  #
+  # piece together the strings in 'pieces' to make the path,
+  # appending "/" to all strings if necessary
+  #
+  # prints an error to stderr and exits with status 1 if the path
+  # is not a directory or File.executable? is false for it
+  #
+  # returns: the path pieced together
+  def File.existing_dir(*pieces) # strings
+    dir_path = File.make_path(pieces, true).first
+    # File.directory? is false for a missing path,
+    # so no separate existence test is needed
+    unless File.directory?(dir_path)
+      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting."
+      exit(1)
+    end
+    unless File.executable?(dir_path)
+      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
+      exit(1)
+    end
+    return dir_path
+  end
+  ####
+  # like existing_dir, but the last piece is a filename.
+  # Only the directory part is checked, not the file itself.
+  #
+  # returns: the whole path (directory plus filename)
+  def File.existing_filename(*pieces) # strings
+    dir_path, whole_path = File.make_path(pieces, false)
+    unless File.directory?(dir_path)
+      $stderr.puts "Error: Directory #{dir_path} doesn't exist. Exiting"
+      exit(1)
+    end
+    unless File.executable?(dir_path)
+      $stderr.puts "Error: Cannot access directory #{dir_path}. Exiting."
+      exit(1)
+    end
+    return whole_path
+  end
+  ####
+  # piece together the strings in 'pieces' to make a path,
+  # appending "/" to all but the last string if necessary
+  #
+  # if 'pieces' is already a string, take that as a one-piece path
+  #
+  # if is_dir is true, also append "/" to the last piece of the string
+  #
+  # nil pieces in the directory part are reported on stderr and skipped;
+  # a nil filename piece (is_dir false) is not handled and makes the final
+  # string concatenation raise a TypeError
+  #
+  # the resulting path is expanded: For example, initial
+  # ~ is expanded to the setting of $HOME
+  #
+  # returns: pair of strings (directory_part, whole_path);
+  # both are the same string if is_dir is true
+  #
+  def File.make_path(pieces,      # string or array:string
+                     is_dir = false) # Boolean: is the path a directory?
+    pieces = [pieces] if pieces.kind_of? String
+    # everything but the last piece belongs to the directory part,
+    # unless the whole path is a directory
+    dir_pieces = is_dir ? pieces : pieces[0..-2]
+    dir = ""
+    dir_pieces.each { |piece|
+      if piece.nil?
+        # whoops, nil entry in name of path!
+        $stderr.puts "File.make_path ERROR: nil for piece of path name."
+        next
+      end
+      dir << piece
+      dir << "/" unless piece =~ /\/$/
+    }
+    dir = File.expand_path(dir)
+    # expand_path removes the final "/" again
+    dir = dir + "/" unless dir =~ /\/$/
+    if is_dir
+      return [dir, dir]
+    else
+      return [dir, dir + pieces[-1]]
+    end
+  end
+end
+#############################################
+class Array
+  ###
+  # interleave N arrays:
+  # given arrays [a1... an], [b1,...,bn], ..[z1, ...,zn]
+  # return [[a1,b1, .., z1]...,[an,bn, .., zn]]
+  #
+  # if one array is longer than the other,
+  # e.g. [a1...an], [b1,...,bm] with n> m
+  # the result is
+  # [[a1,b1],...[am, bm], [am+1, nil], ..., [an, nil]]
+  # and analogously for m>n
+  def interleave(*arrays)
+    len = [length(), arrays.map { |a| a.length() }.max()].max()
+    (0..len-1).to_a.map { |ix|
+      [at(ix)] + arrays.map { |a| a[ix] }
+    }
+  end
+  ###
+  # prepend: prepend element to array
+  # because I can never remember which is 'shift'
+  # and which is 'unshift'
+  def prepend(element)
+    unshift(element)
+  end
+  ###
+  # count the number of occurrences of element in this array
+  def count(element)
+    num = 0
+    each { |my_element|
+      if my_element == element
+	num += 1
+      end
+    }
+    return num
+  end
+  ###
+  # count the number of occurrences of
+  # elements from list in this array
+  def counts(list)
+    num = 0
+    each { |my_element|
+      if list.include? my_element
+	num += 1
+      end
+    }
+    return num
+  end
+  ###
+  # draw a random sample of size N
+  # from this array
+  def sample(size)
+    if size < 0
+      return nil
+    elsif size == 0
+      return []
+    elsif size >= length()
+      return self.clone()
+    end
+    rank = Hash.new()
+    each { |my_element|
+      rank[my_element] = rand()
+    }
+    return self.sort { |a, b| rank[a] <=> rank[b] }[0..size-1]
+  end
+end
+class Float
+  ###
+  # round a float to the given number of decimal points
+  def round_to_decpts(n)
+    if self.nan?
+      return self
+    else
+      return (self * 10**n).round.to_f / 10**n
+    end
+  end
+end
+################
+module EnumerableBool
+  ###
+  # And_{x \in X} block(x)
+  def big_and(&block)
+    each { |x|
+      unless block.call(x)
+	return false
+      end
+    }
+    return true
+  end
+  ###
+  # Or_{x \in X} block(x)
+  def big_or(&block)
+    each { |x|
+      if block.call(x)
+	return true
+      end
+    }
+    return false
+  end
+  ###
+  # Sum_{x \in X} block(x)
+  def big_sum(init = 0, &block)
+    sum = init
+    unless block_given?
+      block = Proc.new { |x| x}
+    end
+    each { |x|
+      sum += block.call(x)
+    }
+    return sum
+  end
+end
+################
+# Given an enumerable, distribute its items into two bins (arrays)
+# depending on whether the block returns true
+module EnumerableDistribute
+  def distribute(&block)
+    retv1 = Array.new
+    retv2 = Array.new
+    each { |x|
+      if block.call(x)
+        retv1 << x
+      else
+        retv2 << x
+      end
+    }
+    return [retv1, retv2]
+  end
+end
+#####################
+# map with index
+module MapWithIndex
+  def map_with_index(&block)
+    retv = Array.new
+    each_with_index { |x, index|
+      retv << block.call(x, index)
+    }
+    return retv
+  end
+end
+# include new Mixins into array already.
+# for other classes, do this when requiring StandardPkgExtensions
+#
+# After this, every Array responds to big_and, big_or, big_sum,
+# distribute and map_with_index.
+class Array
+  include EnumerableBool
+  include EnumerableDistribute
+  include MapWithIndex
+end

data/lib/fred/Baseline.rb ADDED

@@ -0,0 +1,150 @@
+# Baseline
+# Katrin Erk April 05
+#
+# baseline for WSD:
+# always assign most frequent sense
+# The baseline doesn't do binary classifiers.
+require "fred/FredConventions"
+require "fred/FredSplitPkg"
+require "fred/FredFeatures"
+require "fred/FredDetermineTargets"
+class Baseline
+  ###
+  # new
+  #
+  # get splitlog dir (if any) along with everything else
+  # because we are only evaluating the training data
+  # at test time
+  #
+  # For every known lemma, the training answer keys are read and the
+  # most frequent sense is stored in @lemma_to_sense.
+  # Exits the process with status 1 if the list of targets is not okay.
+  #
+  def initialize(exp, # FredConfigData object
+                 split_id = nil) # string: split ID
+    @exp = exp
+    @split_id = split_id
+    # for each lemma: remember prevalent sense
+    @lemma_to_sense = Hash.new()
+    if @split_id
+      # NOTE(review): the package object is never used below; the
+      # constructor call is kept in case it has side effects -- confirm
+      FredSplitPkg.new(@exp)
+    end
+    # iterate through lemmas
+    @target_obj = Targets.new(@exp, nil, "r")
+    unless @target_obj.targets_okay
+      # error during initialization
+      $stderr.puts "Error: Could not read list of known targets, bailing out."
+      exit 1
+    end
+    @target_obj.get_lemmas().each { |lemmapos|
+      if @split_id
+        # read training split of answer keys
+        answer_obj = AnswerKeyAccess.new(@exp, "train", lemmapos, "r", @split_id, "train")
+      else
+        # read full answer key file of training data
+        answer_obj = AnswerKeyAccess.new(@exp, "train", lemmapos, "r")
+      end
+      count_senses = Hash.new(0)
+      answer_obj.each { |lemma, pos, ids, sid, senses_all, senses_this|
+        # senses_this may include more than one sense for multi-label assignment
+        senses_this.each { |sense|
+          count_senses[sense] += 1
+        }
+      }
+      # most frequent sense; nil if no sense was seen for this lemma
+      @lemma_to_sense[lemmapos] = count_senses.keys().max_by { |sense|
+        count_senses[sense]
+      }
+    }
+    # set by read(), used by apply()
+    @lemma = nil
+  end
+  ###
+  # training is a no-op for the baseline
+  def train(infilename)
+    # no training here
+  end
+  ###
+  # writing is a no-op for the baseline
+  def write(classifier_file)
+    # no classifiers to write
+  end
+  ###
+  # returns: always true, since the baseline needs no classifier file
+  def exists?(classifier_file)
+    return true
+  end
+  ###
+  # determine the lemma from the name of the classifier file
+  # and remember it for the next call to apply()
+  #
+  # returns: true if a lemma could be determined, false (after a
+  # warning on stderr) otherwise
+  def read(classifier_file)
+    values = deconstruct_fred_classifier_filename(File.basename(classifier_file))
+    @lemma = values["lemma"]
+    if @lemma
+      return true
+    else
+      $stderr.puts "Warning: couldn't determine lemma name in #{classifier_file}, skipping"
+      return false
+    end
+  end
+  ###
+  # read a result file written by apply()
+  #
+  # returns: array with one entry per line of the file,
+  # each entry of the form [[sense_label, 1.0]]
+  #
+  # raises a RuntimeError if the file cannot be opened
+  def read_resultfile(filename)
+    begin
+      f = File.new(filename)
+    rescue
+      raise "Could not read baseline result file #{filename}"
+    end
+    begin
+      return f.map { |line| [[ line.chomp(), 1.0 ]] }
+    ensure
+      # the handle used to be left open
+      f.close()
+    end
+  end
+  ###
+  # write the baseline sense of the current lemma to outfilename,
+  # once for every line of infilename
+  #
+  # The sense is the most frequent training sense of the lemma set by
+  # read(); failing that the "negsense" setting of the experiment file,
+  # and failing that "NONE".
+  #
+  # Exits the process with status 1 if either file cannot be opened.
+  #
+  # returns: true on success, false if read() did not determine a lemma
+  def apply(infilename, outfilename)
+    # open input and output file
+    begin
+      out_f = File.new(outfilename, "w")
+    rescue
+      $stderr.puts "Error: cannot write to classification output file #{outfilename}."
+      exit 1
+    end
+    begin
+      f = File.new(infilename)
+    rescue
+      $stderr.puts "Error: cannot read feature file #{infilename}."
+      exit 1
+    end
+    begin
+      unless @lemma
+        # something went wrong in read()
+        return false
+      end
+      # do we have a sense for this?
+      # if not: assign "NONE" (or whatever the null label is here)
+      sense = @lemma_to_sense[@lemma] || @exp.get("negsense") || "NONE"
+      f.each { |line|
+        out_f.puts sense
+      }
+      return true
+    ensure
+      # close both handles on every path; the early 'return false'
+      # above used to leak them
+      out_f.close()
+      f.close()
+    end
+  end
+end