shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/configuration/config_data.rb +457 -0
  7. data/lib/configuration/config_format_element.rb +210 -0
  8. data/lib/configuration/configuration_error.rb +15 -0
  9. data/lib/configuration/external_config_data.rb +56 -0
  10. data/lib/configuration/frappe_config_data.rb +134 -0
  11. data/lib/configuration/fred_config_data.rb +199 -0
  12. data/lib/configuration/rosy_config_data.rb +126 -0
  13. data/lib/db/db_interface.rb +50 -0
  14. data/lib/db/db_mysql.rb +141 -0
  15. data/lib/db/db_sqlite.rb +280 -0
  16. data/lib/db/db_table.rb +237 -0
  17. data/lib/db/db_view.rb +416 -0
  18. data/lib/db/db_wrapper.rb +175 -0
  19. data/lib/db/select_table_and_columns.rb +10 -0
  20. data/lib/db/sql_query.rb +243 -0
  21. data/lib/definitions.rb +19 -0
  22. data/lib/eval.rb +482 -0
  23. data/lib/ext/maxent/Classify.class +0 -0
  24. data/lib/ext/maxent/Train.class +0 -0
  25. data/lib/external_systems.rb +251 -0
  26. data/lib/framenet_format/fn_corpus_aset.rb +209 -0
  27. data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
  28. data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
  29. data/lib/framenet_format/fn_database.rb +143 -0
  30. data/lib/framenet_format/frame_xml_file.rb +104 -0
  31. data/lib/framenet_format/frame_xml_sentence.rb +411 -0
  32. data/lib/logging.rb +25 -0
  33. data/lib/ml/classifier.rb +189 -0
  34. data/lib/ml/mallet.rb +236 -0
  35. data/lib/ml/maxent.rb +229 -0
  36. data/lib/ml/optimize.rb +195 -0
  37. data/lib/ml/timbl.rb +140 -0
  38. data/lib/monkey_patching/array.rb +82 -0
  39. data/lib/monkey_patching/enumerable_bool.rb +24 -0
  40. data/lib/monkey_patching/enumerable_distribute.rb +18 -0
  41. data/lib/monkey_patching/file.rb +131 -0
  42. data/lib/monkey_patching/subsumed.rb +24 -0
  43. data/lib/ruby_class_extensions.rb +4 -0
  44. data/lib/salsa_tiger_xml/corpus.rb +24 -0
  45. data/lib/salsa_tiger_xml/fe_node.rb +98 -0
  46. data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
  47. data/lib/salsa_tiger_xml/frame_node.rb +145 -0
  48. data/lib/salsa_tiger_xml/graph_node.rb +347 -0
  49. data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
  50. data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
  51. data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
  52. data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
  53. data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
  54. data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
  55. data/lib/salsa_tiger_xml/sem_node.rb +58 -0
  56. data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
  57. data/lib/salsa_tiger_xml/syn_node.rb +169 -0
  58. data/lib/salsa_tiger_xml/tree_node.rb +59 -0
  59. data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
  60. data/lib/salsa_tiger_xml/usp_node.rb +72 -0
  61. data/lib/salsa_tiger_xml/xml_node.rb +163 -0
  62. data/lib/shalmaneser/lib.rb +1 -0
  63. data/lib/tabular_format/fn_tab_format_file.rb +38 -0
  64. data/lib/tabular_format/fn_tab_frame.rb +67 -0
  65. data/lib/tabular_format/fn_tab_sentence.rb +169 -0
  66. data/lib/tabular_format/tab_format_file.rb +91 -0
  67. data/lib/tabular_format/tab_format_named_args.rb +184 -0
  68. data/lib/tabular_format/tab_format_sentence.rb +119 -0
  69. data/lib/value_restriction.rb +49 -0
  70. metadata +131 -0
data/lib/ml/maxent.rb
@@ -0,0 +1,229 @@
+ # wrapper script for the OpenNLP Maxent classifier
+
+ # sp July 2007
+
+
+ require "tempfile"
+ require 'fileutils'
+
+ class Maxent
+   ###
+   def initialize(program_path, parameters)
+
+     # @note AB: <parameters> is an Array with the last part of the
+     #   line from the experiment file; it should contain the path to our
+     #   java wrappers, but we don't want it.
+     #   Since the presence of this part is checked only here we
+     #   suppose it obsolete and set this path manually here.
+     # if parameters.empty?
+     #   puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
+     #   puts "I got only the program path."
+     #   Kernel.exit
+     # end
+     # @interface_path = parameters.first
+
+     # @note AB: Setting the path manually.
+     #   It assumes <Maxent.rb> is in <lib/common> and
+     #   <Classify.class> is in <lib/ext/maxent>.
+     # @todo AB: This assumption should be changed. ENV[]???
+     @interface_path = File.expand_path('../ext/maxent', File.dirname(__FILE__))
+
+     @maxentpath = program_path
+
+     unless @maxentpath =~ /\/$/
+       @maxentpath = @maxentpath + "/"
+     end
+
+     # classpath for maxent
+
+     @cp = "#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar:#{ENV["CLASSPATH"]}"
+
+   end
+
+   ###
+   #
+   # write classifier to training directory...
+   def train(infilename,classifier_file)
+     trainfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
+     infile = File.new(infilename)
+     c45_to_maxent(infile,trainfile) # training data in csv format
+     infile.close
+     trainfile.close
+
+     if classifier_file
+       @classifier_location = classifier_file
+     else
+       @classifier_location = trainfile.path+"Model.bin.gz"
+     end
+
+     @classifier_location = enforce_compact_storage(@classifier_location)
+
+     # store model in binary, gzipped form...
+     command = ["cd #{@interface_path}; ",
+                #"/usr/lib/jvm/java-1.7.0/bin/java -cp #{@cp} -Xmx1000m Train",
+                "java -cp #{@cp} -Xmx1000m Train",
+                trainfile.path,
+                @classifier_location].join(" ")
+     # remember location
+     unless successfully_run(command)
+       return false
+     end
+     trainfile.close(true)
+   end
+
+   def write(classifier_file)
+
+     classifier_file = enforce_compact_storage(classifier_file)
+
+     if @classifier_location
+       @classifier_location = enforce_compact_storage(@classifier_location)
+       %x{cp #{@classifier_location} #{classifier_file}} # store classifier
+       # File.chmod(0664,classifier_file+".classifier")
+     else
+       $stderr.puts "Maxent error: no classifier trained or read, cannot write classifier file #{classifier_file}."
+       return nil
+     end
+   end
+
+   ###
+   def exists?(classifier_file)
+     classifier_file = enforce_compact_storage(classifier_file)
+     return FileTest.exists?(classifier_file)
+   end
+
+   ###
+   # return true iff reading the classifier has had success
+   def read(classifier_file)
+
+     classifier_file = enforce_compact_storage(classifier_file)
+
+     if exists?(classifier_file)
+       @classifier_location = classifier_file
+       return true
+     else
+       $stderr.puts "No classifier file "+classifier_file
+       return false
+     end
+   end
+
+   ###
+   def apply(infilename,outfilename)
+
+     unless @classifier_location
+       return false
+     end
+     @classifier_location = enforce_compact_storage(@classifier_location)
+
+     testfile = Tempfile.new(File.basename(infilename)+".maxenttrain")
+
+     infile = File.new(infilename)
+     c45_to_maxent(infile,testfile) # test data in C4.5 (csv) format
+     infile.close
+     testfile.close
+
+     command = ["cd #{@interface_path}; ",
+                #"/usr/lib/jvm/java-1.7.0/bin/java -cp #{@cp} -Xmx1000m Classify ",
+                "java -cp #{@cp} -Xmx1000m Classify ",
+                testfile.path,
+                @classifier_location,
+                ">",
+                outfilename].join(" ")
+
+     # classify
+     unless successfully_run(command)
+       return false
+     end
+
+     # some error in classification
+     unless FileTest.exists?(outfilename)
+       return false
+     end
+
+     # no errors = success
+     testfile.close(true)
+     return true
+   end
+
+   #####
+   # format of Maxent result file:
+   # <best label>[<confidence>] <secondbest_label>[<confidence>] ....
+   #
+   # returns a list of instance_results
+   # where an instance_result is a list of pairs [label, confidence]
+   # where the pairs are sorted by confidence
+   def read_resultfile(filename)
+     begin
+       f = File.new(filename)
+     rescue
+       $stderr.puts "Maxent error: cannot read Maxent result file #{filename}."
+       return nil
+     end
+
+     retv = []
+
+     f.each do |line|
+       line_results = []
+       pieces = line.split # split at whitespace
+
+       pieces.each {|piece|
+         piece =~ /(\S+)\[(.+)\]/
+         label = $1
+         confidence = $2.to_f
+
+         line_results << [label, confidence]
+       }
+
+       # sort: most confident label first
+       retv << line_results.sort {|a,b| b[1] <=> a[1]}
+     end
+
+     f.close
+
+     retv
+   end
+
+
+   ###################################
+   private
+
+   ###
+   # produce input file for maxent learner: make attribute-value pairs
+   # where attribute == featureX=
+   def c45_to_maxent(inpipe,outpipe)
+     while (line = inpipe.gets)
+       line.chomp!
+       la = line.split(",")
+       label = la.pop
+       if label[-1,1] == "."
+         label.chop!
+       end
+       la.each_index {|i|
+         la[i] = i.to_s + "=" + la[i]
+       }
+       la.push(label)
+       outpipe.puts la.join(" ")
+     end
+   end
+
+   # since the OpenNLP MaxEnt system determines storage based on filename,
+   # make sure that all models are stored internally as binary, gzipped files.
+
+   def enforce_compact_storage(filename)
+     if filename =~ /Model.bin.gz/
+       return filename
+     else
+       return filename+"Model.bin.gz"
+     end
+   end
+
+   ###
+   def successfully_run(command)
+     retv = Kernel.system(command)
+     unless retv
+       $stderr.puts "Error running classifier. Continuing."
+       $stderr.puts "Offending command: "+command
+       # exit 1
+     end
+     return retv
+   end
+ end
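
For orientation, a minimal, hedged usage sketch of the Maxent wrapper above. The program path, file names and labels are hypothetical placeholders; inside the gem the wrapper is apparently driven through the classifier abstraction in data/lib/ml/classifier.rb rather than called directly.

    # Hypothetical paths and file names, for illustration only.
    maxent = Maxent.new("/opt/maxent-2.4.0", [])

    # c45_to_maxent (private) rewrites each C4.5 line "val1,val2,LabelA."
    # into "0=val1 1=val2 LabelA" before handing it to the Java wrapper.
    maxent.train("train.c45", "frameModel.bin.gz")   # model names get "Model.bin.gz" enforced
    maxent.apply("test.c45", "test.maxent.out")      # one result line per test instance
    maxent.read_resultfile("test.maxent.out")
    # => [[["LabelA", 0.83], ["LabelB", 0.17]], ...]
    #    one [label, confidence] list per instance, most confident label first
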
data/lib/ml/optimize.rb
@@ -0,0 +1,195 @@
+ # sp 29 07 04
+ # "optimise" c4.5 files by replacing all feature values which only
+ # occur with one label by a new, common value.
+ #
+ # two modes of operation:
+ # optimise <file>             -- optimise file and store optimisations in <file>.opts
+ # optimise <file> <file.opts> -- apply optimisation from file.opts to file
+
+ class Optimise
+
+   def initialize
+     @ready = false
+   end
+
+   def init_from_data(infile) # find new optimisation
+
+     STDERR.puts "[Optimise] computing new feature optimisation"
+
+     infile = File.new(infile)
+     labels = []
+     features = nil
+     @replacements = [] # for each feature, store the list of replacements
+
+     # read data from infile into hash and initialise replacements array
+     while (line = infile.gets)
+       f_l = line.chomp.split(",")
+
+       if features.nil? # first line: initialisation
+         features = [] # for each feature: array of feature values from file
+         f_l.each_index {|i|
+           features[i] = []
+           @replacements[i] = {}
+         }
+       end
+       labels << f_l.pop
+       f_l.each_index {|i|
+         features[i] << f_l[i]
+       }
+     end
+     infile.close
+
+     features.each_index {|findex| # traverse all features
+
+       # for each feature *value*, find all label indices
+
+       fvalues = features[findex]
+
+       fval_to_label = {} # record fval -> label mappings
+                          # no label : nil
+                          # one label: <label>
+                          # two labels: false
+
+       fvalues.each_index {|inst_idx|
+         label = labels[inst_idx] # current label
+         fval = fvalues[inst_idx] # current feature value
+         seen_label = fval_to_label[fval] # previously seen label
+         if seen_label.nil?
+           fval_to_label[fval] = label
+         elsif seen_label and seen_label != label
+           fval_to_label[fval] = false
+         end
+       } # at the end, all fvals should be mapped to either <label> or false
+
+       # construct new feature value names
+
+       new_fvals = {}
+       labels.each {|label|
+         new_fvals[label] = "f"+findex.to_s+"_"+label.gsub(/\./,"")
+       }
+
+       # record all feature values for which we have only seen one label in @replacements
+
+       fval_to_label.each_pair {|fval,label|
+         if fval == "[U]"
+           puts "[U]: "+label.to_s+" "+new_fvals[label].to_s
+         end
+         if label
+           # STDERR.puts "replacement of "+fval+" by "+new_fvals[label]
+           @replacements[findex][fval] = new_fvals[label]
+         end
+       }
+
+       # fvalues = features[findex]
+
+       # l_to_v = {} # label -> array of feature values
+       # v_to_l = {} # feature value -> array of labels
+
+       # fvalues.each_index {|inst| # traverse all instances
+       #   fval = fvalues[inst]
+       #   label = labels[inst]
+
+
+       #   unless v_to_l.key?(fval) # add entry to v_to_l
+       #     v_to_l[fval] = []
+       #   end
+       #   v_to_l[fval] << label
+
+       #   unless l_to_v.key?(label) # add entry to l_to_v
+       #     l_to_v[label] = []
+       #   end
+       #   l_to_v[label] << fval
+       # }
+
+       # l_to_v.each_pair {|label,values|
+       #   newvalue = "f"+findex.to_s+"_"+label.gsub(/\./,"")
+       #   values.each {|value|
+       #     if v_to_l[value].uniq.length == 1
+       #       @replacements[findex][value] = newvalue
+       #     end
+       #   }
+       # }
+     }
+     @ready = true
+   end
+
+   def init_from_file(optsfile) # use old optimisation
+     optsinfile = File.new(optsfile)
+     @replacements = read(optsinfile)
+     optsinfile.close
+     @ready = true
+   end
+
+   def store(outfilename) # store data necessary to recreate optimisation
+     unless @ready
+       raise "[Optimise] Error: Cannot store un-initialised optimisation"
+     end
+     outfile = File.new(outfilename,"w")
+     @replacements.each_index {|i| # for each feature
+       reps = @replacements[i]
+       outfile.puts "<"+i.to_s+">"
+       reps.each_pair{|old,new|
+         outfile.puts [old,new].join("\t")
+       }
+       outfile.puts "</"+i.to_s+">"
+     }
+     outfile.close
+   end
+
+   def apply(infilename,outfilename)
+     unless @ready
+       raise "[Optimise] Error: Cannot apply un-initialised optimisation"
+     end
+
+     STDERR.puts "[Optimise] applying feature optimisation"
+
+     infile = File.new(infilename)
+     outfile = File.new(outfilename,"w")
+     features = []
+     labels = []
+
+
+     while (line = infile.gets)
+       tokens = line.chomp.split(",")
+
+       unless tokens.length == @replacements.length
+         raise "[Optimise] Error: trying to optimise an incompatible feature file!\nFile has "+tokens.length.to_s+" columns, but we know replacements for "+@replacements.length.to_s+" columns."
+       end
+
+       label = tokens.pop
+       tokens.each_index {|f_idx|
+         fval = tokens[f_idx]
+         if @replacements[f_idx].key?(fval)
+           tokens[f_idx] = @replacements[f_idx][fval]
+         end
+       }
+       tokens.push label
+       outfile.puts tokens.join(",")
+     end
+     outfile.close
+   end
+
+   private
+
+   def read(infile)
+     @replacements = []
+     while line = infile.gets
+       line.chomp!
+       if line =~ /^<(\d+)>$/
+         reps = {}
+       elsif line =~ /^<\/(\d+)>$/
+         @replacements[$1.to_i] = reps
+       else
+         tokens = line.chomp.split("\t")
+         reps[tokens[0]] = tokens[1]
+       end
+     end
+     @replacements # return the replacements; the caller closes the file
+   end
+
+   # return recommended filename to store optimisation patterns for basefile
+   def Optimise.recommended_filename(basefile)
+     return basefile+".optimisations"
+   end
+
+ end
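
As a concrete illustration of the round trip this class implements, here is a hedged sketch. The file names are hypothetical; the on-disk format follows store and read above.

    # Hypothetical file names, for illustration only.
    opt = Optimise.new
    opt.init_from_data("train.c45")                          # learn value -> f<idx>_<label> replacements
    opt.store(Optimise.recommended_filename("train.c45"))    # writes "train.c45.optimisations"
    opt.apply("train.c45", "train.opt.c45")

    # The stored file groups tab-separated old/new value pairs per feature index:
    #   <0>
    #   oldvalue<TAB>f0_SomeLabel
    #   </0>

    # Later, reuse the same optimisation for test data:
    opt2 = Optimise.new
    opt2.init_from_file(Optimise.recommended_filename("train.c45"))
    opt2.apply("test.c45", "test.opt.c45")
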
data/lib/ml/timbl.rb
@@ -0,0 +1,140 @@
+ # wrapper script for timbl learner
+ # sp 24 08 04
+
+ # contract for Learner classes:
+
+ class Timbl
+   def initialize(program_path, parameters)
+
+     @timblpath = File.join(program_path, "Timbl")
+     unless @timblpath =~ /\s$/
+       # path must end in space so we can just attach parameters
+       @timblpath << " "
+     end
+
+     if parameters.empty?
+       # was: +vs
+       @params = "-mM -k5 +vs" # default parameters
+     else
+       @params = parameters.join(" ") + " "
+     end
+   end
+
+   def timbl_out_to_malouf_out(infilename,outfilename) # timbl: [all features], [gold standard label]
+     infile = File.new(infilename)
+     outfile = File.new(outfilename,"w")
+     while (line = infile.gets)
+       larray = line.chomp.split(",")
+       ml_label = larray.last
+       outfile.puts ml_label+"\t1"
+     end
+     infile.close
+     outfile.close
+   end
+
+   def train(infile,classifier_location) # lazy learning: for training, store the
+                                         # instancebase as a tree (TiMBL -I / -i option)
+     # figure out how many features we have
+     f = File.new(infile)
+     line = f.gets.chomp
+     num_features = line.split(",").length - 1
+
+     # and train
+     if classifier_location then
+       @instancebase = classifier_location
+     else
+       @instancebase = infile+".instancebase"
+     end
+     successfully_run(@timblpath+@params+" -N#{num_features} -f "+infile+" -I "+@instancebase)
+   end
+
+   # return true iff reading the classifier has had success
+   def read(classifierfile)
+     unless FileTest.exists?(classifierfile)
+       STDERR.puts "[Timbl] Cannot find instancebase at #{classifierfile}"
+       return false
+     end
+     @instancebase = classifierfile
+     return true
+   end
+
+   def exists?(classifierfile)
+     return FileTest.exists?(classifierfile)
+   end
+
+   def write(classifierfile)
+     %x{cp #{@instancebase} #{classifierfile}} # store training data as "modelfile"
+     File.chmod(0664,classifierfile)
+   end
+
+   def apply(infile,outfile)
+     temp_outfile = outfile+".temp"
+     successfully_run(@timblpath+@params+" -i "+@instancebase+" -t "+infile+" -o "+temp_outfile)
+
+     # if we have an empty input file, timbl will not produce an output file
+     unless FileTest.exists?(temp_outfile)
+       # STDERR.puts "[Timbl] Warning: Timbl failed to produce an outfile."
+       return false
+     end
+
+     # no error
+     timbl_out_to_malouf_out(temp_outfile,outfile)
+     File.unlink(temp_outfile)
+
+     # true iff outfile exists
+     if FileTest.exists?(outfile)
+       return true
+     else
+       # STDERR.puts "[Timbl] Warning: Final outfile could not be produced."
+       return false
+     end
+
+   end
+
+   #####
+   def read_resultfile(filename)
+     begin
+       f = File.new(filename)
+     rescue
+       $stderr.puts "TiMBL error: cannot read TiMBL result file #{filename}."
+       return nil
+     end
+
+     retv = []
+
+     f.each { |line|
+       line_results = []
+       pieces = line.split
+
+       while not(pieces.empty?)
+         label = pieces.shift
+
+         begin
+           confidence = pieces.shift.to_f
+         rescue
+           $stderr.puts "Error reading TiMBL output: invalid line: #{line}"
+           confidence = 0
+         end
+
+         line_results << [label, confidence]
+       end
+       retv << line_results
+     }
+
+     return retv
+   end
+
+   #########################
+   private
+
+   ###
+   def successfully_run(command)
+     retv = Kernel.system(command)
+     unless retv
+       $stderr.puts "Error running classifier. Exiting."
+       $stderr.puts "Offending command: "+command
+       exit 1
+     end
+     return retv
+   end
+ end
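
The bare "contract for Learner classes" comment at the top of this file is the only statement of the shared interface. Judging from the two wrappers in this diff, both Timbl and Maxent respond to the same set of calls, roughly as in this hedged sketch; the program path and file names are hypothetical placeholders, and Maxent additionally appends "Model.bin.gz" to model names that lack it.

    # Either wrapper can stand in for "learner"; arguments shown are placeholders.
    learner = Timbl.new("/usr/local/bin", [])   # or Maxent.new("/opt/maxent-2.4.0", [])

    learner.train("train.c45", "model.file")    # build and store a model / instance base
    learner.exists?("model.file")               # => true once the model file is on disk
    learner.read("model.file")                  # => true iff the model could be loaded
    learner.write("model.copy")                 # copy the stored model elsewhere
    learner.apply("test.c45", "results.txt")    # => false if classification produced no output
    learner.read_resultfile("results.txt")      # => one [label, confidence] list per test instance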