shalmaneser-fred 1.2.0.rc4

@@ -0,0 +1,602 @@
+ # FredFeaturize
+ #
+ # Featurization for WSD
+ #
+ # Katrin Erk April 2007
+ #
+ # feature types currently allowed:
+ # - context (with parameter giving context size; may be set several times)
+ # - syntax
+ # - synsem
+ #
+ # features in Meta-feature file:
+ #
+ # CX: context: word/lemma/pos/ne
+ # CH: children: grfunc/word/lemma/pos/ne
+ # PA: parents: grfunc/word/lemma/pos/ne
+ # SI: sibling: parent/grfunc/word/lemma/pos/ne
+ # TA: target: word/lemma/pos/ne
+
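+ # A hypothetical illustration (not real tool output): for a target verb
+ # "sees" in "Peter sees the cat", meta-feature entries use the notation
+ # above, with the fields joined by "#" (nil fields become empty strings)
+ # and dependency labels depending on the parser, e.g.:
+ #
+ #   TA   sees#see#VBZ#
+ #   CX1  Peter#Peter#NNP#  the#the#DT#
+ #   CH   OA#cat#cat#NN#
+ #
+ # The actual file layout is determined by the MetaFeatureAccess writer
+ # (see fred/FredFeatures).
+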
+
+
+ require "delegate"
+
+ #######
+
+ require "fred/FileZipped"
+ require "common/Parser"
+ require "common/RegXML"
+ require "common/SalsaTigerRegXML"
+ require "common/SalsaTigerXMLHelper"
+
+ require "fred/fred_config_data"
+ require "fred/FredConventions"
+ require "common/prep_helper"
+ require "common/SynInterfaces"
+
+ require "fred/FredBOWContext"
+ require "fred/FredDetermineTargets"
+ require "fred/FredFeatures"
+
+ ####################################
+ # grammatical function computation:
+ # given a sentence, keep all grammatical function relations in a hash
+ # for faster access
+ class GrammaticalFunctionAccess
+
+   def initialize(interpreter_class)
+     @interpreter_class = interpreter_class
+     @to = Hash.new([])   # default: return empty array
+     @from = Hash.new([]) # default: return empty array
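+     # NB: Hash.new([]) hands out one shared default array for missing
+     # keys. That is safe here because the default is only ever read
+     # (see get_children/get_parents below); set_sent always assigns a
+     # fresh Array to a key before appending to it.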
51
+ end
52
+
53
+ def set_sent(sent) # SalsaTigerRegXML sentence
54
+
55
+ @to.clear()
56
+ @from.clear()
57
+
58
+ sent.each_syn_node { |current|
59
+
60
+ current_head = @interpreter_class.head_terminal(current)
61
+ unless current_head
62
+ next
63
+ end
64
+
65
+ @interpreter_class.gfs(current, sent).map { |rel, node|
66
+ # PPs: use head noun rather than preposition as head
67
+ # Sbar, VP: use verb
68
+ if (n = @interpreter_class.informative_content_node(node))
69
+ [rel, n]
70
+ else
71
+ [rel, node]
72
+ end
73
+ }.each { |rel, node|
74
+
75
+ rel_head = @interpreter_class.head_terminal(node)
76
+ unless rel_head
77
+ next
78
+ end
79
+
80
+ unless @to.has_key? current_head
81
+ @to[current_head] = Array.new()
82
+ end
83
+ unless @to[current_head].include? [rel, rel_head]
84
+ @to[current_head] << [rel, rel_head]
85
+ end
86
+
87
+ unless @from.has_key? rel_head
88
+ @from[rel_head] = Array.new()
89
+ end
90
+ unless @from[rel_head].include? [rel, current_head]
91
+ @from[rel_head] << [rel, current_head]
92
+ end
93
+ }
94
+ }
95
+ # $stderr.puts "Changed sentence:"
96
+ # @to.each_pair { |p, ch|
97
+ # $stderr.puts "\t#{p.id()}: " + ch.map { |rel, n| rel + "/"+n.id()}.join(", ")
98
+ # }
99
+ # $stdin.gets()
100
+ end
101
+
102
+ def get_children(node)
103
+ return @to[node]
104
+ end
105
+
106
+ def get_parents(node)
107
+ return @from[node]
108
+ end
109
+ end
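+
+ # A minimal usage sketch (hypothetical, for illustration; assumes
+ # `interpreter` is a syntactic interpreter class obtained from
+ # SynInterfaces and `sent` a SalsaTigerRegXML sentence):
+ #
+ #   gf = GrammaticalFunctionAccess.new(interpreter)
+ #   gf.set_sent(sent)  # index all GF relations of this sentence once
+ #   sent.terminals().each { |t|
+ #     gf.get_children(t).each { |rel, child_head|
+ #       puts "#{t.id()} --#{rel}--> #{child_head.id()}"
+ #     }
+ #   }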
+
+ ####################################
+ # main class of this package
+ ####################################
+ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
+
+   include WordLemmaPosNe
+
+   #####
+   def initialize(exp_obj, # FredConfigData object
+                  options, # hash: runtime option name (string) => value (string)
+                  varhash = {}) # optional parameter: "refeaturize"
+
+     ##
+     # evaluate runtime options
+     if $ENDUSER_MODE
+       # only possible dataset: test
+       @dataset = "test"
+     else
+       @dataset = nil
+     end
+     @append_rather_than_overwrite = false
+
+     options.each_pair do |opt, arg|
+       case opt
+       when '--dataset'
+         @dataset = arg
+         unless ["train", "test"].include? @dataset
+           $stderr.puts "--dataset needs to be either 'train' or 'test'"
+           exit 1
+         end
+
+       when '--append'
+         @append_rather_than_overwrite = true
+
+       else
+         # the case of unknown arguments has been dealt with by fred.rb
+       end
+     end
+
+     # further sanity checks
+     if @dataset.nil?
+       $stderr.puts "Please set --dataset: one of 'train', 'test'"
+       exit 1
+     end
+
+     in_enduser_mode_ensure(@dataset == "test")
+
+     # evaluate optional "refeaturize" argument
+     # "refeaturize": reuse the meta-feature set,
+     # just redo the CSV featurization
+     if varhash["refeaturize"]
+       @refeaturize = varhash["refeaturize"]
+     else
+       @refeaturize = false
+     end
+
+     # prepare experiment file: add preprocessing experiment file data
+     @exp = exp_obj
+
+     # @note AB: The following is disabled because we don't want to depend
+     #   on {PrepConfigData}. Instead we duplicate the options
+     #   <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
+     #   <do_parse>, <parser>, <directory_preprocessed>
+     #   in the experiment file of Fred.
+     #
+     # preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
+     # if not(preproc_expname)
+     #   $stderr.puts "Please set the name of the preprocessing experiment file"
+     #   $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
+     #   exit 1
+     # elsif not(File.readable?(preproc_expname))
+     #   $stderr.puts "Error in the experiment file:"
+     #   $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
+     #   exit 1
+     # end
+     # preproc_exp = FrPrepConfigData.new(preproc_expname)
+     # @exp.adjoin(preproc_exp)
+
+     # get the right syntactic interface
+     SynInterfaces.check_interfaces_abort_if_missing(@exp)
+     @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
+
+     # initialize grammatical function object (delegating)
+     grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
+     super(grf_obj)
+
+     # announce the task
+     $stderr.puts "---------"
+     $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
+     if @refeaturize
+       $stderr.puts "Keeping meta-features, redoing featurization only."
+     end
+     if @exp.get("binary_classifiers")
+       $stderr.puts "Writing features for binary classifiers."
+     else
+       $stderr.puts "Writing features for n-ary classifiers."
+     end
+     $stderr.puts "---------"
+   end
+
+   ####
+   def compute()
+     if @refeaturize
+       # read meta-feature file,
+       # just redo normal featurization
+       refeaturize()
+     else
+       # write meta-features and normal features
+       featurize()
+     end
+   end
+
+   #########################
+   private
+
+   #####
+   # main featurization
+   def featurize()
+
+     ###
+     # make objects
+     unless @exp.get("directory_preprocessed")
+       $stderr.puts "Shalmaneser error: could not find the directory with"
+       $stderr.puts "syntactically preprocessed data."
+       $stderr.puts "Please make sure that 'directory_preprocessed'"
+       $stderr.puts "is set in the frprep experiment file you use with this experiment."
+       exit 1
+     end
+     directory = File.existing_dir(@exp.get("directory_preprocessed"))
+
+     # get context sizes
+     context_sizes = @exp.get_lf("feature", "context")
+     unless context_sizes
+       # no contexts set, nothing to compute:
+       # fall back to the default context
+       $stderr.puts "Error: no contexts set."
+       $stderr.puts "I will compute a context of size 1 by default."
+       $stderr.puts "(This goes into the meta-features, but not"
+       $stderr.puts "into the set of features used in the classifier.)"
+       context_sizes = [1]
+     end
+     max_context_size = context_sizes.max()
+
+     # make target determination object
+     if @dataset == "test" and @exp.get("apply_to_all_known_targets")
+       $stderr.puts "Fred: Using all known targets as instances."
+       target_obj = FindAllTargets.new(@exp, @interpreter_class)
+     else
+       if @append_rather_than_overwrite
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
+       else
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
+       end
+     end
+
+     # make context computation object
+     if @exp.get("single_sent_context")
+       # contexts in the input data don't go beyond a single sentence
+       context_obj = SingleSentContextProvider.new(max_context_size, @exp,
+                                                   @interpreter_class, target_obj,
+                                                   @dataset)
+       # @todo AB: Move this to the OptionParser: the two options are not
+       #   compatible, don't do the check here!
+       if @exp.get("noncontiguous_input")
+         $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
+         $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
+       end
+
+     elsif @exp.get("noncontiguous_input")
+       # the input data is not contiguous but
+       # consists of selected sentences from a larger text
+       context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
+                                                      @interpreter_class, target_obj,
+                                                      @dataset)
+     else
+       # the input data is contiguous, and we're computing contexts
+       # not restricted to single sentences
+       context_obj = ContextProvider.new(max_context_size, @exp,
+                                         @interpreter_class, target_obj, @dataset)
+     end
+
+     zipped_input_dir = fred_dirname(@exp, @dataset, "input_data", "new")
+
+     ##
+     # make writer object(s)
+
+     writer_classes = [
+       MetaFeatureAccess,
+       FredFeatureAccess
+     ]
+
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old ones"
+
+     else
+       # write from scratch
+       mode = "w"
+
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       writer_classes.each { |w_class|
+         w_class.remove_feature_files(@exp, @dataset)
+       }
+
+       Dir[zipped_input_dir + "*gz"].each { |filename|
+         File.delete(filename)
+       }
+     end
+
+     writers = writer_classes.map { |w_class|
+       w_class.new(@exp, @dataset, mode)
+     }
+
+     ###
+     # zip and store input files
+     Dir[directory + "*.xml"].sort.each { |filename|
+       %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
+     }
+
+     # always remember the current sentence
+     @current_sent = nil
+
+     ###
+     # featurize
+
+     # context_obj.each_window yields tuples of:
+     # - a context: an array of tuples [word, lemma, pos, ne],
+     #   each entry a string or nil
+     # - ID of the main target: string
+     # - target_IDs: array:string, list of IDs of target words
+     # - senses: an array of sense hashes for the target
+     #   (keys "lex", "pos", "sid", "sense"; see each_lemma_pos_and_senses below)
+     # - sent: SalsaTigerSentence object
+     #
+     # for each max. context returned by the context object:
+     # determine meta-features:
+     # - context words for all context sizes listed in context_sizes,
+     # - children of the target
+     # - parent of the target
+     # - siblings of the target
+     #
+     # and pass them on to the writing object(s)
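+     #
+     # A hypothetical example of one yielded tuple, for the sentence
+     # "Peter sees the cat" with target "sees" and max. context size 2
+     # (windows are padded with nil at text boundaries):
+     #   context        = [nil, ["Peter", "Peter", "NNP", nil],
+     #                     ["sees", "see", "VBZ", nil],
+     #                     ["the", "the", "DT", nil],
+     #                     ["cat", "cat", "NN", nil]]
+     #   main_target_id = "s1_2"      # some terminal node ID
+     #   target_ids     = ["s1_2"]
+     #   senses         = [{"lex" => "see", "pos" => "v",
+     #                      "sid" => "1", "sense" => "see.01"}]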
+     target_count = 0
+     context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
+       # inform the user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+       # determine features
+       feature_hash = Hash.new()
+       compute_target_features(context, max_context_size, feature_hash)
+       compute_context_features(context, max_context_size, context_sizes, feature_hash)
+       compute_syn_features(main_target_id, sent, feature_hash)
+       # write
+       each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|
+         writers.each { |writer_obj|
+           writer_obj.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+         }
+       }
+     }
+     # finalize writers
+     writers.each { |writer_obj|
+       writer_obj.flush()
+     }
+
+     # record the targets that have been read
+     target_obj.done_reading_targets()
+   end
+
+   #####
+   # reuse meta-features, recompute CSV features
+   def refeaturize()
+
+     ##
+     # remove old features:
+     # normal features only, keep meta-features.
+     # Don't do anything about the zipped input;
+     # assume it stays as it is.
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old ones"
+
+     else
+       # write from scratch
+       mode = "w"
+
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       FredFeatureAccess.remove_feature_files(@exp, @dataset)
+     end
+
+     ##
+     # read meta-feature file,
+     # write fred feature files
+     meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
+     feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)
+
+     ##
+     # featurize
+     target_count = 0
+
+     meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|
+
+       # inform the user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+
+       feature_writer.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+     }
+     feature_writer.flush()
+   end
+
+   ####
+   # given a list of sense hashes of the format
+   #   "lex"   -> lemma
+   #   "pos"   -> part of speech
+   #   "sid"   -> sense ID
+   #   "sense" -> sense
+   #
+   # yield tuples [lemma, pos, sid, senses]
+   def each_lemma_pos_and_senses(shashes)
+
+     # determine target lemma and POS. If we actually have
+     # more than one lemma and POS, we're in trouble
+     target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s().gsub(/\s/, "_") }.uniq()
+     target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s().gsub(/\s/, "_") }.uniq()
+     target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s().gsub(/\s/, "_") }.uniq()
+
+     if target_lemmas.length() == 1 and target_pos_s.length() == 1 and target_sid.length() == 1
+
+       yield [target_lemmas.first(), target_pos_s.first(),
+              target_sid.first(),
+              shashes.map { |sense_hash| sense_hash["sense"].to_s().gsub(/\s/, "_") }
+             ]
+
+     else
+       # trouble:
+       # group senses by SID, lemma and POS
+       lemmapos2sense = Hash.new
+       shashes.each { |sense_hash|
+         target_lemma = sense_hash["lex"].to_s().gsub(/\s/, "_")
+         target_pos = sense_hash["pos"].to_s().gsub(/\s/, "_")
+         target_sid = sense_hash["sid"].to_s().gsub(/\s/, "_")
+         target_sense = sense_hash["sense"].to_s().gsub(/\s/, "_")
+         key = [target_sid, target_lemma, target_pos]
+         unless lemmapos2sense[key]
+           lemmapos2sense[key] = Array.new()
+         end
+         lemmapos2sense[key] << target_sense
+       }
+
+       # and yield
+       lemmapos2sense.each_key { |target_sid, target_lemma, target_pos|
+         yield [target_lemma, target_pos, target_sid,
+                lemmapos2sense[[target_sid, target_lemma, target_pos]]
+               ]
+       }
+     end
+   end
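+
+   # A hypothetical example: given
+   #   [{"lex" => "see", "pos" => "v", "sid" => "1", "sense" => "see.01"},
+   #    {"lex" => "see", "pos" => "v", "sid" => "1", "sense" => "see.02"}]
+   # the method yields once:
+   #   ["see", "v", "1", ["see.01", "see.02"]]
+   # With mixed lemmas, POS or SIDs it falls back to grouping and yields
+   # once per distinct [sid, lemma, pos] combination.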
+
+   ###
+   # given a context, locate the target,
+   # which is right in the middle,
+   # and enter it into the feature hash
+   #
+   # feature type: TA
+   # entry: word#lemma#pos#ne
+   def compute_target_features(context, # array: word*lemma*pos*ne
+                               center_pos, # integer: one-sided size of the context
+                               feature_hash) # hash: feature_type -> array:feature, enter features here
+     feature_hash["TA"] = [
+       context[center_pos].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     ]
+   end
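+
+   # Hypothetical example: with center_pos = 2 and
+   #   context[2] = ["sees", "see", "VBZ", nil]
+   # this sets
+   #   feature_hash["TA"] = ["sees#see#VBZ#"]
+   # (nil fields become empty strings via to_s).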
+
+   ###
+   # compute context features:
+   # for each size in the given list of context sizes,
+   # compute a context with feature type "CXNN" (where NN is the size
+   # of the context) and with features
+   #   word#lemma#pos#ne
+   #
+   # enter each context into the feature hash
+   def compute_context_features(context, # array: word*lemma*pos*ne
+                                center_pos, # int: context is 2*center_pos + 1 long
+                                context_sizes, # array:int, produce a context of each of these sizes
+                                feature_hash) # hash: feature_type -> array:feature, enter features here
+
+     context_sizes.each { |context_size|
+       # feature type: CXNN, where NN is the size of the context
+       feature_type = "CX" + context_size.to_s()
+
+       # features: an array of strings
+       feature_hash[feature_type] = Array.new()
+
+       # pre-context
+       (center_pos - context_size).upto(center_pos - 1) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+       # post-context
+       (center_pos + 1).upto(center_pos + context_size) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+     }
+   end
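+
+   # Hypothetical example: with center_pos = 2, context_sizes = [1, 2] and
+   #   context = [nil, ["the", "the", "DT", nil], ["cat", "cat", "NN", nil],
+   #              ["sat", "sit", "VBD", nil], nil]
+   # this yields
+   #   feature_hash["CX1"] = ["the#the#DT#", "sat#sit#VBD#"]
+   #   feature_hash["CX2"] = ["the#the#DT#", "sat#sit#VBD#"]
+   # (the nil entries at the window edges are skipped).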
+
+   ###
+   # compute syntax-dependent features:
+   # children (that is, dependents) of the target word,
+   # parent,
+   # and siblings.
+   #
+   # format:
+   # feature type is CH for children, PA for parent, SI for siblings
+   #
+   # individual features are:
+   #   <dependency>#<word>#<lemma>#<pos>#<ne>
+   # (SI features are additionally prefixed with the parent word:
+   #  <parent>#<dependency>#<word>#<lemma>#<pos>#<ne>)
+   def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
+                            sent, # SalsaTigerRegXML object
+                            feature_hash) # hash: feature_type -> array:feature, enter features here
+
+     target = sent.terminals().detect { |t| t.id() == main_target_id }
+     unless target
+       $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
+       return
+     end
+
+     # if we're starting a new sentence,
+     # compute dependencies using the delegate object for grammatical
+     # functions. get_children and get_parents below are also methods
+     # of the delegate
+     unless sent == @current_sent
+       @current_sent = sent
+       set_sent(sent)
+     end
+     # children
+     feature_hash["CH"] = get_children(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # parent
+     feature_hash["PA"] = get_parents(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # siblings
+     feature_hash["SI"] = Array.new()
+     get_parents(target).each { |rel, parent|
+       parent_w, d1, d2, d3 = word_lemma_pos_ne(parent, @interpreter_class)
+
+       get_children(parent).each { |rel, node|
+         # print "\t", rel, " ", node, "\n"
+
+         if node == target
+           next
+         end
+
+         feature_hash["SI"] << parent_w + "#" +
+           rel.to_s() + "#" +
+           word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+       }
+     }
+   end
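+
+   # Hypothetical example: for target "sees" in "Peter sees the cat"
+   # (dependency labels are interpreter-specific), this might produce
+   #   feature_hash["CH"] = ["SB#Peter#Peter#NNP#", "OA#cat#cat#NN#"]
+   #   feature_hash["PA"] = []   # root verb: no parents
+   #   feature_hash["SI"] = []   # no parents, hence no siblings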
602
+ end