RubyGems - shalmaneser - Versions diffs - 0.0.1.alpha - Mend

shalmaneser 0.0.1.alpha

Files changed (138) hide show

data/.yardopts +8 -0
data/CHANGELOG.rdoc +0 -0
data/LICENSE.rdoc +0 -0
data/README.rdoc +0 -0
data/lib/common/AbstractSynInterface.rb +1227 -0
data/lib/common/BerkeleyInterface.rb +375 -0
data/lib/common/CollinsInterface.rb +1165 -0
data/lib/common/ConfigData.rb +694 -0
data/lib/common/Counter.rb +18 -0
data/lib/common/DBInterface.rb +48 -0
data/lib/common/EnduserMode.rb +27 -0
data/lib/common/Eval.rb +480 -0
data/lib/common/FixSynSemMapping.rb +196 -0
data/lib/common/FrPrepConfigData.rb +66 -0
data/lib/common/FrprepHelper.rb +1324 -0
data/lib/common/Graph.rb +345 -0
data/lib/common/ISO-8859-1.rb +24 -0
data/lib/common/ML.rb +186 -0
data/lib/common/Maxent.rb +215 -0
data/lib/common/MiniparInterface.rb +1388 -0
data/lib/common/Optimise.rb +195 -0
data/lib/common/Parser.rb +213 -0
data/lib/common/RegXML.rb +269 -0
data/lib/common/RosyConventions.rb +171 -0
data/lib/common/SQLQuery.rb +243 -0
data/lib/common/STXmlTerminalOrder.rb +194 -0
data/lib/common/SalsaTigerRegXML.rb +2347 -0
data/lib/common/SalsaTigerXMLHelper.rb +99 -0
data/lib/common/SleepyInterface.rb +384 -0
data/lib/common/SynInterfaces.rb +275 -0
data/lib/common/TabFormat.rb +720 -0
data/lib/common/Tiger.rb +1448 -0
data/lib/common/TntInterface.rb +44 -0
data/lib/common/Tree.rb +61 -0
data/lib/common/TreetaggerInterface.rb +303 -0
data/lib/common/headz.rb +338 -0
data/lib/common/option_parser.rb +13 -0
data/lib/common/ruby_class_extensions.rb +310 -0
data/lib/fred/Baseline.rb +150 -0
data/lib/fred/FileZipped.rb +31 -0
data/lib/fred/FredBOWContext.rb +863 -0
data/lib/fred/FredConfigData.rb +182 -0
data/lib/fred/FredConventions.rb +232 -0
data/lib/fred/FredDetermineTargets.rb +324 -0
data/lib/fred/FredEval.rb +312 -0
data/lib/fred/FredFeatureExtractors.rb +321 -0
data/lib/fred/FredFeatures.rb +1061 -0
data/lib/fred/FredFeaturize.rb +596 -0
data/lib/fred/FredNumTrainingSenses.rb +27 -0
data/lib/fred/FredParameters.rb +402 -0
data/lib/fred/FredSplit.rb +84 -0
data/lib/fred/FredSplitPkg.rb +180 -0
data/lib/fred/FredTest.rb +607 -0
data/lib/fred/FredTrain.rb +144 -0
data/lib/fred/PlotAndREval.rb +480 -0
data/lib/fred/fred.rb +45 -0
data/lib/fred/md5.rb +23 -0
data/lib/fred/opt_parser.rb +250 -0
data/lib/frprep/AbstractSynInterface.rb +1227 -0
data/lib/frprep/Ampersand.rb +37 -0
data/lib/frprep/BerkeleyInterface.rb +375 -0
data/lib/frprep/CollinsInterface.rb +1165 -0
data/lib/frprep/ConfigData.rb +694 -0
data/lib/frprep/Counter.rb +18 -0
data/lib/frprep/FNCorpusXML.rb +643 -0
data/lib/frprep/FNDatabase.rb +144 -0
data/lib/frprep/FixSynSemMapping.rb +196 -0
data/lib/frprep/FrPrepConfigData.rb +66 -0
data/lib/frprep/FrameXML.rb +513 -0
data/lib/frprep/FrprepHelper.rb +1324 -0
data/lib/frprep/Graph.rb +345 -0
data/lib/frprep/ISO-8859-1.rb +24 -0
data/lib/frprep/MiniparInterface.rb +1388 -0
data/lib/frprep/Parser.rb +213 -0
data/lib/frprep/RegXML.rb +269 -0
data/lib/frprep/STXmlTerminalOrder.rb +194 -0
data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
data/lib/frprep/SleepyInterface.rb +384 -0
data/lib/frprep/SynInterfaces.rb +275 -0
data/lib/frprep/TabFormat.rb +720 -0
data/lib/frprep/Tiger.rb +1448 -0
data/lib/frprep/TntInterface.rb +44 -0
data/lib/frprep/Tree.rb +61 -0
data/lib/frprep/TreetaggerInterface.rb +303 -0
data/lib/frprep/do_parses.rb +142 -0
data/lib/frprep/frprep.rb +686 -0
data/lib/frprep/headz.rb +338 -0
data/lib/frprep/one_parsed_file.rb +28 -0
data/lib/frprep/opt_parser.rb +94 -0
data/lib/frprep/ruby_class_extensions.rb +310 -0
data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
data/lib/rosy/DBMySQL.rb +146 -0
data/lib/rosy/DBSQLite.rb +280 -0
data/lib/rosy/DBTable.rb +239 -0
data/lib/rosy/DBWrapper.rb +176 -0
data/lib/rosy/ExternalConfigData.rb +58 -0
data/lib/rosy/FailedParses.rb +130 -0
data/lib/rosy/FeatureInfo.rb +242 -0
data/lib/rosy/GfInduce.rb +1115 -0
data/lib/rosy/GfInduceFeature.rb +148 -0
data/lib/rosy/InputData.rb +294 -0
data/lib/rosy/RosyConfigData.rb +115 -0
data/lib/rosy/RosyConfusability.rb +338 -0
data/lib/rosy/RosyEval.rb +465 -0
data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
data/lib/rosy/RosyFeaturize.rb +280 -0
data/lib/rosy/RosyInspect.rb +336 -0
data/lib/rosy/RosyIterator.rb +477 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
data/lib/rosy/RosyPruning.rb +165 -0
data/lib/rosy/RosyServices.rb +744 -0
data/lib/rosy/RosySplit.rb +232 -0
data/lib/rosy/RosyTask.rb +19 -0
data/lib/rosy/RosyTest.rb +826 -0
data/lib/rosy/RosyTrain.rb +232 -0
data/lib/rosy/RosyTrainingTestTable.rb +786 -0
data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
data/lib/rosy/View.rb +418 -0
data/lib/rosy/opt_parser.rb +379 -0
data/lib/rosy/rosy.rb +77 -0
data/lib/shalmaneser/version.rb +3 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +40 -0
data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +52 -0
data/test/functional/test_rosy.rb +20 -0
metadata +284 -0

data/lib/frprep/SynInterfaces.rb ADDED Viewed

@@ -0,0 +1,275 @@
+# SynInterfaces.rb
+#
+# ke oct/nov 2005
+#
+# Store all known interfaces to
+# systems that do syntactic analysis
+#
+# Given the name of a system and the service that the
+# system performs, return the appropriate interface
+#
+# There are two types of interfaces to syntactic analysis systems:
+# - interfaces:
+#   offer methods for syntactic analysis,
+#   and the transformation to Salsa/Tiger XML and SalsaTigerSentence objects
+# - interpreters:
+#   interpret the resulting Salsa/Tiger XML (represented as
+#   SalsaTigerSentence and SynNode objects), e.g.
+#   generalize over part of speech;
+#   describe the path between a pair of nodes both as a path
+#   and (potentially) as a grammatical function of one of the nodes;
+#   determine whether a node describes a verb, and in which voice;
+#   determine the head of a constituent
+#
+# Abstract classes for both interfaces and interpreters
+# are in AbstractSynInterface.rb
+require "frprep/ruby_class_extensions"
+# Reopen the core Array class and mix in EnumerableBool.
+# SynInterfaces.get_interpreter below calls big_and on arrays;
+# presumably big_and comes from EnumerableBool, which is not defined
+# in this file (likely frprep/ruby_class_extensions -- TODO confirm).
+class Array
+  include EnumerableBool
+end
+# The list of available interface packages
+# is at the end of this file.
+# Please enter additional interfaces there.
+class SynInterfaces
+  ###
+  # class variable:
+  # every interface class registered so far
+  # (extend it through add_interface())
+  @@interfaces = []
+  ###
+  # class variable:
+  # every interpreter class registered so far
+  # (extend it through add_interpreter())
+  @@interpreters = []
+  ###
+  # Register an interface class.
+  # Writes a note to stderr when $DEBUG is set.
+  def self.add_interface(class_name)
+    $stderr.puts "Initializing interface #{class_name}" if $DEBUG
+    @@interfaces.push(class_name)
+  end
+  ###
+  # Register an interpreter class.
+  # Writes a note to stderr when $DEBUG is set.
+  def self.add_interpreter(class_name)
+    $stderr.puts "Initializing interpreter #{class_name}" if $DEBUG
+    @@interpreters.push(class_name)
+  end
+  # AB: debugging aid, dumps both registries to stderr.
+  def self.explore
+    $stderr.puts "Exploring..."
+    $stderr.puts @@interfaces
+    $stderr.puts @@interpreters
+  end
+  ###
+  # check_interfaces_abort_if_missing:
+  #
+  # exp: experiment file object answering get(key)
+  #
+  # Asks some_system_missing? whether the requested syntactic
+  # processing can be served. If not, prints what is missing and
+  # what is registered to stderr and terminates with exit status 1.
+  # Returns nil when nothing is missing.
+  def self.check_interfaces_abort_if_missing(exp) # FrPrepConfigData object
+    missing = SynInterfaces.some_system_missing?(exp)
+    return unless missing
+    interwhat, services = missing
+    # renders a service -> system mapping as "service X, system Y; ..."
+    describe = lambda { |mapping|
+      mapping.to_a.map { |service, system_name|
+        "service #{service}, system #{system_name}"
+      }.join("; ")
+    }
+    $stderr.puts
+    $stderr.puts "ERROR: I am missing an #{interwhat} for "
+    services.each_pair { |service, system_name|
+      $stderr.puts "\tservice #{service}, system #{system_name}"
+    }
+    $stderr.puts
+    $stderr.puts "I have the following interfaces:"
+    @@interfaces.each { |interface_class|
+      $stderr.puts "\tservice #{interface_class.service}, system #{interface_class.system}"
+    }
+    $stderr.puts "I have the following interpreters:"
+    @@interpreters.each { |interpreter_class|
+      $stderr.print "\t"
+      $stderr.print describe.call(interpreter_class.systems)
+      unless interpreter_class.optional_systems.empty?
+        $stderr.print ", optional: "
+        $stderr.print describe.call(interpreter_class.optional_systems)
+      end
+      $stderr.puts
+    }
+    $stderr.puts
+    $stderr.puts "Please adapt your experiment file."
+    exit 1
+  end
+  ###
+  # some_system_missing?
+  #
+  # returns nil if an interface exists for every requested service
+  # and an interpreter exists for the requested combination.
+  # Otherwise returns a pair [kind, info]:
+  #  kind is the string 'interface' or 'interpreter',
+  #  info is a hash service -> system name listing what could
+  #  not be provided (for 'interface': only the first failing
+  #  service; for 'interpreter': all requested services)
+  def self.some_system_missing?(exp) # FrPrepConfigData object
+    services = SynInterfaces.requested_services(exp)
+    # interfaces: stop at the first service without one
+    services.each_pair { |service, system_name|
+      next if SynInterfaces.get_interface(service, system_name)
+      return ["interface", { service => system_name }]
+    }
+    # interpreter: one class must cover the whole combination
+    return nil if SynInterfaces.get_interpreter_according_to_exp(exp)
+    ["interpreter", services]
+  end
+  ###
+  # Look up the registered interface class for a service/system pair.
+  #
+  # service: string: service, e.g. parser
+  # system: string: name of system, e.g. collins
+  #
+  # returns: the first registered class whose system and service
+  # both match, or nil if there is none
+  def self.get_interface(service, system)
+    @@interfaces.find { |interface_class|
+      interface_class.system == system &&
+        interface_class.service == service
+    }
+  end
+  ###
+  # helper for get_interpreter:
+  # derive the requested services from exp, then look up the interpreter
+  def self.get_interpreter_according_to_exp(exp)
+    SynInterfaces.get_interpreter(SynInterfaces.requested_services(exp))
+  end
+  ###
+  # Look up the registered interpreter class for a set of systems.
+  #
+  # An interpreter class matches when
+  # - each of its obligatory systems is present in 'systems',
+  # - each of its optional systems is either present in 'systems'
+  #   or its service is not requested at all, and
+  # - every entry of 'systems' is one of its obligatory or
+  #   optional systems.
+  #
+  # systems:
+  # hash: service(string) -> system name(string)
+  #
+  # returns: the first matching interpreter class, or nil
+  def self.get_interpreter(systems)
+    @@interpreters.find { |interpreter_class|
+      interpreter_class.systems.to_a.big_and { |service, system|
+        # obligatory entry must be requested
+        systems[service] == system
+      } &&
+        interpreter_class.optional_systems.to_a.big_and { |service, system|
+          # optional entry: requested as is, or service not requested
+          systems[service].nil? || systems[service] == system
+        } &&
+        systems.to_a.big_and { |service, system|
+          # requested entry must be known to the interpreter class
+          interpreter_class.systems[service] == system ||
+            interpreter_class.optional_systems[service] == system
+        }
+    }
+  end
+  ################
+  # NOTE: 'protected' applies to instance methods only; the singleton
+  # method defined below stays publicly callable.
+  protected
+  ###
+  # Knows which services can be switched on in the experiment file
+  # and under which key the matching system name is stored.
+  #
+  # WARNING: adapt this when you introduce new services!
+  #
+  # returns: a hash
+  #  <service> => system_name
+  #
+  #  with one entry per service whose flag (do_postag, do_lemmatize,
+  #  do_parse) is set in exp; the value is exp.get(<service>)
+  def self.requested_services(exp)
+    flag_to_service = [
+      ["do_postag", "pos_tagger"],
+      ["do_lemmatize", "lemmatizer"],
+      ["do_parse", "parser"]
+    ]
+    requested = {}
+    flag_to_service.each { |flag, service|
+      # flag set: yes, perform this service
+      requested[service] = exp.get(service) if exp.get(flag)
+    }
+    requested
+  end
+end
+require "frprep/CollinsInterface"
+require "frprep/BerkeleyInterface"
+require "frprep/SleepyInterface"
+require "frprep/MiniparInterface"
+require "frprep/TntInterface"
+require "frprep/TreetaggerInterface"
+class EmptyInterpreter < SynInterpreter
+  EmptyInterpreter.announce_me()
+  ###
+  # systems interpreted by this class:
+  # returns a hash service(string) -> system name (string),
+  # e.g.
+  # { "parser" => "collins", "lemmatizer" => "treetagger" }
+  def EmptyInterpreter.systems()
+    return {}
+  end
+  ###
+  # names of additional systems that may be interpreted by this class
+  # returns a hash service(string) -> system name(string)
+  # same as names()
+  def SynInterpreter.optional_systems()
+    return {}
+  end
+end

data/lib/frprep/TabFormat.rb ADDED Viewed

@@ -0,0 +1,720 @@
+# TabFormat.rb
+# Katrin Erk, Jan 2004
+#
+# classes to be used with tabular format text files.
+# originally CoNLL2.rb
+# Original: Katrin Erk, Jan 2004 for CoNLL '04 data
+# Rewrite: Sebastian Pado, Mar 2004 for Gemmas FrameNet data (no NEs etc.)
+# Extensions SP Jun/Jul 04
+# renamed GemmaCorpus to FNTabFormat
+# partial rewrite SP 250804: made things cleaner & leaner: no RawFormat, for example
+# sp 04/05: add a "frame" column to FNTabFormat
+#
+# Substantial changes KE 12/06:
+# variable number of columns to accommodate more than one frame per sentence
+#################################################
+# class for reading a file
+# containing data in tabular format
+require "tempfile"
+require "frprep/ISO-8859-1"
+require "frprep/ruby_class_extensions"
+#######################
+# This function takes a variable number of arguments and
+# returns them as an array
+# Idea: make formulation of tab format entries easier to read,
+# enclose variable arguments in a repeat() call,
+# which immediately gets transformed into a list
+def repeat(*args)
+  return args
+end
+#######################
+class TabFormatFile
+  #######
+  # initialize:
+  # open files for reading.
+  #
+  # fp is a flat list alternating filename and format:
+  #   [filename1, format1, filename2, format2, ...]
+  # where each format is a list of strings that will be used
+  # to address columns of the file, the 1st string for the 1st column.
+  # The formats of all files are concatenated into one pattern.
+  #
+  # format may contain _one_ entry that is an array (or a call to repeat())
+  # e.g.:
+  # ["word", "pos", "lemma", repeat("frame", "target", "gf", "pt")]
+  #
+  # raises RuntimeError if one of the files cannot be opened
+  def initialize(fp)
+    @files = Array.new
+    @patterns = Array.new
+    @no_of_read_lines = 0
+    # set to true once each_sentence has consumed the input
+    @read_completely = false
+    fp.each_with_index { |entry, ix|
+      if ix.modulo(2) == 0
+        # filename
+        begin
+          @files << File.new(entry)
+        rescue
+          raise 'Sorry, could not read input file ' + entry + "\n"
+        end
+      else
+        # pattern
+        @patterns += entry
+      end
+    }
+    @my_sentence_class = TabFormatSentence
+  end
+  ########
+  # each_sentence:
+  # yield each sentence of the files in turn.
+  # One line is read from every file per step; the lines are joined
+  # by a tab and added to the current sentence.
+  # sentences are expected to be separated
+  # by a line containing nothing but whitespace, in all files at once:
+  # if only some files have an empty line, an error is printed and
+  # the program exits with status 1.
+  # the last sentence may or may not be followed by
+  # an empty line.
+  # each_sentence ends when EOF is encountered on any file; all files
+  # are expected to have the same number of lines. The files are
+  # closed at that point, and later calls yield nothing.
+  # each sentence is yielded as an object of the sentence class
+  # (TabFormatSentence unless a subclass changes it).
+  def each_sentence
+    return if @read_completely
+    sentence = @my_sentence_class.new(@patterns)
+    begin
+      while true do
+        linearray = @files.map { |f| f.readline().chomp() }
+        @no_of_read_lines += 1
+        if linearray.detect { |x| x.strip == "" }
+          if linearray.detect { |x| x.strip != "" }
+            STDERR.puts "Error: Mismatching empty lines!"
+            exit(1)
+          else
+            # sentence finished. yield it and start a new one
+            unless sentence.empty?
+              yield sentence
+            end
+            sentence = @my_sentence_class.new(@patterns)
+          end
+        else
+          # sentence not yet finished.
+          # add this line to it
+          sentence.add_line(linearray.join("\t"))
+        end
+      end
+    rescue EOFError
+      unless sentence.empty?
+        # maybe we haven't yielded the last sentence yet.
+        yield sentence
+      end
+      # nothing more will be read: release the file handles
+      @files.each { |f| f.close unless f.closed? }
+      @read_completely = true
+    end
+  end
+end
+#################################################
+# class for keeping one line,
+# parsed.
+# The line is kept as follows:
+# - normal features: in a hash @f mapping feature names to values
+# - features of the repeated group: in an array @r of
+#   TabFormatNamedArgs objects, one per group
+#
+# each feature of the line is available by name
+# via the method "get".
+# Additional features (from other input files) can be
+# added to the TabFormatNamedArgs object via the method
+# add_feature
+#
+# methods:
+#
+# new: initialize.
+#    values: array of strings
+#    features:  how to access the strings by name
+#              'features' is an array of strings
+#              later the i-th feature will be used to access
+#              the i-th value,
+#              except for repeated groups
+#
+# get: returns one feature by its name
+#    name: a string
+#
+# add_feature: add another feature to this object,
+#              which can be accessed via "get"
+#    name: name for the new feature, should be distinct
+#          from the ones already used in new()
+#    feature: a string, the value of the feature
+##
+class TabFormatNamedArgs
+  ############
+  # values:   array of strings, the cells of one line
+  # features: array of feature names; at most one entry may itself be
+  #           an array, naming the features of a repeated group
+  # group:    number of the group that get() falls back to (or nil)
+  #
+  # Cells before the group and cells after it are stored by name in @f.
+  # The cells in between are cut into chunks of group width, and each
+  # chunk becomes its own TabFormatNamedArgs object in the array @r.
+  #
+  # raises RuntimeError if 'features' contains more than one group
+  def initialize(values, features, group = nil)
+    @f = {}
+    @r = []
+    @group = group
+    # remember the feature names; the group entry is replaced
+    # by the placeholder "GROUP"
+    @group_feature_names = nil
+    @feature_names = features.map { |entry|
+      next entry unless entry.instance_of?(Array)
+      @group_feature_names = entry
+      "GROUP"
+    }
+    if @feature_names.count("GROUP") > 1
+      $stderr.puts "More than one group in feature set:" + features.join(" ")
+      raise "Cannot handle this."
+    end
+    # position of the group in the feature list;
+    # without a group: one past the last feature
+    group_pos = @feature_names.index("GROUP") || @feature_names.length
+    trailing = [0, @feature_names.length - 1 - group_pos].max
+    # index of the first value that belongs to a feature after the group
+    first_trailing_value = values.length - trailing
+    # features before the group
+    group_pos.times { |i| @f[features[i]] = values[i] }
+    # the repeated group: one object per chunk, numbered from 0
+    if @group_feature_names
+      width = @group_feature_names.length
+      group_pos.step(first_trailing_value - 1, width) { |start|
+        @r << TabFormatNamedArgs.new(values.slice(start, width),
+                                     @group_feature_names,
+                                     @r.length)
+      }
+    end
+    # features after the group
+    first_trailing_value.upto(values.length - 1) { |i|
+      offset = i - first_trailing_value
+      @f[features[group_pos + 1 + offset]] = values[i]
+    }
+  end
+  ############
+  # Render feature/value pairs as one tab format line, in the order
+  # of the 'features' list. Features without a value in the hash
+  # are written as "-". A group entry in the feature list is skipped,
+  # i.e. the line has zero entries for the group.
+  # Hash keys that are not in the feature list are reported on stderr
+  # and ignored. Returns "" if features is nil.
+  def self.format_str(hash,     # hash: feature -> value
+                      features) # feature list, as for new()
+    return "" if features.nil?
+    unknown = hash.keys.reject { |key| features.include?(key) }
+    unknown.each { |bad_feature|
+      $stderr.puts "Error: unknown feature #{bad_feature} in format_str: ignoring."
+    }
+    plain_features = features.reject { |f| f.instance_of?(Array) }
+    plain_features.map { |feature| hash[feature] || "-" }.join("\t")
+  end
+  #############
+  # Store an additional non-group feature.
+  # raises RuntimeError if a feature of that name is already stored
+  def add_feature(name, feature)
+    raise "Trying to add a feature twice: "+name if @f.has_key?(name)
+    @f[name] = feature
+  end
+  #############
+  # get feature value, identified by feature name.
+  # Non-group features win; otherwise the feature is looked up in
+  # the group whose number was passed to new().
+  # return: feature value, or nil
+  def get(name)
+    get_nongroup(name) || get_from_group(name, @group)
+  end
+  #############
+  # Store a non-group feature, replacing any previous value.
+  def set(name, feature)
+    @f[name] = feature
+  end
+  #############
+  # number of group chunks found in the line
+  def num_groups
+    @r.length
+  end
+  #############
+  # return line as string, entries connected by tab,
+  # in the order given by the feature list passed to new()
+  def to_s
+    cells = @feature_names.map { |feature|
+      if "GROUP" === feature
+        @r.map { |group_obj| group_obj.to_s }.join("\t")
+      else
+        @f[feature]
+      end
+    }
+    cells.join("\t")
+  end
+  protected
+  # non-group feature by name
+  # return: feature value, or nil
+  def get_nongroup(feature)
+    @f[feature]
+  end
+  # feature of the group with the given number
+  # return: feature value, or nil if there is no such group
+  def get_from_group(name, group_no)
+    return nil if !group_no || group_no >= @r.length
+    @r[group_no].get_nongroup(name)
+  end
+end
+#################################################
+# class for keeping and yielding one sentence
+# in tabular format
+class TabFormatSentence
+  ############
+  # initialize:
+  # the sentence will be stored one word (plus additional info
+  # for that word) per line. Each line will be stored in a cell of
+  # the array @lines. the 'initialize' method starts with an empty
+  # array of lines.
+  def initialize(pattern)
+    @lines = Array.new
+    @pattern = pattern
+    # this is just for inheritance; FNTabFormatSentence will need this
+    @group_no = nil
+  end
+  #####
+  # length: number of words in the sentence
+  def length
+    return @lines.length
+  end
+  ################3
+  # add_line:
+  # add one entry to the @lines array, i.e. information for one word
+  # of the sentence.
+  def add_line(line)
+    @lines << line
+  end
+  ###################
+  # empty?:
+  # returns true if there are currently no lines stored in this
+  # TabFormatSentence object
+  # else false
+  def empty?
+    return @lines.empty?
+  end
+  ######################
+  # empty!:
+  # discards all entries to the @lines array,
+  # i.e. empties this TabFormatSentence object of all
+  # data
+  def empty!
+    @lines.clear
+  end
+  #####################
+  # each_line:
+  # yields each line of the sentence
+  # as a string
+  def each_line
+    @lines.each { |l| yield l }
+  end
+  ######################
+  # each_line_parsed:
+  # yields each line of the sentence
+  # broken up as follows:
+  # the line is expected to contain 6 or more pieces of
+  # information, separated by whitespace.
+  # - the word
+  # - the part of speech info for the word
+  # - syntax for roles (not to be used)
+  # - target (or -)
+  # - gramm. function for roles (not to be used)
+  # - one column with role annotation
+  #
+  # All pieces are yielded as strings, except for the argument columns, which
+  # are yielded as an array of strings.
+  def each_line_parsed
+    lineno = 0
+    f = nil
+    @lines.each { |l|
+      f = TabFormatNamedArgs.new(l.split("\t"), @pattern, @group_no)
+      f.add_feature("lineno", lineno)
+      yield f
+      lineno += 1
+    }
+  end
+  ###
+  # read_one_line:
+  # return a line of the sentence specified by its number
+  def read_one_line(number)
+    return(@lines[number])
+  end
+  ###
+  # read_one_line_parsed:
+  # like get_line, but the features in the line are returned
+  # separately,
+  # as in each_line_parsed
+  def read_one_line_parsed(number)
+    if @lines[number].nil?
+      return nil
+    else
+      f = TabFormatNamedArgs.new(@lines[number].split("\t"), @pattern, @group_no)
+      f.add_feature("lineno", number)
+      return f
+    end
+  end
+  # set line no of first line of present sentence
+  def set_starting_line(n)
+    raise "Deprecated"
+  end
+  # returns line no of first line of present sentence
+  def get_starting_line()
+    raise "Deprecated"
+  end
+end
+########################################################
+# TabFormat files containing everything that's in the FN lexunit files
+#
+# one target per sentence
+class FNTabFormatFile < TabFormatFile
+  def initialize(filename,tag_suffix=nil,lemma_suffix=nil)
+    corpusname = File.dirname(filename)+"/"+File.basename(filename,".tab")
+    filename_label_pairs = [filename,FNTabFormatFile.fntab_format()]
+    if lemma_suffix # raise exception if lemmatisation does not esist
+      filename_label_pairs.concat [corpusname+lemma_suffix,["lemma"]]
+    end
+    if tag_suffix # raise exception if tagging does not exist
+      filename_label_pairs.concat [corpusname+tag_suffix,["pos"]]
+    end
+    super(filename_label_pairs)
+    @my_sentence_class = FNTabSentence
+  end
+  def FNTabFormatFile.fntab_format()
+#    return ["word", "pt", "gf", "role", "target", "frame", "lu_sent_ids"]
+    return [
+      "word",
+      FNTabFormatFile.frametab_format(),
+      "ne", "sent_id"
+    ]
+  end
+  def FNTabFormatFile.frametab_format()
+    return ["pt", "gf", "role", "target", "frame", "stuff"]
+  end
+  ##########
+  # given a hash mapping features to values,
+  # format according to fntab_format
+  def FNTabFormatFile.format_str(hash)
+    return TabFormatNamedArgs.format_str(hash, FNTabFormatFile.fntab_format())
+  end
+end
+############################################
+class FNTabSentence < TabFormatSentence
+  ####
+  # Read one feature from a parsed line.
+  # overwrite this to get a feature from
+  # a group rather than from the main feature list
+  def get_this(l, feature_name)
+    return l.get(feature_name)
+  end
+  ####
+  # Raises RuntimeError unless the first line of the sentence has a
+  # "sent_id" entry. Only the first line is inspected.
+  def sanity_check()
+    each_line_parsed {|l|
+      if l.get("sent_id").nil?
+        raise "Error: corpus file does not conform to FN format."
+      else
+        return
+      end
+    }
+  end
+  ####
+  # returns the sentence ID, a string, as set by FrameNet
+  # (taken from the first line of the sentence)
+  def get_sent_id()
+    sanity_check
+    each_line_parsed {|l|
+      return l.get("sent_id")
+    }
+  end
+  ####
+  # iterator, yields each frame of the sentence as a FNTabFrame
+  # object. They contain the complete sentence, but provide
+  # access to exactly one frame of that sentence.
+  # Yields nothing for a sentence without lines.
+  def each_frame()
+    # how many frames? assume that each line has the same
+    # number of frames, so the first line decides
+    first_line = read_one_line_parsed(0)
+    return if first_line.nil?
+    num_frames = first_line.num_groups()
+    0.upto(num_frames - 1) { |frame_no|
+      frame_obj = FNTabFrame.new(@pattern, frame_no)
+      each_line { |l| frame_obj.add_line(l) }
+      yield frame_obj
+    }
+  end
+  ####
+  # computes a mapping from word indices to labels on these words
+  #
+  # returns a hash: index_list(array:integer) -> label(string)
+  # An entry il->label means that all the lines whose line
+  # numbers are listed in il are labeled with label.
+  #
+  # Line numbers correspond to words of the sentence. Counting starts at 0.
+  #
+  # Expected markup per line: "-" or "--" (no boundary), "B-<label>"
+  # (markable starts), "E-<label>" (markable ends), or
+  # "B-<label>:E-<label>" (one-word markable). Anything else is
+  # reported on stderr and skipped.
+  #
+  # By default, "markables" looks for role labels, i.e. labels in the
+  # column "role", but it can also look in another column.
+  # To change the default, give the column name as a parameter.
+  def markables(use_this_column="role")
+    # returns hash of {index list} -> {markup label}
+    sanity_check()
+    idlist_to_annotation_list = Hash.new
+    # determine span of each markable.
+    # if we find overlapping markables, we write a warning to STDERR,
+    # ignore the 2nd label and attempt to "close" the 1st label
+    ids = Array.new
+    label = nil
+    each_line_parsed { |l|
+      this_id = get_this(l, "lineno")
+      this_col = get_this(l, use_this_column)
+      unless this_col
+        $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id()}. Skipping."
+        next
+      end
+      this_fe_ann = this_col.split(":")
+      case this_fe_ann.length
+      when 1 # nothing at all, or a single begin or end
+        markup = this_fe_ann.first
+        if markup == "-"  or markup == "--" # no change
+          if label
+            ids << this_id
+          end
+        elsif markup =~ /^B-(\S+)$/
+          if label # are we within a markable right now?
+            $stderr.puts "[TabFormat] Warning: Markable #{$1} starts while within markable #{label}"
+            $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+          else
+            label = $1
+            ids << this_id
+          end
+        elsif markup =~ /^E-(\S+)$/
+          if label == $1 # we close the markable we've opened before
+            ids << this_id
+            # store information
+            idlist_to_annotation_list[ids] = label
+            # reset memory
+            label = nil
+            ids = Array.new
+          else
+            $stderr.puts "[TabFormat] Warning: Markable "+$1.to_s+" closes while within markable "+ label.to_s
+            $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+          end
+        else
+          $stderr.puts "[TabFormat] Warning: cannot analyse markup "+markup
+          $stderr.puts "Debug data: Sentence id #{get_sent_id()}"
+        end
+      when 2 # this should be a one-word markable
+        b_markup = this_fe_ann[0]
+        e_markup = this_fe_ann[1]
+        if label
+          $stderr.puts "[TabFormat] Warning: Finding new markable at word #{this_id} while within markable #{label}"
+          $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+        else
+          if b_markup =~ /^B-(\S+)$/
+            b_label = $1
+            if e_markup =~ /^E-(\S+)$/
+              e_label = $1
+              if b_label == e_label
+                idlist_to_annotation_list[[this_id]] = b_label
+              else
+                $stderr.puts "[TabFormat] Warning: Starting markable "+b_label+", closing markable "+e_label
+                $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+              end
+            else
+              $stderr.puts "[TabFormat] Warning: Unknown end markup "+e_markup
+              $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+            end
+          else
+            $stderr.puts "[TabFormat] Warning: Unknown start markup "+b_markup
+            $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+          end
+        end
+      else
+        # zero parts (empty entry) or more than two parts
+        $stderr.puts "Warning: cannot analyse markup with more than two colon-separated parts like "+this_fe_ann.join(":")
+        $stderr.puts "Debug data: Sentence id #{get_sent_id()}"
+      end
+    }
+    unless label.nil?
+      $stderr.puts "[TabFormat] Warning: Markable #{label} did not end in sentence."
+      $stderr.puts "Debug data: Sentence id #{get_sent_id()}, current ID list #{ids.join(" ")}"
+    end
+    return idlist_to_annotation_list
+  end
+  #######
+  # the words of the sentence, joined by single blanks
+  def to_s
+    sanity_check
+    array = Array.new
+    each_line_parsed {|l|
+      array << l.get("word")
+    }
+    return array.join(" ")
+  end
+end
+class FNTabFrame < FNTabSentence
+  ############
+  # initialize:
+  # as parent, except that we also get a frame number
+  # such that we can access the features of ``our'' frame
+  def initialize(pattern, frameno)
+    # by setting @group_no to frameno,
+    # we are initializing each TabFormatNamedArgs object
+    # in each_line_parsed() or read_one_line_parsed()
+    # with the right group number,
+    # such that all calls to TabFormatNamedArgs.get()
+    # will access the right group.
+    super(pattern)
+    @group_no = frameno
+  end
+  # returns the frame introduced by the target word(s)
+  # of this frame group, a string
+  def get_frame()
+    sanity_check()
+    each_line_parsed {|l|
+      return l.get("frame")
+    }
+  end
+  ####
+  # returns an array of integers: the indices of the target of
+  # the frame
+  # These are the line numbers, which start counting at 0
+  #
+  # a target may span more than one word
+  def get_target_indices()
+    sanity_check
+    idx = Array.new
+    each_line_parsed {|l|
+      unless l.get("target") == "-"
+        idx << l.get("lineno")
+      end
+    }
+    return idx
+  end
+  ####
+  # returns a string: the target
+  # in the case of multiword targets,
+  # we find the complete target at all
+  # indices, i.e. we can just take the first one we find
+  def get_target()
+    each_line_parsed {|l|
+      t = l.get("target")
+      unless t == "-"
+	return t
+      end
+    }
+  end
+  ####
+  # get the target POS, according to FrameNet
+  def get_target_fn_pos()
+    get_target() =~ /^[^\.]+\.(\w+)$/
+    return $1
+  end
+end