shalmaneser 0.0.1.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +284 -0
@@ -0,0 +1,144 @@
|
|
1
|
+
# sp 28 06 04
|
2
|
+
#
|
3
|
+
# this module offers methods to extract gemma corpora from the FrameNet database#
|
4
|
+
|
5
|
+
require 'FrameXML'
|
6
|
+
|
7
|
+
class FNDatabase
|
8
|
+
|
9
|
+
def each_matching_sentence(file_pred,sent_pred)
|
10
|
+
# fundamental access function to FrameXML files
|
11
|
+
|
12
|
+
# returns file objects where
|
13
|
+
# FrameXMLSentence matches sent_pred
|
14
|
+
# (FrameXMLFile is accessed through FrameXMLSentence.get_file_object and matches file_pred)
|
15
|
+
each_matching_file(file_pred) {|frameNetFile|
|
16
|
+
frameNetFile.each_sentence {|frameNetSent|
|
17
|
+
if sent_pred.call(frameNetSent)
|
18
|
+
frameNetSent.verify_annotation
|
19
|
+
yield frameNetSent
|
20
|
+
end
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
|
25
|
+
def each_matching_file(file_pred)
|
26
|
+
# fundamental access function to FrameXML files
|
27
|
+
|
28
|
+
# returns file (FrameXMLFile) objects which match file_pred
|
29
|
+
each_framexml_file{|frameNetFile|
|
30
|
+
if file_pred.call(frameNetFile)
|
31
|
+
yield frameNetFile
|
32
|
+
end
|
33
|
+
frameNetFile.close
|
34
|
+
}
|
35
|
+
end
|
36
|
+
|
37
|
+
def extract_frame(frame,outfile)
|
38
|
+
each_matching_sentence(Proc.new{|fnfile| fnfile.get_frame == frame},
|
39
|
+
Proc.new{|fnsent| true}) {|fnsent|
|
40
|
+
if fnsent.contains_FE_annotation_and_target
|
41
|
+
fnsent.print_conll_style_to(outfile)
|
42
|
+
end
|
43
|
+
}
|
44
|
+
end
|
45
|
+
|
46
|
+
def extract_lemma(lemma,outfile)
|
47
|
+
each_matching_sentence(Proc.new{|fnfile| fnfile.get_lu == lemma},
|
48
|
+
Proc.new{|fnsent| true}) {|fnsent|
|
49
|
+
if fnsent.contains_FE_annotation_and_target
|
50
|
+
fnsent.print_conll_style_to(outfile)
|
51
|
+
end
|
52
|
+
}
|
53
|
+
end
|
54
|
+
|
55
|
+
def extract_everything(outdirectory)
|
56
|
+
unless outdirectory[-1,1] == "/"
|
57
|
+
outdirectory += "/"
|
58
|
+
end
|
59
|
+
|
60
|
+
outfiles = Hash.new
|
61
|
+
each_matching_sentence(Proc.new{|fnfile| true},
|
62
|
+
Proc.new{|fnsent| true}) {|fnsent|
|
63
|
+
frame = fnsent.get_file_obj.get_frame
|
64
|
+
unless outfiles.key?(frame)
|
65
|
+
outfiles[frame] = File.new(outdirectory+frame+".tab","w")
|
66
|
+
end
|
67
|
+
if fnsent.contains_FE_annotation_and_target
|
68
|
+
fnsent.print_conll_style_to(outfiles[frame])
|
69
|
+
end
|
70
|
+
}
|
71
|
+
# close output files
|
72
|
+
outfiles.each_value {|file|
|
73
|
+
file.close
|
74
|
+
}
|
75
|
+
# remove zero-size files
|
76
|
+
Dir[outdirectory+"*"].each {|filename|
|
77
|
+
if FileTest.zero?(filename)
|
78
|
+
File.unlink(filename)
|
79
|
+
end
|
80
|
+
}
|
81
|
+
end
|
82
|
+
|
83
|
+
|
84
|
+
def initialize(fn_path)
|
85
|
+
unless fn_path[-1,1] == "/"
|
86
|
+
fn_path += "/"
|
87
|
+
end
|
88
|
+
@fn = fn_path
|
89
|
+
end
|
90
|
+
|
91
|
+
private
|
92
|
+
|
93
|
+
def each_framexml_file
|
94
|
+
# files might be zipped
|
95
|
+
Dir[@fn+"lu*.xml.gz"].each {|gzfile|
|
96
|
+
Kernel.system("cp "+gzfile+" /tmp/")
|
97
|
+
Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
|
98
|
+
gzfile =~ /(.+)\.gz/
|
99
|
+
yield FrameXMLFile.new("/tmp/"+File.basename($1))
|
100
|
+
}
|
101
|
+
# or might not
|
102
|
+
Dir[@fn+"/lu*.xml"].each {|filename|
|
103
|
+
yield FrameXMLFile.new(filename)
|
104
|
+
}
|
105
|
+
end
|
106
|
+
|
107
|
+
# I don't really remember what this was good for ;-)
|
108
|
+
|
109
|
+
# def browse_everything(allFiles)
|
110
|
+
# if allFiles
|
111
|
+
# Dir[fn+"*.xml.gz"].each {|gzfile|
|
112
|
+
# Kernel.system("cp "+gzfile+" /tmp/")
|
113
|
+
# Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
|
114
|
+
# gzfile =~ /(.+)\.gz/
|
115
|
+
# # STDERR.puts File.basename($1)
|
116
|
+
# # STDERR.print "."
|
117
|
+
# ff = FrameXMLFile.new("/tmp/"+File.basename($1))
|
118
|
+
# ff.each_sentence {|s|
|
119
|
+
# if s.contains_FE_annotation_and_target
|
120
|
+
# s.verify_annotation
|
121
|
+
# if s.verify_annotation
|
122
|
+
# puts "****************** Error: Still problems after 2nd verification!"
|
123
|
+
# end
|
124
|
+
# s.print_conll_style
|
125
|
+
# end
|
126
|
+
# }
|
127
|
+
# }
|
128
|
+
# else
|
129
|
+
# ff = FrameXMLFile.new("/tmp/lu1870.xml")
|
130
|
+
# ff.each_sentence {|s|
|
131
|
+
# if s.contains_FE_annotation_and_target
|
132
|
+
# s.verify_annotation
|
133
|
+
# if s.verify_annotation
|
134
|
+
# puts "****************** Error: Still problems after 2nd verification!"
|
135
|
+
# end
|
136
|
+
# # s.print_layers
|
137
|
+
# s.print_conll_style
|
138
|
+
# end
|
139
|
+
# }
|
140
|
+
# end
|
141
|
+
# end
|
142
|
+
|
143
|
+
end
|
144
|
+
|
@@ -0,0 +1,196 @@
|
|
1
|
+
###
|
2
|
+
# FixSynSemMapping:
|
3
|
+
# Given a SalsaTigerRegXML sentence with semantic role annotation,
|
4
|
+
# simplify the mapping of semantic roles to syntactic constituents
|
5
|
+
#
|
6
|
+
# The following is lifted from the LREC06 paper on Shalmaneser:
|
7
|
+
# During preprocessing, the span of semantic roles in the training corpora is
|
8
|
+
# projected onto the output of the syntactic parser by assigning each
|
9
|
+
# role to the set of maximal constituents covering its word span.
|
10
|
+
# f the word span of a role does not coincide
|
11
|
+
# with parse tree constituents, e.g. due to misparses,
|
12
|
+
# the role is ``spread out'' across several constituents. This leads to
|
13
|
+
# idiosyncratic paths between predicate and semantic role in the parse
|
14
|
+
# tree.
|
15
|
+
#
|
16
|
+
# [The following span standardization algorithm is used to make the
|
17
|
+
# syntax-semantics mapping more uniform:]
|
18
|
+
# Given a role r that has been assigned, let N be the set of
|
19
|
+
# terminal nodes of the syntactic structure that are covered by r.
|
20
|
+
#
|
21
|
+
# Iteratively compute the maximal projection of N in the syntactic
|
22
|
+
# structure:
|
23
|
+
# 1) If n is a node such that all of n's children are in N,
|
24
|
+
# then remove n's children from N and add n instead.
|
25
|
+
# 2) If n is a node with 3 or more children, and all of n's
|
26
|
+
# children except one are in N, then remove n's children from N
|
27
|
+
# and add n instead.
|
28
|
+
# 3) If n is an NP with 2 children, and one of them, another NP,
|
29
|
+
# is in N, and the other, a relative clause, is not, then remove
|
30
|
+
# n's children from N and add n instead.
|
31
|
+
#
|
32
|
+
# If none of the rules is applicable to N anymore, assign r to the
|
33
|
+
# nodes in N.
|
34
|
+
#
|
35
|
+
# Rule 1 implements normal maximal projection. Rule 2 ``repairs'' parser
|
36
|
+
# errors where all children of a node but one have been assigned the
|
37
|
+
# same role. Rule 3 addresses a problem of the FrameNet data, where
|
38
|
+
# relative clauses have been omitted from roles assigned to NPs.
|
39
|
+
|
40
|
+
# KE Feb 08: rule 3 currently out of commission!
|
41
|
+
|
42
|
+
require "frprep/SalsaTigerRegXML"
|
43
|
+
|
44
|
+
module FixSynSemMapping
|
45
|
+
##
|
46
|
+
# fix it
|
47
|
+
#
|
48
|
+
# relevant settings in the experiment file:
|
49
|
+
#
|
50
|
+
# fe_syn_repair:
|
51
|
+
# If there is a node that would be a max. constituent for the
|
52
|
+
# words covered by the given FE, except that it has one child
|
53
|
+
# whose words are not in the FE, use the node as max constituent anyway.
|
54
|
+
# This is to repair cases where the parser has made an attachment choice
|
55
|
+
# that differs from the one in the gold annotation
|
56
|
+
#
|
57
|
+
# fe_rel_repair:
|
58
|
+
# If there is an NP such that all of its children except one have been
|
59
|
+
# assigned the same FE, and that missing child is a relative clause
|
60
|
+
# depending on one of the other children, then take the complete NP as
|
61
|
+
# that FE
|
62
|
+
def FixSynSemMapping.fixit(sent, # SalsaTigerSentence object
|
63
|
+
exp, # experiment file object
|
64
|
+
interpreter_class) # SynInterpreter class
|
65
|
+
|
66
|
+
|
67
|
+
unless exp.get("fe_syn_repair") or exp.get("fe_rel_repair")
|
68
|
+
return
|
69
|
+
end
|
70
|
+
|
71
|
+
if sent.nil?
|
72
|
+
return
|
73
|
+
end
|
74
|
+
|
75
|
+
# "repair" FEs:
|
76
|
+
sent.each_frame { |frame|
|
77
|
+
|
78
|
+
frame.each_child { |fe_or_target|
|
79
|
+
|
80
|
+
# repair only if the FE currently
|
81
|
+
# points to more than one syn node
|
82
|
+
if fe_or_target.children.length() < 2
|
83
|
+
next
|
84
|
+
end
|
85
|
+
|
86
|
+
if exp.get("fe_rel_repair")
|
87
|
+
lastfe = fe_or_target.children.last()
|
88
|
+
if lastfe and interpreter_class.simplified_pt(lastfe) =~ /^(WDT)|(WP\$?)|(WRB)/
|
89
|
+
|
90
|
+
# remove syn nodes that the FE points to
|
91
|
+
old_fe_syn = fe_or_target.children()
|
92
|
+
old_fe_syn.each { |child|
|
93
|
+
fe_or_target.remove_child(child)
|
94
|
+
}
|
95
|
+
|
96
|
+
# set it to point only to the last previous node, the relative pronoun
|
97
|
+
fe_or_target.add_child(lastfe)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
if exp.get("fe_syn_repair")
|
102
|
+
# remove syn nodes that the FE points to
|
103
|
+
old_fe_syn = fe_or_target.children()
|
104
|
+
old_fe_syn.each { |child|
|
105
|
+
fe_or_target.remove_child(child)
|
106
|
+
}
|
107
|
+
|
108
|
+
# and recompute
|
109
|
+
new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t|
|
110
|
+
t.yield_nodes
|
111
|
+
}.flatten.uniq,
|
112
|
+
sent,
|
113
|
+
exp.get("fe_syn_repair"))
|
114
|
+
|
115
|
+
# make the FE point to the new nodes
|
116
|
+
new_fe_syn.each { |syn_node|
|
117
|
+
fe_or_target.add_child(syn_node)
|
118
|
+
}
|
119
|
+
end
|
120
|
+
} # each FE
|
121
|
+
} # each frame
|
122
|
+
end # def fixit
|
123
|
+
end # module
|
124
|
+
|
125
|
+
|
126
|
+
#########3
|
127
|
+
# old code
|
128
|
+
|
129
|
+
# if exp.get("fe_rel_repair")
|
130
|
+
# # repair relative clauses:
|
131
|
+
# # then make a procedure to pass on to max constituents
|
132
|
+
# # that will recognize the relevant cases
|
133
|
+
|
134
|
+
# accept_anyway_proc = Proc.new { |node, children_in, children_out|
|
135
|
+
|
136
|
+
# # node: SynNode
|
137
|
+
# # children_in, children_out: array:SynNode. children_in are the children
|
138
|
+
# # that are already covered by the FE, children_out the ones that aren't
|
139
|
+
|
140
|
+
# # if node is an NP,
|
141
|
+
# # and only one of its children is out,
|
142
|
+
# # and one node in children_in is an NP, and the missing child is an SBAR
|
143
|
+
# # with a child that is a relative pronoun, then consider the child in children_out as covered
|
144
|
+
# if interpreter_class.category(node) == "noun" and
|
145
|
+
# children_out.length() == 1 and
|
146
|
+
# children_in.select { |n| interpreter_class.category(n) == "noun" } and
|
147
|
+
# interpreter_class.category(children_out.first) == "sent" and
|
148
|
+
# (ch = children_out.first.children) and
|
149
|
+
# ch.select { |n| interpreter_class.relative_pronoun?(n) }
|
150
|
+
# true
|
151
|
+
# else
|
152
|
+
# false
|
153
|
+
# end
|
154
|
+
# }
|
155
|
+
|
156
|
+
# else
|
157
|
+
# accept_anyway_proc = nil
|
158
|
+
# end
|
159
|
+
|
160
|
+
|
161
|
+
# # "repair" FEs:
|
162
|
+
# sent.each_frame { |frame|
|
163
|
+
|
164
|
+
# frame.each_child { |fe_or_target|
|
165
|
+
|
166
|
+
# # repair only if the FE currently
|
167
|
+
# # points to more than one syn node, or
|
168
|
+
# # if it is a noun with a non-covered sentence sister
|
169
|
+
# if fe_or_target.children.length() > 1 or
|
170
|
+
# (exp.get("fe_rel_repair") and (curr_marked = fe_or_target.children.first()) and
|
171
|
+
# interpreter_class.category(curr_marked) == "noun" and
|
172
|
+
# (p = curr_marked.parent) and
|
173
|
+
# p.children.select { |n| n != curr_marked and interpreter_class.category(n) == "sent" } )
|
174
|
+
|
175
|
+
# # remember nodes covered by the FE
|
176
|
+
# old_fe_syn = fe_or_target.children()
|
177
|
+
|
178
|
+
# # remove syn nodes that the FE points to
|
179
|
+
# old_fe_syn.each { |child|
|
180
|
+
# fe_or_target.remove_child(child)
|
181
|
+
# }
|
182
|
+
|
183
|
+
# # and recompute
|
184
|
+
# new_fe_syn = interpreter_class.max_constituents(old_fe_syn.map { |t| t.yield_nodes}.flatten.uniq,
|
185
|
+
# sent,
|
186
|
+
# exp.get("fe_syn_repair"),
|
187
|
+
# accept_anyway_proc)
|
188
|
+
|
189
|
+
# # make the FE point to the new nodes
|
190
|
+
# new_fe_syn.each { |syn_node|
|
191
|
+
# fe_or_target.add_child(syn_node)
|
192
|
+
# }
|
193
|
+
|
194
|
+
# end # if FE points to more than one syn node
|
195
|
+
# } # each FE
|
196
|
+
# } # each frame
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# FPrepConfigData
|
2
|
+
# Katrin Erk July 05
|
3
|
+
#
|
4
|
+
# Preprocessing for Fred and Rosy:
|
5
|
+
# access to a configuration and experiment description file
|
6
|
+
|
7
|
+
require "frprep/ConfigData"
|
8
|
+
|
9
|
+
##############################
|
10
|
+
# Class FrPrepConfigData
|
11
|
+
#
|
12
|
+
# inherits from ConfigData,
|
13
|
+
# sets variable names appropriate to preprocessing task
|
14
|
+
|
15
|
+
class FrPrepConfigData < ConfigData
|
16
|
+
def initialize(filename)
|
17
|
+
|
18
|
+
# initialize config data object
|
19
|
+
super(filename, # config file
|
20
|
+
{ "prep_experiment_ID" => "string", # experiment identifier
|
21
|
+
|
22
|
+
"frprep_directory" => "string", # dir for frprep internal data
|
23
|
+
|
24
|
+
# information about the dataset
|
25
|
+
"language" => "string", # en, de
|
26
|
+
"origin"=> "string", # FrameNet, Salsa, or nothing
|
27
|
+
"format" => "string", # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
|
28
|
+
"encoding" => "string", # utf8, iso, hex, or nothing
|
29
|
+
|
30
|
+
|
31
|
+
# directories
|
32
|
+
"directory_input" => "string", # dir with input data
|
33
|
+
"directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
|
34
|
+
"directory_parserout" => "string", # dir with parser output for the parser named below
|
35
|
+
|
36
|
+
# syntactic processing
|
37
|
+
"pos_tagger" => "string", # name of POS tagger
|
38
|
+
"lemmatizer" => "string", # name of lemmatizer
|
39
|
+
"parser" => "string", # name of parser
|
40
|
+
"pos_tagger_path" => "string", # path to POS tagger
|
41
|
+
"lemmatizer_path" => "string", # path to lemmatizer
|
42
|
+
"parser_path" => "string", # path to parser
|
43
|
+
"parser_max_sent_num" => "integer", # max number of sentences per parser input file
|
44
|
+
"parser_max_sent_len" => "integer", # max sentence length the parser handles
|
45
|
+
|
46
|
+
"do_parse" => "bool", # use parser?
|
47
|
+
"do_lemmatize" => "bool",# use lemmatizer?
|
48
|
+
"do_postag" => "bool", # use POS tagger?
|
49
|
+
|
50
|
+
# output format: if tabformat_output == true,
|
51
|
+
# output in Tab format rather than Salsa/Tiger XML
|
52
|
+
# (this will not work if do_parse == true)
|
53
|
+
"tabformat_output" => "bool",
|
54
|
+
|
55
|
+
# syntactic repairs, dependent on existing semantic role annotation
|
56
|
+
"fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
|
57
|
+
"fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
|
58
|
+
},
|
59
|
+
[ ] # variables
|
60
|
+
)
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
|
@@ -0,0 +1,513 @@
|
|
1
|
+
# sp 18 06 2004
|
2
|
+
#
|
3
|
+
# access to FrameNet XML files, sentences, and annotation.
|
4
|
+
#
|
5
|
+
# sp 10 11 04: only data from the first layer with name XY is
|
6
|
+
# used for output. Other data is saved in layer XY.2nd, but is
|
7
|
+
# currently not processed.
|
8
|
+
#
|
9
|
+
# sp 22 05 04: also, if two labels exist which cover the same span
|
10
|
+
# (ie there is a double annotation within the same layer), ignore
|
11
|
+
# all but the first label.
|
12
|
+
#
|
13
|
+
# ke 13 07 05:
|
14
|
+
# - changed to RegXMl.rb
|
15
|
+
# - fixed two problems in analyse_layer:
|
16
|
+
# - Deleting problematic labels:
|
17
|
+
# For some reason, thisLayer[i+1..-1].each_index {|other_i|
|
18
|
+
# included the index 0 in any case, resulting in the 1st
|
19
|
+
# label being deleted in any case.
|
20
|
+
# - Deleting problematic labels, checking for label overlap:
|
21
|
+
# The old formulation worked only if labels occurred in the array
|
22
|
+
# in the order they occurred in the sentence, but that was not the case.
|
23
|
+
# - Change in deleting problematic labels:
|
24
|
+
# No longer delete duplicate labels, since e.g. in the PT level there
|
25
|
+
# may be more than one NP label, and we want to keep those
|
26
|
+
#
|
27
|
+
# KE January 2007:
|
28
|
+
# write new adapted FNTab format
|
29
|
+
# ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
|
30
|
+
|
31
|
+
require 'Ampersand'
|
32
|
+
require 'ISO-8859-1'
|
33
|
+
require 'RegXML'
|
34
|
+
|
35
|
+
class FrameXMLFile # only verified to work for FrameNet v1.1
|
36
|
+
|
37
|
+
def initialize(filename)
|
38
|
+
@filename = filename
|
39
|
+
file = File.new(filename)
|
40
|
+
counter = 0
|
41
|
+
while true
|
42
|
+
counter +=1
|
43
|
+
line = file.gets
|
44
|
+
if line =~ /<lexunit/
|
45
|
+
break
|
46
|
+
end
|
47
|
+
if counter > 3
|
48
|
+
STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
|
49
|
+
Kernel.exit
|
50
|
+
end
|
51
|
+
end
|
52
|
+
# found lexunit
|
53
|
+
string = line
|
54
|
+
while (line = file.gets())
|
55
|
+
string << line
|
56
|
+
end
|
57
|
+
@lexunit = RegXML.new(string)
|
58
|
+
attributes = @lexunit.attributes()
|
59
|
+
@id = attributes["ID"]
|
60
|
+
attributes["name"] =~ /^([^.]+).([^.]+)$/
|
61
|
+
@lu = $1
|
62
|
+
@pos = $2.upcase
|
63
|
+
if @lu.nil?
|
64
|
+
raise "[framexml] no lemma in header of file #{@filename}"
|
65
|
+
elsif @pos.nil?
|
66
|
+
raise "[framexml] no pos in header of file #{@filename}"
|
67
|
+
end
|
68
|
+
@frame = attributes["frame"]
|
69
|
+
end
|
70
|
+
|
71
|
+
def get_lu
|
72
|
+
return @lu.gsub(" ","_")
|
73
|
+
end
|
74
|
+
|
75
|
+
def get_lu_id
|
76
|
+
return @id
|
77
|
+
end
|
78
|
+
|
79
|
+
def get_filename
|
80
|
+
return @filename
|
81
|
+
end
|
82
|
+
|
83
|
+
def get_pos
|
84
|
+
return @pos
|
85
|
+
end
|
86
|
+
|
87
|
+
def get_frame
|
88
|
+
return @frame
|
89
|
+
end
|
90
|
+
|
91
|
+
def close
|
92
|
+
end
|
93
|
+
|
94
|
+
def each_sentence
|
95
|
+
@lexunit.children_and_text().each { |subcorpus|
|
96
|
+
subcorpus.children_and_text().each { |annotationSet|
|
97
|
+
if annotationSet.name == "annotationSet"
|
98
|
+
# sentence found
|
99
|
+
yield FrameXMLSentence.new(annotationSet,self)
|
100
|
+
end
|
101
|
+
}
|
102
|
+
}
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
class FrameXMLSentence
|
107
|
+
def initialize(annotationSet,file_obj)
|
108
|
+
@file_obj = file_obj
|
109
|
+
|
110
|
+
# layers: hash layer_name -> array:[name, start, stop]
|
111
|
+
# name: name of the element, string
|
112
|
+
# start: start character, integer
|
113
|
+
# stop: end character, integer
|
114
|
+
@layers = Hash.new
|
115
|
+
|
116
|
+
annotationSet.children_and_text().each { |sentence_or_layer_elt|
|
117
|
+
|
118
|
+
case sentence_or_layer_elt.name
|
119
|
+
when "sentence"
|
120
|
+
# sentence: has ID, its child is <text>[text]</text>
|
121
|
+
@sent_id = sentence_or_layer_elt.attributes["ID"]
|
122
|
+
text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
|
123
|
+
child.name == "text"
|
124
|
+
}
|
125
|
+
if text_elt
|
126
|
+
# found the text element. its only child should be the text
|
127
|
+
@orig_text = text_elt.children_and_text().detect { |child|
|
128
|
+
child.text?
|
129
|
+
}
|
130
|
+
if @orig_text
|
131
|
+
# take text out of RegXMl object
|
132
|
+
@orig_text = @orig_text.to_s()
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
when "layers"
|
137
|
+
# contains annotation layers
|
138
|
+
sentence_or_layer_elt.children_and_text().each { |layer|
|
139
|
+
unless layer.name == "layer"
|
140
|
+
# additional material, ignore
|
141
|
+
next
|
142
|
+
end
|
143
|
+
|
144
|
+
name = layer.attributes["name"]
|
145
|
+
unless name
|
146
|
+
raise "layer without a name"
|
147
|
+
end
|
148
|
+
unless @layers.key?(name)
|
149
|
+
@layers[name] = analyse_layer(layer, name)
|
150
|
+
end
|
151
|
+
}
|
152
|
+
end
|
153
|
+
}
|
154
|
+
|
155
|
+
@pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
|
156
|
+
@text = Ampersand.utf8_to_hex(@orig_text).split(" ") # text with special characters replaced by &...; sequences
|
157
|
+
|
158
|
+
# all text and pos_text have the same number of elements!
|
159
|
+
@start_is = Hash.new # map char indices (start of words) onto word indices
|
160
|
+
@stop_is = Hash.new # map char indices (end of words) onto word indices
|
161
|
+
@charidx = Array.new # maps word indices on [start,stop]
|
162
|
+
|
163
|
+
@double_space = Array.new
|
164
|
+
pos = 0
|
165
|
+
while (match = @orig_text.index(/(\s\s+)/,pos))
|
166
|
+
@double_space << match
|
167
|
+
pos = match+1
|
168
|
+
end
|
169
|
+
|
170
|
+
|
171
|
+
# fill start, stop and charidx arrays
|
172
|
+
char_i = 0
|
173
|
+
@pos_text.each_index {|word_i|
|
174
|
+
@start_is[char_i] = word_i
|
175
|
+
startchar = char_i
|
176
|
+
# puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
|
177
|
+
char_i += our_length(@pos_text[word_i])
|
178
|
+
@stop_is[char_i-1] = word_i
|
179
|
+
|
180
|
+
stopchar = char_i-1
|
181
|
+
|
182
|
+
# puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
|
183
|
+
|
184
|
+
@charidx << [startchar,stopchar]
|
185
|
+
|
186
|
+
# separators
|
187
|
+
if @double_space.include?(char_i) then
|
188
|
+
char_i += 2
|
189
|
+
else
|
190
|
+
char_i += 1
|
191
|
+
end
|
192
|
+
}
|
193
|
+
end
|
194
|
+
|
195
|
+
def get_file_obj
|
196
|
+
return @file_obj
|
197
|
+
end
|
198
|
+
|
199
|
+
def get_sent_id
|
200
|
+
return @sent_id
|
201
|
+
end
|
202
|
+
|
203
|
+
def print_text
|
204
|
+
puts "("+@id+ ")\t"+@text
|
205
|
+
end
|
206
|
+
|
207
|
+
def contains_FE_annotation_and_target
|
208
|
+
target_info = @layers["Target"][0]
|
209
|
+
unless target_info[0] == "Target"
|
210
|
+
STDERR.puts "Error in sentence from "+filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
|
211
|
+
STDERR.puts "Sentence: "+@text
|
212
|
+
return false
|
213
|
+
else
|
214
|
+
return (@layers.key?("FE") and target_info[2] != 0)
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
# we only verify the interesting layers (FE,GF,Target)
|
219
|
+
# if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
|
220
|
+
|
221
|
+
def verify_annotation # returns true if some change has taken place
|
222
|
+
change = false
|
223
|
+
@layers.each_pair {|layername,l|
|
224
|
+
|
225
|
+
if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers
|
226
|
+
|
227
|
+
l.each_index {|i|
|
228
|
+
|
229
|
+
element,start,stop = l[i]
|
230
|
+
|
231
|
+
newstart = start
|
232
|
+
newstop = stop
|
233
|
+
|
234
|
+
@charidx.each_index{|j|
|
235
|
+
unless j== 0
|
236
|
+
pstartidx, pstopidx = @charidx[j-1]
|
237
|
+
end
|
238
|
+
startidx, stopidx = @charidx[j]
|
239
|
+
|
240
|
+
if (start > startidx and start <= stopidx) or
|
241
|
+
(j != 0 and start > pstopidx and start < startidx)
|
242
|
+
newstart = startidx
|
243
|
+
end
|
244
|
+
|
245
|
+
if (stop >= startidx and stop < stopidx)
|
246
|
+
newstop = stopidx
|
247
|
+
elsif (j != 0 and stop > pstopidx and stop < startidx)
|
248
|
+
newstop = pstopidx
|
249
|
+
end
|
250
|
+
|
251
|
+
}
|
252
|
+
if start != newstart or stop != newstop
|
253
|
+
change = true
|
254
|
+
@layers[layername][i] = [element,newstart,newstop]
|
255
|
+
STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
|
256
|
+
markable_as_string(layername,element).each {|string|
|
257
|
+
STDERR.puts "New markable: "+string
|
258
|
+
}
|
259
|
+
STDERR.puts "Sentence: "+@pos_text.join(" ")
|
260
|
+
puts
|
261
|
+
end
|
262
|
+
}
|
263
|
+
end
|
264
|
+
}
|
265
|
+
return change
|
266
|
+
end
|
267
|
+
|
268
|
+
def print_conll_style
|
269
|
+
print_conll_style_to(STDOUT)
|
270
|
+
end
|
271
|
+
|
272
|
+
# CHANGED KE January 2007:
|
273
|
+
# write new adapted FNTab format
|
274
|
+
# ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
|
275
|
+
def print_conll_style_to(out)
|
276
|
+
|
277
|
+
# even though in principle there might be multiple
|
278
|
+
# labels for one span [i.e. in one value of the
|
279
|
+
# {gf,fe,pt} hashes], we only ever record one
|
280
|
+
|
281
|
+
gf = Hash.new
|
282
|
+
add_all_to_hash(gf,"GF")
|
283
|
+
fe = Hash.new
|
284
|
+
add_all_to_hash(fe,"FE")
|
285
|
+
pt = Hash.new
|
286
|
+
add_all_to_hash(pt,"PT")
|
287
|
+
target = Hash.new
|
288
|
+
add_all_to_hash(target,"Target")
|
289
|
+
|
290
|
+
in_target = false
|
291
|
+
|
292
|
+
@pos_text.each_index {|i|
|
293
|
+
# write format:
|
294
|
+
# "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
|
295
|
+
line = Array.new
|
296
|
+
# word
|
297
|
+
word = @pos_text[i]
|
298
|
+
line << word
|
299
|
+
|
300
|
+
start, stop = @charidx[i]
|
301
|
+
# "pt", "gf", "role",
|
302
|
+
[pt,gf,fe].each {|hash|
|
303
|
+
token = Array.new
|
304
|
+
if hash.key?([start,"start"])
|
305
|
+
markables = hash.delete([start,"start"])
|
306
|
+
markables.each {|element|
|
307
|
+
token << "B-"+element
|
308
|
+
}
|
309
|
+
end
|
310
|
+
if hash.key?([stop,"stop"])
|
311
|
+
markables = hash.delete([stop,"stop"])
|
312
|
+
markables.each {|element|
|
313
|
+
token << "E-"+element
|
314
|
+
}
|
315
|
+
end
|
316
|
+
if token.empty?
|
317
|
+
line << "-"
|
318
|
+
else
|
319
|
+
line << token.sort.join(":")
|
320
|
+
end
|
321
|
+
}
|
322
|
+
# "target"
|
323
|
+
if target.key?([start,"start"])
|
324
|
+
target.delete([start,"start"])
|
325
|
+
in_target = true
|
326
|
+
end
|
327
|
+
if in_target
|
328
|
+
line << @file_obj.get_lu+"."+@file_obj.get_pos
|
329
|
+
else
|
330
|
+
line << "-"
|
331
|
+
end
|
332
|
+
if target.key?([stop,"stop"])
|
333
|
+
target.delete([stop,"stop"])
|
334
|
+
in_target = false
|
335
|
+
end
|
336
|
+
# "frame"
|
337
|
+
line << @file_obj.get_frame
|
338
|
+
|
339
|
+
# "stuff" "ne",
|
340
|
+
line << "-"
|
341
|
+
line << "-"
|
342
|
+
|
343
|
+
# "sent_id"
|
344
|
+
line << @file_obj.get_lu_id+"-"+@sent_id
|
345
|
+
|
346
|
+
out.puts line.join("\t")
|
347
|
+
}
|
348
|
+
|
349
|
+
out.puts
|
350
|
+
|
351
|
+
[gf,fe,pt,target].each {|hash|
|
352
|
+
unless hash.empty?
|
353
|
+
STDERR.puts @file_obj.get_filename
|
354
|
+
raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
|
355
|
+
end
|
356
|
+
}
|
357
|
+
end
|
358
|
+
|
359
|
+
|
360
|
+
# Debugging helper: dump every annotation layer in @layers to stdout.
# For each layer it prints the layer name, then one indented line per
# (element, start, stop) triple, and a "***" separator after the layer.
def print_layers
  @layers.each do |layer_name, entries|
    puts "Layer #{layer_name}:"
    entries.each do |element, from, to|
      puts "\t#{element}: #{from} -- #{to}"
    end
    puts "***"
  end
end
|
369
|
+
|
370
|
+
|
371
|
+
private
|
372
|
+
|
373
|
+
|
374
|
+
# Length of +string+ with every XML entity reference (non-greedy "&...;")
# counted as a single character.
def our_length(string)
  collapsed = string.gsub(/&(.+?);/, "X")
  collapsed.length
end
|
377
|
+
|
378
|
+
# Does the "FE" layer contain a markable named +fename+?
# Returns true/false.
def is_fe(fename)
  @layers["FE"].any? { |name, _start, _stop| name == fename }
end
|
386
|
+
|
387
|
+
|
388
|
+
# Collect the surface strings of all markables named +markup_name+ on
# layer +layername+; returns an array of strings, one per match.
#
# For each matching (name, start, stop) triple in @layers[layername], the
# words of @pos_text whose character spans (@charidx) fall inside
# [start, stop] are joined with spaces. The result is suffixed with
# "[start,stop, VERIFIED]" when the span ends exactly on a word boundary,
# or "[start,stop,ERROR]" when it ends inside a word.
#
# Bug fix vs. original: removed the dead locals festart/festop, which were
# assigned nil and never read.
def markable_as_string(layername, markup_name)
  result = Array.new

  @layers[layername].each { |name, start, stop|
    next unless markup_name == name

    fe = Array.new
    infe = false
    @charidx.each_index { |i|
      startidx, stopidx = @charidx[i]
      # start of the markable span: begin collecting words
      infe = true if startidx == start
      fe << @pos_text[i] if infe
      if stopidx == stop
        # span ends exactly at a word boundary
        result << (fe.join(" ") + "[" + start.to_s + "," + stop.to_s + ", VERIFIED]")
        break
      elsif stopidx > stop
        # span ends inside a word: flag as inconsistent
        result << (fe.join(" ") + "[" + start.to_s + "," + stop.to_s + ",ERROR]")
        break
      end
    }
  }
  return result
end
|
418
|
+
|
419
|
+
# Register +name+ under +key+ unless the key is already taken.
#
# Returns true when +key+ was already present (the hash is then left
# untouched — the first registration wins), false when a new entry
# [name] was created.
def add_to_hash(hash, key, name)
  if hash.key?(key)
    true
  else
    hash[key] = [name]
    false
  end
end
|
429
|
+
|
430
|
+
# Register every markable of layer +layername+ in +hash+ under both its
# [start, "start"] and [stop, "stop"] keys (via add_to_hash).
#
# "uniq" filters out erroneous duplicate annotations. When two distinct
# elements start at the same position, only the first is kept and a
# warning is written to STDERR.
def add_all_to_hash(hash, layername)
  @layers[layername].uniq.each do |element, start, stop|
    already_taken = add_to_hash(hash, [start, "start"], element)
    if already_taken
      overview = @layers[layername].map { |el, st, sp| "#{el} (#{st},#{sp})" }.join(" ")
      STDERR.puts "Warning [" + @file_obj.get_filename + "]: In layer " + layername + ", two elements start at position " + start.to_s + ". Only using first. Layer as read from FrameXML: " + overview
    else
      add_to_hash(hash, [stop, "stop"], element)
    end
  end
end
|
441
|
+
|
442
|
+
|
443
|
+
# Read one annotation layer from its FrameXML <layer> element and return
# the contained labels as [name(String), start(Integer), end(Integer)]
# triples.
#
# layer_elt: element object exposing #children_and_text (RegXML-style);
# name:      the layer name (e.g. "FE"), used only for error reporting.
#
# Labels are skipped when they are null-instantiated ("itype" attribute
# present) or carry neither a "start" nor an "end" attribute. Overlapping
# labels are detected and the later one is discarded, with a warning on
# $stderr.
def analyse_layer(layer_elt, name)
  if name.nil?
    # Bug fix: the original referenced an undefined local `line` here,
    # which raised NameError instead of printing the diagnostic.
    STDERR.puts "Error: layer element with empty name."
  end

  # this_layer: all well-formed labels; retv: labels surviving the overlap check
  this_layer = Array.new
  retv = Array.new

  labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels" }
  unless labels_elt
    # no labels found, return empty array
    return this_layer
  end

  labels_elt.children_and_text.each { |label|
    unless label.name == "label"
      # some other markup, ignore
      next
    end

    attributes = label.attributes()
    if attributes["itype"]
      # null instantiation, don't retain
      next
    end
    if not(attributes["start"]) and not(attributes["end"])
      # no start and end labels
      next
    end
    this_layer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
  }

  # sanity check: make sure we don't have overlapping labels;
  # on overlap, keep the earlier label and drop the later one
  delete_hash = Hash.new # label index -> true if the label is to be deleted

  this_layer.each_index { |i|
    # efficiency: skip labels already marked for deletion
    next if delete_hash[i]
    this_label, this_from, this_to = this_layer[i]

    # compare with all remaining labels
    (i + 1...this_layer.length).each { |other_i|
      other_label, other_from, other_to = this_layer[other_i]

      if this_from <= other_from and other_from <= this_to
        $stderr.puts "Warning: Label overlap, deleting #{other_label}"
        delete_hash[other_i] = true
      elsif this_from <= other_to and other_to <= this_to
        $stderr.puts "Warning: Label overlap, deleting #{this_label}"
        delete_hash[i] = true
      end
    }

    # keep the label only if it survived all comparisons
    # (the inner loop may have marked i itself for deletion)
    retv << this_layer[i] unless delete_hash[i]
  }

  return retv
end
|
513
|
+
end
|