RubyGems - shalmaneser-prep - Versions diffs - 1.2.0.rc4 - Mend

shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +7 -0
data/.yardopts +10 -0
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +93 -0
data/lib/frprep/Ampersand.rb +39 -0
data/lib/frprep/CollinsInterface.rb +1165 -0
data/lib/frprep/Counter.rb +18 -0
data/lib/frprep/FNCorpusXML.rb +643 -0
data/lib/frprep/FNDatabase.rb +144 -0
data/lib/frprep/FrameXML.rb +513 -0
data/lib/frprep/Graph.rb +345 -0
data/lib/frprep/MiniparInterface.rb +1388 -0
data/lib/frprep/RegXML.rb +269 -0
data/lib/frprep/STXmlTerminalOrder.rb +194 -0
data/lib/frprep/SleepyInterface.rb +384 -0
data/lib/frprep/TntInterface.rb +44 -0
data/lib/frprep/TreetaggerInterface.rb +327 -0
data/lib/frprep/do_parses.rb +143 -0
data/lib/frprep/frprep.rb +693 -0
data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
data/lib/frprep/interfaces/stanford_interface.rb +353 -0
data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
data/lib/frprep/one_parsed_file.rb +28 -0
data/lib/frprep/opt_parser.rb +94 -0
data/lib/frprep/ruby_class_extensions.rb +310 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +58 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +99 -0
data/test/functional/test_rosy.rb +40 -0
metadata +85 -0

data/lib/frprep/Counter.rb ADDED

@@ -0,0 +1,18 @@
# Counter class - provides unique ids with state.
# The object remembers the next id to hand out between calls.
class Counter
  # init_value: the first id that #next will return
  def initialize(init_value)
    @v = init_value
  end

  # Peek at the id the next call to #next will return,
  # without consuming it.
  def get
    @v
  end

  # Advance the counter by one and return the id
  # it held before the increment.
  def next
    (@v += 1) - 1
  end
end

data/lib/frprep/FNCorpusXML.rb ADDED

@@ -0,0 +1,643 @@
+# KE Dec 2006
+# Access for FrameNet corpus XML file
+# Mainly taken over from FramesXML
+#
+# changes:
+# - no single frame for the whole corpus
+# - below <sentence> level there is an <annotationSet> level.
+#   One annotationSet may include a single frame,
+#   or a reference to all named entities in a sentence
+#
+# Write out in tab format, one line per word:
+# Format:
+#    word (pt gf role target frame stuff)* ne sent_id
+# with
+#   word: word
+#   whole bracketed group: information about one frame annotation
+#    pt: phrase type
+#    gf: grammatical function
+#    role: frame element
+#    target: LU occurrence
+#    frame: frame
+#    stuff: support, and other things
+#   ne:    named entity
+#   sent_id: sentence ID
+require 'frprep/Ampersand'
+require 'common/ISO-8859-1'
+require 'common/RegXML'
#####################
# Mixins for RegXML: look up child nodes by element name,
# so callers do not have to filter children_and_text() by hand.
class RegXML
  # Return the first entry of children_and_text() whose name()
  # equals child_name, or nil if no entry matches.
  def first_child_matching(child_name)
    children_and_text.find { |child| child.name == child_name }
  end

  # Yield, in order, each entry of children_and_text() whose name()
  # equals child_name.
  def each_child_matching(child_name)
    children_and_text.each do |child|
      next unless child.name == child_name
      yield child
    end
  end
end
#####################
# Class to keep the data of one <annotationSet> of a sentence:
# either one frame annotation or the named entity labels of the sentence.
class FNCorpusAset
  attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu

  #######
  # Analyze RegXML object, store in object variables:
  #
  # @aset_type: "frame" or "NER"; stays nil for a frame annotation set
  #             whose LU name cannot be determined
  # @frame_name: frame name for "frame" type
  # @lu: LU for "frame" type
  # @aset_id: ID of the annotation set
  # @layers: hash: layer type (FE, GF, PT, Target, NER) -> [offset, "start"/"stop"]  -> list of labels
  #     string -> int*string -> array:string
  #
  # Parameters:
  #   aset:    RegXML object for the <annotationSet> element
  #   charidx: array of pairs [start index, stop index] int*int, one per word
  #
  # Malformed input is reported on $stderr and leaves the object
  # partially filled; no exception is raised for it.
  def initialize(aset, charidx)
    @layers = {}
    @frame_name = nil
    @lu = nil
    @aset_type = nil

    attributes = aset.attributes
    @aset_id = attributes["ID"]

    if attributes["frameName"]
      init_frame(aset, attributes, charidx)
    else
      init_ner(aset, charidx)
    end
  end

  #############
  # input: <layer> RegXML object
  # analyze this, put into @layers data structure
  #
  # Parameters:
  #   layer:   RegXML object for one <layer> element
  #   charidx: array:int*int pairs start/end index of words
  def analyze_layer(layer, charidx)
    layer_attributes = layer.attributes
    layer_name = layer_attributes["name"]
    unless layer_name
      complain("FNCorpusAset warning: cannot determine layer name", layer)
      return
    end

    # FN-specific: skip 2nd layer FEs for now
    return if layer_name == "FE" && layer_attributes["rank"] == "2"

    @layers[layer_name] ||= {}

    labels = layer.first_child_matching("labels")
    # nothing to record for this layer
    return unless labels

    # taking over much of analyse_layer() from class FrameXML
    spans = []
    labels.each_child_matching("label") do |label|
      attributes = label.attributes
      # null instantiation, ignore
      next if attributes["itype"] =~ /NI/
      # no start and end labels
      next if !attributes["start"] && !attributes["end"]

      spans << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    end

    # sanity check: do indices
    # match word start and end indices?
    spans = verify_annotation(spans, charidx)

    # sanity check: verify that
    # we don't have overlapping labels
    dropped = {} # index into spans -> true for labels which are to be deleted
    spans.each_index do |i|
      # efficiency: skip already deleted labels
      next if dropped[i]

      this_label, this_from, this_to = spans[i]

      # compare with all remaining labels
      ((i + 1)...spans.length).each do |other_i|
        other_label, other_from, other_to = spans[other_i]
        if this_from <= other_from && other_from <= this_to
          # the later label starts inside this one: throw out the later label
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          dropped[other_i] = true
        elsif this_from <= other_to && other_to <= this_to
          # the later label starts before this one but ends inside it:
          # throw out this label
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          dropped[i] = true
        end
      end

      # matched with all other labels; only record the label if it survived
      next if dropped[i]

      [[this_from, "start"], [this_to, "stop"]].each do |key|
        (@layers[layer_name][key] ||= []) << this_label
      end
    end
  end

  ##############
  # verify found triples label/from_index/to_index
  # against given start/end indices of words:
  # an index that falls inside a word or between two words
  # is moved to a word boundary, and the change is reported on $stderr
  #
  # Parameters:
  #   found:   array: label/from/to, string*int*int
  #   charidx: array: from/to, int*int
  #
  # returns: triples, possibly changed
  def verify_annotation(found, charidx)
    found.map do |element, start, stop|
      newstart = start
      newstop = stop

      # compare against word start/stop indices
      charidx.each_with_index do |(startidx, stopidx), j|
        # stop index of the preceding word; nil for the first word
        pstopidx = j.zero? ? nil : charidx[j - 1][1]

        if (start > startidx && start <= stopidx) ||
           (pstopidx && start > pstopidx && start < startidx)
          newstart = startidx
        end

        if stop >= startidx && stop < stopidx
          newstop = stopidx
        elsif pstopidx && stop > pstopidx && stop < startidx
          newstop = pstopidx
        end
      end

      # change?
      if start != newstart || stop != newstop
        # report change
        $stderr.puts "FNCorpusXML warning: Heuristics has changed element #{element}"
        $stderr.puts "\tfrom [#{start},#{stop}] to [#{newstart},#{newstop}]"
      end
      [element, newstart, newstop]
    end
  end

  private

  ###
  # fill the object from an annotation set that has a frameName attribute
  def init_frame(aset, attributes, charidx)
    # all of these seem to be frames
    unless attributes["luName"]
      complain("FNCorpusAset warning: cannot determine LU name", aset)
      return
    end

    @aset_type = "frame"
    @frame_name = attributes["frameName"]
    @lu = attributes["luName"]

    layers = aset.first_child_matching("layers")
    unless layers
      complain("FNCorpusAset warning: unexpectedly no layers found", aset)
      return
    end

    layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
  end

  ###
  # fill the object from an annotation set without a frameName attribute
  def init_ner(aset, charidx)
    # all we seem to get here are named entity labels.
    @aset_type = "NER"

    layers = aset.first_child_matching("layers")
    unless layers
      complain("FNCorpusAset Warning: unexpectedly no layers found", aset)
      return
    end

    layer = layers.first_child_matching("layer")
    unless layer
      complain("FNCorpusAset Warning: unexpectedly no layers found", aset)
      return
    end

    layer_name = layer.attributes["name"]
    unless layer_name == "NER"
      complain("FNCorpusAset Warning: unexpected layer #{layer_name}, was expecting only an NER layer.", aset)
      return
    end

    analyze_layer(layer, charidx)
  end

  ###
  # write a warning to $stderr, followed by a dump of the offending node
  def complain(message, node)
    $stderr.puts message
    $stderr.puts node.to_s
  end
end
#####################
# one FrameNet corpus
#
# just the filename is stored,
# the text is read only on demand
class FNCorpusXMLFile
  ###
  # filename: path of the FrameNet corpus XML file
  def initialize(filename)
    @filename = filename
  end

  ###
  # yield each document in this corpus
  # as a string
  #
  # Expected file layout:
  # <corpus>
  #   <documents>
  #     <document ...>
  #     </document>
  #     <document ...>
  #     </document>
  #   </documents>
  # </corpus>
  #
  # Line breaks inside the document are kept.
  # Without a block, an Enumerator over the document strings is returned.
  def each_document_string
    return enum_for(:each_document_string) unless block_given?

    each_element_string("document", true) { |doc_string| yield doc_string }
  end

  ###
  # yield each sentence
  # as a FNCorpusXMLSentence object
  #
  # Expected file layout:
  # <corpus>
  #   <documents>
  #     <document ...>
  #       <paragraphs>
  #         <paragraph>
  #           <sentences>
  #             <sentence ...>
  #
  # Line breaks inside the sentence are dropped.
  # Without a block, an Enumerator over the sentence objects is returned.
  def each_sentence
    return enum_for(:each_sentence) unless block_given?

    each_element_string("sentence", false) do |sent_string|
      yield FNCorpusXMLSentence.new(sent_string)
    end
  end

  ###
  # print whole FN file in tab format
  def print_conll_style(file = $stdout)
    each_sentence { |s_obj| s_obj.print_conll_style(file) }
  end

  private

  ###
  # Scan the file line by line and yield the text of every <tag ...> element,
  # from its opening tag up to and including its closing tag.
  #
  # tag:           element name; the opening tag must carry attributes
  #                (the name has to be followed by whitespace)
  # keep_newlines: keep (true) or chomp (false) the line ends of the lines
  #                between the opening and the closing line
  #
  # Text before the opening tag and after the closing tag is discarded.
  # The file is opened in block form so that it is closed again even if
  # the caller leaves the iteration early.
  def each_element_string(tag, keep_newlines)
    opening_re = /^.*?(<#{tag}\s.*)$/
    closing_re = /^(.*?<\/#{tag}>).*$/

    buffer = ""
    inside = false

    File.open(@filename) do |f|
      f.each do |line|
        if !inside
          next unless line =~ opening_re

          opening = $1
          if opening =~ closing_re
            # element opened and closed on the same line
            yield $1
          else
            # start of element
            buffer << opening
            inside = true
          end
        elsif line =~ closing_re
          # end of element
          buffer << $1
          yield buffer
          buffer = ""
          inside = false
        else
          # within element
          buffer << (keep_newlines ? line : line.chomp)
        end
      end
    end
  end
end
#######################################
# Keep one sentence from FN corpus XML
# as a RegXML object,
# offer printout in tabular format
class FNCorpusXMLSentence
  #########
  # sent_string: text of one <sentence> element, handed to RegXML.new
  def initialize(sent_string)
    @sent = RegXML.new(sent_string)
    @sent_id = @sent.attributes["ID"]
  end

  ##############
  # print to file
  # in tabular format
  #
  # row format:
  # word (pt gf role target frame stuff)* ne sent_id
  #
  #   word: word
  #   whole bracketed group: information about one frame annotation
  #    pt: phrase type
  #    gf: grammatical function
  #    role: frame element
  #    target: LU occurrence
  #    frame: frame
  #    stuff: support, and other things
  #   ne:    named entity
  #   sent_id: sentence ID
  #
  # Empty cells are written as "-". The ne column is always present,
  # also for sentences without a named entity annotation set.
  # The sentence is followed by an empty line.
  # Labels that could not be attached to any word are reported on $stderr.
  #
  # Raises RuntimeError if a row does not have 3 + 6x columns.
  def print_conll_style(file = $stdout)
    pos_text, charidx = read_sentence
    asets = read_annotation_sets(charidx)

    # layers that have a column of their own
    label_layers = %w[PT GF FE]
    own_column_layers = label_layers + ["Target"]

    # aset -> are we inside the target or not?
    in_target = Hash.new(false)
    # aset -> list of currently open other annotations, like Support
    in_stuff = {}
    # labels of the named entity we are inside of, or nil
    in_ne = nil
    # record every opening and closing label we recognize,
    # to check later
    recognized_labels = {}

    # named entities: only the first NER annotation set is used
    ner = asets.detect { |a| a.aset_type == "NER" }
    ner_layer = ner && ner.layers["NER"]

    pos_text.each_index do |i|
      # add: word
      line = [pos_text[i]]
      start, stop = charidx[i]

      # iterate over the frames we have
      # add: (pt gf role target frame stuff)
      asets.each do |aset|
        # don't treat NEs as a frame here
        next unless aset.aset_type == "frame"

        # pt, gf, role
        label_layers.each do |layer|
          # a frame need not have every layer
          marks = aset.layers[layer] || {}
          token = []
          if marks.has_key?([start, "start"])
            recognized_labels[[layer, start, "start"]] = true
            marks[[start, "start"]].each { |element| token << "B-#{element}" }
          end
          if marks.has_key?([stop, "stop"])
            recognized_labels[[layer, stop, "stop"]] = true
            marks[[stop, "stop"]].each { |element| token << "E-#{element}" }
          end
          line << (token.empty? ? "-" : token.sort.join(":"))
        end

        # target
        target = aset.layers["Target"] || {}
        if target.has_key?([start, "start"])
          recognized_labels[["Target", start, "start"]] = true
          in_target[aset] = true
        end
        line << (in_target[aset] ? aset.lu : "-")
        if target.has_key?([stop, "stop"])
          recognized_labels[["Target", stop, "stop"]] = true
          in_target[aset] = false
        end

        # frame
        line << aset.frame_name

        # stuff: all layers without a column of their own
        open_stuff = (in_stuff[aset] ||= [])
        aset.layers.each do |layer, marks|
          # already done those
          next if own_column_layers.include?(layer)
          next unless marks.has_key?([start, "start"])

          marks[[start, "start"]].each { |entry| open_stuff << "#{layer}-#{entry}" }
          recognized_labels[[layer, start, "start"]] = true
        end
        line << (open_stuff.empty? ? "-" : open_stuff.join(":"))
        # close whatever ends on this word, after it has been printed
        aset.layers.each do |layer, marks|
          next unless marks.has_key?([stop, "stop"])

          recognized_labels[[layer, stop, "stop"]] = true
          marks[[stop, "stop"]].each { |entry| open_stuff.delete("#{layer}-#{entry}") }
        end
      end

      # ne
      if ner_layer && ner_layer.has_key?([start, "start"])
        recognized_labels[["NER", start, "start"]] = true
        in_ne = ner_layer[[start, "start"]]
      end
      line << (in_ne ? in_ne.join(":") : "-")
      if ner_layer && ner_layer.has_key?([stop, "stop"])
        recognized_labels[["NER", stop, "stop"]] = true
        in_ne = nil
      end

      # sent id
      line << @sent_id

      # sanity check:
      # row format:
      # word (pt gf role target frame stuff)* ne sent_id
      # so number of columns must be 3 + 6x for some x >= 0
      unless (line.length - 3) % 6 == 0
        $stderr.puts "Something wrong with the line length."
        $stderr.puts "I have #{asets.length - 1} frames plus NEs, "
        $stderr.puts "but #{line.length} columns."
        raise "FNCorpusXML error: row with #{line.length} columns in sentence #{@sent_id}"
      end

      file.puts line.join("\t")
    end

    report_lost_labels(asets, recognized_labels, pos_text, charidx)

    file.puts
  end

  ################
  private

  ###
  # sanity check:
  # count all labels, to see if we've printed them all;
  # report every label that was not recognized on $stderr,
  # preceded by the word offsets of the sentence
  def report_lost_labels(asets, recognized_labels, pos_text, charidx)
    lost_labels = []
    asets.each do |aset|
      aset.layers.each do |layer, marks|
        marks.each do |(offset, start_or_stop), labels|
          next if recognized_labels[[layer, offset, start_or_stop]]

          lost_labels << [layer, offset, start_or_stop, labels]
        end
      end
    end
    return if lost_labels.empty?

    $stderr.puts "Offsets: "
    pos_text.each_index do |i|
      $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
    end
    lost_labels.each do |layer, offset, start_or_stop, labels|
      $stderr.puts "FNCorpusXML warning: lost label"
      $stderr.puts "\tLayer #{layer}"
      $stderr.puts "\tOffset #{offset}"
      $stderr.puts "\tStatus #{start_or_stop}"
      $stderr.puts "\tLabels #{labels.join(" ")}"
    end
  end

  ###
  # read annotation sets:
  # parse the annotation sets in the @sent object,
  # return as:
  # array of FNCorpusAset objects
  # (empty if the sentence has no <annotationSets> element)
  def read_annotation_sets(charidx)
    annotation_sets = @sent.first_child_matching("annotationSets")
    return [] unless annotation_sets

    frames = []
    annotation_sets.each_child_matching("annotationSet") do |aset|
      frames << FNCorpusAset.new(aset, charidx)
    end
    frames
  end

  ###
  # basically taken over from FrameXML.rb
  # read sentence words,
  # return as: sentence, indices
  # - sentence as array of strings, one word per string
  # - indices: array of pairs [word start char.index, word end char.index] int*int
  # Both arrays are empty if the sentence has no <text> element
  # or the <text> element has no text.
  def read_sentence
    # pos_text and charidx have the same number of elements!
    charidx = [] # maps word indices on [start,stop]
    pos_text = []

    text_elt = @sent.first_child_matching("text")
    # no text found for this sentence
    return [pos_text, charidx] unless text_elt

    text_node = text_elt.children_and_text.detect { |child| child.text? }
    return [pos_text, charidx] unless text_node

    # take text out of RegXML object
    orig_text = text_node.to_s

    # text with special char.s replaced by iso8859 char.s
    pos_text = UtfIso.to_iso_8859_1(orig_text).split(" ")

    # character positions in the original text
    # at which a run of two or more whitespace characters starts
    double_space = {}
    pos = 0
    while (match = orig_text.index(/(\s\s+)/, pos))
      double_space[match] = true
      pos = match + 1
    end

    # fill charidx array
    char_i = 0
    pos_text.each do |word|
      startchar = char_i
      char_i += our_length(word)
      charidx << [startchar, char_i - 1]

      # separators
      char_i += double_space.key?(char_i) ? 2 : 1
    end

    [pos_text, charidx]
  end

  ###
  # length of a word in which every &...; entity counts as one character
  def our_length(string)
    string.gsub(/&(.+?);/, "X").length
  end
end