RubyGems - shalmaneser-lib - Versions diffs - 1.2.rc5 - Mend

shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

checksums.yaml +7 -0
data/.yardopts +10 -0
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +122 -0
data/lib/configuration/config_data.rb +457 -0
data/lib/configuration/config_format_element.rb +210 -0
data/lib/configuration/configuration_error.rb +15 -0
data/lib/configuration/external_config_data.rb +56 -0
data/lib/configuration/frappe_config_data.rb +134 -0
data/lib/configuration/fred_config_data.rb +199 -0
data/lib/configuration/rosy_config_data.rb +126 -0
data/lib/db/db_interface.rb +50 -0
data/lib/db/db_mysql.rb +141 -0
data/lib/db/db_sqlite.rb +280 -0
data/lib/db/db_table.rb +237 -0
data/lib/db/db_view.rb +416 -0
data/lib/db/db_wrapper.rb +175 -0
data/lib/db/select_table_and_columns.rb +10 -0
data/lib/db/sql_query.rb +243 -0
data/lib/definitions.rb +19 -0
data/lib/eval.rb +482 -0
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/external_systems.rb +251 -0
data/lib/framenet_format/fn_corpus_aset.rb +209 -0
data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
data/lib/framenet_format/fn_database.rb +143 -0
data/lib/framenet_format/frame_xml_file.rb +104 -0
data/lib/framenet_format/frame_xml_sentence.rb +411 -0
data/lib/logging.rb +25 -0
data/lib/ml/classifier.rb +189 -0
data/lib/ml/mallet.rb +236 -0
data/lib/ml/maxent.rb +229 -0
data/lib/ml/optimize.rb +195 -0
data/lib/ml/timbl.rb +140 -0
data/lib/monkey_patching/array.rb +82 -0
data/lib/monkey_patching/enumerable_bool.rb +24 -0
data/lib/monkey_patching/enumerable_distribute.rb +18 -0
data/lib/monkey_patching/file.rb +131 -0
data/lib/monkey_patching/subsumed.rb +24 -0
data/lib/ruby_class_extensions.rb +4 -0
data/lib/salsa_tiger_xml/corpus.rb +24 -0
data/lib/salsa_tiger_xml/fe_node.rb +98 -0
data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
data/lib/salsa_tiger_xml/frame_node.rb +145 -0
data/lib/salsa_tiger_xml/graph_node.rb +347 -0
data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
data/lib/salsa_tiger_xml/sem_node.rb +58 -0
data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
data/lib/salsa_tiger_xml/syn_node.rb +169 -0
data/lib/salsa_tiger_xml/tree_node.rb +59 -0
data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
data/lib/salsa_tiger_xml/usp_node.rb +72 -0
data/lib/salsa_tiger_xml/xml_node.rb +163 -0
data/lib/shalmaneser/lib.rb +1 -0
data/lib/tabular_format/fn_tab_format_file.rb +38 -0
data/lib/tabular_format/fn_tab_frame.rb +67 -0
data/lib/tabular_format/fn_tab_sentence.rb +169 -0
data/lib/tabular_format/tab_format_file.rb +91 -0
data/lib/tabular_format/tab_format_named_args.rb +184 -0
data/lib/tabular_format/tab_format_sentence.rb +119 -0
data/lib/value_restriction.rb +49 -0
metadata +131 -0

data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb ADDED

@@ -0,0 +1,84 @@
+module STXML
+# sp jul 05 05
+#
+# Static helper methods for SalsaTigerRegXML:
+# - provide header and footer for Salsa/Tiger XML files
+# - escape and unescape HTML entities
+#
+# changed KE nov 05:
+# many methods moved to FrappeHelper
+class SalsaTigerXMLHelper
+  # escape and unescape strings for representation in XML
+  @replacements = [
+    # ["&apos;&apos;","&quot;"], # added by ines (09/03/09), might cause problems for unescape???
+    ["&", "&amp;"], # must be first for escaping, last for unescaping
+    ["<", "&lt;"],
+    [">", "&gt;"],
+    ["\"", "&apos;&apos;"],
+    # ["\"","&quot;"],
+    # ["\'\'","&quot;"],
+    # ["\`\`","&quot;"],
+    ["\'", "&apos;"],
+    ["\`\`", "&apos;&apos;"],
+    # ["''","&apos;&apos;"]
+  ]
+  ###
+  # get header of SalsaTigerXML files (as string)
+  def self.get_header
+    header = <<ENDOFHEADER
+<?xml version="1.0" encoding="UTF-8"?>
+  <corpus corpusname="corpus" target="">
+    <head>
+      <meta>
+        <format>NeGra format, version 3</format>
+      </meta>
+      <frames xmlns="http://www.clt-st.de/framenet/frame-database">
+      </frames>
+      <wordtags xmlns="http://www.clt-st.de/salsa/wordtags">
+      </wordtags>
+      <flags>
+      </flags>
+      <annotation>
+        <edgelabel>
+        </edgelabel>
+        <secedgelabel>
+        </secedgelabel>
+      </annotation>
+    </head>
+    <body>
+ENDOFHEADER
+    header
+  end
+  ###
+  # get footer of SALSATigerXML files (as string)
+  def self.get_footer
+    footer = <<ENDOFFOOTER
+        </body>
+</corpus>
+ENDOFFOOTER
+    footer
+  end
+  def self.escape(string)
+    @replacements.each do |unescaped, escaped|
+      string.gsub!(unescaped, escaped)
+    end
+    string
+  end
+  def self.unescape(string)
+    # reverse replacements to replace &amp last
+    @replacements.reverse_each do |unescaped, escaped|
+      string.gsub!(escaped, unescaped)
+    end
+    string
+  end
+end
+end

data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb ADDED

@@ -0,0 +1,161 @@
+require_relative 'xml_node'
+require_relative 'string_terminals_in_right_order'
+module STXML
+#############
+# class SalsaTigerXmlNode
+#
+# additional methods:
+#
+# is_terminal?    true if this is a Tiger XML terminal node
+#
+# is_nonterminal? true if this is a Tiger XML nonterminal node
+#
+# is_splitword?   true if this is a splitword part
+#
+# is_syntactic?   true for terminal, nonterminal, splitword
+#
+# is_frame?       true if this is a Salsa/Tiger XML frame
+#
+# is_target?      true if this is a Salsa/Tiger XML frame target
+#
+# is_fe?          true if this is a Salsa/Tiger XML frame element
+#
+# is_outside_sentence? returns false -- this node is not a placeholder for
+#                 a node that is outside the current sentence
+#                 (but see descendant class TSSynNode)
+#
+# yield_nodes     returns the list of descendants thatare leaves of the tree
+#                 NOTE: this overwrites the Graph.yield_nodes method
+#                 since we have to treat splitwords in a special way
+#                 empty array if no yield nodes are present
+#
+# yield_nodes_ordered returns those descendants ordered by precedence
+#                 in the sentence, i.e. their node IDs.
+#
+# sid             returns the sentence ID of this node
+#
+# to_s            returns the yield of this node as a string of space-separated words
+#                 words ordered left to right
+#
+class SalsaTigerXmlNode < XMLNode
+  include StringTerminalsInRightOrder
+  ###
+  # extracting the ID from a RegXML element
+  # depends on whether it has an ID or an IDref
+  #
+  # returns: a string, the ID, or nil if none was found
+  def self.xmlel_id(xml_obj) # RegXML object
+    case xml_obj.name
+    when "edge", "fenode", "uspitem", "splitword", "other_edge"
+      # contains ID ref
+      xml_obj.attributes["idref"]
+    when "part"
+      #  contains ID
+      xml_obj.attributes["id"]
+    else
+      # something else
+      # default: ID is in attribute "id"
+      xml_obj.attributes["id"]
+    end
+  end
+  ###
+  # RegXML object or text
+  def initialize(xml)
+    if xml.text?
+      # text
+      super(xml, nil, nil, true)
+    else
+      # xml element
+      super(xml.name, xml.attributes, SalsaTigerXmlNode.xmlel_id(xml), false)
+    end
+  end
+  ###
+  def is_terminal?
+    get_f("name") == "t"
+  end
+  ###
+  def is_nonterminal?
+    get_f("name") == "nt"
+  end
+  ###
+  def is_splitword?
+    get_f("name") == "part"
+  end
+  ###
+  def is_syntactic?
+    is_terminal? || is_nonterminal? || is_splitword?
+  end
+  ###
+  def is_frame?
+    get_f("name") == "frame"
+  end
+  ###
+  def is_target?
+    get_f("name") == "target"
+  end
+  ###
+  def is_fe?
+    get_f("name") == "fe"
+  end
+  ###
+  def sid
+    # my node ID starts out with the sentence ID
+    id =~ /^(.*?)_/
+    return $1
+  end
+  ###
+  def is_outside_sentence?
+    false
+  end
+  ###
+  def yield_nodes
+    # special consideration: splitwords do not count as children!
+    if children.reject { |c| c.is_splitword? }.empty?
+      return [self]
+    end
+    arr = []
+    children.reject { |c| c.is_splitword? }.each { |c|
+      if c.children.reject(&:is_splitword?).empty?
+        arr << c
+      else
+        arr.concat c.yield_nodes
+      end
+    }
+    arr
+  end
+  ###
+  def yield_nodes_ordered # legacy name
+    # sort_terminals_and_splitwords_... cannot deal with nonterminals
+    # so remove and attach to the end of the chain
+    t, nt  = yield_nodes.distribute { |x| x.is_terminal? || x.is_splitword? }
+    return sort_terminals_and_splitwords_left_to_right(t).concat(nt)
+  end
+  ###
+  # name parallel to the method of SalsaTigerSentence
+  def terminals_sorted
+    return yield_nodes_ordered
+  end
+  ###
+  def to_s
+    string_for_node(self)
+  end
+end
+end

data/lib/salsa_tiger_xml/sem_node.rb ADDED

@@ -0,0 +1,58 @@
+require_relative 'salsa_tiger_xml_node'
+module STXML
+#############
+# class SemNode
+#
+# common superclass for FrameNode and FeNode,
+# with methods that are the same for both:
+#
+#
+# is_usp?   returns true if the frame/FE is involved in underspecification,
+#           else false
+#
+# flags     returns an array of all the frame/FE flags for this node.
+#           members of the array are strings describing the flags
+#           that have been set to true
+#
+# add_flag  add or remove a frame/FE flag
+# remove_flag
+class SemNode < SalsaTigerXmlNode
+  attr_reader :flags
+  def initialize(xml) # RegXML object or text
+    super(xml)
+    # flags: array of FlagNode objects
+    @flags = []
+  end
+  ###
+  def is_usp?
+    get_attribute("usp") == "yes"
+  end
+  ###
+  def add_flag(name) # string: flag name
+    @flags << name
+  end
+  ###
+  def remove_flag(name) # string: flag name
+    @flags.delete(name)
+  end
+  #############
+  protected
+  def get_xml_embedded
+    super() + get_xml_offlags
+  end
+  def get_xml_offlags
+    # and add flags
+    @flags.map do |flagname|
+      "<flag name=\'#{xml_secure_val(flagname)}\'/>\n"
+    end.join
+  end
+end
+end

data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb ADDED

@@ -0,0 +1,192 @@
+module STXML
+#########
+# module StringTerminalsInRightOrder
+#
+# returns the yield of a node, or a list of nodes, as a string
+# of " "-separated words
+#
+# Words are put into the right order, left to right,
+# under the assumption that their node IDs reflect that order
+#
+# Terminal nodes are assumed to have IDs ending in a number,
+# numbered from left to right
+#
+# Splitword nodes are assumed to have IDs ending in N_sM
+# for numbers N and M, where N orders terminals left to right
+# and M orders the splitword parts left to right
+#
+# If the yield of the node/the list of nodes contains all splitwords of a terminal,
+# the whole terminal is taken instead
+#
+# methods:
+#
+# string_for_node  returns the string for the yield of a node
+#     node: a node object
+#
+# string_for_nodes returns the string for the yield of a list of nodes
+#     nodes: a list of node objects
+module StringTerminalsInRightOrder
+  def string_for_node(node)
+    string_for_nodes([node])
+  end
+  def string_for_nodes(nodes)
+    a = right_level_terminals_for_nodes(nodes)
+    a = sort_terminals_and_splitwords_left_to_right(a)
+    return node_array_to_string(a)
+  end
+  #####
+  private
+  # right_level_terminals_for_nodes:
+  # - compute the yield for each element of 'nodes'
+  # - then consider all splitwords in the yield:
+  #   if all splitwords of a terminal are in the yield,
+  #   then use the terminal rather than its splitwords
+  def right_level_terminals_for_nodes(nodes)
+    a = nodes.map { |n| n.yield_nodes}.flatten
+    b = []
+    a.each { |n|
+      if n.is_splitword?
+        # see if a contains all parts of this splitword
+        # if so, take into b the splitword's parent, the terminal,
+        # rather than the individual splitwords
+        if n.parent.nil?
+          # splitword without a parent
+          b << n
+        elsif b.include? n.parent or a.include? n.parent
+          # did we already include the splitword's parent in b?
+          # then we're done
+        else
+          # check if all children of n.parent are in 'a'
+          all_in = true
+          n.parent.each_child { |nsibling|
+            unless a.include? nsibling
+              all_in = false
+              break
+            end
+          }
+          if all_in
+            # yes, all children of n.parent are in 'a'
+            b << n.parent
+          else
+            # no, some sibling of n is not in 'a'
+            b << n
+          end
+        end
+      elsif n.is_terminal?
+        # n is a terminal
+        b << n
+        # if n is anything but a splitword or a terminal,
+        # ignore it
+      end
+    }
+    return b.uniq
+  end
+  # sort_terminals_and_splitwords_left_to_right:
+  # take an array of nodes that consists of terminals and splitwords
+  # and sort them using the following comparison:
+  # - when comparing two terminals, use the
+  #   last numbers in their respective IDs
+  # - when comparing two splitwords, their IDs end in _N_sM
+  #   for numbers N and M.
+  #   If they coincide in N, compare them by M,
+  #   else compare them by M
+  # - when comparing a terminal and a splitword,
+  #   compare the terminal's last number to the splitword's N
+  def sort_terminals_and_splitwords_left_to_right(nodes)
+    nodes.sort { |a, b|
+      if a.is_splitword? and b.is_splitword?
+        compare_splitwords(a, b)
+      elsif a.is_terminal? and b.is_terminal?
+        compare_terminals(a, b)
+      else
+        compare_mixed(a, b)
+      end
+    }
+  end
+  # node_array_to_string:
+  # 'nodes' is an array of node objects, each of which offer a "word" method
+  # string their words together separated by " "
+  def node_array_to_string(nodes)
+    s = ""
+    nodes.each { |n|
+      s = s + n.word + " "
+    }
+    return s
+  end
+  # - when comparing two terminals, use the
+  #   last numbers in their respective IDs
+  def compare_terminals(a, b)
+    last_i(a) <=> last_i(b)
+  end
+  # - when comparing two splitwords, their IDs end in _N_sM
+  #   for numbers N and M.
+  #   If they coincide in N, compare them by M,
+  #   else compare them by M
+  def compare_splitwords(a, b)
+    if splitword_terminal_i(a) == splitword_terminal_i(b)
+      # parts of same terminal?
+      # compare parts
+      last_i(a) <=> last_i(b)
+    else
+      # not parts of same terminal?
+      # compare terminals
+      splitword_terminal_i(a) <=> splitword_terminal_i(b)
+    end
+  end
+  # - when comparing a terminal and a splitword,
+  #   compare the terminal's last number to the splitword's N
+  def compare_mixed(a, b)
+    if a.is_splitword? and b.is_terminal?
+      splitword_terminal_i(a) <=> last_i(b)
+    elsif a.is_terminal? and b.is_splitword?
+       last_i(a) <=> splitword_terminal_i(b)
+    else
+      # not one terminal, one splitword?
+      # then what?
+      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
+      $stderr.print a.id, ",  ", b.id, "\n"
+    end
+  end
+  # return last number of the ID of a node
+  def last_i(n)
+    n.id =~ /(\d+)$/ # match final string of digits
+    if $1.nil? # if shouldn't happen _in principle_
+               # but we might get weird node IDs for splitwords;
+               # so we act gracefully and catch the case where there
+               # is one final letter behind the digits
+      n.id =~ /(\d+)\w$/
+    end
+    if $1.nil? # this shouldn't ever happen
+      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
+      $stderr.print n.id, "\n"
+      exit 1
+    end
+    return $1.to_i       # and return it as number
+  end
+  # assume the ID of the node includes N_sM
+  # return N
+  def splitword_terminal_i(n)
+    n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
+    if $1.nil? # this shouldn't ever happen
+      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
+      $stderr.print n.id, "\n"
+      exit 1
+    end
+    return $1.to_i       # and return it as number
+  end
+end
+end