RubyGems - frprep - Versions diffs - 0.0.1.prealpha - Mend

frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

data/.yardopts +8 -0
data/CHANGELOG.rdoc +0 -0
data/LICENSE.rdoc +0 -0
data/README.rdoc +0 -0
data/lib/common/AbstractSynInterface.rb +1227 -0
data/lib/common/BerkeleyInterface.rb +375 -0
data/lib/common/CollinsInterface.rb +1165 -0
data/lib/common/ConfigData.rb +694 -0
data/lib/common/Counter.rb +18 -0
data/lib/common/DBInterface.rb +48 -0
data/lib/common/EnduserMode.rb +27 -0
data/lib/common/Eval.rb +480 -0
data/lib/common/FixSynSemMapping.rb +196 -0
data/lib/common/FrPrepConfigData.rb +66 -0
data/lib/common/FrprepHelper.rb +1324 -0
data/lib/common/Graph.rb +345 -0
data/lib/common/ISO-8859-1.rb +24 -0
data/lib/common/ML.rb +186 -0
data/lib/common/Maxent.rb +215 -0
data/lib/common/MiniparInterface.rb +1388 -0
data/lib/common/Optimise.rb +195 -0
data/lib/common/Parser.rb +213 -0
data/lib/common/RegXML.rb +269 -0
data/lib/common/RosyConventions.rb +171 -0
data/lib/common/SQLQuery.rb +243 -0
data/lib/common/STXmlTerminalOrder.rb +194 -0
data/lib/common/SalsaTigerRegXML.rb +2347 -0
data/lib/common/SalsaTigerXMLHelper.rb +99 -0
data/lib/common/SleepyInterface.rb +384 -0
data/lib/common/SynInterfaces.rb +275 -0
data/lib/common/TabFormat.rb +720 -0
data/lib/common/Tiger.rb +1448 -0
data/lib/common/TntInterface.rb +44 -0
data/lib/common/Tree.rb +61 -0
data/lib/common/TreetaggerInterface.rb +303 -0
data/lib/common/headz.rb +338 -0
data/lib/common/option_parser.rb +13 -0
data/lib/common/ruby_class_extensions.rb +310 -0
data/lib/fred/Baseline.rb +150 -0
data/lib/fred/FileZipped.rb +31 -0
data/lib/fred/FredBOWContext.rb +863 -0
data/lib/fred/FredConfigData.rb +182 -0
data/lib/fred/FredConventions.rb +232 -0
data/lib/fred/FredDetermineTargets.rb +324 -0
data/lib/fred/FredEval.rb +312 -0
data/lib/fred/FredFeatureExtractors.rb +321 -0
data/lib/fred/FredFeatures.rb +1061 -0
data/lib/fred/FredFeaturize.rb +596 -0
data/lib/fred/FredNumTrainingSenses.rb +27 -0
data/lib/fred/FredParameters.rb +402 -0
data/lib/fred/FredSplit.rb +84 -0
data/lib/fred/FredSplitPkg.rb +180 -0
data/lib/fred/FredTest.rb +607 -0
data/lib/fred/FredTrain.rb +144 -0
data/lib/fred/PlotAndREval.rb +480 -0
data/lib/fred/fred.rb +45 -0
data/lib/fred/md5.rb +23 -0
data/lib/fred/opt_parser.rb +250 -0
data/lib/frprep/AbstractSynInterface.rb +1227 -0
data/lib/frprep/Ampersand.rb +37 -0
data/lib/frprep/BerkeleyInterface.rb +375 -0
data/lib/frprep/CollinsInterface.rb +1165 -0
data/lib/frprep/ConfigData.rb +694 -0
data/lib/frprep/Counter.rb +18 -0
data/lib/frprep/FNCorpusXML.rb +643 -0
data/lib/frprep/FNDatabase.rb +144 -0
data/lib/frprep/FixSynSemMapping.rb +196 -0
data/lib/frprep/FrPrepConfigData.rb +66 -0
data/lib/frprep/FrameXML.rb +513 -0
data/lib/frprep/FrprepHelper.rb +1324 -0
data/lib/frprep/Graph.rb +345 -0
data/lib/frprep/ISO-8859-1.rb +24 -0
data/lib/frprep/MiniparInterface.rb +1388 -0
data/lib/frprep/Parser.rb +213 -0
data/lib/frprep/RegXML.rb +269 -0
data/lib/frprep/STXmlTerminalOrder.rb +194 -0
data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
data/lib/frprep/SleepyInterface.rb +384 -0
data/lib/frprep/SynInterfaces.rb +275 -0
data/lib/frprep/TabFormat.rb +720 -0
data/lib/frprep/Tiger.rb +1448 -0
data/lib/frprep/TntInterface.rb +44 -0
data/lib/frprep/Tree.rb +61 -0
data/lib/frprep/TreetaggerInterface.rb +303 -0
data/lib/frprep/do_parses.rb +142 -0
data/lib/frprep/frprep.rb +686 -0
data/lib/frprep/headz.rb +338 -0
data/lib/frprep/one_parsed_file.rb +28 -0
data/lib/frprep/opt_parser.rb +94 -0
data/lib/frprep/ruby_class_extensions.rb +310 -0
data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
data/lib/rosy/DBMySQL.rb +146 -0
data/lib/rosy/DBSQLite.rb +280 -0
data/lib/rosy/DBTable.rb +239 -0
data/lib/rosy/DBWrapper.rb +176 -0
data/lib/rosy/ExternalConfigData.rb +58 -0
data/lib/rosy/FailedParses.rb +130 -0
data/lib/rosy/FeatureInfo.rb +242 -0
data/lib/rosy/GfInduce.rb +1115 -0
data/lib/rosy/GfInduceFeature.rb +148 -0
data/lib/rosy/InputData.rb +294 -0
data/lib/rosy/RosyConfigData.rb +115 -0
data/lib/rosy/RosyConfusability.rb +338 -0
data/lib/rosy/RosyEval.rb +465 -0
data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
data/lib/rosy/RosyFeaturize.rb +280 -0
data/lib/rosy/RosyInspect.rb +336 -0
data/lib/rosy/RosyIterator.rb +477 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
data/lib/rosy/RosyPruning.rb +165 -0
data/lib/rosy/RosyServices.rb +744 -0
data/lib/rosy/RosySplit.rb +232 -0
data/lib/rosy/RosyTask.rb +19 -0
data/lib/rosy/RosyTest.rb +826 -0
data/lib/rosy/RosyTrain.rb +232 -0
data/lib/rosy/RosyTrainingTestTable.rb +786 -0
data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
data/lib/rosy/View.rb +418 -0
data/lib/rosy/opt_parser.rb +379 -0
data/lib/rosy/rosy.rb +77 -0
data/lib/shalmaneser/version.rb +3 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +40 -0
data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +52 -0
data/test/functional/test_rosy.rb +20 -0
metadata +270 -0

data/lib/frprep/Parser.rb ADDED

@@ -0,0 +1,213 @@
+# Alexander Koller 2003
+# extended Katrin Erk June 2003
+#
+# Classes that return a list of sentence DOMs, from various sources
+#
+# Each class in this file defines the following methods:
+#
+#   initialize(...)     "..." depends on the class
+#   extractDOMs()       return list of all s nodes as DOM objects
+#   each_s()            iterate over s nodes; may take less memory
+require "rexml/document"
# Parses a whole XML corpus file into a single REXML document and gives
# access to the sentence nodes found at /corpus/body/s.
#
# The file is opened in the constructor (so a missing file is reported
# immediately), parsed lazily on first access, and closed as soon as the
# document has been built.
class FileParser
  include REXML

  # filename: path of the corpus file to read.
  # Raises whatever File.new raises if the file cannot be opened.
  def initialize(filename)
    @file = File.new(filename)
    @doc = nil
  end

  # Releases the underlying file handle. Calling it more than once, or
  # after the document has already been parsed, is harmless.
  def close()
    @file.close unless @file.closed?
  end

  # returns an array of DOMs for the sentences
  def extractDOMs()
    ensureParsedDocument()
    @doc.get_elements("/corpus/body/s")
  end

  # Iterates over all sentence nodes. This may be more memory
  # efficient than using extractDOMs(), but isn't in this case.
  def each_s()
    extractDOMs().each { |dom| yield(dom) }
  end

  # Iterates over all sentence nodes. The block passed to this
  # method should return a DOM object as a value. After the iteration
  # has been completed, the contents of /corpus/body are replaced
  # by the list of these results.
  # This modifies the document held by this FileParser object (no clone
  # is made) and returns that document.
  def process_s!()
    newBody = Element.new('body')
    each_s { |dom| newBody.add_element( yield(dom) ) }
    @doc.delete_element("/corpus/body")
    @doc.elements["corpus"].add_element(newBody)
    return @doc
  end

  private

  # Builds @doc from the file on first use. Document.new parses its
  # source completely, so the handle is no longer needed afterwards and
  # is closed here, even if parsing raises.
  def ensureParsedDocument()
    return unless @doc.nil?
    begin
      @doc = Document.new(@file)
    ensure
      @file.close unless @file.closed?
    end
  end
end
+#####################################################################
# Reads an XML corpus file piece by piece instead of building one DOM
# for the whole file. Sentences are the <s ...> ... </s> elements.
#
# @file = File object for the corpus
# @head = string up to the first <s ...> tag
# @tail = string after the last </s> tag (nil until the end of the
#         file has been reached)
# @rest = string starting with the latest <s ...> tag (completed to a
#         <s>...</s> structure by reading up to the next </s> tag)
# @readCompletely = boolean specifying whether the whole file has
#         been consumed
class FilePartsParser
  # Start of a sentence element: "<s" followed by whitespace.
  S_START = /<s\s/
  # End of a sentence element.
  S_END = /<\/s>/
  # Closing tag of the corpus body; seen before any sentence in an
  # empty corpus.
  BODY_END = /<\/body[\s>]/

  attr_reader :head, :tail

  # filename: path of the corpus file.
  # Reads up to the FIRST sentence start tag: everything before it goes
  # into @head, the remainder of that line into @rest. If the body is
  # closed before any sentence is seen, the remainder of the file
  # becomes @tail and the parser is marked as completely read.
  def initialize(filename)
    @file = File.new(filename)
    @readCompletely = false
    @head = ''.dup
    @tail = nil
    @rest = nil
    begin
      while true do
        line = @file.readline()
        if (start_ix = line.index(S_START))
          # first (not last) "<s " on the line, so that several
          # sentences on one line are all kept out of the head
          @head << line[0, start_ix]
          @rest = line[start_ix..-1]
          break
        elsif (body_ix = line.index(BODY_END))
          # empty corpus
          @head << line[0, body_ix]
          @tail = line[body_ix..-1] << @file.read
          @readCompletely = true
          break
        else
          @head << line
        end
      end
    rescue EOFError
      # no sentence and no </body> anywhere in the file
      @readCompletely = true
    end
  end

  # Closes the corpus file.
  def close()
    @file.close()
  end

  # Returns an array with the DOM of every sentence not yet consumed.
  def extractDOMs()
    allDOMs = Array.new
    process_s!() { |dom|
      allDOMs.push(dom)
      REXML::Element.new("x")
    }
    return allDOMs
  end

  # Yields the DOM of every sentence not yet consumed.
  def each_s()
    process_s!() { |dom|
      yield(dom)
      REXML::Element.new("x")
    }
  end

  # Parses each remaining sentence, yields its root element, and
  # expects the block to return a (possibly changed) element. Returns
  # the concatenated string form of the returned elements, or nil if
  # the file had already been read completely.
  # Sentences are consumed from the file as they are processed, so a
  # second call finds nothing left; only one sentence is held as a DOM
  # at a time.
  def process_s!()
    if @readCompletely
      return
    end
    ret = ''.dup
    scan_s() { |element|
      # Process the <s> ... </s> element
      elt = REXML::Document.new(element).root
      changedElt = yield(elt)
      changedEltAsString = ''.dup
      changedElt.write(changedEltAsString, 0)
      ret << changedEltAsString
    }
    return ret
  end

  # KE 12.6.03: scan_s :
  # doesn't parse a sentence before yielding it
  # doesn't allow for any changes
  # but otherwise the same as process_s!
  # Yields each remaining sentence as a string. Text between two
  # sentences is dropped. When the end of the file is reached, whatever
  # followed the last sentence is stored in @tail.
  def scan_s()
    if @readCompletely
      return
    end
    begin
      while true do
        # Invariant: At this point, @rest always starts with an
        # unseen <s ...> tag.
        # First, we continue reading until we find the FIRST closing
        # </s>. No exception should occur in this loop if we're
        # parsing a valid XML document.
        until (closing = S_END.match(@rest))
          @rest << @file.readline()
        end
        element = closing.pre_match + closing[0]
        @rest = closing.post_match
        yield(element) # element not parsed!
        # Read on up to the next <s ...>
        until (start_ix = @rest.index(S_START))
          @rest << @file.readline()
        end
        @rest = @rest[start_ix..-1]
      end
    rescue EOFError
      @tail = @rest
      @readCompletely = true
    end
  end

  # KE 5.11.03: get_rest: read all of the file not processed up to this point
  # and return it as a string (nil if no sentence start was ever seen)
  def get_rest()
    remainder = @file.read
    @rest << remainder if @rest
    @readCompletely = true
    return @rest
  end
end

data/lib/frprep/RegXML.rb ADDED

@@ -0,0 +1,269 @@
+# RegXML
+#
+# Katrin Erk June 2005
+# SalsaTigerRegXML: take control of the data structure, no underlying xml
+# representation anymore, re-generation of xml on demand
# A single XML element (or a piece of text) kept as a plain string and
# taken apart with regular expressions on demand: name(), attributes()
# and children_and_text() each re-read the stored string.
#
# Limits visible in the patterns below: element and attribute names
# consist of word characters and "-", attribute values are quoted with
# ' or ", and "<" / ">" may only occur as tag delimiters.
class RegXML
  # string: string representing a single XML element, or text
  # i_am_text: boolean: xml element (false) or text (true)
  #
  # For an element, newlines are replaced by spaces and the string is
  # checked for being one well-bracketed element. Raises RuntimeError
  # if string is not a String or fails these checks.
  def initialize(string, # string representing a single XML element
                 i_am_text = false) # boolean: xml element (false) or text (true)
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end
    if i_am_text
      @s = string
      @i_am_text = true
    else
      @s = string.gsub(/\n/,  " ").freeze
      @i_am_text = false
      element_test()
      dyck_test()
    end
  end

  # Returns the stored string with a newline inserted after every ">".
  def to_s()
    return xml_readable(@s)
  end

  # true if this object represents text rather than an element
  def text?
    return @i_am_text
  end

  # Returns the element name, or nil for text.
  # Raises RuntimeError if no name can be found.
  def name()
    if @i_am_text
      # text
      return nil
    else
      # xml element
      if @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
        return $1
      else
        raise "Cannot parse:\n#{xml_readable(@s)}"
      end
    end
  end

  # Returns a hash attribute name -> attribute value (without the
  # surrounding quotes); empty for text.
  # Raises RuntimeError if the start tag cannot be taken apart.
  def attributes()
    if @i_am_text
      # text
      return {}
    else
      #  xml element
      # remove <element_name  from the beginning of @s,
      # place the rest into elt_contents:
      # this is a string starting with
      # - either (name=value)*>
      # - or     (name=value)*/>
      unless @s =~ /^\s*<\s*#{name()}(.*)$/
        raise "Cannot parse:\n #{xml_readable(@s)}"
      end
      retv = Hash.new
      elt_contents = $1
      # repeat until only > or /> is left at the front
      while elt_contents !~ /^\s*\/?>/
        # shave off the next name=value pair
        # put the rest into elt_contents
        # make sure that if the value is quoted with ',
        # we accept " inside the value, and vice versa.
        unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
          raise "Cannot parse:\n #{xml_readable(elt_contents)}"
        end
        retv[$1] = $3
        elt_contents = $4
      end
      return retv
    end
  end

  # Returns the direct children of this element, in document order, as
  # an array of RegXML objects: one object per child element and one
  # (text? == true) per stretch of text between them. Returns [] for
  # text and for unary elements (<bla/>).
  # Child elements may contain elements of the same name.
  # Raises RuntimeError if the content cannot be taken apart.
  def children_and_text()
    if @i_am_text
      return []
    end
    if unary_element()
      # <bla/>, no children
      return []
    end

    # @s has the form <bla...>  ... </bla>.
    # remove <bla ...>  from the beginning of @s,
    # place the rest up to </bla> into children_s:
    mainname = name()
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end
    retv = Array.new
    children_s = $3

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/
      # shave off the next bit of text
      # put the rest into children_s
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text, after_text = $1, $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      if children_s =~ /^\s*$/
        break
      end

      # shave off the next child
      # and put the rest into children_s

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      childname = $2
      child = $1
      endofelt_ix = $&.length()
      after_start = children_s[endofelt_ix..-1]

      # and remove it
      if after_start =~ /^\/>(.*)$/
        # next child is a unary element
        children_s = $1
        retv << RegXML.new(child + "/>")
      elsif after_start =~ /^>/ and
          (close_ix = matching_close_index(after_start, childname))
        # next child runs up to its own (not merely the first) end tag
        children_s = after_start[close_ix..-1]
        retv << RegXML.new(child + after_start[0, close_ix])
      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end
    return retv
  end

  # Small demonstration: prints name, string form, attributes and
  # children of two sample elements to standard output.
  def RegXML.test()
    show = lambda { |elt|
      puts "name " + elt.name()
      puts
      puts elt.to_s()
      puts
      elt.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      elt.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    }

    show.call(RegXML.new("  <bla blupp='a\"b'
lalala=\"c\">
  <lalala> </lalala>
  texttext
  <lala blupp='b'/>
  nochtext
  <la> <l/> </la>
</ bla >
"))
    puts "NEU"
    show.call(RegXML.new("  < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # true if @s is one single unary tag <bla .../>, i.e. no further
  # angle bracket occurs between the opening < and the closing />
  def unary_element()
    # <bla/>
    if @s =~ /^\s*<[^<>]*\/>\s*$/
      return true
    else
      return false
    end
  end

  # make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises RuntimeError otherwise
  def element_test()
    if unary_element()
      # <bla/>
    elsif @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/
      # <bla  > ... </bla>
    else
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end
  end

  # Bracket check; raises RuntimeError on failure.
  def dyck_test()
    # every prefix of @s must have at least as many < as >
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      case bracket
      when "<"
        opening += 1
      when ">"
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    # and in total, @s must have equally many < and >
    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # str: the part of a child element that follows its start tag's
  #      attributes, beginning with the ">" that ends the start tag
  # tagname: name of that child element
  # Returns the index in str just behind the end tag that closes the
  # child, counting nested non-unary start tags of the same name, or
  # nil if there is no such end tag.
  def matching_close_index(str, tagname)
    # any start, end or unary tag whose name is exactly tagname
    same_name_tag = /<\s*(\/?)\s*#{Regexp.escape(tagname)}(?![\w-])([^<>]*)>/
    depth = 1
    pos = 0
    while (tag = same_name_tag.match(str, pos))
      pos = tag.end(0)
      if tag[1] == "/"
        depth -= 1
        return pos if depth == 0
      elsif tag[2] !~ /\/\s*\z/
        # a start tag that is not unary opens one more level
        depth += 1
      end
    end
    return nil
  end

  # Inserts a newline after every ">" to make long XML strings legible.
  def xml_readable(string)
    return string.gsub(/>/, ">\n")
  end
end
+# RegXML.test()