RubyGems - shalmaneser - Versions diffs - 1.2.0.rc4 → 1.2.rc5 - Mend

shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115) hide show

checksums.yaml +4 -4
data/README.md +47 -18
data/bin/shalmaneser +8 -2
data/doc/index.md +1 -0
data/lib/shalmaneser/opt_parser.rb +68 -67
metadata +49 -119
data/bin/fred +0 -16
data/bin/frprep +0 -34
data/bin/rosy +0 -17
data/lib/common/AbstractSynInterface.rb +0 -1229
data/lib/common/Counter.rb +0 -18
data/lib/common/EnduserMode.rb +0 -27
data/lib/common/Eval.rb +0 -480
data/lib/common/FixSynSemMapping.rb +0 -196
data/lib/common/Graph.rb +0 -345
data/lib/common/ISO-8859-1.rb +0 -24
data/lib/common/ML.rb +0 -186
data/lib/common/Mallet.rb +0 -236
data/lib/common/Maxent.rb +0 -229
data/lib/common/Optimise.rb +0 -195
data/lib/common/Parser.rb +0 -213
data/lib/common/RegXML.rb +0 -269
data/lib/common/RosyConventions.rb +0 -171
data/lib/common/STXmlTerminalOrder.rb +0 -194
data/lib/common/SalsaTigerRegXML.rb +0 -2347
data/lib/common/SalsaTigerXMLHelper.rb +0 -99
data/lib/common/SynInterfaces.rb +0 -282
data/lib/common/TabFormat.rb +0 -721
data/lib/common/Tiger.rb +0 -1448
data/lib/common/Timbl.rb +0 -144
data/lib/common/Tree.rb +0 -61
data/lib/common/config_data.rb +0 -470
data/lib/common/config_format_element.rb +0 -220
data/lib/common/headz.rb +0 -338
data/lib/common/option_parser.rb +0 -13
data/lib/common/prep_config_data.rb +0 -62
data/lib/common/prep_helper.rb +0 -1330
data/lib/common/ruby_class_extensions.rb +0 -310
data/lib/db/db_interface.rb +0 -48
data/lib/db/db_mysql.rb +0 -145
data/lib/db/db_sqlite.rb +0 -280
data/lib/db/db_table.rb +0 -239
data/lib/db/db_wrapper.rb +0 -176
data/lib/db/sql_query.rb +0 -243
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/fred/Baseline.rb +0 -150
data/lib/fred/FileZipped.rb +0 -31
data/lib/fred/FredBOWContext.rb +0 -877
data/lib/fred/FredConventions.rb +0 -232
data/lib/fred/FredDetermineTargets.rb +0 -319
data/lib/fred/FredEval.rb +0 -312
data/lib/fred/FredFeatureExtractors.rb +0 -322
data/lib/fred/FredFeatures.rb +0 -1061
data/lib/fred/FredFeaturize.rb +0 -602
data/lib/fred/FredNumTrainingSenses.rb +0 -27
data/lib/fred/FredParameters.rb +0 -402
data/lib/fred/FredSplit.rb +0 -84
data/lib/fred/FredSplitPkg.rb +0 -180
data/lib/fred/FredTest.rb +0 -606
data/lib/fred/FredTrain.rb +0 -144
data/lib/fred/PlotAndREval.rb +0 -480
data/lib/fred/fred.rb +0 -47
data/lib/fred/fred_config_data.rb +0 -185
data/lib/fred/md5.rb +0 -23
data/lib/fred/opt_parser.rb +0 -250
data/lib/frprep/Ampersand.rb +0 -39
data/lib/frprep/CollinsInterface.rb +0 -1165
data/lib/frprep/Counter.rb +0 -18
data/lib/frprep/FNCorpusXML.rb +0 -643
data/lib/frprep/FNDatabase.rb +0 -144
data/lib/frprep/FrameXML.rb +0 -513
data/lib/frprep/Graph.rb +0 -345
data/lib/frprep/MiniparInterface.rb +0 -1388
data/lib/frprep/RegXML.rb +0 -269
data/lib/frprep/STXmlTerminalOrder.rb +0 -194
data/lib/frprep/SleepyInterface.rb +0 -384
data/lib/frprep/TntInterface.rb +0 -44
data/lib/frprep/TreetaggerInterface.rb +0 -327
data/lib/frprep/do_parses.rb +0 -143
data/lib/frprep/frprep.rb +0 -693
data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
data/lib/frprep/interfaces/stanford_interface.rb +0 -353
data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
data/lib/frprep/one_parsed_file.rb +0 -28
data/lib/frprep/opt_parser.rb +0 -94
data/lib/frprep/ruby_class_extensions.rb +0 -310
data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
data/lib/rosy/ExternalConfigData.rb +0 -58
data/lib/rosy/FailedParses.rb +0 -130
data/lib/rosy/FeatureInfo.rb +0 -242
data/lib/rosy/GfInduce.rb +0 -1115
data/lib/rosy/GfInduceFeature.rb +0 -148
data/lib/rosy/InputData.rb +0 -294
data/lib/rosy/RosyConfusability.rb +0 -338
data/lib/rosy/RosyEval.rb +0 -465
data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
data/lib/rosy/RosyFeaturize.rb +0 -281
data/lib/rosy/RosyInspect.rb +0 -336
data/lib/rosy/RosyIterator.rb +0 -478
data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
data/lib/rosy/RosyPruning.rb +0 -165
data/lib/rosy/RosyServices.rb +0 -744
data/lib/rosy/RosySplit.rb +0 -232
data/lib/rosy/RosyTask.rb +0 -19
data/lib/rosy/RosyTest.rb +0 -829
data/lib/rosy/RosyTrain.rb +0 -234
data/lib/rosy/RosyTrainingTestTable.rb +0 -787
data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
data/lib/rosy/View.rb +0 -418
data/lib/rosy/opt_parser.rb +0 -379
data/lib/rosy/rosy.rb +0 -78
data/lib/rosy/rosy_config_data.rb +0 -121
data/lib/shalmaneser/version.rb +0 -3

@@ -1,195 +0,0 @@
-# sp 29 07 04
-# "optimise" c4.5 files by replacing all feature values which only
-# occur with one label by a new, common value.
-#
-# two modes of operation:
-# optimise <file>                -- optimise file and store optimisations in <file>.opts
-# optimise <file> <file.opts>    -- apply optimisation from file.opts to file
-# Rewrites comma-separated feature files (one instance per line, the label
-# in the last column): every feature value that has only been seen together
-# with a single label is replaced by the shared value "f<column>_<label>".
-#
-# Either learn the replacements with #init_from_data or load stored ones
-# with #init_from_file; afterwards #store and #apply may be called.
-class Optimise
-  # Return recommended filename to store optimisation patterns for basefile.
-  def self.recommended_filename(basefile)
-    basefile + ".optimisations"
-  end
-  def initialize
-    @ready = false
-  end
-  # Find a new optimisation from the data file at path +infile+.
-  # Blank lines are skipped, and an empty file yields an optimisation
-  # without any replacements.
-  # Side effect kept from the original code: for the feature value "[U]" a
-  # "[U]: <label> <new value>" line is printed to stdout.
-  def init_from_data(infile)
-    STDERR.puts "[Optimise] computing new feature optimisation"
-    labels = []
-    features = [] # for each column: array of feature values from file
-    @replacements = [] # for each column, store the hash of replacements
-    File.foreach(infile) do |line|
-      f_l = line.chomp.split(",")
-      next if f_l.empty? # blank line: no label, nothing to learn
-      if features.empty?
-        # First data line: create one slot per column. The label column gets
-        # a slot as well (it stays empty) so that @replacements has as many
-        # entries as a line has columns; #apply checks exactly that.
-        f_l.each_index do |i|
-          features[i] = []
-          @replacements[i] = {}
-        end
-      end
-      labels << f_l.pop
-      f_l.each_with_index { |fval, i| features[i] << fval }
-    end
-    features.each_with_index do |fvalues, findex|
-      label_by_value(fvalues, labels).each_pair do |fval, label|
-        # label is false for values seen with more than one label
-        new_fval = "f#{findex}_#{label.delete('.')}" if label
-        # interpolation instead of String#+ : new_fval is nil for ambiguous values
-        puts "[U]: #{label} #{new_fval}" if fval == "[U]"
-        @replacements[findex][fval] = new_fval if label
-      end
-    end
-    @ready = true
-  end
-  # Use an old optimisation: load the replacements written by #store.
-  def init_from_file(optsfile)
-    @replacements = File.open(optsfile) { |optsinfile| read(optsinfile) }
-    @ready = true
-  end
-  # Store the data necessary to recreate the optimisation in +outfilename+.
-  # Format: per column a "<i>" line, one "old<TAB>new" line per replacement,
-  # and a closing "</i>" line.
-  # Raises RuntimeError if the optimisation has not been initialised.
-  def store(outfilename)
-    unless @ready
-      raise "[Optimise] Error: Cannot store un-initialised optimisation"
-    end
-    File.open(outfilename, "w") do |outfile|
-      @replacements.each_with_index do |reps, i|
-        outfile.puts "<#{i}>"
-        reps.each_pair { |old, new| outfile.puts [old, new].join("\t") }
-        outfile.puts "</#{i}>"
-      end
-    end
-  end
-  # Apply the optimisation to the file +infilename+ and write the result to
-  # +outfilename+. The label (last column) is copied unchanged.
-  # Raises RuntimeError if the optimisation has not been initialised or if a
-  # line does not have as many columns as the optimisation knows.
-  def apply(infilename, outfilename)
-    unless @ready
-      raise "[Optimise] Error: Cannot apply un-initialised optimisation"
-    end
-    STDERR.puts "[Optimise] applying feature optimisation"
-    # open the input first so that a missing input file does not create the output
-    File.open(infilename) do |infile|
-      File.open(outfilename, "w") do |outfile|
-        infile.each_line do |line|
-          tokens = line.chomp.split(",")
-          unless tokens.length == @replacements.length
-            raise "[Optimise] Error: trying to optimise incompatible feature file!\nFile has #{tokens.length} features, and we know replacements for #{@replacements.length} features."
-          end
-          label = tokens.pop
-          tokens.each_with_index do |fval, f_idx|
-            if @replacements[f_idx].key?(fval)
-              tokens[f_idx] = @replacements[f_idx][fval]
-            end
-          end
-          tokens.push label
-          outfile.puts tokens.join(",")
-        end
-      end
-    end
-  end
-  private
-  # Parse the format written by #store from the open IO +infile+ and return
-  # the array of replacement hashes. The caller owns (and closes) +infile+.
-  # Raises RuntimeError for entries that are not inside a "<i>" section.
-  def read(infile)
-    replacements = []
-    reps = nil
-    infile.each_line do |line|
-      line = line.chomp
-      next if line.empty?
-      # Markers are anchored: a feature value that merely contains "<3>"
-      # must not be mistaken for a section marker.
-      if line =~ /\A<\d+>\z/
-        reps = {}
-      elsif line =~ /\A<\/(\d+)>\z/
-        if reps.nil?
-          raise "[Optimise] Error: malformed optimisation file, unexpected #{line}"
-        end
-        replacements[Regexp.last_match(1).to_i] = reps
-        reps = nil
-      else
-        if reps.nil?
-          raise "[Optimise] Error: malformed optimisation file, entry outside of a feature section"
-        end
-        old, new = line.split("\t")
-        reps[old] = new
-      end
-    end
-    replacements
-  end
-  # Map every feature value in +fvalues+ to the single label it occurs with,
-  # or to false if it occurs with more than one label. fvalues[i] and
-  # labels[i] belong to the same instance.
-  def label_by_value(fvalues, labels)
-    fval_to_label = {}
-    fvalues.each_with_index do |fval, inst_idx|
-      label = labels[inst_idx]
-      seen_label = fval_to_label[fval]
-      if seen_label.nil?
-        fval_to_label[fval] = label
-      elsif seen_label && seen_label != label
-        fval_to_label[fval] = false
-      end
-    end
-    fval_to_label
-  end
-end

data/lib/common/Parser.rb DELETED

@@ -1,213 +0,0 @@
-# Alexander Koller 2003
-# extended Katrin Erk June 2003
-#
-# Classes that return a list of sentence DOMs, from various sources
-#
-# Each class in this file defines the following methods:
-#
-#   initialize(...)     "..." depends on the class
-#   extractDOMs()       return list of all s nodes as DOM objects
-#   each_s()            iterate over s nodes; may take less memory
-require "rexml/document"
-# Reads a whole corpus file into one REXML document and hands out its
-# /corpus/body/s sentence nodes.
-class FileParser
-  include REXML
-  # Opens +filename+ right away (so a missing file is reported here); the
-  # document itself is only parsed on first access.
-  def initialize(filename)
-    @file = File.new(filename)
-    @doc = nil
-  end
-  # returns an array of DOMs for the sentences
-  def extractDOMs()
-    ensureParsedDocument()
-    @doc.get_elements("/corpus/body/s")
-  end
-  # Iterates over all sentence nodes. This may be more memory
-  # efficient than using extractDOMs(), but isn't in this case.
-  def each_s()
-    extractDOMs().each { |dom| yield(dom) }
-  end
-  # Iterates over all sentence nodes. The block passed to this
-  # method should return a DOM object as a value. After the iteration
-  # has been completed, the contents of /corpus/body are then replaced
-  # by the list of these results.
-  # At the moment, this changes the FileParser object. This should
-  # probably change in the future, but I don't want to mess with
-  # cloning now.
-  # Returns the modified document.
-  def process_s!()
-    newBody = Element.new('body')
-    each_s { |dom| newBody.add_element( yield(dom) ) }
-    @doc.delete_element("/corpus/body")
-    @doc.elements["corpus"].add_element(newBody)
-    return @doc
-  end
-  # Releases the file handle if the document was never parsed.
-  # Safe to call more than once.
-  def close()
-    @file.close unless @file.closed?
-  end
-  private
-  # Parses the document on first use. The file handle is closed as soon as
-  # parsing has finished (or failed): the document is built completely in
-  # Document.new, so the handle is not needed afterwards.
-  def ensureParsedDocument()
-    return if @doc
-    begin
-      @doc = Document.new(@file)
-    ensure
-      @file.close
-    end
-  end
-end
-#####################################################################
-# Reads a corpus file sentence by sentence instead of parsing it as a whole.
-class FilePartsParser
-  # <@file> = File object for the corpus
-  # <@head> = string up to the first <s> tag
-  # <@tail> = string after the last </s> tag
-  # <@rest> = string starting with the latest <s> tag (complete this to
-  # a <s>...</s> structure by reading up to next </s> tag)
-  # <@readCompletely> = boolean specifying whether there's still something
-  # left to read in the file
-  attr_reader :head, :tail
-  # Opens +filename+ and reads up to the first <s ...> tag (or, for a corpus
-  # without sentences, to the end of the file).
-  def initialize(filename)
-    @file = File.new(filename)
-    @readCompletely = false
-    # read stuff into @head and initialize @rest
-    @head = ''
-    @rest = nil
-    @tail = nil
-    begin
-      loop do
-        line = @file.readline()
-        # lazy (.*?): split at the FIRST <s> tag of the line, so that
-        # several sentences on one physical line are not swallowed by @head
-        if line =~ /(.*?)(<s\s.*)/ then
-          @head << $1
-          @rest = $2
-          break
-        elsif line =~ /^(.*)(<\/body[\s>].*)$/
-          # empty corpus
-          @head << $1
-          @tail = $2
-          # gets returns nil at end of file, so the loop ends normally
-          while (line = @file.gets())
-            @tail << "\n" + line
-          end
-          @readCompletely = true
-          break
-        else
-          @head << line
-        end
-      end
-    rescue EOFError
-      # neither a sentence nor a closing body tag was found
-      @readCompletely = true
-    end
-  end
-  def close()
-    @file.close()
-  end
-  # Returns an array with the parsed root element of every remaining
-  # sentence.
-  def extractDOMs()
-    allDOMs = Array.new
-    process_s!() { |dom|
-      allDOMs.push(dom)
-      # process_s! expects an element back; the result string is discarded
-      REXML::Element.new("x")
-    }
-    return allDOMs
-  end
-  # Yields the parsed root element of every remaining sentence.
-  def each_s()
-    process_s!() { |dom|
-      yield(dom)
-      REXML::Element.new("x")
-    }
-  end
-  # This function returns the string for the modified corpus.
-  # It doesn't change the internal state of the FilePartsParser,
-  # and is much more memory (and probably time) efficient than
-  # FileParser#process_s!.
-  # The block that is called by the method is given an element
-  # as its argument and is expected to return a changed element.
-  # Returns nil if the file has already been read completely.
-  def process_s!()
-    if @readCompletely
-      return
-    end
-    ret = ''
-    scan_s() { |element|
-      # Process the <s> ... </s> element
-      # (REXML is referenced explicitly: this class does not include it)
-      doc = REXML::Document.new(element)
-      elt = doc.root
-      changedElt = yield(elt)
-      changedEltAsString = ''
-      changedElt.write(changedEltAsString, 0)
-      ret << changedEltAsString
-    }
-    return ret
-  end
-  # KE 12.6.03: scan_s :
-  # doesn't parse a sentence before yielding it
-  # doesn't allow for any changes
-  # but otherwise the same as process_s!
-  # Yields every remaining <s>...</s> element as a string. When the end of
-  # the file is reached, the unconsumed text becomes @tail.
-  def scan_s()
-    if @readCompletely
-      return
-    end
-    begin
-      loop do
-        # Invariant: At this point, @rest always starts with an
-        # unseen <s> tag.
-        # First, we continue reading until we find the closing </s>
-        # No exception should occur in this loop if we're parsing
-        # a valid XML document.
-        # lazy (.*?): stop at the FIRST </s>, not at the last one in @rest
-        until (closed = /^(.*?<\/s>)(.*)/m.match(@rest))
-          @rest << @file.readline()
-        end
-        element = closed[1]
-        @rest = closed[2]
-        yield(element) # change HERE: element not parsed!
-        # Read on up to the next <s>
-        # lazy (.*?): continue with the NEXT <s>, do not skip to the last one
-        until (upcoming = /(.*?)(<s\s.*)/m.match(@rest))
-          @rest << @file.readline()
-        end
-        @rest = upcoming[2]
-      end
-    rescue EOFError
-      @tail = @rest
-      @readCompletely = true
-    end
-  end
-  # KE 5.11.03: get_rest: read all of the file not processed up to this point
-  # and return it as a string
-  def get_rest()
-    begin
-      loop do
-        @rest << @file.readline()
-      end
-    rescue EOFError
-      @readCompletely = true
-    end
-    return @rest
-  end
-end

data/lib/common/RegXML.rb DELETED

@@ -1,269 +0,0 @@
-# RegXML
-#
-# Katrin Erk June 2005
-# SalsaTigerRegXML: take control of the data structure, no underlying xml
-# representation anymore, re-generation of xml on demand
-# Regular-expression based view on a single XML element (or a piece of
-# text): name, attributes and children are extracted from the string on
-# demand, no DOM is built.
-# NOTE: children are cut out with a non-greedy match up to the first closing
-# tag of the same name, so an element nested directly or indirectly inside an
-# element of the same name is not split correctly.
-class RegXML
-  # string: string representing a single XML element
-  # i_am_text: boolean: xml element (false) or text (true)
-  # Raises RuntimeError if +string+ is not a String, or (for elements) if it
-  # is not a single well-bracketed XML element.
-  def initialize(string, i_am_text = false)
-    # is_a? rather than an exact class comparison: subclasses of String are fine
-    unless string.is_a?(String)
-      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
-    end
-    if i_am_text
-      @s = string
-      @i_am_text = true
-    else
-      # newlines are turned into spaces so that the ^/$ anchored patterns
-      # below always see the element as one line
-      @s = string.gsub(/\n/,  " ").freeze
-      @i_am_text = false
-      element_test()
-      dyck_test()
-    end
-  end
-  # Returns the underlying string with a newline inserted after every ">".
-  def to_s()
-    return xml_readable(@s)
-  end
-  def text?
-    return @i_am_text
-  end
-  # Returns the element name, or nil for text.
-  def name()
-    # text has no name
-    return nil if @i_am_text
-    # xml element
-    unless @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
-      raise "Cannot parse:\n#{xml_readable(@s)}"
-    end
-    return Regexp.last_match(1)
-  end
-  # Returns a hash attribute name -> attribute value (empty for text).
-  def attributes()
-    # text has no attributes
-    return {} if @i_am_text
-    # xml element
-    # remove <element_name  from the beginning of @s,
-    # place the rest up to the first > into elt_contents:
-    # this is a string of the form
-    # - either (name=value)*
-    # - or     (name=value)*/
-    unless @s =~ /^\s*<\s*#{Regexp.escape(name())}(.*)$/
-      raise "Cannot parse:\n #{xml_readable(@s)}"
-    end
-    retv = Hash.new
-    elt_contents = Regexp.last_match(1)
-    # repeat until only > or /> is left
-    while elt_contents !~ /^\s*\/?>/
-      # shave off the next name=value pair
-      # put the rest into elt_contents
-      # make sure that if the value is quoted with ',
-      # we accept " inside the value, and vice versa.
-      pair = /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/.match(elt_contents)
-      unless pair
-        raise "Cannot parse:\n #{xml_readable(elt_contents)}"
-      end
-      retv[pair[1]] = pair[3]
-      elt_contents = pair[4]
-    end
-    return retv
-  end
-  # Returns the children of this element in document order as an array of
-  # RegXML objects: child elements and non-empty stretches of text.
-  # Text and unary elements have no children.
-  def children_and_text()
-    return [] if @i_am_text
-    # <bla/>, no children
-    return [] if unary_element()
-    # @s has the form <bla...>  ... </bla>.
-    # remove <bla ...>  from the beginning of @s,
-    # place the rest up to </bla> into children_s:
-    mainname = Regexp.escape(name())
-    whole = /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/.match(@s)
-    unless whole
-      raise "Cannot parse:\n #{xml_readable(@s)}"
-    end
-    retv = Array.new
-    children_s = whole[3]
-    # repeat until only whitespace is left
-    while children_s !~ /^\s*$/
-      # shave off the next bit of text
-      # put the rest into children_s
-      text = /^\s*(.*?)(<.*$|$)/.match(children_s)
-      unless text
-        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
-        $stderr.puts
-        raise "Cannot parse:\n #{xml_readable(children_s)}"
-      end
-      unless text[1].strip.empty?
-        children_s = text[2]
-        retv << RegXML.new(text[1], true)
-      end
-      # anything left after we've parsed text?
-      if children_s =~ /^\s*$/
-        break
-      end
-      # shave off the next child
-      # and put the rest into children_s
-      # determine the next child's name, and the string index at which
-      # the element start tag ends with either / or >
-      start_tag = /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
-      unless start_tag
-        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
-        $stderr.puts
-        raise "Cannot parse:\n #{xml_readable(children_s)}"
-      end
-      child = start_tag[1]
-      childname = Regexp.escape(start_tag[2])
-      endofelt_ix = start_tag[0].length()
-      # and remove it
-      case children_s[endofelt_ix..-1]
-      when /^\/>(.*)$/
-        # next child is a unary element
-        children_s = Regexp.last_match(1)
-        retv << RegXML.new(child + "/>")
-      when /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/
-        # next child runs up to the first closing tag of its name
-        child_rest = Regexp.last_match(1)
-        children_s = Regexp.last_match(2)
-        retv << RegXML.new(child + child_rest)
-      else
-        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
-        $stderr.puts
-        raise "Cannot parse:\n#{xml_readable(children_s)}"
-      end
-    end
-    return retv
-  end
-  # Manual smoke test: parses two sample elements and prints name, string
-  # form, attributes and children to stdout.
-  def RegXML.test()
-    bla = RegXML.new("  <bla blupp='a\"b'
-lalala=\"c\">
-  <lalala> </lalala>
-  texttext
-  <lala blupp='b'/>
-  nochtext
-  <la> <l/> </la>
-</ bla >
-")
-    dump_for_test(bla)
-    puts "NEU"
-    bla = RegXML.new("  < bla blupp='a\"'/> ")
-    dump_for_test(bla)
-  end
-  # Prints name, string form, attributes and children of +obj+, each part
-  # followed by an empty line. Helper for RegXML.test.
-  def RegXML.dump_for_test(obj)
-    puts "name " + obj.name()
-    puts
-    puts obj.to_s()
-    puts
-    obj.attributes.each { |attr, val|
-      puts "attr " + attr + "=" + val
-    }
-    puts
-    obj.children_and_text.each { |child_obj|
-      if child_obj.text?
-        puts "da text " + child_obj.to_s
-      else
-        puts "da child " + child_obj.to_s
-      end
-    }
-    puts
-  end
-  private_class_method :dump_for_test
-  ##############
-  protected
-  # true if @s starts with < and ends with />, i.e. has the form <bla/>
-  def unary_element()
-    if @s =~ /^\s*<.*\/>\s*$/
-      return true
-    else
-      return false
-    end
-  end
-  # make sure we have a single XML element, either <bla/> or
-  # <bla>...</bla>; raises RuntimeError otherwise
-  def element_test()
-    # <bla/>
-    return if unary_element()
-    # <bla  > ... </bla>
-    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/
-    raise "Cannot parse:\n #{xml_readable(@s)}"
-  end
-  # bracket balance check; raises RuntimeError if it fails
-  def dyck_test()
-    # every prefix of @s must have at least as many < as >
-    opening = 0
-    closing = 0
-    @s.scan(/[<>]/) { |bracket|
-      case bracket
-      when "<"
-        opening += 1
-      when ">"
-        closing += 1
-        if closing > opening
-          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
-        end
-      end
-    }
-    # and in total, @s must have equally many < and >
-    unless @s.count("<") == @s.count(">")
-      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
-    end
-  end
-  # returns a copy of string with a newline after every >
-  def xml_readable(string)
-    return string.gsub(/>/, ">\n")
-  end
-end
-# RegXML.test()