frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,37 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'ISO-8859-1'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            ####################3
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Reformatting to and from 
         
     | 
| 
      
 5 
     | 
    
         
            +
            # a hex format for special characters
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            module Ampersand
              # Replace hexadecimal character references of the form "&#xHH;" in
              # str by the single byte with that code (as produced by Integer#chr).
              #
              # Only well-formed hexadecimal references are rewritten. Everything
              # else -- named entities such as "&amp;", a bare "&" followed later
              # by a ";", references with non-hex digits -- is left untouched.
              # (Matching "&.+?;" instead would swallow all text between a bare
              # "&" and the next reference.)
              #
              # References above 0xff cannot be expressed as one ISO-8859-1 byte
              # (Integer#chr would raise RangeError), so they are kept verbatim.
              #
              # str: string possibly containing "&#x...;" references
              # returns: new string with the references replaced
              def Ampersand.hex_to_iso(str)
                str.gsub(/&#x([0-9a-fA-F]+);/) do |reference|
                  code = $1.hex
                  code <= 0xff ? code.chr : reference
                end
              end

              # Convert str to the hex reference format by first passing it
              # through UtfIso.from_iso_8859_1 and then through utf8_to_hex.
              # NOTE(review): UtfIso is not defined in this file; presumably it
              # comes from the required 'ISO-8859-1' library -- confirm.
              #
              # str: input string (treated as ISO-8859-1 by UtfIso)
              # returns: string in which non-ASCII characters are "&#xHHHH;"
              def Ampersand.iso_to_hex(str)
                utf8_to_hex(UtfIso.from_iso_8859_1(str))
              end

              # Rewrite a UTF-8 string so that every code point >= 0x80 is written
              # as a hexadecimal reference "&#xHHHH;" (lower-case hex, padded to
              # at least four digits). Code points below 0x80 are copied as is.
              #
              # str: UTF-8 encoded string
              # returns: new string containing only characters below 0x80
              def Ampersand.utf8_to_hex(str)
                str.unpack('U*').each_with_object("") do |codepoint, result|
                  if codepoint < 0x80
                    result << codepoint.chr
                  else
                    result << sprintf("&\#x%04x;", codepoint)
                  end
                end
              end
            end
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,375 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # -*- coding: utf-8 -*-
         
     | 
| 
      
 2 
     | 
    
         
            +
            ####
         
     | 
| 
      
 3 
     | 
    
         
            +
            # sp 21 07 05
         
     | 
| 
      
 4 
     | 
    
         
            +
            #
         
     | 
| 
      
 5 
     | 
    
         
            +
            # modified ke 30 10 05: adapted to fit into SynInterface
         
     | 
| 
      
 6 
     | 
    
         
            +
            #
         
     | 
| 
      
 7 
     | 
    
         
            +
            # represents a file containing Berkeley parses
         
     | 
| 
      
 8 
     | 
    
         
            +
            # 
         
     | 
| 
      
 9 
     | 
    
         
            +
            # underlying data structure for individual sentences: SalsaTigerSentence
         
     | 
| 
      
 10 
     | 
    
         
            +
            require "tempfile"
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            require "frprep/SalsaTigerRegXML"
         
     | 
| 
      
 13 
     | 
    
         
            +
            require "frprep/SalsaTigerXMLHelper"
         
     | 
| 
      
 14 
     | 
    
         
            +
            require "frprep/TabFormat"
         
     | 
| 
      
 15 
     | 
    
         
            +
            require "frprep/Counter"
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            require "frprep/AbstractSynInterface"
         
     | 
| 
      
 18 
     | 
    
         
            +
            require "frprep/Tiger.rb"
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 21 
     | 
    
         
            +
            # Interface class
         
     | 
| 
      
 22 
     | 
    
         
            +
            class BerkeleyInterface < SynInterfaceSTXML
         
     | 
| 
      
 23 
     | 
    
         
            +
              $stderr.puts 'Announcing Berkeley Interface' if $DEBUG
         
     | 
| 
      
 24 
     | 
    
         
            +
              BerkeleyInterface.announce_me()
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
              ###
         
     | 
| 
      
 27 
     | 
    
         
            +
              # Name of the system this interface wraps: always "berkeley".
              # Because this class method is called `system`, shell commands in
              # this class are run through an explicit `Kernel.system`.
              def BerkeleyInterface.system
                "berkeley"
              end
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
              ###
         
     | 
| 
      
 32 
     | 
    
         
            +
              # Kind of service this interface provides: always "parser".
              def BerkeleyInterface.service
                "parser"
              end
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
              ###
         
     | 
| 
      
 37 
     | 
    
         
            +
              # initialize to set values for all subsequent processing
         
     | 
| 
      
 38 
     | 
    
         
            +
              # Set up the interface; all positional arguments are handed on to
              # the superclass constructor unchanged.
              def initialize(program_path,  # string: path to system
                             insuffix,      # string: suffix of tab files
                             outsuffix,     # string: suffix for parsed files
                             stsuffix,      # string: suffix for Salsa/TIGER XML files
                             var_hash = {}) # optional arguments in a hash
                super(program_path, insuffix, outsuffix, stsuffix, var_hash)

                # File names are later built by appending to @program_path, so it
                # has to end in a slash.
                # NOTE(review): @program_path is expected to have been set by the
                # superclass constructor -- not visible here.
                @program_path += "/" unless @program_path =~ %r{/$}

                # optional settings; each is nil when its key is missing
                @pos_suffix, @lemma_suffix, @tab_dir =
                  var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
              end
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
              ####
         
     | 
| 
      
 56 
     | 
    
         
            +
              # parse a directory with TabFormat files and write the parse trees to outputdir 
         
     | 
| 
      
 57 
     | 
    
         
            +
              # I assume that the files in inputdir are smaller than 
         
     | 
| 
      
 58 
     | 
    
         
            +
              # the maximum number of sentences that 
         
     | 
| 
      
 59 
     | 
    
         
            +
              # Berkeley can parse in one go (i.e. that they are split)
         
     | 
| 
      
 60 
     | 
    
         
            +
              def process_dir(in_dir,  # string: input directory name
                              out_dir) # string: output directory name
                # For every file in in_dir ending in @insuffix: write its sentences
                # to a temporary file, run the Berkeley parser on that file and
                # store the parser output in out_dir under the same base name with
                # @outsuffix. Both directory names are used as plain string
                # prefixes, so they have to end in a path separator.
                #
                # Parser invocation. Setups tried earlier:
                #   java -d64 -Xmx10000m -jar berkeley-parser.jar -gr gerNegra.01.utf8   (64 bit)
                #   java -Xmx2000m -jar berkeleyParser.jar -gr ger_sm5.gr
                berkeley_prog = "java -jar #{@program_path}berkeley-parser.jar -gr #{@program_path}gerNegra.01.utf8 "

                Dir[in_dir + "*" + @insuffix].each { |inputfilename|
                  STDERR.puts "*** Parsing #{inputfilename} with Berkeley"
                  corpusfilename = File.basename(inputfilename, @insuffix)
                  parsefilename = out_dir + corpusfilename + @outsuffix
                  tempfile = Tempfile.new(corpusfilename)

                  begin
                    # we need neither lemmata nor POS tags; berkeley can do with the words
                    corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
                    corpusfile.each_sentence { |sentence|
                      tempfile.puts sentence.to_s
                    }
                    # flush to disk before the parser reads the file
                    tempfile.close

                    command = "#{berkeley_prog} < #{tempfile.path} > #{parsefilename}"
                    STDERR.puts command
                    # Kernel.system explicitly: this class defines its own
                    # class-level `system`. The parser call used to be disabled
                    # in favour of copying the input to a developer-specific
                    # absolute path, which left out_dir without parses and failed
                    # wherever that path does not exist.
                    Kernel.system(command)
                  ensure
                    # remove the temporary input file, also when an error occurred
                    tempfile.close!
                  end
                }
              end
         
     | 
| 
      
 90 
     | 
    
         
            +
             
     | 
| 
      
 91 
     | 
    
         
            +
              ###
         
     | 
| 
      
 92 
     | 
    
         
            +
              # for a given parsed file:
         
     | 
| 
      
 93 
     | 
    
         
            +
              # yield each sentence as a pair 
         
     | 
| 
      
 94 
     | 
    
         
            +
              #  [SalsaTigerSentence object, FNTabFormatSentence object]
         
     | 
| 
      
 95 
     | 
    
         
            +
              # of the sentence in SalsaTigerXML and the matching tab format sentence
         
     | 
| 
      
 96 
     | 
    
         
            +
              #
         
     | 
| 
      
 97 
     | 
    
         
            +
              # If a parse has failed, returns 
         
     | 
| 
      
 98 
     | 
    
         
            +
              #  [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence] 
         
     | 
| 
      
 99 
     | 
    
         
            +
              # to allow more detailed accounting for failed parses
         
     | 
| 
      
 100 
     | 
    
         
            +
              # (basically just a flat structure with a failed=true attribute 
         
     | 
| 
      
 101 
     | 
    
         
            +
              # at the sentence node)
         
     | 
| 
      
 102 
     | 
    
         
            +
              def each_sentence(parsefilename) # string: name of a Berkeley parser output file
                # Walks through the parse file and the matching tab file in
                # parallel and yields, for each tab sentence, the array
                #   [SalsaTigerSentence, FNTabFormatSentence, mapping]
                # where mapping is the result of BerkeleyInterface.standard_mapping.
                # Sentences for which build_salsatiger returns nil are skipped.
                #
                # Raises a RuntimeError if @tab_dir was not set, if the parse file
                # ends before the tab file does, or if non-comment material is
                # left in the parse file after the last tab sentence.

                # sanity checks
                unless @tab_dir
                  raise "Need to set tab directory on initialization"
                end

                parsefile = File.new(parsefilename)
                begin
                  # get matching tab file for this parser output file
                  tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
                  # NOTE(review): @postag_suffix is not assigned anywhere in the
                  # visible part of this class (initialize sets @pos_suffix) --
                  # confirm whether the superclass provides it.
                  tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)

                  sentid = 0
                  tabfile.each_sentence { |tab_sent| # iterate over corpus sentences
                    # read on in the parse file until the next line that starts a
                    # parse ("( (TOP ...") or marks a failed parse ("(()...");
                    # every skipped line advances the running sentence counter
                    line = nil
                    while true
                      line = parsefile.gets
                      if line.nil? or line =~ /^\( *\(TOP/ or line =~ /^\(\(\)/
                        break
                      end
                      sentid += 1
                    end

                    if line.nil? # while we search a parse, the parse file is over...
                      raise "Error: premature end of parser file!"
                    end

                    # berkeley parser output: remove the outermost brackets /(.*)/
                    line.sub!(/^\( */, '')
                    line.sub!(/ *\) *$/, '')
                    # separate every pair of adjacent closing brackets by a blank
                    line.gsub!(/\)(?=\))/, ') ')
                    # "(LABEL_..." becomes "(LABEL-..."
                    line.gsub!(/(\([A-Z]+)_/, '\1-')

                    # chomp, not chomp!: the bang version returns nil when there is
                    # no line terminator to remove (last line of a file)
                    sentence_str = line.chomp

                    if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
                      my_sent_id = tab_sent.get_sent_id()
                    else
                      my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
                    end

                    st_sent = build_salsatiger(" " + sentence_str + " ", 0,
                                               Array.new, Counter.new(0),
                                               Counter.new(500),
                                               SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
                    next if st_sent.nil?

                    yield [st_sent, tab_sent, BerkeleyInterface.standard_mapping(st_sent, tab_sent)]
                  }

                  # all TabFile sentences are consumed:
                  # now we may just encounter comments, garbage, empty lines etc.
                  while not parsefile.eof?
                    case parsefile.gets
                    when nil, /^%/, /^\s*$/ # empty lines, comments, end of input are fine
                    else
                      raise "Error: premature end of tab file!"
                    end
                  end
                ensure
                  # release the file handle, also on errors or when the caller
                  # leaves the block early
                  parsefile.close
                end
              end
         
     | 
| 
      
 185 
     | 
    
         
            +
              
         
     | 
| 
      
 186 
     | 
    
         
            +
             
     | 
| 
      
 187 
     | 
    
         
            +
              ###
         
     | 
| 
      
 188 
     | 
    
         
            +
              # write Salsa/TIGER XML output to file
         
     | 
| 
      
 189 
     | 
    
         
            +
              def to_stxml_file(infilename,  # string: name of parse file
                                outfilename) # string: name of output stxml file
                # Writes the SalsaTigerXMLHelper header, then the result of get()
                # for every sentence yielded by each_sentence(infilename), then
                # the footer. The block form of File.open closes the output file
                # even when each_sentence raises.
                File.open(outfilename, "w") { |outfile|
                  outfile.puts SalsaTigerXMLHelper.get_header()
                  each_sentence(infilename) { |st_sent, _tab_sent|
                    outfile.puts st_sent.get()
                  }
                  outfile.puts SalsaTigerXMLHelper.get_footer()
                }
              end
         
     | 
| 
      
 201 
     | 
    
         
            +
             
     | 
| 
      
 202 
     | 
    
         
            +
             
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
              ########################
         
     | 
| 
      
 205 
     | 
    
         
            +
              private
         
     | 
| 
      
 206 
     | 
    
         
            +
             
     | 
| 
      
 207 
     | 
    
         
            +
              ###
         
     | 
| 
      
 208 
     | 
    
         
            +
              # Recursive function for parsing a Berkeley parse tree and 
         
     | 
| 
      
 209 
     | 
    
         
            +
              # building a SalsaTigerSentence recursively
         
     | 
| 
      
 210 
     | 
    
         
            +
              #
         
     | 
| 
      
 211 
     | 
    
         
            +
              # Algorithm: manage stack which contains, for the current constituent, 
         
     | 
| 
      
 212 
     | 
    
         
            +
              # child constituents (if a nonterminal), and the category label.
         
     | 
| 
      
 213 
     | 
    
         
            +
              # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
         
     | 
| 
      
 214 
     | 
    
         
            +
              # All children and the category label are popped from the stack and integrated into the 
         
     | 
| 
      
 215 
     | 
    
         
            +
              # TigerSalsa data structure. The new node is re-pushed onto the stack.
         
     | 
| 
      
 216 
     | 
    
         
            +
              # Turn one bracketed parse string into nodes of a SalsaTigerSentence.
              #
              # The string is consumed left to right, starting at pos. The stack
              # holds category labels (Strings) of constituents that are still
              # open, and SynNodes of constituents that are already complete.
              # Closing a constituent pops its children and its label, creates
              # the node via sent_obj.add_syn, and pushes the node back.
              #
              # This runs as a loop rather than by self-recursion, so the Ruby
              # call stack does not grow with the number of tokens, and the
              # check for an empty bracket pair is done once instead of once
              # per token.
              #
              # sentence: string holding the bracketed parse
              # pos:      integer, index in sentence where analysis starts
              # stack:    Array of pending labels/nodes (modified in place)
              # termc:    terminal counter (must respond to next)
              # nontc:    nonterminal counter (must respond to next)
              # sent_obj: SalsaTigerSentence that receives the nodes
              #
              # Returns sent_obj once the whole string is consumed, or nil if
              # the string contains "()". Raises a RuntimeError on input that
              # cannot be analysed.
              def build_salsatiger(sentence, # string
                                   pos,      # position in string (index): integer
                                   stack,    # stack with incomplete nodes: Array
                                   termc,    # terminal counter
                                   nontc,    # nonterminal counter
                                   sent_obj) # SalsaTigerSentence

                # an empty bracket pair anywhere in the string: give up on this sentence
                if sentence =~ /\(\)/
                  return nil
                end

                loop do
                  # main case distinction: match what follows the current position
                  case sentence[pos..-1]

                  when /^ *$/ # nothing left -> whole sentence parsed
                    # exactly one root node has to remain on the stack
                    unless stack.length == 1
                      raise "Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
                    end
                    node = stack.pop
                    node.del_attribute("gf")
                    return sent_obj

                  when /^\s*\(([^ )]+) /
                    # beginning of a new constituent
                    # (opening bracket + category + space, may not contain closing bracket)
                    cat = $1
                    consumed = $&.length
                    if cat.nil? or cat == ""
                      raise "Error: found category nil in sentence #{sentence[pos,10]}, full sentence\n#{sentence}"
                    end
                    stack.push cat # the label waits on the stack until the constituent closes
                    pos += consumed

                  when /^\s*(\S+)\) /
                    # end of a terminal constituent (something before a closing bracket + space)
                    word = $1
                    consumed = $&.length

                    comb_cat = stack.pop
                    if comb_cat.to_s == ""
                      raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
                    end

                    cat, gf = split_cat(comb_cat)
                    node = sent_obj.add_syn("t",
                                            nil,  # cat (doesn't matter here)
                                            SalsaTigerXMLHelper.escape(word), # word
                                            cat,  # pos
                                            termc.next.to_s)
                    # the gf is parked on the node until its parent is built
                    node.set_attribute("gf", gf)
                    stack.push node
                    pos += consumed

                  when /^\s*\)/
                    # end of a nonterminal (nothing before a closing bracket)
                    consumed = $&.length

                    # collect children: pop items until the category label shows up
                    children = []
                    cat = gf = nil
                    loop do
                      if stack.empty?
                        raise "Error: stack empty; cannot find more children"
                      end
                      item = stack.pop
                      case item.class.to_s
                      when "SynNode" # this is a child
                        children.push item
                      when "String" # this is the category label
                        if item.to_s == ""
                          raise "Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
                        end
                        cat, gf = split_cat(item)
                        break
                      else
                        raise "Error: unknown item class #{item.class.to_s}"
                      end
                    end

                    # add a nonterminal node and register the children with it
                    node = sent_obj.add_syn("nt",
                                            cat, # cat
                                            nil, # word (doesn't matter)
                                            nil, # pos (doesn't matter)
                                            nontc.next.to_s)
                    children.each { |child|
                      # move the parked gf from the child node onto the edge
                      child_gf = child.get_attribute("gf")
                      child.del_attribute("gf")
                      node.add_child(child, child_gf)
                      child.add_parent(node, child_gf)
                    }
                    node.set_attribute("gf", gf)
                    stack.push node
                    pos += consumed

                  else
                    raise "Error: cannot analyse sentence at pos #{pos}: #{sentence[pos..-1]}. Complete sentence: \n#{sentence}"
                  end
                end
              end
         
     | 
| 
      
 319 
     | 
    
         
            +
             
     | 
| 
      
 320 
     | 
    
         
            +
             
     | 
| 
      
 321 
     | 
    
         
            +
             
     | 
| 
      
 322 
     | 
    
         
            +
             
     | 
| 
      
 323 
     | 
    
         
            +
              ###
         
     | 
| 
      
 324 
     | 
    
         
            +
              # Berkeley delivers node labels as "phrase type"-"grammatical function"
         
     | 
| 
      
 325 
     | 
    
         
            +
              # but the GF may not be present.
         
     | 
| 
      
 326 
     | 
    
         
            +
             
     | 
| 
      
 327 
     | 
    
         
            +
              # Split a node label of the form "category" or "category-function".
              #
              # cat: string, the label as found in the parse
              #
              # Returns the pair [category, grammatical function]; the function
              # is "" when the label has no hyphen part. Raises a RuntimeError
              # when the label does not have that shape (for instance when it
              # contains more than one hyphen).
              def split_cat(cat)
                matched = cat =~ /^([^-]*)(-([^-]*))?$/
                raise "Error: could not identify category in #{cat}" unless matched

                label = Regexp.last_match(1)
                function = Regexp.last_match(3)

                # third group is absent when the label carries no "-function" part
                [label, function || ""]
              end
         
     | 
| 
      
 345 
     | 
    
         
            +
            end
         
     | 
| 
      
 346 
     | 
    
         
            +
             
     | 
| 
      
 347 
     | 
    
         
            +
             
     | 
| 
      
 348 
     | 
    
         
            +
             
     | 
| 
      
 349 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 350 
     | 
    
         
            +
            # Interpreter class
         
     | 
| 
      
 351 
     | 
    
         
            +
            class BerkeleyInterpreter < Tiger
              announce_me

              ###
              # Systems whose output this class interprets, as a hash
              # service (string) -> system name (string).
              def self.systems
                { "parser" => "berkeley" }
              end

              ###
              # Additional systems whose output this class may interpret,
              # in the same hash format as systems().
              def self.optional_systems
                { "lemmatizer" => "treetagger" }
              end
            end
         
     | 
| 
         @@ -0,0 +1,1165 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ####
         
     | 
| 
      
 2 
     | 
    
         
            +
            # sp 15 04 05
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # modified ke 30 10 05: adapted to fit into SynInterface
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # represents a file containing Collins parses
         
     | 
| 
      
 7 
     | 
    
         
            +
            # 
         
     | 
| 
      
 8 
     | 
    
         
            +
            # underlying data structure for individual sentences: SalsaTigerSentence
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            require "shellwords"
            require "tempfile"
            require "frprep/TabFormat"
            require "frprep/SalsaTigerRegXML"
            require "frprep/SalsaTigerXMLHelper"
            require "frprep/Counter"

            require "frprep/AbstractSynInterface"
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 20 
     | 
    
         
            +
            # Interface class
         
     | 
| 
      
 21 
     | 
    
         
            +
            class CollinsInterface < SynInterfaceSTXML
         
     | 
| 
      
 22 
     | 
    
         
            +
              CollinsInterface.announce_me()
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
              ###
         
     | 
| 
      
 25 
     | 
    
         
            +
              # Name of the system wrapped by this interface.
              def CollinsInterface.system
                "collins"
              end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
              ###
         
     | 
| 
      
 30 
     | 
    
         
            +
              # Kind of service this interface provides.
              def CollinsInterface.service
                "parser"
              end
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
              ###
         
     | 
| 
      
 35 
     | 
    
         
            +
              # initialize to set values for all subsequent processing
         
     | 
| 
      
 36 
     | 
    
         
            +
              # Store the settings used by all later processing.
              #
              # program_path: string, path to the Collins installation
              # insuffix:     string, suffix of tab files
              # outsuffix:    string, suffix for parsed files
              # stsuffix:     string, suffix for Salsa/TIGER XML files
              # var_hash:     optional settings; the keys "pos_suffix",
              #               "lemma_suffix" and "tab_dir" are read here
              def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
                # presumably the superclass stores program_path and the suffixes
                # in instance variables -- its code is not visible here
                super(program_path, insuffix, outsuffix, stsuffix, var_hash)

                # process_dir appends file names directly to the program path,
                # so the path has to end in a slash
                @program_path = @program_path + "/" unless @program_path =~ /\/$/

                # optional settings; each one is nil when its key is absent
                @tab_dir = var_hash["tab_dir"]
                @lemma_suffix = var_hash["lemma_suffix"]
                @pos_suffix = var_hash["pos_suffix"]
              end
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
              
         
     | 
| 
      
 56 
     | 
    
         
            +
              ###
         
     | 
| 
      
 57 
     | 
    
         
            +
              # parse a bunch of TabFormat files (*.<insuffix>) with Collins model 3
         
     | 
| 
      
 58 
     | 
    
         
            +
              # required: POS tags must be present
         
     | 
| 
      
 59 
     | 
    
         
            +
              # produced: in outputdir, files *.<outsuffix>
         
     | 
| 
      
 60 
     | 
    
         
            +
              # I assume that the files in inputdir are smaller than 
         
     | 
| 
      
 61 
     | 
    
         
            +
              # the maximum number of sentences 
         
     | 
| 
      
 62 
     | 
    
         
            +
              # Collins can parse in one go (i.e. that they are split) and I don't have to care
         
     | 
| 
      
 63 
     | 
    
         
            +
              # Parse every tab file in_dir*<insuffix> with Collins model 3 and
              # write the parser output to out_dir<name><outsuffix>.
              #
              # in_dir:  string, name of the input directory; file names are
              #          appended directly, so it has to end in a slash
              # out_dir: string, name of the output directory (same convention)
              #
              # Raises a RuntimeError when no POS suffix was configured.
              #
              # The two file names handed to the shell are escaped, so names
              # with spaces or shell metacharacters neither break the command
              # nor get interpreted by the shell. The temporary input file is
              # removed even when preparing the input or running the parser
              # raises.
              def process_dir(in_dir,        # string: name of input directory
                              out_dir)       # string: name of output directory
                print "parsing ", in_dir, " and writing to ", out_dir, "\n"

                unless @pos_suffix
                  raise "Collins interface: need suffix for POS files"
                end

                collins_prog = "gunzip -c #{@program_path}models/model3/events.gz | nice #{@program_path}code/parser"
                collins_params = " #{@program_path}models/model3/grammar 10000 1 1 1 1"

                Dir[in_dir + "*" + @insuffix].each { |inputfilename|

                  STDERR.puts "*** Parsing #{inputfilename} with Collins"

                  corpusfilename = File.basename(inputfilename, @insuffix)
                  parsefilename = out_dir + corpusfilename + @outsuffix
                  tempfile = Tempfile.new(corpusfilename)

                  begin
                    # we need to have part of speech tags (but no lemmas at this point)
                    # included automatically by FNTabFormatFile initialize from *.pos
                    tabfile = FNTabFormatFile.new(inputfilename, @pos_suffix)

                    CollinsInterface.produce_collins_input(tabfile, tempfile)
                    # close the file so its content is on disk before the parser reads it
                    tempfile.close

                    command = collins_prog + " " + Shellwords.escape(tempfile.path) + " " +
                              collins_params + " > " + Shellwords.escape(parsefilename)
                    print command
                    Kernel.system(command)
                  ensure
                    # close (if still open) and delete the temporary file
                    tempfile.close(true)
                  end
                }
              end
         
     | 
| 
      
 94 
     | 
    
         
            +
             
     | 
| 
      
 95 
     | 
    
         
            +
              ###
         
     | 
| 
      
 96 
     | 
    
         
            +
              # for a given parsed file:
              # yield each sentence as a triple
              #  [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
              # of the sentence in SalsaTigerXML, the matching tab format sentence,
              # and the result of CollinsInterface.standard_mapping for the two
              #
              # If a parse has failed, yields
              #  [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence, mapping]
              # to allow more detailed accounting for failed parses
         
     | 
| 
      
 104 
     | 
    
         
            +
              # Walk a Collins parser output file in step with its tab-format file.
              #
              # parsefilename: string, name of the parser output file. The name of
              #   the tab file is derived from it: @tab_dir, then the basename of
              #   parsefilename without @outsuffix, then @insuffix.
              #
              # Yields, once per corpus sentence, the array
              #   [sentence, tab_sent, CollinsInterface.standard_mapping(sentence, tab_sent)]
              # where sentence is filled by build_salsatiger when the parse line
              # starts with "(TOP~", and is the result of
              # CollinsInterface.failed_sentence otherwise.
              #
              # Raises RuntimeError when @tab_dir is unset, when the parser file
              # ends before the corpus file does, or when parses are left over
              # after the last corpus sentence.
              #
              # The parser file is opened in block form so that its handle is
              # released on every exit path, including exceptions raised here or
              # by the caller's block.
              def each_sentence(parsefilename)

                # sanity checks
                unless @tab_dir
                  raise "Need to set tab directory on initialization"
                end

                File.open(parsefilename) do |parserfile|
                  # get matching tab file for this parser output file
                  tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
                  corpusfile = FNTabFormatFile.new(tabfilename, @pos_suffix, @lemma_suffix)

                  corpusfile.each_sentence do |tab_sent| # iterate over corpus sentences
                    my_sent_id = tab_sent.get_sent_id()

                    # search for the next "relevant" line or the end of the file
                    line = nil
                    loop do
                      line = parserfile.gets
                      break if line.nil? || line =~ /^\(TOP/
                    end
                    # diagnostic echo of the parse line that was picked up
                    STDERR.puts line

                    # while we search a parse, the parse file is over...
                    if line.nil?
                      raise "Error: premature end of parser file!"
                    end

                    line.chomp!

                    # it now holds that line =~ ^(TOP
                    if line =~ /^\(TOP~/
                      # successful parse
                      sent = SalsaTigerSentence.empty_sentence(my_sent_id.to_s)
                      build_salsatiger(line, sent)
                    else
                      # failed parse: let CollinsInterface.failed_sentence build
                      # the stand-in sentence object
                      sent = CollinsInterface.failed_sentence(tab_sent, my_sent_id)
                    end

                    yield [sent, tab_sent, CollinsInterface.standard_mapping(sent, tab_sent)]
                  end

                  # after the end of the corpusfile, check if there are any parses left;
                  # if there are none, everything is fine
                  while (line = parserfile.gets)
                    if line =~ /^\(TOP/
                      raise "Error: premature end of corpus file!"
                    end
                  end
                end
              end
         
     | 
| 
      
 166 
     | 
    
         
            +
             
     | 
| 
      
 167 
     | 
    
         
            +
              ###
         
     | 
| 
      
 168 
     | 
    
         
            +
              # write Salsa/TIGER XML output to file
         
     | 
| 
      
 169 
     | 
    
         
            +
              # Writes the header from SalsaTigerXMLHelper.get_header, then the
              # result of get() for the first element of everything each_sentence
              # yields for infilename, then the footer from
              # SalsaTigerXMLHelper.get_footer.
              #
              # The output file is opened in block form, so it is flushed and
              # closed even when each_sentence raises.
              def to_stxml_file(infilename,  # string: name of parse file
                                outfilename) # string: name of output stxml file

                File.open(outfilename, "w") do |outfile|
                  outfile.puts SalsaTigerXMLHelper.get_header()
                  # each_sentence yields arrays; only the first element is written
                  each_sentence(infilename) do |st_sent, _tab_sent|
                    outfile.puts st_sent.get()
                  end
                  outfile.puts SalsaTigerXMLHelper.get_footer()
                end
              end
         
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
      
 181 
     | 
    
         
            +
             
     | 
| 
      
 182 
     | 
    
         
            +
              ########################
         
     | 
| 
      
 183 
     | 
    
         
            +
              private
         
     | 
| 
      
 184 
     | 
    
         
            +
              
         
     | 
| 
      
 185 
     | 
    
         
            +
              # Build a SalsaTigerSentence corresponding to the Collins parse in argument string.
         
     | 
| 
      
 186 
     | 
    
         
            +
              #
         
     | 
| 
      
 187 
     | 
    
         
            +
              # Special features: removes unary nodes and traces
         
     | 
| 
      
 188 
     | 
    
         
            +
              # Fill st_sent with the nodes of the Collins parse in argument string.
              #
              # string:  one line of Collins output. Tokens are delimited by single
              #          spaces: "(LABEL" opens a nonterminal, ")" closes one, and
              #          anything else is read as a terminal "word/POS".
              # st_sent: sentence object; nodes are created through its add_syn
              #          method (NOTE(review): assumed to return SynNode objects,
              #          since the stack reduction relies on kind_of? SynNode).
              #
              # Special features: removes unary nodes and traces
              # - a nonterminal with exactly one child is dropped and its child
              #   takes its place
              # - a terminal whose POS matches /TRACE/ creates no node
              #
              # Raises RuntimeError when a token is not followed by a space, when a
              # closing bracket has no matching nonterminal label, when a terminal
              # does not split into word and POS, or when the sentence does not end
              # up with exactly one root. Raises StandardError when a nonterminal
              # label has fewer than four "~"-separated components.
              def build_salsatiger(string,st_sent)

                nt_c = Counter.new(500)
                t_c = Counter.new(0)

                position = 0
                stack = []

                while position < string.length
                  case string[position, 1]
                  when "(" # push nonterminal label
                    nextspace = string.index(" ", position)
                    if nextspace.nil?
                      raise "[collins] Error: nonterminal label at position #{position} is not followed by a space"
                    end
                    stack.push string[(position + 1)..(nextspace - 1)]
                    position = nextspace + 1

                  when ")" # reduce stack
                    tempstack = []
                    loop do
                      # get all Nodes from the stack and put them on a tempstack,
                      # until you find a String, which is a not-yet existing nonterminal
                      object = stack.pop
                      if object.kind_of? SynNode
                        tempstack.push(object) # terminal or subtree
                        next
                      end

                      # object is a nonterminal label from here on
                      if tempstack.length == 1
                        # skip unary nodes: do nothing and write tempstack back to stack
                        stack.concat(tempstack)
                        break
                      end

                      if object.nil?
                        # the stack ran empty: more ")" than open nonterminals
                        raise "[collins] Error: closing bracket at position #{position} has no matching nonterminal"
                      end

                      nt_a = object.split("~")
                      unless nt_a.length == 4
                        # something went wrong. maybe it's about character encoding
                        if nt_a.length > 4
                          # yes, assume it's about character encoding:
                          # the surplus pieces are glued back into the head word
                          nt_a = [nt_a[0], nt_a[1..-3].join("~"), nt_a[-2], nt_a[-1]]
                        else
                          # whoa, _less_ pieces than expected: problem.
                          $stderr.puts "Collins parse tree translation nonrecoverable error:"
                          $stderr.puts "Unexpectedly too few components in nonterminal " + nt_a.join("~")
                          raise StandardError.new("nonrecoverable error")
                        end
                      end

                      # construct a new nonterminal
                      node = st_sent.add_syn("nt",
                                             SalsaTigerXMLHelper.escape(nt_a[0].strip), # cat
                                             nil, # word (doesn't matter)
                                             nil, # pos (doesn't matter)
                                             nt_c.next.to_s)
                      node.set_attribute("head", SalsaTigerXMLHelper.escape(nt_a[1].strip))
                      # tempstack holds the children last-to-first; attach them in
                      # sentence order
                      tempstack.reverse_each do |child|
                        node.add_child(child, nil)
                        child.set_parent(node, nil)
                      end
                      stack.push(node)
                      break
                    end
                    position += 2 # skip the bracket and the space after it

                  else # terminal
                    nextspace = string.index(" ", position)
                    if nextspace.nil?
                      raise "[collins] Error: terminal at position #{position} is not followed by a space"
                    end
                    terminal = string[position..nextspace].strip
                    t_a = terminal.split("/")
                    unless t_a.length == 2
                      raise "[collins] Cannot split terminal #{terminal} into word and POS!"
                    end

                    word, pos = t_a

                    unless pos =~ /TRACE/
                      # construct a new terminal
                      node = st_sent.add_syn("t",
                                             nil,
                                             SalsaTigerXMLHelper.escape(CollinsInterface.unescape(word)), # word
                                             SalsaTigerXMLHelper.escape(pos), # pos
                                             t_c.next.to_s)
                      stack.push(node)
                    end
                    position = nextspace + 1
                  end
                end

                # at the very end, we need to have exactly one syntactic root
                if stack.length != 1
                  raise "[collins] Error: Sentence has #{stack.length} roots"
                end
              end
         
     | 
| 
      
 276 
     | 
    
         
            +
             
     | 
| 
      
 277 
     | 
    
         
            +
             
     | 
| 
      
 278 
     | 
    
         
            +
              ####
         
     | 
| 
      
 279 
     | 
    
         
            +
              # extract the Collins parser input format from a TabFormat object
         
     | 
| 
      
 280 
     | 
    
         
            +
              # that includes part-of-speech (pos) 
         
     | 
| 
      
 281 
     | 
    
         
            +
              #
         
     | 
| 
      
 282 
     | 
    
         
            +
              # Write one line per sentence of corpusfile to tempfile:
              # the number of tokens, then the escaped "word pos" pair of each
              # token, all separated by single spaces.
              #
              # corpusfile: object whose each_sentence yields sentences that
              #             respond to each_line_parsed
              # tempfile:   object responding to puts
              #
              # Raises RuntimeError if a token has no "pos" entry.
              def self.produce_collins_input(corpusfile,tempfile)
                corpusfile.each_sentence do |sentence|
                  pairs = []
                  sentence.each_line_parsed do |token|
                    form = token.get("word")
                    pos_tag = token.get("pos")
                    raise "Error: FNTabFormat object not tagged!" if pos_tag.nil?

                    pair = CollinsInterface.escape(form, pos_tag)
                    # a closing bracket that survived escaping is reported
                    # together with the sentence it came from
                    if pair =~ /\)/
                      puts pair
                      puts sentence.to_s
                    end
                    pairs << pair
                  end
                  tempfile.puts pairs.length.to_s + " " + pairs.join(" ")
                end
              end
         
     | 
| 
      
 301 
     | 
    
         
            +
             
     | 
| 
      
 302 
     | 
    
         
            +
              ####
         
     | 
| 
      
 303 
     | 
    
         
            +
              # Turn a word and its POS tag into the "word pos" token that is
              # written to the Collins parser input.
              #
              # word: string, the surface form
              # pos:  string, the part-of-speech tag
              #
              # word representation of brackets is {L,R}R{B,S,C} (bracket, square,
              # curly). A word that is exactly one bracket gets the POS -LRB-
              # (opening) or -RRB- (closing), and pos is ignored. In every other
              # word, embedded brackets are replaced by the same placeholders and
              # "/" by "&Slash;".
              #
              # The word that is passed in is never modified, so callers keep their
              # original string and frozen strings are accepted.
              #
              # returns: string word+" "+pos
              def self.escape(word,pos)
                placeholders = {
                  "(" => "LRB", "[" => "LRS", "{" => "LRC",
                  ")" => "RRB", "]" => "RRS", "}" => "RRC"
                }

                # replace opening or closing brackets
                case word
                when "(", "[", "{"
                  return placeholders[word] + " -LRB-"
                when ")", "]", "}"
                  return placeholders[word] + " -RRB-"
                end

                # catch those brackets or slashes inside words; one pass is enough
                # because no placeholder contains a character that is replaced
                inside_word = placeholders.merge("/" => "&Slash;")
                return word.gsub(/[()\[\]{}\/]/, inside_word) + " " + pos
              end
         
     | 
| 
      
 336 
     | 
    
         
            +
              
         
     | 
| 
      
 337 
     | 
    
         
            +
              ####
         
     | 
| 
      
 338 
     | 
    
         
            +
              # replace replacements with original values
         
     | 
| 
      
 339 
     | 
    
         
            +
              # Undo the replacements made for the parser input: the bracket
              # placeholders LRB, RRB, LRS, RRS, LRC, RRC and "&Slash;" are turned
              # back into ( ) [ ] { } and "/".
              #
              # word: string
              # returns: new string; word itself is left as it is
              def self.unescape(word)
                originals = {
                  "LRB" => "(", "RRB" => ")",
                  "LRS" => "[", "RRS" => "]",
                  "LRC" => "{", "RRC" => "}",
                  "&Slash;" => "/"
                }
                # the placeholders cannot overlap each other, so one scan gives
                # the same result as replacing them one after the other
                return word.gsub(/[LR]R[BSC]|&Slash;/, originals)
              end
         
     | 
| 
      
 342 
     | 
    
         
            +
            end
         
     | 
| 
      
 343 
     | 
    
         
            +
             
     | 
| 
      
 344 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 345 
     | 
    
         
            +
            # Interpreter class
         
     | 
| 
      
 346 
     | 
    
         
            +
            class CollinsTntInterpreter < SynInterpreter
         
     | 
| 
      
 347 
     | 
    
         
            +
              CollinsTntInterpreter.announce_me()
         
     | 
| 
      
 348 
     | 
    
         
            +
             
     | 
| 
      
 349 
     | 
    
         
            +
              ###
         
     | 
| 
      
 350 
     | 
    
         
            +
              # names of the systems interpreted by this class:
         
     | 
| 
      
 351 
     | 
    
         
            +
              # returns a hash service(string) -> system name (string),
         
     | 
| 
      
 352 
     | 
    
         
            +
              # e.g.
         
     | 
| 
      
 353 
     | 
    
         
            +
              # { "parser" => "collins", "lemmatizer" => "treetagger" }
         
     | 
| 
      
 354 
     | 
    
         
            +
              # returns: hash service(string) -> system name(string)
              #
              # NOTE(review): the class name mentions Tnt, but the POS tagger
              # reported here is "treetagger" -- confirm that this is intended.
              def self.systems
                services = {}
                services["pos_tagger"] = "treetagger"
                services["parser"] = "collins"
                services
              end
         
     | 
| 
      
 360 
     | 
    
         
            +
             
     | 
| 
      
 361 
     | 
    
         
            +
              ###
         
     | 
| 
      
 362 
     | 
    
         
            +
              # names of additional systems that may be interpreted by this class
         
     | 
| 
      
 363 
     | 
    
         
            +
              # returns a hash service(string) -> system name(string)
         
     | 
| 
      
 364 
     | 
    
         
            +
              # same format as systems()
         
     | 
| 
      
 365 
     | 
    
         
            +
              # returns: hash service(string) -> system name(string) with the
              # single entry for the lemmatizer
              def self.optional_systems
                extras = {}
                extras["lemmatizer"] = "treetagger"
                extras
              end
         
     | 
| 
      
 370 
     | 
    
         
            +
             
     | 
| 
      
 371 
     | 
    
         
            +
              ###
         
     | 
| 
      
 372 
     | 
    
         
            +
              # generalize over POS tags.
         
     | 
| 
      
 373 
     | 
    
         
            +
              #
         
     | 
| 
      
 374 
     | 
    
         
            +
              # returns one of:
         
     | 
| 
      
 375 
     | 
    
         
            +
              #
         
     | 
| 
      
 376 
     | 
    
         
            +
              # adj:  adjective (phrase)
         
     | 
| 
      
 377 
     | 
    
         
            +
              # adv:  adverb (phrase)
         
     | 
| 
      
 378 
     | 
    
         
            +
              # card: numbers, quantity phrases
         
     | 
| 
      
 379 
     | 
    
         
            +
              # con:  conjunction
         
     | 
| 
      
 380 
     | 
    
         
            +
              # det:  determiner, including possessive/demonstrative pronouns etc.
         
     | 
| 
      
 381 
     | 
    
         
            +
              # for:  foreign material
         
     | 
| 
      
 382 
     | 
    
         
            +
              # noun: noun (phrase), including personal pronouns, proper names, expletives
         
     | 
| 
      
 383 
     | 
    
         
            +
              # part: particles, truncated words (German compound parts)
         
     | 
| 
      
 384 
     | 
    
         
            +
              # prep: preposition (phrase)
         
     | 
| 
      
 385 
     | 
    
         
            +
              # pun:  punctuation, brackets, etc.
         
     | 
| 
      
 386 
     | 
    
         
            +
              # sent: sentence
         
     | 
| 
      
 387 
     | 
    
         
            +
              # top:  top node of a sentence
         
     | 
| 
      
 388 
     | 
    
         
            +
              # verb: verb (phrase)
         
     | 
| 
      
 389 
     | 
    
         
            +
              # nil:  something went wrong
         
     | 
| 
      
 390 
     | 
    
         
            +
              #
         
     | 
| 
      
 391 
     | 
    
         
            +
              # returns: string, or nil
         
     | 
| 
      
 392 
     | 
    
         
            +
              def CollinsTntInterpreter.category(node) # SynNode
                label = CollinsTntInterpreter.simplified_pt(node)
                # phrase type could not be determined
                return nil if label.nil?

                # keep only the part of the label that precedes the first hyphen
                head = label.to_s.strip[/^([^-]*)/, 1]

                # Branch order matters: the first matching branch wins
                # (e.g. /^PRP/ is tested before /^PR(T|N)/).
                case head
                when /^JJ/, /(WH)?ADJP/, /^PDT/
                  "adj"
                when /^RB/, /(WH)?ADVP/, /^UH/
                  "adv"
                when /^CD/, /^QP/
                  "card"
                when /^CC/, /^WRB/, /^CONJP/
                  "con"
                when /^DT/, /^POS/
                  "det"
                when /^FW/, /^SYM/
                  "for"
                when /^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/
                  "noun"
                when /^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/
                  "prep"
                when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/
                  "pun"
                when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/
                  "sent"
                when /^TOP/
                  "top"
                when /^TRACE/
                  "trace"
                when /^V/, /^MD/
                  "verb"
                else
                  # unknown category/POS (English data): no warning is issued
                  nil
                end
              end
         
     | 
| 
      
 419 
     | 
    
         
            +
             
     | 
| 
      
 420 
     | 
    
         
            +
             
     | 
| 
      
 421 
     | 
    
         
            +
              ###
         
     | 
| 
      
 422 
     | 
    
         
            +
              # is relative pronoun?
         
     | 
| 
      
 423 
     | 
    
         
            +
              #
         
     | 
| 
      
 424 
     | 
    
         
            +
              def CollinsTntInterpreter.relative_pronoun?(node) # SynNode
                label = CollinsTntInterpreter.simplified_pt(node)
                # phrase type could not be determined: answer is nil, not false
                return nil if label.nil?

                # keep only the part of the label that precedes the first hyphen
                head = label.to_s.strip[/^([^-]*)/, 1]

                # true iff the label starts with one of the wh-labels below
                wh_patterns = [/^WDT/, /^WHAD/, /^WHNP/, /^WP/]
                wh_patterns.any? { |pattern| pattern === head }
              end
         
     | 
| 
      
 439 
     | 
    
         
            +
             
     | 
| 
      
 440 
     | 
    
         
            +
              ###
         
     | 
| 
      
 441 
     | 
    
         
            +
              # lemma_backoff:
         
     | 
| 
      
 442 
     | 
    
         
            +
              #
         
     | 
| 
      
 443 
     | 
    
         
            +
              # if we have lemma information, return that,
         
     | 
| 
      
 444 
     | 
    
         
            +
              # and failing that, return the word
         
     | 
| 
      
 445 
     | 
    
         
            +
              #
         
     | 
| 
      
 446 
     | 
    
         
            +
              # returns: string, or nil
         
     | 
| 
      
 447 
     | 
    
         
            +
              def CollinsTntInterpreter.lemma_backoff(node)
                # the superclass implementation supplies the lemma (or its fallback)
                lemma = super(node)

                # The lemmatizer may return several alternatives separated by "|":
                # just accept the first one.
                # The pipe must be escaped: unescaped, the pattern is an
                # alternation with the empty pattern, matches every string, and
                # yields nil for an empty or pipe-initial lemma.
                if lemma =~ /^([^|]+)\|/
                  return $1
                else
                  # single lemma form (or nil): hand it back unchanged
                  return lemma
                end
              end
         
     | 
| 
      
 457 
     | 
    
         
            +
             
     | 
| 
      
 458 
     | 
    
         
            +
             
     | 
| 
      
 459 
     | 
    
         
            +
              ###
         
     | 
| 
      
 460 
     | 
    
         
            +
              # simplified phrase type:
         
     | 
| 
      
 461 
     | 
    
         
            +
              # like phrase type, but may simplify
         
     | 
| 
      
 462 
     | 
    
         
            +
              # the constituent label
         
     | 
| 
      
 463 
     | 
    
         
            +
              #
         
     | 
| 
      
 464 
     | 
    
         
            +
              # returns: string, or nil if the label does not start with a word character
         
     | 
| 
      
 465 
     | 
    
         
            +
              def CollinsTntInterpreter.simplified_pt(node)
                label = CollinsTntInterpreter.pt(node)
                # first capture group: the leading run of word characters;
                # any "-x" suffixes after it are matched but discarded
                matched = (label =~ /^(\w+)(-\w)*/)
                matched ? Regexp.last_match(1) : nil
              end
         
     | 
| 
      
 469 
     | 
    
         
            +
             
     | 
| 
      
 470 
     | 
    
         
            +
              ###
         
     | 
| 
      
 471 
     | 
    
         
            +
              # particle_of_verb:
         
     | 
| 
      
 472 
     | 
    
         
            +
              #
         
     | 
| 
      
 473 
     | 
    
         
            +
              # given a node and a nodelist,
         
     | 
| 
      
 474 
     | 
    
         
            +
              # if the node represents a verb:
         
     | 
| 
      
 475 
     | 
    
         
            +
              # see if the verb has a particle among the nodes in nodelist
         
     | 
| 
      
 476 
     | 
    
         
            +
              # if so, return it
         
     | 
| 
      
 477 
     | 
    
         
            +
              #
         
     | 
| 
      
 478 
     | 
    
         
            +
              # returns: SynNode object if successful, else nil
         
     | 
| 
      
 479 
     | 
    
         
            +
              def CollinsTntInterpreter.particle_of_verb(node,
                                                         node_list)
                # node:      SynNode, candidate verb
                # node_list: collection of nodes among which the particle must occur
                #
                # NOTE(review): the category() method visible in this file has no
                # branch that returns "part" -- confirm that particles can
                # actually be detected here.

                # must be verb
                return nil unless CollinsTntInterpreter.category(node) == "verb"

                # must have parent
                return nil unless node.parent

                # look for sisters of the verb node that have the particle category
                particles = node.parent.children.select { |sister|
                  CollinsTntInterpreter.category(sister) == "part"
                }.map { |n| n.children }.flatten.select { |niece|
                  # now look for children of those nodes that are particles
                  # and are in the node list
                  # (the original read the undefined local "nodelist" here,
                  # which raised NameError)
                  node_list.include?(niece) &&
                    CollinsTntInterpreter.category(niece) == "part"
                }

                # first particle found, or nil if there is none
                particles.first
              end
         
     | 
| 
      
 507 
     | 
    
         
            +
             
     | 
| 
      
 508 
     | 
    
         
            +
              ###
         
     | 
| 
      
 509 
     | 
    
         
            +
              # auxiliary?
         
     | 
| 
      
 510 
     | 
    
         
            +
              # 
         
     | 
| 
      
 511 
     | 
    
         
            +
              # returns true if the given node is an auxiliary
         
     | 
| 
      
 512 
     | 
    
         
            +
              # else false
         
     | 
| 
      
 513 
     | 
    
         
            +
              def CollinsTntInterpreter.auxiliary?(node)
                # Configuration searched for:
                #
                #             ---VP---
                #             |      |
                #  the given node   VP-A
                #                    |
                #                verb node

                # the node itself has to be a verb
                return false unless CollinsTntInterpreter.category(node) == "verb"

                # its mother has to exist and be labelled VP
                mother = node.parent
                return false unless mother && mother.category == "VP"

                # one of the mother's children has to be labelled VP-A
                vp_a = mother.children.find { |sibling| sibling.category == "VP-A" }
                return false unless vp_a

                # and that VP-A has to dominate a verb directly
                embedded_verb = vp_a.children.find { |inner|
                  CollinsTntInterpreter.category(inner) == "verb"
                }
                embedded_verb ? true : false
              end
         
     | 
| 
      
 540 
     | 
    
         
            +
             
     | 
| 
      
 541 
     | 
    
         
            +
              ###
         
     | 
| 
      
 542 
     | 
    
         
            +
              # modal?
         
     | 
| 
      
 543 
     | 
    
         
            +
              #
         
     | 
| 
      
 544 
     | 
    
         
            +
              # returns true if the given node is a modal verb,
         
     | 
| 
      
 545 
     | 
    
         
            +
              # else false
         
     | 
| 
      
 546 
     | 
    
         
            +
              def CollinsTntInterpreter.modal?(node)
                # modal iff the part-of-speech tag starts with "MD"
                tag = node.part_of_speech
                (tag =~ /^MD/) ? true : false
              end
         
     | 
| 
      
 553 
     | 
    
         
            +
             
     | 
| 
      
 554 
     | 
    
         
            +
              ###
         
     | 
| 
      
 555 
     | 
    
         
            +
              # voice
         
     | 
| 
      
 556 
     | 
    
         
            +
              #
         
     | 
| 
      
 557 
     | 
    
         
            +
              # given a constituent, return 
         
     | 
| 
      
 558 
     | 
    
         
            +
              # - "active"/"passive" if it is a verb
         
     | 
| 
      
 559 
     | 
    
         
            +
              # - nil, else
         
     | 
| 
      
 560 
     | 
    
         
            +
              def CollinsTntInterpreter.voice(node) # SynNode
                # forms of "to be" that signal a passive construction
                tobe = ["be", "am", "is", "are", "was", "were"]

                # only verbs have a voice
                return nil unless CollinsTntInterpreter.category(node) == "verb"

                tag = CollinsTntInterpreter.pt(node)

                # a gerund, a present tense form, or an infinitive
                # is certainly an active form
                return "active" if ["VBG", "VBP", "VBZ", "VB"].include?(tag)

                # Many word forms are ambiguous between VBN (past participle,
                # passive) and VBD (past tense, active). For these the decision
                # is based on the context: a "to be" form found while walking up
                # through the verb/sentence ancestors means passive, its
                # absence means active.
                # (An earlier variant that collected the lemmas of all
                # c-commanding verbs and also checked for "to have" was
                # disabled in the original source.)

                # both a parent and a grandparent are required
                mother = node.parent
                return nil if mother.nil?
                grandmother = mother.parent
                return nil if grandmother.nil?

                # case 1: the grandparent is a verb phrase or a sentence
                if ["verb", "sent"].include?(CollinsTntInterpreter.category(grandmother))
                  ancestor = node.parent
                  while ancestor
                    # stop climbing as soon as we leave the verb/sentence spine
                    break unless ["verb", "sent"].include?(CollinsTntInterpreter.category(ancestor))

                    has_tobe = ancestor.children.detect { |child|
                      tobe.include?(CollinsTntInterpreter.lemma_backoff(child))
                    }
                    return "passive" if has_tobe

                    ancestor = ancestor.parent
                  end
                  # if no "to be" has been found...
                  return "active"
                end

                # case 2: the grandparent is something else (e.g. a noun phrase).
                # Here, simple past forms are often mis-tagged as passives.
                # If we were cautious, we would return "dontknow" here;
                # however, these cases are so rare that it is unlikely that
                # assignments would be more reliable, so we rely on the
                # POS tag anyway. Any other tag must be some kind of error.
                ["VBN", "VBD"].include?(tag) ? "passive" : nil
              end
         
     | 
| 
      
 653 
     | 
    
         
            +
             
     | 
| 
      
 654 
     | 
    
         
            +
              ###
         
     | 
| 
      
 655 
     | 
    
         
            +
              # gfs
         
     | 
| 
      
 656 
     | 
    
         
            +
              #
         
     | 
| 
      
 657 
     | 
    
         
            +
              # grammatical functions of a constituent:
         
     | 
| 
      
 658 
     | 
    
         
            +
              # 
         
     | 
| 
      
 659 
     | 
    
         
            +
              # returns: a list of pairs [relation(string), node(SynNode)]
         
     | 
| 
      
 660 
     | 
    
         
            +
              # where <node> stands in the relation <relation> to the parameter
         
     | 
| 
      
 661 
     | 
    
         
            +
              # that the method was called with
         
     | 
| 
      
 662 
     | 
    
         
            +
              def CollinsTntInterpreter.gfs(anchor_node,    # SynNode
                                            sent)           # SalsaTigerSentence
                # For every syntactic node of the sentence, ask the
                # category-specific helper which relation (if any) the node
                # bears to the anchor; nodes without a relation are dropped.
                pairs = sent.syn_nodes.map do |candidate|
                  relation =
                    case CollinsTntInterpreter.category(anchor_node)
                    when "adj"
                      CollinsTntInterpreter.gf_adj(anchor_node, candidate)
                    when "verb"
                      CollinsTntInterpreter.gf_verb(anchor_node, candidate)
                    when "noun"
                      CollinsTntInterpreter.gf_noun(anchor_node, candidate)
                    end
                  # anchors of any other category never yield a relation

                  relation ? [relation, candidate] : nil
                end
                pairs.compact
              end
         
     | 
| 
      
 683 
     | 
    
         
            +
             
     | 
| 
      
 684 
     | 
    
         
            +
              ###
         
     | 
| 
      
 685 
     | 
    
         
            +
              # informative_content_node
         
     | 
| 
      
 686 
     | 
    
         
            +
              #
         
     | 
| 
      
 687 
     | 
    
         
            +
              # for most constituents: nil
         
     | 
| 
      
 688 
     | 
    
         
            +
              # for a PP, the NP
         
     | 
| 
      
 689 
     | 
    
         
            +
              # for an SBAR, the VP
         
     | 
| 
      
 690 
     | 
    
         
            +
              # for a VP, the embedded VP 
         
     | 
| 
      
 691 
     | 
    
         
            +
              def CollinsTntInterpreter.informative_content_node(node)
                # Only SBAR, VP and PP constituents have an informative content
                # node; for every other phrase type the answer is nil.
                phrase_type = CollinsTntInterpreter.simplified_pt(node)
                return nil unless ["SBAR", "VP", "PP"].include?(phrase_type)

                # No head terminal: nothing to compare the children against.
                head = CollinsTntInterpreter.head_terminal(node)
                return nil unless head

                head_lemma = CollinsTntInterpreter.lemma_backoff(head)

                # Candidates are the children that have a head terminal of their
                # own whose lemma differs from the lemma of this node's head.
                candidates = node.children().select { |child|
                  child_head = CollinsTntInterpreter.head_terminal(child)
                  child_head && CollinsTntInterpreter.lemma_backoff(child_head) != head_lemma
                }

                # exactly one candidate: that is the answer
                return candidates.first if candidates.length() == 1

                # Otherwise prefer a candidate by the POS of its head terminal:
                # starting in VB for SBAR and VP, starting in NN for PP.
                pos_pattern =
                  case phrase_type
                  when "SBAR", "VP" then /^VB/
                  when "PP" then /^NN/
                  else raise "Shouldn't be here"
                  end

                preferred = candidates.detect { |child|
                  child_head = CollinsTntInterpreter.head_terminal(child)
                  child_head && child_head.part_of_speech() =~ pos_pattern
                }

                # fall back to the first candidate (nil if there is none)
                preferred || candidates.first
              end
         
     | 
| 
      
 737 
     | 
    
         
            +
             
     | 
| 
      
 738 
     | 
    
         
            +
             
     | 
| 
      
 739 
     | 
    
         
            +
             
     | 
| 
      
 740 
     | 
    
         
            +
             
     | 
| 
      
 741 
     | 
    
         
            +
              ########
         
     | 
| 
      
 742 
     | 
    
         
            +
              # prune?
         
     | 
| 
      
 743 
     | 
    
         
            +
              # given a target node t and another node n of the syntactic structure,
         
     | 
| 
      
 744 
     | 
    
         
            +
              # decide whether n is likely to instantiate a semantic role
         
     | 
| 
      
 745 
     | 
    
         
            +
              # of t. If not, recommend n for pruning. 
         
     | 
| 
      
 746 
     | 
    
         
            +
              #
         
     | 
| 
      
 747 
     | 
    
         
            +
              # This method implements a slight variant of Xue and Palmer (EMNLP 2004).
         
     | 
| 
      
 748 
     | 
    
         
            +
              # Pruning according to Xue & Palmer, EMNLP 2004:
         
     | 
| 
      
 749 
     | 
    
         
            +
              # "Step 1: Designate the predicate as the current node and 
         
     | 
| 
      
 750 
     | 
    
         
            +
              #    collect its sisters (constituents attached at the same level
         
     | 
| 
      
 751 
     | 
    
         
            +
              #    as the predicate) unless its sisters are coordinated with the 
         
     | 
| 
      
 752 
     | 
    
         
            +
              #    predicate. If a sister is a PP, also collect its immediate
         
     | 
| 
      
 753 
     | 
    
         
            +
              #    children. 
         
     | 
| 
      
 754 
     | 
    
         
            +
              #  Step 2: Reset the current node to its parent and repeat Step 1
         
     | 
| 
      
 755 
     | 
    
         
            +
              #    till it reaches the top level node.
         
     | 
| 
      
 756 
     | 
    
         
            +
              #
         
     | 
| 
      
 757 
     | 
    
         
            +
              # Modifications made here:
         
     | 
| 
      
 758 
     | 
    
         
            +
              # - paths of length 0 accepted in any case
         
     | 
| 
      
 759 
     | 
    
         
            +
              #
         
     | 
| 
      
 760 
     | 
    
         
            +
              # returns: 0 to recommend n for pruning, else 1 (Integers, both truthy in Ruby)
         
     | 
| 
      
 761 
     | 
    
         
            +
              def CollinsTntInterpreter.prune?(node, # SynNode
                                               paths_to_target, # hash: node ID -> Path object: paths from target to node
                                               terminal_index)  # hash: terminal node -> word index in sentence
                # The result is an Integer, not a boolean: 0 recommends the node
                # for pruning, 1 keeps it. Both values are truthy in Ruby, so
                # callers have to compare against 0/1 explicitly.

                path = paths_to_target[node.id()]

                # no path from target to node: suggest for pruning
                return 0 unless path

                # empty path, target may be its own role: definite accept
                return 1 if path.length == 0

                # count the up and down steps on the path from target to node
                ups = 0
                downs = 0
                path.each_step { |direction, _edgelabel, _nodelabel, _endnode|
                  case direction
                  when /U/ then ups += 1
                  when /D/ then downs += 1
                  end
                }

                # coordination between this node and the target -- drop
                coordinated = CollinsTntInterpreter.conj_sister_between?(node, paths_to_target,
                                                                         terminal_index)
                return 0 if coordinated

                # (1) at least one Up and exactly one Down: keep
                return 1 if ups >= 1 && downs == 1

                # (2) at least one Up, exactly two Down, and the node's parent
                #     has category "prep": keep
                if ups >= 1 && downs == 2
                  parent = node.parent()
                  return 1 if parent && CollinsTntInterpreter.category(parent) == "prep"
                end

                # (3) everything else: discard
                0
              end
         
     | 
| 
      
 822 
     | 
    
         
            +
             
     | 
| 
      
 823 
     | 
    
         
            +
             
     | 
| 
      
 824 
     | 
    
         
            +
              ###
         
     | 
| 
      
 825 
     | 
    
         
            +
              private
         
     | 
| 
      
 826 
     | 
    
         
            +
              
         
     | 
| 
      
 827 
     | 
    
         
            +
              
         
     | 
| 
      
 828 
     | 
    
         
            +
              ###
         
     | 
| 
      
 829 
     | 
    
         
            +
              # given an anchor node and another node that may be some
         
     | 
| 
      
 830 
     | 
    
         
            +
              # grammatical function of the anchor node:
         
     | 
| 
      
 831 
     | 
    
         
            +
              # return the grammatical function (string) if found,
         
     | 
| 
      
 832 
     | 
    
         
            +
              # else nil.
         
     | 
| 
      
 833 
     | 
    
         
            +
              #
         
     | 
| 
      
 834 
     | 
    
         
            +
              # here: anchor node is verb.
         
     | 
| 
      
 835 
     | 
    
         
            +
              def CollinsTntInterpreter.gf_verb(anchor_node, # SynNode
                                                gf_node) # SynNode
                # Grammatical function of gf_node relative to a verbal anchor:
                # a label string ("OA", "SB", "OC" or "MO-<preposition>"),
                # or nil if no function is recognised.

                # first criterion: constituent type of the candidate node
                cat = CollinsTntInterpreter.category(gf_node)
                return nil if cat.nil?

                # second criterion: shape of the path from anchor to candidate
                path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
                return nil if path.nil?   # no path between anchor node and gf node

                path.set_cutoff_last_pt_on_printing(true)
                position =
                  case path.print(true,false,true)
                  when "U VP D ", "U SG D "
                    "inside"
                  when /^U (VP U )*S(BAR)? D $/, /^U (VP U )*VP D ADVP D $/
                    "external"
                  else
                    ""
                  end

                # combine both criteria into the final label
                case cat + "+" + position
                when "noun+inside"
                  # direct object
                  "OA"

                when "noun+external"
                  # only a noun to the LEFT of the anchor counts;
                  # with a passive anchor it is labelled OA, otherwise SB
                  if CollinsTntInterpreter.relative_position(gf_node, anchor_node) != "LEFT"
                    nil
                  elsif CollinsTntInterpreter.voice(anchor_node) == "passive"
                    "OA"
                  else
                    "SB"
                  end

                when "prep+inside"
                  # a "by" phrase of a passive anchor is labelled SB,
                  # any other PP becomes a modifier named after its preposition
                  if CollinsTntInterpreter.voice(anchor_node) == "passive" &&
                     CollinsTntInterpreter.preposition(gf_node) == "by"
                    "SB"
                  else
                    "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
                  end

                when "sent+inside", "sent+external"
                  "OC"

                else
                  nil
                end
              end
         
     | 
| 
      
 900 
     | 
    
         
            +
              
         
     | 
| 
      
 901 
     | 
    
         
            +
              ###
         
     | 
| 
      
 902 
     | 
    
         
            +
              # given an anchor node and another node that may be some
         
     | 
| 
      
 903 
     | 
    
         
            +
              # grammatical function of the anchor node:
         
     | 
| 
      
 904 
     | 
    
         
            +
              # return the grammatical function (string) if found,
         
     | 
| 
      
 905 
     | 
    
         
            +
              # else nil.
         
     | 
| 
      
 906 
     | 
    
         
            +
              #
         
     | 
| 
      
 907 
     | 
    
         
            +
              # here: anchor node is noun.
         
     | 
| 
      
 908 
     | 
    
         
            +
              def CollinsTntInterpreter.gf_noun(anchor_node,  # SynNode
                                                gf_node)      # SynNode
                # Grammatical function of gf_node relative to a nominal anchor:
                # a label string ("AG", "OC" or "MO-<preposition>"),
                # or nil if no function is recognised.

                # first classification: according to constituent type
                cat = CollinsTntInterpreter.category(gf_node)
                return nil if cat.nil?

                # second classification: according to path
                path = CollinsTntInterpreter.path_between(anchor_node, gf_node)
                # no path between anchor node and gf node
                return nil if path.nil?

                path.set_cutoff_last_pt_on_printing(true)
                path_string = path.print(true,false,true)

                # The "beyond-*" classes are currently not consumed by the
                # evaluation below (their cases are commented out); they are
                # kept so those cases can be re-enabled.
                categ2 =
                  case path_string
                  when "U NPB D "
                    "np-neighbor"
                  when "U NPB U NP D "
                    "np-parent"
                  when "U NP D "
                    "np-a"
                  when /^U NPB (U NP )?(U NP )?U S(BAR)? D( VP D)? $/
                    "beyond-s"
                  when /^U NP(B)? (U NP )?U VP D $/
                    "beyond-vp"
                  # second optional group needs its trailing space (as in the
                  # "beyond-s" pattern), otherwise it can never match a
                  # space-separated path
                  when /^U NPB (U NP )?(U NP )?U PP U VP(-A)? D $/
                    "beyond-pp-vp"
                  else
                    ""
                  end

                # now evaluate based on both
                case cat + "+" + categ2
                when "noun+np-neighbor"
                  return "AG"

                when "sent+np-parent"
                  return "OC"

                when "prep+np-parent", "prep+np-a"
                  return "MO-" + CollinsTntInterpreter.preposition(gf_node).to_s
                  # relation of anchor noun to governing verb not covered by "gfs" method
                #     when "verb+beyond-s"
                #       return "SB-of"

                #     when "verb+beyond-vp"
                #       return "OA-of"

                #     when "verb+beyond-pp-vp"
                #       return "MO-of"
                else
                  return nil
                end
              end
         
     | 
| 
      
 967 
     | 
    
         
            +
              
         
     | 
| 
      
 968 
     | 
    
         
            +
             
     | 
| 
      
 969 
     | 
    
         
            +
              ###
         
     | 
| 
      
 970 
     | 
    
         
            +
              # given an anchor node and another node that may be some
         
     | 
| 
      
 971 
     | 
    
         
            +
              # grammatical function of the anchor node:
         
     | 
| 
      
 972 
     | 
    
         
            +
              # return the grammatical function (string) if found,
         
     | 
| 
      
 973 
     | 
    
         
            +
              # else nil.
         
     | 
| 
      
 974 
     | 
    
         
            +
              #
         
     | 
| 
      
 975 
     | 
    
         
            +
              # here: anchor node is adjective.
         
     | 
| 
      
 976 
     | 
    
         
            +
              def CollinsTntInterpreter.gf_adj(anchor_node,  # SynNode
                                               gf_node)      # SynNode
                # Grammatical function of gf_node relative to an adjective anchor.
                # Two keys are combined: the category of gf_node and a class
                # derived from the printed path between anchor and gf_node.
                # Returns the function label (String), or nil when the category
                # is unknown, no path exists, or the combination is not covered.

                # key 1: constituent type of the candidate node
                gf_category = CollinsTntInterpreter.category(gf_node)
                return nil if gf_category.nil?

                # key 2: shape of the path from the anchor to the candidate
                connection = CollinsTntInterpreter.path_between(anchor_node, gf_node)
                # no path between anchor node and gf node
                return nil if connection.nil?

                connection.set_cutoff_last_pt_on_printing(true)
                printed_path = connection.print(true, false, true)

                path_class =
                  case printed_path
                  when /^(U ADJP )?U NPB D $/
                    "nnpath"
                  when "U ADJP D "
                    "adjp-neighbor"
                  when /^(U ADJP )?U (VP U )?S(BAR)? D $/
                    "s"
                  when /^U (ADJP U )?VP D $/
                    "vp"
                  else
                    ""
                  end

                # decide on the combination of both keys
                case gf_category + "+" + path_class
                when "noun+nnpath"
                  "HD"
                when "verb+adjp-neighbor"
                  "OC"
                when "prep+vp", "prep+adjp-neighbor"
                  "MO-#{CollinsTntInterpreter.preposition(gf_node)}"
                else
                  nil
                end
              end
         
     | 
| 
      
 1020 
     | 
    
         
            +
             
     | 
| 
      
 1021 
     | 
    
         
            +
              ####
         
     | 
| 
      
 1022 
     | 
    
         
            +
              # auxiliary of prune?:
         
     | 
| 
      
 1023 
     | 
    
         
            +
              #
         
     | 
| 
      
 1024 
     | 
    
         
            +
              # given a node and a hash mapping node IDs to paths to target:
         
     | 
| 
      
 1025 
     | 
    
         
            +
              # Does that node have a sister that is a coordination and that 
         
     | 
| 
      
 1026 
     | 
    
         
            +
              # is between it and the target?
         
     | 
| 
      
 1027 
     | 
    
         
            +
              #
         
     | 
| 
      
 1028 
     | 
    
         
            +
              def CollinsTntInterpreter.conj_sister_between?(node, # SynNode
                                                             paths_to_target, # Hash: node ID -> Path obj: path from node to target
                                                             ti)  # hash: terminal node -> word index in sentence
                # Does `node` have a coordination sister (category "con") that lies
                # between it and some non-coordination sister whose path to the
                # target is shorter than node's own path?
                #
                # Returns true if such a coordination sister exists, else false
                # (also false when node has no parent, no coordination sisters,
                # or no sister closer to the target).

                parent = node.parent()
                return false unless parent

                # sisters of node that represent coordination
                conj_nodes = parent.children.select { |sib|
                  sib != node && CollinsTntInterpreter.category(sib) == "con"
                }
                return false if conj_nodes.empty?

                # represent a node as a triple
                # [node, leftmost terminal index(node), rightmost terminal index(node)]
                as_triple = lambda { |n|
                  [n, CollinsTntInterpreter.lti(n, ti), CollinsTntInterpreter.rti(n, ti)]
                }

                conj_triples = conj_nodes.map { |n| as_triple.call(n) }
                this_triple = as_triple.call(node)

                # sisters closer to the target than node, also mapped to triples.
                # The membership test runs against the plain node list: testing
                # against the triples can never match a node, so coordination
                # sisters would not be excluded here.
                node_path = paths_to_target[node.id()]
                closer_triples = parent.children.select { |sib|
                  sib != node &&
                    !conj_nodes.include?(sib) &&
                    paths_to_target[sib.id()] &&
                    paths_to_target[sib.id()].length() < node_path.length
                }.map { |n| as_triple.call(n) }

                return false if closer_triples.empty?

                # is there any coordination sister that is in between this node
                # and some sister that is closer to the target?
                conj_triples.any? { |conj_triple|
                  (CollinsTntInterpreter.leftof(conj_triple, this_triple) &&
                    closer_triples.any? { |s| CollinsTntInterpreter.leftof(s, conj_triple) }) ||
                    (CollinsTntInterpreter.rightof(conj_triple, this_triple) &&
                      closer_triples.any? { |s| CollinsTntInterpreter.rightof(s, conj_triple) })
                }
              end
         
     | 
| 
      
 1086 
     | 
    
         
            +
             
     | 
| 
      
 1087 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1088 
     | 
    
         
            +
              # lti, rti: terminal index of the leftmost/rightmost terminal of
         
     | 
| 
      
 1089 
     | 
    
         
            +
              # a given node (SynNode)
         
     | 
| 
      
 1090 
     | 
    
         
            +
              #
         
     | 
| 
      
 1091 
     | 
    
         
            +
              # auxiliary of conj_sister_between?
         
     | 
| 
      
 1092 
     | 
    
         
            +
              def CollinsTntInterpreter.lti(node, # SynNode
                                            terminal_index) # hash: terminal node -> word index in sentence
                # Look up the word index of node's leftmost terminal in
                # terminal_index; nil if no leftmost terminal is found.
                leftmost = CollinsTntInterpreter.leftmost_terminal(node)
                leftmost ? terminal_index[leftmost] : nil
              end
         
     | 
| 
      
 1101 
     | 
    
         
            +
             
     | 
| 
      
 1102 
     | 
    
         
            +
              def CollinsTntInterpreter.rti(node, # SynNode
                                            terminal_index) # hash: terminal node -> word index in sentence
                # Look up the word index of node's rightmost terminal in
                # terminal_index; nil if no rightmost terminal is found.
                rightmost = CollinsTntInterpreter.rightmost_terminal(node)
                rightmost ? terminal_index[rightmost] : nil
              end
         
     | 
| 
      
 1111 
     | 
    
         
            +
             
     | 
| 
      
 1112 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1113 
     | 
    
         
            +
              # leftof, rightof: given 2 triples 
         
     | 
| 
      
 1114 
     | 
    
         
            +
              # [node(SynNode), index of leftmost terminal(integer/nil), index of rightmost terminal(integer/nil)],
         
     | 
| 
      
 1115 
     | 
    
         
            +
              # 
         
     | 
| 
      
 1116 
     | 
    
         
            +
              # auxiliaries of conj_sister_between?
         
     | 
| 
      
 1117 
     | 
    
         
            +
              #
         
     | 
| 
      
 1118 
     | 
    
         
            +
              # return true if both leftmost and rightmost terminal indices of the first triple are
         
     | 
| 
      
 1119 
     | 
    
         
            +
              # smaller than (for leftof) / bigger than (for rightof) the
         
     | 
| 
      
 1120 
     | 
    
         
            +
              # corresponding indices of the second triple
         
     | 
| 
      
 1121 
     | 
    
         
            +
              #
         
     | 
| 
      
 1122 
     | 
    
         
            +
              # return false if some index is nil
         
     | 
| 
      
 1123 
     | 
    
         
            +
              def CollinsTntInterpreter.leftof(triple1,
                                               triple2)
                # true iff both terminal indices of triple1 are strictly smaller
                # than the corresponding indices of triple2; false if any of the
                # four indices is nil. The node slot of each triple is ignored.
                _node1, left1, right1 = triple1
                _node2, left2, right2 = triple2

                return false if [left1, right1, left2, right2].any? { |idx| idx.nil? }
                return true if left1 < left2 && right1 < right2

                false
              end
         
     | 
| 
      
 1136 
     | 
    
         
            +
             
     | 
| 
      
 1137 
     | 
    
         
            +
              def CollinsTntInterpreter.rightof(triple1,
                                                triple2)
                # true iff both terminal indices of triple1 are strictly bigger
                # than the corresponding indices of triple2; false if any of the
                # four indices is nil. The node slot of each triple is ignored.
                _node1, left1, right1 = triple1
                _node2, left2, right2 = triple2

                return false if [left1, right1, left2, right2].any? { |idx| idx.nil? }
                return true if left1 > left2 && right1 > right2

                false
              end
         
     | 
| 
      
 1150 
     | 
    
         
            +
            end
         
     | 
| 
      
 1151 
     | 
    
         
            +
             
     | 
| 
      
 1152 
     | 
    
         
            +
             
     | 
| 
      
 1153 
     | 
    
         
            +
            # use TreeTagger as replacement for TnT; re-use everything, but use treetagger as POS tagger
         
     | 
| 
      
 1154 
     | 
    
         
            +
             
     | 
| 
      
 1155 
     | 
    
         
            +
            class CollinsTreeTaggerInterpreter < CollinsTntInterpreter
              # Same interpretation logic as CollinsTntInterpreter (inherited);
              # only the advertised POS tagger differs.

              # NOTE(review): announce_me is defined outside this chunk;
              # presumably it registers this interpreter class -- confirm.
              announce_me

              # Names of the systems this interpreter handles, keyed by service.
              def self.systems
                {
                  "pos_tagger" => "treetagger",
                  "parser" => "collins"
                }
              end
            end
         
     | 
| 
      
 1165 
     | 
    
         
            +
             
     |