frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,215 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # wrapper script for the OpenNLP Maxent classifier
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # sp July 2007
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            require "tempfile"
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'fileutils'
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            class Maxent

              ###
              # Set up the wrapper around the OpenNLP Maxent command line tools.
              #
              # program_path:: directory of the maxent installation; a trailing "/"
              #                is appended when missing
              # parameters::   array whose first element is the directory that the
              #                java Train / Classify commands are run from
              #
              # Terminates the process with exit status 1 when +parameters+ is
              # missing or empty.
              def initialize(program_path, parameters)
                if parameters.nil? || parameters.empty?
                  $stderr.puts "Error: The OpenNLP maxent system needs two paths (first the location of maxent itself and then the location of the interface, usually program/tools/maxent)."
                  $stderr.puts "I got only the program path."
                  # this is an error exit, so signal it with a non-zero status
                  Kernel.exit(1)
                end

                @maxentpath = program_path.end_with?("/") ? program_path : program_path + "/"
                @interface_path = parameters.first

                # classpath for maxent
                @cp = "#{ENV["CLASSPATH"]}:#{@maxentpath}:#{@maxentpath}lib:#{@maxentpath}lib/trove.jar:#{@maxentpath}output/maxent-2.4.0.jar"
              end

              ###
              # Train a classifier on the C4.5-style (comma separated, label last)
              # file +infilename+.
              #
              # The model is stored at +classifier_file+ (with the compact-storage
              # suffix enforced). When +classifier_file+ is nil/false the model is
              # stored next to the temporary training file.
              #
              # Returns true on success, false if the training command failed.
              def train(infilename, classifier_file)
                trainfile = Tempfile.new(File.basename(infilename) + ".maxenttrain")
                begin
                  # training data in maxent attribute=value format
                  File.open(infilename) { |infile| c45_to_maxent(infile, trainfile) }
                  trainfile.close

                  # remember location; store model in binary, gzipped form
                  @classifier_location = enforce_compact_storage(classifier_file || trainfile.path)

                  # NOTE(review): paths are interpolated into a shell command
                  # without escaping, so they must not contain shell metacharacters.
                  command = ["cd #{@interface_path}; ",
                             "java -cp #{@cp} -Xmx1000m Train",
                             trainfile.path,
                             @classifier_location].join(" ")

                  return false unless successfully_run(command)
                  true
                ensure
                  # always remove the temporary training file, also on failure
                  trainfile.close(true)
                end
              end

              ###
              # Copy the current classifier (from train() or read()) to
              # +classifier_file+ (compact-storage suffix enforced).
              #
              # Returns true on success, nil when there is no current classifier
              # or the copy failed.
              def write(classifier_file)
                classifier_file = enforce_compact_storage(classifier_file)

                unless @classifier_location
                  $stderr.puts "Maxent error: no Maxent classifier available to write to #{classifier_file}."
                  return nil
                end

                @classifier_location = enforce_compact_storage(@classifier_location)

                # train() may already have stored the model at the requested
                # location; copying a file onto itself is an error for FileUtils.cp
                if File.expand_path(@classifier_location) == File.expand_path(classifier_file)
                  return true
                end

                begin
                  FileUtils.cp(@classifier_location, classifier_file) # store classifier
                rescue SystemCallError => e
                  $stderr.puts "Maxent error: cannot write Maxent classifier file #{classifier_file}: #{e.message}"
                  return nil
                end
                true
              end

              ###
              # Returns true iff a model file exists for +classifier_file+
              # (compact-storage suffix enforced).
              def exists?(classifier_file)
                File.exist?(enforce_compact_storage(classifier_file))
              end

              ###
              # Make the model stored at +classifier_file+ the current classifier.
              #
              # return true iff reading the classifier has had success
              def read(classifier_file)
                classifier_file = enforce_compact_storage(classifier_file)

                if exists?(classifier_file)
                  @classifier_location = classifier_file
                  true
                else
                  $stderr.puts "No classifier file "+classifier_file
                  false
                end
              end

              ###
              # Classify the instances in the C4.5-style file +infilename+ with the
              # current classifier and write the results to +outfilename+.
              #
              # Returns true on success; false when there is no current classifier,
              # the classification command failed, or no result file was produced.
              def apply(infilename, outfilename)
                # check first: there is nothing to normalize without a classifier
                return false unless @classifier_location
                @classifier_location = enforce_compact_storage(@classifier_location)

                testfile = Tempfile.new(File.basename(infilename) + ".maxenttrain")
                begin
                  # test data in maxent attribute=value format
                  File.open(infilename) { |infile| c45_to_maxent(infile, testfile) }
                  testfile.close

                  # NOTE(review): paths are interpolated into a shell command
                  # without escaping, so they must not contain shell metacharacters.
                  command = ["cd #{@interface_path}; ",
                             "java -cp #{@cp} -Xmx1000m Classify ",
                             testfile.path,
                             @classifier_location,
                             ">",
                             outfilename].join(" ")

                  # classify
                  return false unless successfully_run(command)

                  # a missing result file means some error in classification
                  File.exist?(outfilename)
                ensure
                  # always remove the temporary test file, also on failure
                  testfile.close(true)
                end
              end

              #####
              # format of Maxent result file:
              # <best label>[<confidence>]  <secondbest_label>[<confidence>] ....
              #
              # returns a list of instance_results
              # where an instance_result is a list of pairs [label, confidence]
              # where the pairs are sorted by confidence (most confident first).
              # A piece that does not have the label[confidence] shape yields the
              # pair [nil, 0.0].
              #
              # Returns nil if the file cannot be opened.
              def read_resultfile(filename)
                begin
                  f = File.new(filename)
                rescue StandardError
                  $stderr.puts "Maxent error: cannot read Maxent result file #{filename}."
                  return nil
                end

                begin
                  retv = Array.new()
                  f.each { |line|
                    # split at whitespace, one piece per label
                    line_results = line.split().map { |piece|
                      match = /(\S+)\[(.+)\]/.match(piece)
                      match ? [match[1], match[2].to_f] : [nil, 0.0]
                    }

                    # sort: most confident label first
                    retv << line_results.sort { |a, b| b[1] <=> a[1] }
                  }
                  retv
                ensure
                  f.close
                end
              end


              ###################################
              private

              ###
              # produce input file for maxent learner: make attribute-value pairs
              # where attribute ==    featureX=
              #
              # Each comma separated input line "v0,v1,...,label" (an optional
              # trailing "." on the label is dropped) is written to +outpipe+ as
              # "0=v0 1=v1 ... label". Lines without any field are skipped.
              def c45_to_maxent(inpipe, outpipe)
                while (line = inpipe.gets)
                  fields = line.chomp.split(",")
                  # blank line: there is no label to extract
                  next if fields.empty?

                  label = fields.pop.chomp(".")
                  pairs = fields.each_with_index.map { |value, i| "#{i}=#{value}" }
                  outpipe.puts((pairs << label).join(" "))
                end
              end

              # since the OpenNLP MaxEnt system determines storage based on filename,
              # make sure that all models are stored internally as binary, gzipped files.
              #
              # Returns +filename+ unchanged when it already ends in "Model.bin.gz",
              # otherwise +filename+ with that suffix appended.
              def enforce_compact_storage(filename)
                if filename.end_with?("Model.bin.gz")
                  filename
                else
                  filename + "Model.bin.gz"
                end
              end

              ###
              # Run +command+ through Kernel.system and return its result
              # (true, false, or nil). A failure is reported on stderr but does
              # not abort the program.
              def successfully_run(command)
                retv = Kernel.system(command)
                unless retv
                  $stderr.puts "Error running classifier. Continuing."
                  $stderr.puts "Offending command: "+command
                end
                retv
              end
            end
         
     | 
| 
         @@ -0,0 +1,1388 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ####
         
     | 
| 
      
 2 
     | 
    
         
            +
            # KE Nov 2005
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Interface for use of the Minipar parser:
         
     | 
| 
      
 5 
     | 
    
         
            +
            # parsing with Salsa/Tiger XML output format,
         
     | 
| 
      
 6 
     | 
    
         
            +
            # class for interpreting the Salsa/Tiger XML data structures
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            require "tempfile"
         
     | 
| 
      
 9 
     | 
    
         
            +
            require "common/TabFormat"
         
     | 
| 
      
 10 
     | 
    
         
            +
            require "common/SalsaTigerRegXML"
         
     | 
| 
      
 11 
     | 
    
         
            +
            require "common/SalsaTigerXMLHelper"
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            require "common/AbstractSynInterface"
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            #########################################
         
     | 
| 
      
 16 
     | 
    
         
            +
            # MiniparSentence class
         
     | 
| 
      
 17 
     | 
    
         
            +
            #
         
     | 
| 
      
 18 
     | 
    
         
            +
            # analyze one minipar output sentence,
         
     | 
| 
      
 19 
     | 
    
         
            +
            # provide access
         
     | 
| 
      
 20 
     | 
    
         
            +
            #
         
     | 
| 
      
 21 
     | 
    
         
            +
            # hash representation of a node:
         
     | 
| 
      
 22 
     | 
    
         
            +
            # keys are
         
     | 
| 
      
 23 
     | 
    
         
            +
            #   index, word , lemma, pos, parent_index, edgelabel, governing_lemma, antecedent_index
         
     | 
| 
      
 24 
     | 
    
         
            +
            #
         
     | 
| 
      
 25 
     | 
    
         
            +
            # other access: as SalsaTigerSentence object
         
     | 
| 
      
 26 
     | 
    
         
            +
            class MiniparSentence
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
              ########
         
     | 
| 
      
 29 
     | 
    
         
            +
              # Build the internal node list for one Minipar output sentence.
              #
              # sentence: array of strings, one Minipar node description per string.
              # Each string is turned into a node hash by analyze_line.
              def initialize(sentence) # array:string, one minipar node per string
                @nodes = []
                sentence.each do |node_line|
                  @nodes.push(analyze_line(node_line))
                end

                # Minipar sometimes lists a node with a lower index later in the
                # sentence, so bring the nodes into index order.
                # The comparison uses String#to_i on the "index" entry.
                @nodes.sort! { |left, right| left["index"].to_i <=> right["index"].to_i }

                # Both stay nil until set_tabsent succeeds.
                # @nodehash_mapping: hash tabindex -> array:nodehashes
                @tabsent = nil
                @nodehash_mapping = nil
              end
         
     | 
| 
      
 42 
     | 
    
         
            +
             
     | 
| 
      
 43 
     | 
    
         
            +
              #####
         
     | 
| 
      
 44 
     | 
    
         
            +
              # Read access to the node hashes of this sentence.
              #
              # returns: a frozen shallow copy of the internal node array, so
              # callers cannot add or remove nodes of this object through it.
              def nodes()
                snapshot = @nodes.clone
                snapshot.freeze
              end
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
              #####
         
     | 
| 
      
 49 
     | 
    
         
            +
              # stxml:
         
     | 
| 
      
 50 
     | 
    
         
            +
              #
         
     | 
| 
      
 51 
     | 
    
         
            +
              # make SalsaTigerSentence object from this sentence,
         
     | 
| 
      
 52 
     | 
    
         
            +
              # one node per minipar node.
         
     | 
| 
      
 53 
     | 
    
         
            +
              # if it is a nonterminal, duplicate it as a terminal
         
     | 
| 
      
 54 
     | 
    
         
            +
              #
         
     | 
| 
      
 55 
     | 
    
         
            +
              # return: pair [SalsaTigerSentence, mapping]:
         
     | 
| 
      
 56 
     | 
    
         
            +
              # if we have a tab sent, mapping is a mapping from tab word indices to SynNode objects
         
     | 
| 
      
 57 
     | 
    
         
            +
              # of the minipar sentence representation
         
     | 
| 
      
 58 
     | 
    
         
            +
              # Public entry point for the Salsa/Tiger XML conversion.
              #
              # sentence_id: ID to give to the resulting sentence
              # returns: whatever salsatigerxml_output returns for this ID
              def stxml(sentence_id)
                salsatigerxml_output(sentence_id)
              end
         
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
      
 62 
     | 
    
         
            +
              #####
         
     | 
| 
      
 63 
     | 
    
         
            +
              # set tabsent:
         
     | 
| 
      
 64 
     | 
    
         
            +
              # set this tab format sentence, which has entries "word", "lineno", 
         
     | 
| 
      
 65 
     | 
    
         
            +
              # as the sentence matching this minipar output sentence.
         
     | 
| 
      
 66 
     | 
    
         
            +
              #
         
     | 
| 
      
 67 
     | 
    
         
            +
              # On success, remember the tab sentence as well as the mapping
         
     | 
| 
      
 68 
     | 
    
         
            +
              # between fntab sentence indices and minipar node hash indices
         
     | 
| 
      
 69 
     | 
    
         
            +
              #
         
     | 
| 
      
 70 
     | 
    
         
            +
              # returns true on success
         
     | 
| 
      
 71 
     | 
    
         
            +
              #         or false if matching failed
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
              # Align a tab format sentence with this Minipar sentence.
              #
              # tabsent: TabFileFormat object; must respond to each_line_parsed and
              #          yield line objects that respond to get("word")
              # sloppy:  not nil or false: allow sloppy match, i.e. keep mappings in
              #          which a tab word is only partially covered by Minipar words
              #
              # On success, @tabsent and @nodehash_mapping are set and true is returned.
              # On failure, false is returned and the object is left unchanged.
              # If more than one mapping is possible, the first one found is used.
              def set_tabsent(tabsent, # TabFileFormat object
                              sloppy = true) # not nil or false: allow sloppy match

                # empty minipar sentence? then no match
                return false if @nodes.empty?

                # tabwords: array:string
                tabwords = Array.new
                tabsent.each_line_parsed { |l| tabwords << l.get("word") }

                # Find the first minipar node that actually carries a word.
                # The search is bounded: a sentence consisting only of empty
                # nodes yields nil here instead of running past the end of @nodes.
                first_node_no = @nodes.index { |node| !node["word"].nil? }
                if first_node_no.nil?
                  # no minipar node with a word: nothing to match
                  return false
                end

                # main data structure: a chart of partial mappings fn_index -> minipar_index
                # represented as an array of partial mappings
                # each partial mapping is an array of triples [fn_index, min_index, "full"|"partial"]
                #
                # enter data for 1st minipar node into the chart
                old_chart = fnw_minw_match(tabwords, @nodes[first_node_no]["word"]).map { |fnw_index, match_how|
                  [[fnw_index, first_node_no, match_how]]
                }

                if old_chart.empty?
                  # unmatched single word in minipar sentence
                  return false
                end

                # enter data for the rest of the minipar nodes into the chart
                (first_node_no + 1).upto(@nodes.length - 1) { |node_no|
                  # minipar node with empty word, skip
                  next unless @nodes[node_no]["word"]

                  new_chart = Array.new

                  # each partial mapping found up to now:
                  # try to extend it, record results in new_chart
                  old_chart.each { |partial_mapping|
                    prev_fnw_index, _prev_mw_index, prev_match_how = partial_mapping.last

                    # where do we start looking in tabwords? same word as before, or advance one?
                    case prev_match_how
                    when "full"
                      fnw_index = prev_fnw_index + 1
                    when "partial"
                      fnw_index = prev_fnw_index
                    else
                      raise "Shouldn't be here"
                    end

                    # match offsets are relative to the remaining tab words,
                    # so add fnw_index to get back to absolute tab indices
                    fnw_minw_match(tabwords[fnw_index..-1],
                                   @nodes[node_no]["word"]).each { |match_offset, match_how|
                      new_chart.push partial_mapping + [[fnw_index + match_offset, node_no, match_how]]
                    }
                  }

                  if new_chart.empty?
                    # no partial mappings found that would work up to this minipar node:
                    # matching failed
                    return false
                  end

                  old_chart = new_chart
                }

                # filter chart: unless sloppy matching is allowed, discard mappings
                # in which some tab word is only matched partially
                if sloppy
                  chart = old_chart
                else
                  chart = old_chart.select { |mapping|
                    mapping_ok = true
                    tabwords.each_with_index { |fnw, fnw_index|
                      tuples = mapping.select { |other_fnw_index, _mnode_no, _how| other_fnw_index == fnw_index }

                      unless tuples.empty?
                        # the minipar words mapped to this tab word must
                        # concatenate to exactly the tab word
                        word = tuples.map { |_ix, mnode_no, _how| @nodes[mnode_no]["word"] }.join()

                        unless word == fnw
                          mapping_ok = false
                          break
                        end
                      end
                    }
                    mapping_ok
                  }
                end

                return false if chart.empty?

                # success: found mapping (the first one, if there are several)
                # nodehash_mapping: hash tab sentence word index -> array: node hashes
                @tabsent = tabsent
                @nodehash_mapping = Hash.new
                chart.first.each { |tabindex, mindex, _how|
                  (@nodehash_mapping[tabindex] ||= Array.new) << @nodes[mindex]
                }
                return true
              end
         
     | 
| 
      
 199 
     | 
    
         
            +
             
     | 
| 
      
 200 
     | 
    
         
            +
              # nodehash_mapping: hash tabindex -> array:nodehashes
         
     | 
| 
      
 201 
     | 
    
         
            +
              # Read access to the mapping computed by set_tabsent.
              #
              # returns: a frozen shallow copy of the hash tabindex -> array:nodehashes,
              # or nil if no mapping has been stored
              def nodehash_mapping()
                @nodehash_mapping ? @nodehash_mapping.clone.freeze() : nil
              end
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
      
 209 
     | 
    
         
            +
             
     | 
| 
      
 210 
     | 
    
         
            +
              ################################################
         
     | 
| 
      
 211 
     | 
    
         
            +
              ################################################
         
     | 
| 
      
 212 
     | 
    
         
            +
              private
         
     | 
| 
      
 213 
     | 
    
         
            +
             
     | 
| 
      
 214 
     | 
    
         
            +
              ###########
         
     | 
| 
      
 215 
     | 
    
         
            +
              # analyze one line of the sentence array.
         
     | 
| 
      
 216 
     | 
    
         
            +
              # 
         
     | 
| 
      
 217 
     | 
    
         
            +
              # examples of possible entries:
         
     | 
| 
      
 218 
     | 
    
         
            +
              # E1      (()     fin C   E4      )
         
     | 
| 
      
 219 
     | 
    
         
            +
              # 3       (them   ~ N     2       obj     (gov call))
         
     | 
| 
      
 220 
     | 
    
         
            +
              # E5      (()     they N  2       subj    (gov call)      (antecedent 1))
         
     | 
| 
      
 221 
     | 
    
         
            +
              # Parse one Minipar output line into a node hash.
              #
              # line: string of the form  <index> TAB ( <tab-separated description> )
              #
              # returns: hash with key "index" and, where present in the line, the keys
              #   "word", "lemma", "pos", "parent_index", "edgelabel",
              #   "governing_lemma", "antecedent_index"
              # raises: RuntimeError if the line does not have the overall shape above
              #
              # Unparseable sub-fields are reported on $stderr and otherwise ignored.
              def analyze_line(line)
                raise "Cannot parse line: #{line}" unless line =~ /^(\w+)\t\((.+)\)\s*$/

                # line structure:
                # index ( node descr )
                node = Hash.new()
                node["index"] = $1
                fields = $2.split("\t")
                word, lemma_pos, parentindex, edgelabel, governor, antecedent = fields

                # word: drop surrounding quotes; "()" stands for an empty word
                unless word.nil?
                  word = $1 if word =~ /^['"](.+)['"]$/
                  node["word"] = word unless word == "()"
                end

                # lemma, POS ("U" means neither is given for this node)
                if lemma_pos
                  lemma_pos.strip!
                  unless lemma_pos == "U"
                    if lemma_pos =~ /^(.+)\s(.+)$/
                      # lemma may be "...." with spaces in.
                      # the regexp uses the last whitespace to separate lemma and POS
                      lemma, pos = $1, $2

                      if lemma =~ /^"(.+)"$/
                        # remove quotes around lemma
                        lemma = $1
                      elsif lemma == "~"
                        # lemma same as word (nil if the node has no word)
                        lemma = node["word"]
                      end

                      node["lemma"] = lemma
                      node["pos"] = pos
                    elsif lemma_pos.split().length() == 1
                      # only pos given
                      node["pos"] = lemma_pos.strip()
                    else
                      $stderr.puts "cannot parse lemma_pos pair " + lemma_pos
                    end
                  end
                end

                # parent index: missing or "*" marks the root
                unless parentindex.nil? || parentindex == "*"
                  node["parent_index"] = parentindex
                end

                # edge label, if one is given
                unless edgelabel.nil? || edgelabel.strip.empty?
                  node["edgelabel"] = edgelabel
                end

                # governing word, expected format:
                # (gov <governing_lemma>)
                if governor && !governor.strip.empty?
                  if governor =~ /^\(gov\s(.+)\)$/
                    node["governing_lemma"] = $1
                  elsif governor != "(gov )"
                    # "(gov )" is fine: no governor given. anything else is malformed
                    $stderr.puts "cannot parse governor "+ governor
                  end
                end

                # antecedent, expected format:
                # (antecedent <index>)
                if antecedent && !antecedent.strip.empty?
                  if antecedent =~ /^\(antecedent\s(.+)\)$/
                    node["antecedent_index"] = $1
                  else
                    $stderr.puts "cannot parse antecedent "+ antecedent
                  end
                end

                node
              end
         
     | 
| 
      
 317 
     | 
    
         
            +
             
     | 
| 
      
 318 
     | 
    
         
            +
              ###########
         
     | 
| 
      
 319 
     | 
    
         
            +
              # returns: pair [SalsaTigerSentence object describing this minipar parse, mapping tab line -> SynNodes (nil if no tab sentence)]
         
     | 
| 
      
 320 
     | 
    
         
            +
              def salsatigerxml_output(sentence_id)
                # Build a SalsaTigerSentence from the minipar node hashes in @nodes.
                #
                # sentence_id: ID passed on to SalsaTigerSentence.empty_sentence
                #
                # returns: pair [sentence object,
                #                result of construct_tabsent_mapping_stxml(sentence object)]
                # raises:  RuntimeError if no SynNode exists for a node, for its
                #          parent index, or for its antecedent index

                # start sentence object
                sent_obj = SalsaTigerSentence.empty_sentence(sentence_id)

                # determine children of each node
                # so we'll know which nodes to make terminal and which to make nonterminal
                i_have_children = Hash.new
                @nodes.each { |node|
                  if (parent_ix = node["parent_index"])
                    # node has parent. record the parent as having children
                    i_have_children[parent_ix] = true
                  end
                }

                # make SynNode objects for each minipar node
                # minipar terminal: one SynNode terminal
                # minipar nonterminal: one SynNode nonterminal, plus one SynNode terminal
                #                      duplicating the word, lemma and POS info
                #                      to keep with the SalsaTigerSentence assumptions that
                #                      the sentence can be read off from the terminals
                index_to_synnode = Hash.new
                @nodes.each { |minipar_node|
                  node_id = minipar_node["index"]
                  if minipar_node["word"]
                    word = SalsaTigerXMLHelper.escape(minipar_node["word"])
                  elsif not(i_have_children[minipar_node["index"]])
                    # node without word and children: probably has an antecedent
                    # add an empty word so the Salsa tool can represent the node with the antecedent
                    word = ""
                  else
                    word = nil
                  end

                  if word
                    # make a terminal SynNode for this minipar node
                    # only if it has a word, otherwise it's not much use as a terminal
                    # (note: the empty string counts as a word here, "" is truthy in Ruby)
                    t_node = sent_obj.add_syn("t",
                                              nil,  # category
                                              word, # word
                                              minipar_node["pos"], # POS
                                              node_id) # node ID
                    if minipar_node["lemma"]
                      t_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
                    end

                    # remember this node
                    index_to_synnode[minipar_node["index"]] = t_node
                  else
                    t_node = nil
                  end

                  if i_have_children[minipar_node["index"]] or not(word)
                    # does this minipar node have children, or
                    # does it lack a word? then add a (second) nonterminal SynNode for it.
                    # Its ID is the minipar index plus the suffix "nt".
                    node_id = node_id + "nt"
                    nt_node = sent_obj.add_syn("nt",
                                               minipar_node["pos"],  # category
                                               word, # word
                                               minipar_node["pos"], # POS
                                               node_id) # node ID
                    if minipar_node["lemma"]
                      nt_node.set_attribute("lemma", SalsaTigerXMLHelper.escape(minipar_node["lemma"]))
                    end

                    # link t node to nt node
                    if t_node
                      nt_node.add_child(t_node, "Head")
                      t_node.add_parent(nt_node, "Head")
                    end

                    # just terminal node: remember it
                    # both terminal and nonterminal: remember just the nonterminal
                    # (this overwrites the terminal entry made above)
                    index_to_synnode[minipar_node["index"]] = nt_node
                  end
                }

                # link SynNodes
                @nodes.each { |minipar_node|
                  # find my syn node
                  my_synnode = index_to_synnode[minipar_node["index"]]
                  unless my_synnode
                    # name the offending index, like the parent/antecedent errors below do
                    raise "Error: no syn node constructed for index #{minipar_node["index"]} in sentence #{sentence_id}"
                  end

                  # link to parent syn node
                  if (parent_ix = minipar_node["parent_index"])
                    parent_synnode = index_to_synnode[parent_ix]
                    unless parent_synnode
                      raise "Error: no syn node constructed for parent index #{parent_ix} in sentence #{sentence_id}"
                    end

                    parent_synnode.add_child(my_synnode, minipar_node["edgelabel"])
                    my_synnode.add_parent(parent_synnode, minipar_node["edgelabel"])
                  end

                  # remember antecedent: both the node itself and its index, the latter as an attribute
                  # this way, we have
                  # - easy access to the antecedent via the node itself
                  # - a record of the antecedent in the SalsaTigerXML output
                  if (antecedent_ix = minipar_node["antecedent_index"])
                    antecedent_synnode = index_to_synnode[antecedent_ix]
                    unless antecedent_synnode
                      raise "Error: no syn node constructed for antecedent index #{antecedent_ix} in sentence #{sentence_id}"
                    end

                    my_synnode.set_f("antecedent", antecedent_synnode)
                    my_synnode.set_attribute("antecedent", antecedent_synnode.id())
                  end
                }

                return [sent_obj, construct_tabsent_mapping_stxml(sent_obj)]
              end
         
     | 
| 
      
 434 
     | 
    
         
            +
             
     | 
| 
      
 435 
     | 
    
         
            +
              ###########
         
     | 
| 
      
 436 
     | 
    
         
            +
              # construct mapping fntab line -> array of SynNodes
         
     | 
| 
      
 437 
     | 
    
         
            +
              # and add fntab words not present in minipar as children of the
         
     | 
| 
      
 438 
     | 
    
         
            +
              # SalsaTigerSentence object's root
         
     | 
| 
      
 439 
     | 
    
         
            +
              def construct_tabsent_mapping_stxml(sent)
                # Map each line of @tabsent to the SynNodes of sent that realize it.
                #
                # sent: sentence object offering id(), syn_node_with_id(), syn_roots()
                #       and add_syn()
                #
                # returns: nil if @tabsent is not set; otherwise a hash
                #          tab line number -> array of SynNodes
                # raises:  RuntimeError if a minipar node listed in @nodehash_mapping
                #          has neither a terminal nor a nonterminal ("nt") node in sent
                return nil unless @tabsent

                mapping = Hash.new
                # index of the most recently seen minipar node; used to derive
                # IDs for terminals that are added for unparsed tab words
                last_minipar_index = nil

                @tabsent.each_line_parsed do |tabline|
                  lineno = tabline.get("lineno")
                  mapping[lineno] = Array.new

                  # @nodehash_mapping: hash tabsent lineno -> array of members of @nodes
                  nodehashes = @nodehash_mapping[lineno]

                  if nodehashes
                    # this tab word has matching minipar nodes: look each one up
                    nodehashes.each do |nodehash|
                      last_minipar_index = nodehash["index"]

                      # try the terminal ID first, then the nonterminal ID
                      base_id = sent.id() + "_" + nodehash["index"]
                      found = sent.syn_node_with_id(base_id) ||
                              sent.syn_node_with_id(base_id + "nt")

                      unless found
                        # no match after all?
                        raise "missing: SalsaTigerSentence node for minipar node with index #{nodehash["index"]}"
                      end

                      mapping[lineno] << found
                    end
                  else
                    # no minipar node for this tab word yet: add a terminal for it
                    # below the first root, with an ID one past the last minipar
                    # index seen (intended to place it correctly in
                    # sent.terminals_ordered)
                    root = sent.syn_roots.first
                    filler_id = (last_minipar_index.to_i + 1).to_s
                    filler = sent.add_syn("t",                 # terminal
                                          "",                  # category
                                          tabline.get("word"), # word
                                          "",                  # part of speech
                                          filler_id)           # ID
                    root.add_child(filler, "-")
                    filler.add_parent(root, "-")

                    mapping[lineno] = [filler]
                  end
                end

                mapping
              end
         
     | 
| 
      
 487 
     | 
    
         
            +
             
     | 
| 
      
 488 
     | 
    
         
            +
              ######
         
     | 
| 
      
 489 
     | 
    
         
            +
              # return a list of pairs [fntab word index, match type]
         
     | 
| 
      
 490 
     | 
    
         
            +
              # with an entry for each fntab word on fnw_list that matches minw,
         
     | 
| 
      
 491 
     | 
    
         
            +
              # either fnw == minw (match_type "full") or minw part_of fnw (match_type "partial")
         
     | 
| 
      
 492 
     | 
    
         
            +
              def fnw_minw_match(fnw_list, minw)
                # Compare the minipar word minw against every fntab word in fnw_list.
                #
                # returns: array of pairs [index into fnw_list, match type], one per
                #          matching fntab word, in list order. Match type is
                #          "full" if the fntab word equals minw, and
                #          "partial" if minw merely occurs inside the fntab word.
                matches = []

                fnw_list.each_with_index do |fn_word, position|
                  match_type =
                    if fn_word == minw
                      "full"
                    elsif fn_word.index(minw)
                      "partial"
                    end

                  matches << [position, match_type] if match_type
                end

                matches
              end
         
     | 
| 
      
 507 
     | 
    
         
            +
            end
         
     | 
| 
      
 508 
     | 
    
         
            +
             
     | 
| 
      
 509 
     | 
    
         
            +
             
     | 
| 
      
 510 
     | 
    
         
            +
             
     | 
| 
      
 511 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 512 
     | 
    
         
            +
            # Interface class
         
     | 
| 
      
 513 
     | 
    
         
            +
            class MiniparInterface < SynInterfaceSTXML
         
     | 
| 
      
 514 
     | 
    
         
            +
              MiniparInterface.announce_me()
         
     | 
| 
      
 515 
     | 
    
         
            +
             
     | 
| 
      
 516 
     | 
    
         
            +
              ###
         
     | 
| 
      
 517 
     | 
    
         
            +
              def MiniparInterface.system
                # name of the system this interface class handles
                "minipar"
              end
         
     | 
| 
      
 520 
     | 
    
         
            +
             
     | 
| 
      
 521 
     | 
    
         
            +
              ###
         
     | 
| 
      
 522 
     | 
    
         
            +
              def MiniparInterface.service
                # kind of service this interface class provides
                "parser"
              end
         
     | 
| 
      
 525 
     | 
    
         
            +
             
     | 
| 
      
 526 
     | 
    
         
            +
              ###
         
     | 
| 
      
 527 
     | 
    
         
            +
              # initialize to set values for all subsequent processing
         
     | 
| 
      
 528 
     | 
    
         
            +
              def initialize(program_path, # string: path to system
                             insuffix,     # string: suffix of tab files
                             outsuffix,    # string: suffix for parsed files
                             stsuffix,     # string: suffix for Salsa/TIGER XML files
                             var_hash = {}) # optional arguments in a hash
                # The superclass receives all arguments unchanged.
                super(program_path, insuffix, outsuffix, stsuffix, var_hash)

                # Optional settings read from var_hash; each is nil when its key is absent.
                @pos_suffix, @lemma_suffix, @tab_dir =
                  var_hash.values_at("pos_suffix", "lemma_suffix", "tab_dir")
              end
         
     | 
| 
      
 541 
     | 
    
         
            +
             
     | 
| 
      
 542 
     | 
    
         
            +
              
         
     | 
| 
      
 543 
     | 
    
         
            +
              ###
         
     | 
| 
      
 544 
     | 
    
         
            +
              # process one file, writing the result to outfilename
         
     | 
| 
      
 545 
     | 
    
         
            +
              #  input format is FNTabFormat, output format is 
         
     | 
| 
      
 546 
     | 
    
         
            +
              #  Minipar format
         
     | 
| 
      
 547 
     | 
    
         
            +
              #
         
     | 
| 
      
 548 
     | 
    
         
            +
              # returns: nothing
         
     | 
| 
      
 549 
     | 
    
         
            +
              def process_file(infilename,    # string: name of input file
                               outfilename)   # string: name of output file
                # Writes the words of each sentence of the tab file infilename onto
                # one line of a temporary file (words separated and followed by a
                # blank), then runs @program_path with that file on stdin and
                # stdout redirected to outfilename.
                #
                # The temporary file is removed when the method finishes, also if
                # reading the input or running the program raises.
                #
                # NOTE(review): the paths are interpolated into a shell command
                # unquoted, so file names containing blanks or shell
                # metacharacters will not work.
                tf = Tempfile.new("minipar")

                begin
                  reader = FNTabFormatFile.new(infilename)
                  reader.each_sentence { |sent|
                    sent.each_line_parsed { |line|
                      tf.print line.get("word"), " "
                    }
                    tf.puts
                  }

                  # close (and thereby flush) the file before the external
                  # program reads it; the path stays valid after a plain close
                  tf.close()
                  %x{#{@program_path} < #{tf.path()} > #{outfilename}}
                ensure
                  # close if still open, and unlink the temporary file now
                  # instead of waiting for garbage collection / interpreter exit
                  tf.close!
                end
              end
         
     | 
| 
      
 564 
     | 
    
         
            +
             
     | 
| 
      
 565 
     | 
    
         
            +
              #########
         
     | 
| 
      
 566 
     | 
    
         
            +
              # yields tuples
         
     | 
| 
      
 567 
     | 
    
         
            +
              #  [ minipar output sentence, tab sentence, mapping]
         
     | 
| 
      
 568 
     | 
    
         
            +
              #
         
     | 
| 
      
 569 
     | 
    
         
            +
              # minipar output sentence is 
         
     | 
| 
      
 570 
     | 
    
         
            +
              #  - either an array of hashes, each describing one node;
         
     | 
| 
      
 571 
     | 
    
         
            +
              #  - or a SalsaTigerSentence object
         
     | 
| 
      
 572 
     | 
    
         
            +
              #  - or a MiniparSentence object 
         
     | 
| 
      
 573 
     | 
    
         
            +
              #    (which has methods that return the sentence as either a
         
     | 
| 
      
 574 
     | 
    
         
            +
              #     nodehash array or a SalsaTigerSentence)
         
     | 
| 
      
 575 
     | 
    
         
            +
              #
         
     | 
| 
      
 576 
     | 
    
         
            +
              # tab sentence: matching tab sentence, if tab file has been given on initialization
         
     | 
| 
      
 577 
     | 
    
         
            +
              #
         
     | 
| 
      
 578 
     | 
    
         
            +
              # mapping: hash: line in tab sentence(integer) -> array:SynNode
         
     | 
| 
      
 579 
     | 
    
         
            +
              #   mapping tab sentence nodes to matching nodes in the SalsaTigerSentence data structure
         
     | 
| 
      
 580 
     | 
    
         
            +
              #
         
     | 
| 
      
 581 
     | 
    
         
            +
              # If a parse has failed, returns 
         
     | 
| 
      
 582 
     | 
    
         
            +
              #  [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence] 
         
     | 
| 
      
 583 
     | 
    
         
            +
              # to allow more detailed accounting for failed parses
         
     | 
| 
      
 584 
     | 
    
         
            +
              def each_sentence(parsefilename,    # name of minipar output file
                                format = "stxml") # format to return data in
                # Iterates over the Minipar parses in parsefilename and yields one
                # array per sentence. The array layout depends on format:
                #   "stxml"      -> [sentence from parse.stxml, tab sentence or nil, mapping]
                #   "nodehashes" -> [parse.nodes(), tab sentence or nil, parse.nodehash_mapping()]
                #   "objects"    -> [MiniparSentence, tab sentence or nil]
                # Afterwards every tab sentence that no parse was matched to is
                # yielded as [failed sentence, tab sentence, standard mapping].
                #
                # Raises a RuntimeError if no tab directory was configured or if
                # format is not one of the values above.

                # sanity checks
                unless @tab_dir
                  raise "Need to set tab directory on initialization"
                end

                # get matching tab file for this parser output file,
                # read its contents
                tabfilename = @tab_dir + File.basename(parsefilename, @outsuffix) + @insuffix
                @tab_sentences = Array.new
                reader = FNTabFormatFile.new(tabfilename)
                reader.each_sentence { |sent_obj| @tab_sentences << sent_obj }

                sentno = 0
                tab_sentno = 0
                # tab sentence index -> true for each tab sentence that got a parse
                matched_tabsent = Hash.new()

                stream = open_minipar_outfile(parsefilename)
                begin
                  each_miniparsent_obj(stream) { |parse|

                    if (matching_tab_sentno = matching_tabsent(parse, tab_sentno))
                      # found matching tab sentence; continue the search for the
                      # next parse right behind it
                      tabsent = @tab_sentences[matching_tab_sentno]
                      tab_sentno = matching_tab_sentno + 1
                      matched_tabsent[matching_tab_sentno] = true
                    else
                      tabsent = nil
                    end

                    # yield minipar parse in the required format
                    case format
                    when "nodehashes"
                      yield [parse.nodes(), tabsent, parse.nodehash_mapping()]
                    when "stxml"
                      sent, mapping = parse.stxml(@filename_core + sentno.to_s)
                      yield [sent, tabsent, mapping]
                    when "objects"
                      yield [parse, tabsent]
                    else
                      raise "Unknown each_sentence format #{format}"
                    end

                    sentno += 1
                  }
                ensure
                  # release the file handle (or the zcat pipe) even if the
                  # consumer's block or the format check raised
                  stream.close()
                end

                ##
                # each unmatched tab sentence: yield as failed parse object
                @tab_sentences.each_with_index { |tabsent, index|
                  unless matched_tabsent[index]
                    # spotted an unmatched sentence
                    sent = MiniparInterface.failed_sentence(tabsent, tabsent.get_sent_id())
                    yield [sent, tabsent, MiniparInterface.standard_mapping(sent, tabsent)]
                  end
                }
              end
         
     | 
| 
      
 641 
     | 
    
         
            +
             
     | 
| 
      
 642 
     | 
    
         
            +
              ###
         
     | 
| 
      
 643 
     | 
    
         
            +
              # write Salsa/TIGER XML output to file
         
     | 
| 
      
 644 
     | 
    
         
            +
              def to_stxml_file(infilename,  # string: name of parse file
                                outfilename) # string: name of output stxml file
                # Writes header, one entry per sentence delivered by each_sentence
                # (default "stxml" format), and footer to outfilename.
                #
                # The block form of File.open closes the output file even when
                # reading or converting a sentence raises, so no handle is leaked
                # and buffered output is flushed.
                File.open(outfilename, "w") { |outfile|
                  outfile.puts SalsaTigerXMLHelper.get_header()
                  each_sentence(infilename) { |st_sent, _tabsent|
                    outfile.puts st_sent.get()
                  }
                  outfile.puts SalsaTigerXMLHelper.get_footer()
                }
              end
         
     | 
| 
      
 655 
     | 
    
         
            +
             
     | 
| 
      
 656 
     | 
    
         
            +
             
     | 
| 
      
 657 
     | 
    
         
            +
              #####################
         
     | 
| 
      
 658 
     | 
    
         
            +
              private
         
     | 
| 
      
 659 
     | 
    
         
            +
             
     | 
| 
      
 660 
     | 
    
         
            +
              ###
         
     | 
| 
      
 661 
     | 
    
         
            +
              # open minipar outfile
         
     | 
| 
      
 662 
     | 
    
         
            +
              #
         
     | 
| 
      
 663 
     | 
    
         
            +
              # return: IO stream for reading minipar outfile
         
     | 
| 
      
 664 
     | 
    
         
            +
              def open_minipar_outfile(filename)
                # Opens a minipar output file for reading and remembers its base
                # name (without a ".gz" suffix) in @filename_core.
                #
                # filename: string, path of the minipar output file
                # returns:  IO object to read the (uncompressed) content from
                # raises:   RuntimeError if a non-gzipped file cannot be opened

                ##
                # zipped? then unzip first
                # (the Ruby read-zipped package doesn't seem to be reliable)
                if filename =~ /\.gz$/
                  @filename_core = File.basename(filename, ".gz")
                  # argv-array form: zcat is started directly, without a shell,
                  # so blanks or shell metacharacters in the path are harmless
                  return IO.popen(["zcat", filename])
                else
                  @filename_core = File.basename(filename)
                  begin
                    return File.new(filename)
                  rescue
                    raise "Couldn't read minipar file #{filename}"
                  end
                end
              end
         
     | 
| 
      
 681 
     | 
    
         
            +
             
     | 
| 
      
 682 
     | 
    
         
            +
              ###
         
     | 
| 
      
 683 
     | 
    
         
            +
              # each_miniparsent_obj
         
     | 
| 
      
 684 
     | 
    
         
            +
              # read minipar output from stream,
         
     | 
| 
      
 685 
     | 
    
         
            +
              # yield sentence-wise as MiniparSentence objects
         
     | 
| 
      
 686 
     | 
    
         
            +
              def each_miniparsent_obj(stream) # IO object: stream to read from
                # Reads minipar output line by line and yields one MiniparSentence
                # per sentence. A sentence starts with a line consisting of "("
                # (or "> (") and ends with a line consisting of ")"; lines outside
                # of a sentence are skipped. A sentence that is still open at the
                # end of the stream is not yielded.

                # status: string
                # "outside": waiting for next start of sentence with ( alone in a line
                # "inside": inside a sentence, sentence ends with ) alone on a line
                status = "outside"

                # sentence: array of strings, one for each line of the sentence
                sentence = Array.new()

                while (line = stream.gets())
                  text = line.chomp().strip()

                  case status
                  when "outside"
                    # start of sentence?
                    if ["(", "> ("].include? text
                      # Start a fresh array instead of clearing the old one: the
                      # previous array was handed to MiniparSentence.new, and
                      # reusing it would alter that sentence behind its back
                      # if it kept the reference.
                      sentence = Array.new()
                      status = "inside"
                    end

                  when "inside"
                    if text == ")"
                      # end of sentence
                      yield MiniparSentence.new(sentence)
                      status = "outside"
                    else
                      # inside sentence
                      sentence << text
                    end
                  else
                    raise "Shouldn't be here"
                  end # case
                end # while file not ended
              end
         
     | 
| 
      
 719 
     | 
    
         
            +
             
     | 
| 
      
 720 
     | 
    
         
            +
              ###
         
     | 
| 
      
 721 
     | 
    
         
            +
              # matching_tabsent
         
     | 
| 
      
 722 
     | 
    
         
            +
              #
         
     | 
| 
      
 723 
     | 
    
         
            +
              # if we have tab sentences, and if there is 
         
     | 
| 
      
 724 
     | 
    
         
            +
              # a tab sentence matching the given minipar sentence,
         
     | 
| 
      
 725 
     | 
    
         
            +
              # return its index, else return nil
         
     | 
| 
      
 726 
     | 
    
         
            +
              #
         
     | 
| 
      
 727 
     | 
    
         
            +
              # If there is a matching tabsent,
         
     | 
| 
      
 728 
     | 
    
         
            +
              # the MiniparSentence will remember it (and the terminal mapping)
         
     | 
| 
      
 729 
     | 
    
         
            +
              def matching_tabsent(parse,  # MiniparSentence object
                                   tabsent_no) # integer: starting point in @tab_sentences array
                # Looks for the tab sentence belonging to the given parse.
                # First every tab sentence from index tabsent_no onwards is tried
                # with parse.set_tabsent; if none is accepted, the tab sentence at
                # tabsent_no itself is tried once more in "sloppy" mode.
                #
                # returns: integer index into @tab_sentences of the accepted
                #          tab sentence, or nil if there is none
                if @tab_sentences.empty?
                  return nil
                end

                tabsent_no.upto(@tab_sentences.length() - 1) { |index|
                  if parse.set_tabsent(@tab_sentences[index])
                    return index
                  end
                }

                # no match found up to now. so try sloppy match --
                # but only if tabsent_no still points at an existing tab sentence.
                # Once all tab sentences are used up, the index is past the end
                # of the array and there is nothing left to match against.
                if tabsent_no < @tab_sentences.length() and
                    parse.set_tabsent(@tab_sentences[tabsent_no], "sloppy")
                  #      $stderr.puts "Warning: sloppy match used. Minipar sentence:"
                  #      $stderr.puts parse.nodes().map { |n| n["word"].to_s }.join(" ")
                  #      $stderr.puts "Matching fntab sentence: "
                  #      @tab_sentences[tabsent_no].each_line_parsed { |l| $stderr.print l.get("word"), " " }
                  #      $stderr.puts
                  return tabsent_no
                end

                #    $stderr.puts "Warning: No match found for minipar sentence:"
                #    $stderr.puts parse.nodes().map { |n| n["word"].to_s }.join(" ")
                #    $stderr.puts "First tested fntab sentence: "
                #    @tab_sentences[tabsent_no].each_line_parsed { |l| $stderr.print l.get("word"), " " }
                #    $stderr.puts

                return nil
              end
         
     | 
| 
      
 759 
     | 
    
         
            +
            end
         
     | 
| 
      
 760 
     | 
    
         
            +
             
     | 
| 
      
 761 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 762 
     | 
    
         
            +
            # Interpreter class
         
     | 
| 
      
 763 
     | 
    
         
            +
            class MiniparInterpreter < SynInterpreter
         
     | 
| 
      
 764 
     | 
    
         
            +
              MiniparInterpreter.announce_me()
         
     | 
| 
      
 765 
     | 
    
         
            +
             
     | 
| 
      
 766 
     | 
    
         
            +
              ###
         
     | 
| 
      
 767 
     | 
    
         
            +
              # names of the systems interpreted by this class:
         
     | 
| 
      
 768 
     | 
    
         
            +
              # returns a hash service(string) -> system name (string),
         
     | 
| 
      
 769 
     | 
    
         
            +
              # e.g.
         
     | 
| 
      
 770 
     | 
    
         
            +
              # { "parser" => "collins", "lemmatizer" => "treetagger" }
         
     | 
| 
      
 771 
     | 
    
         
            +
              def MiniparInterpreter.systems()
                # service (string) -> name of the system handled by this class
                handled = Hash.new
                handled["parser"] = "minipar"
                return handled
              end
         
     | 
| 
      
 776 
     | 
    
         
            +
             
     | 
| 
      
 777 
     | 
    
         
            +
              ###
         
     | 
| 
      
 778 
     | 
    
         
            +
              # names of additional systems that may be interpreted by this class
         
     | 
| 
      
 779 
     | 
    
         
            +
              # returns a hash service(string) -> system name(string)
         
     | 
| 
      
 780 
     | 
    
         
            +
              # (same format as systems())
         
     | 
| 
      
 781 
     | 
    
         
            +
              def MiniparInterpreter.optional_systems()
                # no additional systems: the hash is empty
                Hash.new
              end
         
     | 
| 
      
 784 
     | 
    
         
            +
             
     | 
| 
      
 785 
     | 
    
         
            +
              ###
         
     | 
| 
      
 786 
     | 
    
         
            +
              # generalize over POS tags.
         
     | 
| 
      
 787 
     | 
    
         
            +
              #
         
     | 
| 
      
 788 
     | 
    
         
            +
              # returns one of:
         
     | 
| 
      
 789 
     | 
    
         
            +
              #
         
     | 
| 
      
 790 
     | 
    
         
            +
              # adj:  adjective (phrase)
         
     | 
| 
      
 791 
     | 
    
         
            +
              # adv:  adverb (phrase)
         
     | 
| 
      
 792 
     | 
    
         
            +
              # card: numbers, quantity phrases
         
     | 
| 
      
 793 
     | 
    
         
            +
              # con:  conjunction
         
     | 
| 
      
 794 
     | 
    
         
            +
              # det:  determiner, including possessive/demonstrative pronouns etc.
         
     | 
| 
      
 795 
     | 
    
         
            +
              # for:  foreign material
         
     | 
| 
      
 796 
     | 
    
         
            +
              # noun: noun (phrase), including personal pronouns, proper names, expletives
         
     | 
| 
      
 797 
     | 
    
         
            +
              # part: particles, truncated words (German compound parts)
         
     | 
| 
      
 798 
     | 
    
         
            +
              # prep: preposition (phrase)
         
     | 
| 
      
 799 
     | 
    
         
            +
              # pun:  punctuation, brackets, etc.
         
     | 
| 
      
 800 
     | 
    
         
            +
              # sent: sentence
         
     | 
| 
      
 801 
     | 
    
         
            +
              # top:  top node of a sentence
         
     | 
| 
      
 802 
     | 
    
         
            +
              # verb: verb (phrase)
         
     | 
| 
      
 803 
     | 
    
         
            +
              # nil:  something went wrong
         
     | 
| 
      
 804 
     | 
    
         
            +
              #
         
     | 
| 
      
 805 
     | 
    
         
            +
              # returns: string, or nil
         
     | 
| 
      
 806 
     | 
    
         
            +
              def MiniparInterpreter.category(node) # SynNode
                # Maps a node to a generalized category string, or nil if none of
                # the rules below applies. The checks run in a fixed order; the
                # first one that fires decides the result.
                node = MiniparInterpreter.ensure_upper(node)

                # lemma containing NUM: cardinal
                return "card" if node.get_attribute("lemma") =~ /NUM/

                # POS "U", attached as lex-mod to a verb:
                # this node is part of a complex verb
                if node.part_of_speech() == "U" &&
                    node.parent_label() == "lex-mod" &&
                    node.parent &&
                    MiniparInterpreter.category(node.parent) == "verb"
                  return "part"
                end

                # word made up of punctuation characters only
                return "pun" if node.word =~ /^[!?;`'",(){}\[\]\.\:]+$/

                # node without a parent
                return "top" if node.parent.nil?

                # everything else is decided by the part of speech
                pos = node.part_of_speech()
                case pos
                when "A"
                  # same POS for adjectives and adverbs:
                  # adverb below a verb, adjective otherwise
                  mother = node.parent
                  if mother && MiniparInterpreter.category(mother) == "verb"
                    "adv"
                  else
                    "adj"
                  end
                when "Det"  then "det"
                when "N"    then "noun"
                when "Prep" then "prep"
                when "C"    then "sent"
                when /^V/   then "verb"
                else
                  nil
                end
              end
         
     | 
| 
      
 860 
     | 
    
         
            +
             
     | 
| 
      
 861 
     | 
    
         
            +
              ###
         
     | 
| 
      
 862 
     | 
    
         
            +
              # is relative pronoun?
         
     | 
| 
      
 863 
     | 
    
         
            +
              #
         
     | 
| 
      
 864 
     | 
    
         
            +
              def MiniparInterpreter.relative_pronoun?(node) # SynNode
                # true iff the label of the edge to the parent starts with "wh"
                (node.parent_label() =~ /^wh/) ? true : false
              end
         
     | 
| 
      
 871 
     | 
    
         
            +
             
     | 
| 
      
 872 
     | 
    
         
            +
              ###
         
     | 
| 
      
 873 
     | 
    
         
            +
              # phrase type:
         
     | 
| 
      
 874 
     | 
    
         
            +
              # constituent label for nonterminals,
         
     | 
| 
      
 875 
     | 
    
         
            +
              # part of speech for terminals
         
     | 
| 
      
 876 
     | 
    
         
            +
              #
         
     | 
| 
      
 877 
     | 
    
         
            +
              # returns: string
         
     | 
| 
      
 878 
     | 
    
         
            +
              def MiniparInterpreter.pt(node)
                # the phrase type reported here is simply the node's part of speech
                pos = node.part_of_speech()
                pos
              end
         
     | 
| 
      
 881 
     | 
    
         
            +
             
     | 
| 
      
 882 
     | 
    
         
            +
              ###
         
     | 
| 
      
 883 
     | 
    
         
            +
              # auxiliary?
         
     | 
| 
      
 884 
     | 
    
         
            +
              # 
         
     | 
| 
      
 885 
     | 
    
         
            +
              # returns true if the given node is an auxiliary
         
     | 
| 
      
 886 
     | 
    
         
            +
              #
         
     | 
| 
      
 887 
     | 
    
         
            +
              # returns: boolean
         
     | 
| 
      
 888 
     | 
    
         
            +
              def MiniparInterpreter.auxiliary?(node)
                # A node is an auxiliary when aux_or_modal? accepts it
                # but modal? rejects it.
                #
                # returns: boolean (strictly true or false)
                return false unless MiniparInterpreter.aux_or_modal?(node)

                !MiniparInterpreter.modal?(node)
              end
         
     | 
| 
      
 896 
     | 
    
         
            +
             
     | 
| 
      
 897 
     | 
    
         
            +
              ###
         
     | 
| 
      
 898 
     | 
    
         
            +
              # modal?
         
     | 
| 
      
 899 
     | 
    
         
            +
              #
         
     | 
| 
      
 900 
     | 
    
         
            +
              # returns true if the given node is a modal verb
         
     | 
| 
      
 901 
     | 
    
         
            +
              #
         
     | 
| 
      
 902 
     | 
    
         
            +
              # returns: boolean
         
     | 
| 
      
 903 
     | 
    
         
            +
              def MiniparInterpreter.modal?(node)
                # A node is a modal verb when aux_or_modal? accepts it and
                # its word is one of the surface forms listed below.
                #
                # returns: boolean (strictly true or false)
                return false unless MiniparInterpreter.aux_or_modal?(node)

                modal_forms = ["can", "could", "must", "should", "shall"]
                modal_forms.include?(node.word())
              end
         
     | 
| 
      
 916 
     | 
    
         
            +
             
     | 
| 
      
 917 
     | 
    
         
            +
              ###
         
     | 
| 
      
 918 
     | 
    
         
            +
              # head_terminal
         
     | 
| 
      
 919 
     | 
    
         
            +
              #
         
     | 
| 
      
 920 
     | 
    
         
            +
              # given a constituent, return the terminal node
         
     | 
| 
      
 921 
     | 
    
         
            +
              # that describes its headword
         
     | 
| 
      
 922 
     | 
    
         
            +
              #
         
     | 
| 
      
 923 
     | 
    
         
            +
              # returns: a SynNode object if successful, else nil
         
     | 
| 
      
 924 
     | 
    
         
            +
              def MiniparInterpreter.head_terminal(node)
                # A terminal node is its own head terminal. For any other
                # node, the first child reached via a "Head" edge is used.
                #
                # returns: a node, or nil if there is no "Head" child
                return node if node.is_terminal?

                head_children = node.children_by_edgelabels(["Head"])
                head_children.first
              end
         
     | 
| 
      
 931 
     | 
    
         
            +
             
     | 
| 
      
 932 
     | 
    
         
            +
              ###
         
     | 
| 
      
 933 
     | 
    
         
            +
              # voice
         
     | 
| 
      
 934 
     | 
    
         
            +
              #
         
     | 
| 
      
 935 
     | 
    
         
            +
              # given a constituent, return 
         
     | 
| 
      
 936 
     | 
    
         
            +
              # - "active"/"passive" if it is a verb
         
     | 
| 
      
 937 
     | 
    
         
            +
              # - nil, else
         
     | 
| 
      
 938 
     | 
    
         
            +
              def MiniparInterpreter.voice(verb_node)
                # Heuristic voice detection. The checks run in order and
                # the first one that fires decides the result.
                #
                # returns: "passive", "active", or nil when the node's
                # part of speech is neither "V" nor "VBE"

                # Normalise the node via ensure_upper before inspecting it
                # (the original comment describes this as moving from an
                # added terminal up to its parent).
                verb_node = MiniparInterpreter.ensure_upper(verb_node)

                # only nodes tagged V or VBE have a voice
                verb_tags = ["V", "VBE"]
                return nil unless verb_tags.include?(verb_node.part_of_speech())

                # 1. an outgoing "by_subj" edge signals passive
                by_subjects = verb_node.children_by_edgelabels(["by_subj"])
                return "passive" unless by_subjects.empty?

                # 2. an outgoing "be" edge signals passive, unless the
                #    word ends in "ing" ("be ...ing")
                has_be_child = !verb_node.children_by_edgelabels(["be"]).empty?
                return "passive" if has_be_child && verb_node.word !~ /ing$/

                # 3. an incoming "vrel" edge signals passive
                return "passive" if verb_node.parent_label() == "vrel"

                # 4. "obj" child whose antecedent is the "s" child: passive
                object_child = verb_node.children_by_edgelabels(["obj"]).first
                if object_child
                  subject_child = verb_node.children_by_edgelabels(["s"]).first
                  if subject_child &&
                      object_child.get_f("antecedent") == subject_child
                    return "passive"
                  end
                end

                # nothing pointed to passive
                "active"
              end
         
     | 
| 
      
 984 
     | 
    
         
            +
             
     | 
| 
      
 985 
     | 
    
         
            +
              ###
         
     | 
| 
      
 986 
     | 
    
         
            +
              # gfs
         
     | 
| 
      
 987 
     | 
    
         
            +
              #
         
     | 
| 
      
 988 
     | 
    
         
            +
              # grammatical functions of a constituent:
         
     | 
| 
      
 989 
     | 
    
         
            +
              # 
         
     | 
| 
      
 990 
     | 
    
         
            +
              # returns: a list of pairs [relation(string), node(SynNode)]
         
     | 
| 
      
 991 
     | 
    
         
            +
              # where <node> stands in the relation <relation> to the parameter
         
     | 
| 
      
 992 
     | 
    
         
            +
              # that the method was called with
         
     | 
| 
      
 993 
     | 
    
         
            +
              def MiniparInterpreter.gfs(start_node,    # SynNode
                                         sent)    # SalsaTigerSentence
                # Grammatical functions of start_node.
                #
                # start_node: node whose outgoing edges are inspected
                #             (first normalised via ensure_upper)
                # sent:       sentence object used to resolve antecedent
                #             ids through syn_node_with_id
                #
                # returns: list of pairs [relation(string), node], one per
                # outgoing edge that carries a grammatical function

                start_node = MiniparInterpreter.ensure_upper(start_node)

                # edge labels that do not carry a grammatical function;
                # "Head" is the head of the target node itself
                non_gf_labels = ["Head", "-", "aux", "have", "be"]

                retv = start_node.children_with_edgelabel.reject { |edgelabel, node|
                  non_gf_labels.include? edgelabel
                }.map { |edgelabel, node|

                  # Follow "antecedent" links (empty nodes, relative
                  # pronouns) to the node they finally refer to.
                  # Remember every id already followed: a cyclic or
                  # self-referring antecedent chain would otherwise keep
                  # this loop running forever.
                  followed_ids = {}
                  while (ant_id = node.get_attribute("antecedent"))
                    break if followed_ids[ant_id]
                    followed_ids[ant_id] = true

                    new_node = sent.syn_node_with_id(ant_id)
                    # id matching no node: stop seeking, keep current node
                    break unless new_node
                    node = new_node
                  end

                  # PP, i.e. edge label "mod" leading to a "Prep" node:
                  # append the preposition to the edge label. The node
                  # itself is kept as it is.
                  if edgelabel == "mod" && node.part_of_speech() == "Prep"
                    edgelabel = edgelabel + "-" + node.word().to_s
                  end

                  [edgelabel, node]
                }

                # Duplicate entries: "s" is often coreferent with "subj"
                # (active voice) or "obj" (passive voice). Drop the "s"
                # entry when it points to the same node as its counterpart.
                # The voice is computed once and reused for both cases.
                verb_voice = MiniparInterpreter.voice(start_node)
                counterpart_label =
                  if verb_voice == "active"
                    "subj"
                  elsif verb_voice == "passive"
                    "obj"
                  end

                if counterpart_label
                  s_entry = retv.assoc("s")
                  counterpart = retv.assoc(counterpart_label)
                  if s_entry && counterpart && s_entry.last == counterpart.last
                    retv.delete(s_entry)
                  end
                end

                return retv
              end
         
     | 
| 
      
 1051 
     | 
    
         
            +
             
     | 
| 
      
 1052 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1053 
     | 
    
         
            +
              # informative_content_node
         
     | 
| 
      
 1054 
     | 
    
         
            +
              #
         
     | 
| 
      
 1055 
     | 
    
         
            +
              # for most constituents: the head
         
     | 
| 
      
 1056 
     | 
    
         
            +
              # for a PP, the NP
         
     | 
| 
      
 1057 
     | 
    
         
            +
              # for an SBAR, the VP
         
     | 
| 
      
 1058 
     | 
    
         
            +
              # for a VP, the embedded VP 
         
     | 
| 
      
 1059 
     | 
    
         
            +
              def MiniparInterpreter.informative_content_node(node)
                # Pick the node that carries the informative content of
                # the given constituent (after normalising via ensure_upper):
                # - "Prep" node: first child on a "pcomp-n",
                #   "vpsc_pcomp-c" or "pcomp-c" edge
                # - "SentAdjunct" node: first child on a "comp1" edge
                # - node without a word: first "i" child, else first "nn"
                #   child, else its "antecedent" feature
                # - any other node: nil
                # NOTE(review): the header comment says "for most
                # constituents: the head", but a node that has a word and
                # is neither Prep nor SentAdjunct yields nil here --
                # presumably callers fall back to the head themselves;
                # confirm.
                #
                # returns: a node, or nil if nothing suitable was found
                node = MiniparInterpreter.ensure_upper(node)
                pos = node.part_of_speech()

                if pos == "Prep"
                  # complement of the preposition; nil when there is none
                  complements = node.children_by_edgelabels(["pcomp-n",
                                                             "vpsc_pcomp-c",
                                                             "pcomp-c"])
                  return complements.first
                end

                if pos == "SentAdjunct"
                  # complement of the adjunct; nil when there is none
                  return node.children_by_edgelabels(["comp1"]).first
                end

                word = node.word()
                return nil unless word.nil? || word.empty?

                # no word for this node: use a child instead,
                # trying "i" edges before "nn" edges
                ["i", "nn"].each do |label|
                  candidates = node.children_by_edgelabels([label])
                  return candidates.first if candidates.length() > 0
                end

                # no usable child either: try the antecedent
                antecedent = node.get_f("antecedent")
                antecedent ? antecedent : nil
              end
         
     | 
| 
      
 1134 
     | 
    
         
            +
             
     | 
| 
      
 1135 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1136 
     | 
    
         
            +
              # path_between
         
     | 
| 
      
 1137 
     | 
    
         
            +
              #
         
     | 
| 
      
 1138 
     | 
    
         
            +
              # construct path in syntactic structure between two nodes, 
         
     | 
| 
      
 1139 
     | 
    
         
            +
              # using 
         
     | 
| 
      
 1140 
     | 
    
         
            +
              # - node labels
         
     | 
| 
      
 1141 
     | 
    
         
            +
              # - edge labels
         
     | 
| 
      
 1142 
     | 
    
         
            +
              # - direction Up, Down
         
     | 
| 
      
 1143 
     | 
    
         
            +
              #
         
     | 
| 
      
 1144 
     | 
    
         
            +
              # use_nontree_edges: set to true to use coreference edges
         
     | 
| 
      
 1145 
     | 
    
         
            +
              # and other non-tree edges returned by the parser
         
     | 
| 
      
 1146 
     | 
    
         
            +
              # in path computation. 
         
     | 
| 
      
 1147 
     | 
    
         
            +
              #
         
     | 
| 
      
 1148 
     | 
    
         
            +
              # returns: Path object
         
     | 
| 
      
 1149 
     | 
    
         
            +
              def MiniparInterpreter.path_between(from_node, # SynNode
                                                  to_node,   # SynNode
                                                  use_nontree_edges = false) # boolean
                # Path in the syntactic structure from from_node to
                # to_node; both are first normalised via ensure_upper.
                #
                # use_nontree_edges false: delegate to the inherited
                #   path_between.
                # use_nontree_edges true: walk the nodes yielded by
                #   each_reachable_node and return the first path offered
                #   for to_node.
                # NOTE(review): if to_node is never yielded, the result is
                # whatever each_reachable_node itself returns -- confirm
                # that callers can cope with that.
                from_node = MiniparInterpreter.ensure_upper(from_node)
                to_node = MiniparInterpreter.ensure_upper(to_node)

                return super(from_node, to_node) unless use_nontree_edges

                MiniparInterpreter.each_reachable_node(from_node) do |node, ant, paths, prev|
                  return paths.first if node == to_node

                  # each_reachable_node requires a boolean to determine
                  # whether to continue the path beyond node
                  true
                end
              end
         
     | 
| 
      
 1167 
     | 
    
         
            +
             
     | 
| 
      
 1168 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1169 
     | 
    
         
            +
              # surrounding_nodes:
         
     | 
| 
      
 1170 
     | 
    
         
            +
              #
         
     | 
| 
      
 1171 
     | 
    
         
            +
              # construct paths in syntactic structure between a node and each of its neighbors
         
     | 
| 
      
 1172 
     | 
    
         
            +
              # path construction as in path_between.
         
     | 
| 
      
 1173 
     | 
    
         
            +
              # Neighbors: parent, child, plus potentially neighbors by nontree edges
         
     | 
| 
      
 1174 
     | 
    
         
            +
              # use_nontree_edges: again, same as in path_between
         
     | 
| 
      
 1175 
     | 
    
         
            +
              #
         
     | 
| 
      
 1176 
     | 
    
         
            +
              # returns: list of pairs [neighbor(SynNode), path(Path)]
         
     | 
| 
      
 1177 
     | 
    
         
            +
              def MiniparInterpreter.surrounding_nodes(node, # SynNode
                                                       use_nontree_edges = false) # boolean
                # Start from the [neighbor, path] pairs delivered by the parent
                # implementation.
                direct_pairs = super(node, use_nontree_edges)

                # Walk the chain of "antecedent" features hanging off each neighbor.
                # Every node on that chain is reported with the same path as the
                # neighbor the chain started from.
                antecedent_pairs = []
                direct_pairs.each do |start_node, path|
                  current = start_node
                  while (antecedent = current.get_f("antecedent"))
                    antecedent_pairs << [antecedent, path]
                    current = antecedent
                  end
                end

                # Parent-supplied pairs first, antecedent pairs appended after them.
                direct_pairs + antecedent_pairs
              end
         
     | 
| 
      
 1190 
     | 
    
         
            +
             
     | 
| 
      
 1191 
     | 
    
         
            +
             
     | 
| 
      
 1192 
     | 
    
         
            +
            #   ###
         
     | 
| 
      
 1193 
     | 
    
         
            +
            #   # main node of expression
         
     | 
| 
      
 1194 
     | 
    
         
            +
            #   #
         
     | 
| 
      
 1195 
     | 
    
         
            +
            #   # 2nd argument non-nil:
         
     | 
| 
      
 1196 
     | 
    
         
            +
            #   # don't handle multiword expressions beyond verbs with separate particles
         
     | 
| 
      
 1197 
     | 
    
         
            +
            #   # 
         
     | 
| 
      
 1198 
     | 
    
         
            +
            #   # returns: SynNode, main node, if found
         
     | 
| 
      
 1199 
     | 
    
         
            +
            #   # else nil
         
     | 
| 
      
 1200 
     | 
    
         
            +
            #   def MiniparInterpreter.main_node_of_expr(nodelist, 
         
     | 
| 
      
 1201 
     | 
    
         
            +
            #                                            no_mwes = nil)
         
     | 
| 
      
 1202 
     | 
    
         
            +
             
     | 
| 
      
 1203 
     | 
    
         
            +
            #     nodelist = nodelist.map { |n| MiniparInterpreter.ensure_upper(n) }.uniq()
         
     | 
| 
      
 1204 
     | 
    
         
            +
             
     | 
| 
      
 1205 
     | 
    
         
            +
            #     # main reason we are overwriting the parent method:
         
     | 
| 
      
 1206 
     | 
    
         
            +
            #     # don't go to terminal nodes right away.
         
     | 
| 
      
 1207 
     | 
    
         
            +
            #     # If we have a single nonterminal, stay with it.
         
     | 
| 
      
 1208 
     | 
    
         
            +
            #     # Otherwise, use parent method
         
     | 
| 
      
 1209 
     | 
    
         
            +
            #     if nodelist.length() == 1
         
     | 
| 
      
 1210 
     | 
    
         
            +
            #       return nodelist.first
         
     | 
| 
      
 1211 
     | 
    
         
            +
            #     end
         
     | 
| 
      
 1212 
     | 
    
         
            +
             
     | 
| 
      
 1213 
     | 
    
         
            +
            #     return super(nodelist, no_mwes)
         
     | 
| 
      
 1214 
     | 
    
         
            +
            #   end
         
     | 
| 
      
 1215 
     | 
    
         
            +
             
     | 
| 
      
 1216 
     | 
    
         
            +
              ########
         
     | 
| 
      
 1217 
     | 
    
         
            +
              # max constituents:
         
     | 
| 
      
 1218 
     | 
    
         
            +
              # given a set of nodes, compute the maximal constituents
         
     | 
| 
      
 1219 
     | 
    
         
            +
              # that exactly cover them
         
     | 
| 
      
 1220 
     | 
    
         
            +
              #
         
     | 
| 
      
 1221 
     | 
    
         
            +
              # override the default: ignore empty terminals, both in nodeset
         
     | 
| 
      
 1222 
     | 
    
         
            +
              #  and in the nodes that are tested as potential maximal constituents
         
     | 
| 
      
 1223 
     | 
    
         
            +
              def MiniparInterpreter.max_constituents(nodeset, # Array:SynNode
                                                      sent,    # SalsaTigerSentence
                                                      idealize_maxconst = false) # boolean
                # Drop empty terminals from the input before delegating to the sentence.
                nonempty_nodes = nodeset.reject do |candidate|
                  MiniparInterpreter.empty_terminal?(candidate)
                end

                # Both sentence methods receive `true` as their last argument.
                unless idealize_maxconst
                  return sent.max_constituents_for_nodes(nonempty_nodes, true)
                end

                sent.max_constituents_smc(nonempty_nodes, idealize_maxconst, true)
              end
         
     | 
| 
      
 1234 
     | 
    
         
            +
             
     | 
| 
      
 1235 
     | 
    
         
            +
             
     | 
| 
      
 1236 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1237 
     | 
    
         
            +
              # for all nodes reachable from a given from_node:
         
     | 
| 
      
 1238 
     | 
    
         
            +
              # compute the path from from_node,
         
     | 
| 
      
 1239 
     | 
    
         
            +
              # using both tree edges and coreference edges
         
     | 
| 
      
 1240 
     | 
    
         
            +
              #
         
     | 
| 
      
 1241 
     | 
    
         
            +
              # compute a widening circle of nodes from from_node outward,
         
     | 
| 
      
 1242 
     | 
    
         
            +
              # following all antecedent links as 0-length paths.
         
     | 
| 
      
 1243 
     | 
    
         
            +
              #
         
     | 
| 
      
 1244 
     | 
    
         
            +
              # yields tuples
         
     | 
| 
      
 1245 
     | 
    
         
            +
              #  [
         
     | 
| 
      
 1246 
     | 
    
         
            +
              #   minipar node, 
         
     | 
| 
      
 1247 
     | 
    
         
            +
              #   array: other minipar node(s) reached from this one solely via antecedent edges,
         
     | 
| 
      
 1248 
     | 
    
         
            +
              #   array: minimal paths from start_node to this node as Path objects
         
     | 
| 
      
 1249 
     | 
    
         
            +
              #   minipar node 2: last stop on path from start_node to minipar_node
         
     | 
| 
      
 1250 
     | 
    
         
            +
              #  ]
         
     | 
| 
      
 1251 
     | 
    
         
            +
              def MiniparInterpreter.each_reachable_node(from_node)   # SynNode
                # Expands outward from from_node, one queued node at a time, and
                # yields (neighbor, antecedents, paths, node the neighbor was reached from)
                # for every neighbor that is not already known with a shorter path.
                # The block's return value decides whether the walk continues
                # beyond that neighbor.
                from_node = MiniparInterpreter.ensure_upper(from_node)

                # frontier: first-in-first-out queue of nodes still to be expanded
                frontier = [from_node]
                # paths_to: SynNode -> array of Path objects recorded for that node
                paths_to = { from_node => [Path.new(from_node)] }

                until frontier.empty?
                  # take the oldest entry off the queue
                  current = frontier.shift()

                  # One 4-tuple per neighbor of current:
                  # [ "D" (neighbor is a child) or "U" (neighbor is the parent),
                  #   the neighbor node,
                  #   edge label between current and the neighbor,
                  #   part of speech of the neighbor ]
                  steps = current.children.map do |child|
                    ["D", child, current.child_label(child), child.part_of_speech()]
                  end
                  if current.parent
                    steps.push(["U", current.parent,
                                current.parent_label(),
                                current.parent.part_of_speech()])
                  end

                  steps.each do |direction, new_node, edgelabel, nodelabel|
                    # Follow the "antecedent" chain (it may be longer than one link);
                    # bookkeeping is done on the node at the end of the chain.
                    target = new_node
                    antecedents = []
                    while (antecedent = target.get_f("antecedent"))
                      antecedents << antecedent
                      target = antecedent
                    end

                    # Already recorded with a strictly shorter path? Then skip.
                    # All paths stored for one node have the same length, so
                    # looking at the first one is enough.
                    known_paths = paths_to[target]
                    if known_paths &&
                       known_paths.first.length() < paths_to[current].first.length() + 1
                      next
                    end

                    # Extend every path that leads to current by the step to new_node.
                    extended_paths = paths_to[current].map do |prefix|
                      path = prefix.deep_clone
                      step_label =
                        if new_node.part_of_speech() == "Prep"
                          # prepositions contribute their lemma to the edge label
                          edgelabel + "-" + new_node.get_attribute("lemma")
                        else
                          edgelabel
                        end
                      path.add_last_step(direction, step_label, nodelabel, new_node)
                      path
                    end

                    # Record the new paths, creating the entry on first encounter.
                    (paths_to[target] ||= Array.new).concat(extended_paths)

                    keep_going = yield(new_node, antecedents, extended_paths, current)

                    # Queue the end of the antecedent chain unless the caller
                    # declined or it is already waiting in the queue.
                    frontier.push(target) if keep_going && !frontier.include?(target)
                  end # each parent or child of the current node
                end # until no queued nodes remain
              end
         
     | 
| 
      
 1335 
     | 
    
         
            +
             
     | 
| 
      
 1336 
     | 
    
         
            +
              #####################
         
     | 
| 
      
 1337 
     | 
    
         
            +
              private
         
     | 
| 
      
 1338 
     | 
    
         
            +
             
     | 
| 
      
 1339 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1340 
     | 
    
         
            +
              # auxiliaries and modals share this characteristic
         
     | 
| 
      
 1341 
     | 
    
         
            +
              def MiniparInterpreter.aux_or_modal?(node)
                # True only if the (upper) node hangs off its parent by a
                # "be", "have" or "aux" edge and that parent has category "verb".
                upper = MiniparInterpreter.ensure_upper(node)

                edge = upper.parent_label()
                return false unless edge
                return false unless ["be", "have", "aux"].include?(edge)

                mother = upper.parent()
                return false unless mother

                MiniparInterpreter.category(mother) == "verb"
              end
         
     | 
| 
      
 1353 
     | 
    
         
            +
             
     | 
| 
      
 1354 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1355 
     | 
    
         
            +
              # given a node: if it has a Head child, return that,
         
     | 
| 
      
 1356 
     | 
    
         
            +
              # else return the node
         
     | 
| 
      
 1357 
     | 
    
         
            +
              def MiniparInterpreter.ensure_terminal(node)
                # Children attached via a "Head" edge; the first one wins.
                heads = node.children_by_edgelabels(["Head"])

                # No Head child at all: the node stands for itself.
                return node if !heads || heads.empty?

                heads.first
              end
         
     | 
| 
      
 1365 
     | 
    
         
            +
             
     | 
| 
      
 1366 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1367 
     | 
    
         
            +
              # given a node: if it is a terminal that is linked to its
         
     | 
| 
      
 1368 
     | 
    
         
            +
              # parent by a Head edge, return the parent,
         
     | 
| 
      
 1369 
     | 
    
         
            +
              # else return the node
         
     | 
| 
      
 1370 
     | 
    
         
            +
              def MiniparInterpreter.ensure_upper(node)
                # A node linked to its parent by a "Head" edge is represented
                # by that parent; any other node is returned unchanged.
                node.parent_label() == "Head" ? node.parent : node
              end
         
     | 
| 
      
 1377 
     | 
    
         
            +
             
     | 
| 
      
 1378 
     | 
    
         
            +
              ###
         
     | 
| 
      
 1379 
     | 
    
         
            +
              # is this an empty terminal?
         
     | 
| 
      
 1380 
     | 
    
         
            +
              def MiniparInterpreter.empty_terminal?(node)
                # Non-terminals are never "empty terminals"; word() is only
                # consulted for terminals.
                return false unless node.is_terminal?

                node.word().empty?
              end
         
     | 
| 
      
 1387 
     | 
    
         
            +
             
     | 
| 
      
 1388 
     | 
    
         
            +
            end
         
     |