frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,1324 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # Salsa packages
         
     | 
| 
      
 2 
     | 
    
         
            +
            require "frprep/ISO-8859-1"
         
     | 
| 
      
 3 
     | 
    
         
            +
            require "frprep/Parser"
         
     | 
| 
      
 4 
     | 
    
         
            +
            require "frprep/RegXML"
         
     | 
| 
      
 5 
     | 
    
         
            +
            require "frprep/SalsaTigerRegXML"
         
     | 
| 
      
 6 
     | 
    
         
            +
            require "frprep/SalsaTigerXMLHelper"
         
     | 
| 
      
 7 
     | 
    
         
            +
            require "frprep/TabFormat"
         
     | 
| 
      
 8 
     | 
    
         
            +
            require "frprep/ruby_class_extensions"
         
     | 
| 
      
 9 
     | 
    
         
            +
            require "frprep/AbstractSynInterface"
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            #############################################
         
     | 
| 
      
 12 
     | 
    
         
            +
            # Module FrprepHelper:
         
     | 
| 
      
 13 
     | 
    
         
            +
            #
         
     | 
| 
      
 14 
     | 
    
         
            +
            # diverse transformation methods for frprep.rb
         
     | 
| 
      
 15 
     | 
    
         
            +
            # moved over here to make the main file less crowded
         
     | 
| 
      
 16 
     | 
    
         
            +
            module FrprepHelper
         
     | 
| 
      
 17 
     | 
    
         
            +
              
         
     | 
| 
      
 18 
     | 
    
         
            +
              ####
         
     | 
| 
      
 19 
     | 
    
         
            +
              # transform a file to UTF-8 from a given encoding
         
     | 
| 
      
 20 
     | 
    
         
            +
              # Copy input_filename to output_filename line by line, passing every
              # line through UtfIso.from_iso_8859_1. For encoding "hex" each line is
              # first passed through Ampersand.hex_to_iso.
              #
              # input_filename::  string: name of input file
              # output_filename:: string: name of output file (created/truncated)
              # encoding::        string: "iso" or "hex"
              #
              # Raises RuntimeError if either file cannot be opened, and RuntimeError
              # ("Shouldn't be here.") on the first line read when encoding is
              # neither "iso" nor "hex".
              #
              # Both files are closed on every exit path, including errors.
              #
              # NOTE(review): Ampersand is not among the requires visible at the top
              # of this file -- confirm it is loaded elsewhere before the "hex"
              # branch is used.
              def self.to_utf8_file(input_filename, output_filename, encoding)
                infile = nil
                outfile = nil
                begin
                  infile = File.new(input_filename)
                  outfile = File.new(output_filename, "w")
                rescue
                  raise "Could not read #{input_filename}, or could not write to #{output_filename}."
                end

                infile.each_line do |line|
                  case encoding
                  when "iso"
                    outfile.puts UtfIso.from_iso_8859_1(line)
                  when "hex"
                    outfile.puts UtfIso.from_iso_8859_1(Ampersand.hex_to_iso(line))
                  else
                    raise "Shouldn't be here."
                  end
                end
              ensure
                # Runs on success and on every error path, so a failure while
                # opening the output or while converting no longer leaks handles.
                outfile.close if outfile
                infile.close if infile
              end
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
              ####
         
     | 
| 
      
 45 
     | 
    
         
            +
              # transform BNC format file to plaintext file
         
     | 
| 
      
 46 
     | 
    
         
            +
              # Write the sentences of a BNC-format file as plain text, one sentence
              # per output line.
              #
              # Only input lines that start (after optional whitespace) with an
              # <s n=...> tag are used; all other lines are dropped. In a sentence
              # line every <...> tag is replaced by a space, the result is stripped
              # and runs of spaces are squeezed, and then a fixed list of SGML
              # character entities is replaced by plain-ASCII approximations.
              #
              # input_filename::  string: name of input file
              # output_filename:: string: name of output file (created/truncated)
              #
              # Raises RuntimeError if either file cannot be opened.
              #
              # The patterns are entity *names* (e.g. "&mdash;", "&lsqb;",
              # "&dollar;"), not the characters they stand for. Literal patterns
              # such as /[/ (a Regexp syntax error), /$/ (matches end of line and
              # would append "$" to every sentence) or /&/ (would eat the "&" of the
              # entities handled after it) must not be used here.
              # NOTE(review): entity names were reconstructed from the replaced
              # characters (&frac12;, &lsqb;, &rsqb; in particular have synonyms) --
              # confirm against the BNC entity set in use.
              def self.bnc_to_plain_file(input_filename, output_filename)
                infile = nil
                outfile = nil
                begin
                  infile = File.new(input_filename)
                  outfile = File.new(output_filename, "w")
                rescue
                  raise "Could not read #{input_filename}, or could not write to #{output_filename}."
                end

                # [entity, replacement] pairs, applied in this order.
                replacements = [
                  ["&bquo;", '"'],
                  ["&equo;", '"'],
                  ["&mdash;", "-"],
                  ["&ndash;", "-"],
                  ["&percnt;", "%"],
                  ["&pound;", " pounds "],
                  ["&amp;", " and "],
                  ["&hellip;", "..."],
                  ["&copy;", "(copyright)"],
                  ["&bull;", "*"],
                  ["&dollar;", "$"],
                  ["&deg;", " degree "],
                  ["&frac12;", "1/2"],
                  ["&frac34;", "3/4"],
                  ["&lsqb;", "["],
                  ["&rsqb;", "]"],
                  ["&ins;", "i"],
                  ["&ft;", "ft"],
                  ["&rarr;", ">"],
                  ["&larr;", "<"]
                ]

                # Accented letters are reduced to their base letter:
                # base letter => accent suffixes of the entities to strip.
                accents = {
                  "a" => %w[acute uml grave tilde circ],
                  "A" => %w[acute uml grave tilde circ],
                  "e" => %w[acute grave circ uml],
                  "E" => %w[acute grave circ uml],
                  "i" => %w[acute grave circ uml],
                  "I" => %w[acute grave circ],
                  "o" => %w[acute grave circ uml],
                  "O" => %w[acute grave circ uml],
                  "u" => %w[acute grave circ uml],
                  "U" => %w[acute grave circ uml],
                  "y" => %w[uml],
                  "Y" => %w[uml],
                  "n" => %w[tilde],
                  "N" => %w[tilde],
                  "c" => %w[cedil],
                  "C" => %w[cedil]
                }
                accents.each do |letter, marks|
                  marks.each { |mark| replacements << ["&#{letter}#{mark};", letter] }
                end

                infile.each_line do |line|
                  # does this line contain a sentence?
                  next unless line =~ /^\s*<s\s+n=/

                  # remove all tags, replace by spaces, then remove superfluous spaces
                  textline = line.gsub(/<.+?>/, " ").strip.squeeze(" ")

                  # String patterns are matched literally by gsub!
                  replacements.each { |entity, plain| textline.gsub!(entity, plain) }

                  outfile.puts textline
                end
              ensure
                # Close both files on success and on every error path.
                outfile.close if outfile
                infile.close if infile
              end
         
     | 
| 
      
 152 
     | 
    
         
            +
             
     | 
| 
      
 153 
     | 
    
         
            +
             
     | 
| 
      
 154 
     | 
    
         
            +
              ####
         
     | 
| 
      
 155 
     | 
    
         
            +
              # transform plaintext file to Tab format file
         
     | 
| 
      
 156 
     | 
    
         
            +
              # Tokenise a plaintext file (one sentence per line) and write it out
              # through FNTabFormatFile.format_str, one token per output line.
              #
              # Each input line is split on whitespace. Punctuation attached to the
              # beginning of a word -- ( [ ` " - -- and to the end of a word --
              # , : ; - ` ? ! " . ) ] -- is split off, one output token per
              # punctuation character. Empty tokens are dropped. After the tokens of
              # each input line (also an empty one) a blank line is written.
              #
              # input_filename::  string: name of input file
              # output_filename:: string: name of output file (created/truncated)
              #
              # Raises RuntimeError if either file cannot be opened.
              def self.plain_to_tab_file(input_filename, output_filename)
                infile = nil
                outfile = nil
                begin
                  infile = File.new(input_filename)
                  outfile = File.new(output_filename, "w")
                rescue
                  raise "Could not read #{input_filename}, or could not write to #{output_filename}."
                end

                # NOTE(review): the suffix is "txt" without the dot, so for
                # "foo.txt" this yields "foo." and sentence IDs look like "foo._0".
                # Kept as is because consumers may rely on the existing IDs.
                filename_core = File.basename(input_filename, "txt")

                # Running sentence number for the sentence ID; it restarts at 0 on
                # every call, i.e. it is counted per input file.
                sentno = 0

                infile.each_line do |line|
                  # the ID is consumed even for lines that yield no tokens
                  sentid = filename_core + "_" + sentno.to_s
                  sentno += 1

                  tokens = []
                  line.split.each do |word|
                    rest = word

                    # punctuation at the beginning of the word
                    leading = /^([\(\[`\"-]+)(.*)$/.match(rest)
                    if leading
                      leading[1].each_char { |single_punct| tokens << single_punct }
                      rest = leading[2]
                    end

                    # punctuation at the end of the word
                    # The hyphen is escaped on purpose: written as ";-`" it forms
                    # a character range that also covers A-Z, which split words
                    # ending in capitals ("USA") into single letters.
                    trailing = /[,:;\-`?!\"\.\)\]]+$/.match(rest)
                    if trailing
                      tokens << trailing.pre_match # part before the match: the word
                      trailing[0].each_char { |single_punct| tokens << single_punct }
                    else
                      # no punctuation recognized
                      tokens << rest
                    end
                  end

                  # remove empty words
                  tokens.reject! { |token| token.nil? || token.strip.empty? }

                  # for each word, one line: the 'word' entry is the word, the
                  # 'sent_id' entry is the sentence ID; no other entries are set
                  tokens.each do |token|
                    outfile.puts FNTabFormatFile.format_str({
                                                              "word" => token,
                                                              "sent_id" => sentid
                                                            })
                  end
                  outfile.puts
                end
              ensure
                # Close both files on every exit path, including errors.
                outfile.close if outfile
                infile.close if infile
              end
         
     | 
| 
      
 227 
     | 
    
         
            +
             
     | 
| 
      
 228 
     | 
    
         
            +
              ###########
         
     | 
| 
      
 229 
     | 
    
         
            +
              # 
         
     | 
| 
      
 230 
     | 
    
         
            +
              # class method split_dir: 
         
     | 
| 
      
 231 
     | 
    
         
            +
              # read all files in one directory and produce chunk files *#{suffix} in outdir
         
     | 
| 
      
 232 
     | 
    
         
            +
              # with a certain number of sentences in them (sent_num).
         
     | 
| 
      
 233 
     | 
    
         
            +
              # Optionally, cut off all sentences longer than sent_leng lines at that length
         
     | 
| 
      
 234 
     | 
    
         
            +
              #
         
     | 
| 
      
 235 
     | 
    
         
            +
              # produces output files 0.<suffix>, 1.<suffix>, etc.
         
     | 
| 
      
 236 
     | 
    
         
            +
              #
         
     | 
| 
      
 237 
     | 
    
         
            +
              # assumes TabFormat sentences
         
     | 
| 
      
 238 
     | 
    
         
            +
              #
         
     | 
| 
      
 239 
     | 
    
         
            +
              # example: FrprepHelper.split_dir("/tmp/in","/tmp/out",".tab",2000,80)
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
      
 241 
     | 
    
         
            +
              # Redistribute the sentences of all files matching *<suffix> in indir
              # into chunk files <counter><suffix> in outdir (counter starts at 0).
              #
              # A sentence is a run of non-empty lines; an empty line (or the end
              # of an input file) terminates it. Runs of empty lines produce no
              # empty sentences.
              #
              # indir, outdir:: directory names; a trailing "/" is added if missing
              # suffix::        selects the input files and is appended to the
              #                 names of the chunk files
              # sent_num::      number of sentences per chunk file (the last chunk
              #                 file may contain fewer)
              # sent_leng::     maximum number of lines kept per sentence; longer
              #                 sentences are cut off after sent_leng lines and a
              #                 warning is printed to STDERR. nil means no limit.
              def FrprepHelper.split_dir(indir,
                                         outdir,
                                         suffix,
                                         sent_num,
                                         sent_leng=nil)

                indir += "/" unless indir[-1,1] == "/"
                outdir += "/" unless outdir[-1,1] == "/"

                outfile_counter = 0
                line_stack = Array.new # lines of the sentence currently being read
                sent_stack = Array.new # complete sentences waiting to be written

                # write all waiting sentences to the next chunk file,
                # each sentence followed by one empty line
                write_chunk = lambda {
                  File.open(outdir+outfile_counter.to_s+"#{suffix}","w") { |outfile|
                    sent_stack.each { |l_stack|
                      outfile.puts l_stack.join("\n")
                      outfile.puts
                    }
                  }
                  outfile_counter += 1
                  sent_stack = Array.new
                }

                # finish the sentence collected in line_stack (if there is one)
                # and start a new chunk file once sent_num sentences are waiting
                close_sentence = lambda {
                  # change (sp 15 01 07): just cut off sentence at sent_leng.
                  # Only warn if lines are actually going to be dropped.
                  if sent_leng && line_stack.length > sent_leng
                    STDERR.puts "Cutting off long sentence #{line_stack.last.split("\t").last}"
                    line_stack = line_stack.first(sent_leng)
                  end

                  # suppress multiple empty lines to avoid problems with
                  # the lemmatiser: only record non-empty sentences
                  unless line_stack.empty?
                    sent_stack << line_stack
                    line_stack = Array.new
                    write_chunk.call if sent_stack.length == sent_num
                  end
                }

                Dir[indir+"*#{suffix}"].each { |infilename|
                  STDERR.puts "Now splitting #{infilename}"

                  # File.foreach closes the input file even if an error occurs
                  File.foreach(infilename) { |line|
                    line.chomp!
                    if line.empty? # end of sentence
                      close_sentence.call
                    else # for any other line
                      line_stack << line
                    end
                  }

                  # the file may end without a final empty line: finish the
                  # pending sentence here so that it is neither merged with the
                  # first sentence of the next file nor lost after the last file
                  close_sentence.call
                }

                # the last remaining sentences
                write_chunk.call unless sent_stack.empty?
              end
         
     | 
| 
      
 311 
     | 
    
         
            +
             
     | 
| 
      
 312 
     | 
    
         
            +
              ####
         
     | 
| 
      
 313 
     | 
    
         
            +
              # note salsa targetlemma
         
     | 
| 
      
 314 
     | 
    
         
            +
              #
         
     | 
| 
      
 315 
     | 
    
         
            +
              # old_dir contains xml files whose name starts with the
         
     | 
| 
      
 316 
     | 
    
         
            +
              # target lemma for all frames in the file
         
     | 
| 
      
 317 
     | 
    
         
            +
              # record that target lemma in the <target> element of each frame
         
     | 
| 
      
 318 
     | 
    
         
            +
              # For each file old_dir/*.xml, write a file of the same name to
              # new_dir in which the target of every frame carries a "lemma"
              # attribute. The lemma is the part of the file name that precedes
              # the first "_" or ".".
              #
              # old_dir:: string ending in /, directory with the input files
              # new_dir:: string ending in /, directory for the changed files
              def FrprepHelper.note_salsa_targetlemma(old_dir, # string ending in /
                                                      new_dir) # string ending in /

                # each input file: extract target lemma from filename,
                # note this lemma in the <target> element of each frame
                Dir[old_dir + "*.xml"].each { |filename|
                  changedfilename = new_dir + File.basename(filename)

                  if File.basename(filename) =~ /^(.*?)[_\.]/
                    lemma = $1

                    infile = FilePartsParser.new(filename)
                    begin
                      # block form: the output file is closed even on errors
                      File.open(changedfilename, "w") { |outfile|
                        # write header
                        outfile.puts infile.head()

                        # iterate through sentences, handle them as
                        # SalsaTigerSentence objects
                        infile.scan_s() { |sent_string|
                          sent = SalsaTigerSentence.new(sent_string)
                          sent.each_frame { |frame|
                            frame.target.set_attribute("lemma", lemma)
                          }

                          # write changed sentence
                          outfile.puts sent.get()
                        } # each sentence

                        # write footer
                        outfile.puts infile.tail()
                      }
                    ensure
                      infile.close()
                    end

                  else
                    # couldn't determine lemma
                    # just copy the file.
                    # The multi-argument form of system() starts cp without a
                    # shell, so file names containing blanks or shell
                    # metacharacters are passed on unchanged.
                    system("cp", filename, changedfilename)
                  end
                }
              end
         
     | 
| 
      
 359 
     | 
    
         
            +
             
     | 
| 
      
 360 
     | 
    
         
            +
              ####
         
     | 
| 
      
 361 
     | 
    
         
            +
              # stxml_split_dir
         
     | 
| 
      
 362 
     | 
    
         
            +
              #
         
     | 
| 
      
 363 
     | 
    
         
            +
              # split SalsaTigerXML files into new files of given length,
         
     | 
| 
      
 364 
     | 
    
         
            +
              # without skipping sentences that are too long (that check is currently commented out)
         
     | 
| 
      
 365 
     | 
    
         
            +
              #
         
     | 
| 
      
 366 
     | 
    
         
            +
              # At the same time, sentences that occur several times (i.e. sentences which are 
         
     | 
| 
      
 367 
     | 
    
         
            +
              # annotated by SALSA for more than one predicate) are compacted into one occurrence
         
     | 
| 
      
 368 
     | 
    
         
            +
              # with combined semantics.
         
     | 
| 
      
 369 
     | 
    
         
            +
              #
         
     | 
| 
      
 370 
     | 
    
         
            +
              # assumes that all files in input_dir with 
         
     | 
| 
      
 371 
     | 
    
         
            +
              # extension .xml are SalsaTigerXML files
         
     | 
| 
      
 372 
     | 
    
         
            +
              def FrprepHelper.stxml_split_dir(input_dir, # string: input directory with STXML files
         
     | 
| 
      
 373 
     | 
    
         
            +
            				   split_dir, # string: output directory
         
     | 
| 
      
 374 
     | 
    
         
            +
            				   max_sentnum, # integer: max num of sentences per file
         
     | 
| 
      
 375 
     | 
    
         
            +
            				   max_sentlen) # integer: max num of terminals per sentence
         
     | 
| 
      
 376 
     | 
    
         
            +
                
         
     | 
| 
      
 377 
     | 
    
         
            +
                filenames = Dir[input_dir+"*.xml"].to_a
         
     | 
| 
      
 378 
     | 
    
         
            +
                
         
     | 
| 
      
 379 
     | 
    
         
            +
                graph_hash = Hash.new # for each sentence id, keep <s...</graph>
         
     | 
| 
      
 380 
     | 
    
         
            +
                frame_hash = Hash.new # for each sentence id , keep the <frame...  </frame> string
         
     | 
| 
      
 381 
     | 
    
         
            +
                uspfes_hash = Hash.new # for each sentence id, keep the uspfes stuff
         
     | 
| 
      
 382 
     | 
    
         
            +
                uspframes_hash = Hash.new # for each sentence id, keep the uspframes stuff
         
     | 
| 
      
 383 
     | 
    
         
            +
             
     | 
| 
      
 384 
     | 
    
         
            +
                ########################
         
     | 
| 
      
 385 
     | 
    
         
            +
                # Traverse of file(s): compute an index of all frames for each sentence, with unique identifiers 
         
     | 
| 
      
 386 
     | 
    
         
            +
                
         
     | 
| 
      
 387 
     | 
    
         
            +
                filenames.each {|filename|
         
     | 
| 
      
 388 
     | 
    
         
            +
                  
         
     | 
| 
      
 389 
     | 
    
         
            +
                  infile = FilePartsParser.new(filename)      
         
     | 
| 
      
 390 
     | 
    
         
            +
                  infile.scan_s {|sent_str|
         
     | 
| 
      
 391 
     | 
    
         
            +
                    
         
     | 
| 
      
 392 
     | 
    
         
            +
                    sentlen = 0
         
     | 
| 
      
 393 
     | 
    
         
            +
                    sent_str.delete("\n").scan(/<t\s/) { |occ| sentlen += 1}
         
     | 
| 
      
 394 
     | 
    
         
            +
                    if sentlen > max_sentlen
         
     | 
| 
      
 395 
     | 
    
         
            +
                      sent = RegXML.new(sent_str)
         
     | 
| 
      
 396 
     | 
    
         
            +
                      # revisit handling of long sentences
         
     | 
| 
      
 397 
     | 
    
         
            +
                      # $stderr.puts "I would have skipped overly long sentence " + sent.attributes["id"]+" but Sebastian forbade me.".to_s          
         
     | 
| 
      
 398 
     | 
    
         
            +
                      # next
         
     | 
| 
      
 399 
     | 
    
         
            +
                    end
         
     | 
| 
      
 400 
     | 
    
         
            +
             
     | 
| 
      
 401 
     | 
    
         
            +
                    # substitute old frame identifiers with new, unique ones
         
     | 
| 
      
 402 
     | 
    
         
            +
                    
         
     | 
| 
      
 403 
     | 
    
         
            +
                    # problem: we may have several frames per sentence, and need to keep track of them
         
     | 
| 
      
 404 
     | 
    
         
            +
                    # if we rename etc sxx_f1 to sxx_f2 and there is already a sxx_f2, then
         
     | 
| 
      
 405 
     | 
    
         
            +
                    # we cannot distinguish between these frames 
         
     | 
| 
      
 406 
     | 
    
         
            +
                    
         
     | 
| 
      
 407 
     | 
    
         
            +
                    # therefore, we substitute temporary identifiers until we have substituted 
         
     | 
| 
      
 408 
     | 
    
         
            +
                    # all ids with temporary ones, and re-substitute final ones at the end.
         
     | 
| 
      
 409 
     | 
    
         
            +
                    
         
     | 
| 
      
 410 
     | 
    
         
            +
                    this_frames = Array.new
         
     | 
| 
      
 411 
     | 
    
         
            +
                    
         
     | 
| 
      
 412 
     | 
    
         
            +
                    temp_subs = Array.new
         
     | 
| 
      
 413 
     | 
    
         
            +
                    final_subs = Array.new
         
     | 
| 
      
 414 
     | 
    
         
            +
                    
         
     | 
| 
      
 415 
     | 
    
         
            +
                    sent = RegXML.new(sent_str)
         
     | 
| 
      
 416 
     | 
    
         
            +
                    sentid = sent.attributes["id"].to_s
         
     | 
| 
      
 417 
     | 
    
         
            +
                    if sentid.nil?
         
     | 
| 
      
 418 
     | 
    
         
            +
                      STDERR.puts "[frprep] Warning: cannot find sentence id, skipping sentence:"
         
     | 
| 
      
 419 
     | 
    
         
            +
                      STDERR.puts sent_str
         
     | 
| 
      
 420 
     | 
    
         
            +
                      # strange sentence, no ID? skip
         
     | 
| 
      
 421 
     | 
    
         
            +
                      next
         
     | 
| 
      
 422 
     | 
    
         
            +
                    end
         
     | 
| 
      
 423 
     | 
    
         
            +
                    
         
     | 
| 
      
 424 
     | 
    
         
            +
                    unless frame_hash.key? sentid
         
     | 
| 
      
 425 
     | 
    
         
            +
                      frame_hash[sentid] = Array.new      
         
     | 
| 
      
 426 
     | 
    
         
            +
                      uspfes_hash[sentid] = Array.new
         
     | 
| 
      
 427 
     | 
    
         
            +
                      uspframes_hash[sentid] = Array.new
         
     | 
| 
      
 428 
     | 
    
         
            +
                    end
         
     | 
| 
      
 429 
     | 
    
         
            +
                    
         
     | 
| 
      
 430 
     | 
    
         
            +
                    # find everything up to and including the graph
         
     | 
| 
      
 431 
     | 
    
         
            +
                    sent_children = sent.children_and_text()
         
     | 
| 
      
 432 
     | 
    
         
            +
                    graph = sent_children.detect { |child| child.name == "graph" }
         
     | 
| 
      
 433 
     | 
    
         
            +
                    graph_hash[sentid] = "<s " + 
         
     | 
| 
      
 434 
     | 
    
         
            +
                                           sent.attributes.to_a.map { |at, val| "#{at}=\'#{val}\'" }.join(" ") +
         
     | 
| 
      
 435 
     | 
    
         
            +
                                           ">" + 
         
     | 
| 
      
 436 
     | 
    
         
            +
                                           graph.to_s
         
     | 
| 
      
 437 
     | 
    
         
            +
                    
         
     | 
| 
      
 438 
     | 
    
         
            +
                    # find the usp block
         
     | 
| 
      
 439 
     | 
    
         
            +
                    
         
     | 
| 
      
 440 
     | 
    
         
            +
                    sem = sent_children.detect { |child| child.name == "sem"}
         
     | 
| 
      
 441 
     | 
    
         
            +
                    usp = ""
         
     | 
| 
      
 442 
     | 
    
         
            +
                    if sem
         
     | 
| 
      
 443 
     | 
    
         
            +
                      usp = sem.children_and_text.detect { |child| child.name == "usp" }
         
     | 
| 
      
 444 
     | 
    
         
            +
                      usp = usp.to_s
         
     | 
| 
      
 445 
     | 
    
         
            +
                    end
         
     | 
| 
      
 446 
     | 
    
         
            +
                    
         
     | 
| 
      
 447 
     | 
    
         
            +
                    # find all frames
         
     | 
| 
      
 448 
     | 
    
         
            +
                    if sem
         
     | 
| 
      
 449 
     | 
    
         
            +
                      frames = sem.children_and_text.detect { |child| child.name == "frames" }
         
     | 
| 
      
 450 
     | 
    
         
            +
                      if frames
         
     | 
| 
      
 451 
     | 
    
         
            +
                        frames.children_and_text.each { |frame|
         
     | 
| 
      
 452 
     | 
    
         
            +
                          unless frame.name == "frame"
         
     | 
| 
      
 453 
     | 
    
         
            +
                            next
         
     | 
| 
      
 454 
     | 
    
         
            +
                          end
         
     | 
| 
      
 455 
     | 
    
         
            +
                          frameid = frame.attributes["id"]
         
     | 
| 
      
 456 
     | 
    
         
            +
                          
         
     | 
| 
      
 457 
     | 
    
         
            +
                          temp_frameid = "#{sentid}_temp_f#{frame_hash[sentid].length+this_frames.length+1}"
         
     | 
| 
      
 458 
     | 
    
         
            +
                          final_frameid = "#{sentid}_f#{frame_hash[sentid].length+this_frames.length+1}"
         
     | 
| 
      
 459 
     | 
    
         
            +
                          
         
     | 
| 
      
 460 
     | 
    
         
            +
                          temp_subs << [frameid,temp_frameid]
         
     | 
| 
      
 461 
     | 
    
         
            +
                          final_subs << [temp_frameid,final_frameid]
         
     | 
| 
      
 462 
     | 
    
         
            +
                          
         
     | 
| 
      
 463 
     | 
    
         
            +
                          this_frames << frame.to_s
         
     | 
| 
      
 464 
     | 
    
         
            +
                        }
         
     | 
| 
      
 465 
     | 
    
         
            +
                      end
         
     | 
| 
      
 466 
     | 
    
         
            +
                    end
         
     | 
| 
      
 467 
     | 
    
         
            +
                    
         
     | 
| 
      
 468 
     | 
    
         
            +
                    # now first rename all the frames to temporary names
         
     | 
| 
      
 469 
     | 
    
         
            +
                    
         
     | 
| 
      
 470 
     | 
    
         
            +
                    temp_subs.each {|orig_frameid, temp_frameid|
         
     | 
| 
      
 471 
     | 
    
         
            +
                      this_frames.map! {|frame_str|
         
     | 
| 
      
 472 
     | 
    
         
            +
            	#print "orig ", orig_frameid, " temp ", temp_frameid, "\n"
         
     | 
| 
      
 473 
     | 
    
         
            +
                        frame_str.gsub(orig_frameid,temp_frameid)
         
     | 
| 
      
 474 
     | 
    
         
            +
                      }
         
     | 
| 
      
 475 
     | 
    
         
            +
                  
         
     | 
| 
      
 476 
     | 
    
         
            +
                      usp.gsub!(orig_frameid,temp_frameid)
         
     | 
| 
      
 477 
     | 
    
         
            +
                    }
         
     | 
| 
      
 478 
     | 
    
         
            +
                    
         
     | 
| 
      
 479 
     | 
    
         
            +
                    # and re-rename the temporary names
         
     | 
| 
      
 480 
     | 
    
         
            +
                    
         
     | 
| 
      
 481 
     | 
    
         
            +
                    final_subs.each {|temp_frameid, final_frameid|
         
     | 
| 
      
 482 
     | 
    
         
            +
                      this_frames.map! {|frame_str|
         
     | 
| 
      
 483 
     | 
    
         
            +
                        frame_str.gsub(temp_frameid,final_frameid)
         
     | 
| 
      
 484 
     | 
    
         
            +
                      }
         
     | 
| 
      
 485 
     | 
    
         
            +
                      usp.gsub!(temp_frameid, final_frameid)
         
     | 
| 
      
 486 
     | 
    
         
            +
                    }
         
     | 
| 
      
 487 
     | 
    
         
            +
                    
         
     | 
| 
      
 488 
     | 
    
         
            +
                    # store frames in data structure
         
     | 
| 
      
 489 
     | 
    
         
            +
                    this_frames.each {|frame_str|
         
     | 
| 
      
 490 
     | 
    
         
            +
                      frame_hash[sentid] << frame_str
         
     | 
| 
      
 491 
     | 
    
         
            +
                    }
         
     | 
| 
      
 492 
     | 
    
         
            +
                    
         
     | 
| 
      
 493 
     | 
    
         
            +
                    # store uspfes in data structure
         
     | 
| 
      
 494 
     | 
    
         
            +
                    unless usp.empty?
         
     | 
| 
      
 495 
     | 
    
         
            +
                      usp_elt = RegXML.new(usp)
         
     | 
| 
      
 496 
     | 
    
         
            +
                      uspfes = usp_elt.children_and_text.detect { |child| child.name == "uspfes" }
         
     | 
| 
      
 497 
     | 
    
         
            +
                      uspfes.children_and_text.each { |child|
         
     | 
| 
      
 498 
     | 
    
         
            +
                        unless child.name == "uspblock"
         
     | 
| 
      
 499 
     | 
    
         
            +
                          next
         
     | 
| 
      
 500 
     | 
    
         
            +
                        end
         
     | 
| 
      
 501 
     | 
    
         
            +
                        uspfes_hash[sentid] << child.to_s
         
     | 
| 
      
 502 
     | 
    
         
            +
                      }
         
     | 
| 
      
 503 
     | 
    
         
            +
                    
         
     | 
| 
      
 504 
     | 
    
         
            +
                      # store uspframes in data structure
         
     | 
| 
      
 505 
     | 
    
         
            +
                      uspframes = usp_elt.children_and_text.detect { |child| child.name == "uspframes" }
         
     | 
| 
      
 506 
     | 
    
         
            +
                      uspframes.children_and_text.each { |child|
         
     | 
| 
      
 507 
     | 
    
         
            +
                        unless child.name == "uspblock"
         
     | 
| 
      
 508 
     | 
    
         
            +
                          next
         
     | 
| 
      
 509 
     | 
    
         
            +
                        end
         
     | 
| 
      
 510 
     | 
    
         
            +
                        uspframes_hash[sentid] << child.to_s
         
     | 
| 
      
 511 
     | 
    
         
            +
                      }
         
     | 
| 
      
 512 
     | 
    
         
            +
                    end
         
     | 
| 
      
 513 
     | 
    
         
            +
                  }
         
     | 
| 
      
 514 
     | 
    
         
            +
                }
         
     | 
| 
      
 515 
     | 
    
         
            +
             
     | 
| 
      
 516 
     | 
    
         
            +
                # now write everything in the data structure back to a file
         
     | 
| 
      
 517 
     | 
    
         
            +
                
         
     | 
| 
      
 518 
     | 
    
         
            +
                filecounter = 0
         
     | 
| 
      
 519 
     | 
    
         
            +
                sentcounter = 0
         
     | 
| 
      
 520 
     | 
    
         
            +
                outfile = nil
         
     | 
| 
      
 521 
     | 
    
         
            +
                sent_stack = Array.new
         
     | 
| 
      
 522 
     | 
    
         
            +
                    
         
     | 
| 
      
 523 
     | 
    
         
            +
                graph_hash.sort {|a,b| a[0].to_i <=> b[0].to_i}.each {|sentid,graph_str|
         
     | 
| 
      
 524 
     | 
    
         
            +
                  
         
     | 
| 
      
 525 
     | 
    
         
            +
                  if sentcounter == max_sentnum
         
     | 
| 
      
 526 
     | 
    
         
            +
                    outfile.puts SalsaTigerXMLHelper.get_footer
         
     | 
| 
      
 527 
     | 
    
         
            +
                    outfile.close
         
     | 
| 
      
 528 
     | 
    
         
            +
                    outfile = nil
         
     | 
| 
      
 529 
     | 
    
         
            +
                  end
         
     | 
| 
      
 530 
     | 
    
         
            +
                  
         
     | 
| 
      
 531 
     | 
    
         
            +
                  unless outfile
         
     | 
| 
      
 532 
     | 
    
         
            +
                    outfile = File.new(split_dir+filecounter.to_s+".xml","w")
         
     | 
| 
      
 533 
     | 
    
         
            +
                    outfile.puts SalsaTigerXMLHelper.get_header    
         
     | 
| 
      
 534 
     | 
    
         
            +
                    filecounter +=1
         
     | 
| 
      
 535 
     | 
    
         
            +
                    sentcounter = 0
         
     | 
| 
      
 536 
     | 
    
         
            +
                  end
         
     | 
| 
      
 537 
     | 
    
         
            +
                        
         
     | 
| 
      
 538 
     | 
    
         
            +
                  xml = Array.new
         
     | 
| 
      
 539 
     | 
    
         
            +
                  xml << graph_str
         
     | 
| 
      
 540 
     | 
    
         
            +
                  xml << "<sem>"
         
     | 
| 
      
 541 
     | 
    
         
            +
                  xml << "<globals>"
         
     | 
| 
      
 542 
     | 
    
         
            +
                  xml << "</globals>"
         
     | 
| 
      
 543 
     | 
    
         
            +
                  xml << "<frames>"
         
     | 
| 
      
 544 
     | 
    
         
            +
                  frame_hash[sentid].each {|frame_str|
         
     | 
| 
      
 545 
     | 
    
         
            +
                    xml << frame_str
         
     | 
| 
      
 546 
     | 
    
         
            +
                  }
         
     | 
| 
      
 547 
     | 
    
         
            +
                  xml << "</frames>"
         
     | 
| 
      
 548 
     | 
    
         
            +
                  xml << "<usp>"
         
     | 
| 
      
 549 
     | 
    
         
            +
                  xml << "<uspframes>"
         
     | 
| 
      
 550 
     | 
    
         
            +
                  uspframes_hash[sentid].each {|uspblock_str|
         
     | 
| 
      
 551 
     | 
    
         
            +
                    xml << uspblock_str
         
     | 
| 
      
 552 
     | 
    
         
            +
                  }
         
     | 
| 
      
 553 
     | 
    
         
            +
                  xml << "</uspframes>"
         
     | 
| 
      
 554 
     | 
    
         
            +
                  xml << "<uspfes>"
         
     | 
| 
      
 555 
     | 
    
         
            +
                  uspfes_hash[sentid].each {|uspblock_str|
         
     | 
| 
      
 556 
     | 
    
         
            +
                    xml << uspblock_str
         
     | 
| 
      
 557 
     | 
    
         
            +
                  }
         
     | 
| 
      
 558 
     | 
    
         
            +
                  xml << "</uspfes>"
         
     | 
| 
      
 559 
     | 
    
         
            +
                  xml << "</usp>"
         
     | 
| 
      
 560 
     | 
    
         
            +
                  xml << "</sem>"
         
     | 
| 
      
 561 
     | 
    
         
            +
                  xml << "</s>"
         
     | 
| 
      
 562 
     | 
    
         
            +
                  
         
     | 
| 
      
 563 
     | 
    
         
            +
                  outfile.puts xml.join("\n")
         
     | 
| 
      
 564 
     | 
    
         
            +
                  sentcounter += 1
         
     | 
| 
      
 565 
     | 
    
         
            +
                }
         
     | 
| 
      
 566 
     | 
    
         
            +
                
         
     | 
| 
      
 567 
     | 
    
         
            +
                if outfile 
         
     | 
| 
      
 568 
     | 
    
         
            +
                  outfile.puts SalsaTigerXMLHelper.get_footer
         
     | 
| 
      
 569 
     | 
    
         
            +
                  outfile.close
         
     | 
| 
      
 570 
     | 
    
         
            +
                  outfile = nil
         
     | 
| 
      
 571 
     | 
    
         
            +
                end   
         
     | 
| 
      
 572 
     | 
    
         
            +
                
         
     | 
| 
      
 573 
     | 
    
         
            +
              end
         
     | 
| 
      
 574 
     | 
    
         
            +
             
     | 
| 
      
 575 
     | 
    
         
            +
             
     | 
| 
      
 576 
     | 
    
         
            +
              ####
         
     | 
| 
      
 577 
     | 
    
         
            +
              # transform SalsaTigerXML file to Tab format file
         
     | 
| 
      
 578 
     | 
    
         
            +
              def FrprepHelper.stxml_to_tab_file(input_filename,   # string: name of input file
                                                 output_filename,  # string: name of output file
                                                 exp)              # FrprepConfigData
                # Reads the SalsaTigerXML file sentence by sentence (via
                # FilePartsParser#scan_s) and writes one tab-format line per
                # <t> child of the sentence's <terminals> element, carrying the
                # (unescaped) "word" attribute and the sentence ID. An empty line
                # is written after each sentence.
                #
                # Sentences without an "id" attribute get a timestamp-based ID;
                # sentences without a <terminals> element are reported on stderr
                # and skipped.
                #
                # Raises a RuntimeError if the output file cannot be opened.
                # Returns nil.
                infile = FilePartsParser.new(input_filename)
                begin
                  outfile = File.new(output_filename, "w")
                rescue StandardError
                  raise "Stxml to tab: could not write to tab file #{output_filename}"
                end

                begin
                  infile.scan_s { |sent_string|

                    # determine sentence ID
                    sentid = RegXML.new(sent_string).attributes["id"]
                    unless sentid
                      $stderr.puts "No sentence ID in sentence:\n "+ sent_string
                      $stderr.puts "Making a new one up."
                      sentid = Time.new().to_f.to_s
                    end

                    # find the <terminals> element (newlines removed so that the
                    # pattern can span what were several lines)
                    text = sent_string.delete("\n")[/<terminals[ >].+<\/terminals>/]
                    unless text
                      $stderr.puts "Warning: could not find terminals in sentence:"
                      $stderr.puts sent_string
                      $stderr.puts "Skipping"
                      next
                    end

                    # modified by ines, 27/08/08
                    # for Berkeley => convert ( ) to *LRB* *RRB*, and the
                    # two-character quote tokens `` and '' to a plain "
                    if exp.get("parser") == "berkeley"
                      # NOTE(review): the bracket patterns only match single-quoted
                      # attribute values, the quote patterns match both quote
                      # styles -- confirm that this asymmetry is intended.
                      text.gsub!(/word='\('/, "word='*LRB*'")
                      text.gsub!(/word='\)'/, "word='*RRB*'")
                      text.gsub!(/word=['"]``['"]/, "word='\"'")
                      text.gsub!(/word=['"]''['"]/, "word='\"'")
                    end

                    # write one tab line per terminal
                    terminals = RegXML.new(text)
                    terminals.children_and_text.each { |terminal|
                      # skip anything that is not a <t> element
                      next unless terminal.name == "t"

                      outfile.puts FNTabFormatFile.format_str({
                        "word" => SalsaTigerXMLHelper.unescape(terminal.attributes["word"]),
                        "sent_id" => sentid
                      })
                    } # each terminal

                    # empty line after each sentence
                    outfile.puts
                  } # each sentence
                ensure
                  # close the output file even if processing a sentence failed
                  outfile.close
                end

                nil
              end
         
     | 
| 
      
 640 
     | 
    
         
            +
             
     | 
| 
      
 641 
     | 
    
         
            +
              ###
         
     | 
| 
      
 642 
     | 
    
         
            +
              # add semantics from tab:
         
     | 
| 
      
 643 
     | 
    
         
            +
              #
         
     | 
| 
      
 644 
     | 
    
         
            +
              # add information about semantics from a FN tab sentence
         
     | 
| 
      
 645 
     | 
    
         
            +
              # to a SalsaTigerSentence object:
         
     | 
| 
      
 646 
     | 
    
         
            +
              # - frames (one frame per sentence)
         
     | 
| 
      
 647 
     | 
    
         
            +
              # - roles
         
     | 
| 
      
 648 
     | 
    
         
            +
              # - FrameNet grammatical functions
         
     | 
| 
      
 649 
     | 
    
         
            +
              # - FrameNet POS of target
         
     | 
| 
      
 650 
     | 
    
         
            +
              def FrprepHelper.add_semantics_from_tab(st_sent,  # SalsaTigerSentence object
         
     | 
| 
      
 651 
     | 
    
         
            +
            					  tab_sent, # FNTabFormatSentence object
         
     | 
| 
      
 652 
     | 
    
         
            +
                                                      mapping,  # hash: tab lineno -> array:SynNode
         
     | 
| 
      
 653 
     | 
    
         
            +
                                                      interpreter_class, # SynInterpreter class
         
     | 
| 
      
 654 
     | 
    
         
            +
                                                      exp)      # FrprepConfigData
         
     | 
| 
      
 655 
     | 
    
         
            +
             
     | 
| 
      
 656 
     | 
    
         
            +
                if tab_sent.nil?
         
     | 
| 
      
 657 
     | 
    
         
            +
                  # tab sentence not found
         
     | 
| 
      
 658 
     | 
    
         
            +
                  return
         
     | 
| 
      
 659 
     | 
    
         
            +
                end
         
     | 
| 
      
 660 
     | 
    
         
            +
             
     | 
| 
      
 661 
     | 
    
         
            +
                # iterate through frames in the tabsent
         
     | 
| 
      
 662 
     | 
    
         
            +
                frame_index = 0
         
     | 
| 
      
 663 
     | 
    
         
            +
                tab_sent.each_frame { |tab_frame_obj|
         
     | 
| 
      
 664 
     | 
    
         
            +
                  frame_name = tab_frame_obj.get_frame() # string
         
     | 
| 
      
 665 
     | 
    
         
            +
                    
         
     | 
| 
      
 666 
     | 
    
         
            +
                  if frame_name.nil? or frame_name =~ /^-*$/
         
     | 
| 
      
 667 
     | 
    
         
            +
            	# weird: a frame without a frame
         
     | 
| 
      
 668 
     | 
    
         
            +
            	$stderr.puts "Warning: frame entry without a frame in tab sentence #{st_sent.id}."
         
     | 
| 
      
 669 
     | 
    
         
            +
            	$stderr.puts "Skipping"
         
     | 
| 
      
 670 
     | 
    
         
            +
            	next
         
     | 
| 
      
 671 
     | 
    
         
            +
                  end
         
     | 
| 
      
 672 
     | 
    
         
            +
                    
         
     | 
| 
      
 673 
     | 
    
         
            +
                  frame_node = st_sent.add_frame(frame_name, tab_sent.get_sent_id() + "_f#{frame_index}")
         
     | 
| 
      
 674 
     | 
    
         
            +
                  frame_index += 1
         
     | 
| 
      
 675 
     | 
    
         
            +
                
         
     | 
| 
      
 676 
     | 
    
         
            +
                  # target
         
     | 
| 
      
 677 
     | 
    
         
            +
                  target_nodes = Array.new
         
     | 
| 
      
 678 
     | 
    
         
            +
                  tab_frame_obj.get_target_indices.each {|terminal_id| 
         
     | 
| 
      
 679 
     | 
    
         
            +
            	if mapping[terminal_id]
         
     | 
| 
      
 680 
     | 
    
         
            +
            	  target_nodes.concat mapping[terminal_id]
         
     | 
| 
      
 681 
     | 
    
         
            +
            	end
         
     | 
| 
      
 682 
     | 
    
         
            +
                  }
         
     | 
| 
      
 683 
     | 
    
         
            +
             
     | 
| 
      
 684 
     | 
    
         
            +
                  # let the interpreter class decide on how to determine the maximum constituents
         
     | 
| 
      
 685 
     | 
    
         
            +
                  target_maxnodes = interpreter_class.max_constituents(target_nodes, st_sent)
         
     | 
| 
      
 686 
     | 
    
         
            +
                  if target_maxnodes.empty?
         
     | 
| 
      
 687 
     | 
    
         
            +
            	# HIEr
         
     | 
| 
      
 688 
     | 
    
         
            +
            	STDERR.puts  "Warning: no target in frame entry, sentence #{st_sent.id}."
         
     | 
| 
      
 689 
     | 
    
         
            +
            	$stderr.puts "frame is #{frame_name}, frame no #{frame_index}"
         
     | 
| 
      
 690 
     | 
    
         
            +
            	$stderr.puts "Skipping."
         
     | 
| 
      
 691 
     | 
    
         
            +
            	$stderr.puts "target indices: " + tab_frame_obj.get_target_indices.join(", ")
         
     | 
| 
      
 692 
     | 
    
         
            +
                    #tab_sent.each_line { |line|
         
     | 
| 
      
 693 
     | 
    
         
            +
                    #  $stderr.puts line
         
     | 
| 
      
 694 
     | 
    
         
            +
            	#  $stderr.puts "--"
         
     | 
| 
      
 695 
     | 
    
         
            +
                    #}
         
     | 
| 
      
 696 
     | 
    
         
            +
            	next
         
     | 
| 
      
 697 
     | 
    
         
            +
                  end
         
     | 
| 
      
 698 
     | 
    
         
            +
                  frame_node.add_fe("target",target_maxnodes)
         
     | 
| 
      
 699 
     | 
    
         
            +
                    
         
     | 
| 
      
 700 
     | 
    
         
            +
                  # set features on target: target lemma, target POS
         
     | 
| 
      
 701 
     | 
    
         
            +
                  target_lemma = tab_frame_obj.get_target()
         
     | 
| 
      
 702 
     | 
    
         
            +
                  target_pos = nil
         
     | 
| 
      
 703 
     | 
    
         
            +
                  if target_lemma
         
     | 
| 
      
 704 
     | 
    
         
            +
            	if exp.get("origin") == "FrameNet"
         
     | 
| 
      
 705 
     | 
    
         
            +
            	  # FrameNet data: here the lemma in the tab file has the form
         
     | 
| 
      
 706 
     | 
    
         
            +
            	  # <lemma>.<POS>
         
     | 
| 
      
 707 
     | 
    
         
            +
            	  # separate the two
         
     | 
| 
      
 708 
     | 
    
         
            +
            	  if target_lemma =~ /^(.*)\.(.*)$/
         
     | 
| 
      
 709 
     | 
    
         
            +
            	    target_lemma = $1
         
     | 
| 
      
 710 
     | 
    
         
            +
            	    target_pos = $2
         
     | 
| 
      
 711 
     | 
    
         
            +
            	  end
         
     | 
| 
      
 712 
     | 
    
         
            +
            	end
         
     | 
| 
      
 713 
     | 
    
         
            +
            	frame_node.target.set_attribute("lemma", target_lemma)
         
     | 
| 
      
 714 
     | 
    
         
            +
                    if target_pos
         
     | 
| 
      
 715 
     | 
    
         
            +
                      frame_node.target.set_attribute("pos", target_pos)
         
     | 
| 
      
 716 
     | 
    
         
            +
                    end
         
     | 
| 
      
 717 
     | 
    
         
            +
                  end
         
     | 
| 
      
 718 
     | 
    
         
            +
                    
         
     | 
| 
      
 719 
     | 
    
         
            +
                  # roles, GF, PT
         
     | 
| 
      
 720 
     | 
    
         
            +
                  # synnode_markable_label: 
         
     | 
| 
      
 721 
     | 
    
         
            +
                  #   hash "role" | "gf" | "pt" -> SynNode -> array: label(string)
         
     | 
| 
      
 722 
     | 
    
         
            +
                  layer_synnode_label = Hash.new
         
     | 
| 
      
 723 
     | 
    
         
            +
                  ["gf", "pt", "role"].each {|layer|
         
     | 
| 
      
 724 
     | 
    
         
            +
            	termids2labels = tab_frame_obj.markables(layer)
         
     | 
| 
      
 725 
     | 
    
         
            +
             
     | 
| 
      
 726 
     | 
    
         
            +
            	unless layer_synnode_label[layer]
         
     | 
| 
      
 727 
     | 
    
         
            +
            	  layer_synnode_label[layer] = Hash.new
         
     | 
| 
      
 728 
     | 
    
         
            +
            	end
         
     | 
| 
      
 729 
     | 
    
         
            +
             
     | 
| 
      
 730 
     | 
    
         
            +
            	termids2labels.each {|terminal_indices, label|
         
     | 
| 
      
 731 
     | 
    
         
            +
            	  terminal_indices.each { |t_i|
         
     | 
| 
      
 732 
     | 
    
         
            +
             
     | 
| 
      
 733 
     | 
    
         
            +
            	    if (nodes = mapping[t_i])
         
     | 
| 
      
 734 
     | 
    
         
            +
             
     | 
| 
      
 735 
     | 
    
         
            +
            	      nodes.each { |node|
         
     | 
| 
      
 736 
     | 
    
         
            +
            		unless layer_synnode_label[layer][node]
         
     | 
| 
      
 737 
     | 
    
         
            +
            		  layer_synnode_label[layer][node] = Array.new
         
     | 
| 
      
 738 
     | 
    
         
            +
            		end
         
     | 
| 
      
 739 
     | 
    
         
            +
             
     | 
| 
      
 740 
     | 
    
         
            +
            		layer_synnode_label[layer][node] << label
         
     | 
| 
      
 741 
     | 
    
         
            +
            	      } # each node that t_i maps to
         
     | 
| 
      
 742 
     | 
    
         
            +
            	    end # if t_i maps to anything
         
     | 
| 
      
 743 
     | 
    
         
            +
             
     | 
| 
      
 744 
     | 
    
         
            +
            	  } # each terminal index
         
     | 
| 
      
 745 
     | 
    
         
            +
            	} # each mapping terminal indices -> label
         
     | 
| 
      
 746 
     | 
    
         
            +
                  } # each layer
         
     | 
| 
      
 747 
     | 
    
         
            +
             
     | 
| 
      
 748 
     | 
    
         
            +
                  # 'stuff' (Support and other things)
         
     | 
| 
      
 749 
     | 
    
         
            +
                  layer_synnode_label["stuff"] = Hash.new
         
     | 
| 
      
 750 
     | 
    
         
            +
                  tab_frame_obj.each_line_parsed { |line_obj|
         
     | 
| 
      
 751 
     | 
    
         
            +
                    if (label = line_obj.get("stuff")) != "-"
         
     | 
| 
      
 752 
     | 
    
         
            +
                      if (nodes = mapping[line_obj.get("lineno")])
         
     | 
| 
      
 753 
     | 
    
         
            +
                        nodes.each { |node|
         
     | 
| 
      
 754 
     | 
    
         
            +
                          unless layer_synnode_label["stuff"][node]
         
     | 
| 
      
 755 
     | 
    
         
            +
                            layer_synnode_label["stuff"][node] = Array.new
         
     | 
| 
      
 756 
     | 
    
         
            +
                          end
         
     | 
| 
      
 757 
     | 
    
         
            +
                          layer_synnode_label["stuff"][node] << label 
         
     | 
| 
      
 758 
     | 
    
         
            +
                        }
         
     | 
| 
      
 759 
     | 
    
         
            +
                      end
         
     | 
| 
      
 760 
     | 
    
         
            +
                    end
         
     | 
| 
      
 761 
     | 
    
         
            +
                  }
         
     | 
| 
      
 762 
     | 
    
         
            +
             
     | 
| 
      
 763 
     | 
    
         
            +
                  # reencode:
         
     | 
| 
      
 764 
     | 
    
         
            +
                  #  hash role_label(string) -> array of tuples [synnodes, gflabels, ptlabels]
         
     | 
| 
      
 765 
     | 
    
         
            +
                  #   synnodes: array:SynNode.  gflabels, ptlabels: array:String
         
     | 
| 
      
 766 
     | 
    
         
            +
                  #
         
     | 
| 
      
 767 
     | 
    
         
            +
                  # note that in this step, any gf or pt labels that have been
         
     | 
| 
      
 768 
     | 
    
         
            +
                  # assigned to a SynNode that has not also been assigned a role
         
     | 
| 
      
 769 
     | 
    
         
            +
                  # will be lost
         
     | 
| 
      
 770 
     | 
    
         
            +
                  role2nodes_labels = Hash.new
         
     | 
| 
      
 771 
     | 
    
         
            +
                  layer_synnode_label["role"].each_pair { |synnode, labels|
         
     | 
| 
      
 772 
     | 
    
         
            +
            	labels.each { | rolelabel|
         
     | 
| 
      
 773 
     | 
    
         
            +
            	  unless role2nodes_labels[rolelabel]
         
     | 
| 
      
 774 
     | 
    
         
            +
            	    role2nodes_labels[rolelabel] = Array.new
         
     | 
| 
      
 775 
     | 
    
         
            +
            	  end
         
     | 
| 
      
 776 
     | 
    
         
            +
            	  
         
     | 
| 
      
 777 
     | 
    
         
            +
            	  role2nodes_labels[rolelabel] << [
         
     | 
| 
      
 778 
     | 
    
         
            +
            	    synnode, 
         
     | 
| 
      
 779 
     | 
    
         
            +
            	    layer_synnode_label["gf"][synnode], 
         
     | 
| 
      
 780 
     | 
    
         
            +
            	    layer_synnode_label["pt"][synnode]
         
     | 
| 
      
 781 
     | 
    
         
            +
            	  ]
         
     | 
| 
      
 782 
     | 
    
         
            +
            	} # each role label
         
     | 
| 
      
 783 
     | 
    
         
            +
                  } # each pair SynNode/role labels
         
     | 
| 
      
 784 
     | 
    
         
            +
             
     | 
| 
      
 785 
     | 
    
         
            +
                  # reencode "stuff", but only the support cases
         
     | 
| 
      
 786 
     | 
    
         
            +
                  role2nodes_labels["Support"] = Array.new()
         
     | 
| 
      
 787 
     | 
    
         
            +
             
     | 
| 
      
 788 
     | 
    
         
            +
                  layer_synnode_label["stuff"].each_pair { |synnode, labels|
         
     | 
| 
      
 789 
     | 
    
         
            +
                    labels.each { |stufflabel|
         
     | 
| 
      
 790 
     | 
    
         
            +
                      if stufflabel =~ /Supp/
         
     | 
| 
      
 791 
     | 
    
         
            +
                        # some sort of support
         
     | 
| 
      
 792 
     | 
    
         
            +
                        role2nodes_labels["Support"] << [synnode, nil, nil]
         
     | 
| 
      
 793 
     | 
    
         
            +
                      end
         
     | 
| 
      
 794 
     | 
    
         
            +
                    }
         
     | 
| 
      
 795 
     | 
    
         
            +
                  }
         
     | 
| 
      
 796 
     | 
    
         
            +
             
     | 
| 
      
 797 
     | 
    
         
            +
                  ##
         
     | 
| 
      
 798 
     | 
    
         
            +
                  # each role label: 
         
     | 
| 
      
 799 
     | 
    
         
            +
                  # make FeNode for the current frame
         
     | 
| 
      
 800 
     | 
    
         
            +
                  role2nodes_labels.each_pair { |rolelabel, node_gf_pt|
         
     | 
| 
      
 801 
     | 
    
         
            +
            	
         
     | 
| 
      
 802 
     | 
    
         
            +
            	# get list of syn nodes, GF and PT labels for this role
         
     | 
| 
      
 803 
     | 
    
         
            +
            	# shortcut for GF and PT labels: take any labels that have
         
     | 
| 
      
 804 
     | 
    
         
            +
            	# been assigned for _some_ Synnode of this role
         
     | 
| 
      
 805 
     | 
    
         
            +
            	synnodes = node_gf_pt.map { |ngp| ngp[0] } 
         
     | 
| 
      
 806 
     | 
    
         
            +
            	gflabels = node_gf_pt.map { |ngp| ngp[1] }.compact.flatten.uniq
         
     | 
| 
      
 807 
     | 
    
         
            +
            	ptlabels = node_gf_pt.map { |ngp| ngp[2] }.compact.flatten.uniq
         
     | 
| 
      
 808 
     | 
    
         
            +
             
     | 
| 
      
 809 
     | 
    
         
            +
             
     | 
| 
      
 810 
     | 
    
         
            +
            	# let the interpreter class decide on how to 
         
     | 
| 
      
 811 
     | 
    
         
            +
            	# determine the maximum constituents
         
     | 
| 
      
 812 
     | 
    
         
            +
            	maxnodes = interpreter_class.max_constituents(synnodes, st_sent)
         
     | 
| 
      
 813 
     | 
    
         
            +
             
     | 
| 
      
 814 
     | 
    
         
            +
            	fe_node = st_sent.add_fe(frame_node, rolelabel, maxnodes)
         
     | 
| 
      
 815 
     | 
    
         
            +
            	unless gflabels.empty?
         
     | 
| 
      
 816 
     | 
    
         
            +
            	  fe_node.set_attribute("gf", gflabels.join(","))
         
     | 
| 
      
 817 
     | 
    
         
            +
            	end
         
     | 
| 
      
 818 
     | 
    
         
            +
            	unless ptlabels.empty?
         
     | 
| 
      
 819 
     | 
    
         
            +
            	  fe_node.set_attribute("pt", ptlabels.join(","))
         
     | 
| 
      
 820 
     | 
    
         
            +
            	end
         
     | 
| 
      
 821 
     | 
    
         
            +
                  } # each role label
         
     | 
| 
      
 822 
     | 
    
         
            +
                } # each frame
         
     | 
| 
      
 823 
     | 
    
         
            +
              end
         
     | 
| 
      
 824 
     | 
    
         
            +
             
     | 
| 
      
 825 
     | 
    
         
            +
             
     | 
| 
      
 826 
     | 
    
         
            +
              ######
         
     | 
| 
      
 827 
     | 
    
         
            +
              # handle multiword targets:
         
     | 
| 
      
 828 
     | 
    
         
            +
              # if you find a verb with a separate prefix, 
         
     | 
| 
      
 829 
     | 
    
         
            +
              # change the verb's lemma information accordingly
         
     | 
| 
      
 830 
     | 
    
         
            +
              # and add an attribute "other_words" to the verb node
         
     | 
| 
      
 831 
     | 
    
         
            +
              # pointing to the other node
         
     | 
| 
      
 832 
     | 
    
         
            +
              #
         
     | 
| 
      
 833 
     | 
    
         
            +
              # In general, it will be assumed that "other_words" contains
         
     | 
| 
      
 834 
     | 
    
         
            +
              # a list of node IDs for other nodes belonging to the same 
         
     | 
| 
      
 835 
     | 
    
         
            +
              # group, node IDs separated by spaces, and that 
         
     | 
| 
      
 836 
     | 
    
         
            +
              # each node of a group has the "other_words" attribute.
         
     | 
| 
      
 837 
     | 
    
         
            +
              #
         
     | 
| 
      
 838 
     | 
    
         
            +
              # Marks multiword targets (verb + separate particle) on the terminals
              # of a sentence.
              #
              # For every [verb, particle] group reported by FrprepHelper.group_words,
              # both nodes get an "other_words" attribute holding the ID of the other
              # node, and the verb's "lemma" attribute is rewritten to include the
              # particle's lemma (only if both nodes have a lemma).
              #
              # Does nothing if sent is nil.
              # Raises a RuntimeError if group_words reports an unknown group kind.
              def FrprepHelper.handle_multiword_targets(sent,        # SalsaTigerSentence object
                                                        interpreter, # SynInterpreter object
                                                        language)    # string: en, de
                return if sent.nil?

                # only content words and prepositions are candidates for grouping
                relevant_categories = [
                  "adj", "adv", "card", "noun", "part", "prep", "verb"
                ]
                candidates = sent.terminals.select do |terminal|
                  relevant_categories.include?(interpreter.category(terminal))
                end

                # group verbs with their separate particles
                # (other kinds of grouping could be added in group_words later)
                grouped = FrprepHelper.group_words(candidates, interpreter)

                # record the grouping as attributes on the terminals
                grouped.each do |kind, members|
                  # "none": the word does not belong to any group
                  next if kind == "none"

                  unless kind == "part"
                    raise "Shouldn't be here: unexpected description #{kind}"
                  end

                  # "part": members is a pair [verb, particle]
                  verb, particle = members

                  verb.set_attribute("other_words", particle.id())
                  particle.set_attribute("other_words", verb.id())

                  verb_lemma = verb.get_attribute("lemma")
                  particle_lemma = particle.get_attribute("lemma")
                  next unless verb_lemma && particle_lemma

                  full_lemma =
                    case language
                    when "de"
                      # German: prepend the particle to get the real lemma of the verb
                      particle_lemma + verb_lemma
                    else
                      # English and default: particle follows the verb lemma
                      # as a separate word
                      verb_lemma + " " + particle_lemma
                    end
                  verb.set_attribute("lemma", full_lemma)
                end
              end
         
     | 
| 
      
 900 
     | 
    
         
            +
             
     | 
| 
      
 901 
     | 
    
         
            +
              ########################
         
     | 
| 
      
 902 
     | 
    
         
            +
              # group_words
         
     | 
| 
      
 903 
     | 
    
         
            +
              #
         
     | 
| 
      
 904 
     | 
    
         
            +
              # auxiliary of handle_multiword_targets
         
     | 
| 
      
 905 
     | 
    
         
            +
              #
         
     | 
| 
      
 906 
     | 
    
         
            +
              # Group terminals: 
         
     | 
| 
      
 907 
     | 
    
         
            +
              # At the moment, just find separate prefixes and particles
         
     | 
| 
      
 908 
     | 
    
         
            +
              # for verbs
         
     | 
| 
      
 909 
     | 
    
         
            +
              #
         
     | 
| 
      
 910 
     | 
    
         
            +
              # returns: list of pairs [descr, nodes]
         
     | 
| 
      
 911 
     | 
    
         
            +
              # descr: string, "none" (no group), "part" (separate verb particle)
         
     | 
| 
      
 912 
     | 
    
         
            +
              # nodes: array:SynNode
         
     | 
| 
      
 913 
     | 
    
         
            +
              # Partitions terminals into groups; currently the only grouping is a
              # verb together with its separate particle, as determined by
              # interpreter.particle_of_verb.
              #
              # Returns an array of pairs [descr, group_nodes], where descr is
              # "part" (group_nodes is [verb, particle]) or "none" (group_nodes is
              # a one-element array). Nodes already placed in an earlier group are
              # not visited again.
              def FrprepHelper.group_words(nodes,       # array: SynNode
                                           interpreter) # SynInterpreter object
                groups = []  # result: list of [descr, nodes] pairs
                covered = [] # nodes that already belong to some group

                nodes.each do |terminal|
                  # this node was already consumed as part of an earlier group
                  next if covered.include?(terminal)

                  particle = interpreter.particle_of_verb(terminal, nodes)
                  if particle
                    groups << ["part", [terminal, particle]]
                    covered.push(terminal, particle)
                  else
                    groups << ["none", [terminal]]
                    covered.push(terminal)
                  end
                end

                groups
              end
         
     | 
| 
      
 938 
     | 
    
         
            +
             
     | 
| 
      
 939 
     | 
    
         
            +
             
     | 
| 
      
 940 
     | 
    
         
            +
              ######
         
     | 
| 
      
 941 
     | 
    
         
            +
              # handle unknown framenames
         
     | 
| 
      
 942 
     | 
    
         
            +
              #
         
     | 
| 
      
 943 
     | 
    
         
            +
              # For all frames with names matching Unknown\d+,
         
     | 
| 
      
 944 
     | 
    
         
            +
              # rename them to <lemma>_Unknown\d+
         
     | 
| 
      
 945 
     | 
    
         
            +
              # Renames every frame of the sentence whose name starts with "Unknown"
              # to "<lemma>_<old name>", where the lemma is obtained via
              # interpreter.lemma_backoff from the main node of the frame's target.
              #
              # Frames for which no main target node or no lemma can be determined
              # are left unchanged and a warning is written to $stderr; processing
              # then continues with the next frame of the sentence.
              #
              # Does nothing if sent is nil.
              def FrprepHelper.handle_unknown_framenames(sent,        # SalsaTigerSentence
                                                         interpreter) # SynInterpreter class
                return if sent.nil?

                sent.each_frame { |frame|
                  next unless frame.name() =~ /^Unknown/

                  maintarget =
                    if frame.target
                      interpreter.main_node_of_expr(frame.target.children(), "no_mwe")
                    else
                      nil
                    end

                  unless maintarget
                    $stderr.puts "Warning: Unknown frame, and I could not determine the target lemma: Frame #{frame.id()}"
                    $stderr.puts "Cannot repair frame name, leaving it as is."
                    # Skip only this frame. A `return` here would leave the whole
                    # method and silently skip all remaining frames of the sentence.
                    next
                  end

                  # get lemma, if it exists, otherwise get word
                  # also, if the lemmatizer has returned a disjunction of lemmas,
                  # get the first disjunct
                  # (both presumably handled inside lemma_backoff -- confirm there)
                  lemma = interpreter.lemma_backoff(maintarget)
                  if lemma
                    # we have a lemma
                    frame.set_name(lemma + "_" + frame.name())
                  else
                    # the main target word has no lemma attribute,
                    # and somehow I couldn't even get the target word
                    $stderr.puts "Warning: Salsa 'Unknown' frame."
                    $stderr.puts "Trying to make its lemma-specificity explicit, but"
                    $stderr.puts "I could not determine the target lemma nor the target word: frame #{frame.id()}"
                    $stderr.puts "Leaving 'Unknown' as it is."
                  end
                }
              end
         
     | 
| 
      
 982 
     | 
    
         
            +
             
     | 
| 
      
 983 
     | 
    
         
            +
             
     | 
| 
      
 984 
     | 
    
         
            +
              #####################
         
     | 
| 
      
 985 
     | 
    
         
            +
              #
         
     | 
| 
      
 986 
     | 
    
         
            +
              # Integrate the semantic annotation of an old sentence
         
     | 
| 
      
 987 
     | 
    
         
            +
              # into the corresponding new sentence
         
     | 
| 
      
 988 
     | 
    
         
            +
              # At the same time, integrate the lemma information from the
         
     | 
| 
      
 989 
     | 
    
         
            +
              # old sentence into the new sentence
         
     | 
| 
      
 990 
     | 
    
         
            +
              def FrprepHelper.integrate_stxml_semantics_and_lemmas(oldsent, 
         
     | 
| 
      
 991 
     | 
    
         
            +
                                                                    newsent,
         
     | 
| 
      
 992 
     | 
    
         
            +
                                                                    interpreter_class,
         
     | 
| 
      
 993 
     | 
    
         
            +
                                                                    exp)
         
     | 
| 
      
 994 
     | 
    
         
            +
                if oldsent.nil? or newsent.nil?
         
     | 
| 
      
 995 
     | 
    
         
            +
            	return
         
     | 
| 
      
 996 
     | 
    
         
            +
                end
         
     | 
| 
      
 997 
     | 
    
         
            +
                ##
         
     | 
| 
      
 998 
     | 
    
         
            +
                # match old and new sentence via terminals
         
     | 
| 
      
 999 
     | 
    
         
            +
                newterminals = newsent.terminals_sorted()
         
     | 
| 
      
 1000 
     | 
    
         
            +
                oldterminals = oldsent.terminals_sorted()
         
     | 
| 
      
 1001 
     | 
    
         
            +
                # sanity check: exact match on terminals?
         
     | 
| 
      
 1002 
     | 
    
         
            +
                newterminals.interleave(oldterminals).each { |newnode, oldnode|
         
     | 
| 
      
 1003 
     | 
    
         
            +
            	#print "old ", oldnode.word, "  ", newnode.word, "\n"
         
     | 
| 
      
 1004 
     | 
    
         
            +
                  # new and old word: use both unescaped and escaped variant
         
     | 
| 
      
 1005 
     | 
    
         
            +
                  if newnode
         
     | 
| 
      
 1006 
     | 
    
         
            +
                    newwords = [ newnode.word, SalsaTigerXMLHelper.escape(newnode.word) ]
         
     | 
| 
      
 1007 
     | 
    
         
            +
                  else
         
     | 
| 
      
 1008 
     | 
    
         
            +
                    newwords = [nil, nil]
         
     | 
| 
      
 1009 
     | 
    
         
            +
                  end
         
     | 
| 
      
 1010 
     | 
    
         
            +
                  if oldnode
         
     | 
| 
      
 1011 
     | 
    
         
            +
                    oldwords = [ oldnode.word, SalsaTigerXMLHelper.escape(oldnode.word) ]
         
     | 
| 
      
 1012 
     | 
    
         
            +
                  else
         
     | 
| 
      
 1013 
     | 
    
         
            +
                    oldwords = [ nil, nil]
         
     | 
| 
      
 1014 
     | 
    
         
            +
                  end
         
     | 
| 
      
 1015 
     | 
    
         
            +
             
     | 
| 
      
 1016 
     | 
    
         
            +
                  if (newwords & oldwords).empty?
         
     | 
| 
      
 1017 
     | 
    
         
            +
                    # old and new word don't match, either escaped or non-escaped
         
     | 
| 
      
 1018 
     | 
    
         
            +
             
     | 
| 
      
 1019 
     | 
    
         
            +
                    $stderr.puts "Warning: could not match terminals of sentence #{newsent.id()}"
         
     | 
| 
      
 1020 
     | 
    
         
            +
                    $stderr.puts "This means that I cannot match the semantic annotation"
         
     | 
| 
      
 1021 
     | 
    
         
            +
                    $stderr.puts "to the newly parsed sentence. Skipping."
         
     | 
| 
      
 1022 
     | 
    
         
            +
                    #$stderr.puts "Old sentence: "
         
     | 
| 
      
 1023 
     | 
    
         
            +
                    #$stderr.puts oldterminals.map { |n| n.word }.join("--")
         
     | 
| 
      
 1024 
     | 
    
         
            +
                    #$stderr.puts "New sentence: "
         
     | 
| 
      
 1025 
     | 
    
         
            +
                    #$stderr.puts newterminals.map { |n| n.word }.join("--")
         
     | 
| 
      
 1026 
     | 
    
         
            +
                    return false
         
     | 
| 
      
 1027 
     | 
    
         
            +
                  end
         
     | 
| 
      
 1028 
     | 
    
         
            +
                }
         
     | 
| 
      
 1029 
     | 
    
         
            +
             
     | 
| 
      
 1030 
     | 
    
         
            +
                ##
         
     | 
| 
      
 1031 
     | 
    
         
            +
                # copy lemma information
         
     | 
| 
      
 1032 
     | 
    
         
            +
                oldterminals.each_with_index { |oldnode, ix|
         
     | 
| 
      
 1033 
     | 
    
         
            +
                  newnode = newterminals[ix]
         
     | 
| 
      
 1034 
     | 
    
         
            +
                  if oldnode.get_attribute("lemma")
         
     | 
| 
      
 1035 
     | 
    
         
            +
                    newnode.set_attribute("lemma", oldnode.get_attribute("lemma"))
         
     | 
| 
      
 1036 
     | 
    
         
            +
                  end
         
     | 
| 
      
 1037 
     | 
    
         
            +
                }
         
     | 
| 
      
 1038 
     | 
    
         
            +
             
     | 
| 
      
 1039 
     | 
    
         
            +
                ##
         
     | 
| 
      
 1040 
     | 
    
         
            +
                # copy frames
         
     | 
| 
      
 1041 
     | 
    
         
            +
                oldsent.each_frame { |oldframe|
         
     | 
| 
      
 1042 
     | 
    
         
            +
                  # make new frame with same ID
         
     | 
| 
      
 1043 
     | 
    
         
            +
                  newframe = newsent.add_frame(oldframe.name, oldframe.id())
         
     | 
| 
      
 1044 
     | 
    
         
            +
                  # copy FEs
         
     | 
| 
      
 1045 
     | 
    
         
            +
                  oldframe.each_child { |oldfe|
         
     | 
| 
      
 1046 
     | 
    
         
            +
                    # new nodes: map old terminals to new terminals,
         
     | 
| 
      
 1047 
     | 
    
         
            +
                    # then find max constituents covering them
         
     | 
| 
      
 1048 
     | 
    
         
            +
                    newnodes = oldfe.descendants.select { |n| 
         
     | 
| 
      
 1049 
     | 
    
         
            +
                      n.is_terminal? 
         
     | 
| 
      
 1050 
     | 
    
         
            +
                    }.map { |n|
         
     | 
| 
      
 1051 
     | 
    
         
            +
                      oldterminals.index(n)
         
     | 
| 
      
 1052 
     | 
    
         
            +
                    }.map { |ix|
         
     | 
| 
      
 1053 
     | 
    
         
            +
                      newterminals[ix]
         
     | 
| 
      
 1054 
     | 
    
         
            +
                    }
         
     | 
| 
      
 1055 
     | 
    
         
            +
             
     | 
| 
      
 1056 
     | 
    
         
            +
                    # let the interpreter class decide on how to determine the maximum constituents
         
     | 
| 
      
 1057 
     | 
    
         
            +
                    newnodes = interpreter_class.max_constituents(newnodes, newsent)
         
     | 
| 
      
 1058 
     | 
    
         
            +
             
     | 
| 
      
 1059 
     | 
    
         
            +
                    # make new FE with same ID
         
     | 
| 
      
 1060 
     | 
    
         
            +
                    new_fe = newsent.add_fe(newframe, oldfe.name(), newnodes, oldfe.id())
         
     | 
| 
      
 1061 
     | 
    
         
            +
            	# keep all attributes of the FE
         
     | 
| 
      
 1062 
     | 
    
         
            +
            	if oldfe.get_f("attributes")
         
     | 
| 
      
 1063 
     | 
    
         
            +
            	  oldfe.get_f("attributes").each_pair { |attr, value|
         
     | 
| 
      
 1064 
     | 
    
         
            +
            	    new_fe.set_attribute(attr, value)
         
     | 
| 
      
 1065 
     | 
    
         
            +
            	  }
         
     | 
| 
      
 1066 
     | 
    
         
            +
            	end
         
     | 
| 
      
 1067 
     | 
    
         
            +
                  }
         
     | 
| 
      
 1068 
     | 
    
         
            +
                }
         
     | 
| 
      
 1069 
     | 
    
         
            +
             
     | 
| 
      
 1070 
     | 
    
         
            +
                ##
         
     | 
| 
      
 1071 
     | 
    
         
            +
                ### changed by ines => appears twice in stxml file
         
     | 
| 
      
 1072 
     | 
    
         
            +
             
     | 
| 
      
 1073 
     | 
    
         
            +
                # copy underspecification
         
     | 
| 
      
 1074 
     | 
    
         
            +
                # keep as is, since we've kept all frame and FE IDs
         
     | 
| 
      
 1075 
     | 
    
         
            +
                oldsent.each_usp_frameblock { |olduspframe|
         
     | 
| 
      
 1076 
     | 
    
         
            +
                  newuspframe = newsent.add_usp("frame")
         
     | 
| 
      
 1077 
     | 
    
         
            +
                  olduspframe.each_child { |oldnode|
         
     | 
| 
      
 1078 
     | 
    
         
            +
                    newnode = newsent.sem_node_with_id(oldnode.id())
         
     | 
| 
      
 1079 
     | 
    
         
            +
                    if newnode
         
     | 
| 
      
 1080 
     | 
    
         
            +
                      newuspframe.add_child(newnode)
         
     | 
| 
      
 1081 
     | 
    
         
            +
                    else
         
     | 
| 
      
 1082 
     | 
    
         
            +
                      $stderr.puts "Error: unknown frame with ID #{oldnode.id()}"
         
     | 
| 
      
 1083 
     | 
    
         
            +
                    end
         
     | 
| 
      
 1084 
     | 
    
         
            +
                  }
         
     | 
| 
      
 1085 
     | 
    
         
            +
                }
         
     | 
| 
      
 1086 
     | 
    
         
            +
                oldsent.each_usp_feblock { |olduspfe|
         
     | 
| 
      
 1087 
     | 
    
         
            +
                  newuspfe = newsent.add_usp("fe")
         
     | 
| 
      
 1088 
     | 
    
         
            +
                  olduspfe.each_child { |oldnode|
         
     | 
| 
      
 1089 
     | 
    
         
            +
                    newnode = newsent.sem_node_with_id(oldnode.id())
         
     | 
| 
      
 1090 
     | 
    
         
            +
                    if newnode
         
     | 
| 
      
 1091 
     | 
    
         
            +
                      newuspfe.add_child(newnode)
         
     | 
| 
      
 1092 
     | 
    
         
            +
                    else
         
     | 
| 
      
 1093 
     | 
    
         
            +
                      $stderr.puts "Error: unknown FE with ID #{oldnode.id()}"
         
     | 
| 
      
 1094 
     | 
    
         
            +
                    end
         
     | 
| 
      
 1095 
     | 
    
         
            +
                  }
         
     | 
| 
      
 1096 
     | 
    
         
            +
                }
         
     | 
| 
      
 1097 
     | 
    
         
            +
             
     | 
| 
      
 1098 
     | 
    
         
            +
              end
         
     | 
| 
      
 1099 
     | 
    
         
            +
             
     | 
| 
      
 1100 
     | 
    
         
            +
              ####################
         
     | 
| 
      
 1101 
     | 
    
         
            +
              # add head attributes to each nonterminal in each 
         
     | 
| 
      
 1102 
     | 
    
         
            +
              # SalsaTigerXML sentence passed in (one sentence per call)
         
     | 
| 
      
 1103 
     | 
    
         
            +
             
     | 
| 
      
 1104 
     | 
    
         
            +
              def FrprepHelper.add_head_attributes(st_sent,      # SalsaTigerSentence object
                                                   interpreter)  # SynInterpreter class
                # Give every nonterminal of st_sent a "head" attribute:
                # the word of the head terminal that the interpreter reports
                # for the node, or the placeholder "--" when the interpreter
                # finds no head terminal or the head terminal has no word.
                st_sent.each_nonterminal do |nonterminal|
                  head = interpreter.head_terminal(nonterminal)
                  head_word = head && head.word()
                  nonterminal.set_attribute("head", head_word || "--")
                end # each nonterminal
              end
         
     | 
| 
      
 1115 
     | 
    
         
            +
             
     | 
| 
      
 1116 
     | 
    
         
            +
              # add lemma information to each terminal in a given SalsaTigerSentence object
         
     | 
| 
      
 1117 
     | 
    
         
            +
              def FrprepHelper.add_lemmas_from_tab(st_sent, # SalsaTigerSentence object
                                                   tab_sent,# FNTabFormatSentence object
                                                   mapping) # hash: tab lineno -> array:SynNode
                # Copy lemma information from the lines of tab_sent onto the
                # terminals of st_sent.
                #
                # For each terminal (in each_terminal_sorted order) the first tab
                # line whose mapping entry contains the terminal is looked up.
                # Terminals without such a line are left untouched. If the word of
                # the tab line differs from the word of the terminal, the terminal's
                # "word" attribute is overwritten with the (escaped) tab word and
                # the mismatch is reported on $stderr at the end.
                #
                # Does nothing when tab_sent is nil. No meaningful return value.

                # tab sentence not found
                return if tab_sent.nil?

                # [word, lemma] pairs; the position in this list is the tab line
                # number used as key into mapping
                word_lemma_pairs = []
                tab_sent.each_line_parsed { |line|
                  word_lemma_pairs << [line.get("word"), line.get("lemma")]
                }

                # match with st_sent terminal list and add lemma attributes
                # KE Jan 07: if word mismatch,
                # set to Lemmatizer file version,
                # but count mismatches
                word_mismatches = []

                st_sent.each_terminal_sorted { |t|
                  matching_lineno = (0...word_lemma_pairs.length).detect { |tab_lineno|
                    # a tab line without any mapped nodes matches no terminal
                    # (previously this raised NoMethodError on nil)
                    nodes = mapping[tab_lineno]
                    nodes and nodes.include?(t)
                  }
                  next unless matching_lineno

                  word, lemma = word_lemma_pairs[matching_lineno]

                  # transform characters to XML-friendly form
                  # for comparison with st_word, which is also escaped
                  word = SalsaTigerXMLHelper.escape(word)
                  st_word = t.word()
                  if word != st_word and
                      word != SalsaTigerXMLHelper.escape(st_word)
                    # true mismatch.
                    # use the Lemmatizer version of the word, remember the mismatch
                    word_mismatches << [st_word, word]
                    t.set_attribute("word", word)
                  end

                  if lemma
                    # we actually do have lemma information
                    t.set_attribute("lemma", SalsaTigerXMLHelper.escape(lemma))
                  end
                } # each terminal

                # did we have mismatches? then report them
                unless word_mismatches.empty?
                  $stderr.puts "Warning: Word mismatches found between Lemmatizer file and SalsaTigerXML file generated from parser output."
                  $stderr.puts "(May be due to failed reencoding of special character in the parser output.)"
                  $stderr.puts "I am using the Lemmatizer version by default."
                  $stderr.puts "Version used:"
                  $stderr.print "\t"
                  st_sent.each_terminal_sorted { |t| $stderr.print ">>#{t}<<" }
                  $stderr.puts
                  $stderr.print "SalsaTigerXML file had: "
                  $stderr.print word_mismatches.map { |st_word, tab_word|
                    "#{st_word} instead of #{tab_word}"
                  }.join(", ")
                  $stderr.puts
                end
              end
         
     | 
| 
      
 1183 
     | 
    
         
            +
             
     | 
| 
      
 1184 
     | 
    
         
            +
              ###################3
         
     | 
| 
      
 1185 
     | 
    
         
            +
              # given a SalsaTigerSentence,
         
     | 
| 
      
 1186 
     | 
    
         
            +
              # look for FrameNet frames that are 
         
     | 
| 
      
 1187 
     | 
    
         
            +
              # test frames, and remove them
         
     | 
| 
      
 1188 
     | 
    
         
            +
              def FrprepHelper.remove_deprecated_frames(sent,  # SalsaTigerSentence
                                                        exp)   # FrprepConfigData
                # Remove FrameNet test frames from sent: every frame named
                # "Boulder" or whose name starts with "Test".
                # Only applies when the experiment's "origin" setting is "FrameNet";
                # otherwise the sentence is left untouched and nil is returned.
                # Returns the list obtained from sent.frames, as before.
                return unless exp.get("origin") == "FrameNet"

                all_frames = sent.frames

                # Decide first, remove afterwards: removing frames while iterating
                # over the frame list could skip the frame following each removed
                # one if that list is the sentence's live storage.
                # NOTE(review): whether sent.frames returns a live list or a copy
                # is not visible here; this ordering is safe either way.
                deprecated = all_frames.select { |frame_obj|
                  frame_obj.name() == "Boulder" || frame_obj.name() =~ /^Test/
                }
                deprecated.each { |frame_obj| sent.remove_frame(frame_obj) }

                all_frames
              end
         
     | 
| 
      
 1202 
     | 
    
         
            +
             
     | 
| 
      
 1203 
     | 
    
         
            +
            end
         
     | 
| 
      
 1204 
     | 
    
         
            +
             
     | 
| 
      
 1205 
     | 
    
         
            +
            ############################################3
         
     | 
| 
      
 1206 
     | 
    
         
            +
            # Class FrprepFlatSyntax:
         
     | 
| 
      
 1207 
     | 
    
         
            +
            #
         
     | 
| 
      
 1208 
     | 
    
         
            +
            # given a FNTabFormat file,
         
     | 
| 
      
 1209 
     | 
    
         
            +
            # yield each of its sentences in SalsaTigerXML,
         
     | 
| 
      
 1210 
     | 
    
         
            +
            # constructing a flat syntax
         
     | 
| 
      
 1211 
     | 
    
         
            +
            class FrprepFlatSyntax
              # Remember which tab file to read; nothing is opened until
              # each_sentence is called.
              def initialize(tabfilename, # string: name of tab file
                             postag_suffix, # postag file suffix (or nil)
                             lemma_suffix)  # lemmatisation file suffix (or nil)

                @tabfilename = tabfilename
                @pos_suffix = postag_suffix
                @lemma_suffix = lemma_suffix
              end

              # yield each non-parse sentence as a tuple
              # [ salsa/tiger xml sentence, tab format sentence, mapping]
              # of a SalsaTigerSentence object, a FNTabSentence object,
              # and a hash: FNTab sentence lineno(integer) -> array:SynNode
              # pointing each tab word to one or more SalsaTigerSentence terminals
              #
              # The constructed sentence is flat: one nonterminal of category "S"
              # with one terminal child per tab line.
              # The argument is ignored. Without a block, an Enumerator over the
              # same tuples is returned.
              def each_sentence(dummy)
                return enum_for(:each_sentence, dummy) unless block_given?

                # read tab file with lemma and POS info
                tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)

                tabfile.each_sentence() { |tabsent|
                  # start new, empty sentence with "failed" attribute (i.e. no parse)
                  # and with the ID of the corresponding TabFormat sentence
                  sentid = tabsent.get_sent_id().to_s
                  if sentid =~ /^-*$/
                    # missing or dashes-only ID: report it, fall back to a timestamp
                    $stderr.puts "No sentence ID for sentence:"
                    tabsent.each_line_parsed { |l| $stderr.print l.get("word"), " "}
                    $stderr.puts
                    sentid = Time.new().to_f.to_s
                  end
                  sent = SalsaTigerSentence.new("<s id=\"#{SalsaTigerXMLHelper.escape(sentid)}\" failed=\"true\"></s>")

                  # add single nonterminal node, category "S"
                  single_nonterminal_id = SalsaTigerXMLHelper.escape(sentid + "_NT")
                  vroot = sent.add_syn("nt", "S", # category
                                       nil,  # word
                                       nil,  # pos
                                       single_nonterminal_id)

                  # add terminals
                  tabsent.each_line_parsed() { |line_obj|
                    # make terminal node with tab sent info;
                    # the ID is escaped like the sentence and nonterminal IDs
                    node_id = SalsaTigerXMLHelper.escape(sentid + "_" + line_obj.get("lineno").to_s)
                    word = escaped_or_empty(line_obj.get("word"))
                    pos = escaped_or_empty(line_obj.get("pos"))
                    terminal = sent.add_syn("t", nil, # category
                                            word, pos,
                                            node_id)

                    lemma = line_obj.get("lemma")
                    if lemma
                      terminal.set_attribute("lemma", SalsaTigerXMLHelper.escape(lemma))
                    end

                    # add new terminal as child of vroot
                    vroot.add_child(terminal, nil)
                    terminal.add_parent(vroot, nil)
                  } # each line of tab file

                  # yield newly constructed SalsaTigerXMl sentence plus tab sentence
                  yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
                }
              end

              private

              # XML-escape a tab-file field, treating a missing value as "".
              def escaped_or_empty(value)
                SalsaTigerXMLHelper.escape(value || "")
              end
            end
         
     | 
| 
      
 1283 
     | 
    
         
            +
             
     | 
| 
      
 1284 
     | 
    
         
            +
            ############################################3
         
     | 
| 
      
 1285 
     | 
    
         
            +
            # Class FrprepReadStxml
         
     | 
| 
      
 1286 
     | 
    
         
            +
            #
         
     | 
| 
      
 1287 
     | 
    
         
            +
            # given a STXML file,
         
     | 
| 
      
 1288 
     | 
    
         
            +
            # yield each of its sentences
         
     | 
| 
      
 1289 
     | 
    
         
            +
            class FrprepReadStxml
              # Remember which files to read; nothing is opened until
              # each_sentence is called.
              def initialize(stxmlfilename, # string: name of SalsaTigerXML file
                             tabfilename,   # string: name of corresponding tab file (or nil)
                             postag_suffix,    #  POS tag file suffix (or nil)
                             lemma_suffix)     #  lemmatization file suffix (or nil)

                @stxmlfilename = stxmlfilename
                @tabfilename = tabfilename
                @pos_suffix = postag_suffix
                @lemma_suffix = lemma_suffix
              end

              # yield each sentence of the SalsaTigerXML file as a tuple
              # [ salsa/tiger xml sentence, tab format sentence, mapping]
              # of a SalsaTigerSentence object, a FNTabSentence object,
              # and a hash: FNTab sentence lineno(integer) -> array:SynNode
              # pointing each tab word to one or more SalsaTigerSentence terminals
              #
              # Tab sentences are paired with XML sentences by position. When no
              # tab file was given, the file does not exist, or it has fewer
              # sentences, the tab sentence of the tuple is nil.
              # The argument is ignored. Without a block, an Enumerator over the
              # same tuples is returned.
              def each_sentence(dummy)
                return enum_for(:each_sentence, dummy) unless block_given?

                # read corresponding tab file?
                # (File.exist?: File.exists? is gone in current Ruby versions;
                # the nil check covers the documented "or nil" tab file name)
                tab_sents = []
                if @tabfilename and File.exist?(@tabfilename)
                  tabfile = FNTabFormatFile.new(@tabfilename, @pos_suffix, @lemma_suffix)
                  tabfile.each_sentence { |tabsent|
                    tab_sents << tabsent
                  }
                end

                # read STXML file
                infile = FilePartsParser.new(@stxmlfilename)
                index = 0
                infile.scan_s { |sent_string|
                  sent = SalsaTigerSentence.new(sent_string)
                  tabsent = tab_sents.at(index)
                  yield [sent, tabsent, SynInterfaceSTXML.standard_mapping(sent, tabsent)]
                  index += 1
                }
              end
            end
         
     |