frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,99 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # sp jul 05 05
         
     | 
| 
      
 2 
     | 
    
         
            +
            #
         
     | 
| 
      
 3 
     | 
    
         
            +
            # Static helper methods for SalsaTigerRegXML:
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            # - provide header and footer for Salsa/Tiger XML files
         
     | 
| 
      
 6 
     | 
    
         
            +
            # - escape and unescape HTML entities 
         
     | 
| 
      
 7 
     | 
    
         
            +
            #
         
     | 
| 
      
 8 
     | 
    
         
            +
            # changed KE nov 05:
         
     | 
| 
      
 9 
     | 
    
         
            +
            # many methods moved to FrprepHelper
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            require "common/SalsaTigerRegXML"
         
     | 
| 
      
 12 
     | 
    
         
            +
            require "common/headz"
         
     | 
| 
      
 13 
     | 
    
         
            +
            require "common/Parser"
         
     | 
| 
      
 14 
     | 
    
         
            +
            require "tempfile"
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            class SalsaTigerXMLHelper
              # Static helper methods for SalsaTigerRegXML:
              # - provide the header and footer of Salsa/Tiger XML files
              # - escape and unescape strings for representation in XML

              ###
              # get header of SalsaTigerXML files (as string)
              #
              # The text runs from the XML declaration up to and including the
              # opening <body> tag and ends with a newline. It is spelled out as
              # explicit lines so the tab indentation is visible in the code.
              def SalsaTigerXMLHelper.get_header
                header_lines = [
                  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
                  "  <corpus corpusname=\"corpus\" target=\"\">",
                  "\t<head>",
                  "\t\t<meta>",
                  "\t\t\t<format>",
                  "\t\t\tNeGra format, version 3</format>",
                  "\t\t</meta>",
                  "\t\t<frames xmlns=\"http://www.clt-st.de/framenet/frame-database\">",
                  "\t\t</frames>",
                  "\t\t<wordtags xmlns=\"http://www.clt-st.de/salsa/wordtags\">",
                  "\t\t</wordtags>",
                  "\t\t<flags>",
                  "\t\t</flags>",
                  "\t\t<annotation>",
                  "\t\t\t<edgelabel>",
                  "\t\t\t</edgelabel>",
                  "\t\t\t<secedgelabel>",
                  "\t\t\t</secedgelabel>",
                  "\t\t</annotation>",
                  "\t</head>",
                  "\t<body>"
                ]
                header_lines.join("\n") + "\n"
              end

              ###
              # get footer of SalsaTigerXML files (as string):
              # closes the <body> and <corpus> elements opened by get_header
              def SalsaTigerXMLHelper.get_footer
                "\t</body>\n</corpus>\n"
              end

              ###
              # escape and unescape strings for representation in XML
              #
              # Each entry is [unescaped, escaped]. Order matters:
              # "&" must be first for escaping (so that the ampersands introduced
              # by later entries are not escaped again) and last for unescaping.
              #
              # Double quotes and `` are both written as two apostrophe entities,
              # so the mapping is lossy: unescape turns either of them into ``.
              #
              # Alternatives that were tried and disabled:
              #  ["''","&quot;"], # added by ines (09/03/09), might cause problems for unescape???
              #  ["\"","&quot;"],
              #  ["\'\'","&quot;"],
              #  ["\`\`","&quot;"],
              #  ["''","&apos;&apos;"]
              @@replacements = [
                ["&", "&amp;"], # must be first for escaping, last for unescaping
                ["<", "&lt;"],
                [">", "&gt;"],
                ["\"", "&apos;&apos;"],
                ["'", "&apos;"],
                ["``", "&apos;&apos;"]
              ]

              ###
              # replace XML-special characters in string by entity references
              #
              # string: the text to escape; it is modified IN PLACE (gsub!),
              #         so it must not be frozen
              # returns: the same (modified) string object
              def SalsaTigerXMLHelper.escape(string)
                @@replacements.each do |unescaped, escaped|
                  string.gsub!(unescaped, escaped)
                end
                string
              end

              ###
              # replace entity references in string by the characters they stand for
              #
              # string: the text to unescape; it is modified IN PLACE (gsub!),
              #         so it must not be frozen
              # returns: the same (modified) string object
              def SalsaTigerXMLHelper.unescape(string)
                # walk the table backwards so that "&amp;" is resolved last;
                # otherwise "&amp;lt;" would wrongly end up as "<"
                @@replacements.reverse.each do |unescaped, escaped|
                  string.gsub!(escaped, unescaped)
                end
                string
              end
            end
         
     | 
| 
         @@ -0,0 +1,384 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ####
         
     | 
| 
      
 2 
     | 
    
         
            +
            # sp 21 07 05
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # modified ke 30 10 05: adapted to fit into SynInterface
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # represents a file containing Sleepy parses
         
     | 
| 
      
 7 
     | 
    
         
            +
            # 
         
     | 
| 
      
 8 
     | 
    
         
            +
            # underlying data structure for individual sentences: SalsaTigerSentence
         
     | 
| 
      
 9 
     | 
    
         
            +
            require "tempfile"
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            require "common/SalsaTigerRegXML"
         
     | 
| 
      
 12 
     | 
    
         
            +
            require "common/SalsaTigerXMLHelper"
         
     | 
| 
      
 13 
     | 
    
         
            +
            require "common/TabFormat"
         
     | 
| 
      
 14 
     | 
    
         
            +
            require "common/Counter"
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            require "common/AbstractSynInterface"
         
     | 
| 
      
 17 
     | 
    
         
            +
            require "common/Tiger.rb"
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            ################################################
         
     | 
| 
      
 20 
     | 
    
         
            +
            # Interface class
         
     | 
| 
      
 21 
     | 
    
         
            +
            class SleepyInterface < SynInterfaceSTXML
         
     | 
| 
      
 22 
     | 
    
         
            +
              SleepyInterface.announce_me()
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
              ###
         
     | 
| 
      
 25 
     | 
    
         
            +
              def SleepyInterface.system
                # name of the system wrapped by this interface
                "sleepy"
              end
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
              ###
         
     | 
| 
      
 30 
     | 
    
         
            +
              def SleepyInterface.service
                # kind of service this interface provides
                "parser"
              end
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
              ###
         
     | 
| 
      
 35 
     | 
    
         
            +
              # initialize to set values for all subsequent processing
         
     | 
| 
      
 36 
     | 
    
         
            +
              def initialize(program_path,  # string: path to system
                             insuffix,      # string: suffix of tab files
                             outsuffix,     # string: suffix for parsed files
                             stsuffix,      # string: suffix for Salsa/TIGER XML files
                             var_hash = {}) # optional arguments in a hash
                # the superclass receives all arguments unchanged
                super(program_path, insuffix, outsuffix, stsuffix, var_hash)

                # the program path is used as a prefix later on,
                # so make sure it ends in a slash
                @program_path += "/" unless @program_path =~ %r{/$}

                # optional settings; each is nil if not given in var_hash
                # NOTE(review): each_sentence of this class reads @postag_suffix,
                # which is not set here -- confirm whether @pos_suffix was meant.
                @pos_suffix, @lemma_suffix, @tab_dir =
                  var_hash["pos_suffix"], var_hash["lemma_suffix"], var_hash["tab_dir"]
              end
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
              ####
         
     | 
| 
      
 54 
     | 
    
         
            +
              # parse a directory with TabFormat files and write the parse trees to outputdir 
         
     | 
| 
      
 55 
     | 
    
         
            +
              # I assume that the files in inputdir are smaller than 
         
     | 
| 
      
 56 
     | 
    
         
            +
              # the maximum number of sentences that 
         
     | 
| 
      
 57 
     | 
    
         
            +
              # Sleepy can parse in one go (i.e. that they are split)
         
     | 
| 
      
 58 
     | 
    
         
            +
              def process_dir(in_dir,  # string: input directory name
                              out_dir) # string: output directory name
                # Runs Sleepy on every file in in_dir whose name ends in @insuffix
                # and writes the parser output to out_dir, one file per input
                # file, named <basename without @insuffix> + @outsuffix.
                #
                # in_dir and out_dir are used as plain string prefixes, so they
                # need to end in a path separator. The file names are pasted
                # unquoted into a shell command line.

                sleepy_prog = "#{@program_path}sleepy  --beam 1000 --model-file #{@program_path}negra.model --parse "

                Dir[in_dir + "*" + @insuffix].each do |inputfilename|
                  STDERR.puts "*** Parsing #{inputfilename} with Sleepy"
                  corpusfilename = File.basename(inputfilename, @insuffix)
                  parsefilename = out_dir + corpusfilename + @outsuffix
                  tempfile = Tempfile.new(corpusfilename)

                  begin
                    # we need neither lemmata nor POS tags; sleepy can do with the words
                    corpusfile = FNTabFormatFile.new(inputfilename, nil, nil)
                    corpusfile.each_sentence do |sentence|
                      tempfile.puts sentence.to_s
                    end
                    # flush everything to disk before the parser reads the file
                    tempfile.close

                    # parse and remove comments in the parser output
                    Kernel.system(sleepy_prog + " " + tempfile.path + " 2>&1 | grep -v \"Span:\" > " + parsefilename)
                  ensure
                    # remove the temporary file right away (also on errors)
                    # instead of leaving one per input file around until
                    # garbage collection or interpreter exit
                    tempfile.close!
                  end
                end
              end
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
              ###
         
     | 
| 
      
 81 
     | 
    
         
            +
              # for a given parsed file:
         
     | 
| 
      
 82 
     | 
    
         
            +
              # yield each sentence as a pair 
         
     | 
| 
      
 83 
     | 
    
         
            +
              #  [SalsaTigerSentence object, FNTabFormatSentence object]
         
     | 
| 
      
 84 
     | 
    
         
            +
              # of the sentence in SalsaTigerXML and the matching tab format sentence
         
     | 
| 
      
 85 
     | 
    
         
            +
              #
         
     | 
| 
      
 86 
     | 
    
         
            +
              # If a parse has failed, returns 
         
     | 
| 
      
 87 
     | 
    
         
            +
              #  [failed_sentence (flat SalsaTigerSentence), FNTabFormatSentence] 
         
     | 
| 
      
 88 
     | 
    
         
            +
              # to allow more detailed accounting for failed parses
         
     | 
| 
      
 89 
     | 
    
         
            +
              # (basically just a flat structure with a failed=true attribute 
         
     | 
| 
      
 90 
     | 
    
         
            +
              # at the sentence node)
         
     | 
| 
      
 91 
     | 
    
         
            +
              def each_sentence(parsefilename)
         
     | 
| 
      
 92 
     | 
    
         
            +
                # sanity checks
         
     | 
| 
      
 93 
     | 
    
         
            +
                unless @tab_dir
         
     | 
| 
      
 94 
     | 
    
         
            +
                  $stderr.puts "SleepyInterface error: Need to set tab directory on initialization"
         
     | 
| 
      
 95 
     | 
    
         
            +
                  exit 1
         
     | 
| 
      
 96 
     | 
    
         
            +
                end
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
                # get matching tab file for this parser output file
         
     | 
| 
      
 99 
     | 
    
         
            +
                parsefile = File.new(parsefilename)
         
     | 
| 
      
 100 
     | 
    
         
            +
                tabfilename = @tab_dir+File.basename(parsefilename, @outsuffix)+ @insuffix
         
     | 
| 
      
 101 
     | 
    
         
            +
                tabfile = FNTabFormatFile.new(tabfilename, @postag_suffix, @lemma_suffix)    
         
     | 
| 
      
 102 
     | 
    
         
            +
             
     | 
| 
      
 103 
     | 
    
         
            +
                sentid = 0
         
     | 
| 
      
 104 
     | 
    
         
            +
                
         
     | 
| 
      
 105 
     | 
    
         
            +
                tabfile.each_sentence {|tab_sent| # iterate over corpus sentences
         
     | 
| 
      
 106 
     | 
    
         
            +
                  
         
     | 
| 
      
 107 
     | 
    
         
            +
                  sentence_str = ""
         
     | 
| 
      
 108 
     | 
    
         
            +
                  status = true # error encountered? 
         
     | 
| 
      
 109 
     | 
    
         
            +
                  
         
     | 
| 
      
 110 
     | 
    
         
            +
                  # assemble next sentence in Sleepy file by reading lines from parsefile
         
     | 
| 
      
 111 
     | 
    
         
            +
                  while true
         
     | 
| 
      
 112 
     | 
    
         
            +
                    line = parsefile.gets
         
     | 
| 
      
 113 
     | 
    
         
            +
                    case line
         
     | 
| 
      
 114 
     | 
    
         
            +
                    when /% Parse failed/
         
     | 
| 
      
 115 
     | 
    
         
            +
                      status = false
         
     | 
| 
      
 116 
     | 
    
         
            +
                      break
         
     | 
| 
      
 117 
     | 
    
         
            +
                    when nil # end of file: nothing more to break
         
     | 
| 
      
 118 
     | 
    
         
            +
                      break
         
     | 
| 
      
 119 
     | 
    
         
            +
                    when /^%/, /^\s*$/ # empty lines, other comments: end of current sentence
         
     | 
| 
      
 120 
     | 
    
         
            +
                      unless sentence_str == "" # only break if you have read something 
         
     | 
| 
      
 121 
     | 
    
         
            +
                        break
         
     | 
| 
      
 122 
     | 
    
         
            +
                      end
         
     | 
| 
      
 123 
     | 
    
         
            +
                    else
         
     | 
| 
      
 124 
     | 
    
         
            +
                      sentence_str += line.chomp # collect line of current parse and continue reading
         
     | 
| 
      
 125 
     | 
    
         
            +
                    end
         
     | 
| 
      
 126 
     | 
    
         
            +
                  end
         
     | 
| 
      
 127 
     | 
    
         
            +
                  
         
     | 
| 
      
 128 
     | 
    
         
            +
                  # we have reached some kind of end
         
     | 
| 
      
 129 
     | 
    
         
            +
                  sentid +=1
         
     | 
| 
      
 130 
     | 
    
         
            +
                  
         
     | 
| 
      
 131 
     | 
    
         
            +
                  # we don't have a sentence: hopefully, this is because parsing has failed
         
     | 
| 
      
 132 
     | 
    
         
            +
                  # if this is not the case, we are in trouble
         
     | 
| 
      
 133 
     | 
    
         
            +
                  if sentence_str == ""
         
     | 
| 
      
 134 
     | 
    
         
            +
                    case status
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
                    when false
         
     | 
| 
      
 137 
     | 
    
         
            +
                      # return a SalsaTigerSentence object for the failed sentence
         
     | 
| 
      
 138 
     | 
    
         
            +
                      # with a virtual top node and one terminal per word.
         
     | 
| 
      
 139 
     | 
    
         
            +
                      if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
         
     | 
| 
      
 140 
     | 
    
         
            +
                        my_sent_id = tab_sent.get_sent_id()
         
     | 
| 
      
 141 
     | 
    
         
            +
                      else
         
     | 
| 
      
 142 
     | 
    
         
            +
                        my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
         
     | 
| 
      
 143 
     | 
    
         
            +
                      end
         
     | 
| 
      
 144 
     | 
    
         
            +
                      sent = SleepyInterface.failed_sentence(tab_sent, my_sent_id)
         
     | 
| 
      
 145 
     | 
    
         
            +
                      yield [sent, tab_sent, SleepyInterface.standard_mapping(sent, tab_sent)] 
         
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
      
 147 
     | 
    
         
            +
                    else
         
     | 
| 
      
 148 
     | 
    
         
            +
            	  # this may not happen: we need some sentence for the current 
         
     | 
| 
      
 149 
     | 
    
         
            +
            	  # TabFile sentence
         
     | 
| 
      
 150 
     | 
    
         
            +
                      $stderr.puts "SleepyInterface error: premature end of parser file!" 
         
     | 
| 
      
 151 
     | 
    
         
            +
                      exit 1
         
     | 
| 
      
 152 
     | 
    
         
            +
                    end 
         
     | 
| 
      
 153 
     | 
    
         
            +
                  else
         
     | 
| 
      
 154 
     | 
    
         
            +
                    # if we are here, we have a sentence_str to work on
         
     | 
| 
      
 155 
     | 
    
         
            +
                    # hopefully, our status is OK
         
     | 
| 
      
 156 
     | 
    
         
            +
                    case status
         
     | 
| 
      
 157 
     | 
    
         
            +
                    when true
         
     | 
| 
      
 158 
     | 
    
         
            +
                      if tab_sent.get_sent_id() and tab_sent.get_sent_id() != "--"
         
     | 
| 
      
 159 
     | 
    
         
            +
                        my_sent_id = tab_sent.get_sent_id()
         
     | 
| 
      
 160 
     | 
    
         
            +
                      else
         
     | 
| 
      
 161 
     | 
    
         
            +
                        my_sent_id = File.basename(parsefilename, @outsuffix) + "_" + sentid.to_s
         
     | 
| 
      
 162 
     | 
    
         
            +
                      end
         
     | 
| 
      
 163 
     | 
    
         
            +
                      st_sent = build_salsatiger(" " + sentence_str + " ", 0,
         
     | 
| 
      
 164 
     | 
    
         
            +
            				     Array.new, Counter.new(0),
         
     | 
| 
      
 165 
     | 
    
         
            +
            				     Counter.new(500),
         
     | 
| 
      
 166 
     | 
    
         
            +
            				     SalsaTigerSentence.empty_sentence(my_sent_id.to_s))
         
     | 
| 
      
 167 
     | 
    
         
            +
                      yield [st_sent, tab_sent, SleepyInterface.standard_mapping(st_sent, tab_sent)]
         
     | 
| 
      
 168 
     | 
    
         
            +
             
     | 
| 
      
 169 
     | 
    
         
            +
                    else # i.e. when "failed"
         
     | 
| 
      
 170 
     | 
    
         
            +
                      $stderr.puts "SleepyInterface error: failed parse, but parse tree exists??"
         
     | 
| 
      
 171 
     | 
    
         
            +
                      exit 1
         
     | 
| 
      
 172 
     | 
    
         
            +
                    end
         
     | 
| 
      
 173 
     | 
    
         
            +
                  end
         
     | 
| 
      
 174 
     | 
    
         
            +
                }
         
     | 
| 
      
 175 
     | 
    
         
            +
                
         
     | 
| 
      
 176 
     | 
    
         
            +
                # all TabFile sentences are consumed: 
         
     | 
| 
      
 177 
     | 
    
         
            +
                # now we may just encounter comments, garbage, empty lines etc. 
         
     | 
| 
      
 178 
     | 
    
         
            +
                
         
     | 
| 
      
 179 
     | 
    
         
            +
                while not parsefile.eof?
         
     | 
| 
      
 180 
     | 
    
         
            +
                  case parsefile.gets
         
     | 
| 
      
 181 
     | 
    
         
            +
                  when nil, /^%/, /^\s*$/ # empty lines, comments, end of input indicate end of current parse 
         
     | 
| 
      
 182 
     | 
    
         
            +
                  else
         
     | 
| 
      
 183 
     | 
    
         
            +
                    $stderr.puts "SleepyInterface error: premature end of tab file"
         
     | 
| 
      
 184 
     | 
    
         
            +
                    exit 1
         
     | 
| 
      
 185 
     | 
    
         
            +
                  end
         
     | 
| 
      
 186 
     | 
    
         
            +
                end  
         
     | 
| 
      
 187 
     | 
    
         
            +
              end
         
     | 
| 
      
 188 
     | 
    
         
            +
              
         
     | 
| 
      
 189 
     | 
    
         
            +
             
     | 
| 
      
 190 
     | 
    
         
            +
              ###
         
     | 
| 
      
 191 
     | 
    
         
            +
              # write Salsa/TIGER XML output to file
         
     | 
| 
      
 192 
     | 
    
         
            +
              def to_stxml_file(infilename,  # string: name of parse file
         
     | 
| 
      
 193 
     | 
    
         
            +
            		    outfilename) # string: name of output stxml file
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
                outfile = File.new(outfilename, "w")
         
     | 
| 
      
 196 
     | 
    
         
            +
                outfile.puts SalsaTigerXMLHelper.get_header()
         
     | 
| 
      
 197 
     | 
    
         
            +
                each_sentence(infilename) { |st_sent, tabsent|
         
     | 
| 
      
 198 
     | 
    
         
            +
                  outfile.puts st_sent.get()
         
     | 
| 
      
 199 
     | 
    
         
            +
                }
         
     | 
| 
      
 200 
     | 
    
         
            +
                outfile.puts SalsaTigerXMLHelper.get_footer()
         
     | 
| 
      
 201 
     | 
    
         
            +
                outfile.close()
         
     | 
| 
      
 202 
     | 
    
         
            +
              end
         
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
             
     | 
| 
      
 205 
     | 
    
         
            +
             
     | 
| 
      
 206 
     | 
    
         
            +
              ########################
         
     | 
| 
      
 207 
     | 
    
         
            +
              private
         
     | 
| 
      
 208 
     | 
    
         
            +
             
     | 
| 
      
 209 
     | 
    
         
            +
              ###
              # Parse a Sleepy parse tree (bracketed string) and build a
              # SalsaTigerSentence from it.
              #
              # Algorithm: manage a stack which contains, for the current constituent,
              # child constituents (if a nonterminal), and the category label.
              # When the end of a constituent is reached, a new SynNode (TigerSalsa node) is created.
              # All children and the category label are popped from the stack and integrated into the
              # TigerSalsa data structure. The new node is re-pushed onto the stack.
              #
              # The string is consumed one token at a time in a loop rather than
              # by one recursive call per token, so the call stack does not grow
              # with the length of the sentence.
              #
              # Returns sent_obj when the whole string has been consumed and exactly
              # one node is left on the stack. On input that cannot be analysed,
              # a message is written to $stderr and the program exits with status 1.
              def build_salsatiger(sentence, # string
                                   pos,      # position in string (index): integer
                                   stack,    # stack with incomplete nodes: Array
                                   termc,    # terminal counter
                                   nontc,    # nonterminal counter
                                   sent_obj) # SalsaTigerSentence

                loop do
                  # main case distinction: match the beginning of our string
                  # (i.e. what follows our current position in the string).
                  # All patterns are anchored with \A: a pattern that does not fit
                  # at the current position is rejected right away instead of
                  # being retried further to the right.
                  case sentence[pos..-1]

                  when /\A *$/ # nothing -> whole sentence parsed
                    # sleepy always delivers one "top" node; if we don't get just one
                    # node, something has gone wrong
                    unless stack.length == 1
                      $stderr.puts "SleepyINterface Error: more than one root node (stack length #{stack.length}). Full sentence: \n#{sentence}"
                      exit 1
                    end
                    root = stack.pop
                    if root.is_a?(String)
                      # a leftover String is a category label whose
                      # constituent was opened but never closed
                      $stderr.puts "SleepyInterface Error: unclosed constituent #{root}. Full sentence: \n#{sentence}"
                      exit 1
                    end
                    root.del_attribute("gf")
                    return sent_obj

                  when /\A\s*\(([^ )]+) /
                    # match the beginning of a new constituent
                    # (opening bracket + category + space, may not contain closing bracket)
                    token = Regexp.last_match
                    stack.push token[1] # throw the category label on the stack

                  when /\A\s*(\S+)\) /
                    # match the end of a terminal constituent (something before a closing bracket + space)
                    token = Regexp.last_match
                    word = token[1]
                    comb_cat = stack.pop
                    if comb_cat.to_s == ""
                      $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
                      exit 1
                    end
                    cat, gf = split_cat(comb_cat)
                    node = sent_obj.add_syn("t",
                                            nil,  # cat (doesn't matter here)
                                            SalsaTigerXMLHelper.escape(word), # word
                                            cat,  # pos
                                            termc.next.to_s)
                    # the grammatical function is parked on the node until
                    # the parent constituent is completed
                    node.set_attribute("gf", gf)
                    stack.push node

                  when /\A\s*\)/ # match the end of a nonterminal (nothing before a closing bracket)
                    token = Regexp.last_match
                    # now collect children:
                    # pop items from the stack until you find the category
                    children = []
                    label = nil
                    while label.nil?
                      if stack.empty?
                        $stderr.puts  "SleepyInterface Error: stack empty; cannot find more children"
                        exit 1
                      end
                      item = stack.pop
                      case item.class.to_s
                      when "SynNode" # this is a child
                        children.push item
                      when "String" # this is the category label
                        label = item
                      else
                        $stderr.puts "SleepyInterface Error: unknown item class #{item.class.to_s}"
                        exit 1
                      end
                    end
                    if label == ""
                      $stderr.puts "SleepyInterface error: Empty cat at position #{sentence[pos,10]}, full sentence\n#{sentence}"
                      exit 1
                    end
                    cat, gf = split_cat(label)
                    # now add a nonterminal node to the sentence object and
                    # register the children nodes
                    node = sent_obj.add_syn("nt",
                                            cat, # cat
                                            nil, # word (doesn't matter)
                                            nil, # pos (doesn't matter)
                                            nontc.next.to_s)
                    children.each { |child|
                      # the parked grammatical function becomes the edge label
                      child_gf = child.get_attribute("gf")
                      child.del_attribute("gf")
                      node.add_child(child, child_gf)
                      child.add_parent(node, child_gf)
                    }
                    node.set_attribute("gf", gf)
                    stack.push node

                  else
                    if sentence =~ /Fatal error: exception Out_of_memory/
                      $stderr.puts "SleepyInterface error: Sleepy parser ran out of memory."
                      $stderr.puts "Try reducing the max. sentence length"
                      $stderr.puts "in the experiment file."
                      exit 1
                    end

                    $stderr.puts "SleepyInterface Error: cannot analyse sentence at pos #{pos}:\n #{sentence[pos..-1]}\n Complete sentence: \n#{sentence}"
                    exit 1
                  end

                  # skip the token that was just processed
                  pos += token[0].length
                end
              end
         
     | 
| 
      
 330 
     | 
    
         
            +
             
     | 
| 
      
 331 
     | 
    
         
            +
              ###
              # Split a Sleepy node label into its two parts.
              #
              # Sleepy delivers node labels as "phrase type"-"grammatical function",
              # but the GF may not be present.
              #
              # Returns the pair [phrase type, grammatical function]; the
              # grammatical function is "" when the label has none.
              # A label that does not fit this shape (e.g. one with more than one
              # hyphen) is reported on $stderr and ends the program with status 1.
              def split_cat(cat)
                parts = (cat =~ /^([^-]*)(-([^-]*))?$/) ? Regexp.last_match : nil

                if parts.nil? || parts[1].nil?
                  $stderr.puts "SleepyInterface Error: could not identify category in #{cat}"
                  exit 1
                end

                # third group is the text after the hyphen, if there was a hyphen
                [parts[1], parts[3] || ""]
              end
         
     | 
| 
      
 354 
     | 
    
         
            +
            end
         
     | 
| 
      
 355 
     | 
    
         
            +
             
     | 
| 
      
 356 
     | 
    
         
            +
             
     | 
| 
      
 357 
     | 
    
         
            +
             
     | 
| 
      
 358 
     | 
    
         
            +
            ################################################
            # Interpreter class
            #
            # Tiger-based interpreter for the output of the Sleepy parser.
            class SleepyInterpreter < Tiger
              # NOTE(review): presumably makes this interpreter known to the
              # registry of available interpreters -- confirm where announce_me
              # is defined.
              announce_me

              ###
              # Systems whose output this class interprets.
              #
              # returns a hash service(string) -> system name (string);
              # for this class that is the Sleepy parser only.
              def self.systems
                { "parser" => "sleepy" }
              end

              ###
              # Additional systems whose output this class can interpret.
              #
              # returns a hash service(string) -> system name(string),
              # in the same format as systems()
              def self.optional_systems
                { "lemmatizer" => "treetagger" }
              end

            end
         
     |