frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,312 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # FredEval
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Katrin Erk April 05
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Frame disambiguation system: evaluate classification results
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # While the other main classes of Fred just provide a new() method
         
     | 
| 
      
 7 
     | 
    
         
            +
            # and a compute() method,
         
     | 
| 
      
 8 
     | 
    
         
            +
            # the FredEval class also provides access methods to all the
         
     | 
| 
      
 9 
     | 
    
         
            +
            # individual evaluation results and allows for a flag that
         
     | 
| 
      
 10 
     | 
    
         
            +
            # suppresses evaluation output to a file --
         
     | 
| 
      
 11 
     | 
    
         
            +
            # such that this package can also be used by external systems that
         
     | 
| 
      
 12 
     | 
    
         
            +
            # wish to evaluate Fred.
         
     | 
| 
      
 13 
     | 
    
         
            +
            #
         
     | 
| 
      
 14 
     | 
    
         
            +
            # Inherits from the Eval class that is not Fred-specific
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            # Salsa packages
         
     | 
| 
      
 17 
     | 
    
         
            +
            require "common/Eval"
         
     | 
| 
      
 18 
     | 
    
         
            +
            require "common/ruby_class_extensions"
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
            # Fred packages
         
     | 
| 
      
 21 
     | 
    
         
            +
            require "fred/FredConfigData"
         
     | 
| 
      
 22 
     | 
    
         
            +
            require "fred/FredConventions"
         
     | 
| 
      
 23 
     | 
    
         
            +
            require "fred/FredFeatures"
         
     | 
| 
      
 24 
     | 
    
         
            +
            require "fred/FredDetermineTargets"
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
            class FredEval < Eval
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
              ###
         
     | 
| 
      
 29 
     | 
    
         
            +
              # new
         
     | 
| 
      
 30 
     | 
    
         
            +
              #
         
     | 
| 
      
 31 
     | 
    
         
            +
              # evaluate runtime options and announce the task
         
     | 
| 
      
 32 
     | 
    
         
            +
              # Evaluate the runtime options, initialize the Eval superclass and
              # announce the evaluation task on stderr.
              #
              # exp_obj:: FredConfigData object
              # options:: hash: runtime option name (string) => value (string);
              #           "--logID" sets the split ID, "--printLog" switches on
              #           the evaluation log file
              #
              # Exits the process with status 1 if the list of known targets
              # cannot be read.
              def initialize(exp_obj, options)
                in_enduser_mode_unavailable()

                @exp = exp_obj

                ###
                # evaluate runtime options
                # (unknown arguments have already been dealt with by fred.rb)
                @split_id = nil
                logfilename = nil

                options.each_pair do |opt, arg|
                  case opt
                  when "--logID"
                    @split_id = arg
                  when "--printLog"
                    logfilename = fred_dirname(@exp, "eval", "log", "new") +
                                  "eval_logfile.txt"
                  end
                end

                ###
                # make outfile name
                outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                              "eval.txt"

                ###
                # do we regard all senses as assigned,
                # as long as they surpass some threshold?
                # if we are doing multilabel evaluation, we need the full list of senses
                @threshold = @exp.get("assignment_confidence_threshold")
                @target_obj = Targets.new(@exp, nil, "r")
                unless @target_obj.targets_okay
                  # error during initialization
                  $stderr.puts "Error: Could not read list of known targets, bailing out."
                  exit 1
                end

                @multiple_senses_assigned =
                  (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

                ###
                # initialize abstract class behind me
                if @multiple_senses_assigned
                  # we are possibly assigning more than one sense: do precision/recall
                  # instead of accuracy:
                  # "true" is what "this sense has been assigned" is mapped to below.
                  super(outfilename, logfilename, "true")
                else
                  super(outfilename, logfilename)
                end

                # what is being done with instances with multiple sense labels?
                @handle_multilabel = @exp.get("handle_multilabel")

                ###
                # announce the task
                $stderr.puts "---------"
                $stderr.print "Fred  experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
                # The split is stored in @split_id (set from "--logID" above);
                # this is the variable that decides whether the split is mentioned.
                if @split_id
                  $stderr.puts " using split with ID #{@split_id}"
                else
                  $stderr.puts
                end
                if @multiple_senses_assigned
                  $stderr.puts "Allowing for the assignment of multiple senses,"
                  $stderr.puts "computing precision and recall against the full sense list of a lemma."
                end
                $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
                $stderr.puts "---------"
              end
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
              #####
         
     | 
| 
      
 114 
     | 
    
         
            +
              protected
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
      
 116 
     | 
    
         
            +
              ###
         
     | 
| 
      
 117 
     | 
    
         
            +
              # each_group
         
     | 
| 
      
 118 
     | 
    
         
            +
              #
         
     | 
| 
      
 119 
     | 
    
         
            +
              # yield each group name in turn
         
     | 
| 
      
 120 
     | 
    
         
            +
              # in our case, group names are lemmas
         
     | 
| 
      
 121 
     | 
    
         
            +
              #
         
     | 
| 
      
 122 
     | 
    
         
            +
              # also, set object-global variables in such a way
         
     | 
| 
      
 123 
     | 
    
         
            +
              # that the elements of this group can be read
         
     | 
| 
      
 124 
     | 
    
         
            +
              # Yield each group name in turn; in our case group names are
              # lemmas, visited in sorted order.
              #
              # Before each yield the following object-global variables are set
              # for the current lemma:
              # @classfile::  open File with the classifier output for the lemma,
              #               or nil if that file could not be opened
              # @goldreader:: AnswerKeyAccess object for the lemma
              # @all_senses:: result of @target_obj.get_senses(lemma) if multiple
              #               senses may be assigned, else nil
              #
              # @classfile is only valid while the caller's block runs: it is
              # closed and reset to nil as soon as the block returns, so that
              # not one file handle per lemma is left open.
              def each_group()
                # access to classifier output files
                output_dir = fred_dirname(@exp, "output", "tab")

                # iterate through instance files
                @target_obj.get_lemmas().sort().each do |lemma|
                  # progress report
                  $stderr.puts "Evaluating " + lemma if @exp.get("verbose")

                  # file with classification results; nil if there are none
                  @classfile = begin
                    File.new(output_dir + fred_result_filename(lemma))
                  rescue
                    nil
                  end

                  begin
                    # file with answers:
                    # maybe we need to apply a split first
                    @goldreader =
                      if @split_id
                        AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
                      else
                        AnswerKeyAccess.new(@exp, "test", lemma, "r")
                      end

                    # doing multilabel evaluation?
                    # then we need a list of all senses
                    @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

                    yield lemma
                  ensure
                    # release the classifier output file of this lemma
                    if @classfile
                      @classfile.close
                      @classfile = nil
                    end
                  end
                end
              end
         
     | 
| 
      
 172 
     | 
    
         
            +
             
     | 
| 
      
 173 
     | 
    
         
            +
              ###
         
     | 
| 
      
 174 
     | 
    
         
            +
              # each_instance
         
     | 
| 
      
 175 
     | 
    
         
            +
              #
         
     | 
| 
      
 176 
     | 
    
         
            +
              # given a lemma name, yield each instance of this lemma in turn,
         
     | 
| 
      
 177 
     | 
    
         
            +
              # or rather: yield pairs [gold_class(string), assigned_class(string)]
         
     | 
| 
      
 178 
     | 
    
         
            +
              #
         
     | 
| 
      
 179 
     | 
    
         
            +
              # relies on each_group() having set the appropriate readers
         
     | 
| 
      
 180 
     | 
    
         
            +
              # @goldreader and @classfile
         
     | 
| 
      
 181 
     | 
    
         
            +
              def each_instance(lemma) # string: lemma name
         
     | 
| 
      
 182 
     | 
    
         
            +
             
     | 
| 
      
 183 
     | 
    
         
            +
                # watch out for repeated instances
         
     | 
| 
      
 184 
     | 
    
         
            +
                # which may occur if handle_multilabel = repeat.
         
     | 
| 
      
 185 
     | 
    
         
            +
                # Only yield them once to avoid re-evaluating multi-label instances
         
     | 
| 
      
 186 
     | 
    
         
            +
                #
         
     | 
| 
      
 187 
     | 
    
         
            +
                # instance_ids_seen: hash target_ids -> true/nil
         
     | 
| 
      
 188 
     | 
    
         
            +
                instance_ids_seen = Hash.new()
         
     | 
| 
      
 189 
     | 
    
         
            +
             
     | 
| 
      
 190 
     | 
    
         
            +
                # read gold file and classifier output file in parallel
         
     | 
| 
      
 191 
     | 
    
         
            +
                @goldreader.each { |lemma, pos, target_ids, sid, senses_gold, transformed_gold_senses|
         
     | 
| 
      
 192 
     | 
    
         
            +
             
     | 
| 
      
 193 
     | 
    
         
            +
                  # classline: format
         
     | 
| 
      
 194 
     | 
    
         
            +
                  # (label confidence)*
         
     | 
| 
      
 195 
     | 
    
         
            +
                  # such that the label with the highest confidence is first
         
     | 
| 
      
 196 
     | 
    
         
            +
                  classline = nil
         
     | 
| 
      
 197 
     | 
    
         
            +
                  if @classfile
         
     | 
| 
      
 198 
     | 
    
         
            +
                    classline = @classfile.gets()
         
     | 
| 
      
 199 
     | 
    
         
            +
                  end
         
     | 
| 
      
 200 
     | 
    
         
            +
                  if classline.nil?
         
     | 
| 
      
 201 
     | 
    
         
            +
            	classline = ""
         
     | 
| 
      
 202 
     | 
    
         
            +
                  end
         
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
                  # $stderr.puts "HIER0 #{classline} #{@classfile.nil?}"
         
     | 
| 
      
 205 
     | 
    
         
            +
             
     | 
| 
      
 206 
     | 
    
         
            +
                  # have we done this same instance previously?
         
     | 
| 
      
 207 
     | 
    
         
            +
                  if instance_ids_seen[target_ids]
         
     | 
| 
      
 208 
     | 
    
         
            +
                    next
         
     | 
| 
      
 209 
     | 
    
         
            +
                  end
         
     | 
| 
      
 210 
     | 
    
         
            +
                  # instance not seen previously, but mark as seen now.
         
     | 
| 
      
 211 
     | 
    
         
            +
                  instance_ids_seen[target_ids] = true
         
     | 
| 
      
 212 
     | 
    
         
            +
             
     | 
| 
      
 213 
     | 
    
         
            +
                  # determine all assigned senses and their confidence levels
         
     | 
| 
      
 214 
     | 
    
         
            +
                  # determine all sense/confidence pairs
         
     | 
| 
      
 215 
     | 
    
         
            +
                  # senses assigned: list of pairs [senselist, confidence]
         
     | 
| 
      
 216 
     | 
    
         
            +
                  # where senselist is an array of sense strings
         
     | 
| 
      
 217 
     | 
    
         
            +
                  senses_assigned = Array.new()
         
     | 
| 
      
 218 
     | 
    
         
            +
                  current_sense = nil
         
     | 
| 
      
 219 
     | 
    
         
            +
             
     | 
| 
      
 220 
     | 
    
         
            +
                  classline.split().each_with_index { |entry, index|
         
     | 
| 
      
 221 
     | 
    
         
            +
                    if index % 2 == 0
         
     | 
| 
      
 222 
     | 
    
         
            +
                      # we have a sense label
         
     | 
| 
      
 223 
     | 
    
         
            +
                      if @handle_multilabel == "join"
         
     | 
| 
      
 224 
     | 
    
         
            +
                        # split up joined senses
         
     | 
| 
      
 225 
     | 
    
         
            +
                        current_sense = fred_split_sense(entry)
         
     | 
| 
      
 226 
     | 
    
         
            +
                      else
         
     | 
| 
      
 227 
     | 
    
         
            +
                        current_sense = [entry]
         
     | 
| 
      
 228 
     | 
    
         
            +
                      end
         
     | 
| 
      
 229 
     | 
    
         
            +
             
     | 
| 
      
 230 
     | 
    
         
            +
                    else
         
     | 
| 
      
 231 
     | 
    
         
            +
                      # we have a confidence level
         
     | 
| 
      
 232 
     | 
    
         
            +
                      senses_assigned << [current_sense, entry.to_f()]
         
     | 
| 
      
 233 
     | 
    
         
            +
                    end
         
     | 
| 
      
 234 
     | 
    
         
            +
                  }
         
     | 
| 
      
 235 
     | 
    
         
            +
             
     | 
| 
      
 236 
     | 
    
         
            +
             
     | 
| 
      
 237 
     | 
    
         
            +
                  if @threshold
         
     | 
| 
      
 238 
     | 
    
         
            +
                    # multiple senses assigned, and
         
     | 
| 
      
 239 
     | 
    
         
            +
                    # regard as assigned everything above a given threshold
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
      
 241 
     | 
    
         
            +
                    # transform senses_assigned: 
         
     | 
| 
      
 242 
     | 
    
         
            +
                    # in the case of "join", one sense may have several confidence levels,
         
     | 
| 
      
 243 
     | 
    
         
            +
                    # one on its own and one in a joined sense
         
     | 
| 
      
 244 
     | 
    
         
            +
                    senses_assigned_hash = Hash.new()
         
     | 
| 
      
 245 
     | 
    
         
            +
                    senses_assigned.each { |senses, confidence|
         
     | 
| 
      
 246 
     | 
    
         
            +
                      senses.each { |s|
         
     | 
| 
      
 247 
     | 
    
         
            +
                        # assign to each sense the maximum of its previous confidence
         
     | 
| 
      
 248 
     | 
    
         
            +
                        # and this one.
         
     | 
| 
      
 249 
     | 
    
         
            +
                        # watch out: confidence may be smaller than zero
         
     | 
| 
      
 250 
     | 
    
         
            +
                        if senses_assigned_hash[s]
         
     | 
| 
      
 251 
     | 
    
         
            +
                          senses_assigned_hash[s] = [senses_assigned_hash[s], confidence].max()
         
     | 
| 
      
 252 
     | 
    
         
            +
                        else
         
     | 
| 
      
 253 
     | 
    
         
            +
                          senses_assigned_hash[s] = confidence
         
     | 
| 
      
 254 
     | 
    
         
            +
                        end
         
     | 
| 
      
 255 
     | 
    
         
            +
                      }
         
     | 
| 
      
 256 
     | 
    
         
            +
                    }
         
     | 
| 
      
 257 
     | 
    
         
            +
             
     | 
| 
      
 258 
     | 
    
         
            +
                    # select all sense/confidence pairs where confidence is above threshold
         
     | 
| 
      
 259 
     | 
    
         
            +
                    senses_assigned = senses_assigned_hash.to_a().select { |sense, confidence|
         
     | 
| 
      
 260 
     | 
    
         
            +
                      confidence >= @threshold
         
     | 
| 
      
 261 
     | 
    
         
            +
                    }.map { |sense, confidence| 
         
     | 
| 
      
 262 
     | 
    
         
            +
                      # then retain only the sense, not the confidence
         
     | 
| 
      
 263 
     | 
    
         
            +
                      sense 
         
     | 
| 
      
 264 
     | 
    
         
            +
                    }
         
     | 
| 
      
 265 
     | 
    
         
            +
             
     | 
| 
      
 266 
     | 
    
         
            +
             
     | 
| 
      
 267 
     | 
    
         
            +
                    unless @all_senses
         
     | 
| 
      
 268 
     | 
    
         
            +
                      raise "Shouldn't be here"
         
     | 
| 
      
 269 
     | 
    
         
            +
                    end
         
     | 
| 
      
 270 
     | 
    
         
            +
             
     | 
| 
      
 271 
     | 
    
         
            +
                    # for each sense out of the list of all senses:
         
     | 
| 
      
 272 
     | 
    
         
            +
                    # yield a pair of [applies, has been assigned]
         
     | 
| 
      
 273 
     | 
    
         
            +
                    # both 'applies' and 'has been assigned' will be
         
     | 
| 
      
 274 
     | 
    
         
            +
                    # a string of either 'true' or 'false'
         
     | 
| 
      
 275 
     | 
    
         
            +
                    # assignment is accurate if both are the same
         
     | 
| 
      
 276 
     | 
    
         
            +
                    @all_senses.each { |sense_of_lemma|
         
     | 
| 
      
 277 
     | 
    
         
            +
                      gold_class = (senses_gold.include? sense_of_lemma).to_s()
         
     | 
| 
      
 278 
     | 
    
         
            +
                      assigned_class = (senses_assigned.include? sense_of_lemma).to_s()
         
     | 
| 
      
 279 
     | 
    
         
            +
                      yield [gold_class, assigned_class]
         
     | 
| 
      
 280 
     | 
    
         
            +
                    }
         
     | 
| 
      
 281 
     | 
    
         
            +
             
     | 
| 
      
 282 
     | 
    
         
            +
             
     | 
| 
      
 283 
     | 
    
         
            +
                  else
         
     | 
| 
      
 284 
     | 
    
         
            +
                    # regard only one sense as assigned at a time
         
     | 
| 
      
 285 
     | 
    
         
            +
                    # count as correct if the list of gold classes
         
     | 
| 
      
 286 
     | 
    
         
            +
                    # contains the main assigned class
         
     | 
| 
      
 287 
     | 
    
         
            +
                    # (relatively lenient evaluation)
         
     | 
| 
      
 288 
     | 
    
         
            +
             
     | 
| 
      
 289 
     | 
    
         
            +
                    # actually assigned class: only the one with the 
         
     | 
| 
      
 290 
     | 
    
         
            +
                    # maximum confidence
         
     | 
| 
      
 291 
     | 
    
         
            +
                    # $stderr.puts "HIER5 #{senses_assigned.length()}"
         
     | 
| 
      
 292 
     | 
    
         
            +
             
     | 
| 
      
 293 
     | 
    
         
            +
                    if senses_assigned.empty?
         
     | 
| 
      
 294 
     | 
    
         
            +
                      # nothing to yield
         
     | 
| 
      
 295 
     | 
    
         
            +
                    else
         
     | 
| 
      
 296 
     | 
    
         
            +
             
     | 
| 
      
 297 
     | 
    
         
            +
                      max_senselist = senses_assigned.max { |a, b|
         
     | 
| 
      
 298 
     | 
    
         
            +
                        a.last() <=> b.last()
         
     | 
| 
      
 299 
     | 
    
         
            +
                      }.first()
         
     | 
| 
      
 300 
     | 
    
         
            +
                      
         
     | 
| 
      
 301 
     | 
    
         
            +
             
     | 
| 
      
 302 
     | 
    
         
            +
                      max_senselist.each { |single_sense|
         
     | 
| 
      
 303 
     | 
    
         
            +
                        gold_class = (senses_gold.include? single_sense).to_s()
         
     | 
| 
      
 304 
     | 
    
         
            +
                        yield [gold_class, "true"]
         
     | 
| 
      
 305 
     | 
    
         
            +
                      }
         
     | 
| 
      
 306 
     | 
    
         
            +
                    end
         
     | 
| 
      
 307 
     | 
    
         
            +
             
     | 
| 
      
 308 
     | 
    
         
            +
                  end
         
     | 
| 
      
 309 
     | 
    
         
            +
                }
         
     | 
| 
      
 310 
     | 
    
         
            +
              end
         
     | 
| 
      
 311 
     | 
    
         
            +
             
     | 
| 
      
 312 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,321 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ###
            # FredFeatureInfo
            #
            # Registry of feature extractor classes, plus the per-experiment
            # selection of those extractors that the experiment file asks for.
            class FredFeatureInfo
              # class variable: every extractor class registered via add_feature()
              @@extractors = []

              # class variable: becomes true once the first FredFeatureInfo object
              # has been built, so "unknown extractor" warnings appear only once
              @@warned = false

              ###
              # register an extractor class
              #
              # class_name: Class object responding to feature_name()
              #
              # returns: the list of all registered extractor classes
              def self.add_feature(class_name)
                @@extractors.push(class_name)
              end

              ###
              # exp: experiment file object; exp.get_lf("feature") must yield
              #      entries of the form [feature group designator, option, ...]
              #
              # Collects, in experiment-file order, the registered extractor
              # classes whose feature_name() equals a requested designator.
              # Designators without a registered extractor are skipped.
              def initialize(exp)
                @exp = exp
                @features = []

                # the options are not needed here:
                # the feature extractors fetch their own options from exp
                exp.get_lf("feature").each do |wanted_name, *_options|
                  match = @@extractors.find { |candidate| candidate.feature_name == wanted_name }

                  if match
                    @features << match
                  elsif !@@warned
                    $stderr.puts "Warning: Could not find a feature extractor for #{wanted_name}: skipping."
                  end
                end

                # do not print warnings again if another FredFeatureInfo object is made
                @@warned = true
              end

              ###
              # get_extractor_objects
              #
              # returns: a list with one freshly constructed extractor object
              # per selected extractor class, each initialized with the
              # experiment file object
              def get_extractor_objects
                @features.map { |extractor_class| extractor_class.new(@exp) }
              end
            end
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
            ##################################
         
     | 
| 
      
 61 
     | 
    
         
            +
            ###
            # FredFeatureExtractor
            #
            # Abstract base class of all Fred feature extractors.
            # Subclasses override feature_name() and each_feature(), and call
            # announce_me() in their class body to register with FredFeatureInfo.
            class FredFeatureExtractor
              ###
              # feature name:
              # name by which you choose this feature
              # in the experiment file
              #
              # raises: RuntimeError unless overridden by the subclass
              def FredFeatureExtractor.feature_name()
                raise "Overwrite me."
              end

              ###
              # initialize with Fred experiment file object
              def initialize(exp)
                @exp = exp
              end

              ###
              # compute features from meta-features
              #
              # argument: hash
              # metafeature_label -> metafeatures
              #  string -> array:string
              #
              # yields each feature as a string
              #
              # raises: RuntimeError unless overridden by the subclass
              def each_feature(feature_hash)
                raise "overwrite me"
              end

              ###
              # announce_me
              #
              # Registers the receiving class (normally a subclass) with
              # FredFeatureInfo, if that class has been loaded.
              #
              # This is a singleton method, so a preceding visibility keyword
              # such as 'protected' would not apply to it; it stays public, and
              # subclasses call it with an explicit receiver in their class body.
              #
              # returns: the result of FredFeatureInfo.add_feature, or nil
              # if there is no FredFeatureInfo class
              def FredFeatureExtractor.announce_me()
                # Module.constants returns Symbols on Ruby 1.9 and later, so a
                # membership test against the String "FredFeatureInfo" never
                # succeeds there; const_defined? works on old and new Rubies.
                if Object.const_defined?(:FredFeatureInfo)
                  # yup, we have a class to which we can announce ourselves.
                  # Pass the class object itself: no eval of the class name
                  # needed, and anonymous classes work too.
                  FredFeatureInfo.add_feature(self)
                else
                  # no interface collector class: nothing to register with
                  nil
                end
              end
            end
         
     | 
| 
      
 102 
     | 
    
         
            +
             
     | 
| 
      
 103 
     | 
    
         
            +
            #####
         
     | 
| 
      
 104 
     | 
    
         
            +
            # context feature
         
     | 
| 
      
 105 
     | 
    
         
            +
            ###
            # FredContextFeatureExtractor
            #
            # Context feature: one feature per context word, consisting of the
            # context size label followed by field 2 of the metafeature.
            class FredContextFeatureExtractor < FredFeatureExtractor
              self.announce_me()

              ###
              # name of this feature in the experiment file
              def self.feature_name
                "context"
              end

              ###
              # exp: Fred experiment file object
              def initialize(exp)
                super(exp)

                # @cxsizes: metafeature labels ("CX" + context size) of all
                # context sizes chosen as features; a hash for fast lookup
                @cxsizes = {}
                @exp.get_lf("feature", "context").each do |size|
                  @cxsizes["CX" + size.to_s] = true
                end
              end

              ###
              # feature_hash: metafeature_label -> metafeatures
              #               string -> array:string
              #
              # yields each feature as a string:
              # metafeature label + field 2 of the '#'-separated metafeature
              #
              # returns: feature_hash
              def each_feature(feature_hash)
                feature_hash.each do |label, values|
                  # only context sizes chosen by the user are featurized
                  next unless @cxsizes[label]

                  values.each do |value|
                    # skip metafeatures containing five '#' in a row
                    next if value =~ /#####/

                    # metafeature layout: grf#word#lemma#pos#ne, lemma at index 2
                    yield label + value.split("#")[2]
                  end
                end
              end
            end
         
     | 
| 
      
 143 
     | 
    
         
            +
             
     | 
| 
      
 144 
     | 
    
         
            +
            #####
         
     | 
| 
      
 145 
     | 
    
         
            +
            # context feature: POS separately, small contexts only 
         
     | 
| 
      
 146 
     | 
    
         
            +
            ###
            # FredContextPOSFeatureExtractor
            #
            # Context feature: POS separately, small contexts only
            # (context sizes of at most 10).
            class FredContextPOSFeatureExtractor < FredFeatureExtractor
              self.announce_me()

              ###
              # name of this feature in the experiment file
              def self.feature_name
                "context_pos"
              end

              ###
              # exp: Fred experiment file object
              #
              # Prints a warning to stderr if no chosen context size is <= 10.
              def initialize(exp)
                super(exp)

                # @cxsizes: metafeature labels ("CX" + context size) of the
                # chosen context sizes up to 10; a hash for fast lookup
                @cxsizes = {}
                @exp.get_lf("feature", "context").each do |size|
                  next unless size <= 10

                  @cxsizes["CX" + size.to_s] = true
                end

                return unless @cxsizes.empty?

                $stderr.puts "context_pos feature warning: will not be computed"
                $stderr.puts "as there is no context of size <= 10"
              end

              ###
              # feature_hash: metafeature_label -> metafeatures
              #               string -> array:string
              #
              # yields each feature as a string:
              # "POS" + metafeature label + field 2 of the '#'-separated metafeature
              #
              # returns: feature_hash
              def each_feature(feature_hash)
                # metafeature layout as documented for this class: word#lemma#pos#ne
                # NOTE(review): FredContextFeatureExtractor in this file documents
                # the layout as grf#word#lemma#pos#ne and reads the lemma at
                # index 2 - confirm which layout is current and whether index 2
                # is still the POS field here.
                feature_hash.each do |label, values|
                  # only small context sizes chosen by the user are featurized
                  next unless @cxsizes[label]

                  values.each { |value| yield "POS" + label + value.split("#")[2] }
                end
              end
            end
         
     | 
| 
      
 189 
     | 
    
         
            +
             
     | 
| 
      
 190 
     | 
    
         
            +
            #####
         
     | 
| 
      
 191 
     | 
    
         
            +
            # bigram/trigram feature
         
     | 
| 
      
 192 
     | 
    
         
            +
            class FredNgramFeatureExtractor < FredFeatureExtractor
         
     | 
| 
      
 193 
     | 
    
         
            +
              FredNgramFeatureExtractor.announce_me()
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
              def FredNgramFeatureExtractor.feature_name()
         
     | 
| 
      
 196 
     | 
    
         
            +
                return "ngram"
         
     | 
| 
      
 197 
     | 
    
         
            +
              end
         
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
      
 199 
     | 
    
         
            +
              ###
         
     | 
| 
      
 200 
     | 
    
         
            +
              def initialize(exp)
         
     | 
| 
      
 201 
     | 
    
         
            +
                super(exp)
         
     | 
| 
      
 202 
     | 
    
         
            +
             
     | 
| 
      
 203 
     | 
    
         
            +
                # cxsize: context size from which the ngram feature will be computed
         
     | 
| 
      
 204 
     | 
    
         
            +
                # encoded in metafeature labels
         
     | 
| 
      
 205 
     | 
    
         
            +
                # written in a hash for fast access
         
     | 
| 
      
 206 
     | 
    
         
            +
                @cxsize = @exp.get_lf("feature", "context").detect { |cxsize|
         
     | 
| 
      
 207 
     | 
    
         
            +
                  cxsize >= 2
         
     | 
| 
      
 208 
     | 
    
         
            +
                }
         
     | 
| 
      
 209 
     | 
    
         
            +
                unless @cxsize
         
     | 
| 
      
 210 
     | 
    
         
            +
                  $stderr.puts "Warning: no context of size >= 2, so"
         
     | 
| 
      
 211 
     | 
    
         
            +
                  $stderr.puts "no ngram feature computed."
         
     | 
| 
      
 212 
     | 
    
         
            +
                end
         
     | 
| 
      
 213 
     | 
    
         
            +
              end
         
     | 
| 
      
 214 
     | 
    
         
            +
              
         
     | 
| 
      
 215 
     | 
    
         
            +
              ###
         
     | 
| 
      
 216 
     | 
    
         
            +
              # Yields the ngram features (bigrams and trigrams of lemmas and
              # of POS tags) computed from the context metafeature whose label
              # is "CX" followed by @cxsize.
              #
              # feature_hash: pairs of metafeature label and list of values.
              #   Each context value is a string of the form word#lemma#pos#ne.
              #
              # Yields one string per ngram: a label (BLEM, BPOS, TLEM, TPOS)
              # directly followed by the concatenated lemmas or POS tags.
              # An ngram is skipped if any of its positions is missing.
              #
              # Returns feature_hash.
              def each_feature(feature_hash)
                # No context of size >= 2 is available (a warning was printed
                # when @cxsize was set), so no ngram feature is computed.
                # Without this guard a metafeature labelled just "CX" would
                # lead to nil + Integer below.
                return feature_hash unless @cxsize

                # word#lemma#pos#ne
                lemma_index = 1
                pos_index = 2

                # |fvalues| = 2*cxsize, that is, cxsize describes
                # the length of a one-sided context window.
                # The bigram of features around the target
                # concerns fvalues[cxsize-1] and fvalues[cxsize];
                # the trigram of two words before, one word after includes
                # fvalues[cxsize-2], fvalues[cxsize-1] and fvalues[cxsize].
                #
                # Each entry: offsets relative to fvalues[cxsize],
                # feature label, index of the field to extract.
                ngram_specs = [
                  [[-1, 0], "BLEM", lemma_index],     # bigram of lemmas
                  [[-1, 0], "BPOS", pos_index],       # bigram of POSs
                  [[-2, -1, 0], "TLEM", lemma_index], # trigram of lemmas
                  [[-2, -1, 0], "TPOS", pos_index]    # trigram of POSs
                ]

                # label of the one metafeature the ngrams are computed from
                cx_label = "CX" + @cxsize.to_s

                feature_hash.each do |ftype, fvalues|
                  next unless ftype == cx_label

                  ngram_specs.each do |f_indices, label, subindex|
                    fs = f_indices.map { |i| fvalues[@cxsize + i] }.compact
                    # only proceed if entries were found for all the given indices
                    next unless fs.length == f_indices.length

                    yield label + fs.map { |f| f.split("#")[subindex] }.join
                  end
                end
              end
         
     | 
| 
      
 246 
     | 
    
         
            +
            end
         
     | 
| 
      
 247 
     | 
    
         
            +
             
     | 
| 
      
 248 
     | 
    
         
            +
             
     | 
| 
      
 249 
     | 
    
         
            +
            #####
         
     | 
| 
      
 250 
     | 
    
         
            +
            # syntax feature
         
     | 
| 
      
 251 
     | 
    
         
            +
            # Extractor for the "syntax" feature: for each value of the
            # syntactic metafeatures CH, PA and SI it yields the metafeature
            # label followed by the value's grf field.
            class FredSynFeatureExtractor < FredFeatureExtractor
              FredSynFeatureExtractor.announce_me()

              # name under which this feature is known
              def self.feature_name
                "syntax"
              end

              ###
              # feature_hash: pairs of metafeature label and list of values,
              #   each value being a "#"-separated string.
              #
              # Yields ftype + grf for every value of a CH, PA or SI
              # metafeature; all other metafeatures are ignored.
              def each_feature(feature_hash)
                feature_hash.each do |ftype, fvalues|
                  # position of the grf field within a value:
                  # first field for CH and PA,
                  # second field for SI (parentlemma#grf#word#lemma#pos#ne)
                  grf_index =
                    case ftype
                    when "CH", "PA" then 0
                    when "SI" then 1
                    end

                  # not a syntactic metafeature
                  next if grf_index.nil?

                  fvalues.each { |f| yield ftype + f.split("#")[grf_index] }
                end
              end
            end
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
             
     | 
| 
      
 287 
     | 
    
         
            +
             
     | 
| 
      
 288 
     | 
    
         
            +
             
     | 
| 
      
 289 
     | 
    
         
            +
            #####
         
     | 
| 
      
 290 
     | 
    
         
            +
            # syntax-plus-headword feature
         
     | 
| 
      
 291 
     | 
    
         
            +
            # Extractor for the "synsem" feature: for each value of the
            # syntactic metafeatures CH, PA and SI it yields the metafeature
            # label, the marker "SEM" and the value's fields from grf onwards.
            class FredSynsemFeatureExtractor < FredFeatureExtractor
              FredSynsemFeatureExtractor.announce_me()

              # name under which this feature is known
              def self.feature_name
                "synsem"
              end

              ###
              # feature_hash: pairs of metafeature label and list of values,
              #   each value being a "#"-separated string.
              #
              # Yields ftype + "SEM" + description for every value of a CH, PA
              # or SI metafeature; all other metafeatures are ignored.
              def each_feature(feature_hash)
                feature_hash.each do |ftype, fvalues|
                  case ftype
                  when "CH", "PA"
                    # grf#word#lemma#pos#ne: used as is
                    fvalues.each { |f| yield ftype + "SEM" + f }

                  when "SI"
                    # parentlemma#grf#word#lemma#pos#ne:
                    # drop the leading parent lemma, keep the rest
                    fvalues.each do |f|
                      without_parent = f.split("#")[1..-1].join("#")
                      yield ftype + "SEM" + without_parent
                    end
                  end
                  # anything else is not a syntax feature and yields nothing
                end
              end
            end
         
     |