frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,144 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # FredTrain
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Katrin Erk April 05
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Frame disambiguation system: train classifiers
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            require "common/ruby_class_extensions"
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            # Shalmaneser packages
         
     | 
| 
      
 10 
     | 
    
         
            +
            require "fred/FredConventions"
         
     | 
| 
      
 11 
     | 
    
         
            +
            require "common/ML"
         
     | 
| 
      
 12 
     | 
    
         
            +
            require "fred/FredDetermineTargets"
         
     | 
| 
      
 13 
     | 
    
         
            +
            require "fred/FredSplitPkg"
         
     | 
| 
      
 14 
     | 
    
         
            +
            require "fred/FredFeatures"
         
     | 
| 
      
 15 
     | 
    
         
            +
            require "fred/FredNumTrainingSenses"
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            class FredTrain

              ###
              # new
              #
              # Evaluate runtime options and announce the task.
              #
              # exp_obj: FredConfigData object
              # options: hash: runtime option name (string) => value (string).
              #          Only "--logID" is evaluated here; it selects the split to train on.
              #
              # Exits with status 1 if the list of known targets cannot be read.
              # Raises a RuntimeError if the experiment file specifies no classifier.
              def initialize(exp_obj, # FredConfigData object
                             options) # hash: runtime option name (string) => value(string)

                in_enduser_mode_unavailable()

                @exp = exp_obj

                # evaluate runtime options
                @split_id = nil

                options.each_pair { |opt, arg|
                  case opt
                  when "--logID"
                    @split_id = arg
                  else
                    # case of unknown arguments has been dealt with by fred.rb
                  end
                }

                # announce the task
                $stderr.puts "---------"
                $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Training classifiers"
                if @split_id
                  $stderr.puts " using split with ID #{@split_id}"
                else
                  $stderr.puts
                end
                $stderr.puts "---------"

                # make an object that can list lemmas and their senses
                @lemmas_and_senses_obj = Targets.new(@exp, nil, "r")
                unless @lemmas_and_senses_obj.targets_okay
                  # error during initialization
                  $stderr.puts "Error: Could not read list of known targets, bailing out."
                  exit 1
                end

                ###
                # start objects for the different classifier types

                # get_lf returns: array of pairs [classifier_name, options[array]]
                #
                # @classifiers: list of pairs [Classifier object, classifier name(string)]
                @classifiers = @exp.get_lf("classifier").map { |classif_name, classif_options|
                  [Classifier.new(classif_name, classif_options), classif_name]
                }
                # sanity check: we need at least one classifier
                if @classifiers.empty?
                  raise "I need at least one classifier, please specify using exp. file option 'classifier'"
                end

                # Object for listing the senses of each lemma, used by compute().
                # Reuse the Targets object validated above instead of building a
                # second, unchecked one with the very same arguments.
                @lemmas_and_senses = @lemmas_and_senses_obj
              end

              ###
              # compute
              #
              # Do the training: for each training feature file whose lemma has more
              # than one training sense, train every configured classifier and write
              # it below the classifier directory. Lemmas with exactly one sense need
              # no classifier; lemmas without senses are reported on stderr.
              def compute()

                # make split object only if we are training on a split
                split_obj = @split_id ? FredSplitPkg.new(@exp) : nil

                classif_dir = fred_classifier_directory(@exp, @split_id)

                # iterate through instance files
                FredFeatureAccess.each_feature_file(@exp, "train") { |filename, values|
                  # progress report
                  if @exp.get("verbose")
                    $stderr.puts "Training on " + values["lemma"]
                  end

                  # only one sense? then just assign that
                  num_senses = determine_training_senses(values["lemma"], @exp,
                                                         @lemmas_and_senses,
                                                         @split_id).length()

                  if num_senses > 1
                    # more than one sense: train
                    # if we're splitting the data, do that now
                    tempfile = nil
                    if split_obj
                      tempfile = split_obj.apply_split(filename, values["lemma"], "train", @split_id)

                      if tempfile.nil?
                        # the training part of the split doesn't contain any data
                        $stderr.puts "Skipping #{values["lemma"]}: no training data in split"
                        next
                      end

                      # train on the split's temporary file instead of the full feature file
                      filename = tempfile.path()
                    end

                    begin
                      @classifiers.each { |classifier, classifier_name|
                        # where do we write the classifier?
                        output_name = classif_dir + fred_classifier_filename(classifier_name,
                                                                             values["lemma"],
                                                                             values["sense"])
                        # progress report
                        $stderr.puts "FRED: Writing classifier #{output_name}"

                        classifier.train(filename, output_name)
                      } # each classifier
                    ensure
                      # remove the split's temporary file even if training raised
                      tempfile.close(true) if tempfile
                    end

                  elsif num_senses == 1
                    # only one sense: no need to write a training file
                  else
                    $stderr.puts "Error: no senses for lemma #{values["lemma"]}"
                  end

                } # each feature file
              end
            end
         
     | 
| 
         @@ -0,0 +1,480 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "tempfile"
         
     | 
| 
      
 2 
     | 
    
         
            +
            require "StandardPkgExtensions"
         
     | 
| 
      
 3 
     | 
    
         
            +
            # Mix the EnumerableBool helpers into every Array.
            # NOTE(review): EnumerableBool is presumably defined by the
            # "StandardPkgExtensions" require above -- confirm that file is shipped.
            Array.send(:include, EnumerableBool)
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            module PlotAndREval
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
              ############
         
     | 
| 
      
 10 
     | 
    
         
            +
              # given a set of mappings x_axis_value -> y_axis_value,
         
     | 
| 
      
 11 
     | 
    
         
            +
              # plot them all within the same gnuplot graph
         
     | 
| 
      
 12 
     | 
    
         
            +
              #
         
     | 
| 
      
 13 
     | 
    
         
            +
              # scores: 
         
     | 
| 
      
 14 
     | 
    
         
            +
              # either hash: score_label(string) -> hash x_axis(float) -> y_axis(float)
         
     | 
| 
      
 15 
     | 
    
         
            +
              # or hash: score_label(string) -> array [x_axis(float), y_axis(float)]
         
     | 
| 
      
 16 
     | 
    
         
            +
              # Plot several score series into one gnuplot graph (PostScript output).
              #
              # scores: hash score_label -> x/y mapping, where the mapping is either
              #         a hash x_axis -> y_axis or an array of [x_axis, y_axis] pairs.
              #         Pairs are written sorted by x value.
              #
              # Writes one temporary data file per score label plus a temporary
              # gnuplot command file, runs the external "gnuplot" program on the
              # command file and removes the temporary files afterwards.
              #
              # Returns whatever gnuplot printed on standard output (string).
              def PlotAndREval.gnuplot_direct(scores,
                                              title,      # string: title for output files
                                              x_name,     # string: label for x axis
                                              y_name,     # string: label for y axis
                                              plotoutfile, # string: name of gnuplot output file
                                              data_style = "linespoints") # data style

                # score_label -> Tempfile holding that label's "x y" lines
                score_file = Hash.new
                gf = nil

                # for each score label: write x_axis/y_axis pairs to a separate tempfile
                scores.each_pair { |score_label, score_values|
                  datafile = Tempfile.new("PlotAndREval")
                  score_file[score_label] = datafile
                  score_values.to_a.sort { |a, b| a.first <=> b.first }.each { |x_val, y_val|
                    datafile.puts "#{x_val} #{y_val}"
                  }
                  # flush to disk; the path stays valid until the file is unlinked
                  datafile.close()
                }

                # write command file for gnuplot
                gf = Tempfile.new("PlotAndREval")

                gf.puts "set title \"#{title}\""
                gf.puts "set ylabel \"#{y_name}\""
                gf.puts "set xlabel \"#{x_name}\""
                gf.puts "set time"
                # "set data style" is deprecated syntax that current gnuplot rejects
                gf.puts "set style data #{data_style}"
                gf.puts "set grid"
                # gnuplot recommends choosing the terminal before the output file
                gf.puts "set terminal postscript color"
                gf.puts "set output \"#{plotoutfile}\""

                # plot "<filename>" title "<title>", "<filename>" title "<title>",...
                # (interpolation also accepts non-string score labels)
                plot_args = score_file.map { |score_label, datafile|
                  "\"#{datafile.path()}\" title \"#{score_label}\""
                }
                gf.puts "plot " + plot_args.join(", ")

                # finalize tempfile
                gf.close()

                %x{gnuplot #{gf.path()}}
              ensure
                # gnuplot is done (or we failed): remove all temporary files now
                # instead of waiting for garbage collection
                score_file.each_value { |datafile| datafile.close(true) } if score_file
                gf.close(true) if gf
              end
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
              #################
         
     | 
| 
      
 58 
     | 
    
         
            +
              # Given a list of pairs [x, y], 
         
     | 
| 
      
 59 
     | 
    
         
            +
              # group them into N bins (by splitting the range from min score to max score)
         
     | 
| 
      
 60 
     | 
    
         
            +
              # compute the average y for each x bin, and plot
         
     | 
| 
      
 61 
     | 
    
         
            +
              def PlotAndREval.gnuplot_average(scores, # array of pairs [x(float), y(float)
         
     | 
| 
      
 62 
     | 
    
         
            +
                                               title,  # string: title for output file
         
     | 
| 
      
 63 
     | 
    
         
            +
                                               x_label, # label for x axis
         
     | 
| 
      
 64 
     | 
    
         
            +
                                               y_label, # label for y axis
         
     | 
| 
      
 65 
     | 
    
         
            +
                                               plotoutfile, # string: name of gnuplot output file
         
     | 
| 
      
 66 
     | 
    
         
            +
                                               min_value, # float: minimum value
         
     | 
| 
      
 67 
     | 
    
         
            +
                                               bin_size) # float: size of one bin
         
     | 
| 
      
 68 
     | 
    
         
            +
             
     | 
| 
      
 69 
     | 
    
         
            +
                # sort scores into bins
         
     | 
| 
      
 70 
     | 
    
         
            +
                bin = Hash.new()
         
     | 
| 
      
 71 
     | 
    
         
            +
                
         
     | 
| 
      
 72 
     | 
    
         
            +
                scores.each { |xval, yval|
         
     | 
| 
      
 73 
     | 
    
         
            +
                  bin_no = (xval - min_value / bin_size).floor
         
     | 
| 
      
 74 
     | 
    
         
            +
                  unless bin[bin_no]
         
     | 
| 
      
 75 
     | 
    
         
            +
                    bin[bin_no] = Array.new
         
     | 
| 
      
 76 
     | 
    
         
            +
                  end
         
     | 
| 
      
 77 
     | 
    
         
            +
                  bin[bin_no] << yval
         
     | 
| 
      
 78 
     | 
    
         
            +
                }
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
                # print average for each bin to temp infile for gnuplot
         
     | 
| 
      
 81 
     | 
    
         
            +
                tf = Tempfile.new("plot_and_r")
         
     | 
| 
      
 82 
     | 
    
         
            +
                
         
     | 
| 
      
 83 
     | 
    
         
            +
                bin.keys.sort.each { |bin_no|
         
     | 
| 
      
 84 
     | 
    
         
            +
                  if bin[bin_no].length() > 0
         
     | 
| 
      
 85 
     | 
    
         
            +
                    avg = (bin[bin_no].big_sum(0.0) { |yval| yval }) / bin[bin_no].length().to_f
         
     | 
| 
      
 86 
     | 
    
         
            +
                  else
         
     | 
| 
      
 87 
     | 
    
         
            +
                    avg = 0.0
         
     | 
| 
      
 88 
     | 
    
         
            +
                  end
         
     | 
| 
      
 89 
     | 
    
         
            +
                  val = min_value + (bin_no.to_f * bin_size)
         
     | 
| 
      
 90 
     | 
    
         
            +
                  tf.print val, "\t", avg, "\n"
         
     | 
| 
      
 91 
     | 
    
         
            +
                }
         
     | 
| 
      
 92 
     | 
    
         
            +
                tf.close()
         
     | 
| 
      
 93 
     | 
    
         
            +
                
         
     | 
| 
      
 94 
     | 
    
         
            +
                # make gnuplot main infile
         
     | 
| 
      
 95 
     | 
    
         
            +
                gf = Tempfile.new("plot_and_r")
         
     | 
| 
      
 96 
     | 
    
         
            +
                gf.puts "set title \"#{title}\""
         
     | 
| 
      
 97 
     | 
    
         
            +
                gf.puts "set ylabel \"#{y_label}\""
         
     | 
| 
      
 98 
     | 
    
         
            +
                gf.puts "set xlabel \"#{x_label}\""
         
     | 
| 
      
 99 
     | 
    
         
            +
                gf.puts "set time"
         
     | 
| 
      
 100 
     | 
    
         
            +
                gf.puts "set data style linespoints"
         
     | 
| 
      
 101 
     | 
    
         
            +
                gf.puts "set grid"
         
     | 
| 
      
 102 
     | 
    
         
            +
                gf.puts "set output \"" + plotoutfile + "\""
         
     | 
| 
      
 103 
     | 
    
         
            +
                gf.puts "set terminal postscript color"
         
     | 
| 
      
 104 
     | 
    
         
            +
                gf.print "plot \"#{tf.path()}\" title \"#{y_label}\""
         
     | 
| 
      
 105 
     | 
    
         
            +
                gf.puts
         
     | 
| 
      
 106 
     | 
    
         
            +
                gf.puts
         
     | 
| 
      
 107 
     | 
    
         
            +
                gf.close()
         
     | 
| 
      
 108 
     | 
    
         
            +
                
         
     | 
| 
      
 109 
     | 
    
         
            +
                # now gnuplot it
         
     | 
| 
      
 110 
     | 
    
         
            +
                %x{gnuplot #{gf.path()}}
         
     | 
| 
      
 111 
     | 
    
         
            +
             
     | 
| 
      
 112 
     | 
    
         
            +
                # and remove temp files
         
     | 
| 
      
 113 
     | 
    
         
            +
                tf.close(true)
         
     | 
| 
      
 114 
     | 
    
         
            +
                gf.close(true)
         
     | 
| 
      
 115 
     | 
    
         
            +
              end
         
     | 
| 
      
 116 
     | 
    
         
            +
              
         
     | 
| 
      
 117 
     | 
    
         
            +
              #################
         
     | 
| 
      
 118 
     | 
    
         
            +
              # given a mapping from labels to scores,
         
     | 
| 
      
 119 
     | 
    
         
            +
              # split the range from min. score to max. score into
         
     | 
| 
      
 120 
     | 
    
         
            +
              # 20 bins, sort the label/score pairs into the bins,
         
     | 
| 
      
 121 
     | 
    
         
            +
              # and gnuplot them as a bar graph of 20 bars.
         
     | 
| 
      
 122 
     | 
    
         
            +
              #
         
     | 
| 
      
 123 
     | 
    
         
            +
              # A title for the graph must be given, and a 
         
     | 
| 
      
 124 
     | 
    
         
            +
              # name for the gnuplot output file.
         
     | 
| 
      
 125 
     | 
    
         
            +
              # If the name of a text output file is given,
         
     | 
| 
      
 126 
     | 
    
         
            +
              # the result is also printed as text.
         
     | 
| 
      
 127 
     | 
    
         
            +
              #
         
     | 
| 
      
 128 
     | 
    
         
            +
              # If minvalue and maxvalue are given, they are used 
         
     | 
| 
      
 129 
     | 
    
         
            +
              # as start and end of the scale instead of the
         
     | 
| 
      
 130 
     | 
    
         
            +
              # min. and max. values from the scores hash.
         
     | 
| 
      
 131 
     | 
    
         
            +
              def PlotAndREval.gnuplot_quantity_chart(scores, # hash:label(string) -> value(float), label->score-mapping
                                                      title,  # string: title for output files
                                                      score_name, # string: what are the scores? (label for x axis, plot legend)
                                                      plotoutfile, # string: name of gnuplot output file
                                                      textoutfile = nil, # string: name of text output file
                                                      minvalue=nil, # float: lower end of the score scale
                                                      maxvalue=nil) # float: upper end of the score scale

                # number of bins (= bars in the chart)
                num_bins = 20

                # determine the scale: use the caller's bounds where given,
                # otherwise the smallest / largest score
                # (0.0 if there are no scores at all, so the arithmetic stays finite)
                minvalue = (scores.values.min || 0.0) if minvalue.nil?
                maxvalue = (scores.values.max || 0.0) if maxvalue.nil?

                interval = (maxvalue - minvalue) / num_bins.to_f
                # degenerate scale (all scores equal, no scores, max < min):
                # fall back to a unit-wide scale instead of dividing by zero below
                unless interval > 0.0
                  interval = 1.0 / num_bins.to_f
                end

                # now compute the number of scores in each interval.
                # Bin i covers [minvalue + i*interval, minvalue + (i+1)*interval)
                num_in_range = Hash.new(0)

                scores.each_value { |score|
                  num = ((score - minvalue) / interval).floor
                  # the upper end of the scale belongs to the last bin,
                  # not to an extra bin that would never be plotted
                  if num >= num_bins && score <= maxvalue
                    num = num_bins - 1
                  end
                  num_in_range[num] += 1
                }

                # text output: document number of scores in each range.
                # The block form of File.open closes the file even on errors.
                if textoutfile
                  File.open(textoutfile, "w") { |textout|
                    textout.puts "-------------------------"
                    textout.puts title
                    textout.puts "-------------------------"

                    num_in_range.keys.sort.each { |rangeno|
                      range_lower = minvalue + interval * rangeno.to_f
                      textout.print "number of values btw. ", sprintf("%.2f", range_lower),
                      " and ", sprintf("%.2f", range_lower + interval), ": ",
                      num_in_range[rangeno], "\n"
                    }
                  }
                end

                # temp files for gnuplot: data file and main infile
                tf = Tempfile.new("plot_and_r")
                gf = Tempfile.new("plot_and_r")

                begin
                  # document number of scores in each range
                  # to temp. infile for gnuplot: one line per bin,
                  # lower bound of the bin and number of scores in it
                  0.upto(num_bins - 1) { |rangeno|
                    range_lower = minvalue + interval * rangeno.to_f
                    tf.print range_lower, "\t", num_in_range[rangeno], "\n"
                  }
                  tf.close()

                  # make gnuplot main infile
                  gf.puts "set title \"" + title+ "\""
                  gf.puts "set ylabel \"num items\""
                  gf.puts "set xlabel \"" + score_name + "\""
                  gf.puts "set time"
                  gf.puts "set data style boxes"
                  gf.puts "set boxwidth " + (interval/2.0).to_s
                  gf.puts "set grid"
                  gf.puts "set output \"" + plotoutfile + "\""
                  gf.puts "set terminal postscript color"
                  gf.print "plot \"" + tf.path() + "\" title \"" + score_name + "\" with boxes"
                  gf.puts
                  gf.puts
                  gf.close()

                  # now gnuplot it
                  %x{gnuplot #{gf.path()}}
                ensure
                  # and remove temp files, also if something went wrong above
                  tf.close(true)
                  gf.close(true)
                end
              end
         
     | 
| 
      
 219 
     | 
    
         
            +
             
     | 
| 
      
 220 
     | 
    
         
            +
             
     | 
| 
      
 221 
     | 
    
         
            +
              #####
         
     | 
| 
      
 222 
     | 
    
         
            +
              # draws a scatter plot comparing two
         
     | 
| 
      
 223 
     | 
    
         
            +
              # mappings from labels to scores
         
     | 
| 
      
 224 
     | 
    
         
            +
              # the first (base) scores are drawn on the x axis,
         
     | 
| 
      
 225 
     | 
    
         
            +
              # the second (comparison) scores are drawn on the y axis.
         
     | 
| 
      
 226 
     | 
    
         
            +
              # The method only looks at labels present in the base score,
         
     | 
| 
      
 227 
     | 
    
         
            +
              # so if a label is present only in the comparison score but not the base score
         
     | 
| 
      
 228 
     | 
    
         
            +
              # it is ignored.
         
     | 
| 
      
 229 
     | 
    
         
            +
              def PlotAndREval.gnuplot_correlation_chart(base_scores, # hash: label(string) -> value(float)
                                                         comparison_scores, # hash: label(string) -> value(float)
                                                         title,  # string: title for output files
                                                         base_name, # string: what are the base scores? (label for x axis)
                                                         comparison_name, # string: what are the comparison scores? (label for y axis)
                                                         plotoutfile, # string: name of gnuplot output file
                                                         textoutfile = nil) # string: name of text output file

                # text output: base score/comparison score pairs
                if textoutfile
                  begin
                    textout = File.new(textoutfile, "w")
                  rescue
                    raise "Couldn't write to " + textoutfile
                  end

                  begin
                    textout.puts "------------------------"
                    textout.puts title
                    textout.puts "------------------------"

                    # one line per base label, sorted by base score in descending order;
                    # labels without a comparison score are marked with "--"
                    base_scores.to_a.sort { |a, b| b.last <=> a.last }.each { |label, score|

                      textout.print label, ": ", base_name, ": ", score, ", ", comparison_name, ": "
                      if comparison_scores[label]
                        textout.print comparison_scores[label], "\n"
                      else
                        textout.print "--", "\n"
                      end
                    }
                  ensure
                    # make sure the text file is flushed and closed,
                    # also if writing fails halfway
                    textout.close()
                  end
                end


                # make scatter plot: base vs. comparison

                # temp files for gnuplot: data file and main infile
                tf = Tempfile.new("plot_and_r")
                gf = Tempfile.new("plot_and_r")

                begin
                  # data file: one line per label that has both scores,
                  # base score (x) and comparison score (y)
                  base_scores.each_pair { |label, score|
                    if comparison_scores[label]
                      tf.print score, "\t", comparison_scores[label], "\n"
                    else
                      $stderr.puts "no comparison scores for " + label.to_s
                    end
                  }
                  tf.close()

                  # make gnuplot main infile
                  gf.puts "set title \"" + title + "\""
                  gf.puts "set ylabel \"" + comparison_name + "\""
                  gf.puts "set xlabel \"" + base_name + "\""
                  gf.puts "set time"
                  gf.puts "set data style points"
                  gf.puts "set grid"
                  gf.puts "set output \"" + plotoutfile + "\""
                  gf.puts "set terminal postscript color"
                  gf.puts "plot \"" + tf.path() + "\""
                  gf.puts
                  gf.close()

                  # now gnuplot it
                  %x{gnuplot #{gf.path()}}
                ensure
                  # remove temp files, also if something went wrong above
                  tf.close(true)
                  gf.close(true)
                end
              end
         
     | 
| 
      
 293 
     | 
    
         
            +
             
     | 
| 
      
 294 
     | 
    
         
            +
             
     | 
| 
      
 295 
     | 
    
         
            +
              # given two mappings from labels to scores,
         
     | 
| 
      
 296 
     | 
    
         
            +
              # draw a gnuplot drawing comparing them
         
     | 
| 
      
 297 
     | 
    
         
            +
              # as box scores:
         
     | 
| 
      
 298 
     | 
    
         
            +
              # sort the first mapping by scores (in descending order),
         
     | 
| 
      
 299 
     | 
    
         
            +
              # then for each label draw first the score from the first mapping
         
     | 
| 
      
 300 
     | 
    
         
            +
              # as a box, then the score from the second mapping
         
     | 
| 
      
 301 
     | 
    
         
            +
              # as a differently colored box.
         
     | 
| 
      
 302 
     | 
    
         
            +
              #
         
     | 
| 
      
 303 
     | 
    
         
            +
              # Scores1 is the basis for the comparison: only those labels
         
     | 
| 
      
 304 
     | 
    
         
            +
              # that occur in mapping 1 are included in the comparison.
         
     | 
| 
      
 305 
     | 
    
         
            +
              #
         
     | 
| 
      
 306 
     | 
    
         
            +
              # A title for the graph must be given, and a 
         
     | 
| 
      
 307 
     | 
    
         
            +
              # name for the gnuplot output file.
         
     | 
| 
      
 308 
     | 
    
         
            +
              # If the name of a text output file is given,
         
     | 
| 
      
 309 
     | 
    
         
            +
              # the result is also printed as text.
         
     | 
| 
      
 310 
     | 
    
         
            +
              def PlotAndREval.gnuplot_comparison_chart(scores1, # hash:label(string) -> value(float), label->score-mapping
         
     | 
| 
      
 311 
     | 
    
         
            +
                                                        scores2, # hash:label(string) -> value(float), label->score-mapping
         
     | 
| 
      
 312 
     | 
    
         
            +
                                                        title,  # string: title for output files
         
     | 
| 
      
 313 
     | 
    
         
            +
                                                        score_name, # string: what are the scores? (label for y axis)
         
     | 
| 
      
 314 
     | 
    
         
            +
                                                        plotoutfile, # string: name of gnuplot output file
         
     | 
| 
      
 315 
     | 
    
         
            +
                                                        textoutfile = nil) # string: name of text output file
         
     | 
| 
      
 316 
     | 
    
         
            +
                
         
     | 
| 
      
 317 
     | 
    
         
            +
             
     | 
| 
      
 318 
     | 
    
         
            +
                # text output
         
     | 
| 
      
 319 
     | 
    
         
            +
                if textoutfile
         
     | 
| 
      
 320 
     | 
    
         
            +
                  textout = File.new(textoutfile, "w")
         
     | 
| 
      
 321 
     | 
    
         
            +
                  
         
     | 
| 
      
 322 
     | 
    
         
            +
                  # document scores in each range
         
     | 
| 
      
 323 
     | 
    
         
            +
                  # to text outfile
         
     | 
| 
      
 324 
     | 
    
         
            +
                  textout.puts "-------------------------"
         
     | 
| 
      
 325 
     | 
    
         
            +
                  textout.puts title
         
     | 
| 
      
 326 
     | 
    
         
            +
                  textout.puts "-------------------------"
         
     | 
| 
      
 327 
     | 
    
         
            +
                  textout.puts "Label\tScore 1\tScore 2"
         
     | 
| 
      
 328 
     | 
    
         
            +
             
     | 
| 
      
 329 
     | 
    
         
            +
                  scores1.to_a.sort { |a, b| b.last <=> a.last}.each { |label, score1|
         
     | 
| 
      
 330 
     | 
    
         
            +
                    textout.print label, "\t", score1, "\t"
         
     | 
| 
      
 331 
     | 
    
         
            +
                    score2 = scores2[label]
         
     | 
| 
      
 332 
     | 
    
         
            +
                    if score2
         
     | 
| 
      
 333 
     | 
    
         
            +
                      textout.print score2, "\n"
         
     | 
| 
      
 334 
     | 
    
         
            +
                    else
         
     | 
| 
      
 335 
     | 
    
         
            +
                      textout.print "-", "\n"
         
     | 
| 
      
 336 
     | 
    
         
            +
                    end
         
     | 
| 
      
 337 
     | 
    
         
            +
                  }
         
     | 
| 
      
 338 
     | 
    
         
            +
                  textout.close()
         
     | 
| 
      
 339 
     | 
    
         
            +
                end
         
     | 
| 
      
 340 
     | 
    
         
            +
                
         
     | 
| 
      
 341 
     | 
    
         
            +
                # document number of scores in each mapping
         
     | 
| 
      
 342 
     | 
    
         
            +
                # to temp. infile for gnuplot
         
     | 
| 
      
 343 
     | 
    
         
            +
                tf1 = Tempfile.new("plot_and_r")
         
     | 
| 
      
 344 
     | 
    
         
            +
                tf2 = Tempfile.new("plot_and_r")
         
     | 
| 
      
 345 
     | 
    
         
            +
                
         
     | 
| 
      
 346 
     | 
    
         
            +
                index = 0.0
         
     | 
| 
      
 347 
     | 
    
         
            +
                scores1.to_a.sort { |a, b| b.last <=> a.last}.each { |label, score1|
         
     | 
| 
      
 348 
     | 
    
         
            +
                  score2 = scores2[label]
         
     | 
| 
      
 349 
     | 
    
         
            +
                  tf1.print index, "\t", score1, "\n"
         
     | 
| 
      
 350 
     | 
    
         
            +
                  if score2
         
     | 
| 
      
 351 
     | 
    
         
            +
                    i2 = index + 0.2
         
     | 
| 
      
 352 
     | 
    
         
            +
                    tf2.print i2, "\t", score2, "\n"
         
     | 
| 
      
 353 
     | 
    
         
            +
                  end
         
     | 
| 
      
 354 
     | 
    
         
            +
                  index += 1.0
         
     | 
| 
      
 355 
     | 
    
         
            +
                }
         
     | 
| 
      
 356 
     | 
    
         
            +
             
     | 
| 
      
 357 
     | 
    
         
            +
                tf1.close()
         
     | 
| 
      
 358 
     | 
    
         
            +
                tf2.close()
         
     | 
| 
      
 359 
     | 
    
         
            +
                
         
     | 
| 
      
 360 
     | 
    
         
            +
                # make gnuplot main infile
         
     | 
| 
      
 361 
     | 
    
         
            +
                gf = Tempfile.new("plot_and_r")
         
     | 
| 
      
 362 
     | 
    
         
            +
                gf.puts "set title \"" + title+ "\""
         
     | 
| 
      
 363 
     | 
    
         
            +
                gf.puts "set ylabel \"" + score_name + "\""
         
     | 
| 
      
 364 
     | 
    
         
            +
                gf.puts "set time"
         
     | 
| 
      
 365 
     | 
    
         
            +
                gf.puts "set boxwidth 0.2"
         
     | 
| 
      
 366 
     | 
    
         
            +
                gf.puts "set noxtics"
         
     | 
| 
      
 367 
     | 
    
         
            +
                gf.puts "set grid"
         
     | 
| 
      
 368 
     | 
    
         
            +
                gf.puts "set output \"" + plotoutfile + "\""
         
     | 
| 
      
 369 
     | 
    
         
            +
                gf.puts "set terminal postscript color"
         
     | 
| 
      
 370 
     | 
    
         
            +
                gf.print "plot \"" + tf1.path() + "\" title \"score 1\" with boxes fs solid 0.9,"
         
     | 
| 
      
 371 
     | 
    
         
            +
                gf.puts "\"" + tf2.path() + "\" title \"score 2\" with boxes fs solid 0.6"
         
     | 
| 
      
 372 
     | 
    
         
            +
                gf.puts
         
     | 
| 
      
 373 
     | 
    
         
            +
                gf.puts
         
     | 
| 
      
 374 
     | 
    
         
            +
                gf.close()
         
     | 
| 
      
 375 
     | 
    
         
            +
                
         
     | 
| 
      
 376 
     | 
    
         
            +
                # now gnuplot it
         
     | 
| 
      
 377 
     | 
    
         
            +
                %x{gnuplot #{gf.path()}}
         
     | 
| 
      
 378 
     | 
    
         
            +
             
     | 
| 
      
 379 
     | 
    
         
            +
                # and remove temp files
         
     | 
| 
      
 380 
     | 
    
         
            +
                tf1.close(true)
         
     | 
| 
      
 381 
     | 
    
         
            +
                tf2.close(true)
         
     | 
| 
      
 382 
     | 
    
         
            +
                gf.close(true)
         
     | 
| 
      
 383 
     | 
    
         
            +
              end
         
     | 
| 
      
 384 
     | 
    
         
            +
             
     | 
| 
      
 385 
     | 
    
         
            +
             
     | 
| 
      
 386 
     | 
    
         
            +
              #####
         
     | 
| 
      
 387 
     | 
    
         
            +
              #
         
     | 
| 
      
 388 
     | 
    
         
            +
              # computes a nonparametric rank correlation
         
     | 
| 
      
 389 
     | 
    
         
            +
              #
         
     | 
| 
      
 390 
     | 
    
         
            +
              # can compute partial correlations, i.e. correlations which factor out the influence
         
     | 
| 
      
 391 
     | 
    
         
            +
              # of a confound variable (last variable, can be omitted).
         
     | 
| 
      
 392 
     | 
    
         
            +
              
         
     | 
| 
      
 393 
     | 
    
         
            +
              def PlotAndREval.tau_correlation(base_scores, # hash: label(string) -> value(float)
                                               comparison_scores, # hash: label(string) -> value(float)
                                               base_name, # string: what are the base scores?
                                               comparison_name, # string: what are the comparison scores?
                                               textoutfile, # string: name of text output file
                                               confound_scores = nil) # hash: label(string) -> value(float)
                # Computes Kendall's tau between base_scores and comparison_scores by
                # writing the paired values to temporary files, generating an R script
                # that reads them, running R, and copying R's output (each line prefixed
                # with "R output: ") to $stderr and to textoutfile.
                #
                # If confound_scores is given, a partial correlation analysis is run
                # instead: the natural logarithm of each confound value is written to a
                # third file and R correlates the residuals of linear models.
                #
                # Only labels that have a value in every participating hash are used, so
                # that line i of each data file refers to the same label. Labels that are
                # skipped are reported on $stderr.
                #
                # Raises a RuntimeError ("Couldn't write to file ...") if textoutfile
                # cannot be opened for writing. Returns nil.

                # every temp file created here is registered so that the ensure
                # clause can remove all of them, whatever happens in between
                tempfiles = []
                textout = nil
                new_temp = lambda {
                  tf = Tempfile.new("plot_and_r")
                  tempfiles << tf
                  tf
                }

                begin
                  tf_f = new_temp.call
                  tf_e = new_temp.call
                  tf_c = confound_scores ? new_temp.call : nil

                  base_scores.each_pair { |label, score|
                    comparison = comparison_scores[label]
                    unless comparison
                      $stderr.puts "no comparison scores for " + label
                      next
                    end

                    if confound_scores
                      confound = confound_scores[label]
                      unless confound
                        # skip the whole label: writing base/comparison rows without
                        # a confound row would misalign the columns that R reads
                        $stderr.puts "no confound scores for " + label
                        next
                      end
                    end

                    tf_f.puts score.to_s
                    tf_e.puts comparison.to_s
                    # logarithmise frequencies
                    tf_c.puts(Math.log(confound).to_s) if tf_c
                  }

                  # flush the data files to disk so that R can read them
                  [tf_f, tf_e, tf_c].compact.each { |tf| tf.close() }

                  # write the R script to rf
                  rf = new_temp.call
                  # write the output to rfout; only its path is needed for now
                  rfout = new_temp.call
                  rfout.close()

                  rf.puts "base <- read.table(\"#{tf_f.path()}\")"
                  rf.puts "comparison <- read.table(\"#{tf_e.path()}\")"
                  if confound_scores # perform partial correlation analysis
                    rf.puts "confuse <- read.table(\"#{tf_c.path()}\")"
                    # adapted from https://stat.ethz.ch/pipermail/r-help/2001-August/012820.html
                    # compute partial correlation coefficient for comparison, with confuse excluded
                    rf.puts "cor(lm(base[[1]]~confuse[[1]])$resid,lm(comparison[[1]]~confuse[[1]])$resid,method=\"kendall\")"

                    # compute partial correlation coefficient for confuse, with comparison excluded
                    rf.puts "cor(lm(base[[1]]~comparison[[1]])$resid,lm(confuse[[1]]~comparison[[1]])$resid,method=\"kendall\")"

                    # compute significance of partial correlation
                    rf.puts "summary(lm(base[[1]] ~ comparison[[1]] + confuse[[1]]))"
                  else # perform normal correlation analysis
                    rf.puts "cor.test(base[[1]], comparison[[1]], method=\"kendall\", exact=FALSE)"
                  end
                  rf.close()

                  # prefer the historical site-specific R installation; if it is not
                  # there, fall back to whatever R is found on the PATH
                  r_binary = "/proj/contrib/R/R-1.8.0/bin/R"
                  r_binary = "R" unless File.executable?(r_binary)
                  %x{#{r_binary} --vanilla < #{rf.path()} > #{rfout.path()}}
                  rfout.open()

                  # output of R results: to stderr and to textout file
                  begin
                    textout = File.new(textoutfile, "w")
                  rescue
                    raise "Couldn't write to file " + textoutfile
                  end

                  textout.puts "-----------------------"
                  textout.puts "Correlation of " + base_name + " and " + comparison_name + " by Kendall's tau:"
                  textout.puts "-----------------------"

                  while (line = rfout.gets())
                    $stderr.puts "R output: " + line
                    textout.puts "R output: " + line
                  end
                ensure
                  # close and unlink all temp files (including the confound file)
                  tempfiles.each { |tf| tf.close(true) }
                  textout.close() if textout && !textout.closed?
                end
              end
         
     | 
| 
      
 480 
     | 
    
         
            +
            end
         
     |