frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
 - data/CHANGELOG.rdoc +0 -0
 - data/LICENSE.rdoc +0 -0
 - data/README.rdoc +0 -0
 - data/lib/common/AbstractSynInterface.rb +1227 -0
 - data/lib/common/BerkeleyInterface.rb +375 -0
 - data/lib/common/CollinsInterface.rb +1165 -0
 - data/lib/common/ConfigData.rb +694 -0
 - data/lib/common/Counter.rb +18 -0
 - data/lib/common/DBInterface.rb +48 -0
 - data/lib/common/EnduserMode.rb +27 -0
 - data/lib/common/Eval.rb +480 -0
 - data/lib/common/FixSynSemMapping.rb +196 -0
 - data/lib/common/FrPrepConfigData.rb +66 -0
 - data/lib/common/FrprepHelper.rb +1324 -0
 - data/lib/common/Graph.rb +345 -0
 - data/lib/common/ISO-8859-1.rb +24 -0
 - data/lib/common/ML.rb +186 -0
 - data/lib/common/Maxent.rb +215 -0
 - data/lib/common/MiniparInterface.rb +1388 -0
 - data/lib/common/Optimise.rb +195 -0
 - data/lib/common/Parser.rb +213 -0
 - data/lib/common/RegXML.rb +269 -0
 - data/lib/common/RosyConventions.rb +171 -0
 - data/lib/common/SQLQuery.rb +243 -0
 - data/lib/common/STXmlTerminalOrder.rb +194 -0
 - data/lib/common/SalsaTigerRegXML.rb +2347 -0
 - data/lib/common/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/common/SleepyInterface.rb +384 -0
 - data/lib/common/SynInterfaces.rb +275 -0
 - data/lib/common/TabFormat.rb +720 -0
 - data/lib/common/Tiger.rb +1448 -0
 - data/lib/common/TntInterface.rb +44 -0
 - data/lib/common/Tree.rb +61 -0
 - data/lib/common/TreetaggerInterface.rb +303 -0
 - data/lib/common/headz.rb +338 -0
 - data/lib/common/option_parser.rb +13 -0
 - data/lib/common/ruby_class_extensions.rb +310 -0
 - data/lib/fred/Baseline.rb +150 -0
 - data/lib/fred/FileZipped.rb +31 -0
 - data/lib/fred/FredBOWContext.rb +863 -0
 - data/lib/fred/FredConfigData.rb +182 -0
 - data/lib/fred/FredConventions.rb +232 -0
 - data/lib/fred/FredDetermineTargets.rb +324 -0
 - data/lib/fred/FredEval.rb +312 -0
 - data/lib/fred/FredFeatureExtractors.rb +321 -0
 - data/lib/fred/FredFeatures.rb +1061 -0
 - data/lib/fred/FredFeaturize.rb +596 -0
 - data/lib/fred/FredNumTrainingSenses.rb +27 -0
 - data/lib/fred/FredParameters.rb +402 -0
 - data/lib/fred/FredSplit.rb +84 -0
 - data/lib/fred/FredSplitPkg.rb +180 -0
 - data/lib/fred/FredTest.rb +607 -0
 - data/lib/fred/FredTrain.rb +144 -0
 - data/lib/fred/PlotAndREval.rb +480 -0
 - data/lib/fred/fred.rb +45 -0
 - data/lib/fred/md5.rb +23 -0
 - data/lib/fred/opt_parser.rb +250 -0
 - data/lib/frprep/AbstractSynInterface.rb +1227 -0
 - data/lib/frprep/Ampersand.rb +37 -0
 - data/lib/frprep/BerkeleyInterface.rb +375 -0
 - data/lib/frprep/CollinsInterface.rb +1165 -0
 - data/lib/frprep/ConfigData.rb +694 -0
 - data/lib/frprep/Counter.rb +18 -0
 - data/lib/frprep/FNCorpusXML.rb +643 -0
 - data/lib/frprep/FNDatabase.rb +144 -0
 - data/lib/frprep/FixSynSemMapping.rb +196 -0
 - data/lib/frprep/FrPrepConfigData.rb +66 -0
 - data/lib/frprep/FrameXML.rb +513 -0
 - data/lib/frprep/FrprepHelper.rb +1324 -0
 - data/lib/frprep/Graph.rb +345 -0
 - data/lib/frprep/ISO-8859-1.rb +24 -0
 - data/lib/frprep/MiniparInterface.rb +1388 -0
 - data/lib/frprep/Parser.rb +213 -0
 - data/lib/frprep/RegXML.rb +269 -0
 - data/lib/frprep/STXmlTerminalOrder.rb +194 -0
 - data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
 - data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
 - data/lib/frprep/SleepyInterface.rb +384 -0
 - data/lib/frprep/SynInterfaces.rb +275 -0
 - data/lib/frprep/TabFormat.rb +720 -0
 - data/lib/frprep/Tiger.rb +1448 -0
 - data/lib/frprep/TntInterface.rb +44 -0
 - data/lib/frprep/Tree.rb +61 -0
 - data/lib/frprep/TreetaggerInterface.rb +303 -0
 - data/lib/frprep/do_parses.rb +142 -0
 - data/lib/frprep/frprep.rb +686 -0
 - data/lib/frprep/headz.rb +338 -0
 - data/lib/frprep/one_parsed_file.rb +28 -0
 - data/lib/frprep/opt_parser.rb +94 -0
 - data/lib/frprep/ruby_class_extensions.rb +310 -0
 - data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
 - data/lib/rosy/DBMySQL.rb +146 -0
 - data/lib/rosy/DBSQLite.rb +280 -0
 - data/lib/rosy/DBTable.rb +239 -0
 - data/lib/rosy/DBWrapper.rb +176 -0
 - data/lib/rosy/ExternalConfigData.rb +58 -0
 - data/lib/rosy/FailedParses.rb +130 -0
 - data/lib/rosy/FeatureInfo.rb +242 -0
 - data/lib/rosy/GfInduce.rb +1115 -0
 - data/lib/rosy/GfInduceFeature.rb +148 -0
 - data/lib/rosy/InputData.rb +294 -0
 - data/lib/rosy/RosyConfigData.rb +115 -0
 - data/lib/rosy/RosyConfusability.rb +338 -0
 - data/lib/rosy/RosyEval.rb +465 -0
 - data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
 - data/lib/rosy/RosyFeaturize.rb +280 -0
 - data/lib/rosy/RosyInspect.rb +336 -0
 - data/lib/rosy/RosyIterator.rb +477 -0
 - data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
 - data/lib/rosy/RosyPruning.rb +165 -0
 - data/lib/rosy/RosyServices.rb +744 -0
 - data/lib/rosy/RosySplit.rb +232 -0
 - data/lib/rosy/RosyTask.rb +19 -0
 - data/lib/rosy/RosyTest.rb +826 -0
 - data/lib/rosy/RosyTrain.rb +232 -0
 - data/lib/rosy/RosyTrainingTestTable.rb +786 -0
 - data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
 - data/lib/rosy/View.rb +418 -0
 - data/lib/rosy/opt_parser.rb +379 -0
 - data/lib/rosy/rosy.rb +77 -0
 - data/lib/shalmaneser/version.rb +3 -0
 - data/test/frprep/test_opt_parser.rb +94 -0
 - data/test/functional/functional_test_helper.rb +40 -0
 - data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
 - data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
 - data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
 - data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
 - data/test/functional/test_fred.rb +47 -0
 - data/test/functional/test_frprep.rb +52 -0
 - data/test/functional/test_rosy.rb +20 -0
 - metadata +270 -0
 
| 
         @@ -0,0 +1,148 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # GfInduceFeature
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Katrin Erk Jan 06
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # use result of GfInduce.rb as
         
     | 
| 
      
 5 
     | 
    
         
            +
            # feature for Rosy
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            require "rosy/GfInduce"
         
     | 
| 
      
 8 
     | 
    
         
            +
            require "rosy/AbstractFeatureAndExternal"
         
     | 
| 
      
 9 
     | 
    
         
            +
            require "common/ruby_class_extensions"
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            ###
         
     | 
| 
      
 12 
     | 
    
         
            +
            # make filename for GfInduce pickle file
         
     | 
| 
      
 13 
     | 
    
         
            +
            # Build the path of the GfInduce pickle file for an experiment.
            #
            # exp:         answers get("directory") and get("experiment_id")
            # interpreter: answers systems() and optional_systems(), each
            #              convertible via to_a into [service, system_name] pairs
            #
            # returns: the output directory followed by
            #   Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
            def filename_gfmap(exp,         # ExternalConfigData object
                               interpreter) # SynInterpreter class

              # Start from the directory named in the experiment file; when an
              # experiment ID is set, descend into a subdirectory named after it.
              # File.new_dir is not defined in this file (NOTE(review): presumably
              # provided by common/ruby_class_extensions -- confirm).
              target_dir = File.new_dir(exp.get("directory"))
              target_dir = File.new_dir(target_dir, exp.get("experiment_id")) if exp.get("experiment_id")

              # "<service>=<system_name>" entries, sorted so the name is stable
              required_part = interpreter.systems().to_a.map { |service, system_name|
                "#{service}=#{system_name}"
              }.sort.join(".")

              # the same for optional systems, each entry prefixed with "OPT"
              optional_part = interpreter.optional_systems().to_a.map { |service, system_name|
                "OPT#{service}=#{system_name}"
              }.sort.join(".")

              return target_dir + "Gfmap." + required_part + "." + optional_part + ".pkl"
            end
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
            ################################
         
     | 
| 
      
 37 
     | 
    
         
            +
            # base class for all following feature extractors
         
     | 
| 
      
 38 
     | 
    
         
            +
            # Feature extractor offering the result of GfInduce as the single
            # feature "gf_fn".
            #
            # All state lives in class variables: set_sentence() fills the
            # node -> label table for the current frame, compute_features()
            # looks the current node up in it.
            class GfInduceFeatureExtractor < ExternalFeatureExtractor
              # announce_me is defined outside this file; the call is kept as the
              # first statement of the class body, exactly as in the original.
              GfInduceFeatureExtractor.announce_me()

              @@okay = true      # external experiment file present?
              @@gf_obj = nil     # GfInduce object, loaded in initialize
              @@node_to_gf = nil # Hash: SynNodes of a sentence -> "<gf> <prep>" label

              # returns: designator string of this extractor
              def self.designator
                "gf_fn"
              end

              # returns: array with the names of the features computed here
              def self.feature_names
                ["gf_fn"]
              end

              # returns: SQL column type used for the feature
              def self.sql_type
                "VARCHAR(25)"
              end

              # returns: feature type string
              def self.feature_type
                "syn"
              end

              # returns: phase in which this extractor runs
              def self.phase
                "phase 1"
              end

              ###
              # Per-frame preparation, done before compute_features() so that
              # work shared by all nodes of a frame happens only once.
              # Only relevant for Phase 1.
              #
              # returns: nil when no GfInduce object is usable (@@okay false);
              #          otherwise the subcat frame that was mapped (an array,
              #          possibly empty)
              def self.set_sentence(sent,  # SalsaTigerSentence object
                                    frame) # FrameNode object

                # result of the superclass call is not inspected (as in the original)
                super(sent, frame)

                if @@okay
                  # all subcat frames the GF object finds for this frame's target;
                  # entries are triples [frame, actual_subcatframe, frequency]
                  candidates = @@gf_obj.apply(frame.target.children())

                  subcatframe =
                    if candidates.empty?
                      # no subcat frames returned
                      []
                    else
                      # order by descending frequency and keep only the
                      # actual_subcatframe of the top entry
                      ranked = candidates.sort { |x, y| y.last <=> x.last }
                      ranked.first[1]
                    end

                  # rebuild the mapping node(SynNode) -> GF(string)
                  @@node_to_gf = {}
                  subcatframe.each do |gf, prep, _fe, synnodes|
                    synnodes.each { |node| @@node_to_gf[node] = "#{gf} #{prep}" }
                  end
                end
              end

              ###
              # Initialize: read the GfInduce pickle if an external experiment
              # file is available; otherwise flag that nothing can be computed.
              def initialize(exp,                  # experiment file object
                             interpreter_class)    # SynInterpreter class

                super(exp, interpreter_class)

                # @exp_external is not assigned in this class
                # (NOTE(review): presumably set by the superclass constructor -- confirm)
                unless @exp_external
                  # signal that you cannot compute anything
                  @@okay = false
                  return
                end

                # @@interpreter_class is likewise not assigned in this class
                pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
                @@gf_obj = GfInduce.from_file(pickle_filename)
                @@okay = true
              end

              ###
              # compute: compute features
              #
              # returns an array of features, same length as feature_names();
              # here: one element, either the label stored for the current
              # node (@@node, not assigned in this class) or nil
              def compute_features
                return [nil] unless @@okay

                [@@node_to_gf[@@node]]
              end
            end
         
     | 
| 
         @@ -0,0 +1,294 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ###########
         
     | 
| 
      
 2 
     | 
    
         
            +
            #
         
     | 
| 
      
 3 
     | 
    
         
            +
            # ke / sp 12 04 05
         
     | 
| 
      
 4 
     | 
    
         
            +
            #
         
     | 
| 
      
 5 
     | 
    
         
            +
            # class for input data object
         
     | 
| 
      
 6 
     | 
    
         
            +
            # offers methods for preprocessing and
         
     | 
| 
      
 7 
     | 
    
         
            +
            # featurization
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            # Salsa packages
         
     | 
| 
      
 10 
     | 
    
         
            +
            require "common/Parser"
         
     | 
| 
      
 11 
     | 
    
         
            +
            require "common/SalsaTigerRegXML"
         
     | 
| 
      
 12 
     | 
    
         
            +
            require "common/ruby_class_extensions"
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            # Fred/Rosy packages
         
     | 
| 
      
 15 
     | 
    
         
            +
            require "rosy/FailedParses"
         
     | 
| 
      
 16 
     | 
    
         
            +
            require "common/RosyConventions"
         
     | 
| 
      
 17 
     | 
    
         
            +
            require "rosy/RosyFeatureExtractors"
         
     | 
| 
      
 18 
     | 
    
         
            +
            require "rosy/RosyPhase2FeatureExtractors"
         
     | 
| 
      
 19 
     | 
    
         
            +
            require "rosy/RosyPruning"
         
     | 
| 
      
 20 
     | 
    
         
            +
            require "rosy/GfInduceFeature"
         
     | 
| 
      
 21 
     | 
    
         
            +
            require "common/FixSynSemMapping"
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            class InputData
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
              ###
         
     | 
| 
      
 26 
     | 
    
         
            +
              # Store the experiment settings and obtain the phase 1 and
              # phase 2 feature extractor objects from the FeatureInfo object.
              #
              # raises: RuntimeError if RosyFeatureExtractor.set reports a problem
              def initialize(exp_object,          # RosyConfigData object
                             dataset,             # train/test
                             feature_info_object, # FeatureInfo object
                             interpreter_class,   # SynInterpreter class
                             input_dir)           # Directory with input files

                @exp = exp_object
                @dataset = dataset
                @interpreter_class = interpreter_class
                @input_dir = input_dir

                # information about failed parses is collected here
                @failed_parses = FailedParses.new

                # phase 1 extractors come back as a pair: Rosy extractors, others
                phase1_extractors = feature_info_object.get_extractor_objects("phase 1", @interpreter_class)
                @extractors_p1_rosy, @extractors_p1_other = phase1_extractors

                # global settings
                settings_accepted = RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
                raise "Some grave problem during feature extractor initialization" unless settings_accepted

                # The original file carries a commented-out loop here that would call
                # extractor_obj.class.set() on every entry of @extractors_p1_other
                # ("nothing to set here for now, so deactivated"); it stays disabled.

                # phase 2 extractors: both groups are kept in one combined array
                phase2_rosy, phase2_other = feature_info_object.get_extractor_objects("phase 2", @interpreter_class)
                @feature_extractors_phase2 = phase2_rosy + phase2_other
              end
         
     | 
| 
      
 61 
     | 
    
         
            +
              
         
     | 
| 
      
 62 
     | 
    
         
            +
              ###
         
     | 
| 
      
 63 
     | 
    
         
            +
              # each_instance_phase1()
         
     | 
| 
      
 64 
     | 
    
         
            +
              #
         
     | 
| 
      
 65 
     | 
    
         
            +
              # reads the input data from file(s), in the specific input format,
         
     | 
| 
      
 66 
     | 
    
         
            +
              # separates it into instances,
         
     | 
| 
      
 67 
     | 
    
         
            +
              # threads it through all phase 1 feature extractors
         
     | 
| 
      
 68 
     | 
    
         
            +
              # and yields one feature vector per instance
         
     | 
| 
      
 69 
     | 
    
         
            +
              #
         
     | 
| 
      
 70 
     | 
    
         
            +
              # yields: pairs [feature_name(string), feature_value(object)]
         
     | 
| 
      
 71 
     | 
    
         
            +
              
         
     | 
| 
      
 72 
     | 
    
         
            +
              def each_instance_phase1()
                # Visit every file matching @input_dir + "*.xml".
                Dir[@input_dir + "*.xml"].each do |xml_path|
                  parts_parser = FilePartsParser.new(xml_path)
                  $stderr.puts "Processing #{xml_path}"

                  # scan_s hands over one sentence string at a time
                  parts_parser.scan_s do |sentence_xml|
                    sent = SalsaTigerSentence.new(sentence_xml)

                    # preprocess() may modify the sentence object before featurization
                    preprocess(sent)

                    sent.each_frame do |frame|
                      # sentence marked as failed: register the frame, extract nothing
                      if sent.get_attribute("failed")
                        handle_failed_parse(sent, frame)
                        next
                      end

                      # Announce sentence and frame to the extractor classes:
                      # Rosy extractors first, then the others.
                      # The first extractor that reports a problem makes us skip the frame.
                      next unless RosyFeatureExtractor.set_sentence(sent, frame)
                      next if @extractors_p1_other.any? { |other| !other.class.set_sentence(sent, frame) }

                      sent.each_syn_node do |syn_node|
                        # Announce the current node in the same order;
                        # the first extractor that reports a problem makes us skip the node.
                        next unless RosyFeatureExtractor.set_node(syn_node)
                        next if @extractors_p1_other.any? { |other| !other.class.set_node(syn_node) }

                        # instance: array of pairs [feature_name(string), feature_value(object)]
                        instance = []
                        (@extractors_p1_rosy + @extractors_p1_other).each do |extractor|
                          names = extractor.class.feature_names()
                          values = extractor.compute_features()

                          # pair each computed value with the feature name at the same position
                          pairs = values.each_with_index.map do |value, position|
                            name = names[position]

                            # sanity check: value longer than the allotted space in the DB?
                            check_feature_length(name, value, extractor)

                            [name, nonnil_feature(value, extractor.class.sql_type())]
                          end
                          instance.concat(pairs)
                        end
                        yield instance
                      end # each syn node
                    end # each frame
                  end # each sentence
                end
              end
         
     | 
| 
      
 152 
     | 
    
         
            +
              
         
     | 
| 
      
 153 
     | 
    
         
            +
              ###
         
     | 
| 
      
 154 
     | 
    
         
            +
              # each_phase2_column
         
     | 
| 
      
 155 
     | 
    
         
            +
              #
         
     | 
| 
      
 156 
     | 
    
         
            +
              # This method implements the application of the 
         
     | 
| 
      
 157 
     | 
    
         
            +
              # phase 2 extractors to data.
         
     | 
| 
      
 158 
     | 
    
         
            +
              #
         
     | 
| 
      
 159 
     | 
    
         
            +
              # Given a database view (of either training or test data),
         
     | 
| 
      
 160 
     | 
    
         
            +
              # assign a new feature value to each instance
         
     | 
| 
      
 161 
     | 
    
         
            +
              #
         
     | 
| 
      
 162 
     | 
    
         
            +
              # yields pairs [feature_name(string), feature_values(array)]
         
     | 
| 
      
 163 
     | 
    
         
            +
              # The feature_values array has as many lines as the view has instances
         
     | 
| 
      
 164 
     | 
    
         
            +
              # so the yield of this method can be fed directly into view.update_column()
         
     | 
| 
      
 165 
     | 
    
         
            +
              def each_phase2_column(view) # View object: training or test data
                @feature_extractors_phase2.each do |extractor|
                  # run the extractor on the view; it returns one collection of values per feature
                  columns = extractor.compute_features_on_view(view)
                  names = extractor.class.feature_names()

                  # pair each column with the feature name at the same position and yield it
                  columns.each_with_index do |column, position|
                    name = names[position]
                    cleaned = column.map do |raw_value|
                      nonnil_feature(raw_value, extractor.class.sql_type)
                    end
                    yield [name, cleaned]
                  end
                end
              end
         
     | 
| 
      
 182 
     | 
    
         
            +
             
     | 
| 
      
 183 
     | 
    
         
            +
              ###
         
     | 
| 
      
 184 
     | 
    
         
            +
              # get_failed_parses
         
     | 
| 
      
 185 
     | 
    
         
            +
              #
         
     | 
| 
      
 186 
     | 
    
         
            +
              # returns the FailedParses object in which the info about failed parses has been stored
         
     | 
| 
      
 187 
     | 
    
         
            +
              def get_failed_parses
                # hand out the object stored in @failed_parses
                @failed_parses
              end
         
     | 
| 
      
 190 
     | 
    
         
            +
             
     | 
| 
      
 191 
     | 
    
         
            +
              #################################
         
     | 
| 
      
 192 
     | 
    
         
            +
              private
         
     | 
| 
      
 193 
     | 
    
         
            +
             
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
              ###
         
     | 
| 
      
 196 
     | 
    
         
            +
              def nonnil_feature(feature_value, sql_type)
                # Replace missing feature values by a placeholder:
                # - nil in a column whose SQL type mentions CHAR -> the experiment's "noval" value
                # - nil in any other column                      -> 0
                # - empty string                                 -> the experiment's "noval" value
                # Every other value is returned unchanged.
                if feature_value.nil?
                  return (sql_type =~ /CHAR/) ? @exp.get("noval") : 0
                end

                if feature_value.instance_of?(String) && feature_value.empty?
                  return @exp.get("noval")
                end

                feature_value
              end
         
     | 
| 
      
 210 
     | 
    
         
            +
             
     | 
| 
      
 211 
     | 
    
         
            +
              ###
         
     | 
| 
      
 212 
     | 
    
         
            +
              # preprocess: possibly change the given SalsaTigerSentence 
         
     | 
| 
      
 213 
     | 
    
         
            +
              # to enable better learning
         
     | 
| 
      
 214 
     | 
    
         
            +
              def preprocess(sent)           # SalsaTigerSentence object
                # the repair is only applied to training data ...
                return unless @dataset == "train"
                # ... and only if at least one of the two repair switches is set
                return unless @exp.get("fe_syn_repair") || @exp.get("fe_rel_repair")

                FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
              end
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
      
 223 
     | 
    
         
            +
              ###
         
     | 
| 
      
 224 
     | 
    
         
            +
              # register failed parses
         
     | 
| 
      
 225 
     | 
    
         
            +
              def handle_failed_parse(sent,  # SalsaTigerSentence object
                                      frame) # FrameNode
                # target POS: category of the main node of the target, if both exist
                main_target =
                  if frame.target()
                    @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
                  end
                target_pos = main_target ? @interpreter_class.category(main_target) : nil

                # target string: lemma_backoff() result of each syntactic target node,
                # joined by blanks; empty if the frame has no target
                target_str = ""
                if frame.target()
                  pieces = frame.target().yield_nodes_ordered().map do |t_node|
                    # a non-syntactic node (maybe an unassigned target?) contributes nothing
                    t_node.is_syntactic? ? @interpreter_class.lemma_backoff(t_node) : ""
                  end
                  target_str = pieces.join(" ")
                end

                # store instance ID, frame name, target string, target POS
                # and the names of the frame's children
                @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
                                        frame.name(),
                                        target_str,
                                        target_pos,
                                        frame.children.map { |fe| fe.name })
              end
         
     | 
| 
      
 259 
     | 
    
         
            +
             
     | 
| 
      
 260 
     | 
    
         
            +
              ###
         
     | 
| 
      
 261 
     | 
    
         
            +
              # sanity check: feature value longer than the allotted space in the DB?
         
     | 
| 
      
 262 
     | 
    
         
            +
              def check_feature_length(feature_name,  # string
                                       feature_value, # object
                                       extractor_obj) # AbstractFeatureExtractor object
                # Sanity check: is a string feature value longer than the number
                # mentioned in the SQL type of its extractor class?
                #
                # Raises RuntimeError ("SQL entry length surpassed") if the feature is
                # "sentid", since an over-long instance ID cannot be stored.
                # For all other features nothing happens: the warning that used to be
                # printed in verbose mode was disabled as too frequent (KE Feb 07).
                # Returns nil.

                # crude check: the first number in the SQL type is taken as the column size
                size_match = /(\d+)/.match(extractor_obj.class.sql_type().to_s)
                return unless size_match
                max_length = size_match[1].to_i

                # only strings are compared against the column size
                return unless feature_value.is_a?(String) && feature_value.length() > max_length
                return unless feature_name == "sentid"

                # if the sentence (instance) ID is too long, we cannot go on;
                # report the details on stderr together with the error message
                $stderr.puts "Error: Instance ID is longer than its DB column."
                $stderr.puts "Instance ID '#{feature_value}' has length #{feature_value.length()}, DB column size is #{max_length}."
                $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
                raise "SQL entry length surpassed"
              end
         
     | 
| 
      
 293 
     | 
    
         
            +
             
     | 
| 
      
 294 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,115 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'common/ConfigData'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            ##############################
         
     | 
| 
      
 4 
     | 
    
         
            +
            # Class RosyConfigData
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # inherits from ConfigData,
         
     | 
| 
      
 7 
     | 
    
         
            +
            # sets features for ROSY
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            class RosyConfigData < ConfigData
         
     | 
| 
      
 10 
     | 
    
         
            +
              def initialize(filename)
         
     | 
| 
      
 11 
     | 
    
         
            +
                super(filename,                      # config file
         
     | 
| 
      
 12 
     | 
    
         
            +
            	  { # features
         
     | 
| 
      
 13 
     | 
    
         
            +
                        "feature" => "list",
         
     | 
| 
      
 14 
     | 
    
         
            +
                       "classifier" => "list",
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                       "verbose" => "bool" ,
         
     | 
| 
      
 17 
     | 
    
         
            +
                        "enduser_mode" => "bool", 
         
     | 
| 
      
 18 
     | 
    
         
            +
                       
         
     | 
| 
      
 19 
     | 
    
         
            +
                        "experiment_ID" => "string",
         
     | 
| 
      
 20 
     | 
    
         
            +
                        
         
     | 
| 
      
 21 
     | 
    
         
            +
                        "directory_input_train" => "string",
         
     | 
| 
      
 22 
     | 
    
         
            +
                        "directory_input_test" => "string",
         
     | 
| 
      
 23 
     | 
    
         
            +
                        "directory_output" => "string", 
         
     | 
| 
      
 24 
     | 
    
         
            +
                       
         
     | 
| 
      
 25 
     | 
    
         
            +
                        "preproc_descr_file_train" => "string",
         
     | 
| 
      
 26 
     | 
    
         
            +
                        "preproc_descr_file_test" => "string",
         
     | 
| 
      
 27 
     | 
    
         
            +
                        "external_descr_file"    => "string",
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                        "dbtype" => "string",    # "mysql" or "sqlite"
         
     | 
| 
      
 30 
     | 
    
         
            +
                       
         
     | 
| 
      
 31 
     | 
    
         
            +
                        "host" => "string",      # DB access: sqlite only
         
     | 
| 
      
 32 
     | 
    
         
            +
                        "user" => "string",
         
     | 
| 
      
 33 
     | 
    
         
            +
                        "passwd" => "string",
         
     | 
| 
      
 34 
     | 
    
         
            +
                        "dbname" => "string",
         
     | 
| 
      
 35 
     | 
    
         
            +
            	   
         
     | 
| 
      
 36 
     | 
    
         
            +
                        "data_dir" => "string",  # for external use
         
     | 
| 
      
 37 
     | 
    
         
            +
                        "rosy_dir" => "pattern", # for internal use only, set by rosy.rb
         
     | 
| 
      
 38 
     | 
    
         
            +
                       
         
     | 
| 
      
 39 
     | 
    
         
            +
                       "classifier_dir" => "string", # if present, special directory for classifiers
         
     | 
| 
      
 40 
     | 
    
         
            +
                       
         
     | 
| 
      
 41 
     | 
    
         
            +
                       "classif_column_name" => "string",
         
     | 
| 
      
 42 
     | 
    
         
            +
                       "main_table_name" => "pattern",
         
     | 
| 
      
 43 
     | 
    
         
            +
                       "test_table_name" => "pattern",
         
     | 
| 
      
 44 
     | 
    
         
            +
                       
         
     | 
| 
      
 45 
     | 
    
         
            +
                       "eval_file" => "pattern", 
         
     | 
| 
      
 46 
     | 
    
         
            +
                       "log_file" => "pattern",
         
     | 
| 
      
 47 
     | 
    
         
            +
                       "failed_file" => "pattern",
         
     | 
| 
      
 48 
     | 
    
         
            +
                       "classifier_file" => "pattern",
         
     | 
| 
      
 49 
     | 
    
         
            +
                       "classifier_output_file" => "pattern",
         
     | 
| 
      
 50 
     | 
    
         
            +
                       "noval" => "string",
         
     | 
| 
      
 51 
     | 
    
         
            +
                       
         
     | 
| 
      
 52 
     | 
    
         
            +
                       
         
     | 
| 
      
 53 
     | 
    
         
            +
                       "split_nones" => "bool",
         
     | 
| 
      
 54 
     | 
    
         
            +
                       "print_eval_log" => "bool",
         
     | 
| 
      
 55 
     | 
    
         
            +
                       "assume_argrec_perfect" => "bool", 
         
     | 
| 
      
 56 
     | 
    
         
            +
                       "xwise_argrec" => "string",
         
     | 
| 
      
 57 
     | 
    
         
            +
                       "xwise_arglab" => "string",
         
     | 
| 
      
 58 
     | 
    
         
            +
                       "xwise_onestep" => "string",
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
                       "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
         
     | 
| 
      
 61 
     | 
    
         
            +
                       "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                       "prune" => "string",       # pruning prior to argrec?
         
     | 
| 
      
 64 
     | 
    
         
            +
            	    
         
     | 
| 
      
 65 
     | 
    
         
            +
            	  },
         
     | 
| 
      
 66 
     | 
    
         
            +
            	  ["exp_ID", "test_ID", "split_ID", "feature_name", "classif", "step", 
         
     | 
| 
      
 67 
     | 
    
         
            +
                       "group", "dataset","mode"]                      # variables
         
     | 
| 
      
 68 
     | 
    
         
            +
            	  )
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
                # set access functions for list features
         
     | 
| 
      
 71 
     | 
    
         
            +
                set_list_feature_access("feature", 
         
     | 
| 
      
 72 
     | 
    
         
            +
            			    method("access_feature"))
         
     | 
| 
      
 73 
     | 
    
         
            +
                
         
     | 
| 
      
 74 
     | 
    
         
            +
                # set access function for list feature 'classifier' (reuses access_feature)
         
     | 
| 
      
 75 
     | 
    
         
            +
                set_list_feature_access("classifier", 
         
     | 
| 
      
 76 
     | 
    
         
            +
            			    method("access_feature"))
         
     | 
| 
      
 77 
     | 
    
         
            +
                
         
     | 
| 
      
 78 
     | 
    
         
            +
              end
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
              ###
         
     | 
| 
      
 81 
     | 
    
         
            +
              # protected
         
     | 
| 
      
 82 
     | 
    
         
            +
              
         
     | 
| 
      
 83 
     | 
    
         
            +
              #####
         
     | 
| 
      
 84 
     | 
    
         
            +
              # access_feature
         
     | 
| 
      
 85 
     | 
    
         
            +
              #
         
     | 
| 
      
 86 
     | 
    
         
            +
              # access function for the list features 'feature' and 'classifier'
         
     | 
| 
      
 87 
     | 
    
         
            +
              #
         
     | 
| 
      
 88 
     | 
    
         
            +
              # assumed format in the config file:
         
     | 
| 
      
 89 
     | 
    
         
            +
              #
         
     | 
| 
      
 90 
     | 
    
         
            +
              #   feature = path [option]*
         
     | 
| 
      
 91 
     | 
    
         
            +
              #
         
     | 
| 
      
 92 
     | 
    
         
            +
              # i.e. first the name of the feature type to use, then
         
     | 
| 
      
 93 
     | 
    
         
            +
              # optionally options associated with that feature,
         
     | 
| 
      
 94 
     | 
    
         
            +
              # e.g. 'argrec': use that feature only when computing argrec
         
     | 
| 
      
 95 
     | 
    
         
            +
              #
         
     | 
| 
      
 96 
     | 
    
         
            +
              # the access function is called with parameter val_list, an array of
         
     | 
| 
      
 97 
     | 
    
         
            +
              # string tuples, one string tuple for each feature defined.
         
     | 
| 
      
 98 
     | 
    
         
            +
              # the first string in the tuple is the feature name, the rest are the options
         
     | 
| 
      
 99 
     | 
    
         
            +
              #
         
     | 
| 
      
 100 
     | 
    
         
            +
              # returns: a list of pairs [feature_name(string), options(array:string)]
         
     | 
| 
      
 101 
     | 
    
         
            +
              # of defined features
         
     | 
| 
      
 102 
     | 
    
         
            +
              def access_feature(val_list) # array:array:string: tuples read from the config file
                # nothing was handed in: report an empty list
                return [] if val_list.nil?

                # split each tuple into its head (the name) and its tail (the options)
                val_list.map do |descr|
                  name = descr.first
                  options = descr[1..-1]
                  [name, options]
                end
              end
         
     | 
| 
      
 112 
     | 
    
         
            +
            end
         
     | 
| 
      
 113 
     | 
    
         
            +
             
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
             
         
     |