RubyGems - frprep - Versions diffs - 0.0.1.prealpha - Mend

frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

data/.yardopts +8 -0
data/CHANGELOG.rdoc +0 -0
data/LICENSE.rdoc +0 -0
data/README.rdoc +0 -0
data/lib/common/AbstractSynInterface.rb +1227 -0
data/lib/common/BerkeleyInterface.rb +375 -0
data/lib/common/CollinsInterface.rb +1165 -0
data/lib/common/ConfigData.rb +694 -0
data/lib/common/Counter.rb +18 -0
data/lib/common/DBInterface.rb +48 -0
data/lib/common/EnduserMode.rb +27 -0
data/lib/common/Eval.rb +480 -0
data/lib/common/FixSynSemMapping.rb +196 -0
data/lib/common/FrPrepConfigData.rb +66 -0
data/lib/common/FrprepHelper.rb +1324 -0
data/lib/common/Graph.rb +345 -0
data/lib/common/ISO-8859-1.rb +24 -0
data/lib/common/ML.rb +186 -0
data/lib/common/Maxent.rb +215 -0
data/lib/common/MiniparInterface.rb +1388 -0
data/lib/common/Optimise.rb +195 -0
data/lib/common/Parser.rb +213 -0
data/lib/common/RegXML.rb +269 -0
data/lib/common/RosyConventions.rb +171 -0
data/lib/common/SQLQuery.rb +243 -0
data/lib/common/STXmlTerminalOrder.rb +194 -0
data/lib/common/SalsaTigerRegXML.rb +2347 -0
data/lib/common/SalsaTigerXMLHelper.rb +99 -0
data/lib/common/SleepyInterface.rb +384 -0
data/lib/common/SynInterfaces.rb +275 -0
data/lib/common/TabFormat.rb +720 -0
data/lib/common/Tiger.rb +1448 -0
data/lib/common/TntInterface.rb +44 -0
data/lib/common/Tree.rb +61 -0
data/lib/common/TreetaggerInterface.rb +303 -0
data/lib/common/headz.rb +338 -0
data/lib/common/option_parser.rb +13 -0
data/lib/common/ruby_class_extensions.rb +310 -0
data/lib/fred/Baseline.rb +150 -0
data/lib/fred/FileZipped.rb +31 -0
data/lib/fred/FredBOWContext.rb +863 -0
data/lib/fred/FredConfigData.rb +182 -0
data/lib/fred/FredConventions.rb +232 -0
data/lib/fred/FredDetermineTargets.rb +324 -0
data/lib/fred/FredEval.rb +312 -0
data/lib/fred/FredFeatureExtractors.rb +321 -0
data/lib/fred/FredFeatures.rb +1061 -0
data/lib/fred/FredFeaturize.rb +596 -0
data/lib/fred/FredNumTrainingSenses.rb +27 -0
data/lib/fred/FredParameters.rb +402 -0
data/lib/fred/FredSplit.rb +84 -0
data/lib/fred/FredSplitPkg.rb +180 -0
data/lib/fred/FredTest.rb +607 -0
data/lib/fred/FredTrain.rb +144 -0
data/lib/fred/PlotAndREval.rb +480 -0
data/lib/fred/fred.rb +45 -0
data/lib/fred/md5.rb +23 -0
data/lib/fred/opt_parser.rb +250 -0
data/lib/frprep/AbstractSynInterface.rb +1227 -0
data/lib/frprep/Ampersand.rb +37 -0
data/lib/frprep/BerkeleyInterface.rb +375 -0
data/lib/frprep/CollinsInterface.rb +1165 -0
data/lib/frprep/ConfigData.rb +694 -0
data/lib/frprep/Counter.rb +18 -0
data/lib/frprep/FNCorpusXML.rb +643 -0
data/lib/frprep/FNDatabase.rb +144 -0
data/lib/frprep/FixSynSemMapping.rb +196 -0
data/lib/frprep/FrPrepConfigData.rb +66 -0
data/lib/frprep/FrameXML.rb +513 -0
data/lib/frprep/FrprepHelper.rb +1324 -0
data/lib/frprep/Graph.rb +345 -0
data/lib/frprep/ISO-8859-1.rb +24 -0
data/lib/frprep/MiniparInterface.rb +1388 -0
data/lib/frprep/Parser.rb +213 -0
data/lib/frprep/RegXML.rb +269 -0
data/lib/frprep/STXmlTerminalOrder.rb +194 -0
data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
data/lib/frprep/SleepyInterface.rb +384 -0
data/lib/frprep/SynInterfaces.rb +275 -0
data/lib/frprep/TabFormat.rb +720 -0
data/lib/frprep/Tiger.rb +1448 -0
data/lib/frprep/TntInterface.rb +44 -0
data/lib/frprep/Tree.rb +61 -0
data/lib/frprep/TreetaggerInterface.rb +303 -0
data/lib/frprep/do_parses.rb +142 -0
data/lib/frprep/frprep.rb +686 -0
data/lib/frprep/headz.rb +338 -0
data/lib/frprep/one_parsed_file.rb +28 -0
data/lib/frprep/opt_parser.rb +94 -0
data/lib/frprep/ruby_class_extensions.rb +310 -0
data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
data/lib/rosy/DBMySQL.rb +146 -0
data/lib/rosy/DBSQLite.rb +280 -0
data/lib/rosy/DBTable.rb +239 -0
data/lib/rosy/DBWrapper.rb +176 -0
data/lib/rosy/ExternalConfigData.rb +58 -0
data/lib/rosy/FailedParses.rb +130 -0
data/lib/rosy/FeatureInfo.rb +242 -0
data/lib/rosy/GfInduce.rb +1115 -0
data/lib/rosy/GfInduceFeature.rb +148 -0
data/lib/rosy/InputData.rb +294 -0
data/lib/rosy/RosyConfigData.rb +115 -0
data/lib/rosy/RosyConfusability.rb +338 -0
data/lib/rosy/RosyEval.rb +465 -0
data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
data/lib/rosy/RosyFeaturize.rb +280 -0
data/lib/rosy/RosyInspect.rb +336 -0
data/lib/rosy/RosyIterator.rb +477 -0
data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
data/lib/rosy/RosyPruning.rb +165 -0
data/lib/rosy/RosyServices.rb +744 -0
data/lib/rosy/RosySplit.rb +232 -0
data/lib/rosy/RosyTask.rb +19 -0
data/lib/rosy/RosyTest.rb +826 -0
data/lib/rosy/RosyTrain.rb +232 -0
data/lib/rosy/RosyTrainingTestTable.rb +786 -0
data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
data/lib/rosy/View.rb +418 -0
data/lib/rosy/opt_parser.rb +379 -0
data/lib/rosy/rosy.rb +77 -0
data/lib/shalmaneser/version.rb +3 -0
data/test/frprep/test_opt_parser.rb +94 -0
data/test/functional/functional_test_helper.rb +40 -0
data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
data/test/functional/test_fred.rb +47 -0
data/test/functional/test_frprep.rb +52 -0
data/test/functional/test_rosy.rb +20 -0
metadata +270 -0

data/lib/rosy/GfInduceFeature.rb ADDED

@@ -0,0 +1,148 @@
+# GfInduceFeature
+# Katrin Erk Jan 06
+#
+# use result of GfInduce.rb as
+# feature for Rosy
+require "rosy/GfInduce"
+require "rosy/AbstractFeatureAndExternal"
+require "common/ruby_class_extensions"
+###
+# make filename for GfInduce pickle file
+def filename_gfmap(exp,         # ExternalConfigData object
+		   interpreter) # SynInterpreter class
+  # output dir as given in my experiment file
+  # If there is an experiment ID, make subdirectory
+  # named after the experiment ID and place the data there.
+  output_dir = File.new_dir(exp.get("directory"))
+  if exp.get("experiment_id")
+    output_dir = File.new_dir(output_dir, exp.get("experiment_id"))
+  end
+  # output file name:
+  # Gfmap.{<service>=<system_name>.}*{OPT<service>=<system_name>.}*pkl
+  return output_dir +
+    "Gfmap." +
+    interpreter.systems().to_a.map { |service, system_name|
+    service.to_s+ "=" + system_name.to_s
+  }.sort.join(".") + "." +
+    interpreter.optional_systems().to_a.map { |service, system_name|
+    "OPT" + service.to_s + "=" + system_name.to_s
+  }.sort.join(".") + ".pkl"
+end
+################################
+# base class for all following feature extractors
+class GfInduceFeatureExtractor < ExternalFeatureExtractor
+  GfInduceFeatureExtractor.announce_me()
+  @@okay = true  # external experiment file present?
+  @@gf_obj = nil # GfInduce object
+  @@node_to_gf = nil # Hash: SynNodes of a sentence -> Gf label
+  def GfInduceFeatureExtractor.designator()
+    return "gf_fn"
+  end
+  def GfInduceFeatureExtractor.feature_names()
+    return ["gf_fn"]
+  end
+  def GfInduceFeatureExtractor.sql_type()
+    return "VARCHAR(25)"
+  end
+  def GfInduceFeatureExtractor.feature_type()
+    return "syn"
+  end
+  def GfInduceFeatureExtractor.phase()
+    return "phase 1"
+  end
+  ###
+  # set sentence, set node, set other settings:
+  # this is done prior to
+  # feature computation using compute_feature()
+  # such that computations that stay the same for
+  # several features can be done in advance
+  #
+  # This is just relevant for Phase 1
+  #
+  # returns: false/nil if there was a problem
+  def GfInduceFeatureExtractor.set_sentence(sent,  # SalsaTigerSentence object
+                                            frame) # FrameNode object
+    super(sent, frame)
+    if @@okay
+      # we can actually compute something
+      # let the GF object compute all subcat frames
+      # for the target of this frame
+      subcatframes_of_current_target = @@gf_obj.apply(frame.target.children())
+      # keep the most frequent one of the
+      # subcat frames returned by the GF object:
+      if subcatframes_of_current_target.empty?
+        # no subcat frames returned
+        subcatframe = []
+      else
+        # we have at least one subcat frame:
+        # keep the most frequent one of them
+        #
+        # Also, subcatframes_of_current_target
+        # contains triples [frame, actual_subcatframe, frequency]
+        # Of these, keep just the actual_subcatframe
+        subcatframe = subcatframes_of_current_target.sort { |a, b|
+          # sort by frequency
+          b.last <=> a.last
+        }.first[1]
+      end
+      # change into a mapping node(SynNode) -> GF(string)
+      @@node_to_gf = Hash.new
+      subcatframe.each { |gf, prep, fe, synnodes|
+        synnodes.each { |node|
+          @@node_to_gf[node] = "#{gf} #{prep}"
+        }
+      }
+    end
+  end
+  ###
+  # Initialize: read GFInduce pickle
+  def initialize(exp,                  # experiment file object
+                 interpreter_class)    # SynInterpreter class
+    super(exp, interpreter_class)
+    if @exp_external
+      pickle_filename = filename_gfmap(@exp_external, @@interpreter_class)
+      @@gf_obj = GfInduce.from_file(pickle_filename)
+      @@okay = true
+    else
+      # signal that you cannot compute anything
+      @@okay = false
+    end
+  end
+  ###
+  # compute: compute features
+  #
+  # returns an array of features (strings), length the same as the
+  # length of feature_names()
+  #
+  # here: array of length one, content either a string or nil
+  def compute_features()
+    # current node: @@node
+    # check whether the current node has been assigned a slot
+    # in the subcat frame
+    if @@okay
+      return [ @@node_to_gf[@@node] ]
+    else
+      return [ nil ]
+    end
+  end
+end

data/lib/rosy/InputData.rb ADDED

@@ -0,0 +1,294 @@
+###########
+#
+# ke / sp 12 04 05
+#
+# class for input data object
+# offers methods for preprocessing and
+# featurization
+# Salsa packages
+require "common/Parser"
+require "common/SalsaTigerRegXML"
+require "common/ruby_class_extensions"
+# Fred/Rosy packages
+require "rosy/FailedParses"
+require "common/RosyConventions"
+require "rosy/RosyFeatureExtractors"
+require "rosy/RosyPhase2FeatureExtractors"
+require "rosy/RosyPruning"
+require "rosy/GfInduceFeature"
+require "common/FixSynSemMapping"
+class InputData
+  ###
+  def initialize(exp_object,          # RosyConfigData object
+                 dataset,             # train/test
+		 feature_info_object, # FeatureInfo object
+                 interpreter_class,   # SynInterpreter class
+                 input_dir)           # Directory with input files
+    @exp = exp_object
+    @dataset = dataset
+    @interpreter_class = interpreter_class
+    @input_dir = input_dir
+    # store information about failed parses here
+    @failed_parses = FailedParses.new()
+    # feature_extractors_phase1: array of AbstractFeatureExtractor objects
+    @extractors_p1_rosy, @extractors_p1_other = feature_info_object.get_extractor_objects("phase 1",
+                                                                                          @interpreter_class)
+    # global settings
+    unless RosyFeatureExtractor.set("split_nones" => @exp.get("split_nones"))
+      raise "Some grave problem during feature extractor initialization"
+    end
+#     # nothing to set here for now, so deactivated
+#     @extractors_p1_other.each { |extractor_obj|
+#       unless extractor_obj.class.set()
+#         raise "Some grave problem during feature extractor initialization"
+#       end
+#     }
+    # feature_extractors_phase2: array of  AbstractFeatureExtractor objects
+    extractors_p2_rosy, extractors_p2_other = feature_info_object.get_extractor_objects("phase 2",
+                                                                                        @interpreter_class)
+    @feature_extractors_phase2 = extractors_p2_rosy + extractors_p2_other
+  end
+  ###
+  # each_instance_phase1()
+  #
+  # reads the input data from file(s), in the specific input format,
+  # separates it into instances,
+  # threads it through all phase 1 feature extractors
+  # and yields one feature vector per instance
+  #
+  # yields: pairs [feature_name(string), feature_value(object)]
+  def each_instance_phase1()
+    Dir[@input_dir+"*.xml"]. each {|parsefilename|
+      xmlFile = FilePartsParser.new(parsefilename)
+      $stderr.puts "Processing #{parsefilename}"
+      xmlFile.scan_s {|sent_string|
+        sent = SalsaTigerSentence.new(sent_string)
+        # preprocessing: possibly change the SalsaTigerSentence object
+        # before featurization
+        preprocess(sent)
+        sent.each_frame{ |frame|
+          # skip failed parses
+          if sent.get_attribute("failed")
+            handle_failed_parse(sent, frame)
+            next
+          end
+          # Tell feature extractors about the sentence and frame:
+          # first Rosy feature extractors, then the others
+          # if there is a problem, skip this frame
+          unless RosyFeatureExtractor.set_sentence(sent, frame)
+            next
+          end
+          skip_frame = false
+          @extractors_p1_other.each { |extractor_obj|
+            unless extractor_obj.class.set_sentence(sent, frame)
+              skip_frame = true
+              break
+            end
+          }
+          if skip_frame
+            next
+          end
+          sent.each_syn_node { |syn_node|
+            # Tell feature extractors about the current node:
+            # first Rosy feature extractors, then the others
+            # if there is a problem, skip this node
+            unless RosyFeatureExtractor.set_node(syn_node)
+              next
+            end
+            skip_node = false
+            @extractors_p1_other.each { |extractor_obj|
+              unless extractor_obj.class.set_node(syn_node)
+                skip_node = true
+                break
+              end
+            }
+            if skip_node
+              next
+            end
+            # features: array of pairs: [feature_name(string), feature_value(object)]
+            features = Array.new
+            (@extractors_p1_rosy + @extractors_p1_other).each { |extractor|
+              # compute features
+              feature_names = extractor.class.feature_names()
+              feature_index = 0
+              # append new features to features array
+              features.concat extractor.compute_features().map { |feature_value|
+                feature_name = feature_names[feature_index]
+                feature_index += 1
+                # sanity check: feature value longer than the allotted space in the DB?
+                check_feature_length(feature_name, feature_value, extractor)
+                [feature_name, nonnil_feature(feature_value, extractor.class.sql_type()) ]
+              }
+            }
+            yield features
+          } # each syn node
+        } # each frame
+      } # each sentence
+    }
+  end
+  ###
+  # each_phase2_column
+  #
+  # This method implements the application of the
+  # phase 2 extractors to data.
+  #
+  # Given a database view (of either training or test data),
+  # assign a new feature value to each instance
+  #
+  # yields pairs [feature_name(string), feature_values(array)]
+  # The feature_values array has as many lines as the view has instances
+  # so the yield of this method can be fed directly into view.update_column()
+  def each_phase2_column(view) # View object: training or test data
+    @feature_extractors_phase2.each { |extractor|
+      # apply the extractor
+      feature_columns = extractor.compute_features_on_view(view)
+      # interleave with feature values and yield
+      feature_index = 0
+      feature_names = extractor.class.feature_names()
+      feature_columns.each { |feature_values|
+        yield [
+          feature_names[feature_index],
+          feature_values.map { |feature_val| nonnil_feature(feature_val, extractor.class.sql_type)  }
+        ]
+        feature_index += 1
+      }
+    }
+  end
+  ###
+  # get_failed_parses
+  #
+  # returns the FailedParses object in which the info about failed parses has been stored
+  def get_failed_parses()
+    return @failed_parses
+  end
+  #################################
+  private
+  ###
+  def nonnil_feature(feature_value,
+                     sql_type)
+    # feature value nil? then change to noval
+    if feature_value.nil? and sql_type =~ /CHAR/
+      return @exp.get("noval")
+    elsif feature_value.class.to_s == "String" and feature_value.empty?
+      return @exp.get("noval")
+    elsif feature_value.nil?
+      return 0
+    else
+      return feature_value
+    end
+  end
+  ###
+  # preprocess: possibly change the given SalsaTigerSentence
+  # to enable better learning
+  def preprocess(sent)           # SalsaTigerSentence object
+    if @dataset == "train" and
+        (@exp.get("fe_syn_repair") or @exp.get("fe_rel_repair"))
+      FixSynSemMapping.fixit(sent, @exp, @interpreter_class)
+    end
+  end
+  ###
+  # register failed parses
+  def handle_failed_parse(sent,  # SalsaTigerSentence object
+                          frame) # FrameNode
+    # target POS
+    if frame.target()
+      main_target = @interpreter_class.main_node_of_expr(frame.target.children(), "no_mwe")
+    else
+      main_target = nil
+    end
+    if main_target
+      target_pos = @interpreter_class.category(main_target)
+    else
+      target_pos = nil
+    end
+    if frame.target()
+      target_str = frame.target().yield_nodes_ordered().map { |t_node|
+        if t_node.is_syntactic?
+          @interpreter_class.lemma_backoff(t_node)
+        else
+          # not a syntactic node: maybe an unassigned target?
+          ""
+        end
+      }.join(" ")
+    else
+      target_str = ""
+    end
+    @failed_parses.register(construct_instance_id(sent.id(), frame.id()),
+                            frame.name(),
+                            target_str,
+                            target_pos,
+                            frame.children.map { |fe| fe.name })
+  end
+  ###
+  # sanity check: feature value longer than the allotted space in the DB?
+  def check_feature_length(feature_name,  # string
+                           feature_value, # object
+                           extractor_obj) # AbstractFeatureExtractor object
+    if extractor_obj.class.sql_type() =~ /(\d+)/
+      # sql type contains some statement about the length.
+      # just crudely compare to feature length
+      length = $1.to_i
+      if feature_value.class == String and
+          feature_value.length() > length
+        if feature_name == "sentid"
+	  print length;
+          print feature_value;
+	  print feature_value.length();
+	  # if the sentence (instance) ID is too long, we cannot go on.
+          $stderr.puts "Error: Instance ID is longer than its DB column."
+          $stderr.puts "Please increase the DB column size in {Tiger,Collins}FeatureExtractors.rb"
+          raise "SQL entry length surpassed"
+        elsif @exp.get("verbose")
+          # KE Feb 07: don't print warning,
+          # this is just too frequent
+          # for other features, we just issue a warning, and only if we are verbose
+          # $stderr.puts "Warning: feature #{feature_name} longer than its DB column (#{length.to_s} vs #{feature_value.length()}): #{feature_value}"
+        end # feature name check
+      end # length surpassed
+    end # length found in sql type
+  end
+end

data/lib/rosy/RosyConfigData.rb ADDED

@@ -0,0 +1,115 @@
+require 'common/ConfigData'
+##############################
+# Class RosyConfigData
+#
+# inherits from ConfigData,
+# sets features for ROSY
+class RosyConfigData < ConfigData
+  def initialize(filename)
+    super(filename,                      # config file
+	  { # features
+            "feature" => "list",
+           "classifier" => "list",
+           "verbose" => "bool" ,
+            "enduser_mode" => "bool",
+            "experiment_ID" => "string",
+            "directory_input_train" => "string",
+            "directory_input_test" => "string",
+            "directory_output" => "string",
+            "preproc_descr_file_train" => "string",
+            "preproc_descr_file_test" => "string",
+            "external_descr_file"    => "string",
+            "dbtype" => "string",    # "mysql" or "sqlite"
+            "host" => "string",      # DB access: sqlite only
+            "user" => "string",
+            "passwd" => "string",
+            "dbname" => "string",
+            "data_dir" => "string",  # for external use
+            "rosy_dir" => "pattern", # for internal use only, set by rosy.rb
+           "classifier_dir" => "string", # if present, special directory for classifiers
+           "classif_column_name" => "string",
+           "main_table_name" => "pattern",
+           "test_table_name" => "pattern",
+           "eval_file" => "pattern",
+           "log_file" => "pattern",
+           "failed_file" => "pattern",
+           "classifier_file" => "pattern",
+           "classifier_output_file" => "pattern",
+           "noval" => "string",
+           "split_nones" => "bool",
+           "print_eval_log" => "bool",
+           "assume_argrec_perfect" => "bool",
+           "xwise_argrec" => "string",
+           "xwise_arglab" => "string",
+           "xwise_onestep" => "string",
+           "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
+           "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
+           "prune" => "string",       # pruning prior to argrec?
+	  },
+	  ["exp_ID", "test_ID", "split_ID", "feature_name", "classif", "step",
+           "group", "dataset","mode"]                      # variables
+	  )
+    # set access functions for list features
+    set_list_feature_access("feature",
+			    method("access_feature"))
+    # set access functions for list features
+    set_list_feature_access("classifier",
+			    method("access_feature"))
+  end
+  ###
+  # protected
+  #####
+  # access_feature
+  #
+  # access function for feature 'feature'
+  #
+  # assumed format in the config file:
+  #
+  #   feature = path [option]*
+  #
+  # i.e. first the name of the feature type to use, then
+  # optionally options associated with that feature,
+  # e.g. 'argrec': use that feature only when computing argrec
+  #
+  # the access function is called with parameter val_list, an array of
+  # string tuples, one string tuple for each feature defined.
+  # the first string in the tuple is the feature name, the rest are the options
+  #
+  # returns: a list of pairs [feature_name(string), options(array:string)]
+  # of defined features
+  def access_feature(val_list) # array:array:string: list of tuples defined in config file
+		               # for feature 'feature'
+    if val_list.nil?
+      return []
+    else
+      return val_list.map { |feature_descr_tuple|
+        [feature_descr_tuple.first, feature_descr_tuple[1..-1]]
+      }
+    end
+  end
+end