RubyGems - shalmaneser-lib - Versions diffs - 1.2.rc5 - Mend

shalmaneser-lib 1.2.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

checksums.yaml +7 -0
data/.yardopts +10 -0
data/CHANGELOG.md +4 -0
data/LICENSE.md +4 -0
data/README.md +122 -0
data/lib/configuration/config_data.rb +457 -0
data/lib/configuration/config_format_element.rb +210 -0
data/lib/configuration/configuration_error.rb +15 -0
data/lib/configuration/external_config_data.rb +56 -0
data/lib/configuration/frappe_config_data.rb +134 -0
data/lib/configuration/fred_config_data.rb +199 -0
data/lib/configuration/rosy_config_data.rb +126 -0
data/lib/db/db_interface.rb +50 -0
data/lib/db/db_mysql.rb +141 -0
data/lib/db/db_sqlite.rb +280 -0
data/lib/db/db_table.rb +237 -0
data/lib/db/db_view.rb +416 -0
data/lib/db/db_wrapper.rb +175 -0
data/lib/db/select_table_and_columns.rb +10 -0
data/lib/db/sql_query.rb +243 -0
data/lib/definitions.rb +19 -0
data/lib/eval.rb +482 -0
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/external_systems.rb +251 -0
data/lib/framenet_format/fn_corpus_aset.rb +209 -0
data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
data/lib/framenet_format/fn_database.rb +143 -0
data/lib/framenet_format/frame_xml_file.rb +104 -0
data/lib/framenet_format/frame_xml_sentence.rb +411 -0
data/lib/logging.rb +25 -0
data/lib/ml/classifier.rb +189 -0
data/lib/ml/mallet.rb +236 -0
data/lib/ml/maxent.rb +229 -0
data/lib/ml/optimize.rb +195 -0
data/lib/ml/timbl.rb +140 -0
data/lib/monkey_patching/array.rb +82 -0
data/lib/monkey_patching/enumerable_bool.rb +24 -0
data/lib/monkey_patching/enumerable_distribute.rb +18 -0
data/lib/monkey_patching/file.rb +131 -0
data/lib/monkey_patching/subsumed.rb +24 -0
data/lib/ruby_class_extensions.rb +4 -0
data/lib/salsa_tiger_xml/corpus.rb +24 -0
data/lib/salsa_tiger_xml/fe_node.rb +98 -0
data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
data/lib/salsa_tiger_xml/frame_node.rb +145 -0
data/lib/salsa_tiger_xml/graph_node.rb +347 -0
data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
data/lib/salsa_tiger_xml/sem_node.rb +58 -0
data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
data/lib/salsa_tiger_xml/syn_node.rb +169 -0
data/lib/salsa_tiger_xml/tree_node.rb +59 -0
data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
data/lib/salsa_tiger_xml/usp_node.rb +72 -0
data/lib/salsa_tiger_xml/xml_node.rb +163 -0
data/lib/shalmaneser/lib.rb +1 -0
data/lib/tabular_format/fn_tab_format_file.rb +38 -0
data/lib/tabular_format/fn_tab_frame.rb +67 -0
data/lib/tabular_format/fn_tab_sentence.rb +169 -0
data/lib/tabular_format/tab_format_file.rb +91 -0
data/lib/tabular_format/tab_format_named_args.rb +184 -0
data/lib/tabular_format/tab_format_sentence.rb +119 -0
data/lib/value_restriction.rb +49 -0
metadata +131 -0

data/lib/configuration/config_format_element.rb ADDED

@@ -0,0 +1,210 @@
+##############################
+# ConfigFormatElement is an auxiliary class
+# of ConfigData.
+# It keeps track of feature patterns with variables in them
+# that can be instantiated.
+# @author Andrei Beliankou
+#
+require_relative 'configuration_error'
+module Shalmaneser
+  module Configuration
+    # A feature-name pattern that may contain variables written as <name>.
+    #
+    # The pattern string is split into fixed parts and variables. The result
+    # can be instantiated with concrete variable values (#instantiate) or
+    # matched against a concrete string to recover the values (#match).
+    class ConfigFormatElement
+      # Analyze a pattern and remember the variable names.
+      #
+      # @param string [String] feature name; variable names are enclosed in <>
+      # @param variables [Array<String>] variable names that may occur
+      # @raise [ConfigurationError] on a "<" inside a variable, a ">" outside
+      #   a variable, an unclosed "<", or a variable name not in `variables`
+      def initialize(string, variables)
+        @variables = variables
+        # @pattern is an array of pairs:
+        #   fixed part: [text, "string"]
+        #   variable:   [variable_name, "variable"]
+        @pattern = parse_pattern(string)
+        # regexp used by #match when no fillers are given
+        @regexp = make_regexp(@pattern)
+      end
+
+      # Replace every variable in the pattern by its value.
+      #
+      # @param var_hash [Hash] variable name(string) => variable value(string)
+      # @return [String] the pattern with all variables filled in
+      # @raise [ConfigurationError] if a variable of the pattern has no value
+      def instantiate(var_hash)
+        parts = @pattern.map do |item, kind|
+          case kind
+          when "string"
+            item
+          when "variable"
+            value = var_hash[item]
+            if value.nil?
+              raise ConfigurationError, "Missing variable instantiation: #{item}."
+            end
+            value
+          else
+            raise ConfigurationError, "Shouldn't be here!"
+          end
+        end
+        parts.join
+      end
+
+      # Match a string against the pattern, with the variables named in
+      # `fillers` fixed to the given values.
+      #
+      # @param string [String] the string to match
+      # @param fillers [Hash, nil] variable name(string) => value(string)
+      # @return [Hash, false] on a match, a hash variable name => value holding
+      #   the fillers plus the values of all other variables of the pattern;
+      #   false if the string does not match, or if a variable that occurs
+      #   more than once would get two different values
+      # @raise [ConfigurationError] if the regexp matched but yielded fewer
+      #   groups than there are unfilled variables
+      def match(string, fillers = nil)
+        # partial information about variables needs its own regexp
+        regexp = fillers ? make_regexp(@pattern, fillers) : @regexp
+        found = regexp.match(string)
+        return false if found.nil?
+
+        retv = {}
+        fillers.each_pair { |name, val| retv[name] = val } if fillers
+
+        # Every variable without a filler owns one capture group,
+        # in pattern order; index walks over those groups.
+        index = 1
+        @pattern.each do |item, kind|
+          next unless kind == "variable"
+          next unless fillers.nil? || fillers[item].nil?
+
+          if found[index].nil?
+            raise ConfigurationError, "Match, but not enough matched elements? Strange."
+          end
+          if retv[item].nil?
+            retv[item] = found[index]
+          else
+            # repeated variable: all occurrences must agree
+            return false unless retv[item] == found[index]
+          end
+          index += 1
+        end
+        retv
+      end
+
+      # @return [Array<String>] names of the variables used in the pattern,
+      #   in order of occurrence (repeated variables are listed repeatedly)
+      def used_variables
+        variable_entries = @pattern.select { |_item, kind| kind == "variable" }
+        variable_entries.map(&:first)
+      end
+
+      ####################
+      private
+
+      # Split a pattern string into fixed parts and variables.
+      #
+      # @param string [String] the pattern, variables enclosed in <>
+      # @return [Array] pairs [text, "string"] or [variable_name, "variable"]
+      # @raise [ConfigurationError] on malformed brackets or unknown variables
+      def parse_pattern(string)
+        pattern = []
+        item = "".dup
+        in_variable = false
+        string.each_char do |char|
+          if in_variable
+            case char
+            when "<"
+              raise ConfigurationError, "Duplicate < in #{string}."
+            when ">"
+              unless @variables.include?(item)
+                raise ConfigurationError, "Unknown variable #{item}."
+              end
+              pattern << [item, "variable"]
+              item = "".dup
+              in_variable = false
+            else
+              item << char
+            end
+          else
+            case char
+            when "<"
+              # a fixed part ends here, record it unless it is empty
+              pattern << [item, "string"] unless item.empty?
+              item = "".dup
+              in_variable = true
+            when ">"
+              raise ConfigurationError, "Unexpected > in #{string}."
+            else
+              item << char
+            end
+          end
+        end
+        # the whole string has been read: we must be outside a variable
+        raise ConfigurationError, "Unclosed < in #{string}." if in_variable
+        # last fixed part still to be recorded?
+        pattern << [item, "string"] unless item.empty?
+        pattern
+      end
+
+      # Make a regular expression from a pattern together with some
+      # variable fillers. Variables with a filler are matched literally,
+      # all other variables become a capture group.
+      #
+      # @return [Regexp] object
+      # @param [Array] pattern An array of pairs [string, "string"] or [string, "variable"]
+      # @param [Hash] fillers A Hash variable name(string) => value(string)
+      # @raise [ConfigurationError] if a pattern entry is neither kind
+      def make_regexp(pattern, fillers = nil)
+        source = pattern.map do |item, kind|
+          case kind
+          when "variable"
+            fillers && fillers[item] ? Regexp.escape(fillers[item]) : '(.+)'
+          when "string"
+            Regexp.escape(item)
+          else
+            # @todo Find the source of this error.
+            raise ConfigurationError, "Shouldn't be here"
+          end
+        end.join
+        # NOTE(review): ^ and $ anchor at line boundaries, so one matching
+        # line inside a multi-line string is enough; confirm whether \A and
+        # \z are wanted before changing this.
+        Regexp.new("^#{source}$")
+      end
+    end
+  end
+end

data/lib/configuration/configuration_error.rb ADDED

@@ -0,0 +1,15 @@
+module Shalmaneser
+  module Configuration
+    # Error raised for invalid configuration data. It can wrap another
+    # exception, whose class and message are then put in front of the
+    # custom message.
+    class ConfigurationError < StandardError
+      # @param [String] msg A custom message for this exception.
+      # @param [Exception] nested_exception An external exception
+      #   which is reused to provide more information.
+      def initialize(msg = nil, nested_exception = nil)
+        if nested_exception
+          nested = "#{nested_exception.class}: #{nested_exception.message}"
+          # without a custom message, do not leave a dangling newline
+          msg = msg.to_s.empty? ? nested : "#{nested}\n#{msg}"
+        end
+        super(msg)
+      end
+    end
+  end
+end

data/lib/configuration/external_config_data.rb ADDED

@@ -0,0 +1,56 @@
+# ExternalConfigData
+# Katrin Erk January 2006
+#
+# All scripts that compute additional external knowledge sources
+# for Fred and Rosy:
+# access to configuration and experiment description file
+require_relative 'config_data'
+##############################
+# Class ExternalConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to tasks of external knowledge sources
+module Shalmaneser
+  module Configuration
+    # Configuration for scripts that compute additional external knowledge
+    # sources: declares the accepted parameters and registers the access
+    # function for the list-valued parameter "gfmap_remove_gf".
+    class ExternalConfigData < ConfigData
+      # @param filename [String] the config file
+      def initialize(filename)
+        # parameter name => parameter type
+        features = {
+          "directory" => "string",
+          "experiment_id" => "string",
+          "gfmap_restrict_to_downpath" => "bool",
+          "gfmap_restrict_pathlen" => "integer",
+          "gfmap_remove_gf" => "list"
+        }
+        # this configuration uses no variables
+        variables = []
+        super(filename, features, variables)
+
+        # set access functions for list features
+        accessor = method(:access_as_stringlist)
+        set_list_feature_access("gfmap_remove_gf", accessor)
+      end
+
+      ###
+      protected
+
+      # Access function for list features of the format
+      #
+      #   lhs = rhs1 rhs2 ... rhsN
+      #
+      # Each such line is given in val_list as a string tuple [rhs1,...,rhsN].
+      #
+      # @param val_list [Array<Array<String>>] one tuple per config line
+      # @return [Array<String>] one string "rhs1 rhs2 ... rhsN" per tuple,
+      #   the tuple's elements joined by single spaces
+      def access_as_stringlist(val_list)
+        val_list.map do |tokens|
+          tokens.join(" ")
+        end
+      end
+    end
+  end
+end

data/lib/configuration/frappe_config_data.rb ADDED

@@ -0,0 +1,134 @@
+# FrappeConfigData (formerly FPrepConfigData)
+# Katrin Erk July 05
+#
+# Preprocessing for Fred and Rosy:
+# access to a configuration and experiment description file
+require_relative 'config_data'
+##############################
+# Class FrappeConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to preprocessing task
+module Shalmaneser
+  module Configuration
+    # Configuration for the preprocessing task: declares the parameters
+    # accepted in a Frappe experiment file and validates their values when
+    # the object is created.
+    class FrappeConfigData < ConfigData
+      # accepted values of "encoding"; nil means the parameter is not set
+      VALID_ENCODINGS = ['hex', 'iso', 'utf8', nil]
+      # accepted values of "format"
+      VALID_INPUT_FORMATS = %w(Plain SalsaTab FNXml FNCorpusXml SalsaTigerXML)
+      # parameter name => parameter type
+      CONFIG_DEFS = {
+        "prep_experiment_ID" => "string", # experiment identifier
+        "frprep_directory" => "string", # dir for frprep internal data
+        # information about the dataset
+        "language" => "string", # en, de
+        "origin" => "string",    # FrameNet, Salsa, or nothing
+        "format" => "string",   # Plain, SalsaTab, FNXml, FNCorpusXml, SalsaTigerXML
+        "encoding" => "string", # utf8, iso, hex, or nothing
+        # directories
+        "directory_input" => "string", # dir with input data
+        "directory_preprocessed" => "string", # dir with output Salsa/Tiger XML data
+        "directory_parserout" => "string", # dir with parser output for the parser named below
+        # syntactic processing
+        "pos_tagger" => "string", # name of POS tagger
+        "lemmatizer" => "string", # name of lemmatizer
+        "parser" => "string",     # name of parser
+        "pos_tagger_path" => "string", # path to POS tagger
+        "lemmatizer_path" => "string", # path to lemmatizer
+        "parser_path" => "string",     # path to parser
+        "parser_max_sent_num" => "integer", # max number of sentences per parser input file
+        "parser_max_sent_len" => "integer", # max sentence length the parser handles
+        "do_parse" => "bool",    # use parser?
+        "do_lemmatize" => "bool",# use lemmatizer?
+        "do_postag" => "bool",   # use POS tagger?
+        # output format: if tabformat_output == true,
+        # output in Tab format rather than Salsa/Tiger XML
+        # (this will not work if do_parse == true)
+        "tabformat_output" => "bool",
+        # syntactic repairs, dependent on existing semantic role annotation
+        "fe_syn_repair" => "bool", # map words to constituents for FEs: idealize?
+        "fe_rel_repair" => "bool", # FEs: include non-included relative clauses into FEs
+      }
+
+      # Read the config file and validate its values.
+      #
+      # @param filename [String] path to a config file
+      # @raise [ConfigurationError] if validation finds any problem
+      def initialize(filename)
+        # no variables are used in this configuration
+        super(filename, CONFIG_DEFS, [])
+        validate
+      end
+
+      # Shall we convert our input files into the target encoding?
+      #
+      # @return [True, False] true unless "encoding" is set to 'utf8'
+      def convert_encoding?
+        get('encoding') != 'utf8'
+      end
+
+      private
+
+      # Validates semantically the input values from the experiment file.
+      # All problems are collected and reported together, one per line.
+      #
+      # @raise [ConfigurationError] if at least one check fails
+      # @todo Rework the whole validation engine, the parameter definitions
+      #   should entail the information about: optional, obligatory,
+      #   in combination with. This information should be stored in external
+      #   resource files to easily change them.
+      def validate
+        msg = []
+
+        # obligatory directories
+        unless get('frprep_directory')
+          msg << 'Please set <frprep_directory>, the Frappe internal data '\
+                 'directory, in the experiment file.'
+        end
+        unless get('directory_input')
+          msg << 'Please specify <directory_input> in the Frappe experiment file.'
+        end
+        unless get('directory_preprocessed')
+          msg << 'Please specify <directory_preprocessed> in the experiment file.'
+        end
+
+        # sanity check: output in tab format will not work
+        # if we also do a parse
+        if get('tabformat_output') && get('do_parse')
+          msg << 'Error: Cannot do Tab format output when the input text is being '\
+                 'parsed. Please set either <tabformat_output> or <do_parse> to false.'
+        end
+
+        # each enabled processing step needs both a tool name and a tool path
+        if get('do_postag') && !(get('pos_tagger_path') && get('pos_tagger'))
+          msg << 'POS Tagging: I need <pos_tagger> and <pos_tagger_path> '\
+                 'in the experiment file.'
+        end
+        if get('do_lemmatize') && !(get('lemmatizer_path') && get('lemmatizer'))
+          msg << 'Lemmatization: I need <lemmatizer> and <lemmatizer_path> in the experiment file.'
+        end
+        if get('do_parse') && !(get('parser_path') && get('parser'))
+          msg << 'Parsing: I need <parser> and <parser_path> in the experiment file.'
+        end
+
+        unless VALID_ENCODINGS.include?(get('encoding'))
+          # compact: the nil entry only means "not set" and has no name to show
+          msg << 'Please define a correct encoding in the configuration file: '\
+                 "<#{VALID_ENCODINGS.compact.join('>, <')}>!"
+        end
+        unless VALID_INPUT_FORMATS.include?(get('format'))
+          msg << 'Please define a correct input format in the configuration file: '\
+                 "<#{VALID_INPUT_FORMATS.join('>, <')}>!"
+        end
+
+        # \A and \z anchor the whole value; ^ and $ would accept any value
+        # whose first line alone is alphanumeric.
+        unless get("prep_experiment_ID") =~ /\A[A-Za-z0-9_]+\z/
+          msg << 'Please choose an alphanumeric experiment ID! '\
+                 "You provided: #{get('prep_experiment_ID')}"
+        end
+
+        raise(ConfigurationError, msg.join("\n")) if msg.any?
+      end
+    end
+  end
+end

data/lib/configuration/fred_config_data.rb ADDED

@@ -0,0 +1,199 @@
+# FredConfigData
+# Katrin Erk April 05
+#
+# Frame disambiguation system:
+# access to a configuration and experiment description file
+require_relative 'config_data'
+require 'definitions'
+require 'logging'
+##############################
+# Class FredConfigData
+#
+# inherits from ConfigData,
+# sets variable names appropriate to WSD task
+module Shalmaneser
+  module Configuration
+    # Configuration of the frame disambiguation (WSD) system: declares the
+    # parameters accepted in a Fred experiment file and provides the access
+    # functions for the list-valued parameters "classifier" and "feature".
+    class FredConfigData < ConfigData
+      VALID_TASKS = %w(featurize refeaturize split test eval)
+      # parameter name => parameter type
+      CONFIG_DEFS = {
+        "experiment_ID" => "string", # experiment ID
+        "preproc_descr_file_train" => "string", # path to preprocessing files
+        "preproc_descr_file_test" => "string",
+        "directory_output" => "string", # path to Salsa/Tiger XML output directory
+        # @todo Verbosity should be handled by the Logger and only via cmd switches.
+        "verbose" => "bool",     # print diagnostic messages?
+        "apply_to_all_known_targets" => "bool", # apply to all known targets rather than the ones with a frame?
+        "fred_directory" => "string", # directory for internal info
+        "classifier_dir" => "string", # write classifiers here
+        "classifier" => "list",  # classifiers
+        "dbtype" => "string",    # "mysql" or "sqlite"
+        # DB access. The original note on "host" reads "sqlite only".
+        # NOTE(review): host/user/passwd look like server credentials;
+        # confirm which DB type actually reads them.
+        "host" => "string",
+        "user" => "string",
+        "passwd" => "string",
+        "dbname" => "string",
+        # featurization info
+        "feature" => "list",     # which features to use for the classifier?
+        "binary_classifiers" => "bool", # make binary rather than n-ary classifiers?
+        "negsense" => "string",  # binary classifier: negative sense is..?
+        "numerical_features" => "string", # do what with numerical features?
+        # what to do with items that have multiple senses?
+        # 'binarize': binary classifiers, and consider positive
+        #          if the sense is among the gold senses
+        # 'join' : make one joint sense
+        # 'repeat' : make multiple occurrences of the item, one sense per occ
+        # 'keep' : keep as separate labels
+        #
+        # multilabel: consider as assigned all labels
+        # above a certain confidence threshold?
+        "handle_multilabel" => "string",
+        "assignment_confidence_threshold" => "float",
+        # single-sentence context?
+        "single_sent_context" => "bool",
+        # noncontiguous input? then we need access to a larger corpus
+        "noncontiguous_input" => "bool",
+        "larger_corpus_dir" => "string",
+        "larger_corpus_format" => "string",
+        "larger_corpus_encoding" => "string",
+        # Imported from PrepConfigData
+        'do_postag' => 'bool',
+        'do_lemmatize' => 'bool',
+        'do_parse' => 'bool',
+        'pos_tagger' => 'string',
+        'lemmatizer' => 'string',
+        'parser' => 'string',
+        'directory_preprocessed' => 'string',
+        'language' => 'string'
+      }
+
+      # @param filename [String] the config file
+      def initialize(filename)
+        variables = %w(train exp_ID)
+        super(filename, CONFIG_DEFS, variables)
+
+        # set access functions for list features
+        set_list_feature_access("classifier", method(:access_classifier))
+        set_list_feature_access("feature", method(:access_feature))
+        validate
+      end
+
+      # Access function for the list feature 'feature'.
+      #
+      # Assumed format in the config file:
+      #
+      #   feature = context 50
+      #   feature = context 2
+      #   feature = syn
+      #
+      # i.e. the name of the feature type, optionally followed by a
+      # parameter. The same feature type may occur more than once.
+      #
+      # @param val_list [Array<Array<String>>] the tuples defined in the
+      #   config file for 'feature'
+      # @param feature [String, nil] feature type name
+      # @return [nil, true, Array]
+      #   With a feature type name:
+      #   - nil if that feature type is not set,
+      #   - true if it is set and none of its entries has a parameter,
+      #   - otherwise the first parameter of each of its entries, converted
+      #     with to_i (an entry without parameter contributes 0).
+      #   Without a feature type name: one tuple per entry, holding the
+      #   feature type name (string) followed by its options converted
+      #   with to_i.
+      def access_feature(val_list, feature = nil)
+        unless feature
+          # report everything that has been set
+          return val_list.map do |name, *opts|
+            [name] + opts.map(&:to_i)
+          end
+        end
+
+        # first parameter (or nil) of each entry for the requested type
+        matching = val_list.select { |tuple| tuple.first == feature }
+        params = matching.map { |tuple| tuple[1] }
+
+        # not defined at all
+        return nil if params.empty?
+        # defined, but never with a parameter
+        return true if params.compact.empty?
+        # defined with parameter values
+        params.map(&:to_i)
+      end
+
+      # Access function for the list feature 'classifier'.
+      #
+      # Each tuple in val_list holds a name followed by the options given
+      # for it in the config file.
+      #
+      # @param val_list [Array, nil] array:array:string: list of tuples
+      #   defined in the config file
+      # @return [Array] pairs [name(string), options(array:string)];
+      #   empty if val_list is nil
+      def access_classifier(val_list)
+        return [] if val_list.nil?
+
+        val_list.map do |tuple|
+          name = tuple.first
+          options = tuple[1..-1]
+          [name, options]
+        end
+      end
+
+      private
+
+      # Collects validation problems and raises if there are any.
+      # Currently no check is active, so nothing is ever raised.
+      def validate
+        msg = []
+        # Disabled check, kept for reference:
+        #   unless VALID_TASKS.include?(get('encoding'))
+        #     msg << 'Please define a correct encoding in the configuration file: '\
+        #            "<#{VALID_ENCODINGS.join('>, <')}>!"
+        #   end
+        raise(ConfigurationError, msg.join("\n")) if msg.any?
+      end
+    end
+  end
+end