RubyGems - shalmaneser - Versions diffs - 1.2.0.rc1 → 1.2.0.rc2 - Mend

shalmaneser 1.2.0.rc1 → 1.2.0.rc2

Files changed (30) hide show

checksums.yaml +4 -4
data/README.md +26 -8
data/doc/SB_README +57 -0
data/doc/exp_files_description.txt +160 -0
data/doc/fred.pdf +0 -0
data/doc/index.md +120 -0
data/doc/salsa_tool.pdf +0 -0
data/doc/salsatigerxml.pdf +0 -0
data/doc/shal_doc.pdf +0 -0
data/doc/shal_lrec.pdf +0 -0
data/lib/ext/maxent/Classify.class +0 -0
data/lib/ext/maxent/Train.class +0 -0
data/lib/frprep/TreetaggerInterface.rb +4 -4
data/lib/shalmaneser/version.rb +1 -1
metadata +41 -48
data/test/frprep/test_opt_parser.rb +0 -94
data/test/functional/functional_test_helper.rb +0 -40
data/test/functional/sample_experiment_files/fred_test.salsa.erb +0 -122
data/test/functional/sample_experiment_files/fred_train.salsa.erb +0 -135
data/test/functional/sample_experiment_files/prp_test.salsa.erb +0 -138
data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +0 -120
data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +0 -120
data/test/functional/sample_experiment_files/prp_train.salsa.erb +0 -138
data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +0 -138
data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +0 -138
data/test/functional/sample_experiment_files/rosy_test.salsa.erb +0 -257
data/test/functional/sample_experiment_files/rosy_train.salsa.erb +0 -259
data/test/functional/test_fred.rb +0 -47
data/test/functional/test_frprep.rb +0 -52
data/test/functional/test_rosy.rb +0 -40

data/test/functional/sample_experiment_files/fred_train.salsa.erb DELETED Viewed

@@ -1,135 +0,0 @@
-# ID identifying this experiment and all its data
-# please do not use spaces inside the experiment ID
-experiment_ID = fred_train
-# targets:
-# if apply_to_all_known_targets is set to true,
-# disambiguate all words for which we have training data
-# when performing task "test" (i.e. applying trained classifiers)
-apply_to_all_known_targets = true
-# Enduser mode?
-# The idea is that the enduser will only _apply_
-# pre-trained classifiers. So in enduser mode many
-# options are disallowed.
-enduser_mode = false
-# print warnings and
-# give detailed progress reports
-verbose = true
-############################
-# Paths
-# - fred_directory: directory where Fred puts its internal data
-# - directory_output:
-#   redirect system output of disambiguated text (in SalsaTigerXML)
-#   to another directory.
-#   If you do not set anything here, output is to
-#   <fred_directory>/<experiment_ID>/output/stxml
-# - classifier_dir:
-#   Write trained classifiers to this directory.
-#   If you do not set this parameter, classifiers are written to
-#   <fred_directory>/<experiment_ID>/classifiers
-fred_directory = <%= File.expand_path('test/functional/output') %>
-# - preproc_descr_file_train / ...test
-#   where the experiment file for frprep is located
-#   (preprocessing for Fred and Rosy)
-#   for the preprocessing of the data used in this experiment
-#
-#   give one preprocessing file name for the training data
-#   and one for the test data
-#   (If you only ever use test data in this experiment, you only
-#   need to give preproc_descr_file_test, and vice versa for training data.)
-preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.fred.standalone') %>
-#####################
-# noncontiguous input?
-# if so, set 'noncontiguous_input' to 'true' (default is 'false')
-# Also give the larger corpus from which the input sentences are taken:
-# - directory
-# - format: same possibilities as for frprep format
-# - encoding: same possibilities as for frprep encoding
-noncontiguous_input = false
-#larger_corpus_dir =
-larger_corpus_format = SalsaTigerXML
-#larger_corpus_encoding = iso
-#################
-# Features
-# bag-of-words context, with given context size,
-# for example:
- feature = context 50
- feature = context 2
-#
-# (you can give more than one context feature line!)
-#
-# other possible features:
-# feature = syntax
-# feature = synsem
-#
-# syntax: grammatical functions
-# synsem: grammatical functions plus headwords
-#feature = context % %contextsize%
-feature = syntax
-# How to handle training data that is labeled
-# with multiple sense labels?
-# - binarize (default): This works only with binary classifiers.
-#   When featurizing for the binary classifiers, consider an item
-#   positive if its set of assigned labels includes the
-#   label for this binary classifier.
-# - repeat: Repeat the instance, once for each
-#   sense label that has been assigned. (Basically, treat it
-#   as N instances with equal features but different labels.)
-# - join: join all the assigned senses into one combined sense
-#   and treat that as a separate sense to train on.
-# - keep: keep as multiple sense labels. (Note that this
-#   makes sense only for classifiers that can deal with
-#   multiple labels.)
-#handle_multilabel = binarize
-handle_multilabel = repeat
-# What to do with numerical features?
-# - keep: just leave as is
-# - repeat: for a feature with max. numerical value N,
-#   use N binary features
-# - bin: use a fixed number of bins, e.g. 5, then
-#   if feature value > 20: set all bins to 1,
-#   if feature value > 10: set the first four bins to 1,
-#   etc.
-#   default: bin.
-#numerical_features = bin
-numerical_features = keep
-# Binary classifiers, or n-ary classifiers?
-# if binary classifiers, set 'binary_classifiers = true'
-# default is 'false'.
-binary_classifiers = false
-#################
-# Fred internal settings
-# what kind of classifier to use?
-#
-# format:
-# <classifier type> <path> <optionally another path>
-#
-# for maxent, give first the path where maxent resides,
-# then <where_shalmaneser_resides>/program/tools/maxent
-classifier = maxent <%= File.expand_path('tools/maxent/maxent-2.4.0') %>
-# for binary classifiers, you can set the pseudolabel
-# on the 'negative' sense.
-# Default is 'NONE'
-negsense = NONE

data/test/functional/sample_experiment_files/prp_test.salsa.erb DELETED Viewed

@@ -1,138 +0,0 @@
-#################################################
-# This is a sample experiment file
-# with explanations of all features
-# that can be set for the frprep preprocessing system for Fred and Rosy.
-#
-# To start your own experiment,
-# replace all occurrences of
-# %...% by values of your choice.
-#
-# Boolean features may be omitted and are false by default.
-#
-# Experiment file lines that start with '#'
-# are comments and are ignored. Empty lines are ignored as well.
-########################
-# Experiment description
-#
-# ID identifying this experiment and all its data
-# please do not use spaces inside the experiment ID
-prep_experiment_ID = prp_test
-# YOUR INPUT DATA:
-# frprep accepts an input directory rather than an input file.
-# It will process all files in the directory directory_input
-# and write the results to directory_preprocessed.
-#
-# For input formats see the discussion of "format" below.
-directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
-directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/test.salsa') %>
-##
-# Experimental data is described by the following parameters:
-#
-# - language: en / de
-#    en for English or de for German
-#
-# - format:  SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
-#
-#    Format of the input data, training/test set
-#    SalsaTigerXML:  Parsed data, English or German
-#    FNXml:          FrameNet Lexical Unit files in FrameNet XML format
-#    FNCorpusXML:    FrameNet files in the FrameNet corpus XML format
-#    SalsaTab:       tabular format (internal)
-#    BNC:            BNC XML format, alternating words and POS tags
-#    Plain:          Plain text, ONE SENTENCE PER LINE.
-#
-#    Preprocessing transforms all data to SalsaTigerXML.
-#
-# - origin:  SalsaTiger / FrameNet / <not specified>
-#    This is the origin of the training/test data.
-#    SalsaTiger: data from the Tiger corpus, possibly semantically
-#                annotated by Salsa
-#    FrameNet: data from the FrameNet project
-#
-#    Don't set 'origin' if none of these origins apply
-#
-# - encoding: utf8 / iso / hex / <not specified>
-#                 Default: iso
-language = de
-#origin =
-format = Plain
-encoding = iso
-#############################
-# Which preprocessing steps to take?
-#
-# Data can be parsed, lemmatized and POS-tagged,
-# but this happens only if it is specified in the
-# experiment file.
-#
-# Set these booleans to true to trigger the respective
-# type of preprocessing. The default value is false.
-do_lemmatize = true
-do_postag = false
-do_parse = true
-#############################
-# directory where frprep puts its internal data
-#
-frprep_directory = <%= File.expand_path('test/functional/output/') %>
-#############################
-# Syntax/semantics interface repair:
-# FrameNet annotated data has some annotation choices
-# that may make it harder to learn the mapping from
-# syntactic structure to semantic roles.
-#
-# If you are using FrameNet data for training a
-# semantic role labeler, set the following two settings
-# to true (default is false) to 'repair' semantic role labels
-# to more closely match the syntactic structure
-fe_syn_repair = true
-fe_rel_repair = false
-#################
-# Location of tools and resources used by Fred
-# currently known to the system:
-# (Saarbruecken paths given)
-#
-# - POS tagging:
-#   - pos_tagger = treetagger
-#     pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
-#
-# - Lemmatization:
-#   - lemmatizer = treetagger
-#     lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
-#     lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
-#
-# - Parser:
-#   - parser = collins  (English)
-#     parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
-#   - parser = sleepy   (German)
-#     parser_path = /proj/corpora/sleepy3/
-#   - parser = minipar (English)
-#     parser_path = /proj/llx/Software/Parsers/minipar-linux/
-#
-pos_tagger = treetagger
-pos_tagger_path = <%= File.expand_path('tools/treetagger') %>
-lemmatizer = treetagger
-lemmatizer_path = <%= File.expand_path('tools/treetagger') %>
-parser = berkeley
-parser_path = <%= File.expand_path('tools/berkeleyParser') %>
-# parser:
-# maximum no. of sentences in a parse file,
-# maximum sentence length to be parsed
-parser_max_sent_num = 2000
-parser_max_sent_len = 80

data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb DELETED Viewed

@@ -1,120 +0,0 @@
-# ID identifying this experiment and all its data
-# please do not use spaces inside the experiment ID
-prep_experiment_ID = prp_test
-# YOUR INPUT DATA:
-# frprep accepts an input directory rather than an input file.
-# It will process all files in the directory directory_input
-# and write the results to directory_preprocessed.
-#
-# For input formats see the discussion of "format" below.
-#directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
-directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/test.salsa') %>
-##
-# Experimental data is described by the following parameters:
-#
-# - language: en / de
-#    en for English or de for German
-#
-# - format:  SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
-#
-#    Format of the input data, training/test set
-#    SalsaTigerXML:  Parsed data, English or German
-#    FNXml:          FrameNet Lexical Unit files in FrameNet XML format
-#    FNCorpusXML:    FrameNet files in the FrameNet corpus XML format
-#    SalsaTab:       tabular format (internal)
-#    BNC:            BNC XML format, alternating words and POS tags
-#    Plain:          Plain text, ONE SENTENCE PER LINE.
-#
-#    Preprocessing transforms all data to SalsaTigerXML.
-#
-# - origin:  SalsaTiger / FrameNet / <not specified>
-#    This is the origin of the training/test data.
-#    SalsaTiger: data from the Tiger corpus, possibly semantically
-#                annotated by Salsa
-#    FrameNet: data from the FrameNet project
-#
-#    Don't set 'origin' if none of these origins apply
-#
-# - encoding: utf8 / iso / hex / <not specified>
-#                 Default: iso
-language = de
-#origin =
-format = Plain
-encoding = iso
-#############################
-# Which preprocessing steps to take?
-#
-# Data can be parsed, lemmatized and POS-tagged,
-# but this happens only if it is specified in the
-# experiment file.
-#
-# Set these booleans to true to trigger the respective
-# type of preprocessing. The default value is false.
-do_lemmatize = true
-do_postag = false
-do_parse = true
-#############################
-# directory where frprep puts its internal data
-#
-#frprep_directory = <%= File.expand_path('test/functional/input/fred/frprep') %>
-#############################
-# Syntax/semantics interface repair:
-# FrameNet annotated data has some annotation choices
-# that may make it harder to learn the mapping from
-# syntactic structure to semantic roles.
-#
-# If you are using FrameNet data for training a
-# semantic role labeler, set the following two settings
-# to true (default is false) to 'repair' semantic role labels
-# to more closely match the syntactic structure
-fe_syn_repair = true
-fe_rel_repair = false
-#################
-# Location of tools and resources used by Fred
-# currently known to the system:
-# (Saarbruecken paths given)
-#
-# - POS tagging:
-#   - pos_tagger = treetagger
-#     pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
-#
-# - Lemmatization:
-#   - lemmatizer = treetagger
-#     lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
-#     lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
-#
-# - Parser:
-#   - parser = collins  (English)
-#     parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
-#   - parser = sleepy   (German)
-#     parser_path = /proj/corpora/sleepy3/
-#   - parser = minipar (English)
-#     parser_path = /proj/llx/Software/Parsers/minipar-linux/
-#
-pos_tagger = treetagger
-pos_tagger_path = <%= File.expand_path('tools/treetagger') %>
-lemmatizer = treetagger
-lemmatizer_path = <%= File.expand_path('tools/treetagger') %>
-parser = berkeley
-parser_path = <%= File.expand_path('tools/berkeleyParser') %>
-# parser:
-# maximum no. of sentences in a parse file,
-# maximum sentence length to be parsed
-parser_max_sent_num = 2000
-parser_max_sent_len = 80

data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb DELETED Viewed

@@ -1,120 +0,0 @@
-# ID identifying this experiment and all its data
-# please do not use spaces inside the experiment ID
-prep_experiment_ID = prp_test
-# YOUR INPUT DATA:
-# frprep accepts an input directory rather than an input file.
-# It will process all files in the directory directory_input
-# and write the results to directory_preprocessed.
-#
-# For input formats see the discussion of "format" below.
-#directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
-directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/test.salsa') %>
-##
-# Experimental data is described by the following parameters:
-#
-# - language: en / de
-#    en for English or de for German
-#
-# - format:  SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
-#
-#    Format of the input data, training/test set
-#    SalsaTigerXML:  Parsed data, English or German
-#    FNXml:          FrameNet Lexical Unit files in FrameNet XML format
-#    FNCorpusXML:    FrameNet files in the FrameNet corpus XML format
-#    SalsaTab:       tabular format (internal)
-#    BNC:            BNC XML format, alternating words and POS tags
-#    Plain:          Plain text, ONE SENTENCE PER LINE.
-#
-#    Preprocessing transforms all data to SalsaTigerXML.
-#
-# - origin:  SalsaTiger / FrameNet / <not specified>
-#    This is the origin of the training/test data.
-#    SalsaTiger: data from the Tiger corpus, possibly semantically
-#                annotated by Salsa
-#    FrameNet: data from the FrameNet project
-#
-#    Don't set 'origin' if none of these origins apply
-#
-# - encoding: utf8 / iso / hex / <not specified>
-#                 Default: iso
-language = de
-#origin =
-format = Plain
-encoding = iso
-#############################
-# Which preprocessing steps to take?
-#
-# Data can be parsed, lemmatized and POS-tagged,
-# but this happens only if it is specified in the
-# experiment file.
-#
-# Set these booleans to true to trigger the respective
-# type of preprocessing. The default value is false.
-do_lemmatize = true
-do_postag = false
-do_parse = true
-#############################
-# directory where frprep puts its internal data
-#
-#frprep_directory = <%= File.expand_path('test/functional/input/rosy/frprep') %>
-#############################
-# Syntax/semantics interface repair:
-# FrameNet annotated data has some annotation choices
-# that may make it harder to learn the mapping from
-# syntactic structure to semantic roles.
-#
-# If you are using FrameNet data for training a
-# semantic role labeler, set the following two settings
-# to true (default is false) to 'repair' semantic role labels
-# to more closely match the syntactic structure
-fe_syn_repair = true
-fe_rel_repair = false
-#################
-# Location of tools and resources used by Fred
-# currently known to the system:
-# (Saarbruecken paths given)
-#
-# - POS tagging:
-#   - pos_tagger = treetagger
-#     pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
-#
-# - Lemmatization:
-#   - lemmatizer = treetagger
-#     lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
-#     lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
-#
-# - Parser:
-#   - parser = collins  (English)
-#     parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
-#   - parser = sleepy   (German)
-#     parser_path = /proj/corpora/sleepy3/
-#   - parser = minipar (English)
-#     parser_path = /proj/llx/Software/Parsers/minipar-linux/
-#
-pos_tagger = treetagger
-pos_tagger_path = <%= File.expand_path('tools/treetagger') %>
-lemmatizer = treetagger
-lemmatizer_path = <%= File.expand_path('tools/treetagger') %>
-parser = berkeley
-parser_path = <%= File.expand_path('tools/berkeleyParser') %>
-# parser:
-# maximum no. of sentences in a parse file,
-# maximum sentence length to be parsed
-parser_max_sent_num = 2000
-parser_max_sent_len = 80