frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,138 @@
|
|
1
|
+
#################################################
|
2
|
+
# This is a sample experiment file
|
3
|
+
# with explanations of all features
|
4
|
+
# that can be set for the frprep preprocessing system for Fred and Rosy.
|
5
|
+
#
|
6
|
+
# To start your own experiment,
|
7
|
+
# replace all occurrences of
|
8
|
+
# %...% by values of your choice.
|
9
|
+
#
|
10
|
+
# Boolean features may be omitted and are false by default.
|
11
|
+
#
|
12
|
+
# Experiment file lines that start with '#'
|
13
|
+
# are comments and are ignored. Empty lines are ignored as well.
|
14
|
+
|
15
|
+
########################
|
16
|
+
# Experiment description
|
17
|
+
#
|
18
|
+
|
19
|
+
# ID identifying this experiment and all its data
|
20
|
+
# please do not use spaces inside the experiment ID
|
21
|
+
prep_experiment_ID = prp_train
|
22
|
+
|
23
|
+
# YOUR INPUT DATA:
|
24
|
+
# frprep accepts an input directory rather than an input file.
|
25
|
+
# It will process all files in the directory directory_input
|
26
|
+
# and write the results to directory_preprocessed.
|
27
|
+
#
|
28
|
+
# For input formats see the discussion of "format" below.
|
29
|
+
#directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
|
30
|
+
directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/train.salsa') %>
|
31
|
+
|
32
|
+
##
|
33
|
+
# Experimental data is described by the following parameters:
|
34
|
+
#
|
35
|
+
# - language: en / de
|
36
|
+
# en for English or de for German
|
37
|
+
#
|
38
|
+
# - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
|
39
|
+
#
|
40
|
+
# Format of the input data, training/test set
|
41
|
+
# SalsaTigerXML: Parsed data, English or German
|
42
|
+
# FNXml: FrameNet Lexical Unit files in FrameNet XML format
|
43
|
+
# FNCorpusXML: FrameNet files in the FrameNet corpus XML format
|
44
|
+
# SalsaTab: tabular format (internal)
|
45
|
+
# BNC: BNC XML format, alternating words and POS tags
|
46
|
+
# Plain: Plain text, ONE SENTENCE PER LINE.
|
47
|
+
#
|
48
|
+
# Preprocessing transforms all data to SalsaTigerXML.
|
49
|
+
#
|
50
|
+
# - origin: SalsaTiger / FrameNet / <not specified>
|
51
|
+
# This is the origin of the training/test data.
|
52
|
+
# SalsaTiger: data from the Tiger corpus, possibly semantically
|
53
|
+
# annotated by Salsa
|
54
|
+
# FrameNet: data from the FrameNet project
|
55
|
+
#
|
56
|
+
# Don't set 'origin' if none of these origins apply
|
57
|
+
#
|
58
|
+
# - encoding: utf8 / iso / hex / <not specified>
|
59
|
+
# Default: iso
|
60
|
+
|
61
|
+
language = de
|
62
|
+
#origin =
|
63
|
+
format = SalsaTigerXML
|
64
|
+
encoding = utf8
|
65
|
+
|
66
|
+
#############################
|
67
|
+
# Which preprocessing steps to take?
|
68
|
+
#
|
69
|
+
# Data can be parsed, lemmatized and POS-tagged,
|
70
|
+
# but this happens only if it is specified in the
|
71
|
+
# experiment file.
|
72
|
+
#
|
73
|
+
# Set these booleans to true to trigger the respective
|
74
|
+
# type of preprocessing. The default value is false.
|
75
|
+
|
76
|
+
do_lemmatize = true
|
77
|
+
do_postag = false
|
78
|
+
do_parse = true
|
79
|
+
|
80
|
+
#############################
|
81
|
+
# directory where frprep puts its internal data
|
82
|
+
#
|
83
|
+
|
84
|
+
frprep_directory = <%= File.expand_path('test/functional/input/rosy/') %>
|
85
|
+
|
86
|
+
#############################
|
87
|
+
# Syntax/semantics interface repair:
|
88
|
+
# FrameNet annotated data has some annotation choices
|
89
|
+
# that may make it harder to learn the mapping from
|
90
|
+
# syntactic structure to semantic roles.
|
91
|
+
#
|
92
|
+
# If you are using FrameNet data for training a
|
93
|
+
# semantic role labeler, set the following two settings
|
94
|
+
# to true (default is false) to 'repair' semantic role labels
|
95
|
+
# to more closely match the syntactic structure.
|
96
|
+
|
97
|
+
fe_syn_repair = true
|
98
|
+
fe_rel_repair = false
|
99
|
+
|
100
|
+
|
101
|
+
#################
|
102
|
+
# Location of tools and resources used by Fred
|
103
|
+
|
104
|
+
# currently known to the system:
|
105
|
+
# (Saarbruecken paths given)
|
106
|
+
#
|
107
|
+
# - POS tagging:
|
108
|
+
# - pos_tagger = treetagger
|
109
|
+
# pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
110
|
+
#
|
111
|
+
# - Lemmatization:
|
112
|
+
# - lemmatizer = treetagger
|
113
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
114
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
|
115
|
+
#
|
116
|
+
# - Parser:
|
117
|
+
# - parser = collins (English)
|
118
|
+
# parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
|
119
|
+
# - parser = sleepy (German)
|
120
|
+
# parser_path = /proj/corpora/sleepy3/
|
121
|
+
# - parser = minipar (English)
|
122
|
+
# parser_path = /proj/llx/Software/Parsers/minipar-linux/
|
123
|
+
#
|
124
|
+
pos_tagger = treetagger
|
125
|
+
pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
126
|
+
|
127
|
+
lemmatizer = treetagger
|
128
|
+
lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
129
|
+
|
130
|
+
parser = berkeley
|
131
|
+
parser_path = <%= File.expand_path('tools/berkeleyParser') %>
|
132
|
+
|
133
|
+
# parser:
|
134
|
+
# maximum no. of sentences in a parse file,
|
135
|
+
# maximum sentence length to be parsed
|
136
|
+
|
137
|
+
parser_max_sent_num = 2000
|
138
|
+
parser_max_sent_len = 80
|
@@ -0,0 +1,257 @@
|
|
1
|
+
#################################################
|
2
|
+
# This is a sample experiment file
|
3
|
+
# with explanations of all features
|
4
|
+
# that can be set for the ROSY system.
|
5
|
+
#
|
6
|
+
# To start your own experiment,
|
7
|
+
# replace all occurrences of
|
8
|
+
# %SOMETHING% or %PATH% or %PARAMETERS%
|
9
|
+
# by values of your choice.
|
10
|
+
#
|
11
|
+
# Experiment file lines that start with '#'
|
12
|
+
# are comments and are ignored. Empty lines are ignored as well.
|
13
|
+
|
14
|
+
########################
|
15
|
+
# Experiment description
|
16
|
+
#
|
17
|
+
|
18
|
+
##
|
19
|
+
# Experiment ID:
|
20
|
+
# Uniquely identifies files and database tables
|
21
|
+
# of this experiment.
|
22
|
+
# The experiment ID is a word (no spaces) of
|
23
|
+
# letters in [A-Za-z_].
|
24
|
+
experiment_ID = rosy_test
|
25
|
+
|
26
|
+
# Enduser mode?
|
27
|
+
# The idea is that the enduser will only _apply_
|
28
|
+
# pre-trained classifiers. So in enduser mode many
|
29
|
+
# options are disallowed.
|
30
|
+
enduser_mode = false
|
31
|
+
|
32
|
+
# directories
|
33
|
+
# - data directory: where Rosy puts its internal data
|
34
|
+
# - input directory:
|
35
|
+
# where Rosy reads its input SalsaTigerXML data.
|
36
|
+
# One directory each for the training and the test data
|
37
|
+
# - output directory:
|
38
|
+
# where Rosy writes its output SalsaTigerXML data:
|
39
|
+
# same frames as in the input data, but frame elements newly
|
40
|
+
# assigned.
|
41
|
+
# If no output directory is given, output is to
|
42
|
+
# <data_dir>/<experiment_ID>/output/
|
43
|
+
# - classifier_dir: If present, this is where trained classifiers
|
44
|
+
# are written.
|
45
|
+
# Otherwise they are written to <data_dir>/<experiment_id>/classif_dir
|
46
|
+
data_dir = <%= File.expand_path('test/functional/output') %>
|
47
|
+
directory_input_test = <%= File.expand_path('test/functional/input/rosy/test.salsa') %>
|
48
|
+
classifier_dir = <%= File.expand_path('test/functional/input/rosy/cls') %>
|
49
|
+
|
50
|
+
##
|
51
|
+
# Preprocessing settings:
|
52
|
+
# frprep experiment files for training and test data.
|
53
|
+
preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone') %>
|
54
|
+
preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone') %>
|
55
|
+
|
56
|
+
|
57
|
+
########################
|
58
|
+
# features
|
59
|
+
#
|
60
|
+
# Please specify all features that you would like
|
61
|
+
# Rosy to compute.
|
62
|
+
# Note: The system distinguishes between features to be
|
63
|
+
# computed and features to be included in the model,
|
64
|
+
# so you can compute features once and then vary features
|
65
|
+
# included in the model.
|
66
|
+
#
|
67
|
+
# Format for each feature specification:
|
68
|
+
# feature = <feature_name> [dontuse | argrec | arglab | onestep]
|
69
|
+
#
|
70
|
+
# dontuse: the feature is computed but not included in the model.
|
71
|
+
# argrec, arglab, onestep: the feature is used only in this
|
72
|
+
# processing step
|
73
|
+
#
|
74
|
+
#
|
75
|
+
# The set of features computed must stay the same throughout
|
76
|
+
# an experiment (or the match of experiment file and
|
77
|
+
# database table will fail), but the set of features included
|
78
|
+
# in the model can be varied.
|
79
|
+
#
|
80
|
+
# See below for a list of all features currently available in the system.
|
81
|
+
|
82
|
+
feature = pt_path
|
83
|
+
feature = gf_path
|
84
|
+
feature = path
|
85
|
+
feature = path_length
|
86
|
+
feature = pt_combined_path
|
87
|
+
feature = gf_combined_path
|
88
|
+
feature = combined_path
|
89
|
+
feature = pt_partial_path
|
90
|
+
feature = gf_partial_path
|
91
|
+
feature = partial_path
|
92
|
+
feature = pt_gvpath
|
93
|
+
feature = gf_gvpath
|
94
|
+
feature = gvpath
|
95
|
+
feature = ancestor_rule
|
96
|
+
feature = relpos
|
97
|
+
feature = pt
|
98
|
+
feature = gf
|
99
|
+
feature = father_pt
|
100
|
+
feature = frame
|
101
|
+
feature = target
|
102
|
+
feature = target_pos
|
103
|
+
feature = target_voice
|
104
|
+
feature = gov_verb
|
105
|
+
feature = prep
|
106
|
+
feature = const_head
|
107
|
+
feature = const_head_pos
|
108
|
+
feature = icont_word
|
109
|
+
feature = firstword
|
110
|
+
feature = lastword
|
111
|
+
feature = leftsib
|
112
|
+
feature = rightsib
|
113
|
+
feature = worddistance
|
114
|
+
feature = ismaxproj
|
115
|
+
feature = nearest_node
|
116
|
+
feature = prune
|
117
|
+
|
118
|
+
########################
|
119
|
+
# classifiers
|
120
|
+
#
|
121
|
+
# Please specify each classifier type you want to use.
|
122
|
+
# If you specify more than one classifier, classifier combination
|
123
|
+
# is used.
|
124
|
+
#
|
125
|
+
# Format for each classifier specification:
|
126
|
+
# classifier = <classifier_name> <path> [<parameters>]
|
127
|
+
#
|
128
|
+
# Possible values for <classifier_name> at the moment:
|
129
|
+
# timbl (memory-based learning),
|
130
|
+
# maxent (OpenNLP maxent system)
|
131
|
+
#
|
132
|
+
# Samples:
|
133
|
+
# classifier = timbl /prog/MachineLearning/Timbl5/
|
134
|
+
# classifier = maxent /prog/maxent-2.4.0 /prog/shalmaneser/program/tools/maxent
|
135
|
+
|
136
|
+
classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
|
137
|
+
|
138
|
+
########################
|
139
|
+
# further settings
|
140
|
+
|
141
|
+
# Pruning: Identify constituents that are very unlikely
|
142
|
+
# to instantiate a semantic role, and prune them prior
|
143
|
+
# to the training/application of classifiers?
|
144
|
+
#
|
145
|
+
# Pruning methods available at the moment:
|
146
|
+
# prune: Xue/Palmer EMNLP 2004, adapted to fit each individual parser
|
147
|
+
#
|
148
|
+
# To enable pruning, set "prune" to the pruning method of your choice,
|
149
|
+
# and also compute the feature of the same name -- see
|
150
|
+
# feature list above.
|
151
|
+
# To disable pruning, comment out the next line.
|
152
|
+
prune = prune
|
153
|
+
|
154
|
+
# verbose mode
|
155
|
+
verbose = true
|
156
|
+
|
157
|
+
# data adaptation:
|
158
|
+
# correct training labels to
|
159
|
+
# match syntax better?
|
160
|
+
fe_syn_repair = true
|
161
|
+
fe_rel_repair = false
|
162
|
+
|
163
|
+
# xwise: For each classification step (argrec, arglab, onestep)
|
164
|
+
# you can set the granularity of training:
|
165
|
+
# - by frame (frame)
|
166
|
+
# - by target part of speech (target_pos), or
|
167
|
+
# - by target lemma. (target)
|
168
|
+
#
|
169
|
+
# these three settings can be combined, e.g.
|
170
|
+
# xwise_argrec = target_pos frame
|
171
|
+
# to train argrec frame-wise and split each frame by target POS.
|
172
|
+
#
|
173
|
+
# If no value is given for xwise_<step>, the default is "frame".
|
174
|
+
xwise_argrec = frame
|
175
|
+
xwise_arglab = frame
|
176
|
+
xwise_onestep = frame
|
177
|
+
|
178
|
+
|
179
|
+
# assume_argrec_perfect: by default, this is false.
|
180
|
+
#
|
181
|
+
# Set this to true
|
182
|
+
# to perform the arglab (argument labeling) step
|
183
|
+
# on all instances that actually are FEs
|
184
|
+
# rather than on all instances that the argrec step
|
185
|
+
# has judged to be FEs.
|
186
|
+
assume_argrec_perfect = false
|
187
|
+
|
188
|
+
# split_nones: set to true
|
189
|
+
# to split the NONE target class into:
|
190
|
+
# NONE left of target,
|
191
|
+
# NONE right of target
|
192
|
+
# because the NONE class has so many more instances
|
193
|
+
# than any other.
|
194
|
+
split_nones = true
|
195
|
+
|
196
|
+
|
197
|
+
# print_eval_log: set to true to print individual correctness
|
198
|
+
# judgments for each instance evaluated
|
199
|
+
print_eval_log = true
|
200
|
+
|
201
|
+
# External data source:
|
202
|
+
#
|
203
|
+
# Rosy can integrate data computed by additional systems
|
204
|
+
# provided that they all use a common experiment file
|
205
|
+
# for external data to determine where they put their data.
|
206
|
+
# Rosy needs the path to that experiment file.
|
207
|
+
#
|
208
|
+
# (May be left unset when no external data is used)
|
209
|
+
#external_descr_file = %PATH%
|
210
|
+
|
211
|
+
|
212
|
+
########################
|
213
|
+
# rosy internal data - please don't change
|
214
|
+
|
215
|
+
# Database access:
|
216
|
+
# dbtype: type of database, either mysql
|
217
|
+
# for a MySQL server, or sqlite for SQLite.
|
218
|
+
#
|
219
|
+
# if dbtype == mysql, set access parameters:
|
220
|
+
# host: database server
|
221
|
+
# user: user name to use
|
222
|
+
# passwd: password for user
|
223
|
+
# dbname: database where all Rosy's tables will be stored
|
224
|
+
|
225
|
+
dbtype = mysql
|
226
|
+
host = localhost
|
227
|
+
user = shalm
|
228
|
+
passwd = 12345
|
229
|
+
dbname = shalm11
|
230
|
+
|
231
|
+
# classifier output columns in the tables all start
|
232
|
+
# with this prefix
|
233
|
+
classif_column_name = classif
|
234
|
+
|
235
|
+
# pattern for constructing the names
|
236
|
+
# of the DB tables with training data (main_table_name)
|
237
|
+
# and test data (test_table_name)
|
238
|
+
main_table_name = rosy_<exp_ID>_main
|
239
|
+
test_table_name = rosy_<exp_ID>_<test_ID>
|
240
|
+
|
241
|
+
# string to use for "no value for this feature"
|
242
|
+
# as well as "no FE for this instance"
|
243
|
+
noval = NONE
|
244
|
+
|
245
|
+
# pattern for constructing the names
|
246
|
+
# of classifier files and classifier output files
|
247
|
+
classifier_file = classif.<classif>.<group>
|
248
|
+
classifier_output_file = classout.<classif>.<group>.<dataset>
|
249
|
+
|
250
|
+
# pattern for constructing the names
|
251
|
+
# of the evaluation file and the evaluation log file
|
252
|
+
eval_file = eval.<exp_ID>.<step>.<test_ID>
|
253
|
+
log_file = eval_log.<exp_ID>.<step>.<test_ID>
|
254
|
+
|
255
|
+
# pattern for constructing the names
|
256
|
+
# of the files with failed parses
|
257
|
+
failed_file = parsefail.<exp_ID>.<split_ID>.<dataset>
|
@@ -0,0 +1,259 @@
|
|
1
|
+
#################################################
|
2
|
+
# This is a sample experiment file
|
3
|
+
# with explanations of all features
|
4
|
+
# that can be set for the ROSY system.
|
5
|
+
#
|
6
|
+
# To start your own experiment,
|
7
|
+
# replace all occurrences of
|
8
|
+
# %SOMETHING% or %PATH% or %PARAMETERS%
|
9
|
+
# by values of your choice.
|
10
|
+
#
|
11
|
+
# Experiment file lines that start with '#'
|
12
|
+
# are comments and are ignored. Empty lines are ignored as well.
|
13
|
+
|
14
|
+
########################
|
15
|
+
# Experiment description
|
16
|
+
#
|
17
|
+
|
18
|
+
##
|
19
|
+
# Experiment ID:
|
20
|
+
# Uniquely identifies files and database tables
|
21
|
+
# of this experiment.
|
22
|
+
# The experiment ID is a word (no spaces) of
|
23
|
+
# letters in [A-Za-z_].
|
24
|
+
experiment_ID = rosy_train
|
25
|
+
|
26
|
+
# Enduser mode?
|
27
|
+
# The idea is that the enduser will only _apply_
|
28
|
+
# pre-trained classifiers. So in enduser mode many
|
29
|
+
# options are disallowed.
|
30
|
+
enduser_mode = false
|
31
|
+
|
32
|
+
# directories
|
33
|
+
# - data directory: where Rosy puts its internal data
|
34
|
+
# - input directory:
|
35
|
+
# where Rosy reads its input SalsaTigerXML data.
|
36
|
+
# One directory each for the training and the test data
|
37
|
+
# - output directory:
|
38
|
+
# where Rosy writes its output SalsaTigerXML data:
|
39
|
+
# same frames as in the input data, but frame elements newly
|
40
|
+
# assigned.
|
41
|
+
# If no output directory is given, output is to
|
42
|
+
# <data_dir>/<experiment_ID>/output/
|
43
|
+
# - classifier_dir: If present, this is where trained classifiers
|
44
|
+
# are written.
|
45
|
+
# Otherwise they are written to <data_dir>/<experiment_id>/classif_dir
|
46
|
+
data_dir = <%= File.expand_path('test/functional/output') %>
|
47
|
+
directory_input_train = <%= File.expand_path('test/functional/output/prp_train/stxml_split') %>
|
48
|
+
directory_input_test = <%= File.expand_path('test/functional/output/exp_fred_salsa/output/stxml') %>
|
49
|
+
directory_output = <%= File.expand_path('test/functional/output/exp_rosy_salsa/output') %>
|
50
|
+
|
51
|
+
|
52
|
+
##
|
53
|
+
# Preprocessing settings:
|
54
|
+
# frprep experiment files for training and test data.
|
55
|
+
preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa') %>
|
56
|
+
preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa') %>
|
57
|
+
|
58
|
+
|
59
|
+
########################
|
60
|
+
# features
|
61
|
+
#
|
62
|
+
# Please specify all features that you would like
|
63
|
+
# Rosy to compute.
|
64
|
+
# Note: The system distinguishes between features to be
|
65
|
+
# computed and features to be included in the model,
|
66
|
+
# so you can compute features once and then vary features
|
67
|
+
# included in the model.
|
68
|
+
#
|
69
|
+
# Format for each feature specification:
|
70
|
+
# feature = <feature_name> [dontuse | argrec | arglab | onestep]
|
71
|
+
#
|
72
|
+
# dontuse: the feature is computed but not included in the model.
|
73
|
+
# argrec, arglab, onestep: the feature is used only in this
|
74
|
+
# processing step
|
75
|
+
#
|
76
|
+
#
|
77
|
+
# The set of features computed must stay the same throughout
|
78
|
+
# an experiment (or the match of experiment file and
|
79
|
+
# database table will fail), but the set of features included
|
80
|
+
# in the model can be varied.
|
81
|
+
#
|
82
|
+
# See below for a list of all features currently available in the system.
|
83
|
+
|
84
|
+
feature = pt_path
|
85
|
+
feature = gf_path
|
86
|
+
feature = path
|
87
|
+
feature = path_length
|
88
|
+
feature = pt_combined_path
|
89
|
+
feature = gf_combined_path
|
90
|
+
feature = combined_path
|
91
|
+
feature = pt_partial_path
|
92
|
+
feature = gf_partial_path
|
93
|
+
feature = partial_path
|
94
|
+
feature = pt_gvpath
|
95
|
+
feature = gf_gvpath
|
96
|
+
feature = gvpath
|
97
|
+
feature = ancestor_rule
|
98
|
+
feature = relpos
|
99
|
+
feature = pt
|
100
|
+
feature = gf
|
101
|
+
feature = father_pt
|
102
|
+
feature = frame
|
103
|
+
feature = target
|
104
|
+
feature = target_pos
|
105
|
+
feature = target_voice
|
106
|
+
feature = gov_verb
|
107
|
+
feature = prep
|
108
|
+
feature = const_head
|
109
|
+
feature = const_head_pos
|
110
|
+
feature = icont_word
|
111
|
+
feature = firstword
|
112
|
+
feature = lastword
|
113
|
+
feature = leftsib
|
114
|
+
feature = rightsib
|
115
|
+
feature = worddistance
|
116
|
+
feature = ismaxproj
|
117
|
+
feature = nearest_node
|
118
|
+
feature = prune
|
119
|
+
|
120
|
+
########################
|
121
|
+
# classifiers
|
122
|
+
#
|
123
|
+
# Please specify each classifier type you want to use.
|
124
|
+
# If you specify more than one classifier, classifier combination
|
125
|
+
# is used.
|
126
|
+
#
|
127
|
+
# Format for each classifier specification:
|
128
|
+
# classifier = <classifier_name> <path> [<parameters>]
|
129
|
+
#
|
130
|
+
# Possible values for <classifier_name> at the moment:
|
131
|
+
# timbl (memory-based learning),
|
132
|
+
# maxent (OpenNLP maxent system)
|
133
|
+
#
|
134
|
+
# Samples:
|
135
|
+
# classifier = timbl /prog/MachineLearning/Timbl5/
|
136
|
+
# classifier = maxent /prog/maxent-2.4.0 /prog/shalmaneser/program/tools/maxent
|
137
|
+
|
138
|
+
classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
|
139
|
+
|
140
|
+
########################
|
141
|
+
# further settings
|
142
|
+
|
143
|
+
# Pruning: Identify constituents that are very unlikely
|
144
|
+
# to instantiate a semantic role, and prune them prior
|
145
|
+
# to the training/application of classifiers?
|
146
|
+
#
|
147
|
+
# Pruning methods available at the moment:
|
148
|
+
# prune: Xue/Palmer EMNLP 2004, adapted to fit each individual parser
|
149
|
+
#
|
150
|
+
# To enable pruning, set "prune" to the pruning method of your choice,
|
151
|
+
# and also compute the feature of the same name -- see
|
152
|
+
# feature list above.
|
153
|
+
# To disable pruning, comment out the next line.
|
154
|
+
prune = prune
|
155
|
+
|
156
|
+
# verbose mode
|
157
|
+
verbose = true
|
158
|
+
|
159
|
+
# data adaptation:
|
160
|
+
# correct training labels to
|
161
|
+
# match syntax better?
|
162
|
+
fe_syn_repair = true
|
163
|
+
fe_rel_repair = false
|
164
|
+
|
165
|
+
# xwise: For each classification step (argrec, arglab, onestep)
|
166
|
+
# you can set the granularity of training:
|
167
|
+
# - by frame (frame)
|
168
|
+
# - by target part of speech (target_pos), or
|
169
|
+
# - by target lemma. (target)
|
170
|
+
#
|
171
|
+
# these three settings can be combined, e.g.
|
172
|
+
# xwise_argrec = target_pos frame
|
173
|
+
# to train argrec frame-wise and split each frame by target POS.
|
174
|
+
#
|
175
|
+
# If no value is given for xwise_<step>, the default is "frame".
|
176
|
+
xwise_argrec = frame
|
177
|
+
xwise_arglab = frame
|
178
|
+
xwise_onestep = frame
|
179
|
+
|
180
|
+
|
181
|
+
# assume_argrec_perfect: by default, this is false.
|
182
|
+
#
|
183
|
+
# Set this to true
|
184
|
+
# to perform the arglab (argument labeling) step
|
185
|
+
# on all instances that actually are FEs
|
186
|
+
# rather than on all instances that the argrec step
|
187
|
+
# has judged to be FEs.
|
188
|
+
assume_argrec_perfect = false
|
189
|
+
|
190
|
+
# split_nones: set to true
|
191
|
+
# to split the NONE target class into:
|
192
|
+
# NONE left of target,
|
193
|
+
# NONE right of target
|
194
|
+
# because the NONE class has so many more instances
|
195
|
+
# than any other.
|
196
|
+
split_nones = true
|
197
|
+
|
198
|
+
|
199
|
+
# print_eval_log: set to true to print individual correctness
|
200
|
+
# judgments for each instance evaluated
|
201
|
+
print_eval_log = true
|
202
|
+
|
203
|
+
# External data source:
|
204
|
+
#
|
205
|
+
# Rosy can integrate data computed by additional systems
|
206
|
+
# provided that they all use a common experiment file
|
207
|
+
# for external data to determine where they put their data.
|
208
|
+
# Rosy needs the path to that experiment file.
|
209
|
+
#
|
210
|
+
# (May be left unset when no external data is used)
|
211
|
+
#external_descr_file = %PATH%
|
212
|
+
|
213
|
+
|
214
|
+
########################
|
215
|
+
# rosy internal data - please don't change
|
216
|
+
|
217
|
+
# Database access:
|
218
|
+
# dbtype: type of database, either mysql
|
219
|
+
# for a MySQL server, or sqlite for SQLite.
|
220
|
+
#
|
221
|
+
# if dbtype == mysql, set access parameters:
|
222
|
+
# host: database server
|
223
|
+
# user: user name to use
|
224
|
+
# passwd: password for user
|
225
|
+
# dbname: database where all Rosy's tables will be stored
|
226
|
+
|
227
|
+
dbtype = mysql
|
228
|
+
host = localhost
|
229
|
+
user = shalm
|
230
|
+
passwd = 12345
|
231
|
+
dbname = shalm11
|
232
|
+
|
233
|
+
# classifier output columns in the tables all start
|
234
|
+
# with this prefix
|
235
|
+
classif_column_name = classif
|
236
|
+
|
237
|
+
# pattern for constructing the names
|
238
|
+
# of the DB tables with training data (main_table_name)
|
239
|
+
# and test data (test_table_name)
|
240
|
+
main_table_name = rosy_<exp_ID>_main
|
241
|
+
test_table_name = rosy_<exp_ID>_<test_ID>
|
242
|
+
|
243
|
+
# string to use for "no value for this feature"
|
244
|
+
# as well as "no FE for this instance"
|
245
|
+
noval = NONE
|
246
|
+
|
247
|
+
# pattern for constructing the names
|
248
|
+
# of classifier files and classifier output files
|
249
|
+
classifier_file = classif.<classif>.<group>
|
250
|
+
classifier_output_file = classout.<classif>.<group>.<dataset>
|
251
|
+
|
252
|
+
# pattern for constructing the names
|
253
|
+
# of the evaluation file and the evaluation log file
|
254
|
+
eval_file = eval.<exp_ID>.<step>.<test_ID>
|
255
|
+
log_file = eval_log.<exp_ID>.<step>.<test_ID>
|
256
|
+
|
257
|
+
# pattern for constructing the names
|
258
|
+
# of the files with failed parses
|
259
|
+
failed_file = parsefail.<exp_ID>.<split_ID>.<dataset>
|