frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% by values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_train
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/train.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC BNC XML format, alternating words and POS tags
46
+ # Plain Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = SalsaTigerXML
64
+ encoding = utf8
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ frprep_directory = <%= File.expand_path('test/functional/input/rosy/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to closer match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by Fred
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80
@@ -0,0 +1,257 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the ROSY system.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %SOMETHING% or %PATH% or %PARAMETERS%
9
+ # by values of your choice.
10
+ #
11
+ # Experiment file lines that start with '#'
12
+ # are comments and are ignored. Empty lines are ignored as well.
13
+
14
+ ########################
15
+ # Experiment description
16
+ #
17
+
18
+ ##
19
+ # Experiment ID:
20
+ # Uniquely identifies files and database tables
21
+ # of this experiment.
22
+ # The experiment ID is a word (no spaces) of
23
+ # letters in [A-Za-z_].
24
+ experiment_ID = rosy_test
25
+
26
+ # Enduser mode?
27
+ # The idea is that the enduser will only _apply_
28
+ # pre-trained classifiers. So in enduser mode many
29
+ # options are disallowed.
30
+ enduser_mode = false
31
+
32
+ # directories
33
+ # - data directory: where Rosy puts its internal data
34
+ # - input directory:
35
+ # where Rosy reads its input SalsaTigerXML data.
36
+ # One directory each for the training and the test data
37
+ # - output directory:
38
+ # where Rosy writes its output SalsaTigerXML data:
39
+ # same frames as in the input data, but frame elements newly
40
+ # assigned.
41
+ # If no output directory is given, output is to
42
+ # <data_dir>/<experiment_ID>/output/
43
+ # - classifier_dir: If present, this is where trained classifiers
44
+ # are written.
45
+ # Otherwise they are written to <data_dir>/<experiment_id>/classif_dir
46
+ data_dir = <%= File.expand_path('test/functional/output') %>
47
+ directory_input_test = <%= File.expand_path('test/functional/input/rosy/test.salsa') %>
48
+ classifier_dir = <%= File.expand_path('test/functional/input/rosy/cls') %>
49
+
50
+ ##
51
+ # Preprocessing settings:
52
+ # frprep experiment files for training and test data.
53
+ preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone') %>
54
+ preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone') %>
55
+
56
+
57
+ ########################
58
+ # features
59
+ #
60
+ # Please specify all features that you would like
61
+ # Rosy to compute.
62
+ # Note: The system distinguishes between features to be
63
+ # computed and features to be included in the model,
64
+ # so you can compute features once and then vary features
65
+ # included in the model.
66
+ #
67
+ # Format for each feature specification:
68
+ # feature = <feature_name> [dontuse | argrec | arglab | onestep]
69
+ #
70
+ # dontuse: the feature is computed but not included in the model.
71
+ # argrec, arglab, onestep: the feature is used only in this
72
+ # processing step
73
+ #
74
+ #
75
+ # The set of features computed must stay the same throughout
76
+ # an experiment (or the match of experiment file and
77
+ # database table will fail), but the set of features included
78
+ # in the model can be varied.
79
+ #
80
+ # See below for a list of all features currently available in the system.
81
+
82
+ feature = pt_path
83
+ feature = gf_path
84
+ feature = path
85
+ feature = path_length
86
+ feature = pt_combined_path
87
+ feature = gf_combined_path
88
+ feature = combined_path
89
+ feature = pt_partial_path
90
+ feature = gf_partial_path
91
+ feature = partial_path
92
+ feature = pt_gvpath
93
+ feature = gf_gvpath
94
+ feature = gvpath
95
+ feature = ancestor_rule
96
+ feature = relpos
97
+ feature = pt
98
+ feature = gf
99
+ feature = father_pt
100
+ feature = frame
101
+ feature = target
102
+ feature = target_pos
103
+ feature = target_voice
104
+ feature = gov_verb
105
+ feature = prep
106
+ feature = const_head
107
+ feature = const_head_pos
108
+ feature = icont_word
109
+ feature = firstword
110
+ feature = lastword
111
+ feature = leftsib
112
+ feature = rightsib
113
+ feature = worddistance
114
+ feature = ismaxproj
115
+ feature = nearest_node
116
+ feature = prune
117
+
118
+ ########################
119
+ # classifiers
120
+ #
121
+ # Please specify each classifier type you want to use.
122
+ # If you specify more than one classifier, classifier combination
123
+ # is used.
124
+ #
125
+ # Format for each classifier specification:
126
+ # classifier = <classifier_name> <path> [<parameters>]
127
+ #
128
+ # Possible values for <classifier_name> at the moment:
129
+ # timbl (memory-based learning),
130
+ # maxent (OpenNLP maxent system)
131
+ #
132
+ # Samples:
133
+ # classifier = timbl /prog/MachineLearning/Timbl5/
134
+ # classifier = maxent /prog/maxent-2.4.0 /prog/shalmaneser/program/tools/maxent
135
+
136
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
137
+
138
+ ########################
139
+ # further settings
140
+
141
+ # Pruning: Identify constituents that are very unlikely
142
+ # to instantiate a semantic role, and prune them prior
143
+ # to the training/application of classifiers?
144
+ #
145
+ # Pruning methods available at the moment:
146
+ # prune: Xue/Palmer EMNLP 2004, adapted to fit each individual parser
147
+ #
148
+ # To enable pruning, set "prune" to the pruning method of your choice,
149
+ # and also compute the feature of the same name -- see
150
+ # feature list below.
151
+ # To disable pruning, comment out the next line.
152
+ prune = prune
153
+
154
+ # verbose mode
155
+ verbose = true
156
+
157
+ # data adaptation:
158
+ # correct training labels to
159
+ # match syntax better?
160
+ fe_syn_repair = true
161
+ fe_rel_repair = false
162
+
163
+ # xwise: For each classification step (argrec, arglab, onestep)
164
+ # you can set the granularity of training:
165
+ # - by frame (frame)
166
+ # - by target part of speech or (target_pos)
167
+ # - by target lemma. (target)
168
+ #
169
+ # these three settings can be combined, e.g.
170
+ # xwise_argrec = target_pos frame
171
+ # to train argrec frame-wise and split each frame by target POS.
172
+ #
173
+ # If no value is given for xwise_<step>, the default is "frame".
174
+ xwise_argrec = frame
175
+ xwise_arglab = frame
176
+ xwise_onestep = frame
177
+
178
+
179
+ # assume_argrec_perfect: by default, this is false.
180
+ #
181
+ # Set this to true
182
+ # to perform the arglab (argument labeling) step
183
+ # on all instances that actually are FEs
184
+ # rather than on all instances that the argrec step
185
+ # has judged to be FEs.
186
+ assume_argrec_perfect = false
187
+
188
+ # split_nones: set to true
189
+ # to split the NONE target class into:
190
+ # NONE left of target,
191
+ # NONE right of target
192
+ # because the NONE class has so many more instances
193
+ # than any other.
194
+ split_nones = true
195
+
196
+
197
+ # print_eval_log: set to true to print individual correctness
198
+ # judgments for each instance evaluated
199
+ print_eval_log = true
200
+
201
+ # External data source:
202
+ #
203
+ # Rosy can integrate data computed by additional systems
204
+ # provided that they all use a common experiment file
205
+ # for external data to determine where they put their data.
206
+ # Rosy needs the path to that experiment file.
207
+ #
208
+ # (May be left unset when no external data is used)
209
+ #external_descr_file = %PATH%
210
+
211
+
212
+ ########################
213
+ # rosy internal data - please don't change
214
+
215
+ # Database access:
216
+ # dbtype: type of database, either mysql
217
+ # for a MySQL server, or sqlite for SQLite.
218
+ #
219
+ # if dbtype == mysql, set access parameters:
220
+ # host: database server
221
+ # user: user name to use
222
+ # passwd: password for user
223
+ # dbname: database where all Rosy's tables will be stored
224
+
225
+ dbtype = mysql
226
+ host = localhost
227
+ user = shalm
228
+ passwd = 12345
229
+ dbname = shalm11
230
+
231
+ # classifier output columns in the tables all start
232
+ # with this prefix
233
+ classif_column_name = classif
234
+
235
+ # pattern for constructing the names
236
+ # of the DB tables with training data (main_table_name)
237
+ # and test data (test_table_name)
238
+ main_table_name = rosy_<exp_ID>_main
239
+ test_table_name = rosy_<exp_ID>_<test_ID>
240
+
241
+ # string to use for "no value for this feature"
242
+ # as well as "no FE for this instance"
243
+ noval = NONE
244
+
245
+ # pattern for constructing the names
246
+ # of classifier files and classifier output files
247
+ classifier_file = classif.<classif>.<group>
248
+ classifier_output_file = classout.<classif>.<group>.<dataset>
249
+
250
+ # pattern for constructing the names
251
+ # of the evaluation file and the evaluation log file
252
+ eval_file = eval.<exp_ID>.<step>.<test_ID>
253
+ log_file = eval_log.<exp_ID>.<step>.<test_ID>
254
+
255
+ # pattern for constructing the names
256
+ # of the files with failed parses
257
+ failed_file = parsefail.<exp_ID>.<split_ID>.<dataset>
@@ -0,0 +1,259 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the ROSY system.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %SOMETHING% or %PATH% or %PARAMETERS%
9
+ # by values of your choice.
10
+ #
11
+ # Experiment file lines that start with '#'
12
+ # are comments and are ignored. Empty lines are ignored as well.
13
+
14
+ ########################
15
+ # Experiment description
16
+ #
17
+
18
+ ##
19
+ # Experiment ID:
20
+ # Uniquely identifies files and database tables
21
+ # of this experiment.
22
+ # The experiment ID is a word (no spaces) of
23
+ # letters in [A-Za-z_].
24
+ experiment_ID = rosy_train
25
+
26
+ # Enduser mode?
27
+ # The idea is that the enduser will only _apply_
28
+ # pre-trained classifiers. So in enduser mode many
29
+ # options are disallowed.
30
+ enduser_mode = false
31
+
32
+ # directories
33
+ # - data directory: where Rosy puts its internal data
34
+ # - input directory:
35
+ # where Rosy reads its input SalsaTigerXML data.
36
+ # One directory each for the training and the test data
37
+ # - output directory:
38
+ # where Rosy writes its output SalsaTigerXML data:
39
+ # same frames as in the input data, but frame elements newly
40
+ # assigned.
41
+ # If no output directory is given, output is to
42
+ # <data_dir>/<experiment_ID>/output/
43
+ # - classifier_dir: If present, this is where trained classifiers
44
+ # are written.
45
+ # Otherwise they are written to <data_dir>/<experiment_id>/classif_dir
46
+ data_dir = /home/arbox/work_space/shalm/german/prog/output
47
+ directory_input_train = <%= File.expand_path('test/functional/output/prp_train/stxml_split') %>
48
+ directory_input_test = <%= File.expand_path('test/functional/output/exp_fred_salsa/output/stxml') %>
49
+ directory_output = <%= File.expand_path('test/functional/output/exp_rosy_salsa/output') %>
50
+
51
+
52
+ ##
53
+ # Preprocessing settings:
54
+ # frprep experiment files for training and test data.
55
+ preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa') %>
56
+ preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa') %>
57
+
58
+
59
+ ########################
60
+ # features
61
+ #
62
+ # Please specify all features that you would like
63
+ # Rosy to compute.
64
+ # Note: The system distinguishes between features to be
65
+ # computed and features to be included in the model,
66
+ # so you can compute features once and then vary features
67
+ # included in the model.
68
+ #
69
+ # Format for each feature specification:
70
+ # feature = <feature_name> [dontuse | argrec | arglab | onestep]
71
+ #
72
+ # dontuse: the feature is computed but not included in the model.
73
+ # argrec, arglab, onestep: the feature is used only in this
74
+ # processing step
75
+ #
76
+ #
77
+ # The set of features computed must stay the same throughout
78
+ # an experiment (or the match of experiment file and
79
+ # database table will fail), but the set of features included
80
+ # in the model can be varied.
81
+ #
82
+ # See below for a list of all features currently available in the system.
83
+
84
+ feature = pt_path
85
+ feature = gf_path
86
+ feature = path
87
+ feature = path_length
88
+ feature = pt_combined_path
89
+ feature = gf_combined_path
90
+ feature = combined_path
91
+ feature = pt_partial_path
92
+ feature = gf_partial_path
93
+ feature = partial_path
94
+ feature = pt_gvpath
95
+ feature = gf_gvpath
96
+ feature = gvpath
97
+ feature = ancestor_rule
98
+ feature = relpos
99
+ feature = pt
100
+ feature = gf
101
+ feature = father_pt
102
+ feature = frame
103
+ feature = target
104
+ feature = target_pos
105
+ feature = target_voice
106
+ feature = gov_verb
107
+ feature = prep
108
+ feature = const_head
109
+ feature = const_head_pos
110
+ feature = icont_word
111
+ feature = firstword
112
+ feature = lastword
113
+ feature = leftsib
114
+ feature = rightsib
115
+ feature = worddistance
116
+ feature = ismaxproj
117
+ feature = nearest_node
118
+ feature = prune
119
+
120
+ ########################
121
+ # classifiers
122
+ #
123
+ # Please specify each classifier type you want to use.
124
+ # If you specify more than one classifier, classifier combination
125
+ # is used.
126
+ #
127
+ # Format for each classifier specification:
128
+ # classifier = <classifier_name> <path> [<parameters>]
129
+ #
130
+ # Possible values for <classifier_name> at the moment:
131
+ # timbl (memory-based learning),
132
+ # maxent (OpenNLP maxent system)
133
+ #
134
+ # Samples:
135
+ # classifier = timbl /prog/MachineLearning/Timbl5/
136
+ # classifier = maxent /prog/maxent-2.4.0 /prog/shalmaneser/program/tools/maxent
137
+
138
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 /home/arbox/work_space/shalm/dev/trunk/program_de/tools/maxent/
139
+
140
+ ########################
141
+ # further settings
142
+
143
+ # Pruning: Identify constituents that are very unlikely
144
+ # to instantiate a semantic role, and prune them prior
145
+ # to the training/application of classifiers?
146
+ #
147
+ # Pruning methods available at the moment:
148
+ # prune: Xue/Palmer EMNLP 2004, adapted to fit each individual parser
149
+ #
150
+ # To enable pruning, set "prune" to the pruning method of your choice,
151
+ # and also compute the feature of the same name -- see
152
+ # feature list below.
153
+ # To disable pruning, comment out the next line.
154
+ prune = prune
155
+
156
+ # verbose mode
157
+ verbose = true
158
+
159
+ # data adaptation:
160
+ # correct training labels to
161
+ # match syntax better?
162
+ fe_syn_repair = true
163
+ fe_rel_repair = false
164
+
165
+ # xwise: For each classification step (argrec, arglab, onestep)
166
+ # you can set the granularity of training:
167
+ # - by frame (frame)
168
+ # - by target part of speech or (target_pos)
169
+ # - by target lemma. (target)
170
+ #
171
+ # these three settings can be combined, e.g.
172
+ # xwise_argrec = target_pos frame
173
+ # to train argrec frame-wise and split each frame by target POS.
174
+ #
175
+ # If no value is given for xwise_<step>, the default is "frame".
176
+ xwise_argrec = frame
177
+ xwise_arglab = frame
178
+ xwise_onestep = frame
179
+
180
+
181
+ # assume_argrec_perfect: by default, this is false.
182
+ #
183
+ # Set this to true
184
+ # to perform the arglab (argument labeling) step
185
+ # on all instances that actually are FEs
186
+ # rather than on all instances that the argrec step
187
+ # has judged to be FEs.
188
+ assume_argrec_perfect = false
189
+
190
+ # split_nones: set to true
191
+ # to split the NONE target class into:
192
+ # NONE left of target,
193
+ # NONE right of target
194
+ # because the NONE class has so many more instances
195
+ # than any other.
196
+ split_nones = true
197
+
198
+
199
+ # print_eval_log: set to true to print individual correctness
200
+ # judgments for each instance evaluated
201
+ print_eval_log = true
202
+
203
+ # External data source:
204
+ #
205
+ # Rosy can integrate data computed by additional systems
206
+ # provided that they all use a common experiment file
207
+ # for external data to determine where they put their data.
208
+ # Rosy needs the path to that experiment file.
209
+ #
210
+ # (May be left unset when no external data is used)
211
+ #external_descr_file = %PATH%
212
+
213
+
214
+ ########################
215
+ # rosy internal data - please don't change
216
+
217
+ # Database access:
218
+ # dbtype: type of database, either mysql
219
+ # for a MySQL server, or sqlite for SQLite.
220
+ #
221
+ # if dbtype == mysql, set access parameters:
222
+ # host: database server
223
+ # user: user name to use
224
+ # passwd: password for user
225
+ # dbname: database where all Rosy's tables will be stored
226
+
227
+ dbtype = mysql
228
+ host = localhost
229
+ user = shalm
230
+ passwd = 12345
231
+ dbname = shalm11
232
+
233
+ # classifier output columns in the tables all start
234
+ # with this prefix
235
+ classif_column_name = classif
236
+
237
+ # pattern for constructing the names
238
+ # of the DB tables with training data (main_table_name)
239
+ # and test data (test_table_name)
240
+ main_table_name = rosy_<exp_ID>_main
241
+ test_table_name = rosy_<exp_ID>_<test_ID>
242
+
243
+ # string to use for "no value for this feature"
244
+ # as well as "no FE for this instance"
245
+ noval = NONE
246
+
247
+ # pattern for constructing the names
248
+ # of classifier files and classifier output files
249
+ classifier_file = classif.<classif>.<group>
250
+ classifier_output_file = classout.<classif>.<group>.<dataset>
251
+
252
+ # pattern for constructing the names
253
+ # of the evaluation file and the evaluation log file
254
+ eval_file = eval.<exp_ID>.<step>.<test_ID>
255
+ log_file = eval_log.<exp_ID>.<step>.<test_ID>
256
+
257
+ # pattern for constructing the names
258
+ # of the files with failed parses
259
+ failed_file = parsefail.<exp_ID>.<split_ID>.<dataset>