frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,138 @@
+ #################################################
+ # This is a sample experiment file
+ # with explanations of all features
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
+ #
+ # To start your own experiment,
+ # replace all occurrences of
+ # %...% by values of your choice.
+ #
+ # Boolean features may be omitted and are false by default.
+ #
+ # Experiment file lines that start with '#'
+ # are comments and are ignored. Empty lines are ignored as well.
+
+ ########################
+ # Experiment description
+ #
+
+ # ID identifying this experiment and all its data
+ # please do not use spaces inside the experiment ID
+ prep_experiment_ID = prp_train
+
+ # YOUR INPUT DATA:
+ # frprep accepts an input directory rather than an input file.
+ # It will process all files in the directory directory_input
+ # and write the results to directory_preprocessed.
+ #
+ # For input formats see the discussion of "format" below.
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
+ directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/train.salsa') %>
+
+ ##
+ # Experimental data is described by the following parameters:
+ #
+ # - language: en / de
+ # en for English or de for German
+ #
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
+ #
+ # Format of the input data, training/test set
+ # SalsaTigerXML: Parsed data, English or German
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
+ # SalsaTab: tabular format (internal)
+ # BNC: BNC XML format, alternating words and POS tags
+ # Plain: Plain text, ONE SENTENCE PER LINE.
+ #
+ # Preprocessing transforms all data to SalsaTigerXML.
+ #
+ # - origin: SalsaTiger / FrameNet / <not specified>
+ # This is the origin of the training/test data.
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
+ # annotated by Salsa
+ # FrameNet: data from the FrameNet project
+ #
+ # Don't set 'origin' if none of these origins apply
+ #
+ # - encoding: utf8 / iso / hex / <not specified>
+ # Default: iso
+
+ language = de
+ #origin =
+ format = SalsaTigerXML
+ encoding = utf8
+
+ #############################
+ # Which preprocessing steps to take?
+ #
+ # Data can be parsed, lemmatized and POS-tagged,
+ # but this happens only if it is specified in the
+ # experiment file.
+ #
+ # Set these booleans to true to trigger the respective
+ # type of preprocessing. The default value is false.
+
+ do_lemmatize = true
+ do_postag = false
+ do_parse = true
+
+ #############################
+ # directory where frprep puts its internal data
+ #
+
+ frprep_directory = <%= File.expand_path('test/functional/input/rosy/') %>
+
+ #############################
+ # Syntax/semantics interface repair:
+ # FrameNet annotated data has some annotation choices
+ # that may make it harder to learn the mapping from
+ # syntactic structure to semantic roles.
+ #
+ # If you are using FrameNet data for training a
+ # semantic role labeler, set the following two settings
+ # to true (default is false) to 'repair' semantic role labels
+ # to more closely match the syntactic structure
+
+ fe_syn_repair = true
+ fe_rel_repair = false
+
+
+ #################
+ # Location of tools and resources used by Fred
+
+ # currently known to the system:
+ # (Saarbruecken paths given)
+ #
+ # - POS tagging:
+ # - pos_tagger = treetagger
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
+ #
+ # - Lemmatization:
+ # - lemmatizer = treetagger
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
+ #
+ # - Parser:
+ # - parser = collins (English)
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
+ # - parser = sleepy (German)
+ # parser_path = /proj/corpora/sleepy3/
+ # - parser = minipar (English)
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
+ #
+ pos_tagger = treetagger
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
+
+ lemmatizer = treetagger
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
+
+ parser = berkeley
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
+
+ # parser:
+ # maximum no. of sentences in a parse file,
+ # maximum sentence length to be parsed
+
+ parser_max_sent_num = 2000
+ parser_max_sent_len = 80
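
Note: the sample experiment files shown here are ERB templates; every <%= File.expand_path(...) %> placeholder has to be rendered into a concrete path before frprep or Rosy can read the file. A minimal Ruby sketch of that rendering step follows. It assumes it is run from the checked-out repository root, and the output filename 'prp_train.salsa' is an arbitrary choice for this illustration; the gem's own functional tests may handle rendering differently.

    require 'erb'

    # Read one of the sample experiment templates and evaluate its ERB tags.
    # File.expand_path inside the template resolves relative to the current
    # working directory, so run this from the repository root.
    template = File.read('test/functional/sample_experiment_files/prp_train.salsa.erb')
    rendered = ERB.new(template).result(binding)

    # Write the rendered, plain-text experiment file (name chosen for this sketch).
    File.write('prp_train.salsa', rendered)

    # Show that the placeholders became absolute paths.
    puts rendered.lines.grep(/directory_preprocessed/)
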
@@ -0,0 +1,257 @@
+ #################################################
+ # This is a sample experiment file
+ # with explanations of all features
+ # that can be set for the ROSY system.
+ #
+ # To start your own experiment,
+ # replace all occurrences of
+ # %SOMETHING% or %PATH% or %PARAMETERS%
+ # by values of your choice.
+ #
+ # Experiment file lines that start with '#'
+ # are comments and are ignored. Empty lines are ignored as well.
+
+ ########################
+ # Experiment description
+ #
+
+ ##
+ # Experiment ID:
+ # Uniquely identifies files and database tables
+ # of this experiment.
+ # The experiment ID is a word (no spaces) of
+ # letters in [A-Za-z_].
+ experiment_ID = rosy_test
+
+ # Enduser mode?
+ # The idea is that the enduser will only _apply_
+ # pre-trained classifiers. So in enduser mode many
+ # options are disallowed.
+ enduser_mode = false
+
+ # directories
+ # - data directory: where Rosy puts its internal data
+ # - input directory:
+ # where Rosy reads its input SalsaTigerXML data.
+ # One directory each for the training and the test data
+ # - output directory:
+ # where Rosy writes its output SalsaTigerXML data:
+ # same frames as in the input data, but frame elements newly
+ # assigned.
+ # If no output directory is given, output is to
+ # <data_dir>/<experiment_ID>/output/
+ # - classifier_dir: If present, this is where trained classifiers
+ # are written.
+ # Otherwise they are written to <data_dir>/<experiment_id>/classif_dir
+ data_dir = <%= File.expand_path('test/functional/output') %>
+ directory_input_test = <%= File.expand_path('test/functional/input/rosy/test.salsa') %>
+ classifier_dir = <%= File.expand_path('test/functional/input/rosy/cls') %>
+
+ ##
+ # Preprocessing settings:
+ # frprep experiment files for training and test data.
+ preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone') %>
+ preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone') %>
+
+
+ ########################
+ # features
+ #
+ # Please specify all features that you would like
+ # Rosy to compute.
+ # Note: The system distinguishes between features to be
+ # computed and features to be included in the model,
+ # so you can compute features once and then vary features
+ # included in the model.
+ #
+ # Format for each feature specification:
+ # feature = <feature_name> [dontuse | argrec | arglab | onestep]
+ #
+ # dontuse: the feature is computed but not included in the model.
+ # argrec, arglab, onestep: the feature is used only in this
+ # processing step
+ #
+ #
+ # The set of features computed must stay the same throughout
+ # an experiment (or the match of experiment file and
+ # database table will fail), but the set of features included
+ # in the model can be varied.
+ #
+ # See below for a list of all features currently available in the system.
+
+ feature = pt_path
+ feature = gf_path
+ feature = path
+ feature = path_length
+ feature = pt_combined_path
+ feature = gf_combined_path
+ feature = combined_path
+ feature = pt_partial_path
+ feature = gf_partial_path
+ feature = partial_path
+ feature = pt_gvpath
+ feature = gf_gvpath
+ feature = gvpath
+ feature = ancestor_rule
+ feature = relpos
+ feature = pt
+ feature = gf
+ feature = father_pt
+ feature = frame
+ feature = target
+ feature = target_pos
+ feature = target_voice
+ feature = gov_verb
+ feature = prep
+ feature = const_head
+ feature = const_head_pos
+ feature = icont_word
+ feature = firstword
+ feature = lastword
+ feature = leftsib
+ feature = rightsib
+ feature = worddistance
+ feature = ismaxproj
+ feature = nearest_node
+ feature = prune
+
+ ########################
+ # classifiers
+ #
+ # Please specify each classifier type you want to use.
+ # If you specify more than one classifier, classifier combination
+ # is used.
+ #
+ # Format for each classifier specification:
+ # classifier = <classifier_name> <path> [<parameters>]
+ #
+ # Possible values for <classifier_name> at the moment:
+ # timbl (memory-based learning),
+ # maxent (OpenNLP maxent system)
+ #
+ # Samples:
+ # classifier = timbl /prog/MachineLearning/Timbl5/
+ # classifier = maxent /prog/maxent-2.4.0 /prog/shalmaneser/program/tools/maxent
+
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
+
+ ########################
+ # further settings
+
+ # Pruning: Identify constituents that are very unlikely
+ # to instantiate a semantic role, and prune them prior
+ # to the training/application of classifiers?
+ #
+ # Pruning methods available at the moment:
+ # prune: Xue/Palmer EMNLP 2004, adapted to fit each individual parser
+ #
+ # To enable pruning, set "prune" to the pruning method of your choice,
+ # and also compute the feature of the same name -- see
+ # the feature list above.
+ # To disable pruning, comment out the next line.
+ prune = prune
+
+ # verbose mode
+ verbose = true
+
+ # data adaptation:
+ # correct training labels to
+ # match syntax better?
+ fe_syn_repair = true
+ fe_rel_repair = false
+
+ # xwise: For each classification step (argrec, arglab, onestep)
+ # you can set the granularity of training:
+ # - by frame (frame)
+ # - by target part of speech (target_pos), or
+ # - by target lemma (target).
+ #
+ # these three settings can be combined, e.g.
+ # xwise_argrec = target_pos frame
+ # to train argrec frame-wise and split each frame by target POS.
+ #
+ # If no value is given for xwise_<step>, the default is "frame".
+ xwise_argrec = frame
+ xwise_arglab = frame
+ xwise_onestep = frame
+
+
+ # assume_argrec_perfect: by default, this is false.
+ #
+ # Set this to true
+ # to perform the arglab (argument labeling) step
+ # on all instances that actually are FEs
+ # rather than on all instances that the argrec step
+ # has judged to be FEs.
+ assume_argrec_perfect = false
+
+ # split_nones: set to true
+ # to split the NONE target class into:
+ # NONE left of target,
+ # NONE right of target
+ # because the NONE class has so many more instances
+ # than any other.
+ split_nones = true
+
+
+ # print_eval_log: set to true to print individual correctness
+ # judgments for each instance evaluated
+ print_eval_log = true
+
+ # External data source:
+ #
+ # Rosy can integrate data computed by additional systems
+ # provided that they all use a common experiment file
+ # for external data to determine where they put their data.
+ # Rosy needs the path to that experiment file.
+ #
+ # (May be left unset when no external data is used)
+ #external_descr_file = %PATH%
+
+
+ ########################
+ # rosy internal data - please don't change
+
+ # Database access:
+ # dbtype: type of database, either mysql
+ # for a MySQL server, or sqlite for SQLite.
+ #
+ # if dbtype == mysql, set access parameters:
+ # host: database server
+ # user: user name to use
+ # passwd: password for user
+ # dbname: database where all Rosy's tables will be stored
+
+ dbtype = mysql
+ host = localhost
+ user = shalm
+ passwd = 12345
+ dbname = shalm11
+
+ # classifier output columns in the tables all start
+ # with this prefix
+ classif_column_name = classif
+
+ # pattern for constructing the names
+ # of the DB tables with training data (main_table_name)
+ # and test data (test_table_name)
+ main_table_name = rosy_<exp_ID>_main
+ test_table_name = rosy_<exp_ID>_<test_ID>
+
+ # string to use for "no value for this feature"
+ # as well as "no FE for this instance"
+ noval = NONE
+
+ # pattern for constructing the names
+ # of classifier files and classifier output files
+ classifier_file = classif.<classif>.<group>
+ classifier_output_file = classout.<classif>.<group>.<dataset>
+
+ # pattern for constructing the names
+ # of the evaluation file and the evaluation log file
+ eval_file = eval.<exp_ID>.<step>.<test_ID>
+ log_file = eval_log.<exp_ID>.<step>.<test_ID>
+
+ # pattern for constructing the names
+ # of the files with failed parses
+ failed_file = parsefail.<exp_ID>.<split_ID>.<dataset>
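
Note: the "rosy internal data" block above defines naming patterns in which <exp_ID>, <test_ID>, <step>, <classif>, <group>, <dataset> and <split_ID> are filled in at runtime. The Ruby sketch below is an illustration only: the substitution helper and the sample values (such as the test ID 'tst') are hypothetical and are not Rosy's own code; they merely show what the placeholders stand for when experiment_ID = rosy_test.

    # Hypothetical helper: replace <placeholder> tokens in a naming pattern.
    def expand(pattern, values)
      pattern.gsub(/<(\w+)>/) { values.fetch(Regexp.last_match(1)) }
    end

    # Sample values, chosen for illustration.
    values = {
      'exp_ID'  => 'rosy_test', 'test_ID' => 'tst',   'step'    => 'argrec',
      'classif' => 'maxent',    'group'   => 'frame', 'dataset' => 'test',
      'split_ID' => 'split1'
    }

    puts expand('rosy_<exp_ID>_main', values)               # => rosy_rosy_test_main
    puts expand('eval.<exp_ID>.<step>.<test_ID>', values)   # => eval.rosy_test.argrec.tst
    puts expand('classout.<classif>.<group>.<dataset>', values)
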
@@ -0,0 +1,259 @@
+ #################################################
+ # This is a sample experiment file
+ # with explanations of all features
+ # that can be set for the ROSY system.
+ #
+ # To start your own experiment,
+ # replace all occurrences of
+ # %SOMETHING% or %PATH% or %PARAMETERS%
+ # by values of your choice.
+ #
+ # Experiment file lines that start with '#'
+ # are comments and are ignored. Empty lines are ignored as well.
+
+ ########################
+ # Experiment description
+ #
+
+ ##
+ # Experiment ID:
+ # Uniquely identifies files and database tables
+ # of this experiment.
+ # The experiment ID is a word (no spaces) of
+ # letters in [A-Za-z_].
+ experiment_ID = rosy_train
+
+ # Enduser mode?
+ # The idea is that the enduser will only _apply_
+ # pre-trained classifiers. So in enduser mode many
+ # options are disallowed.
+ enduser_mode = false
+
+ # directories
+ # - data directory: where Rosy puts its internal data
+ # - input directory:
+ # where Rosy reads its input SalsaTigerXML data.
+ # One directory each for the training and the test data
+ # - output directory:
+ # where Rosy writes its output SalsaTigerXML data:
+ # same frames as in the input data, but frame elements newly
+ # assigned.
+ # If no output directory is given, output is to
+ # <data_dir>/<experiment_ID>/output/
+ # - classifier_dir: If present, this is where trained classifiers
+ # are written.
+ # Otherwise they are written to <data_dir>/<experiment_id>/classif_dir
+ data_dir = /home/arbox/work_space/shalm/german/prog/output
+ directory_input_train = <%= File.expand_path('test/functional/output/prp_train/stxml_split') %>
+ directory_input_test = <%= File.expand_path('test/functional/output/exp_fred_salsa/output/stxml') %>
+ directory_output = <%= File.expand_path('test/functional/output/exp_rosy_salsa/output') %>
+
+
+ ##
+ # Preprocessing settings:
+ # frprep experiment files for training and test data.
+ preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa') %>
+ preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa') %>
+
+
+ ########################
+ # features
+ #
+ # Please specify all features that you would like
+ # Rosy to compute.
+ # Note: The system distinguishes between features to be
+ # computed and features to be included in the model,
+ # so you can compute features once and then vary features
+ # included in the model.
+ #
+ # Format for each feature specification:
+ # feature = <feature_name> [dontuse | argrec | arglab | onestep]
+ #
+ # dontuse: the feature is computed but not included in the model.
+ # argrec, arglab, onestep: the feature is used only in this
+ # processing step
+ #
+ #
+ # The set of features computed must stay the same throughout
+ # an experiment (or the match of experiment file and
+ # database table will fail), but the set of features included
+ # in the model can be varied.
+ #
+ # See below for a list of all features currently available in the system.
+
+ feature = pt_path
+ feature = gf_path
+ feature = path
+ feature = path_length
+ feature = pt_combined_path
+ feature = gf_combined_path
+ feature = combined_path
+ feature = pt_partial_path
+ feature = gf_partial_path
+ feature = partial_path
+ feature = pt_gvpath
+ feature = gf_gvpath
+ feature = gvpath
+ feature = ancestor_rule
+ feature = relpos
+ feature = pt
+ feature = gf
+ feature = father_pt
+ feature = frame
+ feature = target
+ feature = target_pos
+ feature = target_voice
+ feature = gov_verb
+ feature = prep
+ feature = const_head
+ feature = const_head_pos
+ feature = icont_word
+ feature = firstword
+ feature = lastword
+ feature = leftsib
+ feature = rightsib
+ feature = worddistance
+ feature = ismaxproj
+ feature = nearest_node
+ feature = prune
+
+ ########################
+ # classifiers
+ #
+ # Please specify each classifier type you want to use.
+ # If you specify more than one classifier, classifier combination
+ # is used.
+ #
+ # Format for each classifier specification:
+ # classifier = <classifier_name> <path> [<parameters>]
+ #
+ # Possible values for <classifier_name> at the moment:
+ # timbl (memory-based learning),
+ # maxent (OpenNLP maxent system)
+ #
+ # Samples:
+ # classifier = timbl /prog/MachineLearning/Timbl5/
+ # classifier = maxent /prog/maxent-2.4.0 /prog/shalmaneser/program/tools/maxent
+
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 /home/arbox/work_space/shalm/dev/trunk/program_de/tools/maxent/
+
+ ########################
+ # further settings
+
+ # Pruning: Identify constituents that are very unlikely
+ # to instantiate a semantic role, and prune them prior
+ # to the training/application of classifiers?
+ #
+ # Pruning methods available at the moment:
+ # prune: Xue/Palmer EMNLP 2004, adapted to fit each individual parser
+ #
+ # To enable pruning, set "prune" to the pruning method of your choice,
+ # and also compute the feature of the same name -- see
+ # the feature list above.
+ # To disable pruning, comment out the next line.
+ prune = prune
+
+ # verbose mode
+ verbose = true
+
+ # data adaptation:
+ # correct training labels to
+ # match syntax better?
+ fe_syn_repair = true
+ fe_rel_repair = false
+
+ # xwise: For each classification step (argrec, arglab, onestep)
+ # you can set the granularity of training:
+ # - by frame (frame)
+ # - by target part of speech (target_pos), or
+ # - by target lemma (target).
+ #
+ # these three settings can be combined, e.g.
+ # xwise_argrec = target_pos frame
+ # to train argrec frame-wise and split each frame by target POS.
+ #
+ # If no value is given for xwise_<step>, the default is "frame".
+ xwise_argrec = frame
+ xwise_arglab = frame
+ xwise_onestep = frame
+
+
+ # assume_argrec_perfect: by default, this is false.
+ #
+ # Set this to true
+ # to perform the arglab (argument labeling) step
+ # on all instances that actually are FEs
+ # rather than on all instances that the argrec step
+ # has judged to be FEs.
+ assume_argrec_perfect = false
+
+ # split_nones: set to true
+ # to split the NONE target class into:
+ # NONE left of target,
+ # NONE right of target
+ # because the NONE class has so many more instances
+ # than any other.
+ split_nones = true
+
+
+ # print_eval_log: set to true to print individual correctness
+ # judgments for each instance evaluated
+ print_eval_log = true
+
+ # External data source:
+ #
+ # Rosy can integrate data computed by additional systems
+ # provided that they all use a common experiment file
+ # for external data to determine where they put their data.
+ # Rosy needs the path to that experiment file.
+ #
+ # (May be left unset when no external data is used)
+ #external_descr_file = %PATH%
+
+
+ ########################
+ # rosy internal data - please don't change
+
+ # Database access:
+ # dbtype: type of database, either mysql
+ # for a MySQL server, or sqlite for SQLite.
+ #
+ # if dbtype == mysql, set access parameters:
+ # host: database server
+ # user: user name to use
+ # passwd: password for user
+ # dbname: database where all Rosy's tables will be stored
+
+ dbtype = mysql
+ host = localhost
+ user = shalm
+ passwd = 12345
+ dbname = shalm11
+
+ # classifier output columns in the tables all start
+ # with this prefix
+ classif_column_name = classif
+
+ # pattern for constructing the names
+ # of the DB tables with training data (main_table_name)
+ # and test data (test_table_name)
+ main_table_name = rosy_<exp_ID>_main
+ test_table_name = rosy_<exp_ID>_<test_ID>
+
+ # string to use for "no value for this feature"
+ # as well as "no FE for this instance"
+ noval = NONE
+
+ # pattern for constructing the names
+ # of classifier files and classifier output files
+ classifier_file = classif.<classif>.<group>
+ classifier_output_file = classout.<classif>.<group>.<dataset>
+
+ # pattern for constructing the names
+ # of the evaluation file and the evaluation log file
+ eval_file = eval.<exp_ID>.<step>.<test_ID>
+ log_file = eval_log.<exp_ID>.<step>.<test_ID>
+
+ # pattern for constructing the names
+ # of the files with failed parses
+ failed_file = parsefail.<exp_ID>.<split_ID>.<dataset>
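
Note: all three experiment files follow the same line format described in their headers: lines starting with '#' and empty lines are ignored, everything else is a "key = value" pair, and some keys (such as feature or classifier) may occur several times. The Ruby sketch below is only an illustration of that format, not the gem's own ConfigData class, and 'rosy_train.salsa' stands for an already rendered copy of the last template above.

    # Minimal reader for the "key = value" experiment file format.
    def read_experiment_file(path)
      settings = Hash.new { |hash, key| hash[key] = [] }
      File.foreach(path) do |line|
        line = line.strip
        next if line.empty? || line.start_with?('#')   # skip comments and blank lines
        key, value = line.split('=', 2).map(&:strip)
        settings[key] << value                          # repeated keys accumulate
      end
      settings
    end

    conf = read_experiment_file('rosy_train.salsa')
    puts conf['experiment_ID'].first   # => "rosy_train"
    puts conf['feature'].size          # number of "feature = ..." lines
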