shalmaneser 1.2.0.rc4 → 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
@@ -1,180 +0,0 @@
1
- ##
2
- # splitting package for WSD:
3
- # compute a split for feature files (one item a line, CSV),
4
- # and apply pre-computed split
5
- # to produce new feature files accordingly
6
-
7
- require "tempfile"
8
-
9
- require "fred/FredDetermineTargets"
10
- require "fred/FredConventions"
11
-
12
##
# FredSplitPkg: splitting package for WSD.
#
# Computes a train/test split for feature files (one item per line, CSV),
# and applies a pre-computed split to produce new feature files accordingly.
class FredSplitPkg
  ###
  # exp: FredConfigData object describing the experiment
  def initialize(exp)
    @exp = exp
  end

  ###
  # Directory where split data for the given split ID is stored.
  #
  # exp:      FredConfigData object
  # split_id: string: split ID
  # mode:     string: "existing" (default) or "new"
  #
  # returns: string: directory name
  def FredSplitPkg.split_dir(exp, split_id, mode = "existing")
    return fred_dirname(exp, "split", split_id, mode)
  end

  ###
  # make a new split
  #
  # split_id:           string: ID of the new split
  # trainpercent:       float: percentage of items to assign to training
  # ignore_unambiguous: boolean: mark items of single-sense lemmas as "ignore"?
  def make_new_split(split_id,
                     trainpercent,
                     ignore_unambiguous = false)

    # where to store the split?
    split_dir = FredSplitPkg.split_dir(@exp, split_id, "new")

    lemmas_and_senses = Targets.new(@exp, nil, "r")
    unless lemmas_and_senses.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # Iterate through lemmas,
    # split training feature files.
    #
    # Do the split only once per lemma,
    # even if we have sense-specific feature files
    feature_dir = fred_dirname(@exp, "train", "features")

    lemmas_and_senses.get_lemmas().each { |lemma|
      # construct split file
      splitfilename = split_dir + fred_split_filename(lemma)
      begin
        splitfile = File.new(splitfilename, "w")
      rescue
        raise "Error: Couldn't write to file " + splitfilename
      end

      # find lemma-specific feature file
      filename = feature_dir + fred_feature_filename(lemma)

      # NOTE: File.exists? is deprecated (removed in Ruby 3.2); use File.exist?
      unless File.exist?(filename)
        # try lemma+sense-specific feature file
        file_pattern = fred_feature_filename(lemma, "*", true)
        filename = Dir[feature_dir + file_pattern].first()

        unless filename
          # no lemma+sense-specific feature file either: skip this lemma
          $stderr.puts "Warning: split: no feature file found for #{lemma}, skipping."
          splitfile.close()
          next
        end
      end

      # open feature file for reading
      begin
        file = File.new(filename)
      rescue
        raise "Couldn't read feature file " + filename
      end

      if ignore_unambiguous and
        lemmas_and_senses.get_senses(lemma).length() < 2
        # unambiguous: ignore

        while file.gets()
          splitfile.puts "ignore"
        end

      else
        # read from feature file, classify at random
        # as train or test,
        # write result to splitfile

        while file.gets()
          if rand() < trainpercent
            splitfile.puts "train"
          else
            splitfile.puts "test"
          end
        end
      end

      # BUGFIX: close the feature file as well (was leaked before)
      file.close()
      splitfile.close()
    }
  end

  ###
  # remove an old split
  def FredSplitPkg.remove_split(exp,     # FredConfigData object
                                splitID) # string: split ID
    begin
      split_dir = FredSplitPkg.split_dir(exp, splitID, "new")
    rescue
      # no split to be removed
      return
    end
    %x{rm -rf #{split_dir}}
  end

  ###
  # change feature files according to
  # pre-computed split
  #
  # filename: string: feature file
  # lemma:    string: lemma that filename is about
  # dataset:  string: "train" or "test"
  # split_id: string: split ID
  #
  # returns: tempfile containing featurized items,
  #   according to split,
  #   or nil if the split file wouldn't contain any data
  def apply_split(filename,
                  lemma,
                  dataset,
                  split_id)

    split_filename = FredSplitPkg.split_dir(@exp, split_id) +
                     fred_split_filename(lemma)

    # read feature file and split file at the same time
    # write to tempfile.
    f_feat = File.new(filename)
    f_split = File.new(split_filename)
    f_out = Tempfile.new("fred_split")

    num_yes = 0

    f_feat.each { |line|
      begin
        split_part = f_split.readline().chomp()
      rescue
        # split file ran out before the feature file did:
        # report, stop here, and return what we have so far.
        #
        # BUGFIX: a leftover debug statement (raise "HIER") used to sit here,
        # which made the recovery code below unreachable.
        $stderr.puts "FredSplit error: split file too short."
        $stderr.puts "skipping rest of featurization data."
        $stderr.puts "Split file: #{split_filename}"
        $stderr.puts "Feature file: #{filename}"
        f_out.close()
        f_feat.close()
        f_split.close()
        if num_yes > 0
          return f_out
        else
          return nil
        end
      end

      if split_part == dataset
        # write training data, and this item is in the training
        # part of the split,
        # or write test data, and item is in test part
        f_out.puts line
        num_yes += 1
      end
    }
    f_out.close()
    f_feat.close()
    f_split.close()

    if num_yes > 0
      return f_out
    else
      return nil
    end
  end
end
@@ -1,606 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # FredTest
3
- # Katrin Erk April 05
4
- #
5
- # Frame disambiguation system:
6
- # apply trained classifiers to test data
7
- # Results are written out one output line per instance line.
8
-
9
- # Ruby packages
10
- require "tempfile"
11
-
12
- # Salsa packages
13
- require "common/Parser"
14
- require "common/RegXML"
15
- require "common/SalsaTigerRegXML"
16
- require "common/ruby_class_extensions"
17
-
18
- # Shalmaneser packages
19
- require "common/ML"
20
- require "fred/Baseline"
21
- require "fred/FredConventions"
22
- require "fred/FredDetermineTargets"
23
- require "fred/FredSplitPkg"
24
- require "fred/FredFeatures"
25
- require "fred/FredNumTrainingSenses"
26
-
27
# FredTest
# Katrin Erk April 05
#
# Frame disambiguation system:
# apply trained classifiers to test data.
# Results are written out one output line per instance line.
class FredTest

  ###
  # new
  #
  # evaluate runtime options and announce the task
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  def initialize(exp_obj,
                 options)

    # keep the experiment file object
    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil
    @baseline = false
    @produce_output = true

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg

      when "--baseline"
        @baseline = true

      when "--nooutput"
        @produce_output = false

      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: "
    if @baseline
      $stderr.print "Computing baseline "
    else
      $stderr.print "Applying classifiers"
    end
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @produce_output and not @split_id
      $stderr.print "Output is to "
      if @exp.get("directory_output")
        $stderr.puts @exp.get("directory_output")
      else
        $stderr.puts fred_dirname(@exp, "output", "stxml", "new")
      end
    end
    $stderr.puts "---------"

    ###
    # prepare data:

    if @baseline
      # only compute baseline: always assign most frequent sense
      @classifiers = [
        [Baseline.new(@exp, @split_id), "baseline"]
      ]

    else
      # determine classifiers
      #
      # get_lf returns: array of pairs [classifier_name, options[array]]
      #
      # @classifiers: list of pairs [Classifier object, classifier name(string)]
      @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
        [Classifier.new(classif_name, options), classif_name]
      }
      # sanity check: we need at least one classifier
      if @classifiers.empty?
        $stderr.puts "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
        exit 1
      end

      if @classifiers.length() > 1
        $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
        $stderr.puts "so I'll be ignoring all but the first classifier type."
      end
    end

    # get an object for listing senses of each lemma
    @lemmas_and_senses = Targets.new(@exp, nil, "r")
  end

  ###
  # compute
  #
  # classify test instances,
  # write output to file.
  def compute()
    if @split_id
      # make split object and parameter hash to pass to it.
      # read feature data from training feature directory.
      split_obj = FredSplitPkg.new(@exp)
      dataset = "train"
    else
      # read feature data from test feature directory.
      dataset = "test"
    end

    output_dir = fred_dirname(@exp, "output", "tab", "new")
    classif_dir = fred_classifier_directory(@exp, @split_id)

    ###
    # remove old classifier output files
    # NOTE: File.exists? is deprecated (removed in Ruby 3.2); use File.exist?
    Dir[output_dir + "*"].each { |f|
      if File.exist? f
        File.delete(f)
      end
    }

    all_results = Array.new()

    ###
    # get a list of all relevant feature files: lemma, sense?
    lemma2_sense_and_filename = Hash.new()

    FredFeatureAccess.each_feature_file(@exp, dataset) { |filename, values|

      # catalogue under lemma
      unless lemma2_sense_and_filename[values["lemma"]]
        lemma2_sense_and_filename[values["lemma"]] = Array.new()
      end
      # catalogue only matches between chosen classifier type
      # and actually existing classifier type
      #
      # NOTE(review): earlier debugging notes here (in German) observed that
      # when "sense" is nil, lemma2_sense_and_filename may stay empty,
      # so no classifiers are found — verify feature file metadata upstream.
      if @exp.get("binary_classifiers") and \
        values["sense"] and not(values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [values["sense"], filename]

      elsif not(@exp.get("binary_classifiers")) and \
        (values["sense"].nil? or values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [nil, filename]
      end
    }

    ###
    # check whether we have classifiers
    found = 0
    found_single_sense = 0
    lemma2_sense_and_filename.each_pair { |lemma, senses_and_filenames|
      if @lemmas_and_senses.get_senses(lemma).length() == 1
        # lemma with only one sense? then mark as such
        found_single_sense += 1
      else
        # lemma with more than one sense: look for classifiers
        senses_and_filenames.each { |sense, filename|
          @classifiers.each { |classifier, classifier_name|
            if @exp.get("binary_classifiers") and \
              classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                        lemma, sense)
              found += 1
            elsif not(@exp.get("binary_classifiers")) and\
              classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                        lemma)
              found += 1
            end
          }
        }
      end
    }
    if found == 0 and found_single_sense < lemma2_sense_and_filename.length()
      # no matching classifiers found
      $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
      if @exp.get("binary_classifiers")
        $stderr.puts "(Looking for binary classifiers.)"
      else
        $stderr.puts "(Looking for n-ary classifiers.)"
      end
      $stderr.puts "Please check whether you mistyped the classifier directory name.

Another possibility: You may have trained binary classifiers, but
tried to apply n-ary ones (or vice versa.)
"
      exit 1
    end

    ###
    # each test feature set:
    # read classifier, apply
    # iterate through instance files
    lemma2_sense_and_filename.to_a().sort { |a, b|
      a.first() <=> b.first
    }.each { |lemma, senses_and_filenames|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Applying to " + lemma
      end

      # results_this_lemma: array of classifier_results
      # classifier_result: array of line_entries
      # line entry: list of pairs [sense, confidence]
      results_this_lemma = Array.new()

      training_senses = determine_training_senses(lemma, @exp,
                                                  @lemmas_and_senses, @split_id)

      senses_and_filenames.each { |sense, filename|

        # if we're splitting the data, do that now
        if split_obj
          tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
          if tempfile.nil?
            # the test part of the split doesn't contain any data
            $stderr.puts "Skipping #{lemma}: no test data in split"
            next
          end

          filename = tempfile.path()
        end

        if training_senses.length() == 1
          # single-sense lemma: just assign that sense to all occurrences
          assigned_sense = training_senses.first()

          classifier_result = Array.new()
          f = File.open(filename)

          f.each { |line| classifier_result << [[assigned_sense, 1.0]] }
          # BUGFIX: close the feature file (was leaked before)
          f.close()
          results_this_lemma << classifier_result

        else
          # more than one sense: apply classifier(s)

          # classifiers_read_okay:
          # boolean, true if reading the stored classifier(s) succeeded
          classifiers_read_okay = true
          @classifiers.each { |classifier, classifier_name|

            stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
                                                                       lemma, sense)
            status = classifier.read(stored_classifier)
            unless status
              $stderr.puts "[FredTest] Error: could not read classifier."
              classifiers_read_okay = false
            end
          }

          if classifiers_read_okay
            # apply classifiers, write result to database
            classifier_results = apply_classifiers(filename, classif_dir)

            if classifier_results.empty?
              # something went wrong during the application of classifiers
              $stderr.puts "Error while working on #{lemma}, skipping"
            else
              # we have classifier results:
              # since we're not doing any classifier combination at the moment
              # (if we did, this would be the place to do so!)
              # discard the results of all but the first classifier
              results_this_lemma << classifier_results.first()
            end
          end

          if split_obj
            # remove the temporary split file
            tempfile.close(true)
          end
        end
      }

      # write to output file:
      # if we have binary classifiers, join.
      results_this_lemma = join_binary_classifier_results(results_this_lemma)

      outfilename = output_dir + fred_result_filename(lemma)
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Couldn't write to result file " + outfilename
      end

      if results_this_lemma.nil?
        # nothing has been done for this lemma
        # BUGFIX: close the output file before skipping (was leaked before)
        outfile.close()
        next
      end

      results_this_lemma.each { |result|
        # result: an ordered list of pairs [label, confidence]
        outfile.puts result.map { |label, confidence|
          "#{label} #{confidence}"
        }.join(" ")
      }
      # BUGFIX: close the output file (was leaked before)
      outfile.close()

      # remember results for output
      if @produce_output
        all_results << [lemma, results_this_lemma]
      end
    }

    ##
    # produce output: disambiguated data in SalsaTigerXML format
    if @produce_output
      salsatiger_output(all_results)
    end

  end

  #####
  private

  #########################
  # Apply all configured classifiers to one feature file.
  #
  # filename:    string: name of feature file
  # classif_dir: string: name of directory with classifiers
  #
  # returns: list of classifier results (one entry per classifier type),
  #   where each result is a list of line entries,
  #   and a line entry is a list of pairs [sense, confidence];
  #   empty Array on error.
  def apply_classifiers(filename,
                        classif_dir)

    # make output file for classifiers
    tf_output = Tempfile.new("fred")
    tf_output.close()

    ###
    # apply classifiers

    classifier_results = Array.new

    @classifiers.each { |classifier, classifier_name|

      success = classifier.apply(filename, tf_output.path())

      # did we manage to classify the test data?
      # there may be errors on the way (eg no training data)
      if success
        # read classifier output from file
        # classifier_results: list of line entries
        # line entry: list of pairs [sense, confidence]
        classifier_results << classifier.read_resultfile(tf_output.path())

      else
        # error: return empty Array, so that error handling can take over
        return Array.new
      end
    }

    # if we are here, all classifiers have succeeded...

    # clean up: remove the temporary output file
    tf_output.close(true)

    # return list of classifier results,
    # each entry is a list of results,
    # one entry per classifier type
    return classifier_results
  end

  ###
  # join binary classifier results (if we are doing binary classifiers):
  # if we have classifiers that are specific to individual senses,
  # collect all classifiers that we have for a lemma, and
  # for each instance, choose the sense that won with the highest confidence
  #
  # input: a list of result lists.
  #   a result list is a list of instance_results
  #   instance_results is a list of pairs [label, confidence]
  #   such that the label with the highest confidence is mentioned first
  #
  # output: a result list, or nil if there were no instances.
  def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
    unless @exp.get("binary_classifiers")
      # we are doing lemma-specific, not sense-specific classifiers.
      # so resultlist is a list containing just one entry.
      #   all classifier: list of lists of lists of pairs label, confidence
      #   one classifier: list of lists of pairs label, confidence
      #   line: list of pairs label, confidence
      #   label: pair label, confidence
      return resultlists.first()
    end

    # we are doing sense-specific classifiers.
    # group triples

    # what is the name of the negative sense?
    unless (negsense = @exp.get("negsense"))
      negsense = "NONE"
    end

    # retv: list of instance results
    # where an instance result is a list of pairs [label, confidence]
    retv = Array.new()

    # choose the sense that was assigned with highest confidence
    # how many instances? max. length of any of the instance lists
    # (we'll deal with mismatches in instance numbers later)
    num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length() }.max()
    if num_instances.nil?
      # no instances, it seems
      return nil
    end

    0.upto(num_instances - 1) { |instno|

      # get the results of all classifiers for instance number instno
      all_results_this_instance = resultlists.map { |list_one_classifier|
        # get the instno-th line
        if list_one_classifier.at(instno)
          list_one_classifier.at(instno)
        else
          # length mismatch: we're missing an instance
          $stderr.puts "Error: binary classifier results don't all have the same length."
          $stderr.puts "Assuming missing results to be negative."
          [["NONE", 1.0]]
        end
      }

      # now throw out the negsense judgments, and sort results by confidence
      joint_result_this_instance = all_results_this_instance.map { |inst_result|
        # if we have more than 2 entries here,
        # this is very weird for a binary classifier
        if inst_result.length() > 2
          $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
          $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
          $stderr.puts "Only considering the first non-negative sense."
        end

        # choose the first entry that is not the negsense,
        # or nil, if only the negative sense has been assigned with 1.0 certainty.
        # nil choices will be removed by the compact() below
        inst_result.detect { |label, confidence|
          label != negsense
        }
      }.compact().sort { |a, b|
        # sort senses by confidence, highest confidence first
        b[1] <=> a[1]
      }

      retv << joint_result_this_instance
    }

    return retv
  end


  ###
  # produce output in SalsaTigerXML: disambiguated training data,
  # assigned senses are recorded as frames, the targets of which are the
  # disambiguated words
  def salsatiger_output(all_results)

    if @split_id
      # we're not writing Salsa/Tiger XML output for splits.
      $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
      $stderr.puts "only for separate test sets."
      return
    end

    ##
    # determine output directory
    if @exp.get("directory_output")
      output_dir = File.new_dir(@exp.get("directory_output"))
    else
      output_dir = fred_dirname(@exp, "output", "stxml", "new")
    end

    $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"

    ##
    # empty output directory
    # NOTE: File.exists? is deprecated (removed in Ruby 3.2); use File.exist?
    Dir[output_dir + "*"].each { |filename|
      if File.exist?(filename)
        File.delete(filename)
      end
    }

    # input directory: where we stored the zipped input files
    input_dir = fred_dirname(@exp, "test", "input_data")

    ##
    # map results to target IDs, using answer key files

    # record results: hash
    # <sentence ID>(string) -> assigned senses
    # where assigned senses are a list of tuples
    # [target IDs, sense, lemma, pos]
    recorded_results = Hash.new

    all_results.each { |lemma, results|
      answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")

      instance_index = 0
      answer_obj.each { |a_lemma, a_pos, a_targetIDs, a_sid, a_senses, a_senses_this|
        key = a_sid

        unless recorded_results[key]
          recorded_results[key] = Array.new()
        end

        labels_and_senses_for_this_instance = results.at(instance_index)
        if not(labels_and_senses_for_this_instance.empty?) and
          (winning_sense = labels_and_senses_for_this_instance.first().first())

          recorded_results[key] << [a_targetIDs, winning_sense, a_lemma, a_pos]
        end

        instance_index += 1
      } # each answerkey line for this lemma
    } # each lemma/results pair


    ##
    # read in SalsaTiger syntax, remove old semantics, add new semantics, write

    Dir[input_dir + "*.xml.gz"].each { |filename|
      # unzip input file
      tempfile = Tempfile.new("FredTest")
      tempfile.close()
      # BUGFIX: the input filename interpolation was garbled here;
      # gunzip the current input file into the tempfile.
      %x{gunzip -c #{filename} > #{tempfile.path()}}

      infile = FilePartsParser.new(tempfile.path())
      if @exp.get("verbose")
        $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
      end

      begin
        outfile = File.new(output_dir + File.basename(filename, ".gz"), "w")
      rescue
        $stderr.puts "Couldn't write to output file #{output_dir}#{File.basename(filename)}."
        $stderr.puts "Skipping Salsa/Tiger XML output."
        return
      end

      # write header
      outfile.puts infile.head()

      infile.scan_s { |sent_string|
        sent = SalsaTigerSentence.new(sent_string)

        # remove old semantics
        sent.remove_semantics()

        if recorded_results and recorded_results[sent.id()]
          recorded_results[sent.id()].each { |target_ids, sense, lemma, pos|

            # add frame to sentence
            new_frame = sent.add_frame(sense)

            # get list of target nodes from target IDs
            # assuming that target_ids is a string of target IDs
            # separated by comma.
            # IDs for which no node could be found are just ignored

            targets = target_ids.map { |target_id|
              sent.syn_node_with_id(target_id)
            }.compact
            # enter the target nodes for this new frame
            new_frame.add_fe("target", targets)

            # put lemma and POS info into <target>
            new_frame.target.set_attribute("lemma", lemma)
            new_frame.target.set_attribute("pos", pos)
          }
        end

        # write changed sentence:
        # only if there are recorded results for this sentence!
        outfile.puts sent.get()

      } # each sentence of file

      # write footer
      outfile.puts infile.tail()
      outfile.close()
      tempfile.close(true)
    } # each SalsaTiger file of the input directory

  end

end