frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
data/lib/fred/FredSplitPkg.rb
@@ -0,0 +1,180 @@
##
# splitting package for WSD:
# compute a split for feature files (one item a line, CSV),
# and apply pre-computed split
# to produce new feature files accordingly

require "tempfile"

require "fred/FredDetermineTargets"
require "fred/FredConventions"

class FredSplitPkg
  ###
  def initialize(exp)
    @exp = exp
  end

  ###
  def FredSplitPkg.split_dir(exp, split_id, mode = "existing")
    return fred_dirname(exp, "split", split_id, mode)
  end

  ###
  # make a new split
  def make_new_split(split_id,     # string: ID
                     trainpercent, # float: percentage training data
                     ignore_unambiguous = false)

    # where to store the split?
    split_dir = FredSplitPkg.split_dir(@exp, split_id, "new")

    lemmas_and_senses = Targets.new(@exp, nil, "r")
    unless lemmas_and_senses.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # Iterate through lemmas,
    # split training feature files.
    #
    # Do the split only once per lemma,
    # even if we have sense-specific feature files
    feature_dir = fred_dirname(@exp, "train", "features")

    lemmas_and_senses.get_lemmas().each { |lemma|
      # construct split file
      splitfilename = split_dir + fred_split_filename(lemma)
      begin
        splitfile = File.new(splitfilename, "w")
      rescue
        raise "Error: Couldn't write to file " + splitfilename
      end

      # find lemma-specific feature file
      filename = feature_dir + fred_feature_filename(lemma)

      unless File.exists?(filename)
        # try lemma+sense-specific feature file
        file_pattern = fred_feature_filename(lemma, "*", true)
        filename = Dir[feature_dir + file_pattern].first()

        unless filename
          # no lemma+sense-specific feature file
          $stderr.puts "Warning: split: no feature file found for #{lemma}, skipping."
          splitfile.close()
          next
        end
      end

      # open feature file for reading
      begin
        file = File.new(filename)
      rescue
        raise "Couldn't read feature file " + filename
      end

      if ignore_unambiguous and
         lemmas_and_senses.get_senses(lemma).length() < 2
        # unambiguous: ignore

        while file.gets()
          splitfile.puts "ignore"
        end

      else
        # read from feature file, classify at random
        # as train or test,
        # write result to splitfile

        while file.gets()
          if rand() < trainpercent
            splitfile.puts "train"
          else
            splitfile.puts "test"
          end
        end
      end

      splitfile.close()
    }
  end

  ###
  # remove an old split
  def FredSplitPkg.remove_split(exp,     # FredConfigData object
                                splitID) # string: split ID
    begin
      split_dir = FredSplitPkg.split_dir(exp, splitID, "new")
    rescue
      # no split to be removed
      return
    end
    %x{rm -rf #{split_dir}}
  end


  ###
  # change feature files according to
  # pre-computed split
  #
  # returns: tempfile containing featurized items,
  #   according to split,
  #   or nil if the split file wouldn't contain any data
  def apply_split(filename, # feature file
                  lemma,    # string: lemma that filename is about
                  dataset,  # string: train, test
                  split_id) # string: split ID

    split_filename = FredSplitPkg.split_dir(@exp, split_id) +
                     fred_split_filename(lemma)

    # read feature file and split file at the same time,
    # write to tempfile.
    f_feat = File.new(filename)
    f_split = File.new(split_filename)
    f_out = Tempfile.new("fred_split")

    num_yes = 0

    f_feat.each { |line|
      begin
        split_part = f_split.readline().chomp()
      rescue
        $stderr.puts "FredSplit error: split file too short."
        $stderr.puts "Skipping rest of featurization data."
        $stderr.puts "Split file: #{split_filename}"
        $stderr.puts "Feature file: #{filename}"
        f_out.close()
        if num_yes > 0
          return f_out
        else
          return nil
        end
      end

      if split_part == dataset
        # write training data, and this item is in the training
        # part of the split,
        # or write test data, and item is in test part
        f_out.puts line
        num_yes += 1
      end
    }
    f_out.close()
    f_feat.close()
    f_split.close()

    if num_yes > 0
      return f_out
    else
      return nil
    end
  end
end
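For orientation, here is a minimal sketch of how this splitting package could be driven. The split ID, train percentage, file names, and the FredConfigData constructor call are illustrative assumptions; in the gem itself this wiring lives in FredSplit.rb and FredTest.rb.

# Hypothetical driver; file names and values are made up.
require "fred/FredConfigData"
require "fred/FredSplitPkg"

exp = FredConfigData.new("fred.salsa")   # assumed: path to the Fred experiment file

splitter = FredSplitPkg.new(exp)

# compute a new split: ~90% train, ~10% test, one split file per lemma
splitter.make_new_split("split01", 0.9)

# later, e.g. at test time: keep only the lines marked "test" for one lemma
tempfile = splitter.apply_split("run.features.csv", "run", "test", "split01")
if tempfile
  puts "test portion written to #{tempfile.path}"
else
  puts "split contains no test data for this lemma"
end

# discard the split again
FredSplitPkg.remove_split(exp, "split01")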
data/lib/fred/FredTest.rb
@@ -0,0 +1,607 @@
# -*- coding: utf-8 -*-
# FredTest
# Katrin Erk April 05
#
# Frame disambiguation system:
# apply trained classifiers to test data.
# Results are written out, one output line per instance line.

# Ruby packages
require "tempfile"

# Salsa packages
require "common/Parser"
require "common/RegXML"
require "common/SalsaTigerRegXML"
require "common/ruby_class_extensions"

# Shalmaneser packages
require "common/FrPrepConfigData"
require "common/ML"
require "fred/Baseline"
require "fred/FredConventions"
require "fred/FredDetermineTargets"
require "fred/FredSplitPkg"
require "fred/FredFeatures"
require "fred/FredNumTrainingSenses"

class FredTest

  ###
  # new
  #
  # evaluate runtime options and announce the task
  def initialize(exp_obj, # FredConfigData object
                 options) # hash: runtime option name (string) => value (string)

    # keep the experiment file object
    @exp = exp_obj

    # evaluate runtime options
    @split_id = nil
    @baseline = false
    @produce_output = true

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg

      when "--baseline"
        @baseline = true

      when "--nooutput"
        @produce_output = false

      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: "
    if @baseline
      $stderr.print "Computing baseline "
    else
      $stderr.print "Applying classifiers"
    end
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @produce_output and not @split_id
      $stderr.print "Output is to "
      if @exp.get("directory_output")
        $stderr.puts @exp.get("directory_output")
      else
        $stderr.puts fred_dirname(@exp, "output", "stxml", "new")
      end
    end
    $stderr.puts "---------"

    ###
    # prepare data:

    if @baseline
      # only compute baseline: always assign most frequent sense
      @classifiers = [
        [Baseline.new(@exp, @split_id), "baseline"]
      ]

    else
      # determine classifiers
      #
      # get_lf returns: array of pairs [classifier_name, options[array]]
      #
      # @classifiers: list of pairs [Classifier object, classifier name (string)]
      @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
        [Classifier.new(classif_name, options), classif_name]
      }
      # sanity check: we need at least one classifier
      if @classifiers.empty?
        $stderr.puts "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
        exit 1
      end

      if @classifiers.length() > 1
        $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
        $stderr.puts "so I'll be ignoring all but the first classifier type."
      end
    end

    # get an object for listing senses of each lemma
    @lemmas_and_senses = Targets.new(@exp, nil, "r")
  end

  ###
  # compute
  #
  # classify test instances,
  # write output to file.
  def compute()
    if @split_id
      # make split object and parameter hash to pass to it.
      # read feature data from training feature directory.
      split_obj = FredSplitPkg.new(@exp)
      dataset = "train"
    else
      # read feature data from test feature directory.
      dataset = "test"
    end

    output_dir = fred_dirname(@exp, "output", "tab", "new")
    classif_dir = fred_classifier_directory(@exp, @split_id)

    ###
    # remove old classifier output files
    Dir[output_dir + "*"].each { |f|
      if File.exists? f
        File.delete(f)
      end
    }

    all_results = Array.new()

    ###
    # get a list of all relevant feature files: lemma, sense?
    lemma2_sense_and_filename = Hash.new()

    FredFeatureAccess.each_feature_file(@exp, dataset) { |filename, values|

      # catalogue under lemma
      unless lemma2_sense_and_filename[values["lemma"]]
        lemma2_sense_and_filename[values["lemma"]] = Array.new()
      end
      # catalogue only matches between chosen classifier type
      # and actually existing classifier type

      # check here:
      # if senses is nil, lemma2_sense_and_filename does not get filled
      # => no classifiers are found

      if @exp.get("binary_classifiers") and \
         values["sense"] and not(values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [values["sense"], filename]

      elsif not(@exp.get("binary_classifiers")) and \
            (values["sense"].nil? or values["sense"].empty?)
        lemma2_sense_and_filename[values["lemma"]] << [nil, filename]
      end
    }

    ###
    # check whether we have classifiers
    found = 0
    found_single_sense = 0
    lemma2_sense_and_filename.each_pair { |lemma, senses_and_filenames|
      if @lemmas_and_senses.get_senses(lemma).length() == 1
        # lemma with only one sense? then mark as such
        found_single_sense += 1
      else
        # lemma with more than one sense: look for classifiers
        senses_and_filenames.each { |sense, filename|
          @classifiers.each { |classifier, classifier_name|
            if @exp.get("binary_classifiers") and \
               classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                         lemma, sense)
              found += 1
            elsif not(@exp.get("binary_classifiers")) and \
                  classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
                                                                            lemma)
              found += 1
            end
          }
        }
      end
    }
    if found == 0 and found_single_sense < lemma2_sense_and_filename.length()
      # no matching classifiers found
      $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
      if @exp.get("binary_classifiers")
        $stderr.puts "(Looking for binary classifiers.)"
      else
        $stderr.puts "(Looking for n-ary classifiers.)"
      end
      $stderr.puts "Please check whether you mistyped the classifier directory name.

Another possibility: You may have trained binary classifiers, but
tried to apply n-ary ones (or vice versa.)
"
      exit 1
    end

    ###
    # each test feature set:
    # read classifier, apply
    # iterate through instance files
    lemma2_sense_and_filename.to_a().sort { |a, b|
      a.first() <=> b.first()
    }.each { |lemma, senses_and_filenames|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Applying to " + lemma
      end

      # results_this_lemma: array of classifier_results
      # classifier_result: array of line_entries
      # line entry: list of pairs [sense, confidence]
      results_this_lemma = Array.new()

      training_senses = determine_training_senses(lemma, @exp,
                                                  @lemmas_and_senses, @split_id)

      senses_and_filenames.each { |sense, filename|

        # if we're splitting the data, do that now
        if split_obj
          tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
          if tempfile.nil?
            # the test part of the split doesn't contain any data
            $stderr.puts "Skipping #{lemma}: no test data in split"
            next
          end

          filename = tempfile.path()
        end

        if training_senses.length() == 1
          # single-sense lemma: just assign that sense to all occurrences
          assigned_sense = training_senses.first()

          classifier_result = Array.new()
          f = File.open(filename)

          f.each { |line| classifier_result << [[assigned_sense, 1.0]] }
          results_this_lemma << classifier_result

        else
          # more than one sense: apply classifier(s)

          # classifiers_read_okay:
          # boolean, true if reading the stored classifier(s) succeeded
          classifiers_read_okay = true
          @classifiers.each { |classifier, classifier_name|

            stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
                                                                       lemma, sense)
            status = classifier.read(stored_classifier)
            unless status
              $stderr.puts "[FredTest] Error: could not read classifier."
              classifiers_read_okay = false
            end
          }

          if classifiers_read_okay
            # apply classifiers, write result to database
            classifier_results = apply_classifiers(filename, classif_dir)

            if classifier_results.empty?
              # something went wrong during the application of classifiers
              $stderr.puts "Error while working on #{lemma}, skipping"
            else
              # we have classifier results:
              # since we're not doing any classifier combination at the moment
              # (if we did, this would be the place to do so!)
              # discard the results of all but the first classifier
              results_this_lemma << classifier_results.first()
            end
          end

          if split_obj
            tempfile.close(true)
          end
        end
      }

      # write to output file:
      # if we have binary classifiers, join.
      results_this_lemma = join_binary_classifier_results(results_this_lemma)

      outfilename = output_dir + fred_result_filename(lemma)
      begin
        outfile = File.new(outfilename, "w")
      rescue
        raise "Couldn't write to result file " + outfilename
      end

      if results_this_lemma.nil?
        # nothing has been done for this lemma
        next
      end

      results_this_lemma.each { |result|
        # result: an ordered list of pairs [label, confidence]
        outfile.puts result.map { |label, confidence|
          "#{label} #{confidence}"
        }.join(" ")
      }

      # remember results for output
      if @produce_output
        all_results << [lemma, results_this_lemma]
      end
    }


    ##
    # produce output: disambiguated data in SalsaTigerXML format
    if @produce_output
      salsatiger_output(all_results)
    end

  end

  #####
  private

  #########################
  def apply_classifiers(filename,    # name of feature file
                        classif_dir) # string: name of directory with classifiers

    # make output file for classifiers
    tf_output = Tempfile.new("fred")
    tf_output.close()

    ###
    # apply classifiers

    classifier_results = Array.new

    @classifiers.each { |classifier, classifier_name|

      success = classifier.apply(filename, tf_output.path())

      # did we manage to classify the test data?
      # there may be errors on the way (e.g. no training data)
      if success
        # read classifier output from file
        # classifier_results: list of line entries
        # line entry: list of pairs [sense, confidence]
        classifier_results << classifier.read_resultfile(tf_output.path())

      else
        # error: return empty Array, so that error handling can take over
        return Array.new
      end
    }

    # if we are here, all classifiers have succeeded...

    # clean up
    tf_output.close(true)

    # return list of classifier results,
    # each entry is a list of results,
    # one entry per classifier type
    return classifier_results
  end

  ###
  # join binary classifier results (if we are doing binary classifiers):
  # if we have classifiers that are specific to individual senses,
  # collect all classifiers that we have for a lemma, and
  # for each instance, choose the sense that won with the highest confidence
  #
  # input: a list of result lists.
  # a result list is a list of instance_results
  # instance_results is a list of pairs [label, confidence]
  # such that the label with the highest confidence is mentioned first
  #
  # output: a result list.
  def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
    unless @exp.get("binary_classifiers")
      # we are doing lemma-specific, not sense-specific classifiers,
      # so resultlists is a list containing just one entry.
      # all classifiers: list of lists of lists of pairs label, confidence
      # one classifier: list of lists of pairs label, confidence
      # line: list of pairs label, confidence
      # label: pair label, confidence
      return resultlists.first()
    end

    # we are doing sense-specific classifiers.
    # group triples

    # what is the name of the negative sense?
    unless (negsense = @exp.get("negsense"))
      negsense = "NONE"
    end

    # retv: list of instance results
    # where an instance result is a list of pairs [label, confidence]
    retv = Array.new()

    # choose the sense that was assigned with highest confidence
    # how many instances? max. length of any of the instance lists
    # (we'll deal with mismatches in instance numbers later)
    num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length() }.max()
    if num_instances.nil?
      # no instances, it seems
      return nil
    end

    0.upto(num_instances - 1) { |instno|

      # get the results of all classifiers for instance number instno
      all_results_this_instance = resultlists.map { |list_one_classifier|
        # get the instno-th line
        if list_one_classifier.at(instno)
          list_one_classifier.at(instno)
        else
          # length mismatch: we're missing an instance
          $stderr.puts "Error: binary classifier results don't all have the same length."
          $stderr.puts "Assuming missing results to be negative."
          [["NONE", 1.0]]
        end
      }

      # now throw out the negsense judgments, and sort results by confidence
      joint_result_this_instance = all_results_this_instance.map { |inst_result|
        # if we have more than 2 entries here,
        # this is very weird for a binary classifier
        if inst_result.length() > 2
          $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
          $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
          $stderr.puts "Only considering the first non-negative sense."
        end

        # choose the first entry that is not the negsense,
        # or nil, if only the negative sense has been assigned with 1.0 certainty.
        # nil choices will be removed by the compact() below
        inst_result.detect { |label, confidence|
          label != negsense
        }
      }.compact().sort { |a, b|
        # sort senses by confidence, highest confidence first
        b[1] <=> a[1]
      }

      retv << joint_result_this_instance
    }

    return retv
  end


  ###
  # produce output in SalsaTigerXML: disambiguated training data,
  # assigned senses are recorded as frames, the targets of which are the
  # disambiguated words
  def salsatiger_output(all_results)

    if @split_id
      # we're not writing Salsa/Tiger XML output for splits.
      $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
      $stderr.puts "only for separate test sets."
      return
    end

    ##
    # determine output directory
    if @exp.get("directory_output")
      output_dir = File.new_dir(@exp.get("directory_output"))
    else
      output_dir = fred_dirname(@exp, "output", "stxml", "new")
    end

    $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"

    ##
    # empty output directory
    Dir[output_dir + "*"].each { |filename|
      if File.exists?(filename)
        File.delete(filename)
      end
    }

    # input directory: where we stored the zipped input files
    input_dir = fred_dirname(@exp, "test", "input_data")

    ##
    # map results to target IDs, using answer key files

    # record results: hash
    # <sentence ID> (string) -> assigned senses,
    # where assigned senses are a list of tuples
    # [target IDs, sense, lemma, pos]
    recorded_results = Hash.new

    all_results.each { |lemma, results|
      answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")

      instance_index = 0
      answer_obj.each { |a_lemma, a_pos, a_targetIDs, a_sid, a_senses, a_senses_this|
        key = a_sid

        unless recorded_results[key]
          recorded_results[key] = Array.new()
        end

        labels_and_senses_for_this_instance = results.at(instance_index)
        if not(labels_and_senses_for_this_instance.empty?) and
           (winning_sense = labels_and_senses_for_this_instance.first().first())

          recorded_results[key] << [a_targetIDs, winning_sense, a_lemma, a_pos]
        end

        instance_index += 1
      } # each answer key line for this lemma
    } # each lemma/results pair


    ##
    # read in SalsaTiger syntax, remove old semantics, add new semantics, write

    Dir[input_dir + "*.xml.gz"].each { |filename|
      # unzip input file
      tempfile = Tempfile.new("FredTest")
      tempfile.close()
      %x{gunzip -c #{filename} > #{tempfile.path()}}

      infile = FilePartsParser.new(tempfile.path())
      if @exp.get("verbose")
        $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
      end

      begin
        outfile = File.new(output_dir + File.basename(filename, ".gz"), "w")
      rescue
        $stderr.puts "Couldn't write to output file #{output_dir}#{File.basename(filename)}."
        $stderr.puts "Skipping Salsa/Tiger XML output."
        return
      end

      # write header
      outfile.puts infile.head()

      infile.scan_s { |sent_string|
        sent = SalsaTigerSentence.new(sent_string)

        # remove old semantics
        sent.remove_semantics()

        if recorded_results and recorded_results[sent.id()]
          recorded_results[sent.id()].each { |target_ids, sense, lemma, pos|

            # add frame to sentence
            new_frame = sent.add_frame(sense)

            # get list of target nodes from target IDs,
            # assuming that target_ids is a string of target IDs
            # separated by commas.
            # IDs for which no node could be found are just ignored

            targets = target_ids.map { |target_id|
              sent.syn_node_with_id(target_id)
            }.compact
            # enter the target nodes for this new frame
            new_frame.add_fe("target", targets)

            # put lemma and POS info into <target>
            new_frame.target.set_attribute("lemma", lemma)
            new_frame.target.set_attribute("pos", pos)
          }
        end

        # write changed sentence:
        # only if there are recorded results for this sentence!
        outfile.puts sent.get()

      } # each sentence of file

      # write footer
      outfile.puts infile.tail()
      outfile.close()
      tempfile.close(true)
    } # each SalsaTiger file of the input directory

  end

end
108
+ end
109
+
110
+
111
+ if @classifiers.length() > 1
112
+ $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
113
+ $stderr.puts "so I'll be ignoring all but the first classifier type."
114
+ end
115
+ end
116
+
117
+ # get an object for listing senses of each lemma
118
+ @lemmas_and_senses = Targets.new(@exp, nil, "r")
119
+ end
120
+
121
+ ###
122
+ # compute
123
+ #
124
+ # classify test instances,
125
+ # write output to file.
126
+ def compute()
127
+ if @split_id
128
+ # make split object and parameter hash to pass to it.
129
+ # read feature data from training feature directory.
130
+ split_obj = FredSplitPkg.new(@exp)
131
+ dataset = "train"
132
+ else
133
+ # read feature data from test feature directory.
134
+ dataset = "test"
135
+ end
136
+
137
+ output_dir = fred_dirname(@exp, "output", "tab", "new")
138
+ classif_dir = fred_classifier_directory(@exp, @split_id)
139
+
140
+ ###
141
+ # remove old classifier output files
142
+ Dir[output_dir + "*"].each { |f|
143
+ if File.exists? f
144
+ File.delete(f)
145
+ end
146
+ }
147
+
148
+
149
+ all_results = Array.new()
150
+
151
+ ###
152
+ # get a list of all relevant feature files: lemma, sense?
153
+ lemma2_sense_and_filename = Hash.new()
154
+
155
+ FredFeatureAccess.each_feature_file(@exp, dataset) { |filename, values|
156
+
157
+ # catalogue under lemma
158
+ unless lemma2_sense_and_filename[values["lemma"]]
159
+ lemma2_sense_and_filename[values["lemma"]] = Array.new()
160
+ end
161
+ # catalogue only matches between chosen classifier type
162
+ # and actually existing classifier type
163
+
164
+ # hier checken
165
+ # senses ist nil, lemma2_sense_and_filename wird nicht gefüllt
166
+ # => es werden keine classifier gefunden
167
+
168
+
169
+ if @exp.get("binary_classifiers") and \
170
+ values["sense"] and not(values["sense"].empty?)
171
+ lemma2_sense_and_filename[values["lemma"]] << [values["sense"], filename]
172
+
173
+ elsif not(@exp.get("binary_classifiers")) and \
174
+ (values["sense"].nil? or values["sense"].empty?)
175
+ lemma2_sense_and_filename[values["lemma"]] << [nil, filename]
176
+ end
177
+ }
178
+
179
+ ###
180
+ # check whether we have classifiers
181
+ found = 0
182
+ found_single_sense = 0
183
+ lemma2_sense_and_filename.each_pair { |lemma, senses_and_filenames|
184
+ if @lemmas_and_senses.get_senses(lemma).length() == 1
185
+ # lemma with only one sense? then mark as such
186
+ found_single_sense += 1
187
+ else
188
+ # lemma with more than one sense: look for classifiers
189
+ senses_and_filenames.each { |sense, filename|
190
+ @classifiers.each { |classifier, classifier_name|
191
+ if @exp.get("binary_classifiers") and \
192
+ classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
193
+ lemma, sense)
194
+ found += 1
195
+ elsif not(@exp.get("binary_classifiers")) and\
196
+ classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
197
+ lemma)
198
+ found += 1
199
+ end
200
+ }
201
+ }
202
+ end
203
+ }
204
+ if found == 0 and found_single_sense < lemma2_sense_and_filename.length()
205
+ # no matching classifiers found
206
+ $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
207
+ if @exp.get("binary_classifiers")
208
+ $stderr.puts "(Looking for binary classifiers.)"
209
+ else
210
+ $stderr.puts "(Looking for n-ary classifiers.)"
211
+ end
212
+ $stderr.puts "Please check whether you mistyped the classifier directory name.
213
+
214
+ Another possibility: You may have trained binary classifiers, but
215
+ tried to apply n-ary ones (or vice versa.)
216
+ "
217
+ exit 1
218
+ end
219
+
220
+ ###
221
+ # each test feature set:
222
+ # read classifier, apply
223
+ # iterate through instance files
224
+ lemma2_sense_and_filename.to_a().sort { |a, b|
225
+ a.first() <=> b.first
226
+ }.each { |lemma, senses_and_filenames|
227
+ # progress report
228
+ if @exp.get("verbose")
229
+ $stderr.puts "Applying to " + lemma
230
+ end
231
+
232
+ # results_this_lemma: array of classifier_results
233
+ # classifier_result: array of line_entries
234
+ # line entry: list of pairs [sense, confidence]
235
+ results_this_lemma = Array.new()
236
+
237
+ training_senses = determine_training_senses(lemma, @exp,
238
+ @lemmas_and_senses, @split_id)
239
+
240
+ senses_and_filenames.each { |sense, filename|
241
+
242
+ # if we're splitting the data, do that now
243
+ if split_obj
244
+ tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
245
+ if tempfile.nil?
246
+ # the test part of the split doesn't contain any data
247
+ $stderr.puts "Skipping #{lemma}: no test data in split"
248
+ next
249
+ end
250
+
251
+ filename = tempfile.path()
252
+ end
253
+
254
+ if training_senses.length() == 1
255
+ # single-sense lemma: just assign that sense to all occurrences
256
+ assigned_sense = training_senses.first()
257
+
258
+ classifier_result = Array.new()
259
+ f = File.open(filename)
260
+
261
+ f.each { |line| classifier_result << [[assigned_sense, 1.0]] }
262
+ results_this_lemma << classifier_result
263
+
264
+ else
265
+ #more than one sense: apply classifier(s)
266
+
267
+ # classifiers_read_okay:
268
+ # boolean, true if reading the stored classifier(s) succeeded
269
+ classifiers_read_okay = true
270
+ @classifiers.each { |classifier, classifier_name|
271
+
272
+ stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
273
+ lemma, sense)
274
+ status = classifier.read(stored_classifier)
275
+ unless status
276
+ $stderr.puts "[FredTest] Error: could not read classifier."
277
+ classifiers_read_okay = false
278
+ end
279
+ }
280
+
281
+ if classifiers_read_okay
282
+ # apply classifiers, write result to database
283
+ classifier_results = apply_classifiers(filename, classif_dir)
284
+
285
+ if classifier_results.empty?
286
+ # something went wrong during the application of classifiers
287
+ $stderr.puts "Error while working on #{lemma}, skipping"
288
+ else
289
+ # we have classifier results:
290
+ # since we're not doing any classifier combination at the moment
291
+ # (if we did, this would be the place to do so!)
292
+ # discard the results of all but the first classifier
293
+ results_this_lemma << classifier_results.first()
294
+ end
295
+ end
296
+
297
+ if split_obj
298
+ tempfile.close(true)
299
+ end
300
+ end
301
+ }
302
+
303
+ # write to output file:
304
+ # if we have binary classifiers, join.
305
+ results_this_lemma = join_binary_classifier_results(results_this_lemma)
306
+
307
+ outfilename = output_dir + fred_result_filename(lemma)
308
+ begin
309
+ outfile = File.new(outfilename, "w")
310
+ rescue
311
+ raise "Couldn't write to result file " + outfilename
312
+ end
313
+
314
+ if results_this_lemma.nil?
315
+ # nothing has been done for this lemma
316
+ next
317
+ end
318
+
319
+ results_this_lemma.each { |result|
320
+ # result: an ordered list of pairs [label, confidence]
321
+ outfile.puts result.map { |label, confidence|
322
+ "#{label} #{confidence}"
323
+ }.join(" ")
324
+ }
325
+
326
+ # remember results for output
327
+ if @produce_output
328
+ all_results << [lemma, results_this_lemma]
329
+ end
330
+ }
331
+
332
+
333
+ ##
334
+ # produce output: disambiguated data in SalsaTigerXML format
335
+ if @produce_output
336
+ salsatiger_output(all_results)
337
+ end
338
+
339
+ end
340
+
341
+ #####
342
+ private
343
+
344
+ #########################
345
+ def apply_classifiers(filename, # name of feature file
346
+ classif_dir) # string: name of directory with classifiers
347
+
348
+ # make output file for classifiers
349
+ tf_output = Tempfile.new("fred")
350
+ tf_output.close()
351
+
352
+ ###
353
+ # apply classifiers
354
+
355
+ classifier_results = Array.new
356
+
357
+ @classifiers.each { |classifier, classifier_name|
358
+
359
+ success = classifier.apply(filename, tf_output.path())
360
+
361
+ # did we manage to classify the test data?
362
+ # there may be errors on the way (eg no training data)
363
+ if success
364
+ # read classifier output from file
365
+ # classifier_results: list of line entries
366
+ # line entry: list of pairs [sense, confidence]
367
+ classifier_results << classifier.read_resultfile(tf_output.path())
368
+
369
+ else
370
+ # error: return empty Array, so that error handling can take over
371
+ return Array.new
372
+ end
373
+ }
374
+
375
+ # if we are here, all classifiers have succeeded...
376
+
377
+ # clean up
378
+ tf_output.close(true)
379
+
380
+ # return list of classifier results,
381
+ # each entry is a list of results,
382
+ # one entry per classifier type
383
+ return classifier_results
384
+ end
385
+
386
+ ###
387
+ # join binary classifier results (if we are doing binary classifiers):
388
+ # if we have classifiers that are specific to individual senses,
389
+ # collect all classifiers that we have for a lemma, and
390
+ # for each instance, choose the sense that won with the highest confidence
391
+ #
392
+ # input: a list of result lists.
393
+ # a result list is a list of instance_results
394
+ # instance_results is a list of pairs [label, confidence]
395
+ # such that the label with the highest confidence is mentioned first
396
+ #
397
+ # output: a result list.
398
+ def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
399
+ unless @exp.get("binary_classifiers")
400
+ # we are doing lemma-specific, not sense-specific classifiers.
401
+ # so resultlist is a list containing just one entry.
402
+ # all classifier: list of lists of lists of pairs label, confidence
403
+ # one classifier: list of lists of pairs label, confidence
404
+ # line: list of pairs label, confidence
405
+ # label: pair label, confidence
406
+ return resultlists.first()
407
+ end
408
+
409
+ # we are doing sense-specific classifiers.
410
+ # group triples
411
+
412
+ # what is the name of the negative sense?
413
+ unless (negsense = @exp.get("negsense"))
414
+ negsense = "NONE"
415
+ end
416
+
417
+ # retv: list of instance results
418
+ # where an instance result is a list of pairs [label, confidence]
419
+ retv = Array.new()
420
+
421
+ # choose the sense that was assigned with highest confidence
422
+ # how many instances? max. length of any of the instance lists
423
+ # (we'll deal with mismatches in instance numbers later)
424
+ num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length() }.max()
425
+ if num_instances.nil?
426
+ # no instances, it seems
427
+ return nil
428
+ end
429
+
430
+ 0.upto(num_instances - 1) { |instno|
431
+
432
+ # get the results of all classifiers for instance number instno
433
+ all_results_this_instance = resultlists.map { |list_one_classifier|
434
+ # get the instno-th line
435
+ if list_one_classifier.at(instno)
436
+ list_one_classifier.at(instno)
437
+ else
438
+ # length mismatch: we're missing an instance
439
+ $stderr.puts "Error: binary classifier results don't all have the same length."
440
+ $stderr.puts "Assuming missing results to be negative."
441
+ [["NONE", 1.0]]
442
+ end
443
+ }
444
+
445
+ # now throw out the negsense judgments, and sort results by confidence
446
+ joint_result_this_instance = all_results_this_instance.map { |inst_result|
447
+ # if we have more than 2 entries here,
448
+ # this is very weird for a binary classifier
449
+ if inst_result.length() > 2
450
+ $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
451
+ $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
452
+ $stderr.puts "Only considering the first non-negative sense."
453
+ end
454
+
455
+ # choose the first entry that is not the negsense,
456
+ # or nil, if only the negative sense has been assigned with 1.0 certainty.
457
+ # nil choices will be removed by the compact() below
458
+ inst_result.detect { |label, confidence|
459
+ label != negsense
460
+ }
461
+ }.compact().sort { |a, b|
462
+ # sort senses by confidence, highest confidence first
463
+ b[1] <=> a[1]
464
+ }
465
+
466
+ retv << joint_result_this_instance
467
+ }
468
+
469
+ return retv
470
+ end
471
+
472
+
473
+ ###
474
+ # produce output in SalsaTigerXML: disambiguated training data,
475
+ # assigned senses are recorded as frames, the targets of which are the
476
+ # disambiguated words
477
+ def salsatiger_output(all_results)
478
+
479
+ if @split_id
480
+ # we're not writing Salsa/Tiger XML output for splits.
481
+ $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
482
+ $stderr.puts "only for separate test sets."
483
+ return
484
+ end
485
+
486
+ ##
487
+ # determine output directory
488
+ if @exp.get("directory_output")
489
+ output_dir = File.new_dir(@exp.get("directory_output"))
490
+ else
491
+ output_dir = fred_dirname(@exp, "output", "stxml", "new")
492
+ end
493
+
494
+ $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"
495
+
496
+ ##
497
+ # empty output directory
498
+ Dir[output_dir + "*"].each { |filename|
499
+ if File.exists?(filename)
500
+ File.delete(filename)
501
+ end
502
+ }
503
+
504
+ # input directory: where we stored the zipped input files
505
+ input_dir = fred_dirname(@exp, "test", "input_data")
506
+
507
+ ##
508
+ # map results to target IDs, using answer key files
509
+
510
+ # record results: hash
511
+ # <sentencde ID>(string) -> assigned senses
512
+ # where assigned senses are a list of tuples
513
+ # [target IDs, sense, lemma, pos]
514
+ recorded_results = Hash.new
515
+
516
+ all_results.each { |lemma, results|
517
+ answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")
518
+
519
+ instance_index = 0
520
+ answer_obj.each { |a_lemma, a_pos, a_targetIDs, a_sid, a_senses, a_senses_this|
521
+ key = a_sid
522
+
523
+ unless recorded_results[key]
524
+ recorded_results[key] = Array.new()
525
+ end
526
+
527
+ labels_and_senses_for_this_instance = results.at(instance_index)
528
+ if not(labels_and_senses_for_this_instance.empty?) and
529
+ (winning_sense = labels_and_senses_for_this_instance.first().first())
530
+
531
+ recorded_results[key] << [a_targetIDs, winning_sense, a_lemma, a_pos]
532
+ end
533
+
534
+ instance_index += 1
535
+ } # each answerkey line for this lemma
536
+ } # each lemma/results pair
537
+
538
+
539
+ ##
540
+ # read in SalsaTiger syntax, remove old semantics, add new semantics, write
541
+
542
+ Dir[input_dir + "*.xml.gz"].each { |filename|
543
+ # unzip input file
544
+ tempfile = Tempfile.new("FredTest")
545
+ tempfile.close()
546
+ %x{gunzip -c #{filename} > #{tempfile.path()}}
547
+
548
+ infile = FilePartsParser.new(tempfile.path())
549
+ if @exp.get("verbose")
550
+ $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
551
+ end
552
+
553
+ begin
554
+ outfile = File.new(output_dir + File.basename(filename, ".gz"), "w")
555
+ rescue
556
+ $stderr.puts "Couldn't write to output file #{output_dir}#{File.basename(filename)}."
557
+ $stderr.puts "Skipping Salsa/Tiger XML output."
558
+ return
559
+ end
560
+
561
+ # write header
562
+ outfile.puts infile.head()
563
+
564
+ infile.scan_s { |sent_string|
565
+ sent = SalsaTigerSentence.new(sent_string)
566
+
567
+ # remove old semantics
568
+ sent.remove_semantics()
569
+
570
+ if recorded_results and recorded_results[sent.id()]
571
+ recorded_results[sent.id()].each { |target_ids, sense, lemma, pos|
572
+
573
+ # add frame to sentence
574
+ new_frame = sent.add_frame(sense)
575
+
576
+ # get list of target nodes from target IDs
577
+ # assuming that target_ids is a string of target IDs
578
+ # separated by comma.
579
+ # IDs for which no node could be found are just ignored
580
+
581
+ targets = target_ids.map { |target_id|
582
+ sent.syn_node_with_id(target_id)
583
+ }.compact
584
+ # enter the target nodes for this new frame
585
+ new_frame.add_fe("target", targets)
586
+
587
+ # put lemma and POS info into <target>
588
+ new_frame.target.set_attribute("lemma", lemma)
589
+ new_frame.target.set_attribute("pos", pos)
590
+ }
591
+ end
592
+
593
+ # write changed sentence:
594
+ # only if there are recorded results for this sentence!
595
+ outfile.puts sent.get()
596
+
597
+ } # each sentence of file
598
+
599
+ # write footer
600
+ outfile.puts infile.tail()
601
+ outfile.close()
602
+ tempfile.close(true)
603
+ } # each SalsaTiger file of the input directory
604
+
605
+ end
606
+
607
+ end