frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,180 @@
1
+ ##
2
+ # splitting package for WSD:
3
+ # compute a split for feature files (one item a line, CSV),
4
+ # and apply pre-computed split
5
+ # to produce new feature files accordingly
6
+
7
+ require "tempfile"
8
+
9
+ require "fred/FredDetermineTargets"
10
+ require "fred/FredConventions"
11
+
12
+ class FredSplitPkg
13
+ ###
14
+ def initialize(exp)
15
+ @exp = exp
16
+ end
17
+
18
+ ###
19
+ def FredSplitPkg.split_dir(exp, split_id, mode = "existing")
20
+ return fred_dirname(exp, "split", split_id, mode)
21
+ end
22
+
23
+ ###
24
+ # make a new split
25
+ def make_new_split(split_id, # string: ID
26
+ trainpercent, # float: percentage training data
27
+ ignore_unambiguous = false)
28
+
29
+ # where to store the split?
30
+ split_dir = FredSplitPkg.split_dir(@exp, split_id, "new")
31
+
32
+ lemmas_and_senses = Targets.new(@exp, nil, "r")
33
+ unless lemmas_and_senses.targets_okay
34
+ # error during initialization
35
+ $stderr.puts "Error: Could not read list of known targets, bailing out."
36
+ exit 1
37
+ end
38
+
39
+ # Iterate through lemmas,
40
+ # split training feature files.
41
+ #
42
+ # Do the split only once per lemma,
43
+ # even if we have sense-specific feature files
44
+ feature_dir = fred_dirname(@exp, "train", "features")
45
+
46
+ lemmas_and_senses.get_lemmas().each { |lemma|
47
+ # construct split file
48
+ splitfilename = split_dir + fred_split_filename(lemma)
49
+ begin
50
+ splitfile = File.new(splitfilename, "w")
51
+ rescue
52
+ raise "Error: Couldn't write to file " + splitfilename
53
+ end
54
+
55
+ # find lemma-specific feature file
56
+
57
+ filename = feature_dir + fred_feature_filename(lemma)
58
+
59
+ unless File.exists?(filename)
60
+ # try lemma+sense-specific feature file
61
+ file_pattern = fred_feature_filename(lemma, "*", true)
62
+ filename = Dir[feature_dir + file_pattern].first()
63
+
64
+ unless filename
65
+ # no lemma+sense-specific feature file
66
+ $stderr.puts "Warning: split: no feature file found for #{lemma}, skipping."
67
+ splitfile.close()
68
+ next
69
+ end
70
+ end
71
+
72
+ # open feature file for reading
73
+ begin
74
+ file = File.new(filename)
75
+ rescue
76
+ raise "Couldn't read feature file " + filename
77
+ end
78
+
79
+ if ignore_unambiguous and
80
+ lemmas_and_senses.get_senses(lemma).length() < 2
81
+ # unambiguous: ignore
82
+
83
+ while file.gets()
84
+ splitfile.puts "ignore"
85
+ end
86
+
87
+ else
88
+ # read from feature file, classify at random
89
+ # as train or test,
90
+ # write result to splitfile
91
+
92
+ while file.gets()
93
+ if rand() < trainpercent
94
+ splitfile.puts "train"
95
+ else
96
+ splitfile.puts "test"
97
+ end
98
+ end
99
+ end
100
+
101
+ splitfile.close()
102
+ }
103
+ end
104
+
105
+ ###
106
+ # remove an old split
107
+ def FredSplitPkg.remove_split(exp, # FredConfigData object
108
+ splitID) # string: split ID
109
+ begin
110
+ split_dir = FredSplitPkg.split_dir(exp, splitID, "new")
111
+ rescue
112
+ # no split to be removed
113
+ return
114
+ end
115
+ %x{rm -rf #{split_dir}}
116
+ end
117
+
118
+
119
+ ###
120
+ # change feature files according to
121
+ # pre-computed split
122
+ #
123
+ #
124
+ # returns: tempfile containing featurized items,
125
+ # according to split,
126
+ # or nil if the split file wouldn't contain any data
127
+ def apply_split(filename, # feature file
128
+ lemma, # string: lemma that filename is about
129
+ dataset, # string: train, test
130
+ split_id) # string: split ID
131
+
132
+
133
+ split_filename = FredSplitPkg.split_dir(@exp, split_id) +
134
+ fred_split_filename(lemma)
135
+
136
+ # read feature file and split file at the same time
137
+ # write to tempfile.
138
+ f_feat = File.new(filename)
139
+ f_split = File.new(split_filename)
140
+ f_out = Tempfile.new("fred_split")
141
+
142
+ num_yes = 0
143
+
144
+ f_feat.each { |line|
145
+ begin
146
+ split_part = f_split.readline().chomp()
147
+ rescue
148
+ $stderr.puts "FredSplit error: split file too short."
149
+ $stderr.puts "skipping rest of featurization data."
150
+ $stderr.puts "Split file: #{split_filename}"
151
+ $stderr.puts "Feature file: #{filename}"
152
+ raise "HIER"
153
+ f_out.close()
154
+ if num_yes > 0
155
+ return f_out
156
+ else
157
+ return nil
158
+ end
159
+ end
160
+
161
+ if split_part == dataset
162
+ # write training data, and this item is in the training
163
+ # part of the split,
164
+ # or write test data, and item is in test part
165
+ f_out.puts line
166
+ num_yes += 1
167
+ end
168
+ }
169
+ f_out.close()
170
+ f_feat.close()
171
+ f_split.close()
172
+
173
+ if num_yes > 0
174
+ return f_out
175
+ else
176
+ return nil
177
+ end
178
+
179
+ end
180
+ end
@@ -0,0 +1,607 @@
1
+ # -*- coding: utf-8 -*-
2
+ # FredTest
3
+ # Katrin Erk April 05
4
+ #
5
+ # Frame disambiguation system:
6
+ # apply trained classifiers to test data
7
+ # Results are written out one output line per instance line.
8
+
9
+ # Ruby packages
10
+ require "tempfile"
11
+
12
+ # Salsa packages
13
+ require "common/Parser"
14
+ require "common/RegXML"
15
+ require "common/SalsaTigerRegXML"
16
+ require "common/ruby_class_extensions"
17
+
18
+ # Shalmaneser packages
19
+ require "common/FrPrepConfigData"
20
+ require "common/ML"
21
+ require "fred/Baseline"
22
+ require "fred/FredConventions"
23
+ require "fred/FredDetermineTargets"
24
+ require "fred/FredSplitPkg"
25
+ require "fred/FredFeatures"
26
+ require "fred/FredNumTrainingSenses"
27
+
28
+ class FredTest
29
+
30
+ ###
31
+ # new
32
+ #
33
+ # evaluate runtime options and announce the task
34
+ def initialize(exp_obj, # FredConfigData object
35
+ options) # hash: runtime option name (string) => value(string)
36
+
37
+ # keep the experiment file object
38
+ @exp = exp_obj
39
+
40
+ # evaluate runtime options
41
+ @split_id = nil
42
+ @baseline = false
43
+ @produce_output = true
44
+
45
+ options.each_pair { |opt, arg|
46
+ case opt
47
+ when "--logID"
48
+
49
+ @split_id = arg
50
+
51
+ when "--baseline"
52
+ @baseline = true
53
+
54
+ when "--nooutput"
55
+ @produce_output = false
56
+
57
+ else
58
+ # case of unknown arguments has been dealt with by fred.rb
59
+ end
60
+ }
61
+
62
+ # announce the task
63
+ $stderr.puts "---------"
64
+ $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: "
65
+ if @baseline
66
+ $stderr.print "Computing baseline "
67
+ else
68
+ $stderr.print "Applying classifiers"
69
+ end
70
+ if @split_id
71
+ $stderr.puts " using split with ID #{@split_id}"
72
+ else
73
+ $stderr.puts
74
+ end
75
+ if @produce_output and not @split_id
76
+ $stderr.print "Output is to "
77
+ if @exp.get("directory_output")
78
+ $stderr.puts @exp.get("directory_output")
79
+ else
80
+ $stderr.puts fred_dirname(@exp, "output", "stxml", "new")
81
+ end
82
+ end
83
+ $stderr.puts "---------"
84
+
85
+ ###
86
+ # prepare data:
87
+
88
+ if @baseline
89
+ # only compute baseline: always assign most frequent sense
90
+
91
+ @classifiers = [
92
+ [Baseline.new(@exp, @split_id), "baseline"]
93
+ ]
94
+
95
+ else
96
+ # determine classifiers
97
+ #
98
+ # get_lf returns: array of pairs [classifier_name, options[array]]
99
+ #
100
+ # @classifiers: list of pairs [Classifier object, classifier name(string)]
101
+ @classifiers = @exp.get_lf("classifier").map { |classif_name, options|
102
+ [Classifier.new(classif_name, options), classif_name]
103
+ }
104
+ # sanity check: we need at least one classifier
105
+ if @classifiers.empty?
106
+ $stderr.puts "Error: I need at least one classifier, please specify using exp. file option 'classifier'"
107
+ exit 1
108
+ end
109
+
110
+
111
+ if @classifiers.length() > 1
112
+ $stderr.puts "Warning: I'm not doing classifier combination at the moment,"
113
+ $stderr.puts "so I'll be ignoring all but the first classifier type."
114
+ end
115
+ end
116
+
117
+ # get an object for listing senses of each lemma
118
+ @lemmas_and_senses = Targets.new(@exp, nil, "r")
119
+ end
120
+
121
+ ###
122
+ # compute
123
+ #
124
+ # classify test instances,
125
+ # write output to file.
126
+ def compute()
127
+ if @split_id
128
+ # make split object and parameter hash to pass to it.
129
+ # read feature data from training feature directory.
130
+ split_obj = FredSplitPkg.new(@exp)
131
+ dataset = "train"
132
+ else
133
+ # read feature data from test feature directory.
134
+ dataset = "test"
135
+ end
136
+
137
+ output_dir = fred_dirname(@exp, "output", "tab", "new")
138
+ classif_dir = fred_classifier_directory(@exp, @split_id)
139
+
140
+ ###
141
+ # remove old classifier output files
142
+ Dir[output_dir + "*"].each { |f|
143
+ if File.exists? f
144
+ File.delete(f)
145
+ end
146
+ }
147
+
148
+
149
+ all_results = Array.new()
150
+
151
+ ###
152
+ # get a list of all relevant feature files: lemma, sense?
153
+ lemma2_sense_and_filename = Hash.new()
154
+
155
+ FredFeatureAccess.each_feature_file(@exp, dataset) { |filename, values|
156
+
157
+ # catalogue under lemma
158
+ unless lemma2_sense_and_filename[values["lemma"]]
159
+ lemma2_sense_and_filename[values["lemma"]] = Array.new()
160
+ end
161
+ # catalogue only matches between chosen classifier type
162
+ # and actually existing classifier type
163
+
164
+ # TODO: check this
165
+ # senses is nil, lemma2_sense_and_filename is not being filled
166
+ # => no classifiers are found
167
+
168
+
169
+ if @exp.get("binary_classifiers") and \
170
+ values["sense"] and not(values["sense"].empty?)
171
+ lemma2_sense_and_filename[values["lemma"]] << [values["sense"], filename]
172
+
173
+ elsif not(@exp.get("binary_classifiers")) and \
174
+ (values["sense"].nil? or values["sense"].empty?)
175
+ lemma2_sense_and_filename[values["lemma"]] << [nil, filename]
176
+ end
177
+ }
178
+
179
+ ###
180
+ # check whether we have classifiers
181
+ found = 0
182
+ found_single_sense = 0
183
+ lemma2_sense_and_filename.each_pair { |lemma, senses_and_filenames|
184
+ if @lemmas_and_senses.get_senses(lemma).length() == 1
185
+ # lemma with only one sense? then mark as such
186
+ found_single_sense += 1
187
+ else
188
+ # lemma with more than one sense: look for classifiers
189
+ senses_and_filenames.each { |sense, filename|
190
+ @classifiers.each { |classifier, classifier_name|
191
+ if @exp.get("binary_classifiers") and \
192
+ classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
193
+ lemma, sense)
194
+ found += 1
195
+ elsif not(@exp.get("binary_classifiers")) and\
196
+ classifier.exists? classif_dir + fred_classifier_filename(classifier_name,
197
+ lemma)
198
+ found += 1
199
+ end
200
+ }
201
+ }
202
+ end
203
+ }
204
+ if found == 0 and found_single_sense < lemma2_sense_and_filename.length()
205
+ # no matching classifiers found
206
+ $stderr.puts "ERROR: no classifiers found in #{classif_dir}."
207
+ if @exp.get("binary_classifiers")
208
+ $stderr.puts "(Looking for binary classifiers.)"
209
+ else
210
+ $stderr.puts "(Looking for n-ary classifiers.)"
211
+ end
212
+ $stderr.puts "Please check whether you mistyped the classifier directory name.
213
+
214
+ Another possibility: You may have trained binary classifiers, but
215
+ tried to apply n-ary ones (or vice versa.)
216
+ "
217
+ exit 1
218
+ end
219
+
220
+ ###
221
+ # each test feature set:
222
+ # read classifier, apply
223
+ # iterate through instance files
224
+ lemma2_sense_and_filename.to_a().sort { |a, b|
225
+ a.first() <=> b.first
226
+ }.each { |lemma, senses_and_filenames|
227
+ # progress report
228
+ if @exp.get("verbose")
229
+ $stderr.puts "Applying to " + lemma
230
+ end
231
+
232
+ # results_this_lemma: array of classifier_results
233
+ # classifier_result: array of line_entries
234
+ # line entry: list of pairs [sense, confidence]
235
+ results_this_lemma = Array.new()
236
+
237
+ training_senses = determine_training_senses(lemma, @exp,
238
+ @lemmas_and_senses, @split_id)
239
+
240
+ senses_and_filenames.each { |sense, filename|
241
+
242
+ # if we're splitting the data, do that now
243
+ if split_obj
244
+ tempfile = split_obj.apply_split(filename, lemma, "test", @split_id)
245
+ if tempfile.nil?
246
+ # the test part of the split doesn't contain any data
247
+ $stderr.puts "Skipping #{lemma}: no test data in split"
248
+ next
249
+ end
250
+
251
+ filename = tempfile.path()
252
+ end
253
+
254
+ if training_senses.length() == 1
255
+ # single-sense lemma: just assign that sense to all occurrences
256
+ assigned_sense = training_senses.first()
257
+
258
+ classifier_result = Array.new()
259
+ f = File.open(filename)
260
+
261
+ f.each { |line| classifier_result << [[assigned_sense, 1.0]] }
262
+ results_this_lemma << classifier_result
263
+
264
+ else
265
+ #more than one sense: apply classifier(s)
266
+
267
+ # classifiers_read_okay:
268
+ # boolean, true if reading the stored classifier(s) succeeded
269
+ classifiers_read_okay = true
270
+ @classifiers.each { |classifier, classifier_name|
271
+
272
+ stored_classifier = classif_dir + fred_classifier_filename(classifier_name,
273
+ lemma, sense)
274
+ status = classifier.read(stored_classifier)
275
+ unless status
276
+ $stderr.puts "[FredTest] Error: could not read classifier."
277
+ classifiers_read_okay = false
278
+ end
279
+ }
280
+
281
+ if classifiers_read_okay
282
+ # apply classifiers, write result to database
283
+ classifier_results = apply_classifiers(filename, classif_dir)
284
+
285
+ if classifier_results.empty?
286
+ # something went wrong during the application of classifiers
287
+ $stderr.puts "Error while working on #{lemma}, skipping"
288
+ else
289
+ # we have classifier results:
290
+ # since we're not doing any classifier combination at the moment
291
+ # (if we did, this would be the place to do so!)
292
+ # discard the results of all but the first classifier
293
+ results_this_lemma << classifier_results.first()
294
+ end
295
+ end
296
+
297
+ if split_obj
298
+ tempfile.close(true)
299
+ end
300
+ end
301
+ }
302
+
303
+ # write to output file:
304
+ # if we have binary classifiers, join.
305
+ results_this_lemma = join_binary_classifier_results(results_this_lemma)
306
+
307
+ outfilename = output_dir + fred_result_filename(lemma)
308
+ begin
309
+ outfile = File.new(outfilename, "w")
310
+ rescue
311
+ raise "Couldn't write to result file " + outfilename
312
+ end
313
+
314
+ if results_this_lemma.nil?
315
+ # nothing has been done for this lemma
316
+ next
317
+ end
318
+
319
+ results_this_lemma.each { |result|
320
+ # result: an ordered list of pairs [label, confidence]
321
+ outfile.puts result.map { |label, confidence|
322
+ "#{label} #{confidence}"
323
+ }.join(" ")
324
+ }
325
+
326
+ # remember results for output
327
+ if @produce_output
328
+ all_results << [lemma, results_this_lemma]
329
+ end
330
+ }
331
+
332
+
333
+ ##
334
+ # produce output: disambiguated data in SalsaTigerXML format
335
+ if @produce_output
336
+ salsatiger_output(all_results)
337
+ end
338
+
339
+ end
340
+
341
+ #####
342
+ private
343
+
344
+ #########################
345
+ def apply_classifiers(filename, # name of feature file
346
+ classif_dir) # string: name of directory with classifiers
347
+
348
+ # make output file for classifiers
349
+ tf_output = Tempfile.new("fred")
350
+ tf_output.close()
351
+
352
+ ###
353
+ # apply classifiers
354
+
355
+ classifier_results = Array.new
356
+
357
+ @classifiers.each { |classifier, classifier_name|
358
+
359
+ success = classifier.apply(filename, tf_output.path())
360
+
361
+ # did we manage to classify the test data?
362
+ # there may be errors on the way (eg no training data)
363
+ if success
364
+ # read classifier output from file
365
+ # classifier_results: list of line entries
366
+ # line entry: list of pairs [sense, confidence]
367
+ classifier_results << classifier.read_resultfile(tf_output.path())
368
+
369
+ else
370
+ # error: return empty Array, so that error handling can take over
371
+ return Array.new
372
+ end
373
+ }
374
+
375
+ # if we are here, all classifiers have succeeded...
376
+
377
+ # clean up
378
+ tf_output.close(true)
379
+
380
+ # return list of classifier results,
381
+ # each entry is a list of results,
382
+ # one entry per classifier type
383
+ return classifier_results
384
+ end
385
+
386
+ ###
387
+ # join binary classifier results (if we are doing binary classifiers):
388
+ # if we have classifiers that are specific to individual senses,
389
+ # collect all classifiers that we have for a lemma, and
390
+ # for each instance, choose the sense that won with the highest confidence
391
+ #
392
+ # input: a list of result lists.
393
+ # a result list is a list of instance_results
394
+ # instance_results is a list of pairs [label, confidence]
395
+ # such that the label with the highest confidence is mentioned first
396
+ #
397
+ # output: a result list.
398
+ def join_binary_classifier_results(resultlists) # list:list:tuples [label, confidence]
399
+ unless @exp.get("binary_classifiers")
400
+ # we are doing lemma-specific, not sense-specific classifiers.
401
+ # so resultlist is a list containing just one entry.
402
+ # all classifier: list of lists of lists of pairs label, confidence
403
+ # one classifier: list of lists of pairs label, confidence
404
+ # line: list of pairs label, confidence
405
+ # label: pair label, confidence
406
+ return resultlists.first()
407
+ end
408
+
409
+ # we are doing sense-specific classifiers.
410
+ # group triples
411
+
412
+ # what is the name of the negative sense?
413
+ unless (negsense = @exp.get("negsense"))
414
+ negsense = "NONE"
415
+ end
416
+
417
+ # retv: list of instance results
418
+ # where an instance result is a list of pairs [label, confidence]
419
+ retv = Array.new()
420
+
421
+ # choose the sense that was assigned with highest confidence
422
+ # how many instances? max. length of any of the instance lists
423
+ # (we'll deal with mismatches in instance numbers later)
424
+ num_instances = resultlists.map { |list_one_classifier| list_one_classifier.length() }.max()
425
+ if num_instances.nil?
426
+ # no instances, it seems
427
+ return nil
428
+ end
429
+
430
+ 0.upto(num_instances - 1) { |instno|
431
+
432
+ # get the results of all classifiers for instance number instno
433
+ all_results_this_instance = resultlists.map { |list_one_classifier|
434
+ # get the instno-th line
435
+ if list_one_classifier.at(instno)
436
+ list_one_classifier.at(instno)
437
+ else
438
+ # length mismatch: we're missing an instance
439
+ $stderr.puts "Error: binary classifier results don't all have the same length."
440
+ $stderr.puts "Assuming missing results to be negative."
441
+ [["NONE", 1.0]]
442
+ end
443
+ }
444
+
445
+ # now throw out the negsense judgments, and sort results by confidence
446
+ joint_result_this_instance = all_results_this_instance.map { |inst_result|
447
+ # if we have more than 2 entries here,
448
+ # this is very weird for a binary classifier
449
+ if inst_result.length() > 2
450
+ $stderr.puts "Judgments for more than 2 senses in binary classifier? Very weird!"
451
+ $stderr.puts inst_result.map { |label, confidence| "#{label}:#{confidence}" }.join(" ")
452
+ $stderr.puts "Only considering the first non-negative sense."
453
+ end
454
+
455
+ # choose the first entry that is not the negsense,
456
+ # or nil, if only the negative sense has been assigned with 1.0 certainty.
457
+ # nil choices will be removed by the compact() below
458
+ inst_result.detect { |label, confidence|
459
+ label != negsense
460
+ }
461
+ }.compact().sort { |a, b|
462
+ # sort senses by confidence, highest confidence first
463
+ b[1] <=> a[1]
464
+ }
465
+
466
+ retv << joint_result_this_instance
467
+ }
468
+
469
+ return retv
470
+ end
471
+
472
+
473
+ ###
474
+ # produce output in SalsaTigerXML: disambiguated training data,
475
+ # assigned senses are recorded as frames, the targets of which are the
476
+ # disambiguated words
477
+ def salsatiger_output(all_results)
478
+
479
+ if @split_id
480
+ # we're not writing Salsa/Tiger XML output for splits.
481
+ $stderr.puts "No Salsa/Tiger XML output for random splits of the data,"
482
+ $stderr.puts "only for separate test sets."
483
+ return
484
+ end
485
+
486
+ ##
487
+ # determine output directory
488
+ if @exp.get("directory_output")
489
+ output_dir = File.new_dir(@exp.get("directory_output"))
490
+ else
491
+ output_dir = fred_dirname(@exp, "output", "stxml", "new")
492
+ end
493
+
494
+ $stderr.puts "Writing SalsaTigerXML output to #{output_dir}"
495
+
496
+ ##
497
+ # empty output directory
498
+ Dir[output_dir + "*"].each { |filename|
499
+ if File.exists?(filename)
500
+ File.delete(filename)
501
+ end
502
+ }
503
+
504
+ # input directory: where we stored the zipped input files
505
+ input_dir = fred_dirname(@exp, "test", "input_data")
506
+
507
+ ##
508
+ # map results to target IDs, using answer key files
509
+
510
+ # record results: hash
511
+ # <sentence ID>(string) -> assigned senses
512
+ # where assigned senses are a list of tuples
513
+ # [target IDs, sense, lemma, pos]
514
+ recorded_results = Hash.new
515
+
516
+ all_results.each { |lemma, results|
517
+ answer_obj = AnswerKeyAccess.new(@exp, "test", lemma, "r")
518
+
519
+ instance_index = 0
520
+ answer_obj.each { |a_lemma, a_pos, a_targetIDs, a_sid, a_senses, a_senses_this|
521
+ key = a_sid
522
+
523
+ unless recorded_results[key]
524
+ recorded_results[key] = Array.new()
525
+ end
526
+
527
+ labels_and_senses_for_this_instance = results.at(instance_index)
528
+ if not(labels_and_senses_for_this_instance.empty?) and
529
+ (winning_sense = labels_and_senses_for_this_instance.first().first())
530
+
531
+ recorded_results[key] << [a_targetIDs, winning_sense, a_lemma, a_pos]
532
+ end
533
+
534
+ instance_index += 1
535
+ } # each answerkey line for this lemma
536
+ } # each lemma/results pair
537
+
538
+
539
+ ##
540
+ # read in SalsaTiger syntax, remove old semantics, add new semantics, write
541
+
542
+ Dir[input_dir + "*.xml.gz"].each { |filename|
543
+ # unzip input file
544
+ tempfile = Tempfile.new("FredTest")
545
+ tempfile.close()
546
+ %x{gunzip -c #{filename} > #{tempfile.path()}}
547
+
548
+ infile = FilePartsParser.new(tempfile.path())
549
+ if @exp.get("verbose")
550
+ $stderr.puts "SalsaTigerXML output of " + File.basename(filename, ".gz")
551
+ end
552
+
553
+ begin
554
+ outfile = File.new(output_dir + File.basename(filename, ".gz"), "w")
555
+ rescue
556
+ $stderr.puts "Couldn't write to output file #{output_dir}#{File.basename(filename)}."
557
+ $stderr.puts "Skipping Salsa/Tiger XML output."
558
+ return
559
+ end
560
+
561
+ # write header
562
+ outfile.puts infile.head()
563
+
564
+ infile.scan_s { |sent_string|
565
+ sent = SalsaTigerSentence.new(sent_string)
566
+
567
+ # remove old semantics
568
+ sent.remove_semantics()
569
+
570
+ if recorded_results and recorded_results[sent.id()]
571
+ recorded_results[sent.id()].each { |target_ids, sense, lemma, pos|
572
+
573
+ # add frame to sentence
574
+ new_frame = sent.add_frame(sense)
575
+
576
+ # get list of target nodes from target IDs
577
+ # assuming that target_ids is a string of target IDs
578
+ # separated by comma.
579
+ # IDs for which no node could be found are just ignored
580
+
581
+ targets = target_ids.map { |target_id|
582
+ sent.syn_node_with_id(target_id)
583
+ }.compact
584
+ # enter the target nodes for this new frame
585
+ new_frame.add_fe("target", targets)
586
+
587
+ # put lemma and POS info into <target>
588
+ new_frame.target.set_attribute("lemma", lemma)
589
+ new_frame.target.set_attribute("pos", pos)
590
+ }
591
+ end
592
+
593
+ # write changed sentence:
594
+ # only if there are recorded results for this sentence!
595
+ outfile.puts sent.get()
596
+
597
+ } # each sentence of file
598
+
599
+ # write footer
600
+ outfile.puts infile.tail()
601
+ outfile.close()
602
+ tempfile.close(true)
603
+ } # each SalsaTiger file of the input directory
604
+
605
+ end
606
+
607
+ end