frprep 0.0.1.prealpha
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,312 @@
|
|
1
|
+
# FredEval
|
2
|
+
# Katrin Erk April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system: evaluate classification results
|
5
|
+
#
|
6
|
+
# While the other main classes of Fred just provide a new() method
|
7
|
+
# and a compute() method,
|
8
|
+
# the FredEval class also provides access methods to all the
|
9
|
+
# individual evaluation results and allows for a flag that
|
10
|
+
# suppresses evaluation output to a file --
|
11
|
+
# such that this package can also be used by external systems that
|
12
|
+
# wish to evaluate Fred.
|
13
|
+
#
|
14
|
+
# Inherits from the Eval class that is not Fred-specific
|
15
|
+
|
16
|
+
# Salsa packages
|
17
|
+
require "common/Eval"
|
18
|
+
require "common/ruby_class_extensions"
|
19
|
+
|
20
|
+
# Fred packages
|
21
|
+
require "fred/FredConfigData"
|
22
|
+
require "fred/FredConventions"
|
23
|
+
require "fred/FredFeatures"
|
24
|
+
require "fred/FredDetermineTargets"
|
25
|
+
|
26
|
+
class FredEval < Eval

  ###
  # new
  #
  # Evaluate runtime options, initialize the Eval superclass
  # and announce the task on stderr.
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  #          "--logID" sets the split ID, "--printLog" enables the log file;
  #          all other options are ignored here.
  #
  # Exits the process with status 1 if the list of known targets
  # cannot be read.
  def initialize(exp_obj, options)

    # NOTE(review): helper defined outside this file; presumably refuses
    # to run this task in enduser mode -- confirm.
    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # several senses may be assigned either because a confidence threshold
    # is in use or because multilabel instances are kept as they are
    @multiple_senses_assigned =
      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to below.
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # fixed: this used to test @split_dir, which is never assigned,
    # so the split ID was never reported
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas
  #
  # also, set object-global variables in such a way
  # that the elements of this group can be read:
  # @classfile   open classifier output file for the lemma, or nil if
  #              it could not be opened
  # @goldreader  AnswerKeyAccess object for the lemma
  # @all_senses  all senses of the lemma (multilabel evaluation), else nil
  #
  # These variables are only valid while the block runs:
  # the classifier output file is closed again once the block returns.
  def each_group()

    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas().sort().each { |lemma|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Evaluating " + lemma
      end

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue StandardError
        # no classification results
        @classfile = nil
      end

      begin
        # file with answers:
        # with a split, gold labels come from the test part of the training data
        if @split_id
          @goldreader = AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
        else
          @goldreader = AnswerKeyAccess.new(@exp, "test", lemma, "r")
        end

        # doing multilabel evaluation?
        # then we need a list of all senses
        @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

        yield lemma
      ensure
        # do not leak one open file handle per lemma
        if @classfile
          @classfile.close()
          @classfile = nil
        end
      end
    }
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # relies on each_group() having set the appropriate readers
  # @goldreader and @classfile
  #
  # With a confidence threshold, one pair of "true"/"false" strings is
  # yielded per sense of the lemma; otherwise one pair [gold, "true"] is
  # yielded per sense in the highest-confidence assignment.
  def each_instance(lemma) # string: lemma name

    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = Hash.new()

    # read gold file and classifier output file in parallel
    # (block parameter named _lemma so it does not shadow the method argument)
    @goldreader.each { |_lemma, pos, target_ids, sid, senses_gold, transformed_gold_senses|

      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first
      #
      # the line is read before the duplicate check below so that
      # both files stay aligned line by line
      classline = @classfile ? @classfile.gets() : nil
      classline = "" if classline.nil?

      # have we done this same instance previously?
      next if instance_ids_seen[target_ids]
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # determine all assigned senses and their confidence levels
      # senses assigned: list of pairs [senselist, confidence]
      # where senselist is an array of sense strings
      senses_assigned = Array.new()
      current_sense = nil

      classline.split().each_with_index { |entry, index|
        if index % 2 == 0
          # we have a sense label
          if @handle_multilabel == "join"
            # split up joined senses
            current_sense = fred_split_sense(entry)
          else
            current_sense = [entry]
          end
        else
          # we have a confidence level
          senses_assigned << [current_sense, entry.to_f()]
        end
      }

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold

        # transform senses_assigned:
        # in the case of "join", one sense may have several confidence levels,
        # one on its own and one in a joined sense
        senses_assigned_hash = Hash.new()
        senses_assigned.each { |senses, confidence|
          senses.each { |s|
            # assign to each sense the maximum of its previous confidence
            # and this one.
            # watch out: confidence may be smaller than zero
            previous = senses_assigned_hash[s]
            senses_assigned_hash[s] = previous ? [previous, confidence].max() : confidence
          }
        }

        # select all sense/confidence pairs where confidence is above threshold,
        # then retain only the sense, not the confidence
        senses_assigned = senses_assigned_hash.to_a().select { |sense, confidence|
          confidence >= @threshold
        }.map { |sense, confidence|
          sense
        }

        unless @all_senses
          raise "Shouldn't be here"
        end

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each { |sense_of_lemma|
          gold_class = (senses_gold.include? sense_of_lemma).to_s()
          assigned_class = (senses_assigned.include? sense_of_lemma).to_s()
          yield [gold_class, assigned_class]
        }

      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)

        # nothing to yield if the classifier produced no output for this instance
        unless senses_assigned.empty?
          # actually assigned class: only the one with the
          # maximum confidence
          max_senselist = senses_assigned.max { |a, b|
            a.last() <=> b.last()
          }.first()

          max_senselist.each { |single_sense|
            gold_class = (senses_gold.include? single_sense).to_s()
            yield [gold_class, "true"]
          }
        end
      end
    }
  end

end
|
@@ -0,0 +1,321 @@
|
|
1
|
+
class FredFeatureInfo
  ###
  # class variable:
  # registry of all known feature extractor classes;
  # extend it through add_feature()
  @@extractors = []

  # boolean: becomes true once the first object has been constructed,
  # after which "extractor not found" warnings are suppressed
  @@warned = false

  ###
  # register a feature extractor class
  #
  # class_name: Class object that responds to feature_name()
  def self.add_feature(class_name)
    @@extractors << class_name
  end

  ###
  # Select, from the registered extractor classes, those that the
  # experiment file asks for under "feature".
  #
  # exp: experiment file object; exp.get_lf("feature") is expected to list
  #      entries whose first element is the feature group designator
  #      (string), followed by options
  def initialize(exp)
    @exp = exp
    @features = []

    # the options are not needed here:
    # the feature extractors fetch their own options from the experiment object
    exp.get_lf("feature").each do |extractor_name, *_options|
      match = @@extractors.find { |candidate| candidate.feature_name == extractor_name }

      if match
        @features << match
      elsif !@@warned
        # no extractor registered under this designator
        $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
      end
    end

    # do not print warnings again if another FredFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_extractor_objects
  #
  # returns a list with one freshly constructed feature extractor object
  # per selected extractor class, in the order given in the experiment file
  def get_extractor_objects
    @features.map { |feature_class| feature_class.new(@exp) }
  end
end
|
59
|
+
|
60
|
+
##################################
|
61
|
+
class FredFeatureExtractor
  ###
  # feature name:
  # name by which you choose this feature
  # in the experiment file
  #
  # Abstract: subclasses must override this; the base version raises.
  def self.feature_name
    raise "Overwrite me."
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    @exp = exp
  end

  ###
  # compute features from meta-features
  #
  # argument: hash
  # metafeature_label -> metafeatures
  # string -> array:string
  #
  # yields each feature as a string
  #
  # Abstract: subclasses must override this; the base version raises.
  def each_feature(feature_hash)
    raise "overwrite me"
  end

  ###
  # announce_me
  #
  # Register the calling class (self) with FredFeatureInfo, if that
  # collector class has been loaded; otherwise do nothing.
  # Subclasses call this from their class body.
  #
  # This is a singleton method, so instance visibility keywords
  # (protected/private) would not apply to it; it stays publicly callable.
  def self.announce_me
    # Module.constants returns Symbols on Ruby >= 1.9, so comparing against
    # the String "FredFeatureInfo" never matched and no extractor was ever
    # registered. const_defined? works on every Ruby version.
    return unless Object.const_defined?(:FredFeatureInfo)

    # self already is the class object; no need to eval its name
    FredFeatureInfo.add_feature(self)
  end

end
|
102
|
+
|
103
|
+
#####
|
104
|
+
# context feature
|
105
|
+
class FredContextFeatureExtractor < FredFeatureExtractor
  FredContextFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def self.feature_name
    "context"
  end

  ###
  # exp: Fred experiment file object
  def initialize(exp)
    super(exp)

    # @cxsizes: metafeature labels ("CX" + context size) of all context
    # sizes chosen in the experiment file, as hash keys for quick lookup
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true
    end
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For every metafeature of a chosen context size, yields the metafeature
  # label followed by field 2 of the "#"-separated metafeature.
  def each_feature(feature_hash)
    # NOTE(review): the original comment gives the field layout as
    # grf#word#lemma#pos#ne, while the other context extractors in this file
    # describe it as word#lemma#pos#ne, where index 2 would be the POS --
    # confirm which layout the context metafeatures really have.
    lemma_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context metafeatures of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |f|
        # skip entries containing five consecutive separators
        next if f =~ /#####/

        yield ftype + f.split("#")[lemma_index]
      end
    end
  end
end
|
143
|
+
|
144
|
+
#####
|
145
|
+
# context feature: POS separately, small contexts only
|
146
|
+
class FredContextPOSFeatureExtractor < FredFeatureExtractor
  FredContextPOSFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def self.feature_name
    "context_pos"
  end

  ###
  # exp: Fred experiment file object
  #
  # Prints a warning to stderr if the experiment file lists
  # no context size of at most 10.
  def initialize(exp)
    super(exp)

    # @cxsizes: metafeature labels ("CX" + context size) of the chosen
    # context sizes that are at most 10, as hash keys for quick lookup
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true if size <= 10
    end

    if @cxsizes.empty?
      $stderr.puts "context_pos feature warning: will not be computed"
      $stderr.puts "as there is no context of size <= 10"
    end
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For every metafeature of a retained context size, yields
  # "POS" + metafeature label + field 2 of the metafeature.
  def each_feature(feature_hash)
    # metafeature layout: word#lemma#pos#ne
    pos_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context metafeatures of a retained size
      next unless @cxsizes[ftype]

      fvalues.each do |f|
        yield "POS" + ftype + f.split("#")[pos_index]
      end
    end
  end
end
|
189
|
+
|
190
|
+
#####
|
191
|
+
# bigram/trigram feature
|
192
|
+
class FredNgramFeatureExtractor < FredFeatureExtractor
  FredNgramFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def self.feature_name
    "ngram"
  end

  ###
  # exp: Fred experiment file object
  #
  # Prints a warning to stderr if the experiment file lists
  # no context size of at least 2.
  def initialize(exp)
    super(exp)

    # @cxsize: first context size listed in the experiment file that is
    # at least 2; the ngram features are computed from that context.
    # nil if there is no such size.
    @cxsize = @exp.get_lf("feature", "context").find { |size| size >= 2 }

    unless @cxsize
      $stderr.puts "Warning: no context of size >= 2, so"
      $stderr.puts "no ngram feature computed."
    end
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # Yields bigram and trigram features (lemmas and POS tags) built from
  # the context metafeature whose label is "CX" + @cxsize.
  def each_feature(feature_hash)
    # metafeature layout: word#lemma#pos#ne
    lemma_index = 1
    pos_index = 2

    wanted_label = "CX" + @cxsize.to_s()

    # each spec: offsets relative to fvalues[@cxsize], feature label prefix,
    # and the metafeature field to use.
    # |fvalues| = 2*cxsize, that is, cxsize is the length of a one-sided
    # context window, so offsets -1, 0 address the two entries around the
    # target, and -2, -1, 0 two words before plus one word after.
    ngram_specs = [
      [[-1, 0],     "BLEM", lemma_index], # bigram of lemmas
      [[-1, 0],     "BPOS", pos_index],   # bigram of POSs
      [[-2, -1, 0], "TLEM", lemma_index], # trigram of lemmas
      [[-2, -1, 0], "TPOS", pos_index]    # trigram of POSs
    ]

    feature_hash.each do |ftype, fvalues|
      next unless ftype == wanted_label

      ngram_specs.each do |offsets, label, subindex|
        window = offsets.map { |offset| fvalues[@cxsize + offset] }.compact

        # only yield if an entry was present at every requested position
        next unless window.length == offsets.length

        yield label + window.map { |f| f.split("#")[subindex] }.join
      end
    end
  end
end
|
247
|
+
|
248
|
+
|
249
|
+
#####
|
250
|
+
# syntax feature
|
251
|
+
class FredSynFeatureExtractor < FredFeatureExtractor
  FredSynFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def self.feature_name
    "syntax"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For the syntactic metafeatures "CH", "PA" and "SI", yields the
  # metafeature label followed by the grammatical-function field of each
  # metafeature. All other metafeatures are ignored.
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      # position of the grf field within the "#"-separated metafeature
      grf_index =
        case ftype
        when "CH", "PA"
          0
        when "SI"
          # layout: parentlemma#grf#word#lemma#pos#ne
          1
        end

      # not a syntactic metafeature
      next unless grf_index

      fvalues.each do |f|
        yield ftype + f.split("#")[grf_index]
      end
    end
  end
end
|
285
|
+
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
#####
|
290
|
+
# syntax-plus-headword feature
|
291
|
+
class FredSynsemFeatureExtractor < FredFeatureExtractor
  FredSynsemFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def self.feature_name
    "synsem"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For the syntactic metafeatures "CH", "PA" and "SI", yields
  # metafeature label + "SEM" + the metafeature text, with the leading
  # parent lemma field removed in the "SI" case.
  # All other metafeatures are ignored.
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      # how to turn one metafeature into the text appended to the prefix
      render =
        case ftype
        when "CH", "PA"
          # layout: grf#word#lemma#pos#ne -- used unchanged
          lambda { |f| f }
        when "SI"
          # layout: parentlemma#grf#word#lemma#pos#ne -- remove parent lemma
          lambda { |f| f.split("#")[1..-1].join("#") }
        end

      # not a syntax feature
      next unless render

      prefix = ftype + "SEM"
      fvalues.each do |f|
        yield prefix + render.call(f)
      end
    end
  end
end
|