frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,312 @@
|
|
1
|
+
# FredEval
|
2
|
+
# Katrin Erk April 05
|
3
|
+
#
|
4
|
+
# Frame disambiguation system: evaluate classification results
|
5
|
+
#
|
6
|
+
# While the other main classes of Fred just provide a new() method
|
7
|
+
# and a compute() method,
|
8
|
+
# the FredEval class also provides access methods to all the
|
9
|
+
# individual evaluation results and allows for a flag that
|
10
|
+
# suppresses evaluation output to a file --
|
11
|
+
# such that this package can also be used by external systems that
|
12
|
+
# wish to evaluate Fred.
|
13
|
+
#
|
14
|
+
# Inherits from the Eval class that is not Fred-specific
|
15
|
+
|
16
|
+
# Salsa packages
|
17
|
+
require "common/Eval"
|
18
|
+
require "common/ruby_class_extensions"
|
19
|
+
|
20
|
+
# Fred packages
|
21
|
+
require "fred/FredConfigData"
|
22
|
+
require "fred/FredConventions"
|
23
|
+
require "fred/FredFeatures"
|
24
|
+
require "fred/FredDetermineTargets"
|
25
|
+
|
26
|
+
# FredEval: evaluates Fred's classification results against the gold
# answer keys. Group names are lemmas; the Eval superclass (not visible
# here) drives the evaluation through each_group() and each_instance().
class FredEval < Eval

  ###
  # new
  #
  # evaluate runtime options, decide between accuracy and
  # precision/recall evaluation, and announce the task
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  def initialize(exp_obj, options)

    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair do |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      end
      # unknown arguments have already been dealt with by fred.rb
    end

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    handle_multilabel = @exp.get("handle_multilabel")
    @multiple_senses_assigned =
      (@threshold || handle_multilabel == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to
      # in each_instance().
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    # (assigned after super(), as in the original statement order)
    @handle_multilabel = handle_multilabel

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # fixed: this used to test @split_dir, which is never assigned,
    # so the split ID was never reported
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas
  #
  # also, set object-global variables (@classfile, @goldreader,
  # @all_senses) in such a way that the elements of this group
  # can be read by each_instance() while the block is running.
  def each_group()

    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas().sort().each do |lemma|
      # progress report
      $stderr.puts "Evaluating " + lemma if @exp.get("verbose")

      # file with classification results;
      # nil if there are no classification results for this lemma
      @classfile = begin
        File.new(output_dir + fred_result_filename(lemma))
      rescue StandardError
        nil
      end

      begin
        # file with answers:
        # maybe we need to apply a split first
        @goldreader =
          if @split_id
            AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
          else
            AnswerKeyAccess.new(@exp, "test", lemma, "r")
          end

        # doing multilabel evaluation?
        # then we need a list of all senses
        @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

        yield lemma
      ensure
        # release the file handle of this lemma before moving on:
        # previously one descriptor per lemma was left open.
        # NOTE(review): assumes each_instance() for a lemma is only
        # called while the block above runs -- confirm against Eval.
        if @classfile
          @classfile.close()
          @classfile = nil
        end
      end
    end
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # relies on each_group() having set the appropriate readers
  # @goldreader and @classfile
  def each_instance(lemma) # string: lemma name

    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = Hash.new()

    # read gold file and classifier output file in parallel.
    # Block parameters that are not needed carry a leading underscore;
    # in particular the lemma reported by the reader no longer shadows
    # the method parameter.
    @goldreader.each do |_gold_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|

      # classline: format
      #   (label confidence)*
      # such that the label with the highest confidence is first.
      # The line is consumed even for repeated instances so that both
      # files stay in step.
      classline = @classfile ? @classfile.gets() : nil
      classline = "" if classline.nil?

      # have we done this same instance previously?
      next if instance_ids_seen[target_ids]
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # senses_assigned: list of pairs [senselist, confidence]
      senses_assigned = parse_assigned_senses(classline)

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold
        assigned = senses_reaching_threshold(senses_assigned)

        raise "Shouldn't be here" unless @all_senses

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each do |sense_of_lemma|
          gold_class = senses_gold.include?(sense_of_lemma).to_s()
          assigned_class = assigned.include?(sense_of_lemma).to_s()
          yield [gold_class, assigned_class]
        end

      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)

        # nothing to yield if the classifier assigned nothing
        next if senses_assigned.empty?

        # actually assigned class: only the one with the
        # maximum confidence
        max_senselist = senses_assigned.max { |a, b|
          a.last() <=> b.last()
        }.first()

        max_senselist.each do |single_sense|
          gold_class = senses_gold.include?(single_sense).to_s()
          yield [gold_class, "true"]
        end
      end
    end
  end

  #####
  private

  ###
  # parse_assigned_senses
  #
  # turn one line of classifier output, "(label confidence)*",
  # into a list of pairs [senselist, confidence]
  # where senselist is an array of sense strings and
  # confidence is a float.
  # A trailing label without a confidence value is ignored.
  def parse_assigned_senses(classline) # string
    pairs = Array.new()
    classline.split().each_slice(2) do |label, confidence|
      next if confidence.nil?
      senses =
        if @handle_multilabel == "join"
          # split up joined senses
          fred_split_sense(label)
        else
          [label]
        end
      pairs << [senses, confidence.to_f()]
    end
    pairs
  end

  ###
  # senses_reaching_threshold
  #
  # given a list of pairs [senselist, confidence],
  # return the list of senses whose best confidence is >= @threshold.
  #
  # in the case of "join", one sense may have several confidence levels,
  # one on its own and one in a joined sense: the maximum counts.
  # watch out: confidence may be smaller than zero
  def senses_reaching_threshold(senses_assigned)
    best = Hash.new()
    senses_assigned.each do |senses, confidence|
      senses.each do |s|
        best[s] = best.key?(s) ? [best[s], confidence].max() : confidence
      end
    end
    best.keys().select { |s| best[s] >= @threshold }
  end

end
|
@@ -0,0 +1,321 @@
|
|
1
|
+
# FredFeatureInfo: keeps track of all known feature extractor classes
# and selects those that the experiment file asks for.
class FredFeatureInfo
  ###
  # class variables:
  # @@extractors: list of all known extractor classes,
  #               filled through add_feature()
  # @@warned:     true once the first FredFeatureInfo object has been
  #               built, so warnings are only printed by that first object
  @@extractors = []
  @@warned = false

  ###
  # add_feature
  #
  # register an extractor class
  def self.add_feature(class_name) # Class object
    @@extractors.push(class_name)
  end

  ###
  # new
  #
  # collect the extractor classes chosen in the experiment file.
  # Entries of exp.get_lf("feature") start with the feature group
  # designator; the remaining entries (options) are not needed here,
  # the feature extractors can get their options themselves.
  def initialize(exp)
    @exp = exp
    @features = []

    exp.get_lf("feature").each do |extractor_name, *_options|
      chosen = @@extractors.find { |candidate|
        candidate.feature_name() == extractor_name
      }

      if chosen
        @features << chosen
      elsif !@@warned
        # no extractor found matching the given designator
        $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
      end
    end

    # do not print warnings again if another FredFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_extractor_objects
  #
  # returns a list of freshly created feature extractor objects,
  # one per chosen extractor class, each initialized with the
  # experiment file object
  def get_extractor_objects()
    extractor_objects = @features.map { |feature_class| feature_class.new(@exp) }
    return extractor_objects
  end
end
|
59
|
+
|
60
|
+
##################################3
|
61
|
+
# FredFeatureExtractor: abstract base class of all Fred feature
# extractors. Subclasses overwrite feature_name() and each_feature()
# and call announce_me() in their class body to register themselves.
class FredFeatureExtractor
  ###
  # feature name:
  # name by which you choose this feature
  # in the experiment file
  #
  # raises RuntimeError unless overwritten by the subclass
  def FredFeatureExtractor.feature_name()
    raise "Overwrite me."
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    @exp = exp
  end

  ###
  # compute features from meta-features
  #
  # argument: hash
  #  metafeature_label -> metafeatures
  #  string -> array:string
  #
  # yields each feature as a string
  #
  # raises RuntimeError unless overwritten by the subclass
  def each_feature(feature_hash)
    raise "overwrite me"
  end

  ######
  # announce_me
  #
  # register the calling class with FredFeatureInfo, if that class
  # has been loaded; otherwise do nothing.
  #
  # This is a public class method: subclasses call it with an explicit
  # receiver in their class body. (The 'protected' keyword that used to
  # precede it does not apply to class methods and has been dropped.)
  #
  # Fixes:
  # - Module.constants returns Symbols on Ruby >= 1.9, so the former
  #   test Module.constants.include?("FredFeatureInfo") was always
  #   false and no extractor was ever registered.
  # - the class object itself is registered instead of eval(self.name()),
  #   which also works for classes without a name.
  def FredFeatureExtractor.announce_me()
    return unless Object.const_defined?(:FredFeatureInfo)

    # yup, we have a class to which we can announce ourselves
    FredFeatureInfo.add_feature(self)
  end

end
|
102
|
+
|
103
|
+
#####
|
104
|
+
# context feature
|
105
|
+
# FredContextFeatureExtractor: feature "context".
# For every context size chosen in the experiment file, yields one
# feature per context entry, made of the metafeature label and one
# '#'-separated field of the entry.
class FredContextFeatureExtractor < FredFeatureExtractor
  announce_me()

  ###
  # name by which this feature is chosen in the experiment file
  def self.feature_name()
    "context"
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    super

    # @cxsizes: metafeature labels ("CX" + size) of all context sizes
    # chosen as features, kept as hash keys for fast lookup
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true
    end
  end

  ###
  # each_feature
  #
  # argument: hash metafeature_label(string) -> metafeatures(array:string)
  # yields each feature as a string
  def each_feature(feature_hash)
    # field layout as stated by the original comment: grf#word#lemma#pos#ne
    # NOTE(review): the other context-based extractors in this file
    # describe context entries as word#lemma#pos#ne (lemma at index 1).
    # Index 2 is kept as is -- confirm against the metafeature writer.
    field = 2

    feature_hash.each do |ftype, fvalues|
      # only context metafeatures of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |entry|
        # skip entries that contain five '#' in a row
        next if entry.include?("#####")

        yield ftype + entry.split("#")[field]
      end
    end
  end
end
|
143
|
+
|
144
|
+
#####
|
145
|
+
# context feature: POS separately, small contexts only
|
146
|
+
# FredContextPOSFeatureExtractor: feature "context_pos".
# Like the context feature, but yields the part of speech field,
# and only for contexts of size <= 10.
class FredContextPOSFeatureExtractor < FredFeatureExtractor
  announce_me()

  ###
  # name by which this feature is chosen in the experiment file
  def self.feature_name()
    "context_pos"
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    super

    # @cxsizes: metafeature labels ("CX" + size) of the chosen
    # context sizes that are <= 10, kept as hash keys for fast lookup
    @cxsizes = {}
    small_sizes = @exp.get_lf("feature", "context").select { |size| size <= 10 }
    small_sizes.each { |size| @cxsizes["CX#{size}"] = true }

    return unless @cxsizes.empty?

    $stderr.puts "context_pos feature warning: will not be computed"
    $stderr.puts "as there is no context of size <= 10"
  end

  ###
  # each_feature
  #
  # argument: hash metafeature_label(string) -> metafeatures(array:string)
  # yields each feature as a string: "POS" + label + POS field
  def each_feature(feature_hash)
    # word#lemma#pos#ne
    field = 2

    feature_hash.each do |ftype, fvalues|
      # only context metafeatures of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |entry|
        yield "POS" + ftype + entry.split("#")[field]
      end
    end
  end
end
|
189
|
+
|
190
|
+
#####
|
191
|
+
# bigram/trigram feature
|
192
|
+
# FredNgramFeatureExtractor: feature "ngram".
# Yields bigrams and trigrams of lemmas and of parts of speech,
# taken from one context metafeature of size >= 2.
class FredNgramFeatureExtractor < FredFeatureExtractor
  announce_me()

  ###
  # name by which this feature is chosen in the experiment file
  def self.feature_name()
    "ngram"
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    super

    # @cxsize: first chosen context size that is >= 2;
    # the ngram features are computed from that context.
    # nil if there is no such context size.
    @cxsize = @exp.get_lf("feature", "context").find { |size| size >= 2 }
    return if @cxsize

    $stderr.puts "Warning: no context of size >= 2, so"
    $stderr.puts "no ngram feature computed."
  end

  ###
  # each_feature
  #
  # argument: hash metafeature_label(string) -> metafeatures(array:string)
  # yields each feature as a string: label + the joined fields
  def each_feature(feature_hash)
    # word#lemma#pos#ne
    lemma_field = 1
    pos_field = 2

    # per ngram: offsets relative to @cxsize, feature label, entry field.
    # According to the original comments, |fvalues| = 2*cxsize, i.e.
    # cxsize is the length of a one-sided context window, so the bigram
    # around the target is fvalues[cxsize-1], fvalues[cxsize] and the
    # trigram adds fvalues[cxsize-2].
    ngrams = [
      [[-1, 0],     "BLEM", lemma_field], # bigram of lemmas
      [[-1, 0],     "BPOS", pos_field],   # bigram of POSs
      [[-2, -1, 0], "TLEM", lemma_field], # trigram of lemmas
      [[-2, -1, 0], "TPOS", pos_field]    # trigram of POSs
    ]

    wanted_label = "CX" + @cxsize.to_s()

    feature_hash.each do |ftype, fvalues|
      next unless ftype == wanted_label

      ngrams.each do |offsets, label, field|
        entries = offsets.map { |offset| fvalues[@cxsize + offset] }.compact()
        # only yield if entries exist for all the given indices
        next unless entries.length() == offsets.length()

        yield label + entries.map { |entry| entry.split("#")[field] }.join()
      end
    end
  end
end
|
247
|
+
|
248
|
+
|
249
|
+
#####
|
250
|
+
# syntax feature
|
251
|
+
# FredSynFeatureExtractor: feature "syntax".
# Yields, for each entry of the metafeatures "CH", "PA" and "SI",
# the metafeature label plus the grammatical function field.
class FredSynFeatureExtractor < FredFeatureExtractor
  announce_me()

  ###
  # name by which this feature is chosen in the experiment file
  def self.feature_name()
    "syntax"
  end

  ###
  # each_feature
  #
  # argument: hash metafeature_label(string) -> metafeatures(array:string)
  # yields each feature as a string
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      # position of the grf field within an entry
      grf_field =
        case ftype
        when "CH", "PA" then 0
        when "SI"       then 1 # parentlemma#grf#word#lemma#pos#ne
        end

      # not a syntactic metafeature
      next if grf_field.nil?

      fvalues.each do |entry|
        yield ftype + entry.split("#")[grf_field]
      end
    end
  end
end
|
285
|
+
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
#####
|
290
|
+
# syntax-plus-headword feature
|
291
|
+
# FredSynsemFeatureExtractor: feature "synsem"
# (syntax-plus-headword feature).
# Yields, for each entry of the metafeatures "CH", "PA" and "SI",
# the metafeature label, the marker "SEM" and the entry itself;
# for "SI" the leading parent lemma is removed from the entry.
class FredSynsemFeatureExtractor < FredFeatureExtractor
  announce_me()

  ###
  # name by which this feature is chosen in the experiment file
  def self.feature_name()
    "synsem"
  end

  ###
  # each_feature
  #
  # argument: hash metafeature_label(string) -> metafeatures(array:string)
  # yields each feature as a string
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      if ftype == "CH" || ftype == "PA"
        # grf#word#lemma#pos#ne: use the entry as it is
        fvalues.each { |entry| yield ftype + "SEM" + entry }

      elsif ftype == "SI"
        # parentlemma#grf#word#lemma#pos#ne:
        # remove parent lemma
        fvalues.each do |entry|
          fields = entry.split("#")
          yield ftype + "SEM" + fields[1..-1].join("#")
        end
      end
      # anything else is not a syntax feature
    end
  end
end
|