shalmaneser 1.2.0.rc4 → 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +47 -18
- data/bin/shalmaneser +8 -2
- data/doc/index.md +1 -0
- data/lib/shalmaneser/opt_parser.rb +68 -67
- metadata +49 -119
- data/bin/fred +0 -16
- data/bin/frprep +0 -34
- data/bin/rosy +0 -17
- data/lib/common/AbstractSynInterface.rb +0 -1229
- data/lib/common/Counter.rb +0 -18
- data/lib/common/EnduserMode.rb +0 -27
- data/lib/common/Eval.rb +0 -480
- data/lib/common/FixSynSemMapping.rb +0 -196
- data/lib/common/Graph.rb +0 -345
- data/lib/common/ISO-8859-1.rb +0 -24
- data/lib/common/ML.rb +0 -186
- data/lib/common/Mallet.rb +0 -236
- data/lib/common/Maxent.rb +0 -229
- data/lib/common/Optimise.rb +0 -195
- data/lib/common/Parser.rb +0 -213
- data/lib/common/RegXML.rb +0 -269
- data/lib/common/RosyConventions.rb +0 -171
- data/lib/common/STXmlTerminalOrder.rb +0 -194
- data/lib/common/SalsaTigerRegXML.rb +0 -2347
- data/lib/common/SalsaTigerXMLHelper.rb +0 -99
- data/lib/common/SynInterfaces.rb +0 -282
- data/lib/common/TabFormat.rb +0 -721
- data/lib/common/Tiger.rb +0 -1448
- data/lib/common/Timbl.rb +0 -144
- data/lib/common/Tree.rb +0 -61
- data/lib/common/config_data.rb +0 -470
- data/lib/common/config_format_element.rb +0 -220
- data/lib/common/headz.rb +0 -338
- data/lib/common/option_parser.rb +0 -13
- data/lib/common/prep_config_data.rb +0 -62
- data/lib/common/prep_helper.rb +0 -1330
- data/lib/common/ruby_class_extensions.rb +0 -310
- data/lib/db/db_interface.rb +0 -48
- data/lib/db/db_mysql.rb +0 -145
- data/lib/db/db_sqlite.rb +0 -280
- data/lib/db/db_table.rb +0 -239
- data/lib/db/db_wrapper.rb +0 -176
- data/lib/db/sql_query.rb +0 -243
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/fred/Baseline.rb +0 -150
- data/lib/fred/FileZipped.rb +0 -31
- data/lib/fred/FredBOWContext.rb +0 -877
- data/lib/fred/FredConventions.rb +0 -232
- data/lib/fred/FredDetermineTargets.rb +0 -319
- data/lib/fred/FredEval.rb +0 -312
- data/lib/fred/FredFeatureExtractors.rb +0 -322
- data/lib/fred/FredFeatures.rb +0 -1061
- data/lib/fred/FredFeaturize.rb +0 -602
- data/lib/fred/FredNumTrainingSenses.rb +0 -27
- data/lib/fred/FredParameters.rb +0 -402
- data/lib/fred/FredSplit.rb +0 -84
- data/lib/fred/FredSplitPkg.rb +0 -180
- data/lib/fred/FredTest.rb +0 -606
- data/lib/fred/FredTrain.rb +0 -144
- data/lib/fred/PlotAndREval.rb +0 -480
- data/lib/fred/fred.rb +0 -47
- data/lib/fred/fred_config_data.rb +0 -185
- data/lib/fred/md5.rb +0 -23
- data/lib/fred/opt_parser.rb +0 -250
- data/lib/frprep/Ampersand.rb +0 -39
- data/lib/frprep/CollinsInterface.rb +0 -1165
- data/lib/frprep/Counter.rb +0 -18
- data/lib/frprep/FNCorpusXML.rb +0 -643
- data/lib/frprep/FNDatabase.rb +0 -144
- data/lib/frprep/FrameXML.rb +0 -513
- data/lib/frprep/Graph.rb +0 -345
- data/lib/frprep/MiniparInterface.rb +0 -1388
- data/lib/frprep/RegXML.rb +0 -269
- data/lib/frprep/STXmlTerminalOrder.rb +0 -194
- data/lib/frprep/SleepyInterface.rb +0 -384
- data/lib/frprep/TntInterface.rb +0 -44
- data/lib/frprep/TreetaggerInterface.rb +0 -327
- data/lib/frprep/do_parses.rb +0 -143
- data/lib/frprep/frprep.rb +0 -693
- data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
- data/lib/frprep/interfaces/stanford_interface.rb +0 -353
- data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
- data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
- data/lib/frprep/one_parsed_file.rb +0 -28
- data/lib/frprep/opt_parser.rb +0 -94
- data/lib/frprep/ruby_class_extensions.rb +0 -310
- data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
- data/lib/rosy/ExternalConfigData.rb +0 -58
- data/lib/rosy/FailedParses.rb +0 -130
- data/lib/rosy/FeatureInfo.rb +0 -242
- data/lib/rosy/GfInduce.rb +0 -1115
- data/lib/rosy/GfInduceFeature.rb +0 -148
- data/lib/rosy/InputData.rb +0 -294
- data/lib/rosy/RosyConfusability.rb +0 -338
- data/lib/rosy/RosyEval.rb +0 -465
- data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
- data/lib/rosy/RosyFeaturize.rb +0 -281
- data/lib/rosy/RosyInspect.rb +0 -336
- data/lib/rosy/RosyIterator.rb +0 -478
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
- data/lib/rosy/RosyPruning.rb +0 -165
- data/lib/rosy/RosyServices.rb +0 -744
- data/lib/rosy/RosySplit.rb +0 -232
- data/lib/rosy/RosyTask.rb +0 -19
- data/lib/rosy/RosyTest.rb +0 -829
- data/lib/rosy/RosyTrain.rb +0 -234
- data/lib/rosy/RosyTrainingTestTable.rb +0 -787
- data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
- data/lib/rosy/View.rb +0 -418
- data/lib/rosy/opt_parser.rb +0 -379
- data/lib/rosy/rosy.rb +0 -78
- data/lib/rosy/rosy_config_data.rb +0 -121
- data/lib/shalmaneser/version.rb +0 -3
data/lib/fred/FredEval.rb
DELETED
@@ -1,312 +0,0 @@
|
|
1
|
-
# FredEval
|
2
|
-
# Katrin Erk April 05
|
3
|
-
#
|
4
|
-
# Frame disambiguation system: evaluate classification results
|
5
|
-
#
|
6
|
-
# While the other main classes of Fred just provide a new() method
|
7
|
-
# and a compute() method,
|
8
|
-
# the FredEval class also provides access methods to all the
|
9
|
-
# individual evaluation results and allows for a flag that
|
10
|
-
# suppresses evaluation output to a file --
|
11
|
-
# such that this package can also be used by external systems that
|
12
|
-
# wish to evaluate Fred.
|
13
|
-
#
|
14
|
-
# Inherits from the Eval class that is not Fred-specific
|
15
|
-
|
16
|
-
# Salsa packages
|
17
|
-
require "common/Eval"
|
18
|
-
require "common/ruby_class_extensions"
|
19
|
-
|
20
|
-
# Fred packages
|
21
|
-
require "fred/fred_config_data"
|
22
|
-
require "fred/FredConventions"
|
23
|
-
require "fred/FredFeatures"
|
24
|
-
require "fred/FredDetermineTargets"
|
25
|
-
|
26
|
-
# Frame disambiguation system: evaluate classification results.
#
# Supplies the each_group / each_instance hooks on top of the generic,
# non-Fred-specific Eval superclass: groups are lemmas, and instances are
# pairs [gold_class(string), assigned_class(string)].
class FredEval < Eval
  ###
  # new
  #
  # evaluate runtime options and announce the task
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  def initialize(exp_obj, options)
    in_enduser_mode_unavailable

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair do |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") + "eval_logfile.txt"
      end
      # the case of unknown arguments has been dealt with by fred.rb
    end

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") + "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    @multiple_senses_assigned =
      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to below.
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # The split is identified by @split_id (set from --logID above).
    # This used to test @split_dir, which is never assigned, so the
    # split ID was never announced.
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas
  #
  # also, set object-global variables (@classfile, @goldreader,
  # @all_senses) in such a way that the elements of this group can be
  # read by each_instance() while the block runs
  def each_group
    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas.sort.each do |lemma|
      # progress report
      $stderr.puts "Evaluating " + lemma if @exp.get("verbose")

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue
        # no classification results
        @classfile = nil
      end

      # file with answers:
      # maybe we need to apply a split first
      @goldreader =
        if @split_id
          AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
        else
          AnswerKeyAccess.new(@exp, "test", lemma, "r")
        end

      # doing multilabel evaluation?
      # then we need a list of all senses
      @all_senses = @multiple_senses_assigned ? @target_obj.get_senses(lemma) : nil

      begin
        yield lemma
      ensure
        # The result file is only needed while this lemma is being
        # evaluated; close it so that handles do not pile up, one per lemma.
        @classfile.close if @classfile && !@classfile.closed?
      end
    end
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # relies on each_group() having set the appropriate readers
  # <@goldreader> and <@classfile>
  #
  # lemma: string: lemma name (the readers already point at this lemma)
  def each_instance(lemma)
    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = {}

    # read gold file and classifier output file in parallel
    @goldreader.each do |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|
      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first.
      # One line is consumed per gold entry -- also for repeated
      # instances -- so that the two files stay aligned.
      classline = (@classfile && @classfile.gets) || ""

      # have we done this same instance previously?
      next if instance_ids_seen[target_ids]
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # senses_assigned: list of pairs [senselist, confidence]
      # where senselist is an array of sense strings
      senses_assigned = []
      classline.split.each_slice(2) do |label, confidence|
        # a trailing label without a confidence value is ignored
        next if confidence.nil?

        # in the case of "join", split up joined senses
        senselist = @handle_multilabel == "join" ? fred_split_sense(label) : [label]
        senses_assigned << [senselist, confidence.to_f]
      end

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold

        # in the case of "join", one sense may have several confidence levels,
        # one on its own and one in a joined sense:
        # keep the maximum for each sense.
        # watch out: confidence may be smaller than zero
        best_confidence = {}
        senses_assigned.each do |senselist, confidence|
          senselist.each do |sense|
            previous = best_confidence[sense]
            best_confidence[sense] = previous ? [previous, confidence].max : confidence
          end
        end

        # retain the senses whose confidence reaches the threshold
        senses_above_threshold =
          best_confidence.select { |_sense, confidence| confidence >= @threshold }.map(&:first)

        raise "Shouldn't be here" unless @all_senses

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each do |sense_of_lemma|
          gold_class = senses_gold.include?(sense_of_lemma).to_s
          assigned_class = senses_above_threshold.include?(sense_of_lemma).to_s
          yield [gold_class, assigned_class]
        end
      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)
        #
        # nothing is yielded if the classifier assigned nothing
        unless senses_assigned.empty?
          # actually assigned class: only the one with the
          # maximum confidence
          max_senselist = senses_assigned.max_by { |_senselist, confidence| confidence }.first

          max_senselist.each do |single_sense|
            gold_class = senses_gold.include?(single_sense).to_s
            yield [gold_class, "true"]
          end
        end
      end
    end
  end
end
|
@@ -1,322 +0,0 @@
|
|
1
|
-
# Keeps track of all known Fred feature extractor classes and, per object,
# of the extractors selected in one experiment file.
class FredFeatureInfo
  ###
  # class variable:
  # every extractor class announced so far; extend it via add_feature()
  @@extractors = []

  # boolean. becomes true once the first FredFeatureInfo object has been
  # built, so that "extractor not found" warnings are given only once
  @@warned = false

  ###
  # register a feature extractor
  #
  # class_name: Class object of the extractor
  def self.add_feature(class_name)
    @@extractors << class_name
  end

  ###
  # exp: experiment file object; get_lf("feature") is expected to return
  #      a list of [feature group designator(string), options...] entries
  def initialize(exp)
    @exp = exp

    # extractor classes requested by the user, in experiment file order
    @features = []

    exp.get_lf("feature").each do |extractor_name, *_options|
      # the options are not needed here:
      # the feature extractors can get their options themselves.
      matching = @@extractors.find { |candidate| candidate.feature_name == extractor_name }

      if matching
        @features << matching
      elsif !@@warned
        # no extractor found matching the given designator
        $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
      end
    end

    # do not print warnings again if another FredFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_extractor_objects
  #
  # returns a list of freshly made feature extractor objects,
  # one per selected extractor class
  def get_extractor_objects
    @features.map { |feature_class| feature_class.new(@exp) }
  end
end
|
59
|
-
|
60
|
-
##################################
|
61
|
-
# Abstract base class of all Fred feature extractors.
#
# Subclasses override the class method feature_name() and the instance
# method each_feature(), and call announce_me() in their class body to
# register with FredFeatureInfo.
class FredFeatureExtractor
  ###
  # feature name:
  # name by which you choose this feature
  # in the experiment file
  #
  # raises RuntimeError unless overridden by the subclass
  def self.feature_name
    raise "Overwrite me."
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    @exp = exp
  end

  ###
  # compute features from meta-features
  #
  # argument: hash
  # metafeature_label -> metafeatures
  # string -> array:string
  #
  # yields each feature as a string
  #
  # raises RuntimeError unless overridden by the subclass
  def each_feature(feature_hash)
    raise "overwrite me"
  end

  ###
  # announce_me
  #
  # register the calling class with FredFeatureInfo, if that collector
  # class has been loaded; otherwise do nothing.
  #
  # This is a singleton method, so a 'protected' marker in the class body
  # would not apply to it; it has to stay callable with an explicit
  # receiver because subclasses invoke it that way in their class bodies.
  def self.announce_me
    # no interface collector class: nothing to announce to
    return unless defined?(FredFeatureInfo)

    # Pass the class object itself. Resolving it through eval(name)
    # is unnecessary and fails for classes without a name.
    FredFeatureInfo.add_feature(self)
  end
end
|
103
|
-
|
104
|
-
#####
|
105
|
-
# context feature
|
106
|
-
# Context feature: for every context size chosen in the experiment file,
# yields the metafeature label plus one field of each context item.
class FredContextFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "context"
  end

  ###
  def initialize(exp)
    super(exp)

    # cxsizes: metafeature labels ("CX" + size) of the context sizes
    # chosen as features, kept as hash keys for fast lookup
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true
    end
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # yields each feature as a string
  def each_feature(feature_hash)
    # index of the "#"-separated field that is used as feature value.
    # The original comment gives the item layout as grf#word#lemma#pos#ne.
    # NOTE(review): the sibling context extractors describe context items
    # as word#lemma#pos#ne and use index 1 for the lemma -- confirm which
    # layout CX items really have.
    lemma_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context metafeatures of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |item|
        # skip items containing a run of five separators
        next if item =~ /#####/

        yield ftype + item.split("#")[lemma_index]
      end
    end
  end
end
|
144
|
-
|
145
|
-
#####
|
146
|
-
# context feature: POS separately, small contexts only
|
147
|
-
# Context feature: POS separately, small contexts only
# (context sizes up to 10).
class FredContextPOSFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "context_pos"
  end

  ###
  def initialize(exp)
    super(exp)

    # cxsizes: metafeature labels ("CX" + size) of the chosen context
    # sizes that are small enough, kept as hash keys for fast lookup
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true if size <= 10
    end

    return unless @cxsizes.empty?

    $stderr.puts "context_pos feature warning: will not be computed"
    $stderr.puts "as there is no context of size <= 10"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # yields each feature as a string: "POS" + label + POS field
  def each_feature(feature_hash)
    # context item layout: word#lemma#pos#ne
    pos_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context metafeatures of a size chosen by the user
      next unless @cxsizes[ftype]

      fvalues.each do |item|
        yield "POS" + ftype + item.split("#")[pos_index]
      end
    end
  end
end
|
190
|
-
|
191
|
-
#####
|
192
|
-
# bigram/trigram feature
|
193
|
-
# Bigram/trigram feature: lemma and POS n-grams taken from the middle of
# one context window.
class FredNgramFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "ngram"
  end

  ###
  def initialize(exp)
    super(exp)

    # cxsize: the first configured context size that is at least 2;
    # the ngram features are computed from that context
    @cxsize = @exp.get_lf("feature", "context").detect { |size| size >= 2 }
    return if @cxsize

    $stderr.puts "Warning: no context of size >= 2, so"
    $stderr.puts "no ngram feature computed."
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # yields each feature as a string: n-gram label + concatenated fields
  def each_feature(feature_hash)
    # context item layout: word#lemma#pos#ne
    lemma_index = 1
    pos_index = 2

    # each entry: [offsets relative to @cxsize, feature label, field index]
    ngram_specs = [
      [[-1, 0], "BLEM", lemma_index],     # bigram of lemmas
      [[-1, 0], "BPOS", pos_index],       # bigram of POSs
      [[-2, -1, 0], "TLEM", lemma_index], # trigram of lemmas
      [[-2, -1, 0], "TPOS", pos_index]    # trigram of POSs
    ]

    feature_hash.each do |ftype, fvalues|
      next unless ftype == "CX" + @cxsize.to_s

      # compute the ngram features from this context
      # |fvalues| = 2*cxsize, that is, cxsize describes
      # the length of a one-sided context window
      # the bigram of features around the target
      # concerns fvalues[cxsize-1] and fvalues[cxsize]
      # the trigram of two words before, one word after includes
      # fvalues[cxsize-2], fvalues[cxsize-1] and fvalues[cxsize]
      ngram_specs.each do |offsets, label, subindex|
        window = offsets.map { |offset| fvalues[@cxsize + offset] }.compact

        # only yield if entries exist for all the given indices
        next unless window.length == offsets.length

        yield label + window.map { |item| item.split("#")[subindex] }.join
      end
    end
  end
end
|
248
|
-
|
249
|
-
|
250
|
-
#####
|
251
|
-
# syntax feature
|
252
|
-
# Syntax feature: the grammatical function field of the
# "CH", "PA" and "SI" metafeatures.
class FredSynFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "syntax"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # yields each feature as a string: label + grammatical function
  def each_feature(feature_hash)
    # position of the grammatical function within the "#"-separated item,
    # per syntactic metafeature label.
    # "SI" items are parentlemma#grf#word#lemma#pos#ne, hence index 1.
    grf_index_for = { "CH" => 0, "PA" => 0, "SI" => 1 }

    feature_hash.each do |ftype, fvalues|
      grf_index = grf_index_for[ftype]
      # not a syntactic metafeature
      next if grf_index.nil?

      fvalues.each do |item|
        yield ftype + item.split("#")[grf_index]
      end
    end
  end
end
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
#####
|
291
|
-
# syntax-plus-headword feature
|
292
|
-
# Syntax-plus-headword feature: the "CH", "PA" and "SI" metafeatures
# with their full item description.
class FredSynsemFeatureExtractor < FredFeatureExtractor
  announce_me

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "synsem"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # yields each feature as a string: label + "SEM" + item description
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      if ftype == "CH" || ftype == "PA"
        # grf#word#lemma#pos#ne: used as is
        fvalues.each { |item| yield ftype + "SEM" + item }
      elsif ftype == "SI"
        # parentlemma#grf#word#lemma#pos#ne:
        # remove parent lemma
        fvalues.each do |item|
          without_parent = item.split("#")[1..-1].join("#")
          yield ftype + "SEM" + without_parent
        end
      end
      # anything else is not a syntax feature
    end
  end
end
|