frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,1061 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require "tempfile"
|
3
|
+
require "delegate"
|
4
|
+
|
5
|
+
require "fred/FredFeatureExtractors"
|
6
|
+
|
7
|
+
########################################
|
8
|
+
########################################
|
9
|
+
# Feature access classes:
|
10
|
+
# read and write features
|
11
|
+
class AbstractFredFeatureAccess
  ###
  # Common base for feature readers/writers.
  #
  # exp::     experiment file object
  # dataset:: "train" or "test"
  # mode::    file access mode: "r" (read), "w" (write) or "a" (append)
  #
  # Exits the process (status 1) on an unknown mode.
  def initialize(exp, dataset, mode = "r")
    @exp = exp
    @dataset = dataset
    @mode = mode

    # accept only the three known access modes
    return if %w[r w a].include?(@mode)

    $stderr.puts "FeatureAccess: unknown mode #{@mode}."
    exit 1
  end

  ###
  # Delete all feature files -- subclasses must supply this.
  def self.remove_feature_files
    raise "overwrite me"
  end

  ###
  # Record one occurrence of a target lemma -- subclasses must supply this.
  #
  # lemma::    string: lemma
  # pos::      string: POS
  # ids::      array:string: unique IDs of this occurrence of the lemma
  # sid::      string: sentence ID
  # sense::    string: sense
  # features:: hash feature type -> features (string -> array:string)
  def write_item(lemma, pos, ids, sid, sense, features)
    raise "overwrite me"
  end

  ###
  # Write out any buffered data -- subclasses must supply this.
  def flush
    raise "overwrite me"
  end
end
|
47
|
+
|
48
|
+
########################################
|
49
|
+
# MetaFeatureAccess:
|
50
|
+
# write all featurization data to one gzipped file,
|
51
|
+
# directly writing the meta-features as they come
|
52
|
+
# format:
|
53
|
+
#
|
54
|
+
# lemma pos id sense
|
55
|
+
# <feature_type>: <features>
|
56
|
+
#
|
57
|
+
# where feature_type is a word, and features is a list of words, space-separated
|
58
|
+
class MetaFeatureAccess < AbstractFredFeatureAccess
  ###
  # Open the gzipped meta-feature file for this experiment/dataset.
  #
  # exp::     experiment file object
  # dataset:: "train" or "test"
  # mode::    "r", "w" or "a" (already validated by the superclass)
  def initialize(exp, dataset, mode)
    super(exp, dataset, mode)

    @filename = MetaFeatureAccess.filename(@exp, @dataset)

    # make filename for writing features
    case @mode
    when "w", "a", "r"
      # read or write access
      @f = FileZipped.new(@filename, mode)
    else
      # NOTE(review): the superclass constructor already exits on an
      # unknown mode, so this branch is only a safety net.
      $stderr.puts "MetaFeatureAccess error: illegal mode #{mode}"
      exit 1
    end
  end

  ####
  # Path of the meta-feature file for the given experiment/dataset.
  def MetaFeatureAccess.filename(exp, dataset, mode = "new")
    return fred_dirname(exp, dataset, "meta_features", mode) +
           "meta_features.txt.gz"
  end

  ####
  # Delete the meta-feature file, if present.
  def MetaFeatureAccess.remove_feature_files(exp, dataset)
    filename = MetaFeatureAccess.filename(exp, dataset)
    # File.exists? is deprecated (removed in Ruby 3.2); use File.exist?
    if File.exist?(filename)
      File.delete(filename)
    end
  end

  ###
  # read items, yield one at a time
  #
  # format: tuple consisting of
  # - target_lemma: string
  # - target_pos: string
  # - target_ids: array:string
  # - target SID: string, sentence ID
  # - target_senses: array:string
  # - feature_hash: feature_type->values, string->array:string
  #
  # NOTE(review): the same feature_hash object is yielded for every item
  # and cleared between items -- consumers must not keep a reference to
  # it across iterations.
  def each_item()
    unless @mode == "r"
      $stderr.puts "MetaFeatureAccess error: cannot read file not opened for reading"
      exit 1
    end

    lemma = pos = sid = ids = senses = nil

    feature_hash = Hash.new()

    @f.each { |line|
      line.chomp!
      if line =~ /^\s/
        # line starts with whitespace: continues description of previous item
        # that is, if we have a previous item
        #
        # format of line:
        # feature_type: feature feature feature ...
        # as in
        # CH: SB#expansion#expansion#NN# OA#change#change#NN#
        unless lemma
          $stderr.puts "MetaFeatureAccess error: unexpected leading whitespace"
          $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
          $stderr.puts line
          next
        end

        feature_type, *features = line.split()

        unless feature_type =~ /^(.*):$/
          # feature type should end in ":"
          $stderr.puts "MetaFeatureAccess error: feature type should end in ':' but doesn't"
          $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
          $stderr.puts line
          next
        end

        # strip the trailing ":" from the feature type
        feature_hash[feature_type[0..-2]] = features

      else
        # first line of item.
        #
        # format:
        # lemma POS IDs sid senses
        #
        # as in:
        # cause verb 2-651966_8 2-651966 Causation

        # first yield previous item
        if lemma
          yield [lemma, pos, ids, sid, senses, feature_hash]
        end

        # then start new item:
        # IDs and senses are "::"-joined lists with ":" escaped as COLON
        lemma, pos, ids_s, sid, senses_s = line.split()
        ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
        senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }

        # reset feature hash
        feature_hash.clear()
      end
    }

    # one more item to yield?
    if lemma
      yield [lemma, pos, ids, sid, senses, feature_hash]
    end
  end

  ###
  # Write one item to the meta-feature file.
  #
  # lemma::    string: target lemma
  # pos::      string: target pos
  # ids::      array:string: unique IDs of this occurrence of the lemma
  # sid::      string: sentence ID
  # senses::   array:string: sense
  # features:: hash feature type -> features (string -> array:string)
  #
  # Items without a lemma or without IDs are silently skipped.
  def write_item(lemma, pos, ids, sid, senses, features)
    unless ["w", "a"].include? @mode
      $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
      exit 1
    end

    if not(lemma) or lemma.empty? or not(ids) or ids.empty?
      # nothing to write
      return
    end
    if pos.nil? or pos.empty?
      # POS unknown
      pos = ""
    end
    unless senses
      senses = [ @exp.get("noval") ]
    end

    # ":" inside IDs/senses is escaped as COLON, entries joined with "::"
    ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
    senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")

    @f.puts "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s}"
    # continuation lines start with whitespace (see each_item)
    features.each_pair { |feature_type, f_list|
      @f.puts " #{feature_type}: " + f_list.map { |f| f.to_s() }.join(" ")
    }
    @f.flush()
  end

  ###
  # Flush buffered data. Only legal in write/append mode; the underlying
  # FileZipped handle is flushed after every write_item already, so
  # there is nothing to do here.
  def flush()
    unless ["w", "a"].include? @mode
      $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
      exit 1
    end

    # actually, nothing to be done here
  end

end
|
224
|
+
|
225
|
+
|
226
|
+
########################################
|
227
|
+
# FredFeatureWriter:
|
228
|
+
# write chosen features (according to the experiment file)
|
229
|
+
# to
|
230
|
+
# - one file per lemma for n-ary classification
|
231
|
+
# - one file per lemma/sense pair for binary classification
|
232
|
+
#
|
233
|
+
# format: CSV, last entry is target class
|
234
|
+
class FredFeatureAccess < AbstractFredFeatureAccess
  ###
  # exp::     experiment file object
  # dataset:: "train" or "test"
  # mode::    "r", "w" or "a" (validated by the superclass)
  def initialize(exp, dataset, mode)
    super(exp, dataset, mode)

    # write to auxiliary files first,
    # to sort items by lemma
    @w_tmp = AuxKeepWriters.new()

    # which features has the user requested?
    feature_info_obj = FredFeatureInfo.new(@exp)
    @feature_extractors = feature_info_obj.get_extractor_objects()
  end

  ####
  # Remove all output of a previous featurization run:
  # feature files and answer key files.
  def FredFeatureAccess.remove_feature_files(exp, dataset)
    # remove feature files
    WriteFeaturesNaryOrBinary.remove_files(exp, dataset)

    # remove key files
    AnswerKeyAccess.remove_files(exp, dataset)
  end

  ###
  # Name of the legend file recording the feature-vector layout
  # for one lemma/POS pair.
  def FredFeatureAccess.legend_filename(lemmapos)
    return "fred.feature_legend.#{lemmapos}"
  end

  ###
  # Directory holding the feature files for this experiment/dataset.
  def FredFeatureAccess.feature_dir(exp, dataset)
    return WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
  end

  ###
  # each feature file:
  # iterate through feature files,
  # yield pairs [filename, values]
  # where 'values' is a hash containing keys
  # 'lemma' and potentially 'sense'
  #
  # filenames are sorted alphabetically before being yielded
  #
  # available in read and write mode
  def FredFeatureAccess.each_feature_file(exp, dataset)
    feature_dir = FredFeatureAccess.feature_dir(exp, dataset)
    Dir[feature_dir + "*"].sort().each { |filename|
      if (values = deconstruct_fred_feature_filename(filename))
        yield [filename, values]
      end
    }
  end

  ###
  # write item:
  # - transform meta-features into actual features as requested
  #   in the experiment file
  # - write item to tempfile, don't really write yet
  #
  # lemma::    string: target lemma
  # pos::      string: target pos
  # ids::      array:string: unique IDs of this occurrence of the lemma
  # sid::      string: sentence ID
  # senses::   array:string: sense
  # features:: hash feature type -> features (string -> array:string)
  def write_item(lemma, pos, ids, sid, senses, features)
    unless ["w", "a"].include? @mode
      $stderr.puts "FredFeatures error: cannot write to feature file opened for reading"
      exit 1
    end

    if lemma.nil? or lemma.empty? or ids.nil? or ids.empty?
      # nothing to write
      return
    end
    if pos.nil? or pos.empty?
      # POS unknown
      pos = ""
    end

    # NOTE(review): the "noval" fallback was flagged in the original as
    # belonging to rosy, not fred -- kept for backward compatibility.
    unless senses
      senses = [ @exp.get("noval") ]
    end

    # modified by ines, 19.7.2010:
    # a single empty sense is mapped to the placeholder sense "NONE".
    # Fixed: this used to assign the *string* "NONE" instead of a
    # one-element array, which left senses_s empty below and shifted
    # the field layout of the temp-file line (the problem noted in
    # parse_temp_itemline: "senses is empty, takes context features
    # instead").
    if senses.length == 1 and senses[0].eql? ""
      senses = ["NONE"]
    end

    writer = @w_tmp.get_writer_for(fred_lemmapos_combine(lemma, pos))

    # ":" inside IDs/senses is escaped as COLON, entries joined with "::"
    ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
    senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
    writer.print "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s} "

    # write all features
    @feature_extractors.each { |extractor|
      extractor.each_feature(features) { |feature|
        writer.print feature, " "
      }
    }
    writer.puts
    writer.flush()
  end

  ###
  # Really write the features collected in the per-lemma tempfiles:
  # for each lemma, determine (train) or read (test) the feature-vector
  # legend, then write the featurization file and the answer key file.
  def flush()
    unless ["w", "a"].include? @mode
      $stderr.puts "FredFeatureAccess error: cannot write to feature file opened for reading"
      exit 1
    end

    # elements in the feature vector: get fixed with the training data,
    # get read with the test data.
    # get stored in feature_legend_dir
    case @dataset
    when "train"
      feature_legend_dir = File.new_dir(fred_classifier_directory(@exp),
                                        "legend")
    when "test"
      feature_legend_dir = File.existing_dir(fred_classifier_directory(@exp),
                                             "legend")
    end

    # now really write features
    @w_tmp.flush()
    @w_tmp.get_lemmas().sort().each { |lemmapos|

      # inform user
      $stderr.puts "Writing #{lemmapos}..."

      # prepare list of features to use in the feature vector:
      legend_filename = feature_legend_dir + FredFeatureAccess.legend_filename(lemmapos)

      case @dataset
      when "train"
        # training data:
        # determine feature list and sense list from the data,
        # and store in the relevant file
        feature_list, sense_list = collect_feature_list(lemmapos)
        begin
          f = File.new(legend_filename, "w")
        rescue
          # fixed: String + Exception raises TypeError; interpolate instead
          $stderr.puts "Error: Could not write to feature legend file #{legend_filename}: #{$!}"
          exit 1
        end
        f.puts feature_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
        f.puts sense_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
        f.close()

      when "test"
        # test data:
        # read feature list and sense list from the relevant file
        begin
          f = File.new(legend_filename)
        rescue
          # fixed: String + Exception raises TypeError; interpolate instead
          $stderr.puts "Error: Could not read feature legend file #{legend_filename}: #{$!}"
          $stderr.puts "Skipping this lemma."
          next
        end
        feature_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
        sense_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
        # fixed: close the legend file (was leaked before f was reassigned)
        f.close()
      end

      # write
      # - featurization file
      # - answer key file

      f = @w_tmp.get_for_reading(lemmapos)
      answer_obj = AnswerKeyAccess.new(@exp, @dataset, lemmapos, "w")

      obj_out = WriteFeaturesNaryOrBinary.new(lemmapos, @exp, @dataset)

      f.each { |line|
        lemma, pos, ids, sid, senses, features = parse_temp_itemline(line)
        unless lemma
          # something went wrong in parsing the line
          next
        end
        each_sensegroup(senses, sense_list) { |senses_for_item, original_senses|
          # write answer key
          answer_obj.write_line(lemma, pos,
                                ids, sid, original_senses, senses_for_item)

          # write item: features, senses
          obj_out.write_instance(to_feature_list(features, feature_list),
                                 senses_for_item)
        } # each sensegroup
      } # each input line
      obj_out.close()
      answer_obj.close()
      @w_tmp.discard(lemmapos)
    } # each lemma
  end

  ##################
  protected

  ###
  # read temp feature file for the given lemma/pos
  # and determine the list of all features and the list of all senses,
  # each sorted alphabetically
  #
  # Returns [feature_list, sense_list]; the shape of feature_list
  # depends on the "numerical_features" experiment setting.
  def collect_feature_list(lemmapos)
    # read entries for this lemma
    f = @w_tmp.get_for_reading(lemmapos)

    # keep a record of all senses and features
    # senses: binary.
    # features: keep the max. number of times a given feature occurred
    # in an instance
    all_senses = Hash.new()
    all_features = Hash.new(0)
    features_this_instance = Hash.new(0)
    # record how often each feature occurred all in all
    num_occ = Hash.new(0)
    num_lines = 0

    f.each { |line|
      lemma, pos, id_string, sid, senses, features = parse_temp_itemline(line)

      unless lemma
        # something went wrong in parsing the line
        # print out the file contents for reference, then leave
        $stderr.puts "Could not read temporary feature file #{f.path()} for #{lemmapos}."
        exit 1
      end
      num_lines += 1
      senses.each { |s| all_senses[s] = true }
      features_this_instance.clear()
      features.each { |fea|
        features_this_instance[fea] += 1
        num_occ[fea] += 1
      }

      features_this_instance.each_pair { |feature, _value|
        all_features[feature] = [ all_features[feature], features_this_instance[feature] ].max()
      }
    }

    # HIER
    # if num_lines > 2
    #   num_occ.each_pair { |feature, num_occ|
    #     if num_occ < 2
    #       all_features.delete(feature)
    #     end
    #   }
    # end

    case @exp.get("numerical_features")
    when "keep"
      # leave numerical features as they are, or
      # don't do numerical features
      return [ all_features.keys().sort(),
               all_senses.keys().sort()
             ]

    when "repeat"
      # repeat: turn numerical feature with max. value N
      # into N binary features
      feature_list = Array.new()
      all_features.keys().sort().each { |feature|
        all_features[feature].times() { |index|
          feature_list << feature + " #{index}/#{all_features[feature]}"
        }
      }
      return [ feature_list,
               all_senses.keys().sort()
             ]

    when "bin"
      # make bins:
      # number of bins = (max. number of occurrences of a feature per item) / 10
      feature_list = Array.new()
      all_features.keys().sort().each { |feature|
        num_bins_this_feature = (all_features[feature].to_f() / 10.0).ceil().to_i()

        num_bins_this_feature.times { |index|
          feature_list << feature + " #{index}/#{num_bins_this_feature}"
        }
      }
      return [ feature_list,
               all_senses.keys().sort()
             ]
    else
      raise "Shouldn't be here"
    end
  end

  ###
  # given a full sorted list of items and a partial list of items,
  # match the partial list to the full list,
  # that is, produce as many items as the full list has
  # yielding 0 where the partial entry is not in the full list,
  # and > 0 otherwise
  #
  # Note that if partial contains items not in full,
  # they will not occur on the feature list returned!
  def to_feature_list(partial, full,
                      handle_numerical_features = nil)

    # count occurrences of each feature in the partial list
    occ_hash = Hash.new(0)
    partial.each { |p|
      occ_hash[p] += 1
    }

    # what to do with our counts?
    unless handle_numerical_features
      # no pre-set value given when this function was called
      handle_numerical_features = @exp.get("numerical_features")
    end

    case handle_numerical_features
    when "keep"
      # leave numerical features as numerical features
      return full.map { |x|
        occ_hash[x].to_s()
      }

    when "repeat"
      # repeat each numerical feature up to a max. number of occurrences
      return full.map { |feature_plus_count|
        unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
          $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
          raise "Shouldn't be here."
        end

        feature = $1
        current_count = $2.to_i()
        max_num = $3.to_i()

        if occ_hash[feature] > current_count
          1
        else
          0
        end
      }

    when "bin"
      # group numerical feature values into N bins.
      # number of bins varies from feature to feature
      # each bin contains 10 different counts
      return full.map { |feature_plus_count|
        unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
          $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
          raise "Shouldn't be here."
        end

        feature = $1
        current_count = $2.to_i()
        max_num = $3.to_i()

        # NOTE(review): this condition looks inconsistent with the
        # "each bin contains 10 counts" comment (compare the companion
        # "repeat" branch); kept as-is to preserve behavior -- verify.
        if occ_hash[feature] % 10 > (10 * current_count)
          1
        else
          0
        end
      }
    else
      raise "Shouldn't be here"
    end
  end

  ###
  # how to treat instances with multiple senses?
  # - either write one item per sense
  # - or combine all senses into one string
  # - or keep as separate senses
  #
  # according to 'handle_multilabel' in the experiment file
  #
  # yields pairs of [senses, original_senses]
  # both are arrays of strings
  def each_sensegroup(senses, full_sense_list)
    case @exp.get("handle_multilabel")
    when "keep"
      yield [senses, senses]
    when "join"
      yield [ [fred_join_senses(senses)], senses]
    when "repeat"
      senses.each { |s|
        yield [ [s], senses]
      }
    when "binarize"
      yield [ senses, senses ]
    else
      # fixed: was "#{exp.get(...)}" (undefined local) and listed the
      # nonexistent option 'binary' instead of 'binarize'
      $stderr.puts "Error: unknown setting #{@exp.get("handle_multilabel")}"
      $stderr.puts "for 'handle_multilabel' in the experiment file."
      $stderr.puts "Please choose one of 'binarize', 'keep', 'join', 'repeat'"
      $stderr.puts "or leave unset -- default is 'binarize'."
      exit 1
    end
  end

  ###
  # Parse one line of a per-lemma tempfile (written by write_item).
  #
  # Returns [lemma, pos, ids, sid, senses, features] with ids and
  # senses as arrays of strings, or nil if the line is too short.
  def parse_temp_itemline(line)
    lemma, pos, ids_s, sid, senses_s, *features = line.split()
    unless senses_s
      # features may be empty, but we need senses
      $stderr.puts "FredFeatures Error in word sense item line: too short."
      $stderr.puts ">>#{line}<<"
      return nil
    end

    ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
    senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }

    return [lemma, pos, ids, sid, senses, features]
  end

end
|
663
|
+
|
664
|
+
########################################
|
665
|
+
# read and write access to answer key files
|
666
|
+
# manages a single answer key file for a given lemma/POS pair
|
667
|
+
class AnswerKeyAccess
|
668
|
+
###
|
669
|
+
def initialize(exp, # experiment file object
|
670
|
+
dataset, # "train", "test"
|
671
|
+
lemmapos, # lemma + POS (one string)
|
672
|
+
mode, # "r", "w", "a"
|
673
|
+
split_id = nil,
|
674
|
+
split_dataset = nil)
|
675
|
+
unless ["r", "w", "a"].include? mode
|
676
|
+
$stderr.puts "FredFeatures error: AnswerKeyAccess initialized with mode #{mode}."
|
677
|
+
exit 1
|
678
|
+
end
|
679
|
+
|
680
|
+
@mode = mode
|
681
|
+
|
682
|
+
answer_filename = fred_dirname(exp, dataset, "keys", "new") +
|
683
|
+
fred_answerkey_filename(lemmapos)
|
684
|
+
|
685
|
+
# are we reading the whole answer key file, or only the test part
|
686
|
+
# of a split of it?
|
687
|
+
if split_id
|
688
|
+
# we are accessing part of a split
|
689
|
+
# we can only do that when reading!
|
690
|
+
unless @mode == "r"
|
691
|
+
$stderr.puts "AnswerKeyAccess error: cannot access split answer file in write mode."
|
692
|
+
exit 1
|
693
|
+
end
|
694
|
+
|
695
|
+
# apply_split returns a closed temporary file
|
696
|
+
split_obj = FredSplitPkg.new(exp)
|
697
|
+
@f = split_obj.apply_split(answer_filename, lemmapos, split_dataset, split_id)
|
698
|
+
if @f.nil?
|
699
|
+
# the split_dataset part of the split doesn't contain any data
|
700
|
+
$stderr.puts "Warning: no #{split_dataset} data for lemma #{lemmapos}"
|
701
|
+
else
|
702
|
+
@f.open()
|
703
|
+
end
|
704
|
+
|
705
|
+
else
|
706
|
+
# we are reading the whole thing
|
707
|
+
begin
|
708
|
+
@f = File.new(answer_filename, @mode)
|
709
|
+
rescue
|
710
|
+
@f = nil
|
711
|
+
end
|
712
|
+
end
|
713
|
+
end
|
714
|
+
|
715
|
+
###
|
716
|
+
def write_line(lemma, # string: lemma
|
717
|
+
pos, # string: POS
|
718
|
+
ids, # array:string: target IDs
|
719
|
+
sid, # string: sentence ID
|
720
|
+
senses, # array:string: senses
|
721
|
+
senses_this_item) # array:string: senses for this item
|
722
|
+
unless ["w", "a"].include? @mode
|
723
|
+
$stderr.puts "FredFeatures error: AnswerKeyAccess: cannot write in read mode."
|
724
|
+
exit 1
|
725
|
+
end
|
726
|
+
unless @f
|
727
|
+
raise "Shouldn't be here"
|
728
|
+
end
|
729
|
+
|
730
|
+
# write answer key:
|
731
|
+
# lemma POS ID senses
|
732
|
+
if senses.include? nil or senses.include? ""
|
733
|
+
raise "empty sense"
|
734
|
+
end
|
735
|
+
if senses_this_item.include? nil or senses_this_item.include? ""
|
736
|
+
raise "empty sense for this item"
|
737
|
+
end
|
738
|
+
|
739
|
+
senses_s = senses.map { |s| s.gsub(/,/, "COMMA")}.join(",")
|
740
|
+
senses_ti_s = senses_this_item.map { |s|
|
741
|
+
s.gsub(/,/, "COMMA")}.join(",")
|
742
|
+
id_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
|
743
|
+
|
744
|
+
@f.puts "#{lemma} #{pos} #{id_s} #{sid} #{senses_s} #{senses_ti_s}"
|
745
|
+
end
|
746
|
+
|
747
|
+
###
# yield one line at a time:
# tuple (lemma, POS, ids, sentence_ID, all_assigned_senses, transformed_senses_for_this_item)
# The masking applied by write_line() (COMMA, COLON) is undone here.
def each()
  unless @mode == "r"
    # bug fix: error message had a typo ("AnsewrKeyAccess");
    # note this only warns, it does not abort (kept as before)
    $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot read in write mode"
  end
  unless @f
    # something went wrong during initialization:
    # split didn't contain data
    return
  end

  @f.each { |line|
    lemma, pos, id_s, sid, senses_s, senses_this_item_s = line.split()

    # undo the masking applied by write_line()
    ids = id_s.split("::").map { |i| i.gsub(/COLON/, ":") }
    senses = senses_s.split(",").map { |s| s.gsub(/COMMA/, ",") }
    senses_this_item = senses_this_item_s.split(",").map { |s|
      s.gsub(/COMMA/, ",") }

    yield [lemma, pos, ids, sid, senses, senses_this_item]
  }
end
|
772
|
+
|
773
|
+
###
# close the underlying answer key file
def close()
  @f.close
end
|
777
|
+
|
778
|
+
###
# remove all answer key files for the given experiment/dataset
# (filenames come from fred_dirname / fred_answerkey_filename,
# defined elsewhere in this package)
def AnswerKeyAccess.remove_files(exp, dataset)
  Dir[fred_dirname(exp, dataset, "keys", "new") + fred_answerkey_filename("*")].each { |filename|
    # bug fix: File.exists? is deprecated (removed in Ruby 3.2);
    # File.exist? is the supported spelling and behaves identically
    if File.exist?(filename)
      File.delete(filename)
    end
  }
end
|
786
|
+
end
|
787
|
+
|
788
|
+
|
789
|
+
####################
# keep writers: auxiliary class for FredFeatureAccess:
# write to several files at a time
# in tempfiles.
#
# Keeps at most @size files open simultaneously so we don't run out
# of OS file handles when there are many lemma/POS combinations.
class AuxKeepWriters
  ###
  def initialize()
    # lemmapos -> Tempfile (the file may currently be closed)
    @lemma2temp = Hash.new()
    # maximum number of simultaneously open writers
    @size = 50
    # [lemmapos, tempfile] pairs with open file handles, oldest first
    @writers = Array.new()
  end

  ##
  # close all currently open writers; the data stays in the temp
  # files and can be reopened via get_for_reading()/get_writer_for()
  def flush()
    @writers.each { |lemmapos, writer|
      writer.close()
    }
    # bug fix: forget the closed handles. Otherwise get_writer_for()
    # would later find the stale pair and hand out a file positioned
    # at the start, overwriting earlier data.
    @writers.clear()
  end

  ##
  # all lemma/pos keys we have temp files for
  def get_lemmas()
    return @lemma2temp.keys()
  end

  ##
  # reopen the temp file for reading from the start,
  # or nil if we have no file for this lemma/pos
  def get_for_reading(lemmapos)
    if @lemma2temp[lemmapos]
      # we have a writer for this:
      # close, then reopen; Tempfile#open reopens in "r+" at position 0
      @lemma2temp[lemmapos].close()
      @lemma2temp[lemmapos].open()
      return @lemma2temp[lemmapos]
    else
      # no writer for this
      return nil
    end
  end

  ##
  # finally close temp file, remove information for lemma/pos
  def discard(lemmapos)
    if @lemma2temp[lemmapos]
      # close(true) also unlinks the temp file
      @lemma2temp[lemmapos].close(true)
      @lemma2temp.delete(lemmapos)
      # bug fix: also drop it from the open-writers list, otherwise
      # get_writer_for() could return the deleted file object
      @writers.delete_if { |lp, _writer| lp == lemmapos }
    end
  end

  ##
  # return an open writer (positioned at end-of-file) for this
  # lemma/pos, creating a new temp file if necessary; may evict the
  # oldest open writer to stay within @size open files
  def get_writer_for(lemmapos)

    # is there a temp file for this lemma/pos combination?
    unless @lemma2temp[lemmapos]
      @lemma2temp[lemmapos] = Tempfile.new("fred_features")
      @lemma2temp[lemmapos].close()
    end

    # is there an open temp file for this lemma/pos combination?
    pair = @writers.assoc(lemmapos)
    if pair
      return pair.last()
    end

    # no: open the temp file, kick some other temp file out of the
    # @writers list
    writer = @lemma2temp[lemmapos]
    writer.open()

    # writer: open for appending
    writer.seek(0, IO::SEEK_END)

    @writers << [lemmapos, writer]
    if @writers.length() > @size
      # close file associated with first (oldest) writer
      @writers.first.last.close()
      @writers.shift()
    end
    return writer
  end

  ###
  # close and delete all temp files
  def remove_files()
    @lemma2temp.each_value { |x|
      x.close(true)
    }
  end
end
|
878
|
+
|
879
|
+
##############
# write features,
# either lemma-wise
# or lemma+sense-wise
# if lemma+sense-wise, write as binary classifier,
# i.e. map the target senses
#
# Use Delegator.

###
# Features for N-ary classifiers:
# one feature file per lemma; each line holds the comma-separated
# feature values followed by the gold sense(s)
class WriteFeaturesNary
  # exp provides the "handle_multilabel" setting;
  # dataset is accepted for interface parity with WriteFeaturesBinary
  def initialize(lemma,
                 exp,
                 dataset,
                 feature_dir)
    @filename = feature_dir + fred_feature_filename(lemma)
    @f = File.new(@filename, "w")
    @handle_multilabel = exp.get("handle_multilabel")
  end

  # write one featurized instance plus its gold sense(s);
  # commas/semicolons inside values are masked so they cannot
  # break the comma-separated format
  def write_instance(features, senses)
    masked_features = features.map do |feature|
      feature.to_s.gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
    end
    @f.print masked_features.join(",")

    if @handle_multilabel == "keep"
      # possibly more than one sense:
      # separate the sense block from the features by a semicolon
      # and hope that the classifier knows this
      masked_senses = senses.map do |sense|
        sense.to_s.gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
      end
      @f.print ";"
      @f.puts masked_senses.join(",")
    else
      # one sense: just append it as the last comma-separated column
      @f.print ","
      @f.puts senses.first.to_s.gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
    end
  end

  def close()
    @f.close
  end
end
|
927
|
+
|
928
|
+
###
# Features for binary classifiers:
# one feature file per lemma/sense pair. The last column of each line
# is the sense itself (positive instance) or the negative sense label
# (negative instance).
class WriteFeaturesBinary
  def initialize(lemma,
                 exp,
                 dataset,
                 feature_dir)
    @dir = feature_dir
    @lemma = lemma
    @feature_dir = feature_dir

    # label written for negative instances; default "NONE"
    @negsense = exp.get("negsense") || "NONE"

    # files: sense -> open feature file
    @files = Hash.new()

    # keep all instances such that, when a new sense comes around,
    # we can write them for that sense
    @instances = Array.new()
  end

  # write one featurized instance with its gold senses to
  # every sense file, opening files for unseen senses first
  def write_instance(features, senses)
    # sense we haven't seen before? Then we need to
    # write the whole featurization file for that new sense
    check_for_presence_of_senses(senses)

    # write this new instance for all senses
    @files.each_key do |sense_of_file|
      write_to_sensefile(features, senses, sense_of_file)
    end

    # store instance in case another sense crops up later
    @instances << [features, senses]
  end

  ###
  def close()
    @files.each_value do |file|
      file.close()
    end
  end

  ######
  private

  # ensure each given sense has a feature file; a newly opened file
  # is back-filled with all previously stored instances
  def check_for_presence_of_senses(senses)
    senses.each do |sense|
      # do we have a sense file for this sense?
      next if @files[sense]

      # open new file for this sense
      @files[sense] = File.new(@feature_dir + fred_feature_filename(@lemma, sense, true), "w")

      # and re-write all previous instances for it
      @instances.each do |prev_features, prev_senses|
        write_to_sensefile(prev_features, prev_senses, sense)
      end
    end
  end

  ###
  # write one instance line to the file of the given sense,
  # binarizing the target class in the last column
  def write_to_sensefile(features, senses,
                         sense_of_file)
    # file to write to
    sense_file = @files[sense_of_file]

    # print features, masking commas inside the values
    sense_file.print(features.map { |feature|
                       feature.to_s.gsub(/,/, "COMMA")
                     }.join(","))
    sense_file.print ","

    # binarize target class
    if senses.include? sense_of_file
      sense_file.puts sense_of_file.to_s
    else
      sense_file.puts @negsense
    end
  end
end
|
1018
|
+
|
1019
|
+
########
# class writing features:
# delegating to either a binary or an n-ary writer,
# depending on the "binary_classifiers" experiment setting
class WriteFeaturesNaryOrBinary < SimpleDelegator
  ###
  # lemma: string
  # exp: experiment configuration object (provides get())
  # dataset: dataset name, e.g. "train"/"test"
  def initialize(lemma,
                 exp,
                 dataset)
    feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")

    if exp.get("binary_classifiers")
      # binary classifiers:
      # delegate writing to the binary feature writer
      @writer = WriteFeaturesBinary.new(lemma, exp, dataset, feature_dir)
    else
      # n-ary classifiers:
      # delegate writing to the n-ary feature writer
      @writer = WriteFeaturesNary.new(lemma, exp, dataset, feature_dir)
    end
    super(@writer)
  end

  ###
  # directory holding the feature files for this dataset
  def WriteFeaturesNaryOrBinary.feature_dir(exp, dataset,
                                            mode = "existing")
    return fred_dirname(exp, dataset, "features", mode)
  end

  ###
  # remove all feature files for this experiment/dataset
  def WriteFeaturesNaryOrBinary.remove_files(exp, dataset)
    feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")

    Dir[feature_dir + fred_feature_filename("*")].each { |filename|
      # bug fix: File.exists? is deprecated (removed in Ruby 3.2);
      # File.exist? behaves identically
      if File.exist? filename
        File.delete(filename)
      end
    }
  end
end
|