shalmaneser-fred 1.2.0.rc4

# FredFeaturize
#
# Featurization for WSD
#
# Katrin Erk April 2007
#
# feature types currently allowed:
# - context (with a parameter giving the context size; may be set several times)
# - syntax
# - synsem
#
# features in the meta-feature file:
#
# CX: context: word/lemma/pos/ne
# CH: children: grfunc/word/lemma/pos/ne
# PA: parents: grfunc/word/lemma/pos/ne
# SI: sibling: parent/grfunc/word/lemma/pos/ne
# TA: target: word/lemma/pos/ne
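#
# For illustration only (hypothetical values): a target "banks" with a
# one-word context on each side could yield meta-feature entries like
#   TA:  banks#bank#NNS#
#   CX1: the#the#DT#  of#of#IN#
# where each entry is word#lemma#pos#ne joined by "#" (nil fields empty).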


require "delegate"

#######

require "fred/FileZipped"
require "common/Parser"
require "common/RegXML"
require "common/SalsaTigerRegXML"
require "common/SalsaTigerXMLHelper"

require "fred/fred_config_data"
require "fred/FredConventions"
require "common/prep_helper"
require "common/SynInterfaces"

require "fred/FredBOWContext"
require "fred/FredDetermineTargets"
require "fred/FredFeatures"

####################################
# grammatical function computation:
# given a sentence, keep all grammatical function relations in a hash
# for faster access
class GrammaticalFunctionAccess

  def initialize(interpreter_class)
    @interpreter_class = interpreter_class
    @to = Hash.new( [] )   # default: return empty array
    @from = Hash.new( [] ) # default: return empty array
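    # NB: Hash.new([]) hands out one shared default array for every
    # missing key; that is safe here only because the default is never
    # mutated: set_sent always assigns a fresh Array before appending.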
51
+ end
52
+
53
+ def set_sent(sent) # SalsaTigerRegXML sentence
54
+
55
+ @to.clear()
56
+ @from.clear()
57
+
58
+ sent.each_syn_node { |current|
59
+
60
+ current_head = @interpreter_class.head_terminal(current)
61
+ unless current_head
62
+ next
63
+ end
64
+
65
+ @interpreter_class.gfs(current, sent).map { |rel, node|
66
+ # PPs: use head noun rather than preposition as head
67
+ # Sbar, VP: use verb
68
+ if (n = @interpreter_class.informative_content_node(node))
69
+ [rel, n]
70
+ else
71
+ [rel, node]
72
+ end
73
+ }.each { |rel, node|
74
+
75
+ rel_head = @interpreter_class.head_terminal(node)
76
+ unless rel_head
77
+ next
78
+ end
79
+
80
+ unless @to.has_key? current_head
81
+ @to[current_head] = Array.new()
82
+ end
83
+ unless @to[current_head].include? [rel, rel_head]
84
+ @to[current_head] << [rel, rel_head]
85
+ end
86
+
87
+ unless @from.has_key? rel_head
88
+ @from[rel_head] = Array.new()
89
+ end
90
+ unless @from[rel_head].include? [rel, current_head]
91
+ @from[rel_head] << [rel, current_head]
92
+ end
93
+ }
94
+ }
95
+ # $stderr.puts "Changed sentence:"
96
+ # @to.each_pair { |p, ch|
97
+ # $stderr.puts "\t#{p.id()}: " + ch.map { |rel, n| rel + "/"+n.id()}.join(", ")
98
+ # }
99
+ # $stdin.gets()
100
+ end
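  # After set_sent, @to maps each head terminal to the [relation, head]
  # pairs of its dependents, and @from maps each dependent head back to
  # its parents. For example (hypothetical relation labels):
  #   @to[verb_head]      == [["SB", subject_head], ["OA", object_head]]
  #   @from[subject_head] == [["SB", verb_head]]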

  def get_children(node)
    return @to[node]
  end

  def get_parents(node)
    return @from[node]
  end
end
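
# Usage sketch (assuming a parsed SalsaTigerSentence `sent` and an
# interpreter class obtained via SynInterfaces, as in FredFeaturize below):
#   gf = GrammaticalFunctionAccess.new(interpreter_class)
#   gf.set_sent(sent)
#   gf.get_children(verb_head)   # => e.g. [["SB", subject_head], ...]
#   gf.get_parents(subject_head) # => e.g. [["SB", verb_head]]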

####################################
# main class of this package
####################################
class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)

  include WordLemmaPosNe

  #####
  def initialize(exp_obj,      # FredConfigData object
                 options,      # hash: runtime option name (string) => value (string)
                 varhash = {}) # optional parameter: "refeaturize"

    ##
    # evaluate runtime options
    if $ENDUSER_MODE
      # the only possible dataset: test
      @dataset = "test"
    else
      @dataset = nil
    end
    @append_rather_than_overwrite = false

    options.each_pair do |opt, arg|
      case opt
      when '--dataset'
        @dataset = arg
        unless ["train", "test"].include? @dataset
          $stderr.puts "--dataset needs to be either 'train' or 'test'"
          exit 1
        end

      when '--append'
        @append_rather_than_overwrite = true

      else
        # the case of unknown arguments has been dealt with by fred.rb
      end
    end

    # further sanity checks
    if @dataset.nil?
      $stderr.puts "Please set --dataset: one of 'train', 'test'"
      exit 1
    end

    in_enduser_mode_ensure(@dataset == "test")

    # evaluate the optional "refeaturize" argument:
    # "refeaturize" means: reuse the meta-feature set,
    # just redo the CSV featurization
    if varhash["refeaturize"]
      @refeaturize = varhash["refeaturize"]
    else
      @refeaturize = false
    end

    # prepare the experiment file: add preprocessing experiment file data
    @exp = exp_obj

    # @note AB: The following is disabled because we don't want the
    #   dependency on {PrepConfigData}. We duplicate the options
    #   <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
    #   <do_parse>, <parser>, <directory_preprocessed>
    #   in the experiment file of Fred.
    #
    # preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
    # if not(preproc_expname)
    #   $stderr.puts "Please set the name of the preprocessing exp. file name"
    #   $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
    #   exit 1
    # elsif not(File.readable?(preproc_expname))
    #   $stderr.puts "Error in the experiment file:"
    #   $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
    #   exit 1
    # end
    # preproc_exp = FrPrepConfigData.new(preproc_expname)
    # @exp.adjoin(preproc_exp)

    # get the right syntactic interface
    SynInterfaces.check_interfaces_abort_if_missing(@exp)
    @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)

    # initialize the grammatical function object (delegating)
    grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
    super(grf_obj)

    # announce the task
    $stderr.puts "---------"
    $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
    if @refeaturize
      $stderr.puts "Keeping meta-features, redoing featurization only."
    end
    if @exp.get("binary_classifiers")
      $stderr.puts "Writing features for binary classifiers."
    else
      $stderr.puts "Writing features for n-ary classifiers."
    end
    $stderr.puts "---------"
  end

  ####
  def compute()
    if @refeaturize
      # read the meta-feature file,
      # just redo the normal featurization
      refeaturize()
    else
      # write meta-features and normal features
      featurize()
    end
  end

  #########################
  private

  #####
  # main featurization
  def featurize()

    ###
    # make objects
    unless @exp.get("directory_preprocessed")
      $stderr.puts "Shalmaneser error: could not find the directory with"
      $stderr.puts "syntactically preprocessed data."
      $stderr.puts "Please make sure that 'directory_preprocessed'"
      $stderr.puts "is set in the frprep experiment file you use with this experiment."
      exit 1
    end
    directory = File.existing_dir(@exp.get("directory_preprocessed"))

    # get the context sizes
    context_sizes = @exp.get_lf("feature", "context")
    unless context_sizes
      # no context sizes set:
      # fall back to a default context
      $stderr.puts "Error: no contexts set."
      $stderr.puts "I will compute a context of size 1 by default."
      $stderr.puts "(This goes into the meta-features, but not"
      $stderr.puts "into the set of features used in the classifier.)"
      context_sizes = [1]
    end
    max_context_size = context_sizes.max()

    # make the target determination object
    if @dataset == "test" and @exp.get("apply_to_all_known_targets")
      $stderr.puts "Fred: Using all known targets as instances."
      target_obj = FindAllTargets.new(@exp, @interpreter_class)
    else
      if @append_rather_than_overwrite
        target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
      else
        target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
      end
    end

    # make the context computation object
    if @exp.get("single_sent_context")
      # contexts in the input data don't go beyond a single sentence
      context_obj = SingleSentContextProvider.new(max_context_size, @exp,
                                                  @interpreter_class, target_obj,
                                                  @dataset)
      # @todo AB: Move this to the OptionParser: the two options are
      #   incompatible, don't do the check here!
      if @exp.get("noncontiguous_input")
        $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
        $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
      end

    elsif @exp.get("noncontiguous_input")
      # the input data is not contiguous but
      # consists of selected sentences from a larger text
      context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
                                                     @interpreter_class, target_obj,
                                                     @dataset)
    else
      # the input data is contiguous, and we're computing contexts
      # not restricted to single sentences
      context_obj = ContextProvider.new(max_context_size, @exp,
                                        @interpreter_class, target_obj, @dataset)
    end

    zipped_input_dir = fred_dirname(@exp, @dataset, "input_data", "new")

    ##
    # make writer object(s)

    writer_classes = [
      MetaFeatureAccess,
      FredFeatureAccess
    ]

    if @append_rather_than_overwrite
      # append
      mode = "a"
      $stderr.puts "Appending new features to the old"

    else
      # write
      mode = "w"

      $stderr.puts "Removing old features for the same experiment (if any)"

      writer_classes.each { |w_class|
        w_class.remove_feature_files(@exp, @dataset)
      }

      Dir[zipped_input_dir + "*gz"].each { |filename|
        File.delete(filename)
      }
    end

    writers = writer_classes.map { |w_class|
      w_class.new(@exp, @dataset, mode)
    }

    ###
    # zip and store the input files
    Dir[directory + "*.xml"].sort.each { |filename|
      %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
    }
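
    # (The shell-out above assumes a gzip binary on the PATH and
    # filenames free of shell metacharacters, which the preprocessed
    # corpus files written by frprep can be expected to satisfy.)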

    # always remember the current sentence
    @current_sent = nil

    ###
    # featurize

    # context_obj.each_window yields tuples of:
    # - a context, an array of tuples [word, lemma, pos, ne]:
    #   string/nil * string/nil * string/nil * string/nil
    # - the ID of the main target: string
    # - target_IDs: array:string, list of IDs of target words
    # - senses: array:string, the senses for the target
    # - sent: SalsaTigerSentence object
    #
    # for each max. context returned by the context object:
    # determine meta-features:
    # - context words for all context sizes listed in context_sizes,
    # - children of the target
    # - parent of the target
    # - siblings of the target
    #
    # and pass them on to the writing object(s)
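    #
    # For illustration only (hypothetical values), after the three
    # compute_* calls below feature_hash might look like:
    #   { "TA"  => ["banks#bank#NNS#"],
    #     "CX1" => ["the#the#DT#", "of#of#IN#"],
    #     "CH"  => ["OA#money#money#NN#"],
    #     "PA"  => ["SB#collapsed#collapse#VBD#"],
    #     "SI"  => [] }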
    target_count = 0
    context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
      # inform the user
      if target_count % 500 == 0
        $stderr.puts "#{target_count}..."
      end
      target_count += 1
      # determine features
      feature_hash = Hash.new()
      compute_target_features(context, max_context_size, feature_hash)
      compute_context_features(context, max_context_size, context_sizes, feature_hash)
      compute_syn_features(main_target_id, sent, feature_hash)
      # write
      each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|

        writers.each { |writer_obj|

          writer_obj.write_item(target_lemma,
                                target_pos,
                                target_ids,
                                target_sid,
                                target_senses,
                                feature_hash)
        }
      }
    }
    # finalize the writers
    writers.each { |writer_obj|
      writer_obj.flush()
    }

    # record the targets that have been read
    target_obj.done_reading_targets()
  end

  #####
  # reuse the meta-features, recompute the CSV features
  def refeaturize()

    ##
    # remove old features:
    # normal features only, keep the meta-features.
    # Don't do anything about the zipped input;
    # assume it stays as is.
    if @append_rather_than_overwrite
      # append
      mode = "a"
      $stderr.puts "Appending new features to the old"

    else
      # write
      mode = "w"

      $stderr.puts "Removing old features for the same experiment (if any)"

      FredFeatureAccess.remove_feature_files(@exp, @dataset)
    end

    ##
    # read the meta-feature file,
    # write Fred feature files
    meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
    feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)

    ##
    # featurize
    target_count = 0

    meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|

      # inform the user
      if target_count % 500 == 0
        $stderr.puts "#{target_count}..."
      end
      target_count += 1

      feature_writer.write_item(target_lemma,
                                target_pos,
                                target_ids,
                                target_sid,
                                target_senses,
                                feature_hash)
    }
    feature_writer.flush()
  end


  ####
  # given a list of sense hashes of the format
  # "lex" -> lemma
  # "pos" -> part of speech
  # "sid" -> sense ID
  # "sense" -> sense
  #
  # yield them grouped as tuples [lemma, pos, sense ID, list of senses]
  def each_lemma_pos_and_senses(shashes)

    # determine the target lemma and POS. If we actually have
    # more than one lemma and POS here, we're in trouble
    target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s().gsub(/\s/, "_") }.uniq()
    target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s().gsub(/\s/, "_") }.uniq()
    target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s().gsub(/\s/, "_") }.uniq()

    if target_lemmas.length() == 1 and target_pos_s.length() == 1 and target_sid.length() == 1

      yield [target_lemmas.first(), target_pos_s.first(),
             target_sid.first(),
             shashes.map { |sense_hash| sense_hash["sense"].to_s().gsub(/\s/, "_") }
            ]

    else
      # trouble:
      # group the senses by sense ID, lemma and POS
      lemmapos2sense = Hash.new
      shashes.each { |sense_hash|
        target_lemma = sense_hash["lex"].to_s().gsub(/\s/, "_")
        target_pos = sense_hash["pos"].to_s().gsub(/\s/, "_")
        target_sid = sense_hash["sid"].to_s().gsub(/\s/, "_")
        target_sense = sense_hash["sense"].to_s().gsub(/\s/, "_")
        key = [target_sid, target_lemma, target_pos]
        unless lemmapos2sense[key]
          lemmapos2sense[key] = Array.new()
        end
        lemmapos2sense[key] << target_sense
      }

      # and yield
      lemmapos2sense.each_key { |target_sid, target_lemma, target_pos|
        yield [target_lemma, target_pos, target_sid,
               lemmapos2sense[[target_sid, target_lemma, target_pos]]
              ]
      }
    end
  end
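
  # Example (hypothetical input): two sense hashes that agree on lemma,
  # POS and sense ID,
  #   each_lemma_pos_and_senses([
  #     {"lex" => "bank", "pos" => "n", "sid" => "1", "sense" => "riverbank"},
  #     {"lex" => "bank", "pos" => "n", "sid" => "1", "sense" => "institution"}])
  # yields exactly once: ["bank", "n", "1", ["riverbank", "institution"]]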

  ###
  # given a context, locate the target,
  # which is right in the middle,
  # and enter it into the feature hash
  #
  # feature type: TA
  # entry: word#lemma#pos#ne
  def compute_target_features(context,      # array: word*lemma*pos*ne
                              center_pos,   # integer: size of the context, one-sided
                              feature_hash) # hash: feature_type -> array:feature, enter features here
    feature_hash["TA"] = [
      context[center_pos].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
    ]
  end
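
  # Example (hypothetical): with center_pos = 2 and
  #   context[2] == ["banks", "bank", "NNS", nil]
  # this sets feature_hash["TA"] = ["banks#bank#NNS#"]
  # (nil fields come out as empty strings).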

  ###
  # compute context features:
  # for each context size in the given list,
  # compute a context with feature_type "CXNN" (where NN is the size of the context)
  # and with features
  # word#lemma#pos#ne
  #
  # enter each context into the feature hash
  def compute_context_features(context,       # array: word*lemma*pos*ne
                               center_pos,    # int: context is 2*cx_size_onesided + 1 long
                               context_sizes, # array:int, produce a context of each of these sizes
                               feature_hash)  # hash: feature_type -> array:feature, enter features here

    context_sizes.each { |context_size|
      # feature type: CXNN, where NN is the size of the context
      feature_type = "CX" + context_size.to_s()

      # features: an array of strings
      feature_hash[feature_type] = Array.new()

      # pre-context
      (center_pos - context_size).upto(center_pos - 1) { |ix|
        if context[ix]
          # context entries may be nil at the beginning and end of the text
          feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
        end
      }
      # post-context
      (center_pos + 1).upto(center_pos + context_size) { |ix|
        if context[ix]
          # context entries may be nil at the beginning and end of the text
          feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
        end
      }
    }
  end
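
  # Example (hypothetical): context_sizes = [1, 2] with center_pos = 2
  # produces feature type "CX1" (entries for context[1] and context[3])
  # and "CX2" (entries for context[0..1] and context[3..4]), skipping
  # nil entries at the text boundaries.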

  ###
  # compute syntax-dependent features:
  # children (that is, dependents) of the target word,
  # parent,
  # and siblings.
  #
  # format:
  # feature type is CH for children, PA for parent, SI for siblings
  #
  # individual features are:
  # <dependency>#<word>#<lemma>#<pos>#<ne>
  # (sibling features are additionally prefixed with the parent's word)
  def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
                           sent,           # SalsaTigerRegXML object
                           feature_hash)   # hash: feature_type -> array:feature, enter features here

    target = sent.terminals().detect { |t| t.id() == main_target_id }
    unless target
      $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
      return
    end

    # if we're starting a new sentence,
    # compute dependencies using the delegate object for grammatical functions.
    # get_children and get_parents below are also methods of the delegate
    unless sent == @current_sent
      @current_sent = sent
      set_sent(sent)
    end
    # children
    feature_hash["CH"] = get_children(target).map { |rel, node|
      # print "\t", rel, " ", node, "\n"
      rel.to_s() + "#" +
        word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
    }

    # parent
    feature_hash["PA"] = get_parents(target).map { |rel, node|
      # print "\t", rel, " ", node, "\n"
      rel.to_s() + "#" +
        word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
    }

    # siblings
    feature_hash["SI"] = Array.new()
    get_parents(target).each { |_parent_rel, parent|
      parent_w, _d1, _d2, _d3 = word_lemma_pos_ne(parent, @interpreter_class)

      get_children(parent).each { |rel, node|
        # print "\t", rel, " ", node, "\n"

        if node == target
          next
        end

        feature_hash["SI"] << parent_w + "#" +
          rel.to_s() + "#" +
          word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
      }
    }
  end
end
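
# Usage sketch (hypothetical wiring; in the gem, fred.rb constructs this
# object from the parsed command line):
#   exp = FredConfigData.new(experiment_file_name)
#   featurizer = FredFeaturize.new(exp, "--dataset" => "train")
#   featurizer.compute()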