shalmaneser-fred 1.2.0.rc4

@@ -0,0 +1,602 @@
+ # FredFeaturize
+ #
+ # Featurization for WSD
+ #
+ # Katrin Erk April 2007
+ #
+ # feature types currently allowed:
+ # - context (with parameter giving context size; may be set several times)
+ # - syntax
+ # - synsem
+ #
+ # features in Meta-feature file:
+ #
+ # CX: context: word/lemma/pos/ne
+ # CH: children: grfunc/word/lemma/pos/ne
+ # PA: parents: grfunc/word/lemma/pos/ne
+ # SI: sibling: parent/grfunc/word/lemma/pos/ne
+ # TA: target: word/lemma/pos/ne
+
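+ # A hypothetical illustration (not real tool output): for a target verb
+ # "sees" in "Peter sees the cat", meta-feature entries use the notation
+ # above, with the fields joined by "#" (nil fields become empty strings)
+ # and dependency labels depending on the parser, e.g.:
+ #
+ #   TA   sees#see#VBZ#
+ #   CX1  Peter#Peter#NNP#  the#the#DT#
+ #   CH   OA#cat#cat#NN#
+ #
+ # The actual file layout is determined by the MetaFeatureAccess writer
+ # (see fred/FredFeatures).
+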
+
+
+ require "delegate"
+
+ #######
+
+ require "fred/FileZipped"
+ require "common/Parser"
+ require "common/RegXML"
+ require "common/SalsaTigerRegXML"
+ require "common/SalsaTigerXMLHelper"
+
+ require "fred/fred_config_data"
+ require "fred/FredConventions"
+ require "common/prep_helper"
+ require "common/SynInterfaces"
+
+ require "fred/FredBOWContext"
+ require "fred/FredDetermineTargets"
+ require "fred/FredFeatures"
+
+ ####################################
+ # grammatical function computation:
+ # given a sentence, keep all grammatical function relations in a hash
+ # for faster access
+ class GrammaticalFunctionAccess
+
+   def initialize(interpreter_class)
+     @interpreter_class = interpreter_class
+     @to = Hash.new([])   # default: return empty array
+     @from = Hash.new([]) # default: return empty array
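+     # NB: Hash.new([]) hands out one shared default array for missing
+     # keys. That is safe here because the default is only ever read
+     # (see get_children/get_parents below); set_sent always assigns a
+     # fresh Array to a key before appending to it.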
51
+ end
52
+
53
+ def set_sent(sent) # SalsaTigerRegXML sentence
54
+
55
+ @to.clear()
56
+ @from.clear()
57
+
58
+ sent.each_syn_node { |current|
59
+
60
+ current_head = @interpreter_class.head_terminal(current)
61
+ unless current_head
62
+ next
63
+ end
64
+
65
+ @interpreter_class.gfs(current, sent).map { |rel, node|
66
+ # PPs: use head noun rather than preposition as head
67
+ # Sbar, VP: use verb
68
+ if (n = @interpreter_class.informative_content_node(node))
69
+ [rel, n]
70
+ else
71
+ [rel, node]
72
+ end
73
+ }.each { |rel, node|
74
+
75
+ rel_head = @interpreter_class.head_terminal(node)
76
+ unless rel_head
77
+ next
78
+ end
79
+
80
+ unless @to.has_key? current_head
81
+ @to[current_head] = Array.new()
82
+ end
83
+ unless @to[current_head].include? [rel, rel_head]
84
+ @to[current_head] << [rel, rel_head]
85
+ end
86
+
87
+ unless @from.has_key? rel_head
88
+ @from[rel_head] = Array.new()
89
+ end
90
+ unless @from[rel_head].include? [rel, current_head]
91
+ @from[rel_head] << [rel, current_head]
92
+ end
93
+ }
94
+ }
95
+ # $stderr.puts "Changed sentence:"
96
+ # @to.each_pair { |p, ch|
97
+ # $stderr.puts "\t#{p.id()}: " + ch.map { |rel, n| rel + "/"+n.id()}.join(", ")
98
+ # }
99
+ # $stdin.gets()
100
+ end
101
+
102
+ def get_children(node)
103
+ return @to[node]
104
+ end
105
+
106
+ def get_parents(node)
107
+ return @from[node]
108
+ end
109
+ end
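+
+ # A minimal usage sketch (hypothetical, for illustration; assumes
+ # `interpreter` is a syntactic interpreter class obtained from
+ # SynInterfaces and `sent` a SalsaTigerRegXML sentence):
+ #
+ #   gf = GrammaticalFunctionAccess.new(interpreter)
+ #   gf.set_sent(sent)  # index all GF relations of this sentence once
+ #   sent.terminals().each { |t|
+ #     gf.get_children(t).each { |rel, child_head|
+ #       puts "#{t.id()} --#{rel}--> #{child_head.id()}"
+ #     }
+ #   }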
+
+ ####################################
+ # main class of this package
+ ####################################
+ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
+
+   include WordLemmaPosNe
+
+   #####
+   def initialize(exp_obj, # FredConfigData object
+                  options, # hash: runtime option name (string) => value (string)
+                  varhash = {}) # optional parameter: "refeaturize"
+
+     ##
+     # evaluate runtime options
+     if $ENDUSER_MODE
+       # only possible dataset: test
+       @dataset = "test"
+     else
+       @dataset = nil
+     end
+     @append_rather_than_overwrite = false
+
+     options.each_pair do |opt, arg|
+       case opt
+       when '--dataset'
+         @dataset = arg
+         unless ["train", "test"].include? @dataset
+           $stderr.puts "--dataset needs to be either 'train' or 'test'"
+           exit 1
+         end
+
+       when '--append'
+         @append_rather_than_overwrite = true
+
+       else
+         # the case of unknown arguments has been dealt with by fred.rb
+       end
+     end
+
+     # further sanity checks
+     if @dataset.nil?
+       $stderr.puts "Please set --dataset: one of 'train', 'test'"
+       exit 1
+     end
+
+     in_enduser_mode_ensure(@dataset == "test")
+
+     # evaluate optional "refeaturize" argument
+     # "refeaturize": reuse the meta-feature set,
+     # just redo the CSV featurization
+     if varhash["refeaturize"]
+       @refeaturize = varhash["refeaturize"]
+     else
+       @refeaturize = false
+     end
+
+     # prepare experiment file: add preprocessing experiment file data
+     @exp = exp_obj
+
+     # @note AB: The following is disabled because we don't want to depend
+     #   on {PrepConfigData}. Instead we duplicate the options
+     #   <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
+     #   <do_parse>, <parser>, <directory_preprocessed>
+     #   in the experiment file of Fred.
+     #
+     # preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
+     # if not(preproc_expname)
+     #   $stderr.puts "Please set the name of the preprocessing experiment file"
+     #   $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
+     #   exit 1
+     # elsif not(File.readable?(preproc_expname))
+     #   $stderr.puts "Error in the experiment file:"
+     #   $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
+     #   exit 1
+     # end
+     # preproc_exp = FrPrepConfigData.new(preproc_expname)
+     # @exp.adjoin(preproc_exp)
+
+     # get the right syntactic interface
+     SynInterfaces.check_interfaces_abort_if_missing(@exp)
+     @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
+
+     # initialize grammatical function object (delegating)
+     grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
+     super(grf_obj)
+
+     # announce the task
+     $stderr.puts "---------"
+     $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
+     if @refeaturize
+       $stderr.puts "Keeping meta-features, redoing featurization only."
+     end
+     if @exp.get("binary_classifiers")
+       $stderr.puts "Writing features for binary classifiers."
+     else
+       $stderr.puts "Writing features for n-ary classifiers."
+     end
+     $stderr.puts "---------"
+   end
+
+   ####
+   def compute()
+     if @refeaturize
+       # read meta-feature file,
+       # just redo normal featurization
+       refeaturize()
+     else
+       # write meta-features and normal features
+       featurize()
+     end
+   end
+
+   #########################
+   private
+
+   #####
+   # main featurization
+   def featurize()
+
+     ###
+     # make objects
+     unless @exp.get("directory_preprocessed")
+       $stderr.puts "Shalmaneser error: could not find the directory with"
+       $stderr.puts "syntactically preprocessed data."
+       $stderr.puts "Please make sure that 'directory_preprocessed'"
+       $stderr.puts "is set in the frprep experiment file you use with this experiment."
+       exit 1
+     end
+     directory = File.existing_dir(@exp.get("directory_preprocessed"))
+
+     # get context sizes
+     context_sizes = @exp.get_lf("feature", "context")
+     unless context_sizes
+       # no contexts set, nothing to compute:
+       # fall back to the default context
+       $stderr.puts "Error: no contexts set."
+       $stderr.puts "I will compute a context of size 1 by default."
+       $stderr.puts "(This goes into the meta-features, but not"
+       $stderr.puts "into the set of features used in the classifier.)"
+       context_sizes = [1]
+     end
+     max_context_size = context_sizes.max()
+
+     # make target determination object
+     if @dataset == "test" and @exp.get("apply_to_all_known_targets")
+       $stderr.puts "Fred: Using all known targets as instances."
+       target_obj = FindAllTargets.new(@exp, @interpreter_class)
+     else
+       if @append_rather_than_overwrite
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
+       else
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
+       end
+     end
+
+     # make context computation object
+     if @exp.get("single_sent_context")
+       # contexts in the input data don't go beyond a single sentence
+       context_obj = SingleSentContextProvider.new(max_context_size, @exp,
+                                                   @interpreter_class, target_obj,
+                                                   @dataset)
+       # @todo AB: Move this to the OptionParser: the two options are not
+       #   compatible, don't do the check here!
+       if @exp.get("noncontiguous_input")
+         $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
+         $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
+       end
+
+     elsif @exp.get("noncontiguous_input")
+       # the input data is not contiguous but
+       # consists of selected sentences from a larger text
+       context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
+                                                      @interpreter_class, target_obj,
+                                                      @dataset)
+     else
+       # the input data is contiguous, and we're computing contexts
+       # not restricted to single sentences
+       context_obj = ContextProvider.new(max_context_size, @exp,
+                                         @interpreter_class, target_obj, @dataset)
+     end
+
+     zipped_input_dir = fred_dirname(@exp, @dataset, "input_data", "new")
+
+     ##
+     # make writer object(s)
+
+     writer_classes = [
+       MetaFeatureAccess,
+       FredFeatureAccess
+     ]
+
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old ones"
+
+     else
+       # write from scratch
+       mode = "w"
+
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       writer_classes.each { |w_class|
+         w_class.remove_feature_files(@exp, @dataset)
+       }
+
+       Dir[zipped_input_dir + "*gz"].each { |filename|
+         File.delete(filename)
+       }
+     end
+
+     writers = writer_classes.map { |w_class|
+       w_class.new(@exp, @dataset, mode)
+     }
+
+     ###
+     # zip and store input files
+     Dir[directory + "*.xml"].sort.each { |filename|
+       %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
+     }
+
+     # always remember the current sentence
+     @current_sent = nil
+
+     ###
+     # featurize
+
+     # context_obj.each_window yields tuples of:
+     # - a context: an array of tuples [word, lemma, pos, ne],
+     #   each entry a string or nil
+     # - ID of the main target: string
+     # - target_IDs: array:string, list of IDs of target words
+     # - senses: an array of sense hashes for the target
+     #   (keys "lex", "pos", "sid", "sense"; see each_lemma_pos_and_senses below)
+     # - sent: SalsaTigerSentence object
+     #
+     # for each max. context returned by the context object:
+     # determine meta-features:
+     # - context words for all context sizes listed in context_sizes,
+     # - children of the target
+     # - parent of the target
+     # - siblings of the target
+     #
+     # and pass them on to the writing object(s)
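+     #
+     # A hypothetical example of one yielded tuple, for the sentence
+     # "Peter sees the cat" with target "sees" and max. context size 2
+     # (windows are padded with nil at text boundaries):
+     #   context        = [nil, ["Peter", "Peter", "NNP", nil],
+     #                     ["sees", "see", "VBZ", nil],
+     #                     ["the", "the", "DT", nil],
+     #                     ["cat", "cat", "NN", nil]]
+     #   main_target_id = "s1_2"      # some terminal node ID
+     #   target_ids     = ["s1_2"]
+     #   senses         = [{"lex" => "see", "pos" => "v",
+     #                      "sid" => "1", "sense" => "see.01"}]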
+     target_count = 0
+     context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
+       # inform the user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+       # determine features
+       feature_hash = Hash.new()
+       compute_target_features(context, max_context_size, feature_hash)
+       compute_context_features(context, max_context_size, context_sizes, feature_hash)
+       compute_syn_features(main_target_id, sent, feature_hash)
+       # write
+       each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|
+         writers.each { |writer_obj|
+           writer_obj.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+         }
+       }
+     }
+     # finalize writers
+     writers.each { |writer_obj|
+       writer_obj.flush()
+     }
+
+     # record the targets that have been read
+     target_obj.done_reading_targets()
+   end
+
+   #####
+   # reuse meta-features, recompute CSV features
+   def refeaturize()
+
+     ##
+     # remove old features:
+     # normal features only, keep meta-features.
+     # Don't do anything about the zipped input;
+     # assume it stays as it is.
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old ones"
+
+     else
+       # write from scratch
+       mode = "w"
+
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       FredFeatureAccess.remove_feature_files(@exp, @dataset)
+     end
+
+     ##
+     # read meta-feature file,
+     # write fred feature files
+     meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
+     feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)
+
+     ##
+     # featurize
+     target_count = 0
+
+     meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|
+
+       # inform the user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+
+       feature_writer.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+     }
+     feature_writer.flush()
+   end
+
+   ####
+   # given a list of sense hashes of the format
+   #   "lex"   -> lemma
+   #   "pos"   -> part of speech
+   #   "sid"   -> sense ID
+   #   "sense" -> sense
+   #
+   # yield tuples [lemma, pos, sid, senses]
+   def each_lemma_pos_and_senses(shashes)
+
+     # determine target lemma and POS. If we actually have
+     # more than one lemma and POS, we're in trouble
+     target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s().gsub(/\s/, "_") }.uniq()
+     target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s().gsub(/\s/, "_") }.uniq()
+     target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s().gsub(/\s/, "_") }.uniq()
+
+     if target_lemmas.length() == 1 and target_pos_s.length() == 1 and target_sid.length() == 1
+
+       yield [target_lemmas.first(), target_pos_s.first(),
+              target_sid.first(),
+              shashes.map { |sense_hash| sense_hash["sense"].to_s().gsub(/\s/, "_") }
+             ]
+
+     else
+       # trouble:
+       # group senses by SID, lemma and POS
+       lemmapos2sense = Hash.new
+       shashes.each { |sense_hash|
+         target_lemma = sense_hash["lex"].to_s().gsub(/\s/, "_")
+         target_pos = sense_hash["pos"].to_s().gsub(/\s/, "_")
+         target_sid = sense_hash["sid"].to_s().gsub(/\s/, "_")
+         target_sense = sense_hash["sense"].to_s().gsub(/\s/, "_")
+         key = [target_sid, target_lemma, target_pos]
+         unless lemmapos2sense[key]
+           lemmapos2sense[key] = Array.new()
+         end
+         lemmapos2sense[key] << target_sense
+       }
+
+       # and yield
+       lemmapos2sense.each_key { |target_sid, target_lemma, target_pos|
+         yield [target_lemma, target_pos, target_sid,
+                lemmapos2sense[[target_sid, target_lemma, target_pos]]
+               ]
+       }
+     end
+   end
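+
+   # A hypothetical example: given
+   #   [{"lex" => "see", "pos" => "v", "sid" => "1", "sense" => "see.01"},
+   #    {"lex" => "see", "pos" => "v", "sid" => "1", "sense" => "see.02"}]
+   # the method yields once:
+   #   ["see", "v", "1", ["see.01", "see.02"]]
+   # With mixed lemmas, POS or SIDs it falls back to grouping and yields
+   # once per distinct [sid, lemma, pos] combination.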
+
+   ###
+   # given a context, locate the target,
+   # which is right in the middle,
+   # and enter it into the feature hash
+   #
+   # feature type: TA
+   # entry: word#lemma#pos#ne
+   def compute_target_features(context, # array: word*lemma*pos*ne
+                               center_pos, # integer: one-sided size of the context
+                               feature_hash) # hash: feature_type -> array:feature, enter features here
+     feature_hash["TA"] = [
+       context[center_pos].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     ]
+   end
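+
+   # Hypothetical example: with center_pos = 2 and
+   #   context[2] = ["sees", "see", "VBZ", nil]
+   # this sets
+   #   feature_hash["TA"] = ["sees#see#VBZ#"]
+   # (nil fields become empty strings via to_s).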
+
+   ###
+   # compute context features:
+   # for each size in the given list of context sizes,
+   # compute a context with feature type "CXNN" (where NN is the size
+   # of the context) and with features
+   #   word#lemma#pos#ne
+   #
+   # enter each context into the feature hash
+   def compute_context_features(context, # array: word*lemma*pos*ne
+                                center_pos, # int: context is 2*center_pos + 1 long
+                                context_sizes, # array:int, produce a context of each of these sizes
+                                feature_hash) # hash: feature_type -> array:feature, enter features here
+
+     context_sizes.each { |context_size|
+       # feature type: CXNN, where NN is the size of the context
+       feature_type = "CX" + context_size.to_s()
+
+       # features: an array of strings
+       feature_hash[feature_type] = Array.new()
+
+       # pre-context
+       (center_pos - context_size).upto(center_pos - 1) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+       # post-context
+       (center_pos + 1).upto(center_pos + context_size) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+     }
+   end
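+
+   # Hypothetical example: with center_pos = 2, context_sizes = [1, 2] and
+   #   context = [nil, ["the", "the", "DT", nil], ["cat", "cat", "NN", nil],
+   #              ["sat", "sit", "VBD", nil], nil]
+   # this yields
+   #   feature_hash["CX1"] = ["the#the#DT#", "sat#sit#VBD#"]
+   #   feature_hash["CX2"] = ["the#the#DT#", "sat#sit#VBD#"]
+   # (the nil entries at the window edges are skipped).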
+
+   ###
+   # compute syntax-dependent features:
+   # children (that is, dependents) of the target word,
+   # parent,
+   # and siblings.
+   #
+   # format:
+   # feature type is CH for children, PA for parent, SI for siblings
+   #
+   # individual features are:
+   #   <dependency>#<word>#<lemma>#<pos>#<ne>
+   # (SI features are additionally prefixed with the parent word:
+   #  <parent>#<dependency>#<word>#<lemma>#<pos>#<ne>)
+   def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
+                            sent, # SalsaTigerRegXML object
+                            feature_hash) # hash: feature_type -> array:feature, enter features here
+
+     target = sent.terminals().detect { |t| t.id() == main_target_id }
+     unless target
+       $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
+       return
+     end
+
+     # if we're starting a new sentence,
+     # compute dependencies using the delegate object for grammatical
+     # functions. get_children and get_parents below are also methods
+     # of the delegate
+     unless sent == @current_sent
+       @current_sent = sent
+       set_sent(sent)
+     end
+     # children
+     feature_hash["CH"] = get_children(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # parent
+     feature_hash["PA"] = get_parents(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # siblings
+     feature_hash["SI"] = Array.new()
+     get_parents(target).each { |rel, parent|
+       parent_w, d1, d2, d3 = word_lemma_pos_ne(parent, @interpreter_class)
+
+       get_children(parent).each { |rel, node|
+         # print "\t", rel, " ", node, "\n"
+
+         if node == target
+           next
+         end
+
+         feature_hash["SI"] << parent_w + "#" +
+           rel.to_s() + "#" +
+           word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+       }
+     }
+   end
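+
+   # Hypothetical example: for target "sees" in "Peter sees the cat"
+   # (dependency labels are interpreter-specific), this might produce
+   #   feature_hash["CH"] = ["SB#Peter#Peter#NNP#", "OA#cat#cat#NN#"]
+   #   feature_hash["PA"] = []   # root verb: no parents
+   #   feature_hash["SI"] = []   # no parents, hence no siblings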
602
+ end