shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Files changed (68)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
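
The deletions at the bottom of the list show this release's main change: the old CamelCase source files (FredEval.rb, FredFeaturize.rb, FredTest.rb, ...) are removed and replaced by snake_case equivalents, now housed under the Shalmaneser::Fred namespace (see the diff below). A minimal sketch of what the rename means for code requiring these files; `exp` and `options` are placeholders, and whether any external code used the old paths directly is an assumption:

# Before (1.2.0.rc4): CamelCase file names
#   require 'fred/FredFeaturize'
# After (1.2.rc5): snake_case file names, Shalmaneser::Fred namespace
require 'fred/fred_featurize'
featurizer = Shalmaneser::Fred::FredFeaturize.new(exp, options)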
data/lib/fred/fred_featurize.rb
@@ -0,0 +1,525 @@
+ # FredFeaturize
+ #
+ # Featurization for WSD
+ #
+ # Katrin Erk April 2007
+ #
+ # feature types currently allowed:
+ # - context (with parameter giving context size; may be set several times)
+ # - syntax
+ # - synsem
+ #
+ # features in Meta-feature file:
+ #
+ # CX: context: word/lemma/pos/ne
+ # CH: children: grfunc/word/lemma/pos/ne
+ # PA: parents: grfunc/word/lemma/pos/ne
+ # SI: sibling: parent/grfunc/word/lemma/pos/ne
+ # TA: target: word/lemma/pos/ne
+
+ require 'delegate'
+
+ #######
+
+ require 'salsa_tiger_xml/salsa_tiger_sentence'
+ require 'configuration/fred_config_data'
+ require 'fred/FredConventions' # !
+ require 'fred/word_lemma_pos_ne'
+ require 'external_systems'
+
+ # require 'fred/FredDetermineTargets'
+
+ require 'fred/find_all_targets'
+ require 'fred/find_targets_from_frames'
+
+ # require 'fred/FredFeatures'
+ require 'fred/meta_feature_access'
+ require 'fred/fred_feature_access'
+
+ require 'fred/grammatical_function_access'
+
+ require 'fred/context_provider'
+ require 'fred/non_contiguous_context_provider'
+ require 'fred/single_sent_context_provider'
+
+ module Shalmaneser
+   module Fred
+     ####################################
+     # main class of this package
+     ####################################
+     class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
+       include WordLemmaPosNe
+
+       #####
+       def initialize(exp_obj, # FredConfigData object
+                      options, # hash: runtime option name (string) => value (string)
+                      varhash = {}) # optional parameter: "refeaturize"
+         @append_rather_than_overwrite = false
+
+         # @todo Move this to FredConfigData.
+         options.each_pair do |opt, arg|
+           case opt
+           when '--dataset'
+             @dataset = arg
+             unless ["train", "test"].include? @dataset
+               $stderr.puts "--dataset needs to be either 'train' or 'test'"
+               exit 1
+             end
+           when '--append'
+             @append_rather_than_overwrite = true
+           end
+         end
+
+         # @todo Move this to FredConfigData.
+         # further sanity checks
+         if @dataset.nil?
+           $stderr.puts "Please set --dataset: one of 'train', 'test'"
+           exit 1
+         end
+
+         # evaluate optional "refeaturize" argument
+         # "refeaturize": reuse meta-feature set,
+         # just redo CSV featurization
+         if varhash["refeaturize"]
+           @refeaturize = varhash["refeaturize"]
+         else
+           @refeaturize = false
+         end
+
+         # prepare experiment file: add preprocessing experiment file data
+         @exp = exp_obj
+
+         # @note AB: The following is disabled because we don't want to use
+         #   the dependence on {PrepConfigData}. We duplicate options:
+         #   <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
+         #   <do_parse>, <parser>, <directory_preprocessed>
+         #   in the experiment file of Fred.
+         #
+         # preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
+         # if not(preproc_expname)
+         #   $stderr.puts "Please set the name of the preprocessing exp. file"
+         #   $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
+         #   exit 1
+         # elsif not(File.readable?(preproc_expname))
+         #   $stderr.puts "Error in the experiment file:"
+         #   $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
+         #   exit 1
+         # end
+         # preproc_exp = FrappeConfigData.new(preproc_expname)
+         # @exp.adjoin(preproc_exp)
+
+         # get the right syntactic interface
+         ::Shalmaneser::ExternalSystems.check_interfaces_abort_if_missing(@exp)
+
+         @interpreter_class = ::Shalmaneser::ExternalSystems.get_interpreter_according_to_exp(@exp)
+
+         # initialize grammatical function object (delegating)
+         grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
+         super(grf_obj)
+
+         # announce the task
+         $stderr.puts "---------"
+         $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
+         if @refeaturize
+           $stderr.puts "Keeping meta-features, redoing featurization only."
+         end
+         if @exp.get("binary_classifiers")
+           $stderr.puts "Writing features for binary classifiers."
+         else
+           $stderr.puts "Writing features for n-ary classifiers."
+         end
+         $stderr.puts "---------"
+       end
+
+       ####
+       def compute
+         if @refeaturize
+           # read meta-feature file,
+           # just redo normal featurization
+           refeaturize
+         else
+           # write meta features and normal features
+           featurize
+         end
+       end
+
+       #########################
+       private
+
+       #####
+       # main featurization
+       def featurize
+         ###
+         # make objects
+         unless @exp.get("directory_preprocessed")
+           $stderr.puts "Shalmaneser error: could not find the directory with"
+           $stderr.puts "syntactically preprocessed data."
+           $stderr.puts "Please make sure that 'directory_preprocessed'"
+           $stderr.puts "is set in the frprep experiment file you use with this experiment."
+           exit 1
+         end
+         directory = File.existing_dir(@exp.get("directory_preprocessed"))
+
+         # get context sizes
+         context_sizes = @exp.get_lf("feature", "context")
+         unless context_sizes
+           # no contexts set: choose a default context
+           $stderr.puts "Error: no contexts set."
+           $stderr.puts "I will compute a context of size 1 by default."
+           $stderr.puts "(This goes into the meta-features, but not"
+           $stderr.puts "into the set of features used in the classifier.)"
+           context_sizes = [1]
+         end
+         max_context_size = context_sizes.max
+
+         # make target determination object
+         if @dataset == "test" and @exp.get("apply_to_all_known_targets")
+           $stderr.puts "Fred: Using all known targets as instances."
+           target_obj = FindAllTargets.new(@exp, @interpreter_class)
+         else
+           if @append_rather_than_overwrite
+             target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
+           else
+             target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
+           end
+         end
+
+         # make context computation object
+         if @exp.get("single_sent_context")
+           # contexts in the input data don't go beyond a single sentence
+           context_obj = SingleSentContextProvider.new(max_context_size, @exp,
+                                                       @interpreter_class, target_obj,
+                                                       @dataset)
+           # @todo AB: Move this check to the OptionParser; the two options
+           #   are not compatible, don't do the check here!
+           if @exp.get("noncontiguous_input")
+             $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
+             $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
+           end
+
+         elsif @exp.get("noncontiguous_input")
+           # the input data is not contiguous but
+           # consists of selected sentences from a larger text
+           context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
+                                                          @interpreter_class, target_obj,
+                                                          @dataset)
+         else
+           # the input data is contiguous, and we're computing contexts
+           # not restricted to single sentences
+           context_obj = ContextProvider.new(max_context_size, @exp,
+                                             @interpreter_class, target_obj, @dataset)
+         end
+
+         zipped_input_dir = ::Shalmaneser::Fred.fred_dirname(@exp, @dataset, "input_data", "new")
+
+         ##
+         # make writer object(s)
+         writer_classes = [
+           MetaFeatureAccess,
+           FredFeatureAccess
+         ]
+
+         if @append_rather_than_overwrite
+           # append
+           mode = "a"
+           $stderr.puts "Appending new features to the old ones"
+         else
+           # write
+           mode = "w"
+           $stderr.puts "Removing old features for the same experiment (if any)"
+
+           writer_classes.each { |w_class|
+             w_class.remove_feature_files(@exp, @dataset)
+           }
+
+           Dir[zipped_input_dir + "*gz"].each { |filename|
+             File.delete(filename)
+           }
+         end
+
+         writers = writer_classes.map { |w_class|
+           w_class.new(@exp, @dataset, mode)
+         }
+
+         ###
+         # zip and store input files
+         Dir[directory + "*.xml"].sort.each { |filename|
+           %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
+         }
+
+         # always remember current sentence
+         @current_sent = nil
+
+         ###
+         # featurize
+         #
+         # context_obj.each_window yields tuples of:
+         # - a context: an array of tuples [word, lemma, pos, ne],
+         #   each component a string or nil
+         # - ID of main target: string
+         # - target_IDs: array:string, list of IDs of target words
+         # - senses: array:string, the senses for the target
+         # - sent: SalsaTigerSentence object
+         #
+         # for each max. context returned by the context object:
+         # determine meta-features:
+         # - context words for all context sizes listed in context_sizes,
+         # - children of target
+         # - parent of target
+         # - siblings of target
+         #
+         # and pass on to writing object(s)
+         target_count = 0
+         context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
+           # inform user
+           if target_count % 500 == 0
+             $stderr.puts "#{target_count}..."
+           end
+           target_count += 1
+
+           # determine features
+           feature_hash = {}
+           compute_target_features(context, max_context_size, feature_hash)
+           compute_context_features(context, max_context_size, context_sizes, feature_hash)
+           compute_syn_features(main_target_id, sent, feature_hash)
+
+           # write
+           each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|
+             writers.each { |writer_obj|
+               writer_obj.write_item(target_lemma,
+                                     target_pos,
+                                     target_ids,
+                                     target_sid,
+                                     target_senses,
+                                     feature_hash)
+             }
+           }
+         }
+
+         # finalize writers
+         writers.each { |writer_obj|
+           writer_obj.flush
+         }
+
+         # record the targets that have been read
+         target_obj.done_reading_targets
+       end
+
+       #####
+       # reuse of meta-features, recompute CSV features
+       def refeaturize
+         ##
+         # remove old features:
+         # normal features only. Keep meta-features.
+         # Don't do anything about zipped input:
+         # assume it stays as is.
+         if @append_rather_than_overwrite
+           # append
+           mode = "a"
+           $stderr.puts "Appending new features to the old ones"
+         else
+           # write
+           mode = "w"
+           $stderr.puts "Removing old features for the same experiment (if any)"
+
+           FredFeatureAccess.remove_feature_files(@exp, @dataset)
+         end
+
+         ##
+         # read meta-feature file,
+         # write fred feature files
+         meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
+         feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)
+
+         ##
+         # featurize
+         target_count = 0
+
+         meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|
+           # inform user
+           if target_count % 500 == 0
+             $stderr.puts "#{target_count}..."
+           end
+           target_count += 1
+
+           feature_writer.write_item(target_lemma,
+                                     target_pos,
+                                     target_ids,
+                                     target_sid,
+                                     target_senses,
+                                     feature_hash)
+         }
+         feature_writer.flush
+       end
+
+       ####
+       # given a list of sense hashes, with the format
+       # "lex" -> lemma
+       # "pos" -> part of speech
+       # "sense" -> sense
+       #
+       # yield tuples [lemma, pos, sid, senses]
+       def each_lemma_pos_and_senses(shashes)
+         # Determine target lemma and POS.
+         # If we actually have more than one lemma and POS, we're in trouble.
+         target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s.gsub(/\s/, "_") }.uniq
+         target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s.gsub(/\s/, "_") }.uniq
+         target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s.gsub(/\s/, "_") }.uniq
+
+         if target_lemmas.length == 1 &&
+            target_pos_s.length == 1 &&
+            target_sid.length == 1
+
+           yield [target_lemmas.first,
+                  target_pos_s.first,
+                  target_sid.first,
+                  shashes.map { |sense_hash| sense_hash["sense"].to_s.gsub(/\s/, "_") }
+                 ]
+         else
+           # trouble:
+           # group senses by SID, lemma and POS
+           lemmapos2sense = {}
+           shashes.each { |sense_hash|
+             target_lemma = sense_hash["lex"].to_s.gsub(/\s/, "_")
+             target_pos = sense_hash["pos"].to_s.gsub(/\s/, "_")
+             target_sid = sense_hash["sid"].to_s.gsub(/\s/, "_")
+             target_sense = sense_hash["sense"].to_s.gsub(/\s/, "_")
+             key = [target_sid, target_lemma, target_pos]
+
+             unless lemmapos2sense[key]
+               lemmapos2sense[key] = []
+             end
+
+             lemmapos2sense[key] << target_sense
+           }
+
+           # and yield
+           lemmapos2sense.each_key do |target_sid, target_lemma, target_pos|
+             yield [target_lemma,
+                    target_pos,
+                    target_sid,
+                    lemmapos2sense[[target_sid, target_lemma, target_pos]]
+                   ]
+           end
+         end
+       end
+
+       ###
+       # given a context, locate the target,
+       # which is right in the middle,
+       # and enter it into the feature hash
+       #
+       # feature type: TA
+       # entry: word#lemma#pos#ne
+       def compute_target_features(context, # array: word*lemma*pos*ne
+                                   center_pos, # integer: size of context, one-sided
+                                   feature_hash) # hash: feature_type -> array:feature, enter features here
+         feature_hash["TA"] = [context[center_pos].map(&:to_s).join("#").gsub(/\s/, "_")]
+       end
+
+       ###
+       # compute context features:
+       # for each context size in the given list of context sizes,
+       # compute a context with feature_type "CXNN" (where NN is the size of the context)
+       # and with features
+       # word#lemma#pos#ne
+       #
+       # enter each context into the feature hash
+       def compute_context_features(context, # array: word*lemma*pos*ne
+                                    center_pos, # int: context is 2*cx_size_onesided + 1 long
+                                    context_sizes, # array:int, produce a context of each of these sizes
+                                    feature_hash) # hash: feature_type -> array:feature, enter features here
+         context_sizes.each { |context_size|
+           # feature type: CXNN, where NN is the size of the context
+           feature_type = "CX" + context_size.to_s
+
+           # features: an array of strings
+           feature_hash[feature_type] = []
+
+           # pre-context
+           (center_pos - context_size).upto(center_pos - 1) { |ix|
+             if context[ix]
+               # context entries may be nil at the beginning and end of the text
+               feature_hash[feature_type] << context[ix].map(&:to_s).join("#").gsub(/\s/, "_")
+             end
+           }
+
+           # post-context
+           (center_pos + 1).upto(center_pos + context_size) { |ix|
+             if context[ix]
+               # context entries may be nil at the beginning and end of the text
+               feature_hash[feature_type] << context[ix].map(&:to_s).join("#").gsub(/\s/, "_")
+             end
+           }
+         }
+       end
+
+       ###
+       # compute syntax-dependent features:
+       # children (that is, dependents) of the target word,
+       # parent,
+       # and siblings.
+       #
+       # format:
+       # feature type is CH for children, PA for parent, SI for siblings
+       #
+       # individual features are:
+       # <dependency>#<word>#<lemma>#<pos>#<ne>
+       # (SI features are additionally prefixed with the parent's word)
+       def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
+                                sent, # SalsaTigerSentence object
+                                feature_hash) # hash: feature_type -> array:feature, enter features here
+         target = sent.terminals.detect { |t| t.id == main_target_id }
+         unless target
+           $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
+           return
+         end
+
+         # if we're starting a new sentence,
+         # compute dependencies using the delegate object for grammatical functions.
+         # also, get_children and get_parents below are methods of the delegate
+         unless sent == @current_sent
+           @current_sent = sent
+           set_sent(sent)
+         end
+
+         # children
+         feature_hash["CH"] = get_children(target).map do |rel, node|
+           rel.to_s + "#" +
+             word_lemma_pos_ne(node, @interpreter_class).map(&:to_s).join("#").gsub(/\s/, "_")
+         end
+
+         # parent
+         feature_hash["PA"] = get_parents(target).map do |rel, node|
+           rel.to_s + "#" +
+             word_lemma_pos_ne(node, @interpreter_class).map(&:to_s).join("#").gsub(/\s/, "_")
+         end
+
+         # siblings
+         feature_hash["SI"] = []
+
+         get_parents(target).each do |_rel, parent|
+           parent_w, _d1, _d2, _d3 = word_lemma_pos_ne(parent, @interpreter_class)
+
+           get_children(parent).each do |rel, node|
+             next if node == target
+             feature_hash["SI"] << parent_w + "#" +
+                                   rel.to_s + "#" +
+                                   word_lemma_pos_ne(node, @interpreter_class).map(&:to_s).join("#").gsub(/\s/, "_")
+           end
+         end
+       end
+     end
+   end
+ end
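
For orientation, a minimal usage sketch of the class above. The constructor and compute calls, the options-hash format, and the "refeaturize" flag follow the code in the diff; the experiment file name fred.exp is invented, and the exact namespace of FredConfigData (required above as configuration/fred_config_data) is an assumption:

require 'configuration/fred_config_data'
require 'fred/fred_featurize'

# experiment configuration, read from a Fred experiment file (name assumed)
exp = FredConfigData.new('fred.exp')

# featurize the training set; options map runtime option name => value
featurizer = Shalmaneser::Fred::FredFeaturize.new(exp, { '--dataset' => 'train' })
featurizer.compute

# reuse the stored meta-features, only redo the CSV featurization
refeaturizer = Shalmaneser::Fred::FredFeaturize.new(exp, { '--dataset' => 'train' },
                                                    "refeaturize" => true)
refeaturizer.compute

The feature_hash handed to the writers then maps feature types to string arrays, along these lines (values invented purely for illustration):

{ "TA"  => ["bank#bank#NN#-"],
  "CX1" => ["the#the#DT#-", "account#account#NN#-"],
  "CH"  => ["det#the#the#DT#-"],
  "PA"  => ["obj#open#open#VB#-"],
  "SI"  => [] }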