shalmaneser-fred 1.2.0.rc4 → 1.2.rc5

Files changed (68)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/fred +8 -3
  4. data/lib/fred/FredConventions.rb +190 -189
  5. data/lib/fred/abstract_context_provider.rb +246 -0
  6. data/lib/fred/abstract_fred_feature_access.rb +43 -0
  7. data/lib/fred/answer_key_access.rb +130 -0
  8. data/lib/fred/aux_keep_writers.rb +94 -0
  9. data/lib/fred/baseline.rb +153 -0
  10. data/lib/fred/context_provider.rb +55 -0
  11. data/lib/fred/feature_extractors/fred_context_feature_extractor.rb +48 -0
  12. data/lib/fred/feature_extractors/fred_context_pos_feature_extractor.rb +48 -0
  13. data/lib/fred/feature_extractors/fred_feature_extractor.rb +50 -0
  14. data/lib/fred/feature_extractors/fred_ngram_feature_extractor.rb +65 -0
  15. data/lib/fred/feature_extractors/fred_syn_feature_extractor.rb +33 -0
  16. data/lib/fred/feature_extractors/fred_synsem_feature_extractor.rb +32 -0
  17. data/lib/fred/feature_extractors.rb +5 -0
  18. data/lib/fred/file_zipped.rb +43 -0
  19. data/lib/fred/find_all_targets.rb +94 -0
  20. data/lib/fred/find_targets_from_frames.rb +92 -0
  21. data/lib/fred/fred.rb +43 -40
  22. data/lib/fred/fred_error.rb +15 -0
  23. data/lib/fred/fred_eval.rb +311 -0
  24. data/lib/fred/fred_feature_access.rb +420 -0
  25. data/lib/fred/fred_feature_info.rb +56 -0
  26. data/lib/fred/fred_featurize.rb +525 -0
  27. data/lib/fred/fred_parameters.rb +190 -0
  28. data/lib/fred/fred_split.rb +86 -0
  29. data/lib/fred/fred_split_pkg.rb +189 -0
  30. data/lib/fred/fred_test.rb +571 -0
  31. data/lib/fred/fred_train.rb +125 -0
  32. data/lib/fred/grammatical_function_access.rb +63 -0
  33. data/lib/fred/md5.rb +6 -0
  34. data/lib/fred/meta_feature_access.rb +185 -0
  35. data/lib/fred/non_contiguous_context_provider.rb +532 -0
  36. data/lib/fred/opt_parser.rb +182 -161
  37. data/lib/fred/plot_and_r_eval.rb +486 -0
  38. data/lib/fred/single_sent_context_provider.rb +76 -0
  39. data/lib/fred/slide_var.rb +148 -0
  40. data/lib/fred/targets.rb +136 -0
  41. data/lib/fred/toggle_var.rb +61 -0
  42. data/lib/fred/word_lemma_pos_ne.rb +51 -0
  43. data/lib/fred/write_features_binary.rb +95 -0
  44. data/lib/fred/write_features_nary.rb +51 -0
  45. data/lib/fred/write_features_nary_or_binary.rb +51 -0
  46. data/lib/shalmaneser/fred.rb +1 -0
  47. metadata +57 -30
  48. data/lib/fred/Baseline.rb +0 -150
  49. data/lib/fred/FileZipped.rb +0 -31
  50. data/lib/fred/FredBOWContext.rb +0 -877
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred_config_data.rb +0 -185
  64. data/test/frprep/test_opt_parser.rb +0 -94
  65. data/test/functional/functional_test_helper.rb +0 -58
  66. data/test/functional/test_fred.rb +0 -47
  67. data/test/functional/test_frprep.rb +0 -99
  68. data/test/functional/test_rosy.rb +0 -40
@@ -1,602 +0,0 @@
- # FredFeaturize
- #
- # Featurization for WSD
- #
- # Katrin Erk April 2007
- #
- # feature types currently allowed:
- # - context (with parameter giving context size; may be set several times)
- # - syntax
- # - synsem
- #
- # features in Meta-feature file:
- #
- # CX: context: word/lemma/pos/ne
- # CH: children: grfunc/word/lemma/pos/ne
- # PA: parents: grfunc/word/lemma/pos/ne
- # SI: sibling: parent/grfunc/word/lemma/pos/ne
- # TA: target: word/lemma/pos/ne
-
-
-
- require "delegate"
-
- #######
-
- require "fred/FileZipped"
- require "common/Parser"
- require "common/RegXML"
- require "common/SalsaTigerRegXML"
- require "common/SalsaTigerXMLHelper"
-
- require "fred/fred_config_data"
- require "fred/FredConventions"
- require "common/prep_helper"
- require "common/SynInterfaces"
-
- require "fred/FredBOWContext"
- require "fred/FredDetermineTargets"
- require "fred/FredFeatures"
-
- ####################################
- # grammatical function computation:
- # given a sentence, keep all grammatical function relations in a hash
- # for faster access
- class GrammaticalFunctionAccess
-
-   def initialize(interpreter_class)
-     @interpreter_class = interpreter_class
-     @to = Hash.new( [] ) # default: return empty array
-     @from = Hash.new( [] ) # default: return empty array
-   end
-
-   def set_sent(sent) # SalsaTigerRegXML sentence
-
-     @to.clear()
-     @from.clear()
-
-     sent.each_syn_node { |current|
-
-       current_head = @interpreter_class.head_terminal(current)
-       unless current_head
-         next
-       end
-
-       @interpreter_class.gfs(current, sent).map { |rel, node|
-         # PPs: use head noun rather than preposition as head
-         # Sbar, VP: use verb
-         if (n = @interpreter_class.informative_content_node(node))
-           [rel, n]
-         else
-           [rel, node]
-         end
-       }.each { |rel, node|
-
-         rel_head = @interpreter_class.head_terminal(node)
-         unless rel_head
-           next
-         end
-
-         unless @to.has_key? current_head
-           @to[current_head] = Array.new()
-         end
-         unless @to[current_head].include? [rel, rel_head]
-           @to[current_head] << [rel, rel_head]
-         end
-
-         unless @from.has_key? rel_head
-           @from[rel_head] = Array.new()
-         end
-         unless @from[rel_head].include? [rel, current_head]
-           @from[rel_head] << [rel, current_head]
-         end
-       }
-     }
-     # $stderr.puts "Changed sentence:"
-     # @to.each_pair { |p, ch|
-     #   $stderr.puts "\t#{p.id()}: " + ch.map { |rel, n| rel + "/"+n.id()}.join(", ")
-     # }
-     # $stdin.gets()
-   end
-
-   def get_children(node)
-     return @to[node]
-   end
-
-   def get_parents(node)
-     return @from[node]
-   end
- end
-
- ####################################
- # main class of this package
- ####################################
- class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
-
-   include WordLemmaPosNe
-
-   #####
-   def initialize(exp_obj, # FredConfigData object
-                  options, # hash: runtime option name (string) => value(string)
-                  varhash = {}) # optional parameter: "refeaturize"
-
-     ##
-     # evaluate runtime options
-     if $ENDUSER_MODE
-       # only possible dataset: test
-       @dataset = "test"
-     else
-       @dataset = nil
-     end
-     @append_rather_than_overwrite = false
-
-
-     options.each_pair do |opt, arg|
-       case opt
-       when '--dataset'
-         @dataset = arg
-         unless ["train", "test"].include? @dataset
-           $stderr.puts "--dataset needs to be either 'train' or 'test'"
-           exit 1
-         end
-
-       when '--append'
-         @append_rather_than_overwrite = true
-
-       else
-         # case of unknown arguments has been dealt with by fred.rb
-       end
-     end
-
-     # further sanity checks
-     if @dataset.nil?
-       $stderr.puts "Please set --dataset: one of 'train', 'test'"
-       exit 1
-     end
-
-     in_enduser_mode_ensure(@dataset == "test")
-
-     # evaluate optional "refeaturize" argument
-     # "refeaturize": reuse meta-feature set,
-     # just redo CSV featurization
-     if varhash["refeaturize"]
-       @refeaturize = varhash["refeaturize"]
-     else
-       @refeaturize = false
-     end
-
-     # prepare experiment file: add preprocessing experiment file data
-     @exp = exp_obj
-
-     # @note AB: The following is desabled because we don't want to use
-     #   the dependence on {PrepConfigData}. We duplicate options:
-     #   <do_postag>, <pos_tagger>, <do_lemmatize>, <lemmatizer>,
-     #   <do_parse>, <parser>, <directory_preprocessed>
-     #   in the experiment file of Fred.
-     #
-     # preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
-     # if not(preproc_expname)
-     #   $stderr.puts "Please set the name of the preprocessing exp. file name"
-     #   $stderr.puts "in the experiment file, feature preproc_descr_file_#{@dataset}"
-     #   exit 1
-     # elsif not(File.readable?(preproc_expname))
-     #   $stderr.puts "Error in the experiment file:"
-     #   $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
-     #   exit 1
-     # end
-     # preproc_exp = FrPrepConfigData.new(preproc_expname)
-     # @exp.adjoin(preproc_exp)
-
-     # get the right syntactic interface
-     SynInterfaces.check_interfaces_abort_if_missing(@exp)
-     @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
-
-     # initialize grammatical function object (delegating)
-     grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
-     super(grf_obj)
-
-     # announce the task
-     $stderr.puts "---------"
-     $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
-     if @refeaturize
-       $stderr.puts "Keeping meta-features, redoing featurization only."
-     end
-     if @exp.get("binary_classifiers")
-       $stderr.puts "Writing features for binary classifiers."
-     else
-       $stderr.puts "Writing features for n-ary classifiers."
-     end
-     $stderr.puts "---------"
-
-   end
-
-   ####
-   def compute()
-     if @refeaturize
-       # read meta-feature file,
-       # just redo normal featurization
-       refeaturize()
-     else
-       # write meta features and normal features
-       featurize()
-     end
-   end
-
-   #########################
-   private
-
-   #####
-   # main featurization
-   def featurize()
-
-     ###
-     # make objects
-     unless @exp.get("directory_preprocessed")
-       $stderr.puts "Shalmaneser error: could not find the directory with"
-       $stderr.puts "syntactially preprocessed data."
-       $stderr.puts "Please make sure that 'directory_preprocessed'"
-       $stderr.puts "is set in the frprep experiment file you use with this experiment."
-       exit 1
-     end
-     directory = File.existing_dir(@exp.get("directory_preprocessed"))
-
-     # get context sizes
-     context_sizes = @exp.get_lf("feature", "context")
-     unless context_sizes
-       # no contexts, nothing to compute.
-       # choose default context
-       $stderr.puts "Error: no contexts set."
-       $stderr.puts "I will compute a context of size 1 by default."
-       $stderr.puts "(This goes into the meta-features, but not"
-       $stderr.puts "into the set of features used in the classifier.)"
-       context_sizes = [1]
-     end
-     max_context_size = context_sizes.max()
-
-     # make target determination object
-     if @dataset == "test" and @exp.get("apply_to_all_known_targets")
-       $stderr.puts "Fred: Using all known targets as instances."
-       target_obj = FindAllTargets.new(@exp, @interpreter_class)
-     else
-       if @append_rather_than_overwrite
-         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
-       else
-         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
-       end
-     end
-
-     # make context computation object
-     if @exp.get("single_sent_context")
-       # contexts in the input data doesn't go beyond a single sentence
-       context_obj = SingleSentContextProvider.new(max_context_size, @exp,
-                                                   @interpreter_class, target_obj,
-                                                   @dataset)
-       # @todo AB: Put it to the OptionParser, two option are not
-       #   compatible, don't do the check here!
-       if @exp.get("noncontiguous_input")
-         $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
-         $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
-       end
-
-     elsif @exp.get("noncontiguous_input")
-       # the input data is not contiguous but
-       # consists of selected sentences from a larger text
-       context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
-                                                      @interpreter_class, target_obj,
-                                                      @dataset)
-     else
-       # the input data is contiguous, and we're computing contexts not restricted to single sentences
-       context_obj = ContextProvider.new(max_context_size, @exp,
-                                         @interpreter_class, target_obj, @dataset)
-     end
-
-     zipped_input_dir = fred_dirname(@exp, @dataset, "input_data", "new")
-
-     ##
-     # make writer object(s)
-
-     writer_classes = [
-       MetaFeatureAccess,
-       FredFeatureAccess
-     ]
-
-     if @append_rather_than_overwrite
-       # append
-       mode = "a"
-       $stderr.puts "Appending new features to the old"
-
-     else
-       # write
-       mode = "w"
-
-       $stderr.puts "Removing old features for the same experiment (if any)"
-
-       writer_classes.each { |w_class|
-         w_class.remove_feature_files(@exp, @dataset)
-       }
-
-       Dir[zipped_input_dir + "*gz"].each { |filename|
-         File.delete(filename)
-       }
-     end
-
-     writers = writer_classes.map { |w_class|
-       w_class.new(@exp, @dataset, mode)
-     }
-
-     ###
-     # zip and store input files
-     Dir[directory + "*.xml"].sort.each { |filename|
-       %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
-     }
-
-     # always remember current sentence
-     @current_sent = nil
-     ###
-     # featurize
-
-     # context_obj.each_window yields tuples of:
-     # - a context, an array of tuples [word, lemma, pos, ne]
-     #   string/nil*string/nil*string/nil*string/nil
-     # - ID of main target: string
-     # - target_IDs: array:string, list of IDs of target words
-     # - senses: array:string, the senses for the target
-     # - sent: SalsaTigerSentence object
-     #
-     # for each max. context returned by context object:
-     # determine meta-features:
-     # - context words for all context sizes listed in context_sizes,
-     # - children of target
-     # - parent of target
-     # - siblings of target
-     #
-     # and pass on to writing object(s)
-     target_count = 0
-     context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
-       # inform user
-       if target_count % 500 == 0
-         $stderr.puts "#{target_count}..."
-       end
-       target_count += 1
-       # determine features
-       feature_hash = Hash.new()
-       compute_target_features(context, max_context_size, feature_hash)
-       compute_context_features(context, max_context_size, context_sizes, feature_hash)
-       compute_syn_features(main_target_id, sent, feature_hash)
-       # write
-       each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|
-
-         writers.each { |writer_obj|
-
-           writer_obj.write_item(target_lemma,
-                                 target_pos,
-                                 target_ids,
-                                 target_sid,
-                                 target_senses,
-                                 feature_hash)
-         }
-       }
-     }
-     # finalize writers
-     writers.each { |writer_obj|
-       writer_obj.flush()
-     }
-
-     # record the targets that have been read
-     target_obj.done_reading_targets()
-
-   end
-
-   #####
-   # reuse of meta-features, recompute CSV features
-   def refeaturize()
-
-     ##
-     # remove old features:
-     # normal features only. Keep meta-features.
-     # Don't do anything about zipped input.
-     # Assume it stays as is.
-     if @append_rather_than_overwrite
-       # append
-       mode = "a"
-       $stderr.puts "Appending new features to the old"
-
-     else
-       # write
-       mode = "w"
-
-       $stderr.puts "Removing old features for the same experiment (if any)"
-
-       FredFeatureAccess.remove_feature_files(@exp, @dataset)
-     end
-
-     ##
-     # read meta-feature file,
-     # write fred feature files
-     meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
-     feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)
-
-     ##
-     # featurize
-     target_count = 0
-
-     meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|
-
-       # inform user
-       if target_count % 500 == 0
-         $stderr.puts "#{target_count}..."
-       end
-       target_count += 1
-
-       feature_writer.write_item(target_lemma,
-                                 target_pos,
-                                 target_ids,
-                                 target_sid,
-                                 target_senses,
-                                 feature_hash)
-     }
-     feature_writer.flush()
-   end
-
-
-   ####
-   # given a list of sense hashes, format
-   # "lex" -> lemma
-   # "pos" -> part of speech
-   # "sense" -> sense
-   #
-   # yield as triples [lemma, pos, sense]
-   def each_lemma_pos_and_senses(shashes)
-
-     # determine target and POS. If we actually have more than one lemma and POS, we're in trouble
-     target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s().gsub(/\s/, "_") }.uniq()
-     target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s().gsub(/\s/, "_")}.uniq()
-     target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s().gsub(/\s/, "_")}.uniq()
-
-     if target_lemmas.length() == 1 and target_pos_s.length() == 1 and target_sid.length() == 1
-
-       yield [target_lemmas.first(), target_pos_s.first(),
-              target_sid.first(),
-              shashes.map { |sense_hash| sense_hash["sense"].to_s().gsub(/\s/, "_") }
-             ]
-
-     else
-       # trouble
-
-       # group senses by SID, lemma and pos
-       lemmapos2sense = Hash.new
-       shashes.each { |sense_hash|
-         target_lemma = sense_hash["lex"].to_s().gsub(/\s/, "_")
-         target_pos = sense_hash["pos"].to_s().gsub(/\s/, "_")
-         target_sid = sense_hash["sid"].to_s().gsub(/\s/, "_")
-         target_sense = sense_hash["sense"].to_s().gsub(/\s/, "_")
-         key = [target_sid, target_lemma, target_pos]
-         unless lemmapos2sense[key]
-           lemmapos2sense[key] = Array.new()
-         end
-         lemmapos2sense[key] << target_sense
-       }
-
-       # and yield
-       lemmapos2sense.each_key { |target_sid, target_lemma, target_pos|
-         yield [target_lemma, target_pos, target_sid,
-                lemmapos2sense[[target_sid, target_lemma, target_pos]]
-               ]
-       }
-     end
-   end
-
-   ###
-   # given a context, locate the target,
-   # which is right in the middle,
-   # and enter it into the feature hash
-   #
-   # feature type: TA
-   # entry: word#lemma#pos#ne
-   def compute_target_features(context, # array: word*lemma*pos*ne
-                               center_pos, # integer: size of context, onesided
-                               feature_hash) # hash: feature_type -> array:feature, enter features here
-     feature_hash["TA"] = [
-       context[center_pos].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
-     ]
-   end
-
-   ###
-   # compute context features:
-   # for each context in the given list of context sizes,
-   # compute a context with feature_type "CXNN" (where NN is the size of the context)
-   # and with features
-   # word#lemma#pos#ne
-   #
-   # enter each context into the feature hash
-   def compute_context_features(context, # array: word*lemma*pos*ne
-                                center_pos, # int: context is 2*cx_size_onesided + 1 long
-                                context_sizes, # array:int, produce a context of each of these sizes
-                                feature_hash) # hash: feature_type -> array:feature, enter features here
-
-
-     context_sizes.each { |context_size|
-       # feature type: CXNN, where NN is the size of the context
-       feature_type = "CX" + context_size.to_s()
-
-       # features: an array of strings
-       feature_hash[feature_type] = Array.new()
-
-       # pre-context
-       (center_pos - context_size).upto(center_pos - 1) { |ix|
-         if context[ix]
-           # context entries may be nil at the beginning and end of the text
-           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
-         end
-       }
-       # post-context
-       (center_pos + 1).upto(center_pos + context_size) { |ix|
-         if context[ix]
-           # context entries may be nil at the beginning and end of the text
-           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
-         end
-       }
-     }
-   end
-
-   ###
-   # compute syntax-dependent features:
-   # children (that is, dependents) of the target word,
-   # parent,
-   # and siblings.
-   #
-   # format:
-   # feature type is CH for children, PA for parent, SI for siblings
-   #
-   # individual features are:
-   # <dependency>#<word>#<lemma>#<pos>#<ne>
-   def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
-                            sent, # SalsaTigerRegXML object
-                            feature_hash) # hash: feature_type -> array:feature, enter features here
-
-     target = sent.terminals().detect { |t| t.id() == main_target_id }
-     unless target
-       $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
-       return
-     end
-
-     # if we're starting a new sentence,
-     # compute dependencies using delegate object for grammatical functions.
-     # also, get_children, get_parents below are methods of the delegate
-     unless sent == @current_sent
-       @current_sent = sent
-       set_sent(sent)
-     end
-     # children
-     feature_hash["CH"] = get_children(target).map { |rel, node|
-       #print "\t", rel, " ", node, "\n"
-       rel.to_s() + "#" +
-         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
-     }
-
-     # parent
-     feature_hash["PA"] = get_parents(target).map { |rel, node|
-       #print "\t", rel, " ", node, "\n"
-       rel.to_s() + "#" +
-         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
-     }
-
-     # siblings
-     feature_hash["SI"] = Array.new()
-     get_parents(target).each { |rel, parent|
-       parent_w, d1, d2, d3 = word_lemma_pos_ne(parent, @interpreter_class)
-
-       get_children(parent).each { |rel, node|
-         #print "\t", rel, " ", node, "\n"
-
-         if node == target
-           next
-         end
-
-         feature_hash["SI"] << parent_w + "#" +
-           rel.to_s() + "#" +
-           word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
-       }
-     }
-   end
- end
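
The removed `compute_context_features` above builds one `CX<N>` feature set per configured context size, encoding each context token as `word#lemma#pos#ne` with whitespace mapped to underscores and `nil` boundary slots skipped. A minimal standalone sketch of that windowing logic; the sample tuples here are hypothetical, since the real code receives its context from a `ContextProvider`:

```ruby
# Standalone sketch of the CX<N> windowing; sample data is hypothetical.
context = [
  ["The", "the", "DT", nil],     # entries may be nil at text boundaries
  ["bank", "bank", "NN", nil],   # center_pos: the target token
  ["closed", "close", "VBD", nil]
]
center_pos = 1
context_sizes = [1]

feature_hash = {}
context_sizes.each do |context_size|
  feature_type = "CX#{context_size}"
  feature_hash[feature_type] = []
  # pre- and post-context around the target, skipping the target and nil slots
  (center_pos - context_size).upto(center_pos + context_size) do |ix|
    next if ix == center_pos || ix < 0 || context[ix].nil?
    feature_hash[feature_type] << context[ix].map(&:to_s).join("#").gsub(/\s/, "_")
  end
end
p feature_hash # {"CX1"=>["The#the#DT#", "closed#close#VBD#"]}
```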
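One more note on the removed `GrammaticalFunctionAccess`: although `@to` and `@from` are created as `Hash.new([])`, the code always assigns a fresh `Array.new` before appending. That guard matters, because appending to the shared default of `Hash.new([])` silently mutates a single array without ever storing a key. A short illustration (not from the gem):

```ruby
# Pitfall: Hash.new([]) hands out ONE shared default array.
h = Hash.new([])
h[:a] << 1           # mutates the shared default; no key is stored
p h.key?(:a)         # => false
p h[:b]              # => [1] -- every lookup sees the same mutated array

# The removed code avoids this by storing a fresh array per key:
to = Hash.new([])
to[:head] = [] unless to.key?(:head)
to[:head] << [:SB, :node]
p to                 # => {:head=>[[:SB, :node]]}
```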
@@ -1,27 +0,0 @@
- require "fred/FredFeatures"
-
- def determine_training_senses(lemma, exp, lemmas_and_senses_obj, split_id)
-   if split_id
-     # oh no, we're splitting the dataset into random training and test portions.
-     # this means that we actually have to look into the training part of the data to
-     # determine the number of training senses
-     senses_hash = Hash.new()
-
-     reader = AnswerKeyAccess.new(exp, "train", lemma, "r", split_id, "train")
-     reader.each { |lemma, pos, ids, sids, gold_senses, transformed_gold_senses|
-       gold_senses.each { |s| senses_hash[s] = true }
-     }
-     return senses_hash.keys()
-
-   else
-     # we're using separate test data.
-     # so we can just look up the number of training senses
-     # in the lemmas_and_senses object
-     senses = lemmas_and_senses_obj.get_senses(lemma)
-     if senses
-       return senses
-     else
-       return []
-     end
-   end
- end
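
The split branch of the removed `determine_training_senses` reduces to collecting each distinct gold sense seen in the training rows of the answer key. A self-contained sketch of that reduction, with hypothetical rows standing in for what `AnswerKeyAccess#each` yields:

```ruby
# Hypothetical answer-key rows: [lemma, pos, ids, sids, gold_senses, transformed]
rows = [
  ["bank", "noun", ["t1"], ["s1"], ["finance"], ["finance"]],
  ["bank", "noun", ["t2"], ["s1"], ["river"],   ["river"]],
  ["bank", "noun", ["t3"], ["s1"], ["finance"], ["finance"]]
]

# Collect each gold sense once, as the split branch above does.
senses_hash = {}
rows.each do |_lemma, _pos, _ids, _sids, gold_senses, _transformed|
  gold_senses.each { |s| senses_hash[s] = true }
end
p senses_hash.keys # => ["finance", "river"]
```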