frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,596 @@
+ # FredFeaturize
+ #
+ # Featurization for WSD
+ #
+ # Katrin Erk April 2007
+ #
+ # feature types currently allowed:
+ # - context (with parameter giving context size; may be set several times)
+ # - syntax
+ # - synsem
+ #
+ # features in Meta-feature file:
+ #
+ # CX: context: word/lemma/pos/ne
+ # CH: children: grfunc/word/lemma/pos/ne
+ # PA: parents: grfunc/word/lemma/pos/ne
+ # SI: sibling: parent/grfunc/word/lemma/pos/ne
+ # TA: target: word/lemma/pos/ne
+
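As an illustration of the feature types listed above, a single target's meta-features might contain entries like the following (values invented; the on-disk layout is handled by MetaFeatureAccess, which is not part of this file, but each entry uses the '#'-joined word#lemma#pos#ne encoding produced by the compute_* methods below, with an empty final field when no named entity is assigned):

  TA   bank#bank#NN#
  CX1  old#old#JJ#  was#be#VBD#
  CH   det#the#the#DT#
  PA   subj#flooded#flood#VBD#
  SI   flooded#obj#levee#levee#NN#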
+ require "delegate"
+
+ #######
+
+ require "fred/FileZipped"
+ require "common/Parser"
+ require "common/RegXML"
+ require "common/SalsaTigerRegXML"
+ require "common/SalsaTigerXMLHelper"
+
+ require "fred/FredConfigData"
+ require "fred/FredConventions"
+ require "common/FrPrepConfigData"
+ require "common/FrprepHelper"
+ require "common/SynInterfaces"
+
+ require "fred/FredBOWContext"
+ require "fred/FredDetermineTargets"
+ require "fred/FredFeatures"
+
+ ####################################
+ # grammatical function computation:
+ # given a sentence, keep all grammatical function relations in a hash
+ # for faster access
+ class GrammaticalFunctionAccess
+
+   def initialize(interpreter_class)
+     @interpreter_class = interpreter_class
+     @to = Hash.new([])   # default: return empty array
+     @from = Hash.new([]) # default: return empty array
+   end
+
+   def set_sent(sent) # SalsaTigerRegXML sentence
+     @to.clear()
+     @from.clear()
+
+     sent.each_syn_node { |current|
+
+       current_head = @interpreter_class.head_terminal(current)
+       unless current_head
+         next
+       end
+
+       @interpreter_class.gfs(current, sent).map { |rel, node|
+         # PPs: use head noun rather than preposition as head
+         # Sbar, VP: use verb
+         if (n = @interpreter_class.informative_content_node(node))
+           [rel, n]
+         else
+           [rel, node]
+         end
+       }.each { |rel, node|
+
+         rel_head = @interpreter_class.head_terminal(node)
+         unless rel_head
+           next
+         end
+
+         unless @to.has_key? current_head
+           @to[current_head] = Array.new()
+         end
+         unless @to[current_head].include? [rel, rel_head]
+           @to[current_head] << [rel, rel_head]
+         end
+
+         unless @from.has_key? rel_head
+           @from[rel_head] = Array.new()
+         end
+         unless @from[rel_head].include? [rel, current_head]
+           @from[rel_head] << [rel, current_head]
+         end
+       }
+     }
+     # $stderr.puts "Changed sentence:"
+     # @to.each_pair { |p, ch|
+     #   $stderr.puts "\t#{p.id()}: " + ch.map { |rel, n| rel + "/" + n.id() }.join(", ")
+     # }
+     # $stdin.gets()
+   end
+
+   def get_children(node)
+     return @to[node]
+   end
+
+   def get_parents(node)
+     return @from[node]
+   end
+ end
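A minimal usage sketch for this class (hypothetical driver code; it assumes `interpreter` is one of the syntactic interpreter classes that SynInterfaces.get_interpreter_according_to_exp() returns, and `sent` a parsed SalsaTigerSentence, both of which come from outside this file):

  # cache all grammatical function relations of one sentence ...
  gf = GrammaticalFunctionAccess.new(interpreter)
  gf.set_sent(sent)

  # ... then look up dependents and governors per terminal node
  sent.terminals().each { |t|
    gf.get_children(t).each { |rel, child|
      puts "#{t.id()} --#{rel}--> #{child.id()}"
    }
    gf.get_parents(t).each { |rel, parent|
      puts "#{t.id()} <--#{rel}-- #{parent.id()}"
    }
  }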
+
+ ####################################
+ # main class of this package
+ ####################################
+ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
+
+   include WordLemmaPosNe
+
+   #####
+   def initialize(exp_obj,       # FredConfigData object
+                  options,       # hash: runtime option name (string) => value (string)
+                  varhash = {})  # optional parameter: "refeaturize"
+
+     ##
+     # evaluate runtime options
+     if $ENDUSER_MODE
+       # only possible dataset: test
+       @dataset = "test"
+     else
+       @dataset = nil
+     end
+     @append_rather_than_overwrite = false
+
+     options.each_pair do |opt, arg|
+       case opt
+       when '--dataset'
+         @dataset = arg
+         unless ["train", "test"].include? @dataset
+           $stderr.puts "--dataset needs to be either 'train' or 'test'"
+           exit 1
+         end
+
+       when '--append'
+         @append_rather_than_overwrite = true
+
+       else
+         # case of unknown arguments has been dealt with by fred.rb
+       end
+     end
+
+     # further sanity checks
+     if @dataset.nil?
+       $stderr.puts "Please set --dataset: one of 'train', 'test'"
+       exit 1
+     end
+
+     in_enduser_mode_ensure(@dataset == "test")
+
+     # evaluate optional "refeaturize" argument
+     # "refeaturize": reuse meta-feature set,
+     # just redo CSV featurization
+     if varhash["refeaturize"]
+       @refeaturize = varhash["refeaturize"]
+     else
+       @refeaturize = false
+     end
+
+     # prepare experiment file: add preprocessing experiment file data
+     @exp = exp_obj
+
+     preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
+     if not(preproc_expname)
+       $stderr.puts "Please set the name of the preprocessing experiment file"
+       $stderr.puts "in the experiment file, parameter preproc_descr_file_#{@dataset}"
+       exit 1
+     elsif not(File.readable?(preproc_expname))
+       $stderr.puts "Error in the experiment file:"
+       $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
+       exit 1
+     end
+     preproc_exp = FrPrepConfigData.new(preproc_expname)
+     @exp.adjoin(preproc_exp)
+
+     # get the right syntactic interface
+     SynInterfaces.check_interfaces_abort_if_missing(@exp)
+     @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
+
+     # initialize grammatical function object (delegating)
+     grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
+     super(grf_obj)
+
+     # announce the task
+     $stderr.puts "---------"
+     $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
+     if @refeaturize
+       $stderr.puts "Keeping meta-features, redoing featurization only."
+     end
+     if @exp.get("binary_classifiers")
+       $stderr.puts "Writing features for binary classifiers."
+     else
+       $stderr.puts "Writing features for n-ary classifiers."
+     end
+     $stderr.puts "---------"
+   end
+
+   ####
+   def compute()
+     if @refeaturize
+       # read meta-feature file,
+       # just redo normal featurization
+       refeaturize()
+     else
+       # write meta-features and normal features
+       featurize()
+     end
+   end
+
+   #########################
+   private
+
+   #####
+   # main featurization
+   def featurize()
+
+     ###
+     # make objects
+     unless @exp.get("directory_preprocessed")
+       $stderr.puts "Shalmaneser error: could not find the directory with"
+       $stderr.puts "syntactically preprocessed data."
+       $stderr.puts "Please make sure that 'directory_preprocessed'"
+       $stderr.puts "is set in the frprep experiment file you use with this experiment."
+       exit 1
+     end
+     directory = File.existing_dir(@exp.get("directory_preprocessed"))
+
+     # get context sizes
+     context_sizes = @exp.get_lf("feature", "context")
+     unless context_sizes
+       # no contexts set in the experiment file:
+       # choose a default context
+       $stderr.puts "Error: no contexts set."
+       $stderr.puts "I will compute a context of size 1 by default."
+       $stderr.puts "(This goes into the meta-features, but not"
+       $stderr.puts "into the set of features used in the classifier.)"
+       context_sizes = [1]
+     end
+     max_context_size = context_sizes.max()
+
+     # make target determination object
+     if @dataset == "test" and @exp.get("apply_to_all_known_targets")
+       $stderr.puts "Fred: Using all known targets as instances."
+       target_obj = FindAllTargets.new(@exp, @interpreter_class)
+     else
+       if @append_rather_than_overwrite
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
+       else
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
+       end
+     end
+
+     # make context computation object
+     if @exp.get("single_sent_context")
+       # contexts in the input data don't go beyond a single sentence
+       context_obj = SingleSentContextProvider.new(max_context_size, @exp,
+                                                   @interpreter_class, target_obj,
+                                                   @dataset)
+       if @exp.get("noncontiguous_input")
+         $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
+         $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
+       end
+
+     elsif @exp.get("noncontiguous_input")
+       # the input data is not contiguous but
+       # consists of selected sentences from a larger text
+       context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
+                                                      @interpreter_class, target_obj,
+                                                      @dataset)
+     else
+       # the input data is contiguous, and we're computing
+       # contexts not restricted to single sentences
+       context_obj = ContextProvider.new(max_context_size, @exp,
+                                         @interpreter_class, target_obj, @dataset)
+     end
+
+     zipped_input_dir = fred_dirname(@exp, @dataset, "input_data", "new")
+
+     ##
+     # make writer object(s)
+     writer_classes = [
+       MetaFeatureAccess,
+       FredFeatureAccess
+     ]
+
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old"
+
+     else
+       # write
+       mode = "w"
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       writer_classes.each { |w_class|
+         w_class.remove_feature_files(@exp, @dataset)
+       }
+
+       Dir[zipped_input_dir + "*gz"].each { |filename|
+         File.delete(filename)
+       }
+     end
+
+     writers = writer_classes.map { |w_class|
+       w_class.new(@exp, @dataset, mode)
+     }
+
+     ###
+     # zip and store input files
+     Dir[directory + "*.xml"].sort.each { |filename|
+       %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
+     }
+
+     # always remember current sentence
+     @current_sent = nil
+
+     ###
+     # featurize
+
+     # context_obj.each_window yields tuples of:
+     # - a context: an array of tuples [word, lemma, pos, ne],
+     #   each entry a string or nil
+     # - ID of main target: string
+     # - target_IDs: array:string, list of IDs of target words
+     # - senses: array of sense hashes for the target
+     #   (see each_lemma_pos_and_senses() below for their format)
+     # - sent: SalsaTigerSentence object
+     #
+     # for each max. context returned by the context object,
+     # determine meta-features:
+     # - context words for all context sizes listed in context_sizes
+     # - children of target
+     # - parent of target
+     # - siblings of target
+     #
+     # and pass them on to the writing object(s)
+     target_count = 0
+     context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
+       # inform user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+
+       # determine features
+       feature_hash = Hash.new()
+       compute_target_features(context, max_context_size, feature_hash)
+       compute_context_features(context, max_context_size, context_sizes, feature_hash)
+       compute_syn_features(main_target_id, sent, feature_hash)
+
+       # write
+       each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|
+         writers.each { |writer_obj|
+           writer_obj.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+         }
+       }
+     }
+
+     # finalize writers
+     writers.each { |writer_obj|
+       writer_obj.flush()
+     }
+
+     # record the targets that have been read
+     target_obj.done_reading_targets()
+   end
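For reference, one iteration of the each_window loop above receives data shaped roughly like this (invented values; the ContextProvider classes come from fred/FredBOWContext and are not defined in this file). The target sits at index max_context_size, and entries at the edges of the text may be nil:

  context        = [["the", "the", "DT", nil],
                    ["old", "old", "JJ", nil],
                    ["bank", "bank", "NN", nil],    # the target
                    ["was", "be", "VBD", nil],
                    ["flooded", "flood", "VBN", nil]]
  main_target_id = "s42_4"                          # hypothetical node ID
  target_ids     = ["s42_4"]
  senses         = [{"lex" => "bank", "pos" => "noun", "sid" => "42", "sense" => "bank.01"}]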
+
+   #####
+   # reuse of meta-features, recompute CSV features
+   def refeaturize()
+
+     ##
+     # remove old features:
+     # normal features only. Keep meta-features.
+     # Don't do anything about zipped input:
+     # assume it stays as is.
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old"
+
+     else
+       # write
+       mode = "w"
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       FredFeatureAccess.remove_feature_files(@exp, @dataset)
+     end
+
+     ##
+     # read meta-feature file,
+     # write fred feature files
+     meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
+     feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)
+
+     ##
+     # featurize
+     target_count = 0
+
+     meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|
+
+       # inform user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+
+       feature_writer.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+     }
+     feature_writer.flush()
+   end
+
+   ####
+   # given a list of sense hashes of the format
+   # "lex" -> lemma
+   # "pos" -> part of speech
+   # "sid" -> sentence ID
+   # "sense" -> sense
+   #
+   # yield tuples [lemma, pos, sid, senses (array)]
+   def each_lemma_pos_and_senses(shashes)
+
+     # determine target lemma, POS and sentence ID. If we actually
+     # have more than one of each, we're in trouble
+     target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s().gsub(/\s/, "_") }.uniq()
+     target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s().gsub(/\s/, "_") }.uniq()
+     target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s().gsub(/\s/, "_") }.uniq()
+
+     if target_lemmas.length() == 1 and target_pos_s.length() == 1 and target_sid.length() == 1
+
+       yield [target_lemmas.first(), target_pos_s.first(),
+              target_sid.first(),
+              shashes.map { |sense_hash| sense_hash["sense"].to_s().gsub(/\s/, "_") }
+             ]
+
+     else
+       # trouble:
+       # group senses by SID, lemma and POS
+       lemmapos2sense = Hash.new
+       shashes.each { |sense_hash|
+         target_lemma = sense_hash["lex"].to_s().gsub(/\s/, "_")
+         target_pos = sense_hash["pos"].to_s().gsub(/\s/, "_")
+         target_sid = sense_hash["sid"].to_s().gsub(/\s/, "_")
+         target_sense = sense_hash["sense"].to_s().gsub(/\s/, "_")
+         key = [target_sid, target_lemma, target_pos]
+         unless lemmapos2sense[key]
+           lemmapos2sense[key] = Array.new()
+         end
+         lemmapos2sense[key] << target_sense
+       }
+
+       # and yield
+       lemmapos2sense.each_key { |target_sid, target_lemma, target_pos|
+         yield [target_lemma, target_pos, target_sid,
+                lemmapos2sense[[target_sid, target_lemma, target_pos]]
+               ]
+       }
+     end
+   end
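A worked example of the grouping behaviour (hypothetical sense hashes): two senses that share lemma, POS and sentence ID are collected into a single yielded tuple:

  shashes = [
    {"lex" => "bank", "pos" => "noun", "sid" => "42", "sense" => "bank.01"},
    {"lex" => "bank", "pos" => "noun", "sid" => "42", "sense" => "bank.02"}
  ]
  each_lemma_pos_and_senses(shashes) { |lemma, pos, sid, senses|
    # called once, with:
    # lemma == "bank", pos == "noun", sid == "42",
    # senses == ["bank.01", "bank.02"]
  }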
+
+   ###
+   # given a context, locate the target,
+   # which is right in the middle,
+   # and enter it into the feature hash
+   #
+   # feature type: TA
+   # entry: word#lemma#pos#ne
+   def compute_target_features(context,      # array: word*lemma*pos*ne
+                               center_pos,   # integer: size of context, one-sided
+                               feature_hash) # hash: feature_type -> array:feature, enter features here
+     feature_hash["TA"] = [
+       context[center_pos].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     ]
+   end
+
+   ###
+   # compute context features:
+   # for each context size in the given list,
+   # compute a context with feature_type "CXNN" (where NN is the size of the context)
+   # and with features
+   #   word#lemma#pos#ne
+   #
+   # enter each context into the feature hash
+   def compute_context_features(context,       # array: word*lemma*pos*ne
+                                center_pos,    # int: context is 2*cx_size_onesided + 1 long
+                                context_sizes, # array:int, produce a context of each of these sizes
+                                feature_hash)  # hash: feature_type -> array:feature, enter features here
+
+     context_sizes.each { |context_size|
+       # feature type: CXNN, where NN is the size of the context
+       feature_type = "CX" + context_size.to_s()
+
+       # features: an array of strings
+       feature_hash[feature_type] = Array.new()
+
+       # pre-context
+       (center_pos - context_size).upto(center_pos - 1) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+       # post-context
+       (center_pos + 1).upto(center_pos + context_size) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+     }
+   end
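To make the index arithmetic concrete, here is a hypothetical call from inside the class, with a five-entry context (center_pos = 2) and two context sizes; the expected results follow directly from the loops above:

  feature_hash = {}
  context = [["the", "the", "DT", nil],
             ["old", "old", "JJ", nil],
             ["bank", "bank", "NN", nil],
             ["was", "be", "VBD", nil],
             ["flooded", "flood", "VBN", nil]]
  compute_context_features(context, 2, [1, 2], feature_hash)
  # feature_hash["CX1"] == ["old#old#JJ#", "was#be#VBD#"]
  # feature_hash["CX2"] == ["the#the#DT#", "old#old#JJ#", "was#be#VBD#", "flooded#flood#VBN#"]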
+
+   ###
+   # compute syntax-dependent features:
+   # children (that is, dependents) of the target word,
+   # parent,
+   # and siblings.
+   #
+   # format:
+   # feature type is CH for children, PA for parent, SI for siblings
+   #
+   # individual features are:
+   #   <dependency>#<word>#<lemma>#<pos>#<ne>
+   def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
+                            sent,           # SalsaTigerRegXML object
+                            feature_hash)   # hash: feature_type -> array:feature, enter features here
+
+     target = sent.terminals().detect { |t| t.id() == main_target_id }
+     unless target
+       $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
+       return
+     end
+
+     # if we're starting a new sentence,
+     # compute dependencies using the delegate object for grammatical functions.
+     # also, get_children and get_parents below are methods of the delegate
+     unless sent == @current_sent
+       @current_sent = sent
+       set_sent(sent)
+     end
+
+     # children
+     feature_hash["CH"] = get_children(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # parent
+     feature_hash["PA"] = get_parents(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # siblings
+     feature_hash["SI"] = Array.new()
+     get_parents(target).each { |rel, parent|
+       parent_w, d1, d2, d3 = word_lemma_pos_ne(parent, @interpreter_class)
+
+       get_children(parent).each { |rel, node|
+         # print "\t", rel, " ", node, "\n"
+
+         if node == target
+           next
+         end
+
+         feature_hash["SI"] << parent_w + "#" +
+           rel.to_s() + "#" +
+           word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+       }
+     }
+   end
+ end
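Taken together, a minimal end-to-end driver sketch for this class (hypothetical paths and values; in the gem itself, fred.rb parses the command line and constructs these objects, and the FredConfigData constructor signature is assumed here to take the experiment file path, by analogy with FrPrepConfigData above):

  require "fred/FredConfigData"
  require "fred/FredFeaturize"

  exp = FredConfigData.new("fred.exp")   # experiment file; path is invented
  options = { "--dataset" => "train" }

  # full run: write meta-features and classifier features
  FredFeaturize.new(exp, options).compute()

  # later: keep the stored meta-features, redo only the CSV featurization
  FredFeaturize.new(exp, options, "refeaturize" => true).compute()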