frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
data/lib/fred/FredFeaturize.rb
@@ -0,0 +1,596 @@
+ # FredFeaturize
+ #
+ # Featurization for WSD
+ #
+ # Katrin Erk April 2007
+ #
+ # feature types currently allowed:
+ # - context (with parameter giving context size; may be set several times)
+ # - syntax
+ # - synsem
+ #
+ # features in Meta-feature file:
+ #
+ # CX: context: word/lemma/pos/ne
+ # CH: children: grfunc/word/lemma/pos/ne
+ # PA: parents: grfunc/word/lemma/pos/ne
+ # SI: sibling: parent/grfunc/word/lemma/pos/ne
+ # TA: target: word/lemma/pos/ne
+
+ require "delegate"
+
+ #######
+
+ require "fred/FileZipped"
+ require "common/Parser"
+ require "common/RegXML"
+ require "common/SalsaTigerRegXML"
+ require "common/SalsaTigerXMLHelper"
+
+ require "fred/FredConfigData"
+ require "fred/FredConventions"
+ require "common/FrPrepConfigData"
+ require "common/FrprepHelper"
+ require "common/SynInterfaces"
+
+ require "fred/FredBOWContext"
+ require "fred/FredDetermineTargets"
+ require "fred/FredFeatures"
+
+ ####################################
+ # grammatical function computation:
+ # given a sentence, keep all grammatical function relations in a hash
+ # for faster access
+ class GrammaticalFunctionAccess
+
+   def initialize(interpreter_class)
+     @interpreter_class = interpreter_class
+     @to = Hash.new( [] ) # default: return empty array
+     @from = Hash.new( [] ) # default: return empty array
+   end
+
+   def set_sent(sent) # SalsaTigerRegXML sentence
+
+     @to.clear()
+     @from.clear()
+
+     sent.each_syn_node { |current|
+
+       current_head = @interpreter_class.head_terminal(current)
+       unless current_head
+         next
+       end
+
+       @interpreter_class.gfs(current, sent).map { |rel, node|
+         # PPs: use head noun rather than preposition as head
+         # Sbar, VP: use verb
+         if (n = @interpreter_class.informative_content_node(node))
+           [rel, n]
+         else
+           [rel, node]
+         end
+       }.each { |rel, node|
+
+         rel_head = @interpreter_class.head_terminal(node)
+         unless rel_head
+           next
+         end
+
+         unless @to.has_key? current_head
+           @to[current_head] = Array.new()
+         end
+         unless @to[current_head].include? [rel, rel_head]
+           @to[current_head] << [rel, rel_head]
+         end
+
+         unless @from.has_key? rel_head
+           @from[rel_head] = Array.new()
+         end
+         unless @from[rel_head].include? [rel, current_head]
+           @from[rel_head] << [rel, current_head]
+         end
+       }
+     }
+     # $stderr.puts "Changed sentence:"
+     # @to.each_pair { |p, ch|
+     #   $stderr.puts "\t#{p.id()}: " + ch.map { |rel, n| rel + "/"+n.id()}.join(", ")
+     # }
+     # $stdin.gets()
+   end
+
+   def get_children(node)
+     return @to[node]
+   end
+
+   def get_parents(node)
+     return @from[node]
+   end
+ end
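
For illustration, a minimal usage sketch of this cache (not part of the gem; SomeInterpreter and sentence are hypothetical stand-ins for a real SynInterfaces interpreter class and a parsed SalsaTigerRegXML sentence):

    # Sketch only: one pass over the syntax nodes fills @to and @from,
    # then lookups are constant-time hash accesses.
    gf = GrammaticalFunctionAccess.new(SomeInterpreter)
    gf.set_sent(sentence)
    node = sentence.terminals.first
    gf.get_children(node)   # => [[grfunc, dependent_head_terminal], ...], [] if none
    gf.get_parents(node)    # => [[grfunc, governing_head_terminal], ...], [] if none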
+
+ ####################################
+ # main class of this package
+ ####################################
+ class FredFeaturize < DelegateClass(GrammaticalFunctionAccess)
+
+   include WordLemmaPosNe
+
+   #####
+   def initialize(exp_obj, # FredConfigData object
+                  options, # hash: runtime option name (string) => value(string)
+                  varhash = {}) # optional parameter: "refeaturize"
+
+     ##
+     # evaluate runtime options
+     if $ENDUSER_MODE
+       # only possible dataset: test
+       @dataset = "test"
+     else
+       @dataset = nil
+     end
+     @append_rather_than_overwrite = false
+
+     options.each_pair do |opt, arg|
+       case opt
+       when '--dataset'
+         @dataset = arg
+         unless ["train", "test"].include? @dataset
+           $stderr.puts "--dataset needs to be either 'train' or 'test'"
+           exit 1
+         end
+
+       when '--append'
+         @append_rather_than_overwrite = true
+
+       else
+         # case of unknown arguments has been dealt with by fred.rb
+       end
+     end
+
+     # further sanity checks
+     if @dataset.nil?
+       $stderr.puts "Please set --dataset: one of 'train', 'test'"
+       exit 1
+     end
+
+     in_enduser_mode_ensure(@dataset == "test")
+
+     # evaluate optional "refeaturize" argument
+     # "refeaturize": reuse meta-feature set,
+     # just redo CSV featurization
+     if varhash["refeaturize"]
+       @refeaturize = varhash["refeaturize"]
+     else
+       @refeaturize = false
+     end
+
+     # prepare experiment file: add preprocessing experiment file data
+     @exp = exp_obj
+
+     preproc_expname = @exp.get("preproc_descr_file_" + @dataset)
+     if not(preproc_expname)
+       $stderr.puts "Please set the name of the preprocessing experiment file"
+       $stderr.puts "in the experiment file, parameter preproc_descr_file_#{@dataset}"
+       exit 1
+     elsif not(File.readable?(preproc_expname))
+       $stderr.puts "Error in the experiment file:"
+       $stderr.puts "Parameter preproc_descr_file_#{@dataset} has to be a readable file."
+       exit 1
+     end
+     preproc_exp = FrPrepConfigData.new(preproc_expname)
+     @exp.adjoin(preproc_exp)
+
+     # get the right syntactic interface
+     SynInterfaces.check_interfaces_abort_if_missing(@exp)
+     @interpreter_class = SynInterfaces.get_interpreter_according_to_exp(@exp)
+
+     # initialize grammatical function object (delegating)
+     grf_obj = GrammaticalFunctionAccess.new(@interpreter_class)
+     super(grf_obj)
+
+     # announce the task
+     $stderr.puts "---------"
+     $stderr.puts "Fred experiment #{@exp.get("experiment_ID")}: Featurization of dataset #{@dataset}"
+     if @refeaturize
+       $stderr.puts "Keeping meta-features, redoing featurization only."
+     end
+     if @exp.get("binary_classifiers")
+       $stderr.puts "Writing features for binary classifiers."
+     else
+       $stderr.puts "Writing features for n-ary classifiers."
+     end
+     $stderr.puts "---------"
+   end
+
+   ####
+   def compute()
+     if @refeaturize
+       # read meta-feature file,
+       # just redo normal featurization
+       refeaturize()
+     else
+       # write meta features and normal features
+       featurize()
+     end
+   end
+
+   #########################
+   private
+
+   #####
+   # main featurization
+   def featurize()
+
+     ###
+     # make objects
+     unless @exp.get("directory_preprocessed")
+       $stderr.puts "Shalmaneser error: could not find the directory with"
+       $stderr.puts "syntactically preprocessed data."
+       $stderr.puts "Please make sure that 'directory_preprocessed'"
+       $stderr.puts "is set in the frprep experiment file you use with this experiment."
+       exit 1
+     end
+     directory = File.existing_dir(@exp.get("directory_preprocessed"))
+
+     # get context sizes
+     context_sizes = @exp.get_lf("feature", "context")
+     unless context_sizes
+       # no contexts, nothing to compute.
+       # choose default context
+       $stderr.puts "Error: no contexts set."
+       $stderr.puts "I will compute a context of size 1 by default."
+       $stderr.puts "(This goes into the meta-features, but not"
+       $stderr.puts "into the set of features used in the classifier.)"
+       context_sizes = [1]
+     end
+     max_context_size = context_sizes.max()
+
+     # make target determination object
+     if @dataset == "test" and @exp.get("apply_to_all_known_targets")
+       $stderr.puts "Fred: Using all known targets as instances."
+       target_obj = FindAllTargets.new(@exp, @interpreter_class)
+     else
+       if @append_rather_than_overwrite
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "a")
+       else
+         target_obj = FindTargetsFromFrames.new(@exp, @interpreter_class, "w")
+       end
+     end
+
+     # make context computation object
+     if @exp.get("single_sent_context")
+       # contexts in the input data don't go beyond a single sentence
+       context_obj = SingleSentContextProvider.new(max_context_size, @exp,
+                                                   @interpreter_class, target_obj,
+                                                   @dataset)
+       if @exp.get("noncontiguous_input")
+         $stderr.puts "Warning: 'single_sent_context' has been set in the experiment file."
+         $stderr.puts "So I'm ignoring the 'noncontiguous_input = true' setting."
+       end
+
+     elsif @exp.get("noncontiguous_input")
+       # the input data is not contiguous but
+       # consists of selected sentences from a larger text
+       context_obj = NoncontiguousContextProvider.new(max_context_size, @exp,
+                                                      @interpreter_class, target_obj,
+                                                      @dataset)
+     else
+       # the input data is contiguous, and we're computing contexts
+       # not restricted to single sentences
+       context_obj = ContextProvider.new(max_context_size, @exp,
+                                         @interpreter_class, target_obj, @dataset)
+     end
+
+     zipped_input_dir = fred_dirname(@exp, @dataset, "input_data", "new")
+
+     ##
+     # make writer object(s)
+
+     writer_classes = [
+       MetaFeatureAccess,
+       FredFeatureAccess
+     ]
+
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old"
+
+     else
+       # write
+       mode = "w"
+
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       writer_classes.each { |w_class|
+         w_class.remove_feature_files(@exp, @dataset)
+       }
+
+       Dir[zipped_input_dir + "*gz"].each { |filename|
+         File.delete(filename)
+       }
+     end
+
+     writers = writer_classes.map { |w_class|
+       w_class.new(@exp, @dataset, mode)
+     }
+
+     ###
+     # zip and store input files
+     Dir[directory + "*.xml"].sort.each { |filename|
+       %x{gzip -c #{filename} > #{zipped_input_dir}#{File.basename(filename)}.gz}
+     }
+
+     # always remember current sentence
+     @current_sent = nil
+
+     ###
+     # featurize
+
+     # context_obj.each_window yields tuples of:
+     # - a context, an array of tuples [word, lemma, pos, ne]
+     #   string/nil*string/nil*string/nil*string/nil
+     # - ID of main target: string
+     # - target_IDs: array:string, list of IDs of target words
+     # - senses: array:string, the senses for the target
+     # - sent: SalsaTigerSentence object
+     #
+     # for each max. context returned by the context object:
+     # determine meta-features:
+     # - context words for all context sizes listed in context_sizes,
+     # - children of target
+     # - parent of target
+     # - siblings of target
+     #
+     # and pass on to writing object(s)
+     target_count = 0
+     context_obj.each_window(directory) { |context, main_target_id, target_ids, senses, sent|
+       # inform user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+       # determine features
+       feature_hash = Hash.new()
+       compute_target_features(context, max_context_size, feature_hash)
+       compute_context_features(context, max_context_size, context_sizes, feature_hash)
+       compute_syn_features(main_target_id, sent, feature_hash)
+       # write
+       each_lemma_pos_and_senses(senses) { |target_lemma, target_pos, target_sid, target_senses|
+
+         writers.each { |writer_obj|
+
+           writer_obj.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+         }
+       }
+     }
+     # finalize writers
+     writers.each { |writer_obj|
+       writer_obj.flush()
+     }
+
+     # record the targets that have been read
+     target_obj.done_reading_targets()
+
+   end
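
To make the write path concrete, here is a sketch (all tokens invented, not output of the gem) of the feature_hash that one iteration of the each_window loop assembles for a single target, in the CX/CH/PA/SI/TA formats listed in the file header. A nil ne field joins as the empty string, hence the trailing "#":

    feature_hash = {
      "TA"  => ["runs#run#VBZ#"],                  # the target itself: word#lemma#pos#ne
      "CX1" => ["dog#dog#NN#", "fast#fast#RB#"],   # context window of size 1
      "CH"  => ["SB#dog#dog#NN#"],                 # children: grfunc#word#lemma#pos#ne
      "PA"  => [],                                 # parents of the target
      "SI"  => []                                  # siblings: parent#grfunc#word#lemma#pos#ne
    }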
+
+   #####
+   # reuse of meta-features, recompute CSV features
+   def refeaturize()
+
+     ##
+     # remove old features:
+     # normal features only. Keep meta-features.
+     # Don't do anything about zipped input.
+     # Assume it stays as is.
+     if @append_rather_than_overwrite
+       # append
+       mode = "a"
+       $stderr.puts "Appending new features to the old"
+
+     else
+       # write
+       mode = "w"
+
+       $stderr.puts "Removing old features for the same experiment (if any)"
+
+       FredFeatureAccess.remove_feature_files(@exp, @dataset)
+     end
+
+     ##
+     # read meta-feature file,
+     # write fred feature files
+     meta_reader = MetaFeatureAccess.new(@exp, @dataset, "r")
+     feature_writer = FredFeatureAccess.new(@exp, @dataset, mode)
+
+     ##
+     # featurize
+     target_count = 0
+
+     meta_reader.each_item { |target_lemma, target_pos, target_ids, target_sid, target_senses, feature_hash|
+
+       # inform user
+       if target_count % 500 == 0
+         $stderr.puts "#{target_count}..."
+       end
+       target_count += 1
+
+       feature_writer.write_item(target_lemma,
+                                 target_pos,
+                                 target_ids,
+                                 target_sid,
+                                 target_senses,
+                                 feature_hash)
+     }
+     feature_writer.flush()
+   end
+
+   ####
+   # given a list of sense hashes, format
+   # "lex" -> lemma
+   # "pos" -> part of speech
+   # "sid" -> sense ID
+   # "sense" -> sense
+   #
+   # yield as tuples [lemma, pos, sense ID, list of senses]
+   def each_lemma_pos_and_senses(shashes)
+
+     # determine target and POS. If we actually have more than one lemma and POS, we're in trouble
+     target_lemmas = shashes.map { |sense_hash| sense_hash["lex"].to_s().gsub(/\s/, "_") }.uniq()
+     target_pos_s = shashes.map { |sense_hash| sense_hash["pos"].to_s().gsub(/\s/, "_") }.uniq()
+     target_sid = shashes.map { |sense_hash| sense_hash["sid"].to_s().gsub(/\s/, "_") }.uniq()
+
+     if target_lemmas.length() == 1 and target_pos_s.length() == 1 and target_sid.length() == 1
+
+       yield [target_lemmas.first(), target_pos_s.first(),
+              target_sid.first(),
+              shashes.map { |sense_hash| sense_hash["sense"].to_s().gsub(/\s/, "_") }
+             ]
+
+     else
+       # trouble
+
+       # group senses by SID, lemma and pos
+       lemmapos2sense = Hash.new
+       shashes.each { |sense_hash|
+         target_lemma = sense_hash["lex"].to_s().gsub(/\s/, "_")
+         target_pos = sense_hash["pos"].to_s().gsub(/\s/, "_")
+         target_sid = sense_hash["sid"].to_s().gsub(/\s/, "_")
+         target_sense = sense_hash["sense"].to_s().gsub(/\s/, "_")
+         key = [target_sid, target_lemma, target_pos]
+         unless lemmapos2sense[key]
+           lemmapos2sense[key] = Array.new()
+         end
+         lemmapos2sense[key] << target_sense
+       }
+
+       # and yield
+       lemmapos2sense.each_key { |target_sid, target_lemma, target_pos|
+         yield [target_lemma, target_pos, target_sid,
+                lemmapos2sense[[target_sid, target_lemma, target_pos]]
+               ]
+       }
+     end
+   end
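
A sketch of the grouping behavior with invented sense hashes: readings that share lemma, POS and sense ID are collapsed into a single yield that carries all their senses (here via the simple branch, since all three uniq lists have length 1):

    senses = [
      { "lex" => "bank", "pos" => "noun", "sid" => "1", "sense" => "finance" },
      { "lex" => "bank", "pos" => "noun", "sid" => "1", "sense" => "riverside" }
    ]
    each_lemma_pos_and_senses(senses) { |lemma, pos, sid, sense_list|
      # called once: lemma = "bank", pos = "noun", sid = "1",
      # sense_list = ["finance", "riverside"]
    }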
+
+   ###
+   # given a context, locate the target,
+   # which is right in the middle,
+   # and enter it into the feature hash
+   #
+   # feature type: TA
+   # entry: word#lemma#pos#ne
+   def compute_target_features(context, # array: word*lemma*pos*ne
+                               center_pos, # integer: size of context, one-sided
+                               feature_hash) # hash: feature_type -> array:feature, enter features here
+     feature_hash["TA"] = [
+       context[center_pos].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     ]
+   end
+
+   ###
+   # compute context features:
+   # for each context size in the given list of context sizes,
+   # compute a context with feature_type "CXNN" (where NN is the size of the context)
+   # and with features
+   # word#lemma#pos#ne
+   #
+   # enter each context into the feature hash
+   def compute_context_features(context, # array: word*lemma*pos*ne
+                                center_pos, # int: context is 2*cx_size_onesided + 1 long
+                                context_sizes, # array:int, produce a context of each of these sizes
+                                feature_hash) # hash: feature_type -> array:feature, enter features here
+
+     context_sizes.each { |context_size|
+       # feature type: CXNN, where NN is the size of the context
+       feature_type = "CX" + context_size.to_s()
+
+       # features: an array of strings
+       feature_hash[feature_type] = Array.new()
+
+       # pre-context
+       (center_pos - context_size).upto(center_pos - 1) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+       # post-context
+       (center_pos + 1).upto(center_pos + context_size) { |ix|
+         if context[ix]
+           # context entries may be nil at the beginning and end of the text
+           feature_hash[feature_type] << context[ix].map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+         end
+       }
+     }
+   end
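
To make the indexing concrete, a worked example with invented tokens for a window with center_pos = 2 and context_sizes = [1, 2]. Entries past the text edges are nil and are skipped; a nil ne field joins as the empty string:

    context = [nil,
               ["the", "the", "DT", nil],
               ["dog", "dog", "NN", nil],      # center_pos = 2: the target
               ["barks", "bark", "VBZ", nil],
               nil]
    fh = {}
    compute_context_features(context, 2, [1, 2], fh)
    # fh["CX1"] == ["the#the#DT#", "barks#bark#VBZ#"]
    # fh["CX2"] == ["the#the#DT#", "barks#bark#VBZ#"]  # the size-2 slots are nil, so skipped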
+
+   ###
+   # compute syntax-dependent features:
+   # children (that is, dependents) of the target word,
+   # parent,
+   # and siblings.
+   #
+   # format:
+   # feature type is CH for children, PA for parent, SI for siblings
+   #
+   # individual features are:
+   # <dependency>#<word>#<lemma>#<pos>#<ne>
+   def compute_syn_features(main_target_id, # string: ID of the terminal node that is the target
+                            sent, # SalsaTigerRegXML object
+                            feature_hash) # hash: feature_type -> array:feature, enter features here
+
+     target = sent.terminals().detect { |t| t.id() == main_target_id }
+     unless target
+       $stderr.puts "Featurization error: cannot find target with ID #{main_target_id}, skipping."
+       return
+     end
+
+     # if we're starting a new sentence,
+     # compute dependencies using delegate object for grammatical functions.
+     # also, get_children, get_parents below are methods of the delegate
+     unless sent == @current_sent
+       @current_sent = sent
+       set_sent(sent)
+     end
+     # children
+     feature_hash["CH"] = get_children(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # parent
+     feature_hash["PA"] = get_parents(target).map { |rel, node|
+       # print "\t", rel, " ", node, "\n"
+       rel.to_s() + "#" +
+         word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+     }
+
+     # siblings
+     feature_hash["SI"] = Array.new()
+     get_parents(target).each { |rel, parent|
+       parent_w, d1, d2, d3 = word_lemma_pos_ne(parent, @interpreter_class)
+
+       get_children(parent).each { |rel, node|
+         # print "\t", rel, " ", node, "\n"
+
+         if node == target
+           next
+         end
+
+         feature_hash["SI"] << parent_w + "#" +
+           rel.to_s() + "#" +
+           word_lemma_pos_ne(node, @interpreter_class).map { |e| e.to_s() }.join("#").gsub(/\s/, "_")
+       }
+     }
+   end
+ end
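
Finally, a hedged sketch of the driver-side calls. The experiment file name is invented; the option keys mirror the case statement in initialize, and FredConfigData.new taking a file name is an assumption about the rest of the gem:

    # Sketch only, not the gem's documented interface.
    exp = FredConfigData.new("fred_train.salsa")             # hypothetical experiment file
    FredFeaturize.new(exp, "--dataset" => "train").compute   # meta-features + features

    # reuse the stored meta-features, rewrite only the classifier feature files:
    FredFeaturize.new(exp, { "--dataset" => "train" }, "refeaturize" => true).compute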