frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,1061 @@
1
+ # -*- coding: utf-8 -*-
2
+ require "tempfile"
3
+ require "delegate"
4
+
5
+ require "fred/FredFeatureExtractors"
6
+
7
+ ########################################
8
+ ########################################
9
+ # Feature access classes:
10
+ # read and write features
11
# Abstract base class for feature access:
# reading and writing featurization data for one dataset.
# Concrete subclasses must override the "overwrite me" stubs below.
class AbstractFredFeatureAccess
  ####
  # exp:     experiment file object
  # dataset: "train" or "test"
  # mode:    file access mode, one of "r", "w", "a"
  def initialize(exp, dataset, mode = "r")
    @exp = exp
    @dataset = dataset
    @mode = mode

    # reject anything but the three supported access modes
    return if %w[r w a].include?(@mode)

    $stderr.puts "FeatureAccess: unknown mode #{@mode}."
    exit 1
  end

  ####
  # remove all feature files for a dataset; subclass responsibility
  def AbstractFredFeatureAccess.remove_feature_files()
    raise "overwrite me"
  end

  ####
  # write one featurized item; subclass responsibility
  #
  # lemma:    string, the target lemma
  # pos:      string, its POS
  # ids:      array:string, unique IDs of this occurrence of the lemma
  # sid:      string, sentence ID
  # sense:    string, the sense
  # features: hash feature type -> features (string -> array:string)
  def write_item(lemma, pos, ids, sid, sense, features)
    raise "overwrite me"
  end

  ####
  # finish writing; subclass responsibility
  def flush()
    raise "overwrite me"
  end
end
47
+
48
+ ########################################
49
+ # MetaFeatureAccess:
50
+ # write all featurization data to one gzipped file,
51
+ # directly writing the meta-features as they come
52
+ # format:
53
+ #
54
+ # lemma pos id sense
55
+ # <feature_type>: <features>
56
+ #
57
+ # where feature_type is a word, and features is a list of words, space-separated
58
+ class MetaFeatureAccess < AbstractFredFeatureAccess
59
+ ###
60
+ def initialize(exp, dataset, mode)
61
+ super(exp, dataset, mode)
62
+
63
+ @filename = MetaFeatureAccess.filename(@exp, @dataset)
64
+
65
+ # make filename for writing features
66
+ case @mode
67
+
68
+ when "w", "a", "r"
69
+ # read or write access
70
+ @f = FileZipped.new(@filename, mode)
71
+
72
+ else
73
+ $stderr.puts "MetaFeatureAccess error: illegal mode #{mode}"
74
+ exit 1
75
+ end
76
+ end
77
+
78
+
79
+ ####
80
+ def MetaFeatureAccess.filename(exp, dataset, mode="new")
81
+ return fred_dirname(exp, dataset, "meta_features", mode) +
82
+ "meta_features.txt.gz"
83
+ end
84
+
85
+ ####
86
+ def MetaFeatureAccess.remove_feature_files(exp, dataset)
87
+ filename = MetaFeatureAccess.filename(exp, dataset)
88
+ if File.exists?(filename)
89
+ File.delete(filename)
90
+ end
91
+ end
92
+
93
+
94
+ ###
95
+ # read items, yield one at a time
96
+ #
97
+ # format: tuple consisting of
98
+ # - target_lemma: string
99
+ # - target_pos: string
100
+ # - target_ids: array:string
101
+ # - target SID: string, sentence ID
102
+ # - target_senses: array:string
103
+ # - feature_hash: feature_type->values, string->array:string
104
+ def each_item()
105
+ unless @mode == "r"
106
+ $stderr.puts "MetaFeatureAccess error: cannot read file not opened for reading"
107
+ exit 1
108
+ end
109
+
110
+ lemma = pos = sid = ids = senses = nil
111
+
112
+ feature_hash = Hash.new()
113
+
114
+ @f.each { |line|
115
+ line.chomp!
116
+ if line =~ /^\s/
117
+ # line starts with whitespace: continues description of previous item
118
+ # that is, if we have a previous item
119
+ #
120
+ # format of line:
121
+ # feature_type: feature feature feature ...
122
+ # as in
123
+ # CH: SB#expansion#expansion#NN# OA#change#change#NN#
124
+ unless lemma
125
+ $stderr.puts "MetaFeatureAccess error: unexpected leading whitespace"
126
+ $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
127
+ $stderr.puts line
128
+ next
129
+ end
130
+
131
+ feature_type, *features = line.split()
132
+
133
+ unless feature_type =~ /^(.*):$/
134
+ # feature type should end in ":"
135
+ $stderr.puts "MetaFeatureAccess error: feature type should end in ':' but doesn't"
136
+ $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
137
+ $stderr.puts line
138
+ next
139
+ end
140
+
141
+ feature_hash[feature_type[0..-2]] = features
142
+
143
+
144
+ else
145
+ # first line of item.
146
+ #
147
+ # format:
148
+ # lemma POS IDs sid senses
149
+ #
150
+ # as in:
151
+ # cause verb 2-651966_8 2-651966 Causation
152
+
153
+ # first yield previous item
154
+ if lemma
155
+ yield [lemma, pos, ids, sid, senses, feature_hash]
156
+ end
157
+
158
+ # then start new item:
159
+ lemma, pos, ids_s, sid, senses_s = line.split()
160
+ ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
161
+ senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }
162
+
163
+ # reset feature hash
164
+ feature_hash.clear()
165
+ end
166
+ }
167
+
168
+ # one more item to yield?
169
+ if lemma
170
+ yield [lemma, pos, ids, sid, senses, feature_hash]
171
+ end
172
+ end
173
+
174
+
175
+
176
+ ###
177
+ def write_item(lemma, # string: target lemma
178
+ pos, # string: target pos
179
+ ids, # array:string: unique IDs of this occurrence of the lemma
180
+ sid, # string: sentence ID
181
+ senses, # array:string: sense
182
+ features) # features: hash feature type-> features (string-> array:string)
183
+
184
+ unless ["w", "a"].include? @mode
185
+ $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
186
+ exit 1
187
+ end
188
+
189
+ if not(lemma) or lemma.empty? or not(ids) or ids.empty?
190
+ # nothing to write
191
+ # HIER debugging
192
+ # raise "HIER no lemma or no IDs: #{lemma} #{ids}"
193
+ return
194
+ end
195
+ if pos.nil? or pos.empty?
196
+ # POS unknown
197
+ pos = ""
198
+ end
199
+ unless senses
200
+ senses = [ @exp.get("noval") ]
201
+ end
202
+
203
+ ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
204
+
205
+ senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
206
+ @f.puts "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s}"
207
+ features.each_pair { |feature_type, f_list|
208
+ @f.puts " #{feature_type}: " + f_list.map { |f| f.to_s() }.join(" ")
209
+ }
210
+ @f.flush()
211
+ end
212
+
213
+ ###
214
+ def flush()
215
+ unless ["w", "a"].include? @mode
216
+ $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
217
+ exit 1
218
+ end
219
+
220
+ # actually, nothing to be done here
221
+ end
222
+
223
+ end
224
+
225
+
226
+ ########################################
227
+ # FredFeatureWriter:
228
+ # write chosen features (according to the experiment file)
229
+ # to
230
+ # - one file per lemma for n-ary classification
231
+ # - one file per lemma/sense pair for binary classification
232
+ #
233
+ # format: CSV, last entry is target class
234
# FredFeatureAccess:
# concrete feature writer. Items are first buffered in per-lemma temp
# files (via AuxKeepWriters), then flush() turns them into the final
# per-lemma feature files plus answer key files.
class FredFeatureAccess < AbstractFredFeatureAccess
  ###
  # exp:     experiment file object
  # dataset: "train" or "test"
  # mode:    "r", "w" or "a"
  def initialize(exp, dataset, mode)
    super(exp, dataset, mode)

    # write to auxiliary files first,
    # to sort items by lemma
    @w_tmp = AuxKeepWriters.new()

    # which features has the user requested?
    feature_info_obj = FredFeatureInfo.new(@exp)
    @feature_extractors = feature_info_obj.get_extractor_objects()

  end

  ####
  # remove both the feature files and the answer key files
  # for the given experiment/dataset
  def FredFeatureAccess.remove_feature_files(exp, dataset)

    # remove feature files
    WriteFeaturesNaryOrBinary.remove_files(exp, dataset)

    # remove key files
    AnswerKeyAccess.remove_files(exp, dataset)
  end

  ###
  # name of the legend file (list of features/senses making up the
  # feature vector) for one lemma/POS string
  def FredFeatureAccess.legend_filename(lemmapos)
    return "fred.feature_legend.#{lemmapos}"
  end

  ###
  # directory where the final feature files live
  def FredFeatureAccess.feature_dir(exp, dataset)
    return WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
  end

  ###
  # each feature file:
  # iterate through feature files,
  # yield pairs [filename, values]
  # where 'values' is a hash containing keys
  # 'lemma' and potentially 'sense'
  #
  # filenames are sorted alphabetically before being yielded
  #
  # available in read and write mode
  def FredFeatureAccess.each_feature_file(exp, dataset)
    feature_dir = FredFeatureAccess.feature_dir(exp, dataset)
    Dir[feature_dir + "*"].sort().each { |filename|
      # deconstruct_fred_feature_filename returns nil for files that
      # do not look like feature files; those are skipped silently
      if (values = deconstruct_fred_feature_filename(filename))
        yield [filename, values]
      end
    }
  end

  ###
  # write item:
  # - transform meta-features into actual features as requested
  #   in the experiment file
  # - write item to tempfile, don't really write yet
  #
  # lemma:    string, target lemma
  # pos:      string, target POS
  # ids:      array:string, unique IDs of this occurrence of the lemma
  # sid:      string, sentence ID
  # senses:   array:string, senses
  # features: hash feature type -> features (string -> array:string)
  def write_item(lemma,
                 pos,
                 ids,
                 sid,
                 senses,
                 features)


    unless ["w", "a"].include? @mode
      $stderr.puts "FredFeatures error: cannot write to feature file opened for reading"
      exit 1
    end

    if lemma.nil? or lemma.empty? or ids.nil? or ids.empty?
      # nothing to write
      return
    end
    if pos.nil? or pos.empty?
      # POS unknown
      pos = ""
    end

    # (translated from German:) wrong! 'noval' is not admissible for
    # fred! (only for rosy!) - why is this here???
    unless senses
      senses = [ @exp.get("noval") ]
    end

    # modified by ines, 19.7.2010
    # senses should be empty, but they are not - why?
    # NOTE(review): this assigns a *String* to senses, not an array;
    # see the respond_to? guard below that works around it.
    if senses.length == 1 and senses[0].eql? ""
      senses = "NONE"
    end

    writer = @w_tmp.get_writer_for(fred_lemmapos_combine(lemma, pos))
    ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")

    # AB: Ines modified <senses> and it can be a String.
    # That's corrected, but I do not guarantee the correct results.
    # NOTE(review): when senses is the String "NONE", senses_s stays
    # nil and the line below writes an empty senses field — presumably
    # unintended; verify against parse_temp_itemline.
    if senses.respond_to? :map
      senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
    end
    writer.print "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s} "

    # write all features
    @feature_extractors.each { |extractor|
      extractor.each_feature(features) { |feature|
        writer.print feature, " "
      }
    }
    writer.puts
    writer.flush()
  end

  ###
  # finish writing:
  # turn the per-lemma temp files into
  # - featurization files (via WriteFeaturesNaryOrBinary)
  # - answer key files (via AnswerKeyAccess)
  # - feature legend files (train: written; test: read back)
  def flush()
    unless ["w", "a"].include? @mode
      $stderr.puts "FredFeatureAccess error: cannot write to feature file opened for reading"
      exit 1
    end

    # elements in the feature vector: get fixed with the training data,
    # get read with the test data.
    # get stored in feature_legend_dir
    case @dataset
    when "train"
      feature_legend_dir = File.new_dir(fred_classifier_directory(@exp),
                                        "legend")
    when "test"
      feature_legend_dir= File.existing_dir(fred_classifier_directory(@exp),
                                            "legend")
    end

    # now really write features
    @w_tmp.flush()
    @w_tmp.get_lemmas().sort().each { |lemmapos|

      # inform user
      $stderr.puts "Writing #{lemmapos}..."

      # prepare list of features to use in the feature vector:
      legend_filename = feature_legend_dir + FredFeatureAccess.legend_filename(lemmapos)

      case @dataset
      when "train"
        # training data:
        # determine feature list and sense list from the data,
        # and store in the relevant file
        feature_list, sense_list = collect_feature_list(lemmapos)
        begin
          f = File.new(legend_filename, "w")
        rescue
          $stderr.puts "Error: Could not write to feature legend file #{legend_filename}: " + $!
          exit 1
        end
        # legend file format: line 1 = features, line 2 = senses,
        # comma-separated with "," escaped as COMMA
        f.puts feature_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
        f.puts sense_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
        f.close()

      when "test"
        # test data:
        # read feature list and sense list from the relevant file

        begin
          f = File.new(legend_filename)
        rescue
          $stderr.puts "Error: Could not read feature legend file #{legend_filename}: " + $!
          $stderr.puts "Skipping this lemma."
          next
        end
        feature_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
        sense_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
      end

      # write
      # - featurization file
      # - answer key file

      f = @w_tmp.get_for_reading(lemmapos)
      answer_obj = AnswerKeyAccess.new(@exp, @dataset, lemmapos, "w")

      obj_out = WriteFeaturesNaryOrBinary.new(lemmapos, @exp, @dataset)

      f.each { |line|

        lemma, pos, ids, sid, senses, features = parse_temp_itemline(line)
        unless lemma
          # something went wrong in parsing the line
          next
        end
        each_sensegroup(senses, sense_list) { |senses_for_item, original_senses|
          # write answer key
          answer_obj.write_line(lemma, pos,
                                ids, sid, original_senses, senses_for_item)

          # write item: features, senses
          obj_out.write_instance(to_feature_list(features, feature_list),
                                 senses_for_item)
        } # each sensegroup
      } # each input line
      obj_out.close()
      answer_obj.close()
      @w_tmp.discard(lemmapos)
    } # each lemma


  end

  ##################
  protected

  ###
  # read temp feature file for the given lemma/pos
  # and determine the list of all features and the list of all senses,
  # each sorted alphabetically
  #
  # returns [feature_list, sense_list]; the shape of feature_list
  # depends on the 'numerical_features' experiment setting
  def collect_feature_list(lemmapos)
    # read entries for this lemma
    f = @w_tmp.get_for_reading(lemmapos)

    # keep a record of all senses and features
    # senses: binary.
    # features: keep the max. number of times a given feature occurred
    # in an instance
    all_senses = Hash.new()
    all_features = Hash.new(0)
    features_this_instance = Hash.new(0)
    # record how often each feature occurred all in all
    num_occ = Hash.new(0)
    num_lines = 0

    f.each { |line|
      lemma, pos, id_string, sid, senses, features = parse_temp_itemline(line)

      unless lemma
        # something went wrong in parsing the line
        # print out the file contents for reference, then leave
        $stderr.puts "Could not read temporary feature file #{f.path()} for #{lemmapos}."
        exit 1
      end
      num_lines += 1
      senses.each { |s| all_senses[s] = true }
      features_this_instance.clear()
      features.each { |fea|
        features_this_instance[fea] += 1
        num_occ[fea] += 1
      }

      # per feature: remember the max. per-instance count seen so far
      features_this_instance.each_pair { |feature, value|
        all_features[feature] = [ all_features[feature], features_this_instance[feature] ].max()
      }
    }

    # HIER
    # if num_lines > 2
    # num_occ.each_pair { |feature, num_occ|
    # if num_occ < 2
    # all_features.delete(feature)
    # end
    # }
    # end



    case @exp.get("numerical_features")
    when "keep"
      # leave numerical features as they are, or
      # don't do numerical features
      return [ all_features.keys().sort(),
               all_senses.keys().sort()
             ]

    when "repeat"
      # repeat: turn numerical feature with max. value N
      # into N binary features
      feature_list = Array.new()
      all_features.keys().sort().each { |feature|
        all_features[feature].times() { |index|
          feature_list << feature + " #{index}/#{all_features[feature]}"
        }
      }
      return [ feature_list,
               all_senses.keys().sort()
             ]

    when "bin"
      # make bins:
      # number of bins = (max. number of occurrences of a feature per item) / 10
      feature_list = Array.new()
      all_features.keys().sort().each { |feature|
        num_bins_this_feature = (all_features[feature].to_f() / 10.0).ceil().to_i()

        num_bins_this_feature.times { |index|
          feature_list << feature + " #{index}/#{num_bins_this_feature}"
        }
      }
      return [ feature_list,
               all_senses.keys().sort()
             ]
    else
      raise "Shouldn't be here"
    end
  end


  ###
  # given a full sorted list of items and a partial list of items,
  # match the partial list to the full list,
  # that is, produce as many items as the full list has
  # yielding 0 where the partial entry is not in the full list,
  # and > 0 otherwise
  #
  # Note that if partial contains items not in full,
  # they will not occur on the feature list returned!
  def to_feature_list(partial, full,
                      handle_numerical_features = nil)

    #print "FULL: ", full, "\n"
    #print "PART: ", partial, "\n"
    # count occurrences of each feature in the partial list
    occ_hash = Hash.new(0)
    partial.each { |p|
      occ_hash[p] += 1
    }

    # what to do with our counts?
    unless handle_numerical_features
      # no pre-set value given when this function was called
      handle_numerical_features = @exp.get("numerical_features")
    end

    case handle_numerical_features
    when "keep"
      # leave numerical features as numerical features
      return full.map { |x|
        occ_hash[x].to_s()
      }

    when "repeat"
      # repeat each numerical feature up to a max. number of occurrences;
      # 'full' entries have the shape "<feature> <index>/<max>"
      return full.map { |feature_plus_count|
        unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
          $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
          raise "Shouldn't be here."
        end

        feature = $1
        current_count = $2.to_i()
        max_num = $3.to_i()

        if occ_hash[feature] > current_count
          1
        else
          0
        end
      }

    when "bin"
      # group numerical feature values into N bins.
      # number of bins varies from feature to feature
      # each bin contains 10 different counts
      return full.map { |feature_plus_count|
        unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
          $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
          raise "Shouldn't be here."
        end

        feature = $1
        current_count = $2.to_i()
        max_num = $3.to_i()

        # NOTE(review): the comparison below (count modulo 10 against
        # 10 * bin index) looks suspect as a binning rule — verify
        # against the bin construction in collect_feature_list.
        if occ_hash[feature] % 10 > (10 * current_count)
          1
        else
          0
        end
      }
    else
      raise "Shouldn't be here"
    end
  end


  ###
  # how to treat instances with multiple senses?
  # - either write one item per sense
  # - or combine all senses into one string
  # - or keep as separate senses
  #
  # according to 'handle_multilabel' in the experiment file
  #
  # yields pairs of [senses, original_senses]
  # both are arrays of strings
  def each_sensegroup(senses, full_sense_list)
    case @exp.get("handle_multilabel")
    when "keep"
      yield [senses, senses]
    when "join"
      yield [ [fred_join_senses(senses)], senses]
    when "repeat"
      senses.each { |s|
        yield [ [s], senses]
      }
    when "binarize"
      yield [ senses, senses ]
    else
      # NOTE(review): the accepted value above is 'binarize' but this
      # message advertises 'binary' — one of the two is off.
      $stderr.puts "Error: unknown setting #{exp.get("handle_multilabel")}"
      $stderr.puts "for 'handle_multilabel' in the experiment file."
      $stderr.puts "Please choose one of 'binary', 'keep', 'join', 'repeat'"
      $stderr.puts "or leave unset -- default is 'binary'."
      exit 1
    end
  end

  ###
  # parse one line of a per-lemma temp file,
  # as written by write_item
  #
  # returns [lemma, pos, ids, sid, senses, features] or nil on error
  def parse_temp_itemline(line)
    lemma, pos, ids_s, sid, senses_s, *features = line.split()
    # fix me! senses is empty, takes context features instead
    unless senses_s
      # features may be empty, but we need senses
      $stderr.puts "FredFeatures Error in word sense item line: too short."
      $stderr.puts ">>#{line}<<"
      return nil
    end

    ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
    senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }

    return [lemma, pos, ids, sid, senses, features]
  end

end
663
+
664
+ ########################################
665
+ # read and write access to answer key files
666
+ # manages a single answer key file for a given lemma/POS pair
667
# read and write access to answer key files
# manages a single answer key file for a given lemma/POS pair
class AnswerKeyAccess
  ###
  # exp:           experiment file object
  # dataset:       "train", "test"
  # lemmapos:      lemma + POS (one string)
  # mode:          "r", "w", "a"
  # split_id:      optional: ID of a train/test split (read-only access)
  # split_dataset: part of the split to access
  def initialize(exp,
                 dataset,
                 lemmapos,
                 mode,
                 split_id = nil,
                 split_dataset = nil)
    unless ["r", "w", "a"].include? mode
      $stderr.puts "FredFeatures error: AnswerKeyAccess initialized with mode #{mode}."
      exit 1
    end

    @mode = mode

    answer_filename = fred_dirname(exp, dataset, "keys", "new") +
                      fred_answerkey_filename(lemmapos)

    # are we reading the whole answer key file, or only the test part
    # of a split of it?
    if split_id
      # we are accessing part of a split
      # we can only do that when reading!
      unless @mode == "r"
        $stderr.puts "AnswerKeyAccess error: cannot access split answer file in write mode."
        exit 1
      end

      # apply_split returns a closed temporary file
      split_obj = FredSplitPkg.new(exp)
      @f = split_obj.apply_split(answer_filename, lemmapos, split_dataset, split_id)
      if @f.nil?
        # the split_dataset part of the split doesn't contain any data
        $stderr.puts "Warning: no #{split_dataset} data for lemma #{lemmapos}"
      else
        @f.open()
      end

    else
      # we are reading the whole thing
      begin
        @f = File.new(answer_filename, @mode)
      rescue
        # file not accessible: remember that; each() will then
        # simply yield nothing, and close() is a no-op
        @f = nil
      end
    end
  end

  ###
  # write one answer key line
  #
  # lemma:            string, lemma
  # pos:              string, POS
  # ids:              array:string, target IDs
  # sid:              string, sentence ID
  # senses:           array:string, all assigned senses
  # senses_this_item: array:string, senses for this item
  def write_line(lemma,
                 pos,
                 ids,
                 sid,
                 senses,
                 senses_this_item)
    unless ["w", "a"].include? @mode
      $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot write in read mode."
      exit 1
    end
    unless @f
      raise "Shouldn't be here"
    end

    # write answer key:
    # lemma POS ID senses
    if senses.include? nil or senses.include? ""
      raise "empty sense"
    end
    if senses_this_item.include? nil or senses_this_item.include? ""
      raise "empty sense for this item"
    end

    # escape "," in senses as COMMA, ":" in IDs as COLON
    senses_s = senses.map { |s| s.gsub(/,/, "COMMA")}.join(",")
    senses_ti_s = senses_this_item.map { |s|
      s.gsub(/,/, "COMMA")}.join(",")
    id_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")

    @f.puts "#{lemma} #{pos} #{id_s} #{sid} #{senses_s} #{senses_ti_s}"
  end

  ###
  # yield one line at a time:
  # tuple (lemma, POS, ids, sentence_ID, all_assigned_senses, transformed_senses_for_this_item)
  def each()
    unless @mode == "r"
      # fixed typo: was "AnsewrKeyAccess"
      $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot read in write mode"
    end
    unless @f
      # something went wrong during initialization:
      # split didn't contain data
      return
    end

    @f.each { |line|

      lemma, pos, id_s, sid, senses_s, senses_this_item_s = line.split()
      ids = id_s.split("::").map { |i| i.gsub(/COLON/, ":") }
      senses = senses_s.split(",").map { |s| s.gsub(/COMMA/, ",") }

      senses_this_item = senses_this_item_s.split(",").map { |s|
        s.gsub(/COMMA/, ",") }

      yield [lemma, pos, ids, sid, senses, senses_this_item]
    }
  end

  ###
  # close the underlying file.
  # Guarded: @f may legitimately be nil (open failure, empty split),
  # in which case there is nothing to close.
  def close()
    @f.close() if @f
  end

  ###
  # remove all answer key files for the given experiment/dataset
  def AnswerKeyAccess.remove_files(exp, dataset)
    Dir[fred_dirname(exp, dataset, "keys", "new") + fred_answerkey_filename("*")].each { |filename|
      # File.exist? rather than the deprecated File.exists?
      if File.exist?(filename)
        File.delete(filename)
      end
    }
  end
end
787
+
788
+
789
+ ####################3
790
+ # keep writers: auxiliary class for FredFeatureAccess:
791
+ # write to several files at a time
792
+ # in tempfiles
793
# keep writers: auxiliary class for FredFeatureAccess:
# write to several files at a time, in tempfiles.
# At most @size files are kept open simultaneously (LRU-ish eviction);
# evicted files are transparently reopened in append position.
class AuxKeepWriters
  def initialize()
    @lemma2temp = Hash.new()  # lemmapos -> Tempfile
    @size = 50                # max. number of simultaneously open writers
    @writers = Array.new()    # list of [lemmapos, open Tempfile] pairs
  end


  ##
  # close all currently open writers
  def flush()
    @writers.each { |lemmapos, writer|
      writer.close()
    }
    # forget the closed writers: otherwise get_writer_for() would
    # later hand out a closed file via @writers.assoc
    @writers.clear()
  end

  ##
  # all lemma/POS strings we have temp files for
  def get_lemmas()
    return @lemma2temp.keys()
  end

  ##
  # reopen the temp file for this lemma/pos for reading (from the start);
  # returns nil if we have no file for it
  def get_for_reading(lemmapos)
    if @lemma2temp[lemmapos]
      # we have a writer for this

      @lemma2temp[lemmapos].close()
      @lemma2temp[lemmapos].open()
      return @lemma2temp[lemmapos]

    else
      # no writer for this
      return nil
    end
  end

  ##
  # finally close temp file, remove information for lemma/pos
  def discard(lemmapos)
    if @lemma2temp[lemmapos]
      @lemma2temp[lemmapos].close(true)
      @lemma2temp.delete(lemmapos)
      # also purge any stale entry from the open-writers list,
      # so the unlinked file can never be handed out again
      @writers.delete_if { |lp, _writer| lp == lemmapos }
    end
  end

  ##
  # get an open writer (positioned at end of file) for this lemma/pos,
  # creating a temp file on first request
  def get_writer_for(lemmapos)

    # is there a temp file for this lemma/pos combination?
    unless @lemma2temp[lemmapos]
      @lemma2temp[lemmapos] = Tempfile.new("fred_features")
      @lemma2temp[lemmapos].close()
    end

    # is there an open temp file for this lemma/pos combination?
    pair = @writers.assoc(lemmapos)
    if pair
      return pair.last()
    end

    # no: open the temp file, kick some other temp file out of the
    # @writers list
    writer = @lemma2temp[lemmapos]
    writer.open()

    # writer: open for appending
    writer.seek(0, IO::SEEK_END)

    @writers << [lemmapos, writer]
    if @writers.length() > @size
      # close file associated with first (oldest) writer
      @writers.first.last.close()
      @writers.shift()
    end
    return writer
  end

  ###
  # delete all temp files
  def remove_files()
    @lemma2temp.each_value { |x|
      x.close(true)
    }
  end
end
878
+
879
+ ##############
880
+ # write features,
881
+ # either lemma-wise
882
+ # or lemma+sense-wise
883
+ # if lemma+sense-wise, write as binary classifier,
884
+ # i.e. map the target senses
885
+ #
886
+ # Use Delegator.
887
+
888
###
# Features for N-ary classifiers:
# one feature file per lemma; the (possibly multiple) gold senses
# go into the last column(s) of each comma-separated line.
class WriteFeaturesNary
  def initialize(lemma, exp, dataset, feature_dir)
    @filename = feature_dir + fred_feature_filename(lemma)
    @f = File.new(@filename, "w")
    # "keep": an instance may carry several senses at once
    @handle_multilabel = exp.get("handle_multilabel")
  end

  # Write one comma-separated line: escaped features, then sense(s).
  def write_instance(features, senses)
    escape = lambda { |item|
      item.to_s.gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
    }

    @f.print(features.map(&escape).join(","))

    if @handle_multilabel == "keep"
      # possibly more than one sense:
      # separate features from senses by semicolon,
      # and hope that the classifier knows this
      @f.print ";"
      @f.puts(senses.map(&escape).join(","))
    else
      # one sense: just separate by comma
      @f.print ","
      @f.puts(escape.call(senses.first))
    end
  end

  def close
    @f.close
  end
end
927
+
928
###
# Features for binary classifiers:
# one feature file per sense; each instance is written to every
# sense file, labeled either with that file's sense (positive)
# or with @negsense (negative).
class WriteFeaturesBinary
  def initialize(lemma, exp, dataset, feature_dir)
    @dir = feature_dir
    @lemma = lemma
    @feature_dir = feature_dir

    # label for the negative class; "NONE" when not configured
    @negsense = exp.get("negsense") || "NONE"

    # files: sense -> open file handle
    @files = {}

    # keep all instances such that, when a new sense comes around,
    # we can write them for that sense
    @instances = []
  end

  def write_instance(features, senses)
    # sense we haven't seen before? Then we need to
    # write the whole featurization file for that new sense
    check_for_presence_of_senses(senses)

    # write this new instance to every sense file
    @files.each_key do |sense_of_file|
      write_to_sensefile(features, senses, sense_of_file)
    end

    # store instance in case another sense crops up later
    @instances.push([features, senses])
  end

  ###
  def close
    @files.each_value(&:close)
  end

  ######
  private

  # Open a sense file for each not-yet-seen sense and replay
  # all previously seen instances into it.
  def check_for_presence_of_senses(senses)
    senses.each do |sense|
      next if @files[sense]

      # open new file for this sense
      @files[sense] = File.new(@feature_dir + fred_feature_filename(@lemma, sense, true), "w")

      # and re-write all previous instances for it
      @instances.each do |prev_features, prev_senses|
        write_to_sensefile(prev_features, prev_senses, sense)
      end
    end
  end

  ###
  # Write one instance to one sense file, binarizing the label.
  def write_to_sensefile(features, senses, sense_of_file)
    handle = @files[sense_of_file]

    # print comma-escaped features
    handle.print(features.map { |feat| feat.to_s.gsub(/,/, "COMMA") }.join(","))
    handle.print ","

    # binarize target class
    if senses.include?(sense_of_file)
      handle.puts sense_of_file.to_s
    else
      handle.puts @negsense
    end
  end
end
1018
+
1019
########
# class writing features:
# delegating to either a binary or an n-ary writer,
# depending on the "binary_classifiers" experiment setting
class WriteFeaturesNaryOrBinary < SimpleDelegator
  ###
  # lemma: target lemma; exp: experiment configuration object;
  # dataset: dataset identifier (e.g. train/test)
  def initialize(lemma, exp, dataset)
    feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")

    if exp.get("binary_classifiers")
      # binary classifiers:
      # delegate writing to the binary feature writer
      @writer = WriteFeaturesBinary.new(lemma, exp, dataset, feature_dir)
    else
      # n-ary classifiers:
      # delegate writing to the n-ary feature writer
      @writer = WriteFeaturesNary.new(lemma, exp, dataset, feature_dir)
    end
    super(@writer)
  end

  ###
  # Directory holding the feature files for this experiment/dataset.
  def WriteFeaturesNaryOrBinary.feature_dir(exp, dataset,
                                            mode = "existing")
    return fred_dirname(exp, dataset, "features", mode)
  end

  ###
  # Remove all previously written feature files.
  def WriteFeaturesNaryOrBinary.remove_files(exp, dataset)
    feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")

    Dir[feature_dir + fred_feature_filename("*")].each { |filename|
      # File.exists? was deprecated and removed in Ruby 3.2; use File.exist?
      if File.exist? filename
        File.delete(filename)
      end
    }
  end
end