frprep 0.0.1.prealpha

Files changed (138)
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,1061 @@
+ # -*- coding: utf-8 -*-
+ require "tempfile"
+ require "delegate"
+
+ require "fred/FredFeatureExtractors"
+
+ ########################################
+ ########################################
+ # Feature access classes:
+ # read and write features
+ class AbstractFredFeatureAccess
+   ####
+   def initialize(exp,         # experiment file object
+                  dataset,     # dataset: "train" or "test"
+                  mode = "r")  # mode: r, w, a
+     @exp = exp
+     @dataset = dataset
+     @mode = mode
+
+     unless ["r", "w", "a"].include? @mode
+       $stderr.puts "FeatureAccess: unknown mode #{@mode}."
+       exit 1
+     end
+
+   end
+
+   ####
+   def AbstractFredFeatureAccess.remove_feature_files()
+     raise "overwrite me"
+   end
+
+   ####
+   def write_item(lemma,    # string: lemma
+                  pos,      # string: POS
+                  ids,      # array:string: unique IDs of this occurrence of the lemma
+                  sid,      # string: sentence ID
+                  sense,    # string: sense
+                  features) # features: hash feature type-> features (string-> array:string)
+     raise "overwrite me"
+   end
+
+
+   def flush()
+     raise "overwrite me"
+   end
+ end
+
+ ########################################
+ # MetaFeatureAccess:
+ # write all featurization data to one gzipped file,
+ # directly writing the meta-features as they come
+ # format:
+ #
+ # lemma pos id sense
+ # <feature_type>: <features>
+ #
+ # where feature_type is a word, and features is a list of words, space-separated
+ class MetaFeatureAccess < AbstractFredFeatureAccess
+   ###
+   def initialize(exp, dataset, mode)
+     super(exp, dataset, mode)
+
+     @filename = MetaFeatureAccess.filename(@exp, @dataset)
+
+     # make filename for writing features
+     case @mode
+
+     when "w", "a", "r"
+       # read or write access
+       @f = FileZipped.new(@filename, mode)
+
+     else
+       $stderr.puts "MetaFeatureAccess error: illegal mode #{mode}"
+       exit 1
+     end
+   end
+
+
+   ####
+   def MetaFeatureAccess.filename(exp, dataset, mode="new")
+     return fred_dirname(exp, dataset, "meta_features", mode) +
+       "meta_features.txt.gz"
+   end
+
+   ####
+   def MetaFeatureAccess.remove_feature_files(exp, dataset)
+     filename = MetaFeatureAccess.filename(exp, dataset)
+     if File.exists?(filename)
+       File.delete(filename)
+     end
+   end
+
+
+   ###
+   # read items, yield one at a time
+   #
+   # format: tuple consisting of
+   # - target_lemma: string
+   # - target_pos: string
+   # - target_ids: array:string
+   # - target SID: string, sentence ID
+   # - target_senses: array:string
+   # - feature_hash: feature_type->values, string->array:string
+   def each_item()
+     unless @mode == "r"
+       $stderr.puts "MetaFeatureAccess error: cannot read file not opened for reading"
+       exit 1
+     end
+
+     lemma = pos = sid = ids = senses = nil
+
+     feature_hash = Hash.new()
+
+     @f.each { |line|
+       line.chomp!
+       if line =~ /^\s/
+         # line starts with whitespace: continues description of previous item
+         # that is, if we have a previous item
+         #
+         # format of line:
+         # feature_type: feature feature feature ...
+         # as in
+         # CH: SB#expansion#expansion#NN# OA#change#change#NN#
+         unless lemma
+           $stderr.puts "MetaFeatureAccess error: unexpected leading whitespace"
+           $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
+           $stderr.puts line
+           next
+         end
+
+         feature_type, *features = line.split()
+
+         unless feature_type =~ /^(.*):$/
+           # feature type should end in ":"
+           $stderr.puts "MetaFeatureAccess error: feature type should end in ':' but doesn't"
+           $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
+           $stderr.puts line
+           next
+         end
+
+         feature_hash[feature_type[0..-2]] = features
+
+
+       else
+         # first line of item.
+         #
+         # format:
+         # lemma POS IDs sid senses
+         #
+         # as in:
+         # cause verb 2-651966_8 2-651966 Causation
+
+         # first yield previous item
+         if lemma
+           yield [lemma, pos, ids, sid, senses, feature_hash]
+         end
+
+         # then start new item:
+         lemma, pos, ids_s, sid, senses_s = line.split()
+         ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
+         senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }
+
+         # reset feature hash
+         feature_hash.clear()
+       end
+     }
+
+     # one more item to yield?
+     if lemma
+       yield [lemma, pos, ids, sid, senses, feature_hash]
+     end
+   end
+
+
+
+   ###
+   def write_item(lemma,    # string: target lemma
+                  pos,      # string: target pos
+                  ids,      # array:string: unique IDs of this occurrence of the lemma
+                  sid,      # string: sentence ID
+                  senses,   # array:string: sense
+                  features) # features: hash feature type-> features (string-> array:string)
+
+     unless ["w", "a"].include? @mode
+       $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
+       exit 1
+     end
+
+     if not(lemma) or lemma.empty? or not(ids) or ids.empty?
+       # nothing to write
+       # HIER debugging
+       # raise "HIER no lemma or no IDs: #{lemma} #{ids}"
+       return
+     end
+     if pos.nil? or pos.empty?
+       # POS unknown
+       pos = ""
+     end
+     unless senses
+       senses = [ @exp.get("noval") ]
+     end
+
+     ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
+
+     senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
+     @f.puts "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s}"
+     features.each_pair { |feature_type, f_list|
+       @f.puts " #{feature_type}: " + f_list.map { |f| f.to_s() }.join(" ")
+     }
+     @f.flush()
+   end
+
+   ###
+   def flush()
+     unless ["w", "a"].include? @mode
+       $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
+       exit 1
+     end
+
+     # actually, nothing to be done here
+   end
+
+ end
+
+
+ ########################################
+ # FredFeatureAccess:
+ # write chosen features (according to the experiment file)
+ # to
+ # - one file per lemma for n-ary classification
+ # - one file per lemma/sense pair for binary classification
+ #
+ # format: CSV, last entry is target class
+ class FredFeatureAccess < AbstractFredFeatureAccess
+   ###
+   def initialize(exp, dataset, mode)
+     super(exp, dataset, mode)
+
+     # write to auxiliary files first,
+     # to sort items by lemma
+     @w_tmp = AuxKeepWriters.new()
+
+     # which features has the user requested?
+     feature_info_obj = FredFeatureInfo.new(@exp)
+     @feature_extractors = feature_info_obj.get_extractor_objects()
+
+   end
+
+   ####
+   def FredFeatureAccess.remove_feature_files(exp, dataset)
+
+     # remove feature files
+     WriteFeaturesNaryOrBinary.remove_files(exp, dataset)
+
+     # remove key files
+     AnswerKeyAccess.remove_files(exp, dataset)
+   end
+
+   ###
+   def FredFeatureAccess.legend_filename(lemmapos)
+     return "fred.feature_legend.#{lemmapos}"
+   end
+
+   ###
+   def FredFeatureAccess.feature_dir(exp, dataset)
+     return WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
+   end
+
+   ###
+   # each feature file:
+   # iterate through feature files,
+   # yield pairs [filename, values]
+   # where 'values' is a hash containing keys
+   # 'lemma' and potentially 'sense'
+   #
+   # filenames are sorted alphabetically before being yielded
+   #
+   # available in read and write mode
+   def FredFeatureAccess.each_feature_file(exp, dataset)
+     feature_dir = FredFeatureAccess.feature_dir(exp, dataset)
+     Dir[feature_dir + "*"].sort().each { |filename|
+       if (values = deconstruct_fred_feature_filename(filename))
+         yield [filename, values]
+       end
+     }
+   end
+
+   ###
+   # write item:
+   # - transform meta-features into actual features as requested
+   #   in the experiment file
+   # - write item to tempfile, don't really write yet
+   def write_item(lemma,    # string: target lemma
+                  pos,      # string: target pos
+                  ids,      # array:string: unique IDs of this occurrence of the lemma
+                  sid,      # string: sentence ID
+                  senses,   # array:string: sense
+                  features) # features: hash feature type-> features (string-> array:string)
+
+
+     unless ["w", "a"].include? @mode
+       $stderr.puts "FredFeatures error: cannot write to feature file opened for reading"
+       exit 1
+     end
+
+     if lemma.nil? or lemma.empty? or ids.nil? or ids.empty?
+       # nothing to write
+       return
+     end
+     if pos.nil? or pos.empty?
+       # POS unknown
+       pos = ""
+     end
+
+     # wrong! "noval" is not allowed for fred (only for rosy!) - why is this here???
+     unless senses
+       senses = [ @exp.get("noval") ]
+     end
+
+     # modified by ines, 19.7.2010
+     # senses should be empty, but they are not - why?
+     if senses.length == 1 and senses[0].eql? ""
+       senses = "NONE"
+     end
+
+     writer = @w_tmp.get_writer_for(fred_lemmapos_combine(lemma, pos))
+     ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
+
+     # AB: Ines modified <senses> and it can be a String.
+     # That's corrected, but I do not guarantee the correct results.
+     if senses.respond_to? :map
+       senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
+     end
+     writer.print "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s} "
+
+     # write all features
+     @feature_extractors.each { |extractor|
+       extractor.each_feature(features) { |feature|
+         writer.print feature, " "
+       }
+     }
+     writer.puts
+     writer.flush()
+   end
+
+   ###
+   def flush()
+     unless ["w", "a"].include? @mode
+       $stderr.puts "FredFeatureAccess error: cannot write to feature file opened for reading"
+       exit 1
+     end
+
+     # elements in the feature vector: get fixed with the training data,
+     # get read with the test data.
+     # get stored in feature_legend_dir
+     case @dataset
+     when "train"
+       feature_legend_dir = File.new_dir(fred_classifier_directory(@exp),
+                                         "legend")
+     when "test"
+       feature_legend_dir = File.existing_dir(fred_classifier_directory(@exp),
+                                              "legend")
+     end
+
+     # now really write features
+     @w_tmp.flush()
+     @w_tmp.get_lemmas().sort().each { |lemmapos|
+
+       # inform user
+       $stderr.puts "Writing #{lemmapos}..."
+
+       # prepare list of features to use in the feature vector:
+       legend_filename = feature_legend_dir + FredFeatureAccess.legend_filename(lemmapos)
+
+       case @dataset
+       when "train"
+         # training data:
+         # determine feature list and sense list from the data,
+         # and store in the relevant file
+         feature_list, sense_list = collect_feature_list(lemmapos)
+         begin
+           f = File.new(legend_filename, "w")
+         rescue
+           $stderr.puts "Error: Could not write to feature legend file #{legend_filename}: " + $!
+           exit 1
+         end
+         f.puts feature_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
+         f.puts sense_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
+         f.close()
+
+       when "test"
+         # test data:
+         # read feature list and sense list from the relevant file
+
+         begin
+           f = File.new(legend_filename)
+         rescue
+           $stderr.puts "Error: Could not read feature legend file #{legend_filename}: " + $!
+           $stderr.puts "Skipping this lemma."
+           next
+         end
+         feature_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
+         sense_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
+       end
+
+       # write
+       # - featurization file
+       # - answer key file
+
+       f = @w_tmp.get_for_reading(lemmapos)
+       answer_obj = AnswerKeyAccess.new(@exp, @dataset, lemmapos, "w")
+
+       obj_out = WriteFeaturesNaryOrBinary.new(lemmapos, @exp, @dataset)
+
+       f.each { |line|
+
+         lemma, pos, ids, sid, senses, features = parse_temp_itemline(line)
+         unless lemma
+           # something went wrong in parsing the line
+           next
+         end
+         each_sensegroup(senses, sense_list) { |senses_for_item, original_senses|
+           # write answer key
+           answer_obj.write_line(lemma, pos,
+                                 ids, sid, original_senses, senses_for_item)
+
+           # write item: features, senses
+           obj_out.write_instance(to_feature_list(features, feature_list),
+                                  senses_for_item)
+         } # each sensegroup
+       } # each input line
+       obj_out.close()
+       answer_obj.close()
+       @w_tmp.discard(lemmapos)
+     } # each lemma
+
+
+   end
+
+   ##################
+   protected
+
+   ###
+   # read temp feature file for the given lemma/pos
+   # and determine the list of all features and the list of all senses,
+   # each sorted alphabetically
+   def collect_feature_list(lemmapos)
+     # read entries for this lemma
+     f = @w_tmp.get_for_reading(lemmapos)
+
+     # keep a record of all senses and features
+     # senses: binary.
+     # features: keep the max. number of times a given feature occurred
+     # in an instance
+     all_senses = Hash.new()
+     all_features = Hash.new(0)
+     features_this_instance = Hash.new(0)
+     # record how often each feature occurred all in all
+     num_occ = Hash.new(0)
+     num_lines = 0
+
+     f.each { |line|
+       lemma, pos, id_string, sid, senses, features = parse_temp_itemline(line)
+
+       unless lemma
+         # something went wrong in parsing the line
+         # print out the file contents for reference, then leave
+         $stderr.puts "Could not read temporary feature file #{f.path()} for #{lemmapos}."
+         exit 1
+       end
+       num_lines += 1
+       senses.each { |s| all_senses[s] = true }
+       features_this_instance.clear()
+       features.each { |fea|
+         features_this_instance[fea] += 1
+         num_occ[fea] += 1
+       }
+
+       features_this_instance.each_pair { |feature, value|
+         all_features[feature] = [ all_features[feature], features_this_instance[feature] ].max()
+       }
+     }
+
+     # HIER
+     # if num_lines > 2
+     #   num_occ.each_pair { |feature, num_occ|
+     #     if num_occ < 2
+     #       all_features.delete(feature)
+     #     end
+     #   }
+     # end
+
+
+
+     case @exp.get("numerical_features")
+     when "keep"
+       # leave numerical features as they are, or
+       # don't do numerical features
+       return [ all_features.keys().sort(),
+                all_senses.keys().sort()
+              ]
+
+     when "repeat"
+       # repeat: turn numerical feature with max. value N
+       # into N binary features
+       feature_list = Array.new()
+       all_features.keys().sort().each { |feature|
+         all_features[feature].times() { |index|
+           feature_list << feature + " #{index}/#{all_features[feature]}"
+         }
+       }
+       return [ feature_list,
+                all_senses.keys().sort()
+              ]
+
+     when "bin"
+       # make bins:
+       # number of bins = (max. number of occurrences of a feature per item) / 10
+       feature_list = Array.new()
+       all_features.keys().sort().each { |feature|
+         num_bins_this_feature = (all_features[feature].to_f() / 10.0).ceil().to_i()
+
+         num_bins_this_feature.times { |index|
+           feature_list << feature + " #{index}/#{num_bins_this_feature}"
+         }
+       }
+       return [ feature_list,
+                all_senses.keys().sort()
+              ]
+     else
+       raise "Shouldn't be here"
+     end
+   end
+
+
+   ###
+   # given a full sorted list of items and a partial list of items,
+   # match the partial list to the full list,
+   # that is, produce as many items as the full list has
+   # yielding 0 where the partial entry is not in the full list,
+   # and > 0 otherwise
+   #
+   # Note that if partial contains items not in full,
+   # they will not occur on the feature list returned!
+   def to_feature_list(partial, full,
+                       handle_numerical_features = nil)
+
+     # print "FULL: ", full, "\n"
+     # print "PART: ", partial, "\n"
+     # count occurrences of each feature in the partial list
+     occ_hash = Hash.new(0)
+     partial.each { |p|
+       occ_hash[p] += 1
+     }
+
+     # what to do with our counts?
+     unless handle_numerical_features
+       # no pre-set value given when this function was called
+       handle_numerical_features = @exp.get("numerical_features")
+     end
+
+     case handle_numerical_features
+     when "keep"
+       # leave numerical features as numerical features
+       return full.map { |x|
+         occ_hash[x].to_s()
+       }
+
+     when "repeat"
+       # repeat each numerical feature up to a max. number of occurrences
+       return full.map { |feature_plus_count|
+         unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
+           $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
+           raise "Shouldn't be here."
+         end
+
+         feature = $1
+         current_count = $2.to_i()
+         max_num = $3.to_i()
+
+         if occ_hash[feature] > current_count
+           1
+         else
+           0
+         end
+       }
+
+     when "bin"
+       # group numerical feature values into N bins.
+       # number of bins varies from feature to feature
+       # each bin contains 10 different counts
+       return full.map { |feature_plus_count|
+         unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
+           $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
+           raise "Shouldn't be here."
+         end
+
+         feature = $1
+         current_count = $2.to_i()
+         max_num = $3.to_i()
+
+         if occ_hash[feature] % 10 > (10 * current_count)
+           1
+         else
+           0
+         end
+       }
+     else
+       raise "Shouldn't be here"
+     end
+   end
+
+
+   ###
+   # how to treat instances with multiple senses?
+   # - either write one item per sense
+   # - or combine all senses into one string
+   # - or keep as separate senses
+   #
+   # according to 'handle_multilabel' in the experiment file
+   #
+   # yields pairs of [senses, original_senses]
+   # both are arrays of strings
+   def each_sensegroup(senses, full_sense_list)
+     case @exp.get("handle_multilabel")
+     when "keep"
+       yield [senses, senses]
+     when "join"
+       yield [ [fred_join_senses(senses)], senses]
+     when "repeat"
+       senses.each { |s|
+         yield [ [s], senses]
+       }
+     when "binarize"
+       yield [ senses, senses ]
+     else
+       $stderr.puts "Error: unknown setting #{@exp.get("handle_multilabel")}"
+       $stderr.puts "for 'handle_multilabel' in the experiment file."
+       $stderr.puts "Please choose one of 'binary', 'keep', 'join', 'repeat'"
+       $stderr.puts "or leave unset -- default is 'binary'."
+       exit 1
+     end
+   end
+
+   ###
+   def parse_temp_itemline(line)
+     lemma, pos, ids_s, sid, senses_s, *features = line.split()
+     # fix me! senses is empty, takes context features instead
+     unless senses_s
+       # features may be empty, but we need senses
+       $stderr.puts "FredFeatures Error in word sense item line: too short."
+       $stderr.puts ">>#{line}<<"
+       return nil
+     end
+
+     ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
+     senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }
+
+     return [lemma, pos, ids, sid, senses, features]
+   end
+
+ end
+
+ ########################################
+ # read and write access to answer key files
+ # manages a single answer key file for a given lemma/POS pair
+ class AnswerKeyAccess
+   ###
+   def initialize(exp,      # experiment file object
+                  dataset,  # "train", "test"
+                  lemmapos, # lemma + POS (one string)
+                  mode,     # "r", "w", "a"
+                  split_id = nil,
+                  split_dataset = nil)
+     unless ["r", "w", "a"].include? mode
+       $stderr.puts "FredFeatures error: AnswerKeyAccess initialized with mode #{mode}."
+       exit 1
+     end
+
+     @mode = mode
+
+     answer_filename = fred_dirname(exp, dataset, "keys", "new") +
+       fred_answerkey_filename(lemmapos)
+
+     # are we reading the whole answer key file, or only the test part
+     # of a split of it?
+     if split_id
+       # we are accessing part of a split
+       # we can only do that when reading!
+       unless @mode == "r"
+         $stderr.puts "AnswerKeyAccess error: cannot access split answer file in write mode."
+         exit 1
+       end
+
+       # apply_split returns a closed temporary file
+       split_obj = FredSplitPkg.new(exp)
+       @f = split_obj.apply_split(answer_filename, lemmapos, split_dataset, split_id)
+       if @f.nil?
+         # the split_dataset part of the split doesn't contain any data
+         $stderr.puts "Warning: no #{split_dataset} data for lemma #{lemmapos}"
+       else
+         @f.open()
+       end
+
+     else
+       # we are reading the whole thing
+       begin
+         @f = File.new(answer_filename, @mode)
+       rescue
+         @f = nil
+       end
+     end
+   end
+
+   ###
+   def write_line(lemma,            # string: lemma
+                  pos,              # string: POS
+                  ids,              # array:string: target IDs
+                  sid,              # string: sentence ID
+                  senses,           # array:string: senses
+                  senses_this_item) # array:string: senses for this item
+     unless ["w", "a"].include? @mode
+       $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot write in read mode."
+       exit 1
+     end
+     unless @f
+       raise "Shouldn't be here"
+     end
+
+     # write answer key:
+     # lemma POS ID senses
+     if senses.include? nil or senses.include? ""
+       raise "empty sense"
+     end
+     if senses_this_item.include? nil or senses_this_item.include? ""
+       raise "empty sense for this item"
+     end
+
+     senses_s = senses.map { |s| s.gsub(/,/, "COMMA") }.join(",")
+     senses_ti_s = senses_this_item.map { |s|
+       s.gsub(/,/, "COMMA") }.join(",")
+     id_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
+
+     @f.puts "#{lemma} #{pos} #{id_s} #{sid} #{senses_s} #{senses_ti_s}"
+   end
+
+   ###
+   # yield one line at a time:
+   # tuple (lemma, POS, ids, sentence_ID, all_assigned_senses, transformed_senses_for_this_item)
+   def each()
+     unless @mode == "r"
+       $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot read in write mode"
+     end
+     unless @f
+       # something went wrong during initialization:
+       # split didn't contain data
+       return
+     end
+
+     @f.each { |line|
+
+       lemma, pos, id_s, sid, senses_s, senses_this_item_s = line.split()
+       ids = id_s.split("::").map { |i| i.gsub(/COLON/, ":") }
+       senses = senses_s.split(",").map { |s| s.gsub(/COMMA/, ",") }
+
+       senses_this_item = senses_this_item_s.split(",").map { |s|
+         s.gsub(/COMMA/, ",") }
+
+       yield [lemma, pos, ids, sid, senses, senses_this_item]
+     }
+   end
+
+   ###
+   def close()
+     @f.close()
+   end
+
+   ###
+   def AnswerKeyAccess.remove_files(exp, dataset)
+     Dir[fred_dirname(exp, dataset, "keys", "new") + fred_answerkey_filename("*")].each { |filename|
+       if File.exists?(filename)
+         File.delete(filename)
+       end
+     }
+   end
+ end
+
+
+ ####################
+ # keep writers: auxiliary class for FredFeatureAccess:
+ # write to several files at a time
+ # in tempfiles
+ class AuxKeepWriters
+   def initialize()
+     @lemma2temp = Hash.new()
+     @size = 50
+     @writers = Array.new()
+   end
+
+
+   ##
+   def flush()
+     @writers.each { |lemmapos, writer|
+       writer.close()
+     }
+   end
+
+   ##
+   def get_lemmas()
+     return @lemma2temp.keys()
+   end
+
+   ##
+   def get_for_reading(lemmapos)
+     if @lemma2temp[lemmapos]
+       # we have a writer for this
+
+       @lemma2temp[lemmapos].close()
+       @lemma2temp[lemmapos].open()
+       return @lemma2temp[lemmapos]
+
+     else
+       # no writer for this
+       return nil
+     end
+   end
+
+   ##
+   # finally close temp file, remove information for lemma/pos
+   def discard(lemmapos)
+     if @lemma2temp[lemmapos]
+       @lemma2temp[lemmapos].close(true)
+       @lemma2temp.delete(lemmapos)
+     end
+   end
+
+   ##
+   def get_writer_for(lemmapos)
+
+     # is there a temp file for this lemma/pos combination?
+     unless @lemma2temp[lemmapos]
+       @lemma2temp[lemmapos] = Tempfile.new("fred_features")
+       @lemma2temp[lemmapos].close()
+     end
+
+     # is there an open temp file for this lemma/pos combination?
+     pair = @writers.assoc(lemmapos)
+     if pair
+       return pair.last()
+     end
+
+     # no: open the temp file, kick some other temp file out of the
+     # @writers list
+     writer = @lemma2temp[lemmapos]
+     writer.open()
+
+
+     # writer: open for appending
+     writer.seek(0, IO::SEEK_END)
+
+
+     @writers << [lemmapos, writer]
+     if @writers.length() > @size
+       # close file associated with first writer
+       @writers.first.last.close()
+       @writers.shift()
+     end
+     return writer
+   end
+
+   ###
+   def remove_files()
+     @lemma2temp.each_value { |x|
+       x.close(true)
+     }
+   end
+ end
+
+ ##############
+ # write features,
+ # either lemma-wise
+ # or lemma+sense-wise
+ # if lemma+sense-wise, write as binary classifier,
+ # i.e. map the target senses
+ #
+ # Use Delegator.
+
+ ###
+ # Features for N-ary classifiers
+ class WriteFeaturesNary
+   def initialize(lemma,
+                  exp,
+                  dataset,
+                  feature_dir)
+
+     @filename = feature_dir + fred_feature_filename(lemma)
+     @f = File.new(@filename, "w")
+     @handle_multilabel = exp.get("handle_multilabel")
+   end
+
+   def write_instance(features, senses)
+
+     @f.print features.map { |x|
+       x.to_s().gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
+     }.join(",")
+
+     # possibly more than one sense? then use semicolon to separate
+     if @handle_multilabel == "keep"
+       # possibly more than one sense:
+       # separate by semicolon,
+       # and hope that the classifier knows this
+       @f.print ";"
+       @f.puts senses.map { |x|
+         x.to_s().gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
+       }.join(",")
+     else
+       # one sense: just separate by comma
+       @f.print ","
+       @f.puts senses.first().to_s().gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
+     end
+   end
+
+   def close()
+     @f.close()
+   end
+ end
+
+ ###
+ # Features for binary classifiers
+ class WriteFeaturesBinary
+   def initialize(lemma,
+                  exp,
+                  dataset,
+                  feature_dir)
+     @dir = feature_dir
+     @lemma = lemma
+     @feature_dir = feature_dir
+
+     @negsense = exp.get("negsense")
+     unless @negsense
+       @negsense = "NONE"
+     end
+
+     # files: sense-> filename
+     @files = Hash.new()
+
+     # keep all instances such that, when a new sense comes around,
+     # we can write them for that sense
+     @instances = Array.new()
+   end
+
+
+   def write_instance(features, senses)
+     # sense we haven't seen before? Then we need to
+     # write the whole featurization file for that new sense
+     check_for_presence_of_senses(senses)
+
+     # write this new instance for all senses
+     @files.each_key { |sense_of_file|
+       write_to_sensefile(features, senses, sense_of_file)
+     }
+
+     # store instance in case another sense crops up later
+     @instances << [features, senses]
+   end
+
+
+   ###
+   def close()
+     @files.each_value { |f| f.close() }
+   end
+
+   ######
+   private
+
+   def check_for_presence_of_senses(senses)
+     senses.each { |sense|
+       # do we have a sense file for this sense?
+       unless @files[sense]
+         # open new file for this sense
+         @files[sense] = File.new(@feature_dir + fred_feature_filename(@lemma, sense, true), "w")
+         # filename = @feature_dir + fred_feature_filename(@lemma, sense, true)
+         # $stderr.puts "Starting new feature file #{filename}"
+
+         # and re-write all previous instances for it
+         @instances.each { |prev_features, prev_senses|
+           write_to_sensefile(prev_features, prev_senses,
+                              sense)
+         }
+       end
+     }
+   end
+
+   ###
+   def write_to_sensefile(features, senses,
+                          sense_of_file)
+     # file to write to
+     f = @files[sense_of_file]
+
+     # print features
+     f.print features.map { |x|
+       x.to_s().gsub(/,/, "COMMA")
+     }.join(",")
+
+     f.print ","
+
+     # binarize target class
+     if senses.include? sense_of_file
+       # $stderr.puts "writing POS #{sense_of_file}"
+       f.puts sense_of_file.to_s()
+     else
+       # $stderr.puts "writing NEG #{negsense}"
+       f.puts @negsense
+     end
+
+   end
+ end
+
+ ########
+ # class writing features:
+ # delegating to either a binary or an n-ary writer
+ class WriteFeaturesNaryOrBinary < SimpleDelegator
+   ###
+   def initialize(lemma,
+                  exp,
+                  dataset)
+     feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
+     if exp.get("binary_classifiers")
+       # binary classifiers
+       # $stderr.puts "Writing binary feature data."
+
+       # delegate writing to the binary feature writer
+       @writer = WriteFeaturesBinary.new(lemma, exp, dataset, feature_dir)
+       super(@writer)
+
+     else
+       # n-ary classifiers
+       # $stderr.puts "Writing n-ary feature data."
+
+       # delegate writing to the n-ary feature writer
+       @writer = WriteFeaturesNary.new(lemma, exp, dataset, feature_dir)
+       super(@writer)
+     end
+   end
+
+   def WriteFeaturesNaryOrBinary.feature_dir(exp, dataset,
+                                             mode = "existing")
+     return fred_dirname(exp, dataset, "features", mode)
+   end
+
+   ###
+   def WriteFeaturesNaryOrBinary.remove_files(exp, dataset)
+     feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
+
+     Dir[feature_dir + fred_feature_filename("*")].each { |filename|
+       if File.exists? filename
+         File.delete(filename)
+       end
+     }
+   end
+ end
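
For orientation, the sketch below shows how the MetaFeatureAccess API added in this file might be driven from calling code. It is illustrative only and not part of the gem: the exp argument is assumed to be the Fred experiment-configuration object used throughout the package (see data/lib/fred/FredConfigData.rb), directory helpers such as fred_dirname come from the gem's FredConventions, and the lemma, occurrence ID and sense values are simply the examples already quoted in the comments above.

    # Hypothetical usage sketch, not part of FredFeatures.rb.
    # Assumes `exp` behaves like the experiment object used elsewhere in fred.
    require "fred/FredFeatures"

    def meta_feature_roundtrip(exp)
      # Write one occurrence of "cause" together with its CH meta-feature.
      writer = MetaFeatureAccess.new(exp, "train", "w")
      writer.write_item("cause", "verb",
                        ["2-651966_8"],            # occurrence IDs
                        "2-651966",                # sentence ID
                        ["Causation"],             # senses
                        { "CH" => ["SB#expansion#expansion#NN#",
                                   "OA#change#change#NN#"] })
      writer.flush

      # Read the gzipped meta-feature file back, one item at a time.
      reader = MetaFeatureAccess.new(exp, "train", "r")
      reader.each_item do |lemma, pos, ids, sid, senses, feature_hash|
        puts "#{lemma}/#{pos} #{sid}: #{senses.join(',')} types: #{feature_hash.keys.join(' ')}"
      end
    end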