shalmaneser 1.2.0.rc4 → 1.2.rc5

Files changed (115)
  1. checksums.yaml +4 -4
  2. data/README.md +47 -18
  3. data/bin/shalmaneser +8 -2
  4. data/doc/index.md +1 -0
  5. data/lib/shalmaneser/opt_parser.rb +68 -67
  6. metadata +49 -119
  7. data/bin/fred +0 -16
  8. data/bin/frprep +0 -34
  9. data/bin/rosy +0 -17
  10. data/lib/common/AbstractSynInterface.rb +0 -1229
  11. data/lib/common/Counter.rb +0 -18
  12. data/lib/common/EnduserMode.rb +0 -27
  13. data/lib/common/Eval.rb +0 -480
  14. data/lib/common/FixSynSemMapping.rb +0 -196
  15. data/lib/common/Graph.rb +0 -345
  16. data/lib/common/ISO-8859-1.rb +0 -24
  17. data/lib/common/ML.rb +0 -186
  18. data/lib/common/Mallet.rb +0 -236
  19. data/lib/common/Maxent.rb +0 -229
  20. data/lib/common/Optimise.rb +0 -195
  21. data/lib/common/Parser.rb +0 -213
  22. data/lib/common/RegXML.rb +0 -269
  23. data/lib/common/RosyConventions.rb +0 -171
  24. data/lib/common/STXmlTerminalOrder.rb +0 -194
  25. data/lib/common/SalsaTigerRegXML.rb +0 -2347
  26. data/lib/common/SalsaTigerXMLHelper.rb +0 -99
  27. data/lib/common/SynInterfaces.rb +0 -282
  28. data/lib/common/TabFormat.rb +0 -721
  29. data/lib/common/Tiger.rb +0 -1448
  30. data/lib/common/Timbl.rb +0 -144
  31. data/lib/common/Tree.rb +0 -61
  32. data/lib/common/config_data.rb +0 -470
  33. data/lib/common/config_format_element.rb +0 -220
  34. data/lib/common/headz.rb +0 -338
  35. data/lib/common/option_parser.rb +0 -13
  36. data/lib/common/prep_config_data.rb +0 -62
  37. data/lib/common/prep_helper.rb +0 -1330
  38. data/lib/common/ruby_class_extensions.rb +0 -310
  39. data/lib/db/db_interface.rb +0 -48
  40. data/lib/db/db_mysql.rb +0 -145
  41. data/lib/db/db_sqlite.rb +0 -280
  42. data/lib/db/db_table.rb +0 -239
  43. data/lib/db/db_wrapper.rb +0 -176
  44. data/lib/db/sql_query.rb +0 -243
  45. data/lib/ext/maxent/Classify.class +0 -0
  46. data/lib/ext/maxent/Train.class +0 -0
  47. data/lib/fred/Baseline.rb +0 -150
  48. data/lib/fred/FileZipped.rb +0 -31
  49. data/lib/fred/FredBOWContext.rb +0 -877
  50. data/lib/fred/FredConventions.rb +0 -232
  51. data/lib/fred/FredDetermineTargets.rb +0 -319
  52. data/lib/fred/FredEval.rb +0 -312
  53. data/lib/fred/FredFeatureExtractors.rb +0 -322
  54. data/lib/fred/FredFeatures.rb +0 -1061
  55. data/lib/fred/FredFeaturize.rb +0 -602
  56. data/lib/fred/FredNumTrainingSenses.rb +0 -27
  57. data/lib/fred/FredParameters.rb +0 -402
  58. data/lib/fred/FredSplit.rb +0 -84
  59. data/lib/fred/FredSplitPkg.rb +0 -180
  60. data/lib/fred/FredTest.rb +0 -606
  61. data/lib/fred/FredTrain.rb +0 -144
  62. data/lib/fred/PlotAndREval.rb +0 -480
  63. data/lib/fred/fred.rb +0 -47
  64. data/lib/fred/fred_config_data.rb +0 -185
  65. data/lib/fred/md5.rb +0 -23
  66. data/lib/fred/opt_parser.rb +0 -250
  67. data/lib/frprep/Ampersand.rb +0 -39
  68. data/lib/frprep/CollinsInterface.rb +0 -1165
  69. data/lib/frprep/Counter.rb +0 -18
  70. data/lib/frprep/FNCorpusXML.rb +0 -643
  71. data/lib/frprep/FNDatabase.rb +0 -144
  72. data/lib/frprep/FrameXML.rb +0 -513
  73. data/lib/frprep/Graph.rb +0 -345
  74. data/lib/frprep/MiniparInterface.rb +0 -1388
  75. data/lib/frprep/RegXML.rb +0 -269
  76. data/lib/frprep/STXmlTerminalOrder.rb +0 -194
  77. data/lib/frprep/SleepyInterface.rb +0 -384
  78. data/lib/frprep/TntInterface.rb +0 -44
  79. data/lib/frprep/TreetaggerInterface.rb +0 -327
  80. data/lib/frprep/do_parses.rb +0 -143
  81. data/lib/frprep/frprep.rb +0 -693
  82. data/lib/frprep/interfaces/berkeley_interface.rb +0 -372
  83. data/lib/frprep/interfaces/stanford_interface.rb +0 -353
  84. data/lib/frprep/interpreters/berkeley_interpreter.rb +0 -22
  85. data/lib/frprep/interpreters/stanford_interpreter.rb +0 -22
  86. data/lib/frprep/one_parsed_file.rb +0 -28
  87. data/lib/frprep/opt_parser.rb +0 -94
  88. data/lib/frprep/ruby_class_extensions.rb +0 -310
  89. data/lib/rosy/AbstractFeatureAndExternal.rb +0 -242
  90. data/lib/rosy/ExternalConfigData.rb +0 -58
  91. data/lib/rosy/FailedParses.rb +0 -130
  92. data/lib/rosy/FeatureInfo.rb +0 -242
  93. data/lib/rosy/GfInduce.rb +0 -1115
  94. data/lib/rosy/GfInduceFeature.rb +0 -148
  95. data/lib/rosy/InputData.rb +0 -294
  96. data/lib/rosy/RosyConfusability.rb +0 -338
  97. data/lib/rosy/RosyEval.rb +0 -465
  98. data/lib/rosy/RosyFeatureExtractors.rb +0 -1609
  99. data/lib/rosy/RosyFeaturize.rb +0 -281
  100. data/lib/rosy/RosyInspect.rb +0 -336
  101. data/lib/rosy/RosyIterator.rb +0 -478
  102. data/lib/rosy/RosyPhase2FeatureExtractors.rb +0 -230
  103. data/lib/rosy/RosyPruning.rb +0 -165
  104. data/lib/rosy/RosyServices.rb +0 -744
  105. data/lib/rosy/RosySplit.rb +0 -232
  106. data/lib/rosy/RosyTask.rb +0 -19
  107. data/lib/rosy/RosyTest.rb +0 -829
  108. data/lib/rosy/RosyTrain.rb +0 -234
  109. data/lib/rosy/RosyTrainingTestTable.rb +0 -787
  110. data/lib/rosy/TargetsMostFrequentFrame.rb +0 -60
  111. data/lib/rosy/View.rb +0 -418
  112. data/lib/rosy/opt_parser.rb +0 -379
  113. data/lib/rosy/rosy.rb +0 -78
  114. data/lib/rosy/rosy_config_data.rb +0 -121
  115. data/lib/shalmaneser/version.rb +0 -3
data/lib/fred/FredFeatures.rb (deleted)
@@ -1,1061 +0,0 @@
-# -*- coding: utf-8 -*-
-require "tempfile"
-require "delegate"
-
-require "fred/FredFeatureExtractors"
-
-########################################
-########################################
-# Feature access classes:
-# read and write features
-class AbstractFredFeatureAccess
-  ####
-  def initialize(exp,        # experiment file object
-                 dataset,    # dataset: "train" or "test"
-                 mode = "r") # mode: r, w, a
-    @exp = exp
-    @dataset = dataset
-    @mode = mode
-
-    unless ["r", "w", "a"].include? @mode
-      $stderr.puts "FeatureAccess: unknown mode #{@mode}."
-      exit 1
-    end
-
-  end
-
-  ####
-  def AbstractFredFeatureAccess.remove_feature_files()
-    raise "overwrite me"
-  end
-
-  ####
-  def write_item(lemma,    # string: lemma
-                 pos,      # string: POS
-                 ids,      # array:string: unique IDs of this occurrence of the lemma
-                 sid,      # string: sentence ID
-                 sense,    # string: sense
-                 features) # features: hash feature type-> features (string-> array:string)
-    raise "overwrite me"
-  end
-
-
-  def flush()
-    raise "overwrite me"
-  end
-end
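AbstractFredFeatureAccess only pins down the writer contract: write_item for a single target occurrence, flush to finish up, and a class-level remove_feature_files. A minimal sketch of a conforming subclass, assuming a plain flat-file sink (the PlainFeatureAccess name and output path are illustrative, not part of Shalmaneser; the real subclasses follow below):

    class PlainFeatureAccess < AbstractFredFeatureAccess
      def write_item(lemma, pos, ids, sid, sense, features)
        # lazily open one flat file per dataset, in the mode checked by the superclass
        @out ||= File.new("/tmp/features.#{@dataset}.txt", @mode)
        @out.puts "#{lemma} #{pos} #{ids.join(',')} #{sid} #{sense}"
        features.each_pair { |type, values| @out.puts "  #{type}: #{values.join(' ')}" }
      end

      def flush
        @out.flush if @out
      end
    end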
-
-########################################
-# MetaFeatureAccess:
-# write all featurization data to one gzipped file,
-# directly writing the meta-features as they come
-# format:
-#
-# lemma pos id sense
-# <feature_type>: <features>
-#
-# where feature_type is a word, and features is a list of words, space-separated
-class MetaFeatureAccess < AbstractFredFeatureAccess
-  ###
-  def initialize(exp, dataset, mode)
-    super(exp, dataset, mode)
-
-    @filename = MetaFeatureAccess.filename(@exp, @dataset)
-
-    # make filename for writing features
-    case @mode
-
-    when "w", "a", "r"
-      # read or write access
-      @f = FileZipped.new(@filename, mode)
-
-    else
-      $stderr.puts "MetaFeatureAccess error: illegal mode #{mode}"
-      exit 1
-    end
-  end
-
-
-  ####
-  def MetaFeatureAccess.filename(exp, dataset, mode="new")
-    return fred_dirname(exp, dataset, "meta_features", mode) +
-           "meta_features.txt.gz"
-  end
-
-  ####
-  def MetaFeatureAccess.remove_feature_files(exp, dataset)
-    filename = MetaFeatureAccess.filename(exp, dataset)
-    if File.exists?(filename)
-      File.delete(filename)
-    end
-  end
-
-
-  ###
-  # read items, yield one at a time
-  #
-  # format: tuple consisting of
-  # - target_lemma: string
-  # - target_pos: string
-  # - target_ids: array:string
-  # - target SID: string, sentence ID
-  # - target_senses: array:string
-  # - feature_hash: feature_type->values, string->array:string
-  def each_item()
-    unless @mode == "r"
-      $stderr.puts "MetaFeatureAccess error: cannot read from a file not opened for reading"
-      exit 1
-    end
-
-    lemma = pos = sid = ids = senses = nil
-
-    feature_hash = Hash.new()
-
-    @f.each { |line|
-      line.chomp!
-      if line =~ /^\s/
-        # line starts with whitespace: continues description of previous item
-        # that is, if we have a previous item
-        #
-        # format of line:
-        # feature_type: feature feature feature ...
-        # as in
-        # CH: SB#expansion#expansion#NN# OA#change#change#NN#
-        unless lemma
-          $stderr.puts "MetaFeatureAccess error: unexpected leading whitespace"
-          $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
-          $stderr.puts line
-          next
-        end
-
-        feature_type, *features = line.split()
-
-        unless feature_type =~ /^(.*):$/
-          # feature type should end in ":"
-          $stderr.puts "MetaFeatureAccess error: feature type should end in ':' but doesn't"
-          $stderr.puts "in meta-feature file #{@filename}, ignoring line:"
-          $stderr.puts line
-          next
-        end
-
-        feature_hash[feature_type[0..-2]] = features
-
-
-      else
-        # first line of item.
-        #
-        # format:
-        # lemma POS IDs sid senses
-        #
-        # as in:
-        # cause verb 2-651966_8 2-651966 Causation
-
-        # first yield previous item
-        if lemma
-          yield [lemma, pos, ids, sid, senses, feature_hash]
-        end
-
-        # then start new item:
-        lemma, pos, ids_s, sid, senses_s = line.split()
-        ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
-        senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }
-
-        # reset feature hash
-        feature_hash.clear()
-      end
-    }
-
-    # one more item to yield?
-    if lemma
-      yield [lemma, pos, ids, sid, senses, feature_hash]
-    end
-  end
-
-
-
-  ###
-  def write_item(lemma,    # string: target lemma
-                 pos,      # string: target pos
-                 ids,      # array:string: unique IDs of this occurrence of the lemma
-                 sid,      # string: sentence ID
-                 senses,   # array:string: sense
-                 features) # features: hash feature type-> features (string-> array:string)
-
-    unless ["w", "a"].include? @mode
-      $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
-      exit 1
-    end
-
-    if not(lemma) or lemma.empty? or not(ids) or ids.empty?
-      # nothing to write
-      # HIER debugging
-      # raise "HIER no lemma or no IDs: #{lemma} #{ids}"
-      return
-    end
-    if pos.nil? or pos.empty?
-      # POS unknown
-      pos = ""
-    end
-    unless senses
-      senses = [ @exp.get("noval") ]
-    end
-
-    ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
-
-    senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
-    @f.puts "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s}"
-    features.each_pair { |feature_type, f_list|
-      @f.puts " #{feature_type}: " + f_list.map { |f| f.to_s() }.join(" ")
-    }
-    @f.flush()
-  end
-
-  ###
-  def flush()
-    unless ["w", "a"].include? @mode
-      $stderr.puts "MetaFeatureAccess error: cannot write to feature file opened for reading"
-      exit 1
-    end
-
-    # actually, nothing to be done here
-  end
-
-end
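The gzipped meta-feature format documented above is line-oriented: one unindented header line per target occurrence ("lemma POS ids sid senses"), followed by indented "feature_type: features" lines. A usage sketch, assuming an already loaded experiment object exp; the sample values are illustrative, and writing and reading normally happen in separate pipeline steps:

    writer = MetaFeatureAccess.new(exp, "train", "w")
    writer.write_item("cause", "verb", ["2-651966_8"], "2-651966", ["Causation"],
                      "CH" => ["SB#expansion#expansion#NN#", "OA#change#change#NN#"])

    reader = MetaFeatureAccess.new(exp, "train", "r")
    reader.each_item do |lemma, pos, ids, sid, senses, feature_hash|
      puts "#{lemma}/#{pos} in #{sid}: #{senses.join(',')} -- #{feature_hash.keys.join(' ')}"
    end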
-
-
-########################################
-# FredFeatureWriter:
-# write chosen features (according to the experiment file)
-# to
-# - one file per lemma for n-ary classification
-# - one file per lemma/sense pair for binary classification
-#
-# format: CSV, last entry is target class
-class FredFeatureAccess < AbstractFredFeatureAccess
-  ###
-  def initialize(exp, dataset, mode)
-    super(exp, dataset, mode)
-
-    # write to auxiliary files first,
-    # to sort items by lemma
-    @w_tmp = AuxKeepWriters.new()
-
-    # which features has the user requested?
-    feature_info_obj = FredFeatureInfo.new(@exp)
-    @feature_extractors = feature_info_obj.get_extractor_objects()
-
-  end
-
-  ####
-  def FredFeatureAccess.remove_feature_files(exp, dataset)
-
-    # remove feature files
-    WriteFeaturesNaryOrBinary.remove_files(exp, dataset)
-
-    # remove key files
-    AnswerKeyAccess.remove_files(exp, dataset)
-  end
-
-  ###
-  def FredFeatureAccess.legend_filename(lemmapos)
-    return "fred.feature_legend.#{lemmapos}"
-  end
-
-  ###
-  def FredFeatureAccess.feature_dir(exp, dataset)
-    return WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
-  end
-
-  ###
-  # each feature file:
-  # iterate through feature files,
-  # yield pairs [filename, values]
-  # where 'values' is a hash containing keys
-  # 'lemma' and potentially 'sense'
-  #
-  # filenames are sorted alphabetically before being yielded
-  #
-  # available in read and write mode
-  def FredFeatureAccess.each_feature_file(exp, dataset)
-    feature_dir = FredFeatureAccess.feature_dir(exp, dataset)
-    Dir[feature_dir + "*"].sort().each { |filename|
-      if (values = deconstruct_fred_feature_filename(filename))
-        yield [filename, values]
-      end
-    }
-  end
-
-  ###
-  # write item:
-  # - transform meta-features into actual features as requested
-  #   in the experiment file
-  # - write item to tempfile, don't really write yet
-  def write_item(lemma,    # string: target lemma
-                 pos,      # string: target pos
-                 ids,      # array:string: unique IDs of this occurrence of the lemma
-                 sid,      # string: sentence ID
-                 senses,   # array:string: sense
-                 features) # features: hash feature type-> features (string-> array:string)
-
-
-    unless ["w", "a"].include? @mode
-      $stderr.puts "FredFeatures error: cannot write to feature file opened for reading"
-      exit 1
-    end
-
-    if lemma.nil? or lemma.empty? or ids.nil? or ids.empty?
-      # nothing to write
-      return
-    end
-    if pos.nil? or pos.empty?
-      # POS unknown
-      pos = ""
-    end
-
-    # wrong! noval is not allowed for fred! (only for rosy!) - why is this here???
-    unless senses
-      senses = [ @exp.get("noval") ]
-    end
-
-    # modified by ines, 19.7.2010
-    # senses should be empty, but they are not - why?
-    if senses.length == 1 and senses[0].eql? ""
-      senses = "NONE"
-    end
-
-    writer = @w_tmp.get_writer_for(fred_lemmapos_combine(lemma, pos))
-    ids_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
-
-    # AB: Ines modified <senses> and it can be a String.
-    # That's corrected, but I do not guarantee the correct results.
-    if senses.respond_to? :map
-      senses_s = senses.map { |s| s.gsub(/:/, "COLON") }.join("::")
-    end
-    writer.print "#{lemma} #{pos} #{ids_s} #{sid} #{senses_s} "
-
-    # write all features
-    @feature_extractors.each { |extractor|
-      extractor.each_feature(features) { |feature|
-        writer.print feature, " "
-      }
-    }
-    writer.puts
-    writer.flush()
-  end
-
-  ###
-  def flush()
-    unless ["w", "a"].include? @mode
-      $stderr.puts "FredFeatureAccess error: cannot write to feature file opened for reading"
-      exit 1
-    end
-
-    # elements in the feature vector: get fixed with the training data,
-    # get read with the test data.
-    # get stored in feature_legend_dir
-    case @dataset
-    when "train"
-      feature_legend_dir = File.new_dir(fred_classifier_directory(@exp),
-                                        "legend")
-    when "test"
-      feature_legend_dir = File.existing_dir(fred_classifier_directory(@exp),
-                                             "legend")
-    end
-
-    # now really write features
-    @w_tmp.flush()
-    @w_tmp.get_lemmas().sort().each { |lemmapos|
-
-      # inform user
-      $stderr.puts "Writing #{lemmapos}..."
-
-      # prepare list of features to use in the feature vector:
-      legend_filename = feature_legend_dir + FredFeatureAccess.legend_filename(lemmapos)
-
-      case @dataset
-      when "train"
-        # training data:
-        # determine feature list and sense list from the data,
-        # and store in the relevant file
-        feature_list, sense_list = collect_feature_list(lemmapos)
-        begin
-          f = File.new(legend_filename, "w")
-        rescue
-          $stderr.puts "Error: Could not write to feature legend file #{legend_filename}: " + $!
-          exit 1
-        end
-        f.puts feature_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
-        f.puts sense_list.map { |x| x.gsub(/,/, "COMMA") }.join(",")
-        f.close()
-
-      when "test"
-        # test data:
-        # read feature list and sense list from the relevant file
-
-        begin
-          f = File.new(legend_filename)
-        rescue
-          $stderr.puts "Error: Could not read feature legend file #{legend_filename}: " + $!
-          $stderr.puts "Skipping this lemma."
-          next
-        end
-        feature_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
-        sense_list = f.gets().chomp().split(",").map { |x| x.gsub(/COMMA/, ",") }
-      end
-
-      # write
-      # - featurization file
-      # - answer key file
-
-      f = @w_tmp.get_for_reading(lemmapos)
-      answer_obj = AnswerKeyAccess.new(@exp, @dataset, lemmapos, "w")
-
-      obj_out = WriteFeaturesNaryOrBinary.new(lemmapos, @exp, @dataset)
-
-      f.each { |line|
-
-        lemma, pos, ids, sid, senses, features = parse_temp_itemline(line)
-        unless lemma
-          # something went wrong in parsing the line
-          next
-        end
-        each_sensegroup(senses, sense_list) { |senses_for_item, original_senses|
-          # write answer key
-          answer_obj.write_line(lemma, pos,
-                                ids, sid, original_senses, senses_for_item)
-
-          # write item: features, senses
-          obj_out.write_instance(to_feature_list(features, feature_list),
-                                 senses_for_item)
-        } # each sensegroup
-      } # each input line
-      obj_out.close()
-      answer_obj.close()
-      @w_tmp.discard(lemmapos)
-    } # each lemma
-
-
-  end
-
-  ##################
-  protected
-
-  ###
-  # read temp feature file for the given lemma/pos
-  # and determine the list of all features and the list of all senses,
-  # each sorted alphabetically
-  def collect_feature_list(lemmapos)
-    # read entries for this lemma
-    f = @w_tmp.get_for_reading(lemmapos)
-
-    # keep a record of all senses and features
-    # senses: binary.
-    # features: keep the max. number of times a given feature occurred
-    # in an instance
-    all_senses = Hash.new()
-    all_features = Hash.new(0)
-    features_this_instance = Hash.new(0)
-    # record how often each feature occurred all in all
-    num_occ = Hash.new(0)
-    num_lines = 0
-
-    f.each { |line|
-      lemma, pos, id_string, sid, senses, features = parse_temp_itemline(line)
-
-      unless lemma
-        # something went wrong in parsing the line
-        # print out the file contents for reference, then leave
-        $stderr.puts "Could not read temporary feature file #{f.path()} for #{lemmapos}."
-        exit 1
-      end
-      num_lines += 1
-      senses.each { |s| all_senses[s] = true }
-      features_this_instance.clear()
-      features.each { |fea|
-        features_this_instance[fea] += 1
-        num_occ[fea] += 1
-      }
-
-      features_this_instance.each_pair { |feature, value|
-        all_features[feature] = [ all_features[feature], features_this_instance[feature] ].max()
-      }
-    }
-
-    # HIER
-    # if num_lines > 2
-    #   num_occ.each_pair { |feature, num_occ|
-    #     if num_occ < 2
-    #       all_features.delete(feature)
-    #     end
-    #   }
-    # end
-
-
-
-    case @exp.get("numerical_features")
-    when "keep"
-      # leave numerical features as they are, or
-      # don't do numerical features
-      return [ all_features.keys().sort(),
-               all_senses.keys().sort()
-             ]
-
-    when "repeat"
-      # repeat: turn numerical feature with max. value N
-      # into N binary features
-      feature_list = Array.new()
-      all_features.keys().sort().each { |feature|
-        all_features[feature].times() { |index|
-          feature_list << feature + " #{index}/#{all_features[feature]}"
-        }
-      }
-      return [ feature_list,
-               all_senses.keys().sort()
-             ]
-
-    when "bin"
-      # make bins:
-      # number of bins = (max. number of occurrences of a feature per item) / 10
-      feature_list = Array.new()
-      all_features.keys().sort().each { |feature|
-        num_bins_this_feature = (all_features[feature].to_f() / 10.0).ceil().to_i()
-
-        num_bins_this_feature.times { |index|
-          feature_list << feature + " #{index}/#{num_bins_this_feature}"
-        }
-      }
-      return [ feature_list,
-               all_senses.keys().sort()
-             ]
-    else
-      raise "Shouldn't be here"
-    end
-  end
-
-
-  ###
-  # given a full sorted list of items and a partial list of items,
-  # match the partial list to the full list,
-  # that is, produce as many items as the full list has
-  # yielding 0 where the partial entry is not in the full list,
-  # and > 0 otherwise
-  #
-  # Note that if partial contains items not in full,
-  # they will not occur on the feature list returned!
-  def to_feature_list(partial, full,
-                      handle_numerical_features = nil)
-
-    #print "FULL: ", full, "\n"
-    #print "PART: ", partial, "\n"
-    # count occurrences of each feature in the partial list
-    occ_hash = Hash.new(0)
-    partial.each { |p|
-      occ_hash[p] += 1
-    }
-
-    # what to do with our counts?
-    unless handle_numerical_features
-      # no pre-set value given when this function was called
-      handle_numerical_features = @exp.get("numerical_features")
-    end
-
-    case handle_numerical_features
-    when "keep"
-      # leave numerical features as numerical features
-      return full.map { |x|
-        occ_hash[x].to_s()
-      }
-
-    when "repeat"
-      # repeat each numerical feature up to a max. number of occurrences
-      return full.map { |feature_plus_count|
-        unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
-          $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
-          raise "Shouldn't be here."
-        end
-
-        feature = $1
-        current_count = $2.to_i()
-        max_num = $3.to_i()
-
-        if occ_hash[feature] > current_count
-          1
-        else
-          0
-        end
-      }
-
-    when "bin"
-      # group numerical feature values into N bins.
-      # number of bins varies from feature to feature
-      # each bin contains 10 different counts
-      return full.map { |feature_plus_count|
-        unless feature_plus_count =~ /^(.*) (\d+)\/(\d+)$/
-          $stderr.puts "Error: could not parse feature: #{feature_plus_count}, bailing out."
-          raise "Shouldn't be here."
-        end
-
-        feature = $1
-        current_count = $2.to_i()
-        max_num = $3.to_i()
-
-        if occ_hash[feature] % 10 > (10 * current_count)
-          1
-        else
-          0
-        end
-      }
-    else
-      raise "Shouldn't be here"
-    end
-  end
-
-
-  ###
-  # how to treat instances with multiple senses?
-  # - either write one item per sense
-  # - or combine all senses into one string
-  # - or keep as separate senses
-  #
-  # according to 'handle_multilabel' in the experiment file
-  #
-  # yields pairs of [senses, original_senses]
-  # both are arrays of strings
-  def each_sensegroup(senses, full_sense_list)
-    case @exp.get("handle_multilabel")
-    when "keep"
-      yield [senses, senses]
-    when "join"
-      yield [ [fred_join_senses(senses)], senses]
-    when "repeat"
-      senses.each { |s|
-        yield [ [s], senses]
-      }
-    when "binarize"
-      yield [ senses, senses ]
-    else
-      $stderr.puts "Error: unknown setting #{@exp.get("handle_multilabel")}"
-      $stderr.puts "for 'handle_multilabel' in the experiment file."
-      $stderr.puts "Please choose one of 'binary', 'keep', 'join', 'repeat'"
-      $stderr.puts "or leave unset -- default is 'binary'."
-      exit 1
-    end
-  end
-
-  ###
-  def parse_temp_itemline(line)
-    lemma, pos, ids_s, sid, senses_s, *features = line.split()
-    # fix me! senses is empty, takes context features instead
-    unless senses_s
-      # features may be empty, but we need senses
-      $stderr.puts "FredFeatures Error in word sense item line: too short."
-      $stderr.puts ">>#{line}<<"
-      return nil
-    end
-
-    ids = ids_s.split("::").map { |i| i.gsub(/COLON/, ":") }
-    senses = senses_s.split("::").map { |s| s.gsub(/COLON/, ":") }
-
-    return [lemma, pos, ids, sid, senses, features]
-  end
-
-end
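To make the numerical_features handling above concrete: under "repeat", a feature that occurs at most N times in a single training instance is expanded by collect_feature_list into N binary slots "feature i/N", and to_feature_list switches on the first k slots for an instance that contains the feature k times. A self-contained sketch of that mapping, independent of the class internals (values are made up):

    full    = ["ctx_word 0/3", "ctx_word 1/3", "ctx_word 2/3"]  # slots fixed on training data
    partial = ["ctx_word", "ctx_word"]                           # the feature occurs twice here

    occ = Hash.new(0)
    partial.each { |p| occ[p] += 1 }

    vector = full.map do |slot|
      slot =~ /^(.*) (\d+)\/(\d+)$/
      occ[$1] > $2.to_i ? 1 : 0
    end
    # vector == [1, 1, 0]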
-
-########################################
-# read and write access to answer key files
-# manages a single answer key file for a given lemma/POS pair
-class AnswerKeyAccess
-  ###
-  def initialize(exp,      # experiment file object
-                 dataset,  # "train", "test"
-                 lemmapos, # lemma + POS (one string)
-                 mode,     # "r", "w", "a"
-                 split_id = nil,
-                 split_dataset = nil)
-    unless ["r", "w", "a"].include? mode
-      $stderr.puts "FredFeatures error: AnswerKeyAccess initialized with mode #{mode}."
-      exit 1
-    end
-
-    @mode = mode
-
-    answer_filename = fred_dirname(exp, dataset, "keys", "new") +
-                      fred_answerkey_filename(lemmapos)
-
-    # are we reading the whole answer key file, or only the test part
-    # of a split of it?
-    if split_id
-      # we are accessing part of a split
-      # we can only do that when reading!
-      unless @mode == "r"
-        $stderr.puts "AnswerKeyAccess error: cannot access split answer file in write mode."
-        exit 1
-      end
-
-      # apply_split returns a closed temporary file
-      split_obj = FredSplitPkg.new(exp)
-      @f = split_obj.apply_split(answer_filename, lemmapos, split_dataset, split_id)
-      if @f.nil?
-        # the split_dataset part of the split doesn't contain any data
-        $stderr.puts "Warning: no #{split_dataset} data for lemma #{lemmapos}"
-      else
-        @f.open()
-      end
-
-    else
-      # we are reading the whole thing
-      begin
-        @f = File.new(answer_filename, @mode)
-      rescue
-        @f = nil
-      end
-    end
-  end
-
-  ###
-  def write_line(lemma,            # string: lemma
-                 pos,              # string: POS
-                 ids,              # array:string: target IDs
-                 sid,              # string: sentence ID
-                 senses,           # array:string: senses
-                 senses_this_item) # array:string: senses for this item
-    unless ["w", "a"].include? @mode
-      $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot write in read mode."
-      exit 1
-    end
-    unless @f
-      raise "Shouldn't be here"
-    end
-
-    # write answer key:
-    # lemma POS ID senses
-    if senses.include? nil or senses.include? ""
-      raise "empty sense"
-    end
-    if senses_this_item.include? nil or senses_this_item.include? ""
-      raise "empty sense for this item"
-    end
-
-    senses_s = senses.map { |s| s.gsub(/,/, "COMMA")}.join(",")
-    senses_ti_s = senses_this_item.map { |s|
-      s.gsub(/,/, "COMMA")}.join(",")
-    id_s = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
-
-    @f.puts "#{lemma} #{pos} #{id_s} #{sid} #{senses_s} #{senses_ti_s}"
-  end
-
-  ###
-  # yield one line at a time:
-  # tuple (lemma, POS, ids, sentence_ID, all_assigned_senses, transformed_senses_for_this_item)
-  def each()
-    unless @mode == "r"
-      $stderr.puts "FredFeatures error: AnswerKeyAccess: cannot read in write mode"
-    end
-    unless @f
-      # something went wrong during initialization:
-      # split didn't contain data
-      return
-    end
-
-    @f.each { |line|
-
-      lemma, pos, id_s, sid, senses_s, senses_this_item_s = line.split()
-      ids = id_s.split("::").map { |i| i.gsub(/COLON/, ":") }
-      senses = senses_s.split(",").map { |s| s.gsub(/COMMA/, ",") }
-
-      senses_this_item = senses_this_item_s.split(",").map { |s|
-        s.gsub(/COMMA/, ",") }
-
-      yield [lemma, pos, ids, sid, senses, senses_this_item]
-    }
-  end
-
-  ###
-  def close()
-    @f.close()
-  end
-
-  ###
-  def AnswerKeyAccess.remove_files(exp, dataset)
-    Dir[fred_dirname(exp, dataset, "keys", "new") + fred_answerkey_filename("*")].each { |filename|
-      if File.exists?(filename)
-        File.delete(filename)
-      end
-    }
-  end
-end
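Answer key lines are whitespace-separated, so the separators reused inside fields are escaped: ':' inside IDs becomes 'COLON', ',' inside senses becomes 'COMMA'. A round-trip sketch of that convention with made-up values:

    ids    = ["2-651966_8", "id:with:colons"]
    senses = ["Causation", "Cause,Reason"]

    id_s     = ids.map { |i| i.gsub(/:/, "COLON") }.join("::")
    senses_s = senses.map { |s| s.gsub(/,/, "COMMA") }.join(",")
    # id_s     == "2-651966_8::idCOLONwithCOLONcolons"
    # senses_s == "Causation,CauseCOMMAReason"

    # decoding, as in AnswerKeyAccess#each
    id_s.split("::").map { |i| i.gsub(/COLON/, ":") }     # == ids
    senses_s.split(",").map { |s| s.gsub(/COMMA/, ",") }  # == senses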
-
-
-####################
-# keep writers: auxiliary class for FredFeatureAccess:
-# write to several files at a time
-# in tempfiles
-class AuxKeepWriters
-  def initialize()
-    @lemma2temp = Hash.new()
-    @size = 50
-    @writers = Array.new()
-  end
-
-
-  ##
-  def flush()
-    @writers.each { |lemmapos, writer|
-      writer.close()
-    }
-  end
-
-  ##
-  def get_lemmas()
-    return @lemma2temp.keys()
-  end
-
-  ##
-  def get_for_reading(lemmapos)
-    if @lemma2temp[lemmapos]
-      # we have a writer for this
-
-      @lemma2temp[lemmapos].close()
-      @lemma2temp[lemmapos].open()
-      return @lemma2temp[lemmapos]
-
-    else
-      # no writer for this
-      return nil
-    end
-  end
-
-  ##
-  # finally close temp file, remove information for lemma/pos
-  def discard(lemmapos)
-    if @lemma2temp[lemmapos]
-      @lemma2temp[lemmapos].close(true)
-      @lemma2temp.delete(lemmapos)
-    end
-  end
-
-  ##
-  def get_writer_for(lemmapos)
-
-    # is there a temp file for this lemma/pos combination?
-    unless @lemma2temp[lemmapos]
-      @lemma2temp[lemmapos] = Tempfile.new("fred_features")
-      @lemma2temp[lemmapos].close()
-    end
-
-    # is there an open temp file for this lemma/pos combination?
-    pair = @writers.assoc(lemmapos)
-    if pair
-      return pair.last()
-    end
-
-    # no: open the temp file, kick some other temp file out of the
-    # @writers list
-    writer = @lemma2temp[lemmapos]
-    writer.open()
-
-
-    # writer: open for appending
-    writer.seek(0, IO::SEEK_END)
-
-
-    @writers << [lemmapos, writer]
-    if @writers.length() > @size
-      # close file associated with first writer
-      @writers.first.last.close()
-      @writers.shift()
-    end
-    return writer
-  end
-
-  ###
-  def remove_files()
-    @lemma2temp.each_value { |x|
-      x.close(true)
-    }
-  end
-end
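AuxKeepWriters keeps at most @size (50) tempfiles open at once: get_writer_for reopens a lemma's tempfile positioned for appending and evicts the oldest open handle FIFO-style, while the Tempfile object itself stays in @lemma2temp so it can be reopened or read back later. A stripped-down sketch of that eviction pattern (hypothetical class, not part of Shalmaneser):

    require "tempfile"

    class BoundedWriterPool
      def initialize(limit = 50)
        @files = {}    # key -> Tempfile, possibly closed
        @open  = []    # [key, Tempfile] pairs currently open, oldest first
        @limit = limit
      end

      def writer_for(key)
        file = (@files[key] ||= Tempfile.new("pool").tap(&:close))
        return file if @open.assoc(key)
        file.open
        file.seek(0, IO::SEEK_END)                      # append to earlier content
        @open << [key, file]
        @open.shift.last.close if @open.size > @limit   # evict the oldest handle
        file
      end
    end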
-
-##############
-# write features,
-# either lemma-wise
-# or lemma+sense-wise
-# if lemma+sense-wise, write as binary classifier,
-# i.e. map the target senses
-#
-# Use Delegator.
-
-###
-# Features for N-ary classifiers
-class WriteFeaturesNary
-  def initialize(lemma,
-                 exp,
-                 dataset,
-                 feature_dir)
-
-    @filename = feature_dir + fred_feature_filename(lemma)
-    @f = File.new(@filename, "w")
-    @handle_multilabel = exp.get("handle_multilabel")
-  end
-
-  def write_instance(features, senses)
-
-    @f.print features.map { |x|
-      x.to_s().gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
-    }.join(",")
-
-    # possibly more than one sense? then use semicolon to separate
-    if @handle_multilabel == "keep"
-      # possibly more than one sense:
-      # separate by semicolon,
-      # and hope that the classifier knows this
-      @f.print ";"
-      @f.puts senses.map {|x|
-        x.to_s().gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
-      }.join(",")
-    else
-      # one sense: just separate by comma
-      @f.print ","
-      @f.puts senses.first().to_s().gsub(/,/, "COMMA").gsub(/;/, "SEMICOLON")
-    end
-  end
-
-  def close()
-    @f.close()
-  end
-end
-
-###
-# Features for binary classifiers
-class WriteFeaturesBinary
-  def initialize(lemma,
-                 exp,
-                 dataset,
-                 feature_dir)
-    @dir = feature_dir
-    @lemma = lemma
-    @feature_dir = feature_dir
-
-    @negsense = exp.get("negsense")
-    unless @negsense
-      @negsense = "NONE"
-    end
-
-    # files: sense -> file object
-    @files = Hash.new()
-
-    # keep all instances such that, when a new sense comes around,
-    # we can write them for that sense
-    @instances = Array.new()
-  end
-
-
-  def write_instance(features, senses)
-    # sense we haven't seen before? Then we need to
-    # write the whole featurization file for that new sense
-    check_for_presence_of_senses(senses)
-
-    # write this new instance for all senses
-    @files.each_key { |sense_of_file|
-      write_to_sensefile(features, senses, sense_of_file)
-    }
-
-    # store instance in case another sense crops up later
-    @instances << [features, senses]
-  end
-
-
-  ###
-  def close()
-    @files.each_value { |f| f.close() }
-  end
-
-  ######
-  private
-
-  def check_for_presence_of_senses(senses)
-    senses.each { |sense|
-      # do we have a sense file for this sense?
-      unless @files[sense]
-        # open new file for this sense
-        @files[sense] = File.new(@feature_dir + fred_feature_filename(@lemma, sense, true), "w")
-        # filename = @feature_dir + fred_feature_filename(@lemma, sense, true)
-        # $stderr.puts "Starting new feature file #{filename}"
-
-        # and re-write all previous instances for it
-        @instances.each { |prev_features, prev_senses|
-          write_to_sensefile(prev_features, prev_senses,
-                             sense)
-        }
-      end
-    }
-  end
-
-  ###
-  def write_to_sensefile(features, senses,
-                         sense_of_file)
-    # file to write to
-    f = @files[sense_of_file]
-
-    # print features
-    f.print features.map { |x|
-      x.to_s().gsub(/,/, "COMMA")
-    }.join(",")
-
-    f.print ","
-
-    # binarize target class
-    if senses.include? sense_of_file
-      # $stderr.puts "writing POS #{sense_of_file}"
-      f.puts sense_of_file.to_s()
-    else
-      # $stderr.puts "writing NEG #{negsense}"
-      f.puts @negsense
-    end
-
-  end
-end
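So for a lemma with the senses Causation and Reason, and negsense left at its default of NONE, the binary writer keeps one CSV file per sense (named via fred_feature_filename) and writes every instance to every file with the class column binarized. Schematically, with made-up feature values:

    instance (gold sense)    file for sense Causation    file for sense Reason
    f1,f2  (Causation)       f1,f2,Causation             f1,f2,NONE
    f3,f4  (Reason)          f3,f4,NONE                  f3,f4,Reason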
-
-########
-# class writing features:
-# delegating to either a binary or an n-ary writer
-class WriteFeaturesNaryOrBinary < SimpleDelegator
-  ###
-  def initialize(lemma,
-                 exp,
-                 dataset)
-    feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
-    if exp.get("binary_classifiers")
-      # binary classifiers
-      # $stderr.puts "Writing binary feature data."
-
-      # delegate writing to the binary feature writer
-      @writer = WriteFeaturesBinary.new(lemma, exp, dataset, feature_dir)
-      super(@writer)
-
-    else
-      # n-ary classifiers
-      # $stderr.puts "Writing n-ary feature data."
-
-      # delegate writing to the n-ary feature writer
-      @writer = WriteFeaturesNary.new(lemma, exp, dataset, feature_dir)
-      super(@writer)
-
-    end
-  end
-
-  def WriteFeaturesNaryOrBinary.feature_dir(exp, dataset,
-                                            mode = "existing")
-    return fred_dirname(exp, dataset, "features", mode)
-  end
-
-  ###
-  def WriteFeaturesNaryOrBinary.remove_files(exp, dataset)
-    feature_dir = WriteFeaturesNaryOrBinary.feature_dir(exp, dataset, "new")
-
-    Dir[feature_dir + fred_feature_filename("*")].each { |filename|
-      if File.exists? filename
-        File.delete(filename)
-      end
-    }
-  end
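WriteFeaturesNaryOrBinary leans on SimpleDelegator from the standard delegate library: the caller always talks to a single writer object with write_instance and close, and every call is forwarded to whichever concrete writer the experiment file selected. A minimal sketch of the same pattern with hypothetical writers (not part of Shalmaneser):

    require "delegate"

    class UpcaseWriter
      def write_instance(features, senses); puts((features + senses).join(",").upcase); end
    end

    class PlainWriter
      def write_instance(features, senses); puts((features + senses).join(",")); end
    end

    # pick one writer up front, then forward every call to it
    class EitherWriter < SimpleDelegator
      def initialize(upcase)
        super(upcase ? UpcaseWriter.new : PlainWriter.new)
      end
    end

    EitherWriter.new(false).write_instance(["f1", "f2"], ["Causation"])
    # prints: f1,f2,Causation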