frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,312 @@
1
+ # FredEval
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: evaluate classification results
5
+ #
6
+ # While the other main classes of Fred just provide a new() method
7
+ # and a compute() method,
8
+ # the FredEval class also provides access methods to all the
9
+ # individual evaluation results and allows for a flag that
10
+ # suppresses evaluation output to a file --
11
+ # such that this package can also be used by external systems that
12
+ # wish to evaluate Fred.
13
+ #
14
+ # Inherits from the Eval class that is not Fred-specific
15
+
16
+ # Salsa packages
17
+ require "common/Eval"
18
+ require "common/ruby_class_extensions"
19
+
20
+ # Fred packages
21
+ require "fred/FredConfigData"
22
+ require "fred/FredConventions"
23
+ require "fred/FredFeatures"
24
+ require "fred/FredDetermineTargets"
25
+
26
class FredEval < Eval

  ###
  # new
  #
  # evaluate runtime options and announce the task
  #
  # Runtime options handled here:
  #   "--logID"    => ID of a split; stored in @split_id and used by
  #                   each_group() to pick the answer keys to read
  #   "--printLog" => additionally write a log file "eval_logfile.txt"
  # Any other option is ignored.
  #
  # Terminates the program with exit status 1 if the list of known
  # targets could not be read.
  def initialize(exp_obj, # FredConfigData object
                 options) # hash: runtime option name (string) => value(string)

    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    # always store a plain boolean, never the threshold value itself
    @multiple_senses_assigned =
      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to below.
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # the split is identified by @split_id (set from "--logID" above);
    # this used to test a variable that is never assigned, so the
    # split was never mentioned in the announcement
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas
  #
  # also, set object-global variables in such a way
  # that the elements of this group can be read:
  #   @classfile  - open file with the classifier output for the lemma,
  #                 or nil if it could not be opened
  #   @goldreader - AnswerKeyAccess object for the lemma's answer keys
  #   @all_senses - all senses of the lemma (multiple-sense evaluation only),
  #                 nil otherwise
  #
  # @classfile is only valid while the block runs: it is closed and
  # reset to nil as soon as the block returns.
  def each_group()

    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas().sort().each { |lemma|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Evaluating " + lemma
      end

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue
        # no classification results
        @classfile = nil
      end

      begin
        # file with answers:
        # maybe we need to apply a split first
        if @split_id
          @goldreader = AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
        else
          @goldreader = AnswerKeyAccess.new(@exp, "test", lemma, "r")
        end

        # doing multilabel evaluation?
        # then we need a list of all senses
        if @multiple_senses_assigned
          @all_senses = @target_obj.get_senses(lemma)
        else
          @all_senses = nil
        end

        yield lemma
      ensure
        # do not leak one open file handle per lemma
        if @classfile
          @classfile.close()
          @classfile = nil
        end
      end
    }
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # With a confidence threshold set, one pair is yielded per sense of
  # the lemma, both members being "true" or "false".
  # Without a threshold, one pair [gold_class, "true"] is yielded per
  # sense in the highest-confidence assignment; instances without any
  # classifier output yield nothing.
  #
  # relies on each_group() having set the appropriate readers
  # @goldreader and @classfile
  def each_instance(lemma) # string: lemma name

    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = {}

    # read gold file and classifier output file in parallel.
    # Unused block parameters carry an underscore so that they do not
    # shadow (or overwrite) the method argument 'lemma'.
    @goldreader.each { |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|

      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first
      #
      # The line is consumed even for repeated instances (see below),
      # which keeps both files in step.
      classline = (@classfile && @classfile.gets()) || ""

      # have we done this same instance previously?
      next if instance_ids_seen[target_ids]
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # determine all assigned senses and their confidence levels
      # senses assigned: list of pairs [senselist, confidence]
      # where senselist is an array of sense strings
      senses_assigned = []
      current_sense = nil

      classline.split().each_with_index { |entry, index|
        if index % 2 == 0
          # we have a sense label
          if @handle_multilabel == "join"
            # split up joined senses
            current_sense = fred_split_sense(entry)
          else
            current_sense = [entry]
          end
        else
          # we have a confidence level
          senses_assigned << [current_sense, entry.to_f()]
        end
      }

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold

        # transform senses_assigned:
        # in the case of "join", one sense may have several confidence levels,
        # one on its own and one in a joined sense
        senses_assigned_hash = {}
        senses_assigned.each { |senses, confidence|
          senses.each { |s|
            # assign to each sense the maximum of its previous confidence
            # and this one.
            # watch out: confidence may be smaller than zero
            if senses_assigned_hash[s]
              senses_assigned_hash[s] = [senses_assigned_hash[s], confidence].max()
            else
              senses_assigned_hash[s] = confidence
            end
          }
        }

        # select all sense/confidence pairs where confidence is above threshold,
        # then retain only the sense, not the confidence
        senses_assigned = senses_assigned_hash.to_a().select { |_sense, confidence|
          confidence >= @threshold
        }.map { |sense, _confidence|
          sense
        }

        unless @all_senses
          raise "Shouldn't be here"
        end

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each { |sense_of_lemma|
          gold_class = (senses_gold.include? sense_of_lemma).to_s()
          assigned_class = (senses_assigned.include? sense_of_lemma).to_s()
          yield [gold_class, assigned_class]
        }

      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)

        # nothing to yield if the classifier assigned nothing
        next if senses_assigned.empty?

        # actually assigned class: only the one with the
        # maximum confidence
        max_senselist = senses_assigned.max { |a, b|
          a.last() <=> b.last()
        }.first()

        max_senselist.each { |single_sense|
          gold_class = (senses_gold.include? single_sense).to_s()
          yield [gold_class, "true"]
        }
      end
    }
  end

end
@@ -0,0 +1,321 @@
1
class FredFeatureInfo
  ###
  # class variable:
  # list of all known extractors (Class objects)
  # add to it using add_feature()
  @@extractors = []

  # boolean. set to true after warning messages have been given once
  @@warned = false

  ###
  # add interface/interpreter
  #
  # Registers a feature extractor class. The class must respond to
  # feature_name(); instances are created as class_name.new(exp)
  # by get_extractor_objects().
  def FredFeatureInfo.add_feature(class_name) # Class object
    @@extractors << class_name
  end

  ###
  # exp: experiment file object; exp.get_lf("feature") is expected to
  # list the features chosen by the user.
  #
  # For each chosen feature, the registered extractor class with the
  # same feature_name() is remembered. Features without a matching
  # extractor are skipped, with a warning on stderr.
  def initialize(exp)

    ##
    # make list of extractors that are
    # required by the user
    @features = []
    @exp = exp

    # user-chosen extractors:
    # returns array of pairs [feature group designator(string), options(array:string)]
    exp.get_lf("feature").each { |extractor_name, *_options|
      # no need to use the options here,
      # the feature extractors can get their options themselves.

      extractor = @@extractors.find { |e| e.feature_name() == extractor_name }

      if extractor
        @features << extractor
      elsif !@@warned
        # no extractor found matching the given designator
        $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
      end
    }

    # do not print warnings again if another FredFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_extractor_objects
  #
  # returns a list of feature extractor objects:
  # one fresh instance per chosen extractor class, each built
  # with the experiment file object given to new()
  def get_extractor_objects()
    return @features.map { |feature_class| feature_class.new(@exp) }
  end
end
59
+
60
+ ##################################
61
###
# Base class of all Fred feature extractors.
#
# Subclasses override feature_name() and each_feature(), and call
# announce_me() in their class body to register themselves with
# FredFeatureInfo.
class FredFeatureExtractor
  ###
  # feature name:
  # name by which you choose this feature
  # in the experiment file
  #
  # Must be overridden; the base implementation always raises.
  def FredFeatureExtractor.feature_name()
    raise "Overwrite me."
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    @exp = exp
  end

  ###
  # compute features from meta-features
  #
  # argument: hash
  # metafeature_label -> metafeatures
  # string -> array:string
  #
  # yields each feature as a string
  #
  # Must be overridden; the base implementation always raises.
  def each_feature(feature_hash)
    raise "overwrite me"
  end

  ######
  # NOTE: 'protected' only applies to instance methods defined after it.
  # announce_me() below is defined on the class object itself and
  # therefore stays callable from subclass bodies.
  protected

  ###
  # Register the calling class (self, i.e. the subclass on which
  # announce_me() is invoked) with FredFeatureInfo, if that class
  # has been loaded. Does nothing otherwise.
  def FredFeatureExtractor.announce_me()
    # Module.constants returns Symbols on Ruby >= 1.9, so comparing
    # against the String "FredFeatureInfo" never matched;
    # const_defined? works across Ruby versions.
    if Object.const_defined?(:FredFeatureInfo)
      # yup, we have a class to which we can announce ourselves.
      # self already is the class object, no eval() of its name needed
      FredFeatureInfo.add_feature(self)
    else
      # no interface collector class
      # $stderr.puts "Feature #{self.name()} not announced: no FredFeatureInfo."
    end
  end

end
102
+
103
+ #####
104
+ # context feature
105
class FredContextFeatureExtractor < FredFeatureExtractor
  FredContextFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def FredContextFeatureExtractor.feature_name()
    "context"
  end

  ###
  # exp: Fred experiment file object.
  # Remembers the metafeature labels ("CX" + size) of all context
  # sizes listed under feature "context" in the experiment file.
  def initialize(exp)
    super(exp)

    # label -> true, for constant-time lookup in each_feature()
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true
    end
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For every metafeature of a chosen context size, yields the label
  # followed by the third '#'-separated field of the metafeature.
  # Metafeatures containing "#####" are skipped.
  def each_feature(feature_hash)
    # position of the field to extract
    # (the original notes the layout as grf#word#lemma#pos#ne)
    field_position = 2

    feature_hash.each do |label, entries|
      # only context sizes chosen by the user are featurized
      next unless @cxsizes[label]

      entries.each do |entry|
        next if entry =~ /#####/

        yield label + entry.split("#")[field_position]
      end
    end
  end
end
143
+
144
+ #####
145
+ # context feature: POS separately, small contexts only
146
class FredContextPOSFeatureExtractor < FredFeatureExtractor
  FredContextPOSFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def FredContextPOSFeatureExtractor.feature_name()
    "context_pos"
  end

  ###
  # exp: Fred experiment file object.
  # Remembers the metafeature labels ("CX" + size) of those context
  # sizes listed under feature "context" that are at most 10,
  # and warns on stderr if there is none.
  def initialize(exp)
    super(exp)

    # label -> true, for constant-time lookup in each_feature()
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true if size <= 10
    end

    return unless @cxsizes.empty?

    $stderr.puts "context_pos feature warning: will not be computed"
    $stderr.puts "as there is no context of size <= 10"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For every metafeature of a retained context size, yields
  # "POS" + label + the third '#'-separated field of the metafeature
  # (the original notes the layout as word#lemma#pos#ne).
  def each_feature(feature_hash)
    field_position = 2

    feature_hash.each do |label, entries|
      # only the small context sizes retained in new() are featurized
      next unless @cxsizes[label]

      entries.each do |entry|
        yield "POS" + label + entry.split("#")[field_position]
      end
    end
  end
end
189
+
190
+ #####
191
+ # bigram/trigram feature
192
class FredNgramFeatureExtractor < FredFeatureExtractor
  FredNgramFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def FredNgramFeatureExtractor.feature_name()
    return "ngram"
  end

  ###
  # exp: Fred experiment file object.
  # Picks the context from which ngrams are computed and warns on
  # stderr if there is no suitable one.
  def initialize(exp)
    super(exp)

    # cxsize: context size from which the ngram feature will be computed:
    # the first size listed under feature "context" that is >= 2,
    # or nil if there is none
    @cxsize = @exp.get_lf("feature", "context").detect { |cxsize|
      cxsize >= 2
    }
    unless @cxsize
      $stderr.puts "Warning: no context of size >= 2, so"
      $stderr.puts "no ngram feature computed."
    end
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # Yields bigram and trigram features of lemmas and of POS tags,
  # computed from the metafeatures of the context size chosen in new().
  # Yields nothing if no suitable context size exists.
  def each_feature(feature_hash)
    # Without a context size there is nothing to compute (see the
    # warning in new()). Without this guard, a metafeature labelled
    # just "CX" would match below and fail on nil + integer.
    # feature_hash is returned to mirror the value of Hash#each below.
    return feature_hash unless @cxsize

    # word#lemma#pos#ne
    lemma_index = 1
    pos_index = 2

    # label of the one metafeature the ngrams are computed from
    context_label = "CX" + @cxsize.to_s()

    feature_hash.each { |ftype, fvalues|
      next unless ftype == context_label

      # compute the ngram features from this context
      # |fvalues| = 2*cxsize, that is, cxsize describes
      # the length of a one-sided context window
      # the bigram of features around the target
      # concerns fvalues[cxsize-1] and fvalues[cxsize]
      # the trigram of two words before, one word after includes
      # fvalues[cxsize-2], fvalues[cxsize-1] and fvalues[cxsize]

      [
        [[-1, 0], "BLEM", lemma_index],     # bigram of lemmas
        [[-1, 0], "BPOS", pos_index],       # bigram of POSs
        [[-2, -1, 0], "TLEM", lemma_index], # trigram of lemmas
        [[-2, -1, 0], "TPOS", pos_index]    # trigram of POSs
      ].each { |f_indices, label, subindex|
        fs = f_indices.map { |i| fvalues[@cxsize + i] }.compact()
        if fs.length() == f_indices.length()
          # we successfully extracted entries for all the given indices
          yield label + fs.map { |f| f.split("#")[subindex] }.join()
        end
      }
    }
  end
end
247
+
248
+
249
+ #####
250
+ # syntax feature
251
class FredSynFeatureExtractor < FredFeatureExtractor
  FredSynFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def FredSynFeatureExtractor.feature_name()
    "syntax"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For the syntactic metafeatures "CH", "PA" and "SI", yields the
  # label followed by one '#'-separated field of each metafeature:
  # the first field for "CH"/"PA", the second for "SI".
  # All other metafeatures are ignored.
  def each_feature(feature_hash)
    feature_hash.each do |label, entries|
      # position of the field to extract
      field_position =
        case label
        when "CH", "PA" then 0
        when "SI" then 1 # parentlemma#grf#word#lemma#pos#ne
        end

      # not a syntactic metafeature
      next if field_position.nil?

      entries.each do |entry|
        yield label + entry.split("#")[field_position]
      end
    end
  end
end
285
+
286
+
287
+
288
+
289
+ #####
290
+ # syntax-plus-headword feature
291
class FredSynsemFeatureExtractor < FredFeatureExtractor
  FredSynsemFeatureExtractor.announce_me()

  # designator of this feature in the experiment file
  def FredSynsemFeatureExtractor.feature_name()
    "synsem"
  end

  ###
  # feature_hash: metafeature label (string) -> metafeatures (array:string)
  #
  # For the syntactic metafeatures "CH", "PA" and "SI", yields
  # label + "SEM" + metafeature. For "SI" the first '#'-separated
  # field (the parent lemma) is removed from the metafeature first.
  # All other metafeatures are ignored.
  def each_feature(feature_hash)
    feature_hash.each do |label, entries|
      drop_first_field =
        case label
        when "CH", "PA" then false # grf#word#lemma#pos#ne
        when "SI" then true        # parentlemma#grf#word#lemma#pos#ne
        end

      # not a syntax feature
      next if drop_first_field.nil?

      entries.each do |entry|
        if drop_first_field
          # remove parent lemma
          yield label + "SEM" + entry.split("#")[1..-1].join("#")
        else
          yield label + "SEM" + entry
        end
      end
    end
  end
end