frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,312 @@
1
+ # FredEval
2
+ # Katrin Erk April 05
3
+ #
4
+ # Frame disambiguation system: evaluate classification results
5
+ #
6
+ # While the other main classes of Fred just provide a new() method
7
+ # and a compute() method,
8
+ # the FredEval class also provides access methods to all the
9
+ # individual evaluation results and allows for a flag that
10
+ # suppresses evaluation output to a file --
11
+ # such that this package can also be used by external systems that
12
+ # wish to evaluate Fred.
13
+ #
14
+ # Inherits from the Eval class that is not Fred-specific
15
+
16
+ # Salsa packages
17
+ require "common/Eval"
18
+ require "common/ruby_class_extensions"
19
+
20
+ # Fred packages
21
+ require "fred/FredConfigData"
22
+ require "fred/FredConventions"
23
+ require "fred/FredFeatures"
24
+ require "fred/FredDetermineTargets"
25
+
26
class FredEval < Eval

  ###
  # new
  #
  # Evaluate runtime options, initialize the Eval superclass
  # and announce the task on stderr.
  #
  # exp_obj: FredConfigData object
  # options: hash: runtime option name (string) => value (string)
  #
  # Exits with status 1 if the list of known targets cannot be read.
  def initialize(exp_obj, options)

    in_enduser_mode_unavailable()

    @exp = exp_obj

    ###
    # evaluate runtime options
    @split_id = nil
    logfilename = nil

    options.each_pair { |opt, arg|
      case opt
      when "--logID"
        @split_id = arg
      when "--printLog"
        logfilename = fred_dirname(@exp, "eval", "log", "new") +
                      "eval_logfile.txt"
      else
        # case of unknown arguments has been dealt with by fred.rb
      end
    }

    ###
    # make outfile name
    outfilename = fred_dirname(@exp, "eval", "eval", "new") +
                  "eval.txt"

    ###
    # do we regard all senses as assigned,
    # as long as they surpass some threshold?
    # if we are doing multilabel evaluation, we need the full list of senses
    @threshold = @exp.get("assignment_confidence_threshold")
    @target_obj = Targets.new(@exp, nil, "r")
    unless @target_obj.targets_okay
      # error during initialization
      $stderr.puts "Error: Could not read list of known targets, bailing out."
      exit 1
    end

    @multiple_senses_assigned =
      (@threshold || @exp.get("handle_multilabel") == "keep") ? true : false

    ###
    # initialize abstract class behind me
    if @multiple_senses_assigned
      # we are possibly assigning more than one sense: do precision/recall
      # instead of accuracy:
      # "true" is what "this sense has been assigned" is mapped to below.
      super(outfilename, logfilename, "true")
    else
      super(outfilename, logfilename)
    end

    # what is being done with instances with multiple sense labels?
    @handle_multilabel = @exp.get("handle_multilabel")

    ###
    # announce the task
    $stderr.puts "---------"
    $stderr.print "Fred experiment #{@exp.get("experiment_ID")}: Evaluating classifiers"
    # The split is identified by @split_id (set from --logID above);
    # testing any other variable here would never mention the split.
    if @split_id
      $stderr.puts " using split with ID #{@split_id}"
    else
      $stderr.puts
    end
    if @multiple_senses_assigned
      $stderr.puts "Allowing for the assignment of multiple senses,"
      $stderr.puts "computing precision and recall against the full sense list of a lemma."
    end
    $stderr.puts "Writing result to #{fred_dirname(@exp, "eval", "eval")}"
    $stderr.puts "---------"
  end

  #####
  protected

  ###
  # each_group
  #
  # yield each group name in turn
  # in our case, group names are lemmas
  #
  # also, set object-global variables in such a way
  # that the elements of this group can be read:
  #   @classfile   open classifier output file for the lemma, or nil
  #                if it could not be opened
  #   @goldreader  AnswerKeyAccess object for the lemma's answer key
  #   @all_senses  all senses of the lemma (multilabel evaluation only),
  #                else nil
  #
  # The classifier output file is closed again once the block returns.
  # NOTE(review): this assumes that each_instance() is only called from
  # within the block given to each_group() -- confirm against Eval.
  def each_group()

    # access to classifier output files
    output_dir = fred_dirname(@exp, "output", "tab")

    # iterate through instance files
    @target_obj.get_lemmas().sort().each { |lemma|
      # progress report
      if @exp.get("verbose")
        $stderr.puts "Evaluating " + lemma
      end

      # file with classification results
      begin
        @classfile = File.new(output_dir + fred_result_filename(lemma))
      rescue
        # no classification results
        @classfile = nil
      end

      begin
        # file with answers:
        # maybe we need to apply a split first
        if @split_id
          @goldreader = AnswerKeyAccess.new(@exp, "train", lemma, "r", @split_id, "test")
        else
          @goldreader = AnswerKeyAccess.new(@exp, "test", lemma, "r")
        end

        # doing multilabel evaluation?
        # then we need a list of all senses
        if @multiple_senses_assigned
          @all_senses = @target_obj.get_senses(lemma)
        else
          @all_senses = nil
        end

        yield lemma
      ensure
        # do not leak one file handle per lemma
        if @classfile and not @classfile.closed?
          @classfile.close()
        end
      end
    }
  end

  ###
  # each_instance
  #
  # given a lemma name, yield each instance of this lemma in turn,
  # or rather: yield pairs [gold_class(string), assigned_class(string)]
  #
  # With a confidence threshold set, one pair is yielded per sense in
  # @all_senses; otherwise one pair [gold_class, "true"] is yielded per
  # sense in the highest-confidence sense list, and nothing is yielded
  # for instances without any classifier output.
  #
  # relies on each_group() having set the appropriate readers
  # @goldreader and @classfile
  #
  # Raises a RuntimeError if a threshold is set but @all_senses is not.
  def each_instance(lemma) # string: lemma name

    # watch out for repeated instances
    # which may occur if handle_multilabel = repeat.
    # Only yield them once to avoid re-evaluating multi-label instances
    #
    # instance_ids_seen: hash target_ids -> true/nil
    instance_ids_seen = Hash.new()

    # read gold file and classifier output file in parallel
    # (block parameters carry a leading underscore when unused, so that
    # they do not shadow the 'lemma' method parameter)
    @goldreader.each { |_lemma, _pos, target_ids, _sid, senses_gold, _transformed_gold_senses|

      # classline: format
      # (label confidence)*
      # such that the label with the highest confidence is first
      #
      # The line is read before the duplicate check on purpose: one
      # classifier line is consumed for every gold entry, repeated or not.
      classline = nil
      if @classfile
        classline = @classfile.gets()
      end
      if classline.nil?
        classline = ""
      end

      # have we done this same instance previously?
      if instance_ids_seen[target_ids]
        next
      end
      # instance not seen previously, but mark as seen now.
      instance_ids_seen[target_ids] = true

      # determine all sense/confidence pairs
      # senses assigned: list of pairs [senselist, confidence]
      # where senselist is an array of sense strings
      senses_assigned = Array.new()
      current_sense = nil

      classline.split().each_with_index { |entry, index|
        if index % 2 == 0
          # we have a sense label
          if @handle_multilabel == "join"
            # split up joined senses
            current_sense = fred_split_sense(entry)
          else
            current_sense = [entry]
          end
        else
          # we have a confidence level
          senses_assigned << [current_sense, entry.to_f()]
        end
      }

      if @threshold
        # multiple senses assigned, and
        # regard as assigned everything above a given threshold

        # transform senses_assigned:
        # in the case of "join", one sense may have several confidence levels,
        # one on its own and one in a joined sense
        best_confidence = Hash.new()
        senses_assigned.each { |senses, confidence|
          senses.each { |s|
            # assign to each sense the maximum of its previous confidence
            # and this one.
            # watch out: confidence may be smaller than zero
            if best_confidence[s]
              best_confidence[s] = [best_confidence[s], confidence].max()
            else
              best_confidence[s] = confidence
            end
          }
        }

        # retain the senses whose confidence is at or above the threshold
        senses_assigned = best_confidence.keys().select { |sense|
          best_confidence[sense] >= @threshold
        }

        unless @all_senses
          raise "Shouldn't be here"
        end

        # for each sense out of the list of all senses:
        # yield a pair of [applies, has been assigned]
        # both 'applies' and 'has been assigned' will be
        # a string of either 'true' or 'false'
        # assignment is accurate if both are the same
        @all_senses.each { |sense_of_lemma|
          gold_class = (senses_gold.include? sense_of_lemma).to_s()
          assigned_class = (senses_assigned.include? sense_of_lemma).to_s()
          yield [gold_class, assigned_class]
        }

      else
        # regard only one sense as assigned at a time
        # count as correct if the list of gold classes
        # contains the main assigned class
        # (relatively lenient evaluation)

        # nothing to yield if the classifier gave no output
        next if senses_assigned.empty?

        # actually assigned class: only the one with the
        # maximum confidence
        max_senselist = senses_assigned.max { |a, b|
          a.last() <=> b.last()
        }.first()

        max_senselist.each { |single_sense|
          gold_class = (senses_gold.include? single_sense).to_s()
          yield [gold_class, "true"]
        }
      end
    }
  end

end
@@ -0,0 +1,321 @@
1
class FredFeatureInfo
  ###
  # class variable:
  # registry of all known extractor classes;
  # extended via FredFeatureInfo.add_feature()
  @@extractors = Array.new

  # boolean. becomes true once the first FredFeatureInfo object has been
  # built, so that "unknown extractor" warnings are only printed while
  # building that first object
  @@warned = false

  ###
  # register a feature extractor class
  #
  # class_name: Class object responding to feature_name() and new(exp)
  def self.add_feature(class_name)
    @@extractors << class_name
  end

  ###
  # Select, out of the registered extractor classes, those that the
  # experiment file asks for.
  #
  # exp: experiment file object; exp.get_lf("feature") is expected to list
  #      entries of the form [feature group designator(string), options...]
  def initialize(exp)
    @exp = exp
    @features = []

    exp.get_lf("feature").each do |extractor_name, *_options|
      # the options are not needed here:
      # the feature extractors can get their options themselves.
      chosen = @@extractors.find { |candidate| candidate.feature_name == extractor_name }

      if chosen
        @features << chosen
      elsif !@@warned
        # no extractor found matching the given designator
        $stderr.puts "Warning: Could not find a feature extractor for #{extractor_name}: skipping."
      end
    end

    # do not print warnings again if another FredFeatureInfo object is made
    @@warned = true
  end

  ###
  # get_extractor_objects
  #
  # returns a list of freshly built feature extractor objects,
  # one per selected extractor class, each initialized with the
  # experiment file object
  def get_extractor_objects
    @features.map { |feature_class| feature_class.new(@exp) }
  end
end
59
+
60
##################################
# Abstract base class of the Fred feature extractors.
#
# Subclasses override feature_name() and each_feature(), and register
# themselves with FredFeatureInfo by calling announce_me() in their
# class body.
class FredFeatureExtractor
  ###
  # feature name:
  # name by which you choose this feature
  # in the experiment file
  #
  # Raises a RuntimeError unless overridden by the subclass.
  def FredFeatureExtractor.feature_name()
    raise "Overwrite me."
  end

  ###
  # initialize with Fred experiment file object
  def initialize(exp)
    @exp = exp
  end

  ###
  # compute features from meta-features
  #
  # argument: hash
  #  metafeature_label -> metafeatures
  #  string            -> array:string
  #
  # yields each feature as a string
  #
  # Raises a RuntimeError unless overridden by the subclass.
  def each_feature(feature_hash)
    raise "overwrite me"
  end

  ###
  # announce_me
  #
  # Register the calling class with FredFeatureInfo, if that collector
  # class has been loaded; otherwise do nothing.
  #
  # This stays a public class method: subclasses invoke it with an
  # explicit receiver from their class body, and a 'protected' marker
  # has no effect on 'def SomeClass.method' definitions anyway.
  def FredFeatureExtractor.announce_me()
    # Module.constants yields Symbols on Ruby >= 1.9, so comparing it
    # against a String never matches; const_defined? works either way.
    if Object.const_defined?(:FredFeatureInfo)
      # yup, we have a class to which we can announce ourselves.
      # 'self' is the class announce_me() was called on; no need to
      # eval its name (which would also fail for anonymous classes).
      FredFeatureInfo.add_feature(self)
    end
  end

end
102
+
103
#####
# context feature
#
# For every context window size chosen in the experiment file, yields
# one feature per entry of that window: the metafeature label followed
# by one "#"-separated field of the entry.
class FredContextFeatureExtractor < FredFeatureExtractor
  FredContextFeatureExtractor.announce_me()

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "context"
  end

  ###
  # exp: Fred experiment file object
  def initialize(exp)
    super(exp)

    # cxsizes: metafeature labels ("CX" + size) of the context sizes
    # chosen as features, kept as hash keys for fast membership tests
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true
    end
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields each feature as a string
  def each_feature(feature_hash)
    # grf#word#lemma#pos#ne
    # NOTE(review): the ngram extractor in this file documents CX entries
    # as word#lemma#pos#ne and reads the lemma at index 1 -- confirm which
    # layout CX metafeatures really have.
    lemma_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context features of a size chosen by the user are featurized
      next unless @cxsizes[ftype]

      fvalues.each do |entry|
        # skip entries that contain five '#' in a row
        next if entry =~ /#####/
        yield ftype + entry.split("#")[lemma_index]
      end
    end
  end
end
143
+
144
#####
# context feature: POS separately, small contexts only
#
# Like the context feature, but yields the part-of-speech field of each
# context entry, and only for context sizes of at most 10.
class FredContextPOSFeatureExtractor < FredFeatureExtractor
  FredContextPOSFeatureExtractor.announce_me()

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "context_pos"
  end

  ###
  # exp: Fred experiment file object
  #
  # Prints a warning to stderr if no chosen context size is <= 10.
  def initialize(exp)
    super(exp)

    # cxsizes: metafeature labels ("CX" + size) of the sufficiently small
    # context sizes chosen as features, kept as hash keys for fast access
    @cxsizes = {}
    @exp.get_lf("feature", "context").each do |size|
      @cxsizes["CX#{size}"] = true if size <= 10
    end

    return unless @cxsizes.empty?

    $stderr.puts "context_pos feature warning: will not be computed"
    $stderr.puts "as there is no context of size <= 10"
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields each feature as a string: "POS" + label + POS field
  def each_feature(feature_hash)
    # word#lemma#pos#ne
    pos_index = 2

    feature_hash.each do |ftype, fvalues|
      # only context features of a size chosen by the user are featurized
      next unless @cxsizes[ftype]

      fvalues.each do |entry|
        yield "POS" + ftype + entry.split("#")[pos_index]
      end
    end
  end
end
189
+
190
#####
# bigram/trigram feature
#
# Yields lemma and POS bigrams/trigrams, computed from the entries
# around the middle of one context window.
class FredNgramFeatureExtractor < FredFeatureExtractor
  FredNgramFeatureExtractor.announce_me()

  def FredNgramFeatureExtractor.feature_name()
    return "ngram"
  end

  ###
  # exp: Fred experiment file object
  #
  # Prints a warning to stderr if no chosen context size is >= 2.
  def initialize(exp)
    super(exp)

    # cxsize: context size from which the ngram feature will be computed:
    # the first chosen context size that is at least 2, or nil if none is
    @cxsize = @exp.get_lf("feature", "context").detect { |cxsize|
      cxsize >= 2
    }
    unless @cxsize
      $stderr.puts "Warning: no context of size >= 2, so"
      $stderr.puts "no ngram feature computed."
    end
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields each ngram feature as a string: a label (BLEM, BPOS, TLEM,
  # TPOS) followed by the concatenated lemma or POS fields.
  # Yields nothing if no usable context size was configured.
  def each_feature(feature_hash)
    # Without a context size there is nothing to compute, as announced
    # in initialize(). Without this guard a metafeature labeled exactly
    # "CX" would match below and fail on nil + Integer.
    return feature_hash unless @cxsize

    # word#lemma#pos#ne
    lemma_index = 1
    pos_index = 2

    # label of the one context window we read from
    wanted_ftype = "CX" + @cxsize.to_s()

    feature_hash.each { |ftype, fvalues|
      next unless ftype == wanted_ftype

      # compute the ngram features from this context
      # |fvalues| = 2*cxsize, that is, cxsize describes
      # the length of a one-sided context window
      # the bigram of features around the target
      # concerns fvalues[cxsize-1] and fvalues[cxsize]
      # the trigram of two words before, one word after includes
      # fvalues[cxsize-2], fvalues[cxsize-1] and fvalues[cxsize]
      [
        [[-1, 0],     "BLEM", lemma_index], # bigram of lemmas
        [[-1, 0],     "BPOS", pos_index],   # bigram of POSs
        [[-2, -1, 0], "TLEM", lemma_index], # trigram of lemmas
        [[-2, -1, 0], "TPOS", pos_index]    # trigram of POSs
      ].each { |f_indices, label, subindex|
        fs = f_indices.map { |i| fvalues[@cxsize + i] }.compact()
        if fs.length() == f_indices.length()
          # we successfully extracted entries for all the given indices
          yield label + fs.map { |f| f.split("#")[subindex] }.join()
        end
      }
    }
  end
end
247
+
248
+
249
#####
# syntax feature
#
# Yields, for each CH, PA and SI metafeature, the metafeature label
# followed by one "#"-separated field of the entry (the grammatical
# function, going by the layout comments below).
class FredSynFeatureExtractor < FredFeatureExtractor
  FredSynFeatureExtractor.announce_me()

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "syntax"
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields each feature as a string
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      # position of the grf field within an entry, per metafeature type
      grf_index =
        case ftype
        when "CH", "PA"
          0
        when "SI"
          # parentlemma#grf#word#lemma#pos#ne
          1
        end

      # not a syntactic metafeature
      next if grf_index.nil?

      fvalues.each do |entry|
        yield ftype + entry.split("#")[grf_index]
      end
    end
  end
end
285
+
286
+
287
+
288
+
289
#####
# syntax-plus-headword feature
#
# Yields, for each CH, PA and SI metafeature, the metafeature label,
# the marker "SEM" and the entry itself (for SI: the entry without its
# first "#"-separated field).
class FredSynsemFeatureExtractor < FredFeatureExtractor
  FredSynsemFeatureExtractor.announce_me()

  # name by which this feature is chosen in the experiment file
  def self.feature_name
    "synsem"
  end

  ###
  # feature_hash: metafeature_label(string) -> metafeatures(array:string)
  #
  # yields each feature as a string
  def each_feature(feature_hash)
    feature_hash.each do |ftype, fvalues|
      case ftype
      when "CH", "PA"
        # grf#word#lemma#pos#ne
        prefix = ftype + "SEM"
        fvalues.each { |entry| yield prefix + entry }

      when "SI"
        # parentlemma#grf#word#lemma#pos#ne
        # remove parent lemma
        prefix = ftype + "SEM"
        fvalues.each do |entry|
          fields = entry.split("#")
          yield prefix + fields[1..-1].join("#")
        end

      else
        # not a syntax feature
      end
    end
  end
end