shalmaneser-fred 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,232 @@
1
+ # FredConventions
2
+ # Katrin Erk June 05
3
+ #
4
+ # several small things that should be uniform
5
+ # throughout the system
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/EnduserMode"
10
# Naming conventions that should be uniform throughout the Fred system.
# They are defined on Object so that any object can call them as bare functions.
class Object
  ###
  # joining and breaking up senses

  # Joins a list of sense strings into one canonical string:
  # the senses are sorted and separated by "++".
  def fred_join_senses(senses)
    senses.sort.join("++")
  end

  # Splits a string produced by fred_join_senses back into a list of senses.
  def fred_split_sense(joined_senses)
    joined_senses.split("++")
  end

  ###
  # fred_dirname
  #
  # constructs a directory name:
  # fred data directory / experiment ID / maindir / subdir
  #
  # exp:         FredConfigData object (read via exp.get)
  # maindir:     string: main part of directory name
  # subdir:      string: subpart of directory name
  # is_existing: string: "existing" or "new", default: existing
  #
  # if is_existing == "existing", the name is built with File.existing_dir,
  # if is_existing == "new", it is built with File.new_dir
  # (both helpers are defined outside this file; per the original notes the
  # former checks for existence and the latter creates the directory if needed).
  #
  # returns: whatever the File helper returns (a string, per the original notes)
  # raises:  RuntimeError for any other value of is_existing
  def fred_dirname(exp, maindir, subdir, is_existing = "existing")
    # validate first, so that an invalid mode never touches the configuration
    unless ["existing", "new"].include?(is_existing)
      raise "Shouldn't be here: #{is_existing}"
    end

    path_parts = [exp.get("fred_directory"), exp.get("experiment_ID"), maindir, subdir]
    if is_existing == "new"
      File.new_dir(*path_parts)
    else
      File.existing_dir(*path_parts)
    end
  end

  ####
  # filenames for feature files
  #
  # With do_binary set, the name carries the sense as well
  # (one feature file per lemma/sense pair); otherwise only the lemma.
  def fred_feature_filename(lemma, sense = nil, do_binary = false)
    if do_binary
      "fred.features.#{lemma}.SENSE.#{sense}"
    else
      "fred.features.#{lemma}"
    end
  end

  ####
  # filenames for split files
  def fred_split_filename(lemma)
    "fred.split.#{lemma}"
  end

  ###
  # deconstruct split filename
  # (any leading directory part is ignored)
  # returns: lemma, or nil if the name is not a split filename
  def deconstruct_fred_split_filename(filename)
    File.basename(filename)[/^fred\.split\.(.*)/, 1]
  end

  ###
  # deconstruct feature file name
  # (any leading directory part is ignored)
  # returns: hash with keys
  #   "lemma"
  #   "sense" (only present for binary feature files)
  # or nil if the name is not a feature filename
  def deconstruct_fred_feature_filename(filename)
    basename = File.basename(filename)

    if basename =~ /^fred\.features\.(.*)\.SENSE\.(.*)$/
      # binary: fred.features.#{lemma}.SENSE.#{sense}
      { "lemma" => $1, "sense" => $2 }
    elsif basename =~ /^fred\.features\.(.*)/
      # fred.features.#{lemma}
      { "lemma" => $1 }
    else
      # complete mismatch
      nil
    end
  end

  ####
  # filename for answer key files
  def fred_answerkey_filename(lemma)
    "fred.answerkey.#{lemma}"
  end

  ###
  # classifier directory
  #
  # exp:     FredConfigData object
  # splitID: string or nil
  #
  # Uses the user-specified "classifier_dir" entry of the experiment file if
  # there is one, and the directory "classifiers" below the Fred experiment
  # directory otherwise. Without a split ID, the latter uses the subdirectory "all".
  def fred_classifier_directory(exp, splitID = nil)
    user_dir = exp.get("classifier_dir")

    if user_dir
      # user-specified classifier directory
      splitID ? File.new_dir(user_dir, splitID) : File.new_dir(user_dir)
    else
      # my classifier directory
      fred_dirname(exp, "classifiers", splitID || "all", "new")
    end
  end

  ###
  # classifier file
  def fred_classifier_filename(classifier, lemma, sense = nil)
    if sense
      "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
    else
      "fred.classif.#{classifier}.LEMMA.#{lemma}"
    end
  end

  ###
  # deconstruct classifier file name
  # (any leading directory part is ignored, as in the other
  # deconstruct_* methods)
  # returns: hash with keys
  #   "lemma"
  #   "sense" (only present if the name contains a sense)
  # The hash is empty if the name is not a classifier filename.
  def deconstruct_fred_classifier_filename(filename)
    basename = File.basename(filename)

    if basename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/
      { "lemma" => $2, "sense" => $3 }
    elsif basename =~ /^fred\.classif\.(.*)\.LEMMA\.(.*)$/
      { "lemma" => $2 }
    else
      {}
    end
  end

  ###
  # result file
  # (dots in the lemma are replaced by underscores)
  def fred_result_filename(lemma)
    "fred.result.#{lemma.gsub(/\./, "_")}"
  end

  ##########
  # lemma and POS: combine into string separated by
  # a separator character
  #
  # fred_lemmapos_combine: take two strings, return combined string
  #    if POS is nil, returns lemma<separator character>
  # fred_lemmapos_separate: take one string, return two strings
  #    if no POS could be retrieved, returns nil as POS and the whole string as lemma
  #
  # Dots inside the POS are written as "DOT", so the last "." of a combined
  # string is always the separator.
  def fred_lemmapos_combine(lemma, pos)
    lemma.to_s + "." + pos.to_s.gsub(/\./, "DOT")
  end

  ###
  # Inverse of fred_lemmapos_combine: splits at the last ".".
  # A trailing separator with nothing after it (the result of combining with
  # a nil POS) yields the lemma without the separator and nil as POS.
  def fred_lemmapos_separate(lemmapos)
    lemma, separator, pos = lemmapos.rpartition(".")

    if separator.empty?
      # no POS found, treat all of lemmapos as lemma
      [lemmapos, nil]
    else
      [lemma, pos.empty? ? nil : pos]
    end
  end
end
189
+
190
########################################
# given a SynNode object representing a terminal,
# return:
# - the word
# - the lemma
# - the part of speech
# - the named entity (if any)
#
# as a tuple
#
# WARNING: word and lemma are returned in lowercase.
# The strings held by the node and the interpreter are left untouched.
module WordLemmaPosNe
  # syn_obj: SynNode object
  # i:       SynInterpreter class (must respond to lemma_backoff)
  #
  # returns: [word, lemma, pos, ne]; any of the entries may be nil.
  # For a non-terminal node, a warning is written to stderr and
  # four nils are returned.
  def word_lemma_pos_ne(syn_obj, i)
    unless syn_obj.is_terminal?
      $stderr.puts "Featurization warning: unexpectedly received non-terminal"
      return [nil, nil, nil, nil]
    end

    # downcase into a fresh string: the original belongs to the node
    word = syn_obj.word
    word = word.downcase if word

    lemma = i.lemma_backoff(syn_obj)
    # the lemma "<unknown>" (possibly XML-escaped) counts as no lemma at all
    lemma = nil if lemma && SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
    lemma = lemma.downcase if lemma

    pos = syn_obj.part_of_speech

    # prefer the node's own named entity label; fall back to "headof_ne"
    ne = syn_obj.get_attribute("ne") || syn_obj.get_attribute("headof_ne")

    [word, lemma, pos, ne]
  end
end
232
+
@@ -0,0 +1,319 @@
1
+ require "fred/FileZipped"
2
+
3
+ require "fred/fred_config_data"
4
+ require "common/SynInterfaces"
5
+ require "fred/FredConventions"
6
+
7
+
8
########################################
# target determination classes:
# either determine targets from existing annotation
# with frames,
# or use all known targets.
#
# Targets is the common base class. It keeps the list of known targets
# (lemma/POS combination -> list of senses) and reads it from / writes it to
# the file targets.txt.gz in the subdirectory "targets" of the classifier
# directory. Subclasses overwrite determine_targets.
class Targets
  # false if an existing target file was requested (mode "r" or "a")
  # but could not be opened, true otherwise
  attr_reader :targets_okay

  ###
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class, or nil
  # mode:              string: "r", "w", "a", as in files
  #
  # Mode "w" starts with an empty target list; "a" and "r" first read the
  # existing target file. Targets are recorded in modes "w" and "a".
  # Any other mode prints an error and exits.
  def initialize(exp, interpreter_class, mode)
    @exp = exp
    @interpreter_class = interpreter_class

    # keep recorded targets here.
    # try to read old list now.
    @targets = {}

    # write target info in the classifier directory.
    # This is _not_ dependent on a potential split ID
    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")

    @targets_okay = true
    case mode
    when "w"
      # start from scratch, no list of targets
    when "a", "r"
      # read existing file containing targets
      begin
        file = FileZipped.new(@dir + "targets.txt.gz")
      rescue
        # no target file present: signal this.
        # NOTE: this returns before @record_targets is set, so nothing
        # is recorded in this case, not even in mode "a".
        @targets_okay = false
        return
      end

      begin
        file.each { |line|
          line.chomp!
          entry = /^LEMMA (.+) SENSES (.+)$/.match(line)
          next unless entry

          # blanks in (multiword) lemmas are stored as underscores
          lemmapos = entry[1].gsub(/ /, '_')
          @targets[lemmapos] = entry[2].split
        }
      ensure
        # do not leave the read handle open
        file.close
      end
    else
      $stderr.puts "Error: shouldn't be here."
      exit 1
    end

    @record_targets = ["w", "a"].include?(mode)
  end

  ###
  # determine_targets:
  # for a given SalsaTigerSentence,
  # determine all targets,
  # each as a _single_ main terminal node
  #
  # We need a single terminal node in order
  # to compute the context window
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "sid": sentence ID
  #
  # This base class implementation only raises; subclasses overwrite it.
  def determine_targets(sent)
    raise "overwrite me"
  end

  ##
  # returns a list of lemma-pos combined strings
  def get_lemmas
    @targets.keys
  end

  ##
  # access to lemmas and POS, returns a list of pairs [lemma, pos] (string*string)
  def get_lemma_pos
    @targets.keys.map { |lemmapos| fred_lemmapos_separate(lemmapos) }
  end

  ##
  # access to senses
  # lemmapos: string, result of fred_lemmapos_combine
  # returns: list of senses, empty for an unknown lemma/POS combination
  def get_senses(lemmapos)
    @targets[lemmapos] || []
  end

  ##
  # write file:
  # one line "LEMMA <lemmapos> SENSES <sense> <sense> ..." per known target.
  # Prints an error and exits if the file cannot be opened for writing.
  def done_reading_targets
    begin
      file = FileZipped.new(@dir + "targets.txt.gz", "w")
    rescue
      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
      exit 1
    end

    begin
      @targets.each_pair { |lemma, senses|
        file.puts "LEMMA #{lemma} SENSES " + senses.join(" ")
      }
    ensure
      # close the handle even if writing fails halfway
      file.close
    end
  end

  ###############################
  protected

  ##
  # record: record occurrence of a lemma/sense pair
  # in the <@targets> data structure
  #
  # target_info: hash, of which the entries "lex", "pos" and "sense" are used.
  # Each sense is stored at most once per lemma/POS combination.
  def record(target_info)
    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
    known_senses = (@targets[lemmapos] ||= [])

    sense = target_info["sense"]
    known_senses << sense unless known_senses.include?(sense)
  end
end
+
144
+ ########################################
145
# Target determination from existing frame annotation.
class FindTargetsFromFrames < Targets
  ###
  # determine_targets:
  # use existing frames to find targets
  #
  # st_sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string
  #   "obj":   FrameNode object
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "pos":   part of speech
  #   "sid": sentence ID
  #
  # Frames without a target, and frames whose main node is not a terminal,
  # are skipped. Each sense found is also recorded if recording is switched on.
  def determine_targets(st_sent)
    senses_by_target = {}

    st_sent.each_frame do |frame_obj|
      target_node = frame_obj.target
      # no target, nothing to record
      next if target_node.nil? or target_node.children.empty?

      # instance-specific computation:
      # target and target positions
      # WARNING: at this moment, we are
      # not considering true multiword targets for German.
      # Remove the "no_mwe" parameter in main_node_of_expr
      # to change this
      all_targets = target_node.children
      main_node =
        if @exp.get("language") == "de"
          # don't consider true multiword targets for German
          @interpreter_class.main_node_of_expr(all_targets, "no_mwe")
        else
          # for all other languages: try to figure out the head target word
          # anyway
          @interpreter_class.main_node_of_expr(all_targets)
        end

      # don't use parts of a word as main node
      main_node = main_node.parent if main_node and main_node.is_splitword?
      next unless main_node and main_node.is_terminal?

      key = [all_targets.map { |node| node.id }, main_node.id]
      senses_by_target[key] ||= []

      # gold POS available, may be in wrong form,
      # i.e. not the same strings that @interpreter_class.category()
      # would return
      gold_pos = target_node.get_attribute("pos")
      pos =
        case gold_pos
        when /^[Vv]$/ then "verb"
        when /^[Nn]$/ then "noun"
        when /^[Aa]$/ then "adj"
        when nil then @interpreter_class.category(main_node)
        else gold_pos
        end

      target_info = {
        "sense" => frame_obj.name,
        "obj" => frame_obj,
        "all_targets" => all_targets.map { |node| node.id },
        "lex" => target_node.get_attribute("lemma"),
        "pos" => pos,
        "sid" => st_sent.id
      }

      senses_by_target[key] << target_info
      record(target_info) if @record_targets
    end

    senses_by_target
  end
end
+
231
+ ########################################
232
# Target determination without annotation:
# use all known lemmas, minus stopwords.
class FindAllTargets < Targets
  ###
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class
  #
  # Reads the target info from file (mode "r" of the superclass).
  def initialize(exp, interpreter_class)
    # read target info from file
    super(exp, interpreter_class, "r")
    @training_lemmapos_pairs = get_lemma_pos

    # the same pairs as hash keys, so that the membership test done
    # for every terminal does not scan the whole list
    @training_lemmapos_lookup = {}
    @training_lemmapos_pairs.each { |pair| @training_lemmapos_lookup[pair] = true }

    # list of words to exclude from assignment, for now
    @stoplemmas = [
      "have",
      "do",
      "be"
      # "make"
    ]
  end

  ####
  # determine_targets:
  # sent: SalsaTigerSentence object
  #
  # returns:
  # hash: target_IDs -> list of senses
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  #   where a sense is represented as a hash:
  #   "sense": sense, a string (here always nil)
  #   "obj":   FrameNode object (here always nil)
  #   "all_targets": list of node IDs, may comprise more than a single node
  #   "lex":   lemma, or multiword expression in canonical form
  #   "pos":   part of speech
  #   "sid": sentence ID
  def determine_targets(sent)
    # map target IDs to list of senses, in our case always a single entry
    # with sense nil, because we assume that the senses of the targets
    # we point out are unknown
    retv = {}

    # iterate through terminals of the sentence, check for inclusion
    # of their lemma/POS pair in the training data
    sent.each_terminal do |node|
      lemma = @interpreter_class.lemma_backoff(node)
      pos = @interpreter_class.category(node)

      # we know this lemma from the training data,
      next unless @training_lemmapos_lookup.key?([lemma, pos])
      # and it is not an auxiliary,
      next if @interpreter_class.auxiliary?(node)
      # and it is not in the stopword list
      next if @stoplemmas.include?(lemma)
      # and the node does not represent a preposition
      next if pos == "prep"

      # take this as a target.
      node_id = node.id
      retv[[[node_id], node_id]] = [
        {
          "sense" => nil,
          "obj" => nil,
          "all_targets" => [node_id],
          "lex" => lemma,
          "pos" => pos,
          "sid" => sent.id
        }
      ]
      # no recording of target info,
      # since we haven't determined anything new
    end

    retv
  end
end
+