shalmaneser-fred 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ # FredConventions
2
+ # Katrin Erk June 05
3
+ #
4
+ # several small things that should be uniform
5
+ # throughout the system
6
+
7
+ require "common/ruby_class_extensions"
8
+
9
+ require "common/EnduserMode"
10
class Object

  ###
  # Combine a list of senses into one canonical string:
  # senses are sorted, then glued together with "++".
  def fred_join_senses(senses)
    senses.sort.join("++")
  end

  ###
  # Inverse of fred_join_senses: break a joined sense string
  # back into the list of individual senses.
  def fred_split_sense(joined_senses)
    joined_senses.split("++")
  end

  ###
  # fred_dirname
  #
  # Construct a directory name of the form
  #   fred data directory / experiment ID / maindir / subdir
  #
  # is_existing == "existing": the directory is checked for existence
  # is_existing == "new":      the directory is created if necessary
  #
  # returns: a string
  def fred_dirname(exp,      # FredConfigData object
                   maindir,  # string: main part of directory name
                   subdir,   # string: subpart of directory name
                   is_existing = "existing") # string: "existing" or "new"
    case is_existing
    when "existing"
      File.existing_dir(exp.get("fred_directory"),
                        exp.get("experiment_ID"),
                        maindir, subdir)
    when "new"
      File.new_dir(exp.get("fred_directory"),
                   exp.get("experiment_ID"),
                   maindir, subdir)
    else
      raise "Shouldn't be here: #{is_existing}"
    end
  end

  ####
  # Feature file name for a lemma.
  # In binary mode the sense is encoded in the name as well.
  def fred_feature_filename(lemma, sense = nil,
                            do_binary = false)
    do_binary ? "fred.features.#{lemma}.SENSE.#{sense}" : "fred.features.#{lemma}"
  end

  ####
  # Split file name for a lemma.
  def fred_split_filename(lemma)
    "fred.split.#{lemma}"
  end

  ###
  # Deconstruct a split file name.
  # returns: the lemma, or nil if the name does not match.
  def deconstruct_fred_split_filename(filename)
    base = File.basename(filename)
    base =~ /^fred\.split\.(.*)/ ? $1 : nil
  end

  ###
  # Deconstruct a feature file name.
  # returns: hash with key "lemma" and, for binary feature files,
  # key "sense"; nil on a complete mismatch.
  def deconstruct_fred_feature_filename(filename)
    case File.basename(filename)
    when /^fred\.features\.(.*)\.SENSE\.(.*)$/
      # binary: fred.features.<lemma>.SENSE.<sense>
      { "lemma" => $1, "sense" => $2 }
    when /^fred\.features\.(.*)/
      # fred.features.<lemma>
      { "lemma" => $1 }
    else
      # complete mismatch
      nil
    end
  end

  ####
  # Answer key file name for a lemma.
  def fred_answerkey_filename(lemma)
    "fred.answerkey.#{lemma}"
  end

  ###
  # Classifier directory:
  # the user-specified classifier_dir if set in the experiment file,
  # otherwise a directory below the fred data directory.
  # The directory is created if necessary.
  def fred_classifier_directory(exp,           # FredConfigData object
                                splitID = nil) # string or nil
    user_dir = exp.get("classifier_dir")
    if user_dir
      # user-specified classifier directory
      splitID ? File.new_dir(user_dir, splitID) : File.new_dir(user_dir)
    else
      # my classifier directory
      fred_dirname(exp, "classifiers", splitID || "all", "new")
    end
  end

  ###
  # Classifier file name for a classifier/lemma (and optionally sense).
  def fred_classifier_filename(classifier, lemma, sense = nil)
    if sense
      "fred.classif.#{classifier}.LEMMA.#{lemma}.SENSE.#{sense}"
    else
      "fred.classif.#{classifier}.LEMMA.#{lemma}"
    end
  end

  ###
  # Deconstruct a classifier file name.
  # returns: hash with key "lemma" and possibly "sense";
  # an empty hash when the name does not match.
  def deconstruct_fred_classifier_filename(filename)
    if (m = filename.match(/^fred\.classif\.(.*)\.LEMMA\.(.*)\.SENSE\.(.*)$/))
      { "lemma" => m[2], "sense" => m[3] }
    elsif (m = filename.match(/^fred\.classif\.(.*)\.LEMMA\.(.*)$/))
      { "lemma" => m[2] }
    else
      {}
    end
  end

  ###
  # Result file name; dots in the lemma are mapped to underscores.
  def fred_result_filename(lemma)
    "fred.result.#{lemma.gsub(/\./, "_")}"
  end

  ##########
  # lemma and POS: combine into a string separated by "."
  #
  # fred_lemmapos_combine: take two strings, return the combined string;
  #   dots inside the POS are escaped as "DOT";
  #   if POS is nil, returns "<lemma>."
  def fred_lemmapos_combine(lemma, # string
                            pos)   # string
    "#{lemma}.#{pos.to_s.gsub(/\./, "DOT")}"
  end

  ###
  # fred_lemmapos_separate: take one combined string, return [lemma, pos];
  #   if no POS can be recovered, returns the whole string as lemma
  #   and nil as POS.
  def fred_lemmapos_separate(lemmapos) # string
    pieces = lemmapos.split(".")
    if pieces.length > 1
      [pieces[0..-2].join("."), pieces.last]
    else
      # no POS found, treat all of lemmapos as lemma
      [lemmapos, nil]
    end
  end
end
189
+
190
+ ########################################
191
+ # given a SynNode object representing a terminal,
192
+ # return:
193
+ # - the word
194
+ # - the lemma
195
+ # - the part of speech
196
+ # - the named entity (if any)
197
+ #
198
+ # as a tuple
199
+ #
200
+ # WARNING: word and lemma are turned to lowercase
201
########################################
# Given a SynNode object representing a terminal, return the tuple
#   [word, lemma, part of speech, named entity (if any)]
#
# WARNING: word and lemma are turned to lowercase (in place).
module WordLemmaPosNe
  # syn_obj: SynNode object (must be a terminal)
  # i:       SynInterpreter class
  #
  # For a non-terminal, prints a warning and returns [nil, nil, nil, nil].
  # A lemma whose unescaped form is "<unknown>" is treated as absent.
  def word_lemma_pos_ne(syn_obj, i)
    unless syn_obj.is_terminal?
      $stderr.puts "Featurization warning: unexpectedly received non-terminal"
      return [nil, nil, nil, nil]
    end

    word = syn_obj.word()
    word.downcase! if word

    lemma = i.lemma_backoff(syn_obj)
    # treat the parser's "<unknown>" marker as no lemma at all
    lemma = nil if lemma and SalsaTigerXMLHelper.unescape(lemma) == "<unknown>"
    lemma.downcase! if lemma

    pos = syn_obj.part_of_speech()

    # direct named entity, falling back to "head of NE"
    ne = syn_obj.get_attribute("ne") || syn_obj.get_attribute("headof_ne")

    [word, lemma, pos, ne]
  end
end
232
+
@@ -0,0 +1,319 @@
1
+ require "fred/FileZipped"
2
+
3
+ require "fred/fred_config_data"
4
+ require "common/SynInterfaces"
5
+ require "fred/FredConventions"
6
+
7
+
8
+ ########################################
9
+ # target determination classes:
10
+ # either determine targets from existing annotation
11
+ # with frames,
12
+ # or use all known targets.
13
########################################
# target determination classes:
# either determine targets from existing annotation with frames,
# or use all known targets.
#
# This base class manages the recorded target list
# (combined lemma/POS string -> list of senses), stored in
# <classifier directory>/targets/targets.txt.gz
class Targets
  # true if the stored target list could be read (or none was needed)
  attr_reader :targets_okay

  ###
  # exp:               experiment file object
  # interpreter_class: SynInterpreter class, or nil
  # mode:              string "r", "w", "a", as in files
  def initialize(exp, interpreter_class, mode)
    @exp = exp
    @interpreter_class = interpreter_class

    # recorded targets live here; try to read an old list now
    @targets = {}

    # write target info in the classifier directory.
    # This is _not_ dependent on a potential split ID
    @dir = File.new_dir(fred_classifier_directory(@exp), "targets")

    @targets_okay = true
    case mode
    when "w"
      # start from scratch, no list of targets
    when "a", "r"
      # read existing file containing targets
      begin
        file = FileZipped.new(@dir + "targets.txt.gz")
      rescue
        # no stored target list present: signal this
        @targets_okay = false
        return
      end
      file.each { |line|
        line.chomp!
        next unless line =~ /^LEMMA (.+) SENSES (.+)$/
        lemmapos = $1
        senses = $2.split()
        # normalize spaces inside the lemma to underscores
        lemmapos.gsub!(/ /, '_')
        @targets[lemmapos] = senses
      }
    else
      $stderr.puts "Error: shouldn't be here."
      exit 1
    end

    # record new targets only in the modes that may extend the list
    @record_targets = ["w", "a"].include?(mode)
  end

  ###
  # determine_targets:
  # for a given SalsaTigerSentence, determine all targets,
  # each as a _single_ main terminal node
  # (a single terminal is needed to compute the context window).
  #
  # returns:
  #   hash: target_IDs -> list of senses,
  #   where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #   and a sense is a hash with keys
  #   "sense", "obj", "all_targets", "lex", "sid"
  #
  # Abstract: subclasses must override.
  def determine_targets(sent)
    raise "overwrite me"
  end

  ##
  # returns a list of lemma-pos combined strings
  def get_lemmas()
    @targets.keys
  end

  ##
  # access to lemmas and POS:
  # returns a list of pairs [lemma, pos] (string*string)
  def get_lemma_pos()
    @targets.keys.map { |lemmapos| fred_lemmapos_separate(lemmapos) }
  end

  ##
  # senses recorded for a combined lemma/POS string
  # (result of fred_lemmapos_combine); empty list if unknown
  def get_senses(lemmapos)
    @targets[lemmapos] || []
  end

  ##
  # write the recorded target list back to the classifier directory
  def done_reading_targets()
    begin
      file = FileZipped.new(@dir + "targets.txt.gz", "w")
    rescue
      $stderr.puts "Error: Could not write file #{@dir}targets.txt.gz"
      exit 1
    end

    @targets.each_pair { |lemma, senses|
      file.puts "LEMMA #{lemma} SENSES #{senses.join(" ")}"
    }

    file.close
  end

  ###############################
  protected

  ##
  # record: record the occurrence of a lemma/sense pair
  # in the <@targets> data structure (each sense kept once per lemma)
  def record(target_info)
    lemmapos = fred_lemmapos_combine(target_info["lex"], target_info["pos"])
    @targets[lemmapos] ||= []
    sense = target_info["sense"]
    @targets[lemmapos] << sense unless @targets[lemmapos].include?(sense)
  end
end
143
+
144
+ ########################################
145
########################################
# Target determination from existing frame annotation:
# each annotated frame yields one target occurrence.
class FindTargetsFromFrames < Targets
  ###
  # determine_targets:
  # use existing frames to find targets
  #
  # returns:
  # hash: target_IDs -> list of senses
  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  # where a sense is represented as a hash:
  # "sense": sense, a string
  # "obj": FrameNode object
  # "all_targets": list of node IDs, may comprise more than a single node
  # "lex": lemma, or multiword expression in canonical form
  # "sid": sentence ID
  def determine_targets(st_sent) #SalsaTigerSentence object
    retv = Hash.new()
    st_sent.each_frame { |frame_obj|
      # instance-specific computation:
      # target and target positions
      # WARNING: at this moment, we are
      # not considering true multiword targets for German.
      # Remove the "no_mwe" parameter in main_node_of_expr
      # to change this
      term = nil
      all_targets = nil
      if frame_obj.target.nil? or frame_obj.target.children.empty?
        # no target, nothing to record

      elsif @exp.get("language") == "de"
        # don't consider true multiword targets for German
        all_targets = frame_obj.target.children()
        term = @interpreter_class.main_node_of_expr(all_targets, "no_mwe")

      else
        # for all other languages: try to figure out the head target word
        # anyway
        all_targets = frame_obj.target.children()
        term = @interpreter_class.main_node_of_expr(all_targets)
      end

      if term and term.is_splitword?
        # don't use parts of a word as main node;
        # move up to the enclosing terminal
        term = term.parent()
      end
      if term and term.is_terminal?
        # key: [IDs of all target terminals, ID of the main terminal]
        key = [all_targets.map { |t| t.id() }, term.id()]

        unless retv[key]
          retv[key] = Array.new()
        end

        pos = frame_obj.target().get_attribute("pos")
        # gold POS available, may be in wrong form,
        # i.e. not the same strings that @interpreter_class.category()
        # would return: normalize V/N/A markers, fall back to the
        # interpreter's category when no gold POS is given
        case pos
        when /^[Vv]$/
          pos = "verb"
        when /^[Nn]$/
          pos = "noun"
        when /^[Aa]$/
          pos = "adj"
        when nil
          pos = @interpreter_class.category(term)
        end

        target_info = {
          "sense" => frame_obj.name(),
          "obj" => frame_obj,
          "all_targets" => frame_obj.target.children().map { |ch| ch.id() },
          "lex" => frame_obj.target().get_attribute("lemma"),
          "pos" => pos,
          "sid" => st_sent.id()
        }
        #print "lex ", frame_obj.target(), " und ",frame_obj.target().get_attribute("lemma"), "\n"
        retv[key] << target_info
        # remember the lemma/sense pair when in "w"/"a" mode
        if @record_targets
          record(target_info)
        end
      end
    }
    return retv
  end
end
230
+
231
+ ########################################
232
########################################
# Target determination without annotation:
# every terminal whose lemma/POS pair was seen in the training data
# is a target (minus stopwords, auxiliaries, and prepositions).
class FindAllTargets < Targets
  ###
  # read the target list recorded during training ("r" mode) and
  # keep the known [lemma, pos] pairs for determine_targets.
  def initialize(exp,
                 interpreter_class)
    # read target info from file
    super(exp, interpreter_class, "r")
    @training_lemmapos_pairs = get_lemma_pos()

    # NOTE: the original code called get_senses(@training_lemmapos_pairs)
    # here; get_senses expects a single combined lemma-pos string, so the
    # array-keyed lookup always missed, and its return value was discarded.
    # The dead call has been removed.

    # list of words to exclude from assignment, for now
    @stoplemmas = [
      "have",
      "do",
      "be"
      # "make"
    ]
  end

  ####
  # determine_targets:
  # use all known lemmas, minus stopwords
  #
  # returns:
  # hash: target_IDs -> list of senses
  # where target_IDs is a pair [list of terminal IDs, main terminal ID]
  #
  # where a sense is represented as a hash:
  # "sense": sense, a string (always nil here: senses are unknown)
  # "obj": FrameNode object (always nil here)
  # "all_targets": list of node IDs, may comprise more than a single node
  # "lex": lemma, or multiword expression in canonical form
  # "sid": sentence ID
  def determine_targets(sent) # SalsaTigerSentence object
    # map target IDs to list of senses, in our case always [ <nil sense> ]
    # because we assume that the senses of the targets we point out
    # are unknown
    retv = Hash.new()

    # iterate through terminals of the sentence, check for inclusion
    # of their lemma/POS pair in the training pairs
    sent.each_terminal { |node|
      lemma = @interpreter_class.lemma_backoff(node)
      pos = @interpreter_class.category(node)

      # take the node as a target iff:
      # we know this lemma from the training data,
      # and it is not an auxiliary,
      # and it is not in the stopword list,
      # and the node does not represent a preposition
      if (@training_lemmapos_pairs.include? [lemma, pos] and
          not(@interpreter_class.auxiliary?(node)) and
          not(@stoplemmas.include? lemma) and
          not(pos == "prep"))
        key = [ [ node.id() ], node.id() ]

        retv[ key ] = [
          {
            "sense" => nil,
            "obj" => nil,
            "all_targets" => [ node.id() ],
            "lex" => lemma,
            "pos" => pos,
            "sid" => sent.id()
          } ]
        # no recording of target info,
        # since we haven't determined anything new
      end
    }

    return retv
  end
end
319
+