shalmaneser-prep 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,144 @@
1
+ # sp 28 06 04
2
+ #
3
+ # this module offers methods to extract gemma corpora from the FrameNet database#
4
+
5
+ require 'frprep/FrameXML'
6
+
7
class FNDatabase
  # Extraction of "gemma" corpora from a FrameNet database directory:
  # iterates over FrameXML lexical-unit files (lu*.xml, possibly gzipped)
  # and writes matching annotated sentences in CoNLL/FNTab style.

  # fn_path: path to the directory holding the FrameXML lu files.
  # A trailing slash is appended if missing.
  def initialize(fn_path)
    unless fn_path[-1,1] == "/"
      fn_path += "/"
    end
    @fn = fn_path
  end

  # fundamental access function to FrameXML files
  #
  # yields FrameXMLSentence objects where
  #  FrameXMLSentence matches sent_pred
  #  (FrameXMLFile is accessed through FrameXMLSentence.get_file_object and matches file_pred)
  def each_matching_sentence(file_pred,sent_pred)
    each_matching_file(file_pred) {|frameNetFile|
      frameNetFile.each_sentence {|frameNetSent|
        if sent_pred.call(frameNetSent)
          # snap annotation spans to word boundaries before handing on
          frameNetSent.verify_annotation
          yield frameNetSent
        end
      }
    }
  end

  # fundamental access function to FrameXML files
  #
  # yields file (FrameXMLFile) objects which match file_pred;
  # every file is closed again after use
  def each_matching_file(file_pred)
    each_framexml_file{|frameNetFile|
      if file_pred.call(frameNetFile)
        yield frameNetFile
      end
      frameNetFile.close
    }
  end

  # write all annotated sentences of the given frame to outfile
  # (an open IO object) in CoNLL style
  def extract_frame(frame,outfile)
    each_matching_sentence(Proc.new{|fnfile| fnfile.get_frame == frame},
                           Proc.new{|fnsent| true}) {|fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  # write all annotated sentences of the given lemma (lexical unit)
  # to outfile (an open IO object) in CoNLL style
  def extract_lemma(lemma,outfile)
    each_matching_sentence(Proc.new{|fnfile| fnfile.get_lu == lemma},
                           Proc.new{|fnsent| true}) {|fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  # write one <frame>.tab file per frame to outdirectory, containing
  # all annotated sentences of that frame; zero-size output files are
  # removed afterwards
  def extract_everything(outdirectory)
    unless outdirectory[-1,1] == "/"
      outdirectory += "/"
    end

    outfiles = Hash.new
    each_matching_sentence(Proc.new{|fnfile| true},
                           Proc.new{|fnsent| true}) {|fnsent|
      frame = fnsent.get_file_obj.get_frame
      unless outfiles.key?(frame)
        outfiles[frame] = File.new(outdirectory+frame+".tab","w")
      end
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfiles[frame])
      end
    }
    # close output files
    outfiles.each_value {|file|
      file.close
    }
    # remove zero-size files
    Dir[outdirectory+"*"].each {|filename|
      if FileTest.zero?(filename)
        File.unlink(filename)
      end
    }
  end

  private

  # yield a FrameXMLFile object for each lu*.xml file below @fn;
  # gzipped files are unpacked to /tmp first
  def each_framexml_file
    # files might be zipped
    Dir[@fn+"lu*.xml.gz"].each {|gzfile|
      # fix: use the multi-argument form of Kernel.system so that
      # filenames containing spaces or shell metacharacters are not
      # word-split or interpreted by the shell
      Kernel.system("cp", gzfile, "/tmp/")
      Kernel.system("gunzip", "-f", "/tmp/"+File.basename(gzfile))
      # strip the trailing .gz to get the name of the unpacked file
      # (replaces the old regex match + $1 global)
      yield FrameXMLFile.new("/tmp/"+File.basename(gzfile, ".gz"))
    }
    # or might not
    Dir[@fn+"lu*.xml"].each {|filename|
      yield FrameXMLFile.new(filename)
    }
  end
end
144
+
@@ -0,0 +1,513 @@
1
+ # sp 18 06 2004
2
+ #
3
+ # access to FrameNet XML files, sentences, and annotation.
4
+ #
5
+ # sp 10 11 04: only data from the first layer with name XY is
6
+ # used for output. Other data is saved in layer XY.2nd, but is
7
+ # currently not processed.
8
+ #
9
+ # sp 22 05 04: also, if two labels exist which cover the same span
10
+ # (ie there is a double annotation within the same layer), ignore
11
+ # all but the first label.
12
+ #
13
+ # ke 13 07 05:
14
+ # - changed to RegXMl.rb
15
+ # - fixed two problems in analyse_layer:
16
+ # - Deleting problematic labels:
17
+ # For some reason, thisLayer[i+1..-1].each_index {|other_i|
18
+ # included the index 0 in any case, resulting in the 1st
19
+ # label being deleted in any case.
20
+ # - Deleting problematic labels, checking for label overlap:
21
+ # The old formulation worked only if labels occurred in the array
22
+ # in the order they occurred in the sentence, but that was not the case.
23
+ # - Change in deleting problematic labels:
24
+ # No longer delete duplicate labels, since e.g. in the PT level there
25
+ # may be more than one NP label, and we want to keep those
26
+ #
27
+ # KE January 2007:
28
+ # write new adapted FNTab format
29
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
30
+
31
+ require 'frprep/Ampersand'
32
+ require 'common/ISO-8859-1'
33
+ require 'common/RegXML'
34
+
35
# Accessor for one FrameNet lexical-unit (lu*.xml) file.
class FrameXMLFile # only verified to work for FrameNet v1.1

  # Open and parse the file: expects the <lexunit> element within the
  # first three lines, reads from there to EOF into one RegXML object,
  # and extracts ID, lemma, POS and frame from its attributes.
  #
  # Raises if the lexunit "name" attribute is not of the form lemma.POS.
  def initialize(filename)
    @filename = filename
    file = File.new(filename)
    counter = 0
    while true
      counter += 1
      line = file.gets
      if line =~ /<lexunit/
        break
      end
      if counter > 3
        STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
        Kernel.exit
      end
    end
    # found lexunit
    string = line
    while (line = file.gets())
      string << line
    end
    # fix: close the file handle (it used to be left open)
    file.close
    @lexunit = RegXML.new(string)
    attributes = @lexunit.attributes()
    @id = attributes["ID"]
    # fix: escape the dot between lemma and POS; the unescaped "."
    # matched any character, so a name without a period (e.g. "abc")
    # could be wrongly split at an arbitrary position
    attributes["name"] =~ /^([^.]+)\.([^.]+)$/
    @lu = $1
    @pos = $2
    if @lu.nil?
      raise "[framexml] no lemma in header of file #{@filename}"
    elsif @pos.nil?
      raise "[framexml] no pos in header of file #{@filename}"
    end
    # fix: upcase only after the nil check; $2.upcase used to raise
    # NoMethodError before the "no pos" branch could ever be reached
    @pos = @pos.upcase
    @frame = attributes["frame"]
  end

  # lemma, with blanks replaced by underscores
  def get_lu
    return @lu.gsub(" ","_")
  end

  # lexical unit ID (string, from the "ID" attribute)
  def get_lu_id
    return @id
  end

  def get_filename
    return @filename
  end

  # part of speech, upcased
  def get_pos
    return @pos
  end

  # frame name (from the "frame" attribute)
  def get_frame
    return @frame
  end

  # no-op: kept so callers can treat this object like an open resource
  def close
  end

  # yield a FrameXMLSentence for each <annotationSet> element
  # found inside any subcorpus of the lexunit
  def each_sentence
    @lexunit.children_and_text().each { |subcorpus|
      subcorpus.children_and_text().each { |annotationSet|
        if annotationSet.name == "annotationSet"
          # sentence found
          yield FrameXMLSentence.new(annotationSet,self)
        end
      }
    }
  end
end
105
+
106
# One <annotationSet> from a FrameXML file: the sentence text plus its
# annotation layers (FE, GF, PT, Target, ...), with support for
# snapping annotation spans to word boundaries and CoNLL-style output.
class FrameXMLSentence

  # annotationSet: RegXML element for one <annotationSet>
  # file_obj:      the enclosing FrameXMLFile
  def initialize(annotationSet,file_obj)
    @file_obj = file_obj

    # layers: hash layer_name -> array:[name, start, stop]
    # name: name of the element, string
    # start: start character, integer
    # stop: end character, integer
    @layers = Hash.new

    annotationSet.children_and_text().each { |sentence_or_layer_elt|

      case sentence_or_layer_elt.name
      when "sentence"
        # sentence: has ID, its child is <text>[text]</text>
        @sent_id = sentence_or_layer_elt.attributes["ID"]
        text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
          child.name == "text"
        }
        if text_elt
          # found the text element. its only child should be the text
          @orig_text = text_elt.children_and_text().detect { |child|
            child.text?
          }
          if @orig_text
            # take text out of RegXML object
            @orig_text = @orig_text.to_s()
          end
        end

      when "layers"
        # contains annotation layers
        sentence_or_layer_elt.children_and_text().each { |layer|
          unless layer.name == "layer"
            # additional material, ignore
            next
          end

          name = layer.attributes["name"]
          unless name
            raise "layer without a name"
          end
          # only data from the first layer with a given name is used
          unless @layers.key?(name)
            @layers[name] = analyse_layer(layer, name)
          end
        }
      end
    }

    @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
    @text = Ampersand.utf8_to_hex(@orig_text).split(" ")    # text with special characters replaced by &...; sequences

    # all text and pos_text have the same number of elements!
    @start_is = Hash.new # map char indices (start of words) onto word indices
    @stop_is = Hash.new  # map char indices (end of words) onto word indices
    @charidx = Array.new # maps word indices on [start,stop]

    # remember character positions where the original text has runs of
    # whitespace, so the word/char alignment below can skip them
    @double_space = Array.new
    pos = 0
    while (match = @orig_text.index(/(\s\s+)/,pos))
      @double_space << match
      pos = match+1
    end

    # fill start, stop and charidx arrays
    char_i = 0
    @pos_text.each_index {|word_i|
      @start_is[char_i] = word_i
      startchar = char_i
      char_i += our_length(@pos_text[word_i])
      @stop_is[char_i-1] = word_i

      stopchar = char_i-1

      @charidx << [startchar,stopchar]

      # separators
      if @double_space.include?(char_i) then
        char_i += 2
      else
        char_i += 1
      end
    }
  end

  # the FrameXMLFile this sentence came from
  def get_file_obj
    return @file_obj
  end

  def get_sent_id
    return @sent_id
  end

  # print sentence ID and text to STDOUT
  def print_text
    # fix: this used the never-set @id and tried to concatenate the
    # word array @text directly, which raised a TypeError
    puts "("+@sent_id+")\t"+@text.join(" ")
  end

  # true iff the sentence has a nonempty Target annotation and an FE layer
  def contains_FE_annotation_and_target
    unless @layers.key?("Target") and @layers["Target"][0]
      # fix: a sentence without any Target layer used to raise
      # NoMethodError on nil here
      return false
    end
    target_info = @layers["Target"][0]
    unless target_info[0] == "Target"
      # fix: 'filename' was an undefined local variable here, and the
      # array @text cannot be string-concatenated
      STDERR.puts "Error in sentence from "+@file_obj.get_filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
      STDERR.puts "Sentence: "+@text.join(" ")
      return false
    else
      return (@layers.key?("FE") and target_info[2] != 0)
    end
  end

  # we only verify the interesting layers (FE,GF,Target)
  # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.
  #
  # heuristically snaps annotation spans whose character offsets do not
  # coincide with word boundaries to the nearest boundaries
  def verify_annotation # returns true if some change has taken place
    change = false
    @layers.each_pair {|layername,l|

      if layername=="FE" or layername=="GF" or layername=="PT" or layername=="Target" # only verify the "important" layers

        l.each_index {|i|

          element,start,stop = l[i]

          newstart = start
          newstop = stop

          @charidx.each_index{|j|
            unless j== 0
              pstartidx, pstopidx = @charidx[j-1]
            end
            startidx, stopidx = @charidx[j]

            # span starts inside a word, or in the separator gap before
            # one: move the start back to the word's first character
            if (start > startidx and start <= stopidx) or
              (j != 0 and start > pstopidx and start < startidx)
              newstart = startidx
            end

            # span stops inside a word: extend to the word's end;
            # stops in a separator gap: shrink to the previous word's end
            if (stop >= startidx and stop < stopidx)
              newstop = stopidx
            elsif (j != 0 and stop > pstopidx and stop < startidx)
              newstop = pstopidx
            end

          }
          if start != newstart or stop != newstop
            change = true
            @layers[layername][i] = [element,newstart,newstop]
            STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
            markable_as_string(layername,element).each {|string|
              STDERR.puts "New markable: "+string
            }
            STDERR.puts "Sentence: "+@pos_text.join(" ")
            puts
          end
        }
      end
    }
    return change
  end

  def print_conll_style
    print_conll_style_to(STDOUT)
  end

  # CHANGED KE January 2007:
  # write new adapted FNTab format
  # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
  def print_conll_style_to(out)

    # even though in principle there might be multiple
    # labels for one span [i.e. in one value of the
    # {gf,fe,pt} hashes], we only ever record one

    gf = Hash.new
    add_all_to_hash(gf,"GF")
    fe = Hash.new
    add_all_to_hash(fe,"FE")
    pt = Hash.new
    add_all_to_hash(pt,"PT")
    target = Hash.new
    add_all_to_hash(target,"Target")

    in_target = false

    @pos_text.each_index {|i|
      # write format:
      # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
      line = Array.new
      # word
      word = @pos_text[i]
      line << word

      start, stop = @charidx[i]
      # "pt", "gf", "role",
      # boundaries are consumed (deleted) from the hashes as they are
      # matched, so leftovers signal misaligned markup (checked below)
      [pt,gf,fe].each {|hash|
        token = Array.new
        if hash.key?([start,"start"])
          markables = hash.delete([start,"start"])
          markables.each {|element|
            token << "B-"+element
          }
        end
        if hash.key?([stop,"stop"])
          markables = hash.delete([stop,"stop"])
          markables.each {|element|
            token << "E-"+element
          }
        end
        if token.empty?
          line << "-"
        else
          line << token.sort.join(":")
        end
      }
      # "target"
      if target.key?([start,"start"])
        target.delete([start,"start"])
        in_target = true
      end
      if in_target
        line << @file_obj.get_lu+"."+@file_obj.get_pos
      else
        line << "-"
      end
      if target.key?([stop,"stop"])
        target.delete([stop,"stop"])
        in_target = false
      end
      # "frame"
      line << @file_obj.get_frame

      # "stuff" "ne",
      line << "-"
      line << "-"

      # "sent_id"
      line << @file_obj.get_lu_id+"-"+@sent_id

      out.puts line.join("\t")
    }

    out.puts

    # sanity check: every recorded boundary must have been consumed by
    # some word above, otherwise the markup did not line up with words
    [gf,fe,pt,target].each {|hash|
      unless hash.empty?
        STDERR.puts @file_obj.get_filename
        raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
      end
    }
  end

  # debugging aid: dump all layers with element names and char offsets
  def print_layers
    @layers.each {|ln,l|
      puts "Layer "+ln+":"
      l.each {|element,start,stop|
        puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
      }
      puts "***"
    }
  end

  private

  def our_length(string) # (1) replace &...; with 1 char and " with two chars
    return string.gsub(/&(.+?);/,"X").length
  end

  # true iff the FE layer contains an element with this name
  def is_fe(fename)
    @layers["FE"].each {|name,start,stop|
      if fename == name
        return true
      end
    }
    return false
  end

  def markable_as_string(layername,markup_name) # returns an array of all markables with this name

    result = Array.new

    festart = nil
    festop = nil
    @layers[layername].each {|name,start,stop|
      if markup_name == name
        fe = Array.new
        infe = false
        @charidx.each_index {|i|
          startidx,stopidx = @charidx[i]
          if startidx == start
            infe = true
          end
          if infe
            fe << @pos_text[i]
          end
          if stopidx == stop
            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
            break
          elsif stopidx > stop
            # span's stop lies inside a word: report, but flag as error
            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
            break
          end
        }
      end
    }
    return result
  end

  # record name under key unless the key is already taken; returns true
  # if the key already existed (in which case name is NOT added - only
  # the first entry per key is kept)
  def add_to_hash(hash,key,name)
    exists = false
    if hash.key?(key)
      exists = true
    else
      hash[key] = Array.new
      hash[key] << name
    end
    return exists
  end

  # enter all [start,"start"] / [stop,"stop"] boundaries of the given
  # layer into hash; warns about (and skips) elements whose start
  # position is already taken
  def add_all_to_hash(hash,layername)
    # use "uniq" to remove wrong double annotations
    @layers[layername].uniq.each {|element,start,stop|
      exists = add_to_hash(hash,[start, "start"],element)
      if exists
        STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map {|element,start,stop| element+" ("+start.to_s+","+stop.to_s+")"}.join(" ")
      else
        add_to_hash(hash,[stop, "stop"],element)
      end
    }
  end

  def analyse_layer(layer_elt,name) # read layer information from file and store in @layers
    if name.nil?
      # fix: this message used to reference an undefined local 'line'
      STDERR.puts "Error: layer element with empty name."
    end

    # thisLayer, retv: array:[name(string), start(integer), end(integer)]
    thisLayer = Array.new
    retv = Array.new

    labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels"}
    unless labels_elt
      # no labels found, return empty array
      return thisLayer
    end

    labels_elt.children_and_text.each { |label|
      unless label.name == "label"
        # some other markup, ignore
        next
      end

      attributes = label.attributes()
      if attributes["itype"]
        # null instantiation, don't retain
        next
      end
      if not(attributes["start"]) and not(attributes["end"])
        # no start and end labels
        next
      end
      thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    }

    # sanity check: verify that
    # 1. we don't have overlapping labels

    deleteHash = Hash.new # keep track of the labels which are to be deleted
    # i -> Boolean

    thisLayer.each_index {|i|
      # efficiency: skip already deleted labels
      if deleteHash[i]
        next
      end
      this_label, this_from , this_to = thisLayer[i]

      # compare with all remaining labels
      (i+1..thisLayer.length()-1).to_a.each { |other_i|
        other_label,other_from,other_to = thisLayer[other_i]

        # overlap? Throw out the later FE
        if this_from <= other_from and other_from <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          deleteHash[other_i] = true
        elsif this_from <= other_to and other_to <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          deleteHash[i] = true
        end
      }
      # matched with all other labels. If "keep", return

      if deleteHash[i]
        # $stderr.puts " deleting entry #{i}"
      else
        retv << thisLayer[i]
      end
    }

    return retv
  end
end