shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ # sp 28 06 04
2
+ #
3
+ # this module offers methods to extract gemma corpora from the FrameNet database
4
+
5
+ require 'frprep/FrameXML'
6
+
7
class FNDatabase

  # Fundamental access function to FrameXML sentences.
  #
  # Yields every FrameXMLSentence for which sent_pred is true and whose
  # enclosing file matches file_pred (the FrameXMLFile is reachable from
  # the sentence via FrameXMLSentence#get_file_obj). Each yielded
  # sentence has had verify_annotation applied first, which may repair
  # its character offsets.
  def each_matching_sentence(file_pred, sent_pred)
    each_matching_file(file_pred) { |frameNetFile|
      frameNetFile.each_sentence { |frameNetSent|
        if sent_pred.call(frameNetSent)
          frameNetSent.verify_annotation
          yield frameNetSent
        end
      }
    }
  end

  # Fundamental access function to FrameXML files.
  #
  # Yields each FrameXMLFile object which matches file_pred.
  # Every file (matching or not) is closed before moving on.
  def each_matching_file(file_pred)
    each_framexml_file { |frameNetFile|
      if file_pred.call(frameNetFile)
        yield frameNetFile
      end
      frameNetFile.close
    }
  end

  # Writes all sentences of the given frame that carry FE annotation
  # and a target to outfile (an open IO object) in CoNLL-style format.
  def extract_frame(frame, outfile)
    each_matching_sentence(Proc.new { |fnfile| fnfile.get_frame == frame },
                           Proc.new { |fnsent| true }) { |fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  # Writes all sentences of the given lemma (lexical unit) that carry
  # FE annotation and a target to outfile in CoNLL-style format.
  def extract_lemma(lemma, outfile)
    each_matching_sentence(Proc.new { |fnfile| fnfile.get_lu == lemma },
                           Proc.new { |fnsent| true }) { |fnsent|
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfile)
      end
    }
  end

  # Writes one "<frame>.tab" file per frame into outdirectory,
  # containing every sentence with FE annotation and a target.
  # Zero-size output files are removed afterwards.
  def extract_everything(outdirectory)
    unless outdirectory[-1, 1] == "/"
      outdirectory += "/"
    end

    outfiles = Hash.new
    each_matching_sentence(Proc.new { |fnfile| true },
                           Proc.new { |fnsent| true }) { |fnsent|
      frame = fnsent.get_file_obj.get_frame
      # open one output file per frame, lazily on first use
      unless outfiles.key?(frame)
        outfiles[frame] = File.new(outdirectory + frame + ".tab", "w")
      end
      if fnsent.contains_FE_annotation_and_target
        fnsent.print_conll_style_to(outfiles[frame])
      end
    }
    # close output files
    outfiles.each_value { |file|
      file.close
    }
    # remove zero-size files
    Dir[outdirectory + "*"].each { |filename|
      if FileTest.zero?(filename)
        File.unlink(filename)
      end
    }
  end

  # fn_path: directory holding the FrameNet lu*.xml(.gz) files;
  # a trailing slash is appended if missing.
  def initialize(fn_path)
    unless fn_path[-1, 1] == "/"
      fn_path += "/"
    end
    @fn = fn_path
  end

  private

  # Yields a FrameXMLFile for every lexical-unit XML file below @fn.
  # Gzipped files are copied to /tmp and unpacked there first.
  def each_framexml_file
    # files might be zipped
    Dir[@fn + "lu*.xml.gz"].each { |gzfile|
      # FIX: pass the command arguments separately instead of building a
      # shell string, so file names containing spaces or shell
      # metacharacters cannot be misinterpreted or injected
      Kernel.system("cp", gzfile, "/tmp/")
      Kernel.system("gunzip", "-f", "/tmp/" + File.basename(gzfile))
      gzfile =~ /(.+)\.gz/
      yield FrameXMLFile.new("/tmp/" + File.basename($1))
    }
    # or might not
    Dir[@fn + "/lu*.xml"].each { |filename|
      yield FrameXMLFile.new(filename)
    }
  end

  # I don't really remember what this was good for ;-)

  # def browse_everything(allFiles)
  #   if allFiles
  #     Dir[fn+"*.xml.gz"].each {|gzfile|
  #       Kernel.system("cp "+gzfile+" /tmp/")
  #       Kernel.system("gunzip -f /tmp/"+File.basename(gzfile))
  #       gzfile =~ /(.+)\.gz/
  #       # STDERR.puts File.basename($1)
  #       # STDERR.print "."
  #       ff = FrameXMLFile.new("/tmp/"+File.basename($1))
  #       ff.each_sentence {|s|
  #         if s.contains_FE_annotation_and_target
  #           s.verify_annotation
  #           if s.verify_annotation
  #             puts "****************** Error: Still problems after 2nd verification!"
  #           end
  #           s.print_conll_style
  #         end
  #       }
  #     }
  #   else
  #     ff = FrameXMLFile.new("/tmp/lu1870.xml")
  #     ff.each_sentence {|s|
  #       if s.contains_FE_annotation_and_target
  #         s.verify_annotation
  #         if s.verify_annotation
  #           puts "****************** Error: Still problems after 2nd verification!"
  #         end
  #         # s.print_layers
  #         s.print_conll_style
  #       end
  #     }
  #   end
  # end

end
@@ -0,0 +1,513 @@
1
+ # sp 18 06 2004
2
+ #
3
+ # access to FrameNet XML files, sentences, and annotation.
4
+ #
5
+ # sp 10 11 04: only data from the first layer with name XY is
6
+ # used for output. Other data is saved in layer XY.2nd, but is
7
+ # currently not processed.
8
+ #
9
+ # sp 22 05 04: also, if two labels exist which cover the same span
10
+ # (ie there is a double annotation within the same layer), ignore
11
+ # all but the first label.
12
+ #
13
+ # ke 13 07 05:
14
+ # - changed to RegXMl.rb
15
+ # - fixed two problems in analyse_layer:
16
+ # - Deleting problematic labels:
17
+ # For some reason, thisLayer[i+1..-1].each_index {|other_i|
18
+ # included the index 0 in any case, resulting in the 1st
19
+ # label being deleted in any case.
20
+ # - Deleting problematic labels, checking for label overlap:
21
+ # The old formulation worked only if labels occurred in the array
22
+ # in the order they occurred in the sentence, but that was not the case.
23
+ # - Change in deleting problematic labels:
24
+ # No longer delete duplicate labels, since e.g. in the PT level there
25
+ # may be more than one NP label, and we want to keep those
26
+ #
27
+ # KE January 2007:
28
+ # write new adapted FNTab format
29
+ # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
30
+
31
+ require 'frprep/Ampersand'
32
+ require 'common/ISO-8859-1'
33
+ require 'common/RegXML'
34
+
35
class FrameXMLFile # only verified to work for FrameNet v1.1

  # Reads a FrameNet lexical-unit XML file and parses its header.
  # A <lexunit ...> element must appear within the first three lines;
  # otherwise an error is printed and the process exits.
  #
  # Sets @id (LU id), @lu (lemma), @pos (part of speech, upcased) and
  # @frame from the lexunit attributes.
  # Raises if the "name" attribute does not have the form "<lemma>.<pos>".
  def initialize(filename)
    @filename = filename
    file = File.new(filename)
    counter = 0
    while true
      counter += 1
      line = file.gets
      if line =~ /<lexunit/
        break
      end
      if counter > 3
        STDERR.puts "Error: File "+filename+" does not conform to FrameNet v1.1 standard (lexunit in 3rd line)"
        Kernel.exit
      end
    end
    # found lexunit: slurp the rest of the file
    string = line
    while (line = file.gets())
      string << line
    end
    # FIX: the file handle was never closed before (resource leak)
    file.close
    @lexunit = RegXML.new(string)
    attributes = @lexunit.attributes()
    @id = attributes["ID"]
    # the name attribute has the form "<lemma>.<pos>": split on the last dot.
    # FIX: the dot must be escaped -- with the unescaped /^([^.]+).([^.]+)$/ a
    # dotless name like "abv" silently mis-split into lu "a" / pos "BV"
    # instead of triggering the error below
    attributes["name"] =~ /^(.+)\.([^.]+)$/
    @lu = $1
    @pos = $2 ? $2.upcase : nil
    if @lu.nil?
      raise "[framexml] no lemma in header of file #{@filename}"
    elsif @pos.nil?
      raise "[framexml] no pos in header of file #{@filename}"
    end
    @frame = attributes["frame"]
  end

  # lemma, with spaces replaced by underscores (multiword lexical units)
  def get_lu
    return @lu.gsub(" ", "_")
  end

  # lexical-unit ID string from the file header
  def get_lu_id
    return @id
  end

  def get_filename
    return @filename
  end

  # part of speech, upcased (e.g. "V", "N")
  def get_pos
    return @pos
  end

  # frame name from the file header
  def get_frame
    return @frame
  end

  # nothing to release: the XML file is read completely in initialize;
  # kept because FNDatabase#each_matching_file calls close on every file
  def close
  end

  # Yields one FrameXMLSentence per <annotationSet> element
  # found inside any subcorpus of the lexunit.
  def each_sentence
    @lexunit.children_and_text().each { |subcorpus|
      subcorpus.children_and_text().each { |annotationSet|
        if annotationSet.name == "annotationSet"
          # sentence found
          yield FrameXMLSentence.new(annotationSet, self)
        end
      }
    }
  end
end
+
106
class FrameXMLSentence
  # annotationSet: RegXML element for one <annotationSet>
  # file_obj:      the enclosing FrameXMLFile
  def initialize(annotationSet, file_obj)
    @file_obj = file_obj

    # layers: hash layer_name -> array:[name, start, stop]
    #  name: name of the element, string
    #  start: start character, integer
    #  stop: end character, integer
    @layers = Hash.new

    annotationSet.children_and_text().each { |sentence_or_layer_elt|

      case sentence_or_layer_elt.name
      when "sentence"
        # sentence: has ID, its child is <text>[text]</text>
        @sent_id = sentence_or_layer_elt.attributes["ID"]
        text_elt = sentence_or_layer_elt.children_and_text().detect { |child|
          child.name == "text"
        }
        if text_elt
          # found the text element. its only child should be the text
          @orig_text = text_elt.children_and_text().detect { |child|
            child.text?
          }
          if @orig_text
            # take text out of RegXML object
            @orig_text = @orig_text.to_s()
          end
        end

      when "layers"
        # contains annotation layers
        sentence_or_layer_elt.children_and_text().each { |layer|
          unless layer.name == "layer"
            # additional material, ignore
            next
          end

          name = layer.attributes["name"]
          unless name
            raise "layer without a name"
          end
          # only data from the first layer with a given name is used
          unless @layers.key?(name)
            @layers[name] = analyse_layer(layer, name)
          end
        }
      end
    }

    @pos_text = UtfIso.to_iso_8859_1(@orig_text).split(" ") # text with special characters replaced by iso8859 characters
    @text = Ampersand.utf8_to_hex(@orig_text).split(" ")    # text with special characters replaced by &...; sequences

    # all text and pos_text have the same number of elements!
    @start_is = Hash.new # map char indices (start of words) onto word indices
    @stop_is = Hash.new  # map char indices (end of words) onto word indices
    @charidx = Array.new # maps word indices on [start,stop]

    # remember positions of runs of two or more whitespace characters:
    # they count as two separator characters in the offset computation
    @double_space = Array.new
    pos = 0
    while (match = @orig_text.index(/(\s\s+)/, pos))
      @double_space << match
      pos = match + 1
    end

    # fill start, stop and charidx arrays
    char_i = 0
    @pos_text.each_index { |word_i|
      @start_is[char_i] = word_i
      startchar = char_i
      char_i += our_length(@pos_text[word_i])
      @stop_is[char_i - 1] = word_i

      stopchar = char_i - 1

      @charidx << [startchar, stopchar]

      # separators
      if @double_space.include?(char_i) then
        char_i += 2
      else
        char_i += 1
      end
    }
  end

  # the enclosing FrameXMLFile
  def get_file_obj
    return @file_obj
  end

  # sentence ID string from the <sentence> element
  def get_sent_id
    return @sent_id
  end

  # prints "(<sent_id>)\t<text>" to stdout.
  # FIX: previously used the undefined @id and concatenated the Array
  # @text to a String (TypeError); now joins the tokens.
  def print_text
    puts "(" + @sent_id + ")\t" + @text.join(" ")
  end

  # true iff the sentence has an FE layer and a Target element with a
  # nonzero end offset; prints a diagnostic and returns false otherwise.
  def contains_FE_annotation_and_target
    # FIX: guard against a missing/empty "Target" layer (was a crash on
    # nil), and use @file_obj.get_filename -- the original referenced an
    # undefined local `filename` and concatenated the Array @text
    target_layer = @layers["Target"]
    target_info = target_layer ? target_layer[0] : nil
    unless target_info and target_info[0] == "Target"
      STDERR.puts "Error in sentence from "+@file_obj.get_filename+": No target" # strictly speaking, no target at pos 0 in @layers["Target"]
      STDERR.puts "Sentence: "+@text.join(" ")
      return false
    else
      return (@layers.key?("FE") and target_info[2] != 0)
    end
  end

  # we only verify the interesting layers (FE,GF,Target)
  # if there is weird stuff going on on e.g. the Noun or Adj layer, we don't care.

  def verify_annotation # returns true if some change has taken place
    change = false
    @layers.each_pair { |layername, l|

      if layername == "FE" or layername == "GF" or layername == "PT" or layername == "Target" # only verify the "important" layers

        l.each_index { |i|

          element, start, stop = l[i]

          newstart = start
          newstop = stop

          # snap start/stop onto the word boundaries recorded in @charidx
          @charidx.each_index { |j|
            unless j == 0
              pstartidx, pstopidx = @charidx[j - 1]
            end
            startidx, stopidx = @charidx[j]

            if (start > startidx and start <= stopidx) or
              (j != 0 and start > pstopidx and start < startidx)
              newstart = startidx
            end

            if (stop >= startidx and stop < stopidx)
              newstop = stopidx
            elsif (j != 0 and stop > pstopidx and stop < startidx)
              newstop = pstopidx
            end

          }
          if start != newstart or stop != newstop
            change = true
            @layers[layername][i] = [element, newstart, newstop]
            STDERR.puts "Heuristics has changed element "+element+" from ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"] in file "+@file_obj.get_filename+"."
            markable_as_string(layername, element).each { |string|
              STDERR.puts "New markable: "+string
            }
            STDERR.puts "Sentence: "+@pos_text.join(" ")
            puts
          end
        }
      end
    }
    return change
  end

  def print_conll_style
    print_conll_style_to(STDOUT)
  end

  # CHANGED KE January 2007:
  # write new adapted FNTab format
  # ( "word", ("pt", "gf", "role", "target", "frame", "stuff")* "ne", "sent_id" )
  def print_conll_style_to(out)

    # even though in principle there might be multiple
    # labels for one span [i.e. in one value of the
    # {gf,fe,pt} hashes], we only ever record one

    gf = Hash.new
    add_all_to_hash(gf, "GF")
    fe = Hash.new
    add_all_to_hash(fe, "FE")
    pt = Hash.new
    add_all_to_hash(pt, "PT")
    target = Hash.new
    add_all_to_hash(target, "Target")

    in_target = false

    @pos_text.each_index { |i|
      # write format:
      # "word" "pt", "gf", "role", "target", "frame", "stuff" "ne", "sent_id"
      line = Array.new
      # word
      word = @pos_text[i]
      line << word

      start, stop = @charidx[i]
      # "pt", "gf", "role",
      [pt, gf, fe].each { |hash|
        token = Array.new
        if hash.key?([start, "start"])
          markables = hash.delete([start, "start"])
          markables.each { |element|
            token << "B-"+element
          }
        end
        if hash.key?([stop, "stop"])
          markables = hash.delete([stop, "stop"])
          markables.each { |element|
            token << "E-"+element
          }
        end
        if token.empty?
          line << "-"
        else
          line << token.sort.join(":")
        end
      }
      # "target"
      if target.key?([start, "start"])
        target.delete([start, "start"])
        in_target = true
      end
      if in_target
        line << @file_obj.get_lu+"."+@file_obj.get_pos
      else
        line << "-"
      end
      if target.key?([stop, "stop"])
        target.delete([stop, "stop"])
        in_target = false
      end
      # "frame"
      line << @file_obj.get_frame

      # "stuff" "ne",
      line << "-"
      line << "-"

      # "sent_id"
      line << @file_obj.get_lu_id+"-"+@sent_id

      out.puts line.join("\t")
    }

    out.puts

    # every start/stop key must have been consumed by some word above
    [gf, fe, pt, target].each { |hash|
      unless hash.empty?
        STDERR.puts @file_obj.get_filename
        raise "**** Error: Hash not empty after creation of Sentence in CoNLL-Format (could not find matching words for some markup element)!"
      end
    }
  end


  # debugging aid: dump all layers with their character spans
  def print_layers
    @layers.each { |ln, l|
      puts "Layer "+ln+":"
      l.each { |element, start, stop|
        puts "\t"+element+": "+start.to_s+" -- "+stop.to_s
      }
      puts "***"
    }
  end


  private


  # length of a word after collapsing each &...; entity to one character
  def our_length(string) # (1) replace &...; with 1 char and " with two chars
    return string.gsub(/&(.+?);/, "X").length
  end

  # true iff some FE-layer element carries the given name
  def is_fe(fename)
    @layers["FE"].each { |name, start, stop|
      if fename == name
        return true
      end
    }
    return false
  end


  # returns an array of string renderings of all markables with this
  # name on the given layer, each annotated with its span and a
  # VERIFIED/ERROR tag depending on whether the span ends on a word
  # boundary
  def markable_as_string(layername, markup_name)

    result = Array.new

    festart = nil
    festop = nil
    @layers[layername].each { |name, start, stop|
      if markup_name == name
        fe = Array.new
        infe = false
        @charidx.each_index { |i|
          startidx, stopidx = @charidx[i]
          if startidx == start
            infe = true
          end
          if infe
            fe << @pos_text[i]
          end
          if stopidx == stop
            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+", VERIFIED]")
            break
          elsif stopidx > stop
            result << (fe.join(" ")+"["+start.to_s+","+stop.to_s+",ERROR]")
            break
          end
        }
      end
    }
    return result
  end

  # records name under key unless the key is already present
  # (sp 22 05 04: double annotations for the same span are ignored,
  # only the first label is kept). Returns true iff the key existed.
  def add_to_hash(hash, key, name)
    exists = false
    if hash.key?(key)
      exists = true
    else
      hash[key] = Array.new
      hash[key] << name
    end
    return exists
  end

  # fills hash with [start,"start"] / [stop,"stop"] keys for every
  # element of the given layer; warns when two elements start at the
  # same position (only the first is used)
  def add_all_to_hash(hash, layername)
    # use "uniq" to remove wrong double annotations
    @layers[layername].uniq.each { |element, start, stop|
      exists = add_to_hash(hash, [start, "start"], element)
      if exists
        STDERR.puts "Warning ["+@file_obj.get_filename+"]: In layer "+layername+", two elements start at position "+start.to_s+". Only using first. Layer as read from FrameXML: "+@layers[layername].map { |element, start, stop| element+" ("+start.to_s+","+stop.to_s+")" }.join(" ")
      else
        add_to_hash(hash, [stop, "stop"], element)
      end
    }
  end


  # read layer information from a <layer> element; returns an array of
  # [name(string), start(integer), end(integer)] triples with
  # null-instantiated and overlapping labels removed
  def analyse_layer(layer_elt, name)
    if name.nil?
      # FIX: the original message referenced an undefined local `line`,
      # which itself raised a NameError on this path
      STDERR.puts "Error: layer with empty name."
    end

    # thisLayer, retv: array:[name(string), start(integer), end(integer)]
    thisLayer = Array.new
    retv = Array.new

    labels_elt = layer_elt.children_and_text.detect { |child| child.name == "labels" }
    unless labels_elt
      # no labels found, return empty array
      return thisLayer
    end

    labels_elt.children_and_text.each { |label|
      unless label.name == "label"
        # some other markup, ignore
        next
      end

      attributes = label.attributes()
      if attributes["itype"]
        # null instantiation, don't retain
        next
      end
      if not(attributes["start"]) and not(attributes["end"])
        # no start and end labels
        next
      end
      thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    }

    # sanity check: verify that
    # 1. we don't have overlapping labels

    deleteHash = Hash.new # keep track of the labels which are to be deleted
    # i -> Boolean

    thisLayer.each_index { |i|
      # efficiency: skip already deleted labels
      if deleteHash[i]
        next
      end
      this_label, this_from, this_to = thisLayer[i]

      # compare with all remaining labels
      (i+1..thisLayer.length()-1).to_a.each { |other_i|
        other_label, other_from, other_to = thisLayer[other_i]

        # overlap? Throw out the later FE
        if this_from <= other_from and other_from <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          deleteHash[other_i] = true
        elsif this_from <= other_to and other_to <= this_to
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          deleteHash[i] = true
        end
      }
      # matched with all other labels. If "keep", return

      if deleteHash[i]
        # $stderr.puts " deleting entry #{i}"
      else
        retv << thisLayer[i]
      end
    }

    return retv
  end
end