shalmaneser-prep 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
+ # Counter class - provides unique ids with state
2
+
3
# Counter: a stateful source of consecutive ids.
#
# The counter is seeded with an initial value; every call to #next hands
# out the current value and advances the internal state by one.
class Counter
  # init_value: the first value that #next will hand out
  def initialize(init_value)
    @v = init_value
  end

  # Current value, without advancing the counter.
  def get
    @v
  end

  # Advance the counter by one and return the value it had before
  # the increment.
  def next
    (@v += 1) - 1
  end
end
@@ -0,0 +1,643 @@
1
+ # KE Dec 2006
2
+ # Access for FrameNet corpus XML file
3
+ # Mainly taken over from FramesXML
4
+ #
5
+ # changes:
6
+ # - no single frame for the whole corpus
7
+ # - below <sentence> level there is an <annotationSet> level.
8
+ # One annotationSet may include a single frame,
9
+ # or a reference to all named entities in a sentence
10
+ #
11
+ # Write out in tab format, one line per word:
12
+ # Format:
13
+ # word (pt gf role target frame stuff)* ne sent_id
14
+ # with
15
+ # word: word
16
+ # whole bracketed group: information about one frame annotation
17
+ # pt: phrase type
18
+ # gf: grammatical function
19
+ # role: frame element
20
+ # target: LU occurrence
21
+ # frame: frame
22
+ # stuff: support, and other things
23
+ # ne: named entity
24
+ # sent_id: sentence ID
25
+
26
+ require 'frprep/Ampersand'
27
+ require 'common/ISO-8859-1'
28
+ require 'common/RegXML'
29
+
30
+ #####################
31
+ # mixins to make work with RegXML a little less repetitive
32
# Convenience additions to RegXML for picking children by element name.
# Both helpers rely on RegXML#children_and_text and on each child
# answering #name.
class RegXML
  # Return the first child whose name equals child_name,
  # or nil when no child matches.
  def first_child_matching(child_name)
    children_and_text.find do |child|
      child.name == child_name
    end
  end

  # Yield every child whose name equals child_name, in order.
  # Returns whatever iterating over children_and_text returns.
  def each_child_matching(child_name)
    children_and_text.each do |child|
      next unless child.name == child_name
      yield child
    end
  end
end
45
+
46
+ #####################
47
+ # class to keep data for one frame
48
# Data for one <annotationSet> of a FrameNet corpus sentence.
#
# Readers:
#   aset_type:  "frame", "NER", or nil when a frame annotation set
#               carries no LU name
#   frame_name: frame name (only for type "frame")
#   lu:         lexical unit name (only for type "frame")
#   aset_id:    value of the "ID" attribute of the annotation set
#   layers:     hash: layer name (FE, GF, PT, Target, NER, ...)
#                 -> [offset, "start"/"stop"] -> list of labels
#               string -> int*string -> array:string
class FNCorpusAset
  attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu

  #######
  # Analyze an <annotationSet> element and fill the object variables.
  #
  # aset:    RegXML-like object; must answer attributes(),
  #          first_child_matching(), and to_s()
  # charidx: array of pairs [start index, stop index] (int*int),
  #          one pair per word of the sentence
  #
  # Problems are reported on $stderr and leave the object partially
  # filled (in particular, @layers may stay empty).
  def initialize(aset, charidx)
    @layers = Hash.new
    @frame_name = nil
    @lu = nil
    @aset_type = nil

    attributes = aset.attributes()
    @aset_id = attributes["ID"]

    if attributes["frameName"]
      # annotation sets with a frame name are frame annotations
      unless attributes["luName"]
        $stderr.puts "FNCorpusAset warning: cannot determine LU name"
        # fixed: this used to write to the undefined global $stder
        $stderr.puts aset.to_s()
        return
      end
      @aset_type = "frame"
      @frame_name = attributes["frameName"]
      @lu = attributes["luName"]

      unless (layers = aset.first_child_matching("layers"))
        $stderr.puts "FNCorpusAset warning: unexpectedly no layers found"
        $stderr.puts aset.to_s()
        return
      end

      layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
    else
      # everything without a frame name is treated as named entity labels
      @aset_type = "NER"

      unless (layers = aset.first_child_matching("layers"))
        $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
        $stderr.puts aset.to_s()
        return
      end
      unless (layer = layers.first_child_matching("layer"))
        $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
        $stderr.puts aset.to_s()
        return
      end

      unless layer.attributes()["name"] == "NER"
        $stderr.puts "FNCorpusAset Warning: unexpected layer #{layer.attributes()["name"]}, was expecting only an NER layer."
        $stderr.puts aset.to_s()
        return
      end

      analyze_layer(layer, charidx)
    end
  end

  #############
  # Analyze one <layer> element and record its labels in @layers.
  #
  # layer:   RegXML-like object for the <layer> element
  # charidx: array of int*int pairs, start/end index of each word
  #
  # Skipped without recording anything: layers without a name,
  # FE layers of rank "2", labels whose itype matches /NI/ (null
  # instantiations), and labels with neither "start" nor "end".
  def analyze_layer(layer, charidx)
    layer_name = layer.attributes()["name"]
    unless layer_name
      $stderr.puts "FNCorpusAset warning: cannot determine layer name"
      $stderr.puts layer.to_s
      return
    end

    # FN-specific: skip 2nd layer FEs for now
    return if layer_name == "FE" && layer.attributes()["rank"] == "2"

    @layers[layer_name] ||= Hash.new

    # nothing to record for a layer without <labels>
    labels = layer.first_child_matching("labels")
    return unless labels

    # collect triples [label, from, to]
    spans = []
    labels.each_child_matching("label") { |label|
      attributes = label.attributes()
      # null instantiation, ignore
      next if attributes["itype"] =~ /NI/
      # no start and end attributes at all
      next if !attributes["start"] && !attributes["end"]

      spans << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
    }

    # sanity check: align label offsets with word start/end indices
    spans = verify_annotation(spans, charidx)

    # sanity check: drop overlapping labels.
    # NOTE: a later label that strictly encloses the current one
    # (starts before it and ends after it) is not detected here.
    deleted = Hash.new # index into spans -> true if label is dropped

    spans.each_index { |i|
      # skip labels that were already dropped
      next if deleted[i]

      this_label, this_from, this_to = spans[i]

      # compare with all remaining labels
      ((i + 1)...spans.length).each { |other_i|
        other_label, other_from, other_to = spans[other_i]

        if this_from <= other_from && other_from <= this_to
          # the other label starts inside this one: drop the other
          $stderr.puts "Warning: Label overlap, deleting #{other_label}"
          deleted[other_i] = true
        elsif this_from <= other_to && other_to <= this_to
          # the other label ends inside this one: drop this one
          $stderr.puts "Warning: Label overlap, deleting #{this_label}"
          # fixed: this used to assign to the undefined name delete_hash
          deleted[i] = true
        end
      }

      next if deleted[i]

      # label survived: record its start and stop boundaries
      [[this_from, "start"], [this_to, "stop"]].each { |offset, start_or_stop|
        (@layers[layer_name][[offset, start_or_stop]] ||= Array.new) << this_label
      }
    }
  end

  ##############
  # Verify found triples label/from_index/to_index against the given
  # start/end indices of words. A start offset that falls inside a
  # word, or in the gap before it, is moved to that word's start; a
  # stop offset inside a word is moved to that word's end, one in the
  # gap before a word to the previous word's end. Each change is
  # reported on $stderr.
  #
  # found:   array of label/from/to, string*int*int
  # charidx: array of from/to, int*int
  #
  # returns: the triples, possibly changed
  def verify_annotation(found, charidx)
    found.map { |element, start, stop|
      newstart = start
      newstop = stop

      # compare against word start/stop indices
      charidx.each_index { |j|
        # end index of the previous word, if there is one
        pstopidx = charidx[j - 1][1] unless j == 0
        startidx, stopidx = charidx[j]

        if (start > startidx && start <= stopidx) ||
           (j != 0 && start > pstopidx && start < startidx)
          newstart = startidx
        end

        if stop >= startidx && stop < stopidx
          newstop = stopidx
        elsif j != 0 && stop > pstopidx && stop < startidx
          newstop = pstopidx
        end
      }

      if start != newstart || stop != newstop
        # report change
        $stderr.puts "FNCorpusXML warning: Heuristics has changed element #{element}"
        $stderr.puts "\tfrom [" + [start, stop].join(",") + "] to [" + [newstart, newstop].join(",") + "]"
      end

      [element, newstart, newstop]
    }
  end
end
255
+
256
+ #####################
257
+ # one FrameNet corpus
258
+ #
259
+ # just the filename is stored,
260
+ # the text is read only on demand
261
# One FrameNet corpus XML file.
#
# Only the filename is stored; the file is read line by line on
# demand, and the file handle is closed again when reading ends.
class FNCorpusXMLFile
  ###
  # filename: path of the FrameNet corpus XML file
  def initialize(filename)
    @filename = filename
  end

  ###
  # Yield each <document> element of this corpus as a string.
  #
  # Expected file layout:
  # <corpus>
  #   <documents>
  #     <document ...>
  #     </document>
  #     <document ...>
  #     </document>
  #   </documents>
  # </corpus>
  #
  # Lines between the opening and the closing tag keep their newline;
  # the opening and closing lines themselves are added without it.
  def each_document_string()
    each_element_string("document", false) { |doc_string| yield doc_string }
  end

  ###
  # Yield each <sentence> element as a FNCorpusXMLSentence object.
  #
  # Expected file layout:
  # <corpus>
  #   <documents>
  #     <document ...>
  #       <paragraphs>
  #         <paragraph>
  #           <sentences>
  #             <sentence ...>
  #
  # All lines of one sentence are joined without newlines.
  def each_sentence()
    each_element_string("sentence", true) { |sent_string|
      yield FNCorpusXMLSentence.new(sent_string)
    }
  end

  ###
  # Print the whole FN file in tab format to the given file object.
  def print_conll_style(file = $stdout)
    each_sentence() { |s_obj| s_obj.print_conll_style(file) }
  end

  private

  ###
  # Scan the file line by line and yield the text of every <tag ...>
  # ... </tag> element as one string.
  #
  # tag:            element name, e.g. "document"
  # strip_newlines: if true, interior lines are added without their
  #                 line terminator
  #
  # An element starts on a line containing "<tag" followed by
  # whitespace, and ends on a LATER line containing "</tag>". A closing
  # tag on the same line as the opening tag is not recognized as the
  # end of the element.
  #
  # File.foreach closes the file when iteration ends, including when
  # the caller's block breaks out or raises.
  def each_element_string(tag, strip_newlines)
    opening = /^.*?(<#{tag}\s.*)$/
    closing = /^(.*?<\/#{tag}>).*$/

    buffer = ""
    inside = false

    File.foreach(@filename) { |line|
      if !inside && line =~ opening
        # start of element: drop anything before the opening tag
        inside = true
        buffer << $1
      elsif inside && line =~ closing
        # end of element: drop anything after the closing tag
        buffer << $1
        yield buffer
        # start a fresh string so the yielded one is never modified
        buffer = ""
        inside = false
      elsif inside
        buffer << (strip_newlines ? line.chomp() : line)
      end
    }
  end
end
348
+
349
+ #######################################
350
+ # Keep one sentence from FN corpus XML
351
+ # as a RegXML object,
352
+ # offer printout in tabular format
353
#######################################
# Keep one sentence from FN corpus XML as a RegXML object,
# offer printout in tabular format.
class FNCorpusXMLSentence
  # layers that get a column of their own with B-/E- boundary markers
  BOUNDARY_LAYERS = ["PT", "GF", "FE"].freeze
  # layers that are not reported in the "stuff" column
  CORE_LAYERS = (BOUNDARY_LAYERS + ["Target"]).freeze

  #########
  # sent_string: XML text of one <sentence> element
  def initialize(sent_string)
    @sent = RegXML.new(sent_string)
    @sent_id = @sent.attributes()["ID"]
  end

  ##############
  # Print the sentence to file in tabular format, one line per word,
  # followed by an empty line.
  #
  # row format:
  #   word (pt gf role target frame stuff)* ne sent_id
  #
  #   word: word
  #   whole bracketed group: information about one frame annotation
  #     pt: phrase type
  #     gf: grammatical function
  #     role: frame element
  #     target: LU occurrence
  #     frame: frame
  #     stuff: support, and other things
  #   ne: named entity
  #   sent_id: sentence ID
  #
  # Empty cells are printed as "-". The ne column is always present,
  # also for sentences without an NER annotation set, and layers that
  # an annotation set does not have are treated as empty.
  #
  # Raises RuntimeError if a row does not have 3 + 6x columns.
  # Labels that were never printed are reported on $stderr.
  def print_conll_style(file = $stdout)
    pos_text, charidx = read_sentence()
    asets = read_annotation_sets(charidx)

    # NEs are not treated as frames: they get their own column
    frame_asets = asets.select { |a| a.aset_type == "frame" }
    ner_aset = asets.detect { |a| a.aset_type == "NER" }

    # aset -> are we inside the target or not?
    in_target = Hash.new(false)
    # aset -> currently open annotations from other layers, like Support
    in_stuff = Hash.new { |hash, aset| hash[aset] = Array.new }
    # labels of the named entity we are inside of, or nil
    in_ne = nil
    # every opening and closing label we recognize, to check later:
    # [layer, offset, "start"/"stop"] -> true
    recognized_labels = Hash.new

    pos_text.each_index { |i|
      start, stop = charidx[i]

      # add: word
      line = [pos_text[i]]

      # add: (pt gf role target frame stuff) for each frame
      frame_asets.each { |aset|
        # pt, gf, role
        BOUNDARY_LAYERS.each { |layer|
          line << boundary_column(aset.layers[layer] || {}, layer,
                                  start, stop, recognized_labels)
        }

        # target: the LU is printed from the target's start word up to
        # and including its stop word
        target = aset.layers["Target"] || {}
        if target.has_key?([start, "start"])
          recognized_labels[["Target", start, "start"]] = true
          in_target[aset] = true
        end
        line << (in_target[aset] ? aset.lu : "-")
        if target.has_key?([stop, "stop"])
          recognized_labels[["Target", stop, "stop"]] = true
          in_target[aset] = false
        end

        # frame
        line << aset.frame_name

        # stuff
        line << stuff_column(aset, in_stuff[aset], start, stop, recognized_labels)
      }

      # ne
      ner_layer = ner_aset && ner_aset.layers["NER"]
      if ner_layer && ner_layer.has_key?([start, "start"])
        recognized_labels[["NER", start, "start"]] = true
        in_ne = ner_layer[[start, "start"]]
      end
      line << (in_ne ? in_ne.join(":") : "-")
      if ner_layer && ner_layer.has_key?([stop, "stop"])
        recognized_labels[["NER", stop, "stop"]] = true
        in_ne = nil
      end

      # sent id
      line << @sent_id

      # sanity check:
      # number of columns must be 3 + 6x for some x >= 0
      unless (line.length() - 3) % 6 == 0
        $stderr.puts "Something wrong with the line length."
        $stderr.puts "I have #{asets.length() - 1} frames plus NEs, "
        $stderr.puts "but #{line.length()} columns."
        raise "FNCorpusXML: unexpected number of columns in output row"
      end

      file.puts line.join("\t")
    }

    # sanity check: have all labels been printed?
    report_lost_labels(asets, recognized_labels, pos_text, charidx)

    file.puts
  end

  ################
  private

  ###
  # Build the cell for one of the PT/GF/FE layers for one word:
  # "B-label" for every label starting at the word's start offset,
  # "E-label" for every label ending at its stop offset, sorted and
  # joined by ":"; "-" if there is none.
  #
  # boundaries: hash [offset, "start"/"stop"] -> list of labels
  # recognized: hash to record the boundaries that were found
  def boundary_column(boundaries, layer, start, stop, recognized)
    tokens = Array.new

    if boundaries.has_key?([start, "start"])
      recognized[[layer, start, "start"]] = true
      boundaries[[start, "start"]].each { |label| tokens << "B-#{label}" }
    end
    if boundaries.has_key?([stop, "stop"])
      recognized[[layer, stop, "stop"]] = true
      boundaries[[stop, "stop"]].each { |label| tokens << "E-#{label}" }
    end

    tokens.empty? ? "-" : tokens.sort.join(":")
  end

  ###
  # Build the "stuff" cell for one frame and one word: all currently
  # open labels of layers other than PT/GF/FE/Target, as "layer-label"
  # joined by ":"; "-" if there is none.
  #
  # open_entries: list of "layer-label" strings still open for this
  #               frame; updated in place. Labels opening at this word
  #               are added before the cell is built, labels closing at
  #               this word are removed afterwards, so a label is
  #               printed on its last word too.
  def stuff_column(aset, open_entries, start, stop, recognized)
    aset.layers.each_key { |layer|
      # already done those
      next if CORE_LAYERS.include?(layer)

      if aset.layers[layer].has_key?([start, "start"])
        aset.layers[layer][[start, "start"]].each { |entry|
          open_entries << "#{layer}-#{entry}"
        }
        recognized[[layer, start, "start"]] = true
      end
    }

    cell = open_entries.empty? ? "-" : open_entries.join(":")

    aset.layers.each_key { |layer|
      if aset.layers[layer].has_key?([stop, "stop"])
        recognized[[layer, stop, "stop"]] = true
        aset.layers[layer][[stop, "stop"]].each { |entry|
          open_entries.delete("#{layer}-#{entry}")
        }
      end
    }

    cell
  end

  ###
  # Report on $stderr every label boundary of every annotation set
  # that is not marked in recognized, preceded by the word offsets of
  # the sentence. Prints nothing if no label was lost.
  def report_lost_labels(asets, recognized, pos_text, charidx)
    lost_labels = Array.new()
    asets.each { |aset|
      aset.layers.each { |layer, boundaries|
        boundaries.each { |(offset, start_or_stop), labels|
          unless recognized[[layer, offset, start_or_stop]]
            lost_labels << [layer, offset, start_or_stop, labels]
          end
        }
      }
    }
    return if lost_labels.empty?

    $stderr.puts "Offsets: "
    pos_text.each_index { |i|
      $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
    }
    lost_labels.each { |layer, offset, start_or_stop, labels|
      $stderr.puts "FNCorpusXML warning: lost label"
      $stderr.puts "\tLayer #{layer}"
      $stderr.puts "\tOffset #{offset}"
      $stderr.puts "\tStatus #{start_or_stop}"
      $stderr.puts "\tLabels #{labels.join(" ")}"
    }
  end

  ###
  # read annotation sets:
  # parse the annotation sets in the @sent object,
  # return as: array of FNCorpusAset objects
  # (empty if the sentence has no <annotationSets> element)
  def read_annotation_sets(charidx)
    frames = Array.new()

    annotation_sets = @sent.first_child_matching("annotationSets")
    return frames unless annotation_sets

    annotation_sets.each_child_matching("annotationSet") { |aset|
      frames << FNCorpusAset.new(aset, charidx)
    }

    frames
  end

  ###
  # basically taken over from FrameXML.rb
  # read sentence words,
  # return as: sentence, indices
  #  - sentence as array of strings, one word per string
  #  - indices: array of pairs [word start char.index, word end char.index] int*int
  # Both arrays are empty if the sentence has no <text> element or the
  # <text> element has no text child.
  def read_sentence()
    # pos_text and charidx always have the same number of elements
    charidx = Array.new # maps word indices on [start,stop]
    pos_text = []

    # no text found for this sentence
    text_elt = @sent.first_child_matching("text")
    return [pos_text, charidx] unless text_elt

    text_node = text_elt.children_and_text().detect { |child| child.text? }
    return [pos_text, charidx] unless text_node

    # take text out of RegXML object
    orig_text = text_node.to_s()

    # text with special char.s replaced by iso8859 char.s
    pos_text = UtfIso.to_iso_8859_1(orig_text).split(" ")

    # positions in the original text where a run of two or more
    # whitespace characters starts (and every later position in the run
    # that is still followed by whitespace)
    double_space = Array.new
    pos = 0
    while (match = orig_text.index(/(\s\s+)/, pos))
      double_space << match
      pos = match + 1
    end

    # fill charidx array
    char_i = 0
    pos_text.each { |word|
      startchar = char_i
      char_i += our_length(word)
      charidx << [startchar, char_i - 1]

      # separators: skip two characters if a whitespace run was
      # recorded at this position, else one
      char_i += (double_space.include?(char_i) ? 2 : 1)
    }

    [pos_text, charidx]
  end

  ###
  # Length of string, counting each "&...;" sequence as one character.
  def our_length(string)
    string.gsub(/&(.+?);/, "X").length
  end
end