shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ # Counter class - provides unique ids with state
2
+
3
# Counter keeps a single value and hands it out one step at a time.
class Counter
  # init_value: the value the first call to #next will return
  def initialize(init_value)
    @v = init_value
  end

  # Returns the stored value without advancing it.
  def get
    @v
  end

  # Advances the stored value by one and returns the value it had
  # before the advance.
  def next
    (@v += 1) - 1
  end
end
@@ -0,0 +1,643 @@
1
+ # KE Dec 2006
2
+ # Access for FrameNet corpus XML file
3
+ # Mainly taken over from FramesXML
4
+ #
5
+ # changes:
6
+ # - no single frame for the whole corpus
7
+ # - below <sentence> level there is an <annotationSet> level.
8
+ # One annotationSet may include a single frame,
9
+ # or a reference to all named entities in a sentence
10
+ #
11
+ # Write out in tab format, one line per word:
12
+ # Format:
13
+ # word (pt gf role target frame stuff)* ne sent_id
14
+ # with
15
+ # word: word
16
+ # whole bracketed group: information about one frame annotation
17
+ # pt: phrase type
18
+ # gf: grammatical function
19
+ # role: frame element
20
+ # target: LU occurrence
21
+ # frame: frame
22
+ # stuff: support, and other things
23
+ # ne: named entity
24
+ # sent_id: sentence ID
25
+
26
+ require 'frprep/Ampersand'
27
+ require 'common/ISO-8859-1'
28
+ require 'common/RegXML'
29
+
30
+ #####################
31
+ # mixins to make work with RegXML a little less repetitive
32
+ class RegXML
33
+ def first_child_matching(child_name)
34
+ return children_and_text().detect { |c| c.name() == child_name }
35
+ end
36
+
37
+ def each_child_matching(child_name)
38
+ children_and_text().each { |c|
39
+ if c.name() == child_name
40
+ yield c
41
+ end
42
+ }
43
+ end
44
+ end
45
+
46
+ #####################
47
+ # class to keep data for one frame
48
+ class FNCorpusAset
49
+ attr_reader :layers, :aset_type, :aset_id, :frame_name, :lu
50
+
51
+ #######
52
+ # Analyze RegXML object, store in object variables:
53
+ #
54
+ # @aset_type: "frame" or "NER"
55
+ # @frame_name: frame name for "frame" type
56
+ # @lu: LU for "frame" type
57
+ # @aset_id: ID of the annotation set
58
+ # @layers: hash: layer type (FE, GF, PT, Target, NER) -> [offset, "start"/"stop"] -> list of labels
59
+ # string -> int*string -> array:string
60
+ #
61
+ def initialize(aset, #RegXML object
62
+ charidx) # array of pairs [start index, stop index] int*int
63
+
64
+ @layers = Hash.new()
65
+ @frame_name = nil
66
+ @lu = nil
67
+ @aset_type = nil
68
+
69
+ attributes = aset.attributes()
70
+
71
+ @aset_id = attributes["ID"]
72
+
73
+ if attributes["frameName"]
74
+ # all of these seem to be frames. store in 'frames' array
75
+ unless attributes["luName"]
76
+ $stderr.puts "FNCorpusAset warning: cannot determine LU name"
77
+ $stder.puts aset.to_s()
78
+ return
79
+ end
80
+ @aset_type = "frame"
81
+ @frame_name = attributes["frameName"]
82
+ @lu = attributes["luName"]
83
+
84
+ unless (layers = aset.first_child_matching("layers"))
85
+ $stderr.puts "FNCorpusAset warning: unexpectedly no layers found"
86
+ $stderr.puts aset.to_s()
87
+ return
88
+ end
89
+
90
+ layers.each_child_matching("layer") { |l| analyze_layer(l, charidx) }
91
+
92
+ else
93
+ # all we seem to get here are named entity labels.
94
+ @aset_type = "NER"
95
+
96
+ unless (layers = aset.first_child_matching("layers"))
97
+ $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
98
+ $stderr.puts aset.to_s()
99
+ return
100
+ end
101
+ unless (layer = layers.first_child_matching("layer"))
102
+ $stderr.puts "FNCorpusAset Warning: unexpectedly no layers found"
103
+ $stderr.puts aset.to_s()
104
+ return
105
+ end
106
+
107
+ unless layer.attributes()["name"] == "NER"
108
+ $stderr.puts "FNCorpusAset Warning: unexpected layer #{layer.attributes()["name"]}, was expecting only an NER layer."
109
+ $stderr.puts aset.to_s()
110
+ return
111
+ end
112
+
113
+ analyze_layer(layer, charidx)
114
+
115
+ end
116
+ end
117
+
118
+
119
+ #############
120
+ # input: <layer> RegXML object
121
+ # analyze this, put into @layers data structure
122
+ def analyze_layer(layer, # RegXML object
123
+ charidx) # array:int*int pairs start/end index of words
124
+ layer_name = layer.attributes()["name"]
125
+ unless layer_name
126
+ $stderr.puts "FNCorpusAset warning: cannot determine layer name"
127
+ $stderr.puts layer.to_s
128
+ return
129
+ end
130
+
131
+ # FN-specific: skip 2nd layer FEs for now
132
+ if layer_name == "FE" and layer.attributes()["rank"] == "2"
133
+ return
134
+ end
135
+
136
+ unless @layers[layer_name]
137
+ @layers[layer_name] = Hash.new()
138
+ end
139
+
140
+ unless (labels = layer.first_child_matching("labels"))
141
+ # nothing to record for this layer
142
+ return
143
+ end
144
+
145
+
146
+ # taking over much of analyse_layer() from class FrameXML
147
+ thisLayer = Array.new()
148
+
149
+ labels.each_child_matching("label") { |label|
150
+ attributes = label.attributes()
151
+ if attributes["itype"] =~ /NI/
152
+ # null instantiation, ignore
153
+ next
154
+ end
155
+
156
+ if not(attributes["start"]) and not(attributes["end"])
157
+ # no start and end labels
158
+ next
159
+ end
160
+ thisLayer << [attributes["name"], attributes["start"].to_i, attributes["end"].to_i]
161
+ }
162
+
163
+ # sanity check: do indices
164
+ # match word start and end indices?
165
+ thisLayer = verify_annotation(thisLayer, charidx)
166
+
167
+ # sanity check: verify that
168
+ # we don't have overlapping labels
169
+
170
+ deleteHash = Hash.new # keep track of the labels which are to be deleted
171
+ # i -> Boolean
172
+
173
+ thisLayer.each_index {|i|
174
+ # efficiency: skip already delete labels
175
+ if deleteHash[i]
176
+ next
177
+ end
178
+ this_label, this_from , this_to = thisLayer[i]
179
+
180
+ # compare with all remaining labels
181
+ (i+1..thisLayer.length()-1).to_a.each { |other_i|
182
+ other_label,other_from,other_to = thisLayer[other_i]
183
+
184
+ # overlap? Throw out the later FE
185
+ if this_from <= other_from and other_from <= this_to
186
+ $stderr.puts "Warning: Label overlap, deleting #{other_label}"
187
+ deleteHash[other_i] = true
188
+ elsif this_from <= other_to and other_to <= this_to
189
+ $stderr.puts "Warning: Label overlap, deleting #{this_label}"
190
+ delete_hash[i] = true
191
+ end
192
+ }
193
+ # matched with all other labels. If "keep", return
194
+
195
+ if deleteHash[i]
196
+ # $stderr.puts " deleting entry #{i}"
197
+ else
198
+ [ [this_from, "start"], [this_to, "stop"]].each { |offset, start_or_stop|
199
+ unless @layers[layer_name].has_key?([offset, start_or_stop])
200
+ @layers[layer_name][[offset, start_or_stop]] = Array.new()
201
+ end
202
+ @layers[layer_name][ [offset, start_or_stop] ] << this_label
203
+ }
204
+ end
205
+ }
206
+ end
207
+
208
+ ##############3
209
+ # verify found triples label/from_index/to_index
210
+ # against given start/end indices of words
211
+ #
212
+ # returns: triples, possibly changed
213
+ def verify_annotation(found, # array: label/from/to, string*int*int
214
+ charidx) # array: from/to, int*int
215
+
216
+ return found.map {|element, start, stop|
217
+
218
+ newstart = start
219
+ newstop = stop
220
+
221
+ # compare against word start/stop indices
222
+ charidx.each_index{|j|
223
+ unless j== 0
224
+ pstartidx, pstopidx = charidx[j-1]
225
+ end
226
+ startidx, stopidx = charidx[j]
227
+
228
+ if (start > startidx and start <= stopidx) or
229
+ (j != 0 and start > pstopidx and start < startidx)
230
+ newstart = startidx
231
+ end
232
+
233
+ if (stop >= startidx and stop < stopidx)
234
+ newstop = stopidx
235
+ elsif (j != 0 and stop > pstopidx and stop < startidx)
236
+ newstop = pstopidx
237
+ end
238
+ }
239
+
240
+ # change?
241
+ if start != newstart or stop != newstop
242
+ # report change
243
+ $stderr.puts "FNCorpusXML warning: Heuristics has changed element "+element
244
+ $stderr.puts "\tfrom ["+[start,stop].join(",")+"] to ["+[newstart,newstop].join(",")+"]"
245
+
246
+ [element, newstart, newstop]
247
+
248
+ else
249
+
250
+ [element, start, stop]
251
+ end
252
+ }
253
+ end
254
+ end
255
+
256
+ #####################
257
+ # one FrameNet corpus
258
+ #
259
+ # just the filename is stored,
260
+ # the text is read only on demand
261
+ class FNCorpusXMLFile
262
+
263
+ ###
264
+ def initialize(filename)
265
+ @filename = filename
266
+
267
+ end
268
+
269
+ ###
270
+ # yield each document in this corpus
271
+ # as a string
272
+ def each_document_string()
273
+ # read each <document> element and yield it
274
+
275
+ doc_string = ""
276
+ inside_doc_elem = false
277
+ f = File.new(@filename)
278
+
279
+ # <corpus>
280
+ # <documents>
281
+ # <document ...>
282
+ # </document>
283
+ # <document ...>
284
+ # </document>
285
+ # </documents>
286
+ # </corpus>
287
+ f.each { |line|
288
+ if not(inside_doc_elem) and line =~ /^.*?(<document\s.*)$/
289
+ # start of <document>
290
+ inside_doc_elem = true
291
+ doc_string << $1
292
+ elsif inside_doc_elem and line =~ /^(.*?<\/document>).*$/
293
+ # end of <document>
294
+ doc_string << $1
295
+ yield doc_string
296
+ doc_string = ""
297
+ inside_doc_elem = false
298
+ elsif inside_doc_elem
299
+ # within <document>
300
+ doc_string << line
301
+ end
302
+ }
303
+ end
304
+
305
+ ###
306
+ # yield each sentence
307
+ # as a FNCorpusXMLSentence object
308
+ def each_sentence()
309
+ # read each <document> element and yield it
310
+
311
+ sent_string = ""
312
+ inside_sent_elem = false
313
+ f = File.new(@filename)
314
+
315
+ # <corpus>
316
+ # <documents>
317
+ # <document ...>
318
+ # <paragraphs>
319
+ # <paragraph>
320
+ # <sentences>
321
+ # <sentence ...>
322
+ f.each { |line|
323
+ if not(inside_sent_elem) and line =~ /^.*?(<sentence\s.*)$/
324
+ # start of <sentence>
325
+ inside_sent_elem = true
326
+ sent_string << $1
327
+ elsif inside_sent_elem and line =~ /^(.*?<\/sentence>).*$/
328
+ # end of <document>
329
+ sent_string << $1
330
+ yield FNCorpusXMLSentence.new(sent_string)
331
+ sent_string = ""
332
+ inside_sent_elem = false
333
+ elsif inside_sent_elem
334
+ # within <sentence>
335
+ sent_string << line.chomp()
336
+ end
337
+ }
338
+ end
339
+
340
+ ###
341
+ # print whole FN file in tab format
342
+ def print_conll_style(file = $stdout)
343
+ each_sentence() { |s_obj|
344
+ s_obj.print_conll_style(file)
345
+ }
346
+ end
347
+ end
348
+
349
+ #######################################
350
+ # Keep one sentence from FN corpus XML
351
+ # as a RegXML object,
352
+ # offer printout in tabular format
353
+ class FNCorpusXMLSentence
354
+
355
+ #########
356
+ def initialize(sent_string)
357
+ @sent = RegXML.new(sent_string)
358
+ @sent_id = @sent.attributes()["ID"]
359
+ end
360
+
361
+ ##############
362
+ # print to file
363
+ # in tabular format
364
+ #
365
+ # row format:
366
+ # word (pt gf role target frame stuff)* ne sent_id
367
+ #
368
+ # word: word
369
+ # whole bracketed group: information about one frame annotation
370
+ # pt: phrase type
371
+ # gf: grammatical function
372
+ # role: frame element
373
+ # target: LU occurrence
374
+ # frame: frame
375
+ # stuff: support, and other things
376
+ # ne: named entity
377
+ # sent_id: sentence ID
378
+ def print_conll_style(file = $stdout)
379
+ pos_text, charidx = read_sentence()
380
+ asets = read_annotation_sets(charidx)
381
+
382
+ # aset -> are we inside the target or not?
383
+ in_target = Hash.new(false)
384
+ # aset -> are we in all sorts of other annotations, like Support?
385
+ in_stuff = Hash.new()
386
+ # are we inside a named entity?
387
+ in_ne = nil
388
+
389
+ # record every opening and closing label we recognize,
390
+ # to check later
391
+ recognized_labels = Hash.new()
392
+
393
+ pos_text.each_index {|i|
394
+ line = Array.new
395
+ word = pos_text[i]
396
+
397
+ # add: word
398
+ line << word
399
+
400
+ start, stop = charidx[i]
401
+
402
+ # iterate over the frames we have
403
+ # add: (pt gf role target frame stuff)
404
+ asets.each { |aset|
405
+ unless aset.aset_type == "frame"
406
+ # don't treat NEs as a frame here
407
+ next
408
+ end
409
+
410
+ # pt, gf, role
411
+ ["PT", "GF", "FE"].each { |layer|
412
+ token = Array.new
413
+ hash = aset.layers[layer]
414
+ if hash.has_key?([start,"start"])
415
+ recognized_labels[[layer, start, "start"]] = true
416
+
417
+ markables = hash[[start,"start"]]
418
+ markables.each {|element|
419
+ token << "B-"+element
420
+ }
421
+ end
422
+ if hash.has_key?([stop,"stop"])
423
+ recognized_labels[[layer, stop, "stop"]] = true
424
+
425
+ markables = hash[[stop,"stop"]]
426
+ markables.each {|element|
427
+ token << "E-"+element
428
+ }
429
+ end
430
+
431
+ if token.empty?
432
+ line << "-"
433
+ else
434
+ line << token.sort.join(":")
435
+ end
436
+ }
437
+
438
+ # target
439
+ target = aset.layers["Target"]
440
+ if target.has_key?([start,"start"])
441
+ recognized_labels[["Target", start, "start"]] = true
442
+ in_target[aset] = true
443
+ end
444
+ if in_target[aset]
445
+ line << aset.lu
446
+ else
447
+ line << "-"
448
+ end
449
+ if target.has_key?([stop,"stop"])
450
+ recognized_labels[["Target", stop, "stop"]] = true
451
+ in_target[aset] = false
452
+ end
453
+
454
+ # frame
455
+ line << aset.frame_name
456
+
457
+ # stuff
458
+ unless in_stuff.has_key?(aset)
459
+ in_stuff[aset] = Array.new()
460
+ end
461
+ aset.layers.each_key { |layer|
462
+ if ["PT", "GF", "FE", "Target"].include? layer
463
+ # already done those
464
+ next
465
+ end
466
+ # all the rest goes in "stuff"
467
+ if aset.layers[layer].has_key?([start, "start"])
468
+ aset.layers[layer][[start, "start"]].each { |entry|
469
+ in_stuff[aset] << layer + "-" + entry
470
+ }
471
+ recognized_labels[[layer, start, "start"]] = true
472
+ end
473
+ }
474
+ if in_stuff[aset].empty?
475
+ line << "-"
476
+ else
477
+ line << in_stuff[aset].join(":")
478
+ end
479
+ aset.layers.each_key { |layer|
480
+ if aset.layers[layer].has_key?([stop, "stop"])
481
+ recognized_labels[[layer, stop, "stop"]] = true
482
+ aset.layers[layer][[stop, "stop"]].each { |entry|
483
+ in_stuff[aset].delete(layer + "-" + entry)
484
+ }
485
+ end
486
+ }
487
+ }
488
+
489
+ # ne
490
+ if (ner = asets.detect { |a| a.aset_type == "NER" })
491
+ if ner.layers["NER"] and ner.layers["NER"].has_key?([start, "start"])
492
+ recognized_labels[["NER", start, "start"]] = true
493
+ in_ne = ner.layers["NER"][[start,"start"]]
494
+ end
495
+ if in_ne
496
+ line << in_ne.join(":")
497
+ else
498
+ line << "-"
499
+ end
500
+ if ner.layers["NER"] and ner.layers["NER"].has_key?([stop, "stop"])
501
+ recognized_labels[["NER", stop, "stop"]] = true
502
+ in_ne = nil
503
+ end
504
+ end
505
+
506
+ # sent id
507
+ line << @sent_id
508
+
509
+ # sanity check:
510
+ # row format:
511
+ # word (pt gf role target frame stuff)* ne sent_id
512
+ # so number of columns must be 3 + 6x for some x >= 0
513
+ unless (line.length() - 3)%6 == 0
514
+ $stderr.puts "Something wrong with the line length."
515
+ $stderr.puts "I have #{asets.length() - 1} frames plus NEs, "
516
+ $stderr.puts "but #{line.length()} columns."
517
+ raise
518
+ end
519
+
520
+
521
+ file.puts line.join("\t")
522
+ }
523
+
524
+ # sanity check:
525
+ # now count all labels,
526
+ # to see if we've printed them all
527
+ lost_labels = Array.new()
528
+ asets.each { |aset|
529
+ aset.layers.each_key { |layer|
530
+ aset.layers[layer].each_key() { |offset, start_or_stop|
531
+ unless recognized_labels[[layer, offset, start_or_stop]]
532
+ lost_labels << [layer, offset, start_or_stop,
533
+ aset.layers[layer][[offset, start_or_stop]]]
534
+ end
535
+ }
536
+ }
537
+ }
538
+ unless lost_labels.empty?
539
+ $stderr.puts "Offsets: "
540
+ pos_text.each_index { |i|
541
+ $stderr.puts "\t#{pos_text[i]} #{charidx[i][0]} #{charidx[i][1]}"
542
+ }
543
+ # $stderr.puts "Recognized:"
544
+ # recognized_labels.each_key { |k|
545
+ # $stderr.puts "\t" + k.to_s()
546
+ # }
547
+ lost_labels.each { |layer, offset, start_or_stop, labels|
548
+ $stderr.puts "FNCorpusXML warning: lost label"
549
+ $stderr.puts "\tLayer #{layer}"
550
+ $stderr.puts "\tOffset #{offset}"
551
+ $stderr.puts "\tStatus #{start_or_stop}"
552
+ $stderr.puts "\tLabels #{labels.join(" ")}"
553
+ }
554
+ end
555
+
556
+ file.puts
557
+ end
558
+
559
+ ################
560
+ private
561
+
562
+ ###
563
+ # read annotation sets:
564
+ # parse the annotation sets in the @sent object,
565
+ # return as:
566
+ # array of FNCorpusAset objects
567
+ def read_annotation_sets(charidx)
568
+ unless (annotation_sets = @sent.first_child_matching("annotationSets"))
569
+ return
570
+ end
571
+
572
+ # return values
573
+ frames = Array.new()
574
+
575
+ annotation_sets.each_child_matching("annotationSet") { |aset|
576
+ frames << FNCorpusAset.new(aset, charidx)
577
+ }
578
+
579
+ return frames
580
+ end
581
+
582
+ ###
583
+ # basically taken over from FrameXML.rb
584
+ # read sentence words,
585
+ # return as: sentence, indices
586
+ # - sentence as array of strings, one word per string
587
+ # - indices: array of pairs [word start char.index, word end char.index] int*int
588
+ def read_sentence()
589
+ # all text and pos_text have the same number of elements!
590
+ charidx = Array.new # maps word indices on [start,stop]
591
+ pos_text = []
592
+
593
+ unless (text_elt = @sent.first_child_matching("text"))
594
+ # no text found for this sentence
595
+ return [pos_text, charidx]
596
+ end
597
+
598
+ orig_text = text_elt.children_and_text().detect { |child|
599
+ child.text?
600
+ }
601
+ if orig_text
602
+ # take text out of RegXMl object
603
+ orig_text = orig_text.to_s()
604
+ end
605
+
606
+ pos_text = UtfIso.to_iso_8859_1(orig_text).split(" ") # text with special char.s replaced by iso8859 char.s
607
+
608
+ double_space = Array.new
609
+ pos = 0
610
+ while (match = orig_text.index(/(\s\s+)/,pos))
611
+ double_space << match
612
+ pos = match+1
613
+ end
614
+
615
+ # fill charidx array
616
+ char_i = 0
617
+ pos_text.each_index {|word_i|
618
+ startchar = char_i
619
+ # puts "Remembering "+char_i.to_s+" as start index of word "+word_i.to_s
620
+ char_i += our_length(pos_text[word_i])
621
+ stopchar = char_i-1
622
+
623
+ # puts "Remembering "+(char_i-1).to_s+" as stop index of word "+word_i.to_s
624
+
625
+ charidx << [startchar,stopchar]
626
+
627
+ # separators
628
+ if double_space.include?(char_i) then
629
+ char_i += 2
630
+ else
631
+ char_i += 1
632
+ end
633
+ }
634
+
635
+ return [pos_text, charidx]
636
+ end
637
+
638
+ ###
639
+ def our_length(string) # (1) replace &...; with 1 char and " with two chars
640
+ return string.gsub(/&(.+?);/,"X").length
641
+ end
642
+
643
+ end