shalmaneser 0.0.1.alpha → 1.2.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +2 -2
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +49 -0
- data/bin/fred +18 -0
- data/bin/frprep +34 -0
- data/bin/rosy +17 -0
- data/lib/common/AbstractSynInterface.rb +35 -33
- data/lib/common/Mallet.rb +236 -0
- data/lib/common/Maxent.rb +26 -12
- data/lib/common/Parser.rb +5 -5
- data/lib/common/SynInterfaces.rb +13 -6
- data/lib/common/TabFormat.rb +7 -6
- data/lib/common/Tiger.rb +4 -4
- data/lib/common/Timbl.rb +144 -0
- data/lib/common/{FrprepHelper.rb → frprep_helper.rb} +14 -8
- data/lib/common/headz.rb +1 -1
- data/lib/common/ruby_class_extensions.rb +3 -3
- data/lib/fred/FredBOWContext.rb +14 -2
- data/lib/fred/FredDetermineTargets.rb +4 -9
- data/lib/fred/FredEval.rb +1 -1
- data/lib/fred/FredFeatureExtractors.rb +4 -3
- data/lib/fred/FredFeaturize.rb +1 -1
- data/lib/frprep/CollinsInterface.rb +6 -6
- data/lib/frprep/MiniparInterface.rb +5 -5
- data/lib/frprep/SleepyInterface.rb +7 -7
- data/lib/frprep/TntInterface.rb +1 -1
- data/lib/frprep/TreetaggerInterface.rb +29 -5
- data/lib/frprep/do_parses.rb +1 -0
- data/lib/frprep/frprep.rb +36 -32
- data/lib/{common/BerkeleyInterface.rb → frprep/interfaces/berkeley_interface.rb} +69 -95
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/opt_parser.rb +2 -2
- data/lib/rosy/AbstractFeatureAndExternal.rb +5 -3
- data/lib/rosy/RosyIterator.rb +11 -10
- data/lib/rosy/rosy.rb +1 -0
- data/lib/shalmaneser/version.rb +1 -1
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +2 -2
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +2 -2
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +1 -1
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +7 -7
- data/test/functional/test_frprep.rb +3 -3
- data/test/functional/test_rosy.rb +20 -0
- metadata +215 -224
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/CollinsInterface.rb +0 -1165
- data/lib/common/MiniparInterface.rb +0 -1388
- data/lib/common/SleepyInterface.rb +0 -384
- data/lib/common/TntInterface.rb +0 -44
- data/lib/common/TreetaggerInterface.rb +0 -303
- data/lib/frprep/AbstractSynInterface.rb +0 -1227
- data/lib/frprep/BerkeleyInterface.rb +0 -375
- data/lib/frprep/ConfigData.rb +0 -694
- data/lib/frprep/FixSynSemMapping.rb +0 -196
- data/lib/frprep/FrPrepConfigData.rb +0 -66
- data/lib/frprep/FrprepHelper.rb +0 -1324
- data/lib/frprep/ISO-8859-1.rb +0 -24
- data/lib/frprep/Parser.rb +0 -213
- data/lib/frprep/SalsaTigerRegXML.rb +0 -2347
- data/lib/frprep/SalsaTigerXMLHelper.rb +0 -99
- data/lib/frprep/SynInterfaces.rb +0 -275
- data/lib/frprep/TabFormat.rb +0 -720
- data/lib/frprep/Tiger.rb +0 -1448
- data/lib/frprep/Tree.rb +0 -61
- data/lib/frprep/headz.rb +0 -338
data/lib/frprep/ISO-8859-1.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# KE changed July 05: now no inclusion of modules required,
|
2
|
-
# and names changed from REXML.Encoding to UtfIso
|
3
|
-
|
4
|
-
# Conversion helpers between UTF-8 and ISO-8859-1 (Latin-1) strings.
module UtfIso
  # Convert from UTF-8 to ISO-8859-1.
  #
  # content: string holding UTF-8 encoded text
  # returns: string with one byte per retained character
  #
  # Code points above 0xFF have no ISO-8859-1 representation and are
  # dropped silently; the numeric-entity fallback (&#nnnn;, shared by
  # Stefan Scholl) was already disabled in the original code.
  def self.to_iso_8859_1(content)
    content.unpack('U*').select { |num| num <= 0xFF }.pack('C*')
  end

  # Convert to UTF-8.
  #
  # str: string whose bytes are ISO-8859-1 characters
  # returns: the same characters as a UTF-8 string
  def self.from_iso_8859_1(str)
    str.unpack('C*').pack('U*')
  end
end
|
data/lib/frprep/Parser.rb
DELETED
@@ -1,213 +0,0 @@
|
|
1
|
-
# Alexander Koller 2003
|
2
|
-
# extended Katrin Erk June 2003
|
3
|
-
#
|
4
|
-
# Classes that return a list of sentence DOMs, from various sources
|
5
|
-
#
|
6
|
-
# Each class in this file defines the following methods:
|
7
|
-
#
|
8
|
-
# initialize(...) "..." depends on the class
|
9
|
-
# extractDOMs() return list of all s nodes as DOM objects
|
10
|
-
# each_s() iterate over s nodes; may take less memory
|
11
|
-
|
12
|
-
|
13
|
-
require "rexml/document"
|
14
|
-
|
15
|
-
# Parses a complete corpus file into one REXML document and gives
# access to its sentence elements (/corpus/body/s).
class FileParser

  include REXML

  # filename: path of the corpus XML file.
  # The file is opened immediately (a missing file fails here), but it
  # is parsed only when the sentences are first requested.
  def initialize(filename)
    @file = File.new(filename)
    @doc = nil
  end

  # returns an array of DOMs for the sentences
  def extractDOMs
    ensureParsedDocument
    @doc.get_elements("/corpus/body/s")
  end

  # Iterates over all sentence nodes. This may be more memory
  # efficient than using extractDOMs(), but isn't in this case.
  def each_s
    extractDOMs.each { |dom| yield(dom) }
  end

  # Iterates over all sentence nodes. The block passed to this
  # method should return a DOM object as a value. After the iteration
  # has been completed, the contents of /corpus/body are replaced
  # by the list of these results.
  # Note that this changes the document held by the FileParser object.
  #
  # returns: the modified document
  def process_s!
    newBody = Element.new('body')
    each_s { |dom| newBody.add_element(yield(dom)) }

    @doc.delete_element("/corpus/body")
    @doc.elements["corpus"].add_element(newBody)

    @doc
  end

  private

  # Parses the file on first use. The file handle is closed as soon as
  # the document has been built (or parsing failed), so it no longer
  # stays open for the lifetime of the parser object.
  def ensureParsedDocument
    return if @doc

    begin
      @doc = Document.new(@file)
    ensure
      @file.close
    end
  end

end
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
#####################################################################
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
# Reads a corpus file sentence by sentence instead of parsing the
# whole document at once.
class FilePartsParser
  # @file = File object for the corpus
  # @head = string up to the first <s> tag
  # @tail = string after the last </s> tag
  # @rest = string starting with the latest <s> tag (complete this to
  #         a <s>...</s> structure by reading up to next </s> tag)
  # @readCompletely = boolean specifying whether there's still something
  #         left to read in the file

  attr_reader :head, :tail

  # filename: path of the corpus XML file.
  # Reads up to the first <s ...> tag: everything before it goes to
  # @head, the remainder of that line to @rest. If </body> is met
  # first, the corpus is empty and the remaining file goes to @tail.
  def initialize(filename)
    @file = File.new(filename)
    @readCompletely = false
    # read stuff into @head and initialize @rest
    @head = ''
    begin
      loop do
        line = @file.readline
        # non-greedy prefix: stop at the FIRST <s ...> tag of the line,
        # so that further sentences on the same line are not put into @head
        if (m = line.match(/(.*?)(<s\s.*)/))
          @head << m[1]
          @rest = m[2]
          break
        elsif (m = line.match(/^(.*)(<\/body[\s>].*)$/))
          # empty corpus: the tail is the rest of this line plus the
          # rest of the file, taken verbatim
          @head << m[1]
          @tail = m[2] + m.post_match + @file.read
          @readCompletely = true
          break
        else
          @head << line
        end
      end
    rescue EOFError
      @readCompletely = true
    end
  end

  # Closes the underlying corpus file.
  def close
    @file.close
  end

  # returns: array with the root element of every remaining sentence
  def extractDOMs
    allDOMs = Array.new

    process_s! { |dom|
      allDOMs.push(dom)
      REXML::Element.new("x")
    }
    allDOMs
  end

  # Yields the root element of every remaining sentence.
  # returns: what process_s! returns for the placeholder elements
  def each_s
    process_s! { |dom|
      yield(dom)
      REXML::Element.new("x")
    }
  end

  # This function returns the string for the modified corpus
  # sentences. It is much more memory (and probably time) efficient
  # than FileParser#process_s!.
  # The block that is called by the method is given an element
  # as its argument and is expected to return a changed element.
  #
  # returns: concatenated serialisation of the elements returned by the
  #          block, or nil if the file has been read completely before
  def process_s!
    return if @readCompletely

    ret = ''
    scan_s { |element|
      # Process the <s> ... </s> element
      elt = REXML::Document.new(element).root
      changedElt = yield(elt)

      changedEltAsString = ''
      changedElt.write(changedEltAsString, 0)
      ret << changedEltAsString
    }

    ret
  end

  # KE 12.6.03: scan_s :
  # doesn't parse a sentence before yielding it
  # doesn't allow for any changes
  # but otherwise the same as process_s!
  #
  # Yields each remaining sentence as an unparsed string. When the end
  # of the file is reached, the unread remainder becomes @tail.
  def scan_s
    return if @readCompletely

    begin
      loop do
        # Invariant: At this point, @rest always starts with an
        # unseen <s> tag.

        # First, we continue reading until we find the closing </s>.
        # No exception should occur in this loop if we're parsing
        # a valid XML document.
        # Non-greedy: cut after the FIRST </s>, so that two sentences
        # on one line are not handed out as a single element.
        until (m = @rest.match(/\A(.*?<\/s>)(.*)\z/m))
          @rest << @file.readline
        end

        element = m[1]
        @rest = m[2]

        yield(element) # element not parsed!

        # Read on up to the next <s>; the leftmost match is the next
        # sentence, anything before it is skipped
        until (m = @rest.match(/<s\s.*/m))
          @rest << @file.readline
        end

        @rest = m[0]
      end
    rescue EOFError
      @tail = @rest
      @readCompletely = true
    end
  end

  # KE 5.11.03: get_rest: read all of the file not processed up to this point
  # and return it as a string
  def get_rest
    begin
      loop { @rest << @file.readline }
    rescue EOFError
      @readCompletely = true
    end
    @rest
  end
end
|
@@ -1,2347 +0,0 @@
|
|
1
|
-
# SalsaTigerRegXML.rb
|
2
|
-
#
|
3
|
-
# Katrin Erk, June 2005
|
4
|
-
#
|
5
|
-
# Classes for accessing and managing
|
6
|
-
# SalsaTigerXML sentences
|
7
|
-
#
|
8
|
-
# The interface of the classes in this package
|
9
|
-
# is similar to that of SalsaTigerXML.rb
|
10
|
-
# but the package is based solely on regular expressions
|
11
|
-
# and not on REXML.
|
12
|
-
#
|
13
|
-
# Main class here: SalsaTigerSentence, keeps a complete sentence
|
14
|
-
#
|
15
|
-
# Nodes of the syntactic tree, frames and frame elements are all
|
16
|
-
# handed around as XMLNode objects, or more specifically
|
17
|
-
# SynNode, FrameNode and FeNode objects, respectively.
|
18
|
-
#
|
19
|
-
# Inheritance between classes in here:
|
20
|
-
#
|
21
|
-
# GraphNode
|
22
|
-
# |
|
23
|
-
# XMLNode
|
24
|
-
# |
|
25
|
-
# SalsaTigerXmlNode
|
26
|
-
# / \
|
27
|
-
# SynNode SemNode
|
28
|
-
# | / \
|
29
|
-
# TSSynNode FrameNode FeNode
|
30
|
-
#
|
31
|
-
#
|
32
|
-
# SalsaTigerSentence uses the other classes, but is separate
|
33
|
-
#
|
34
|
-
# SalsaTigerSentence does _not_ yield a faithful image of the SalsaTiger XML structure of
|
35
|
-
# a sentence. With the SalsaTiger XML structure you need to follow "idref" attributes
|
36
|
-
# to the elements with matching "id" attributes in other parts of the structure.
|
37
|
-
# With the classes in this package, you don't.
|
38
|
-
# Wherever in SalsaTiger XML you have an idref, you will have _direct access to the
|
39
|
-
# object_ here.
|
40
|
-
#
|
41
|
-
# Suppose that in the XML structure you have a nonterminal element X with <edge> elements
|
42
|
-
# pointing to other (terminal or nonterminal) elements X1,.., Xn. Then you'll have
|
43
|
-
# a SynNode object N that contains X as its XML object, and the children N1,..,Nn of N
|
44
|
-
# will be SynNode objects that contain X1,..,Xn as their XML objects.
|
45
|
-
#
|
46
|
-
# A SynNode that is a terminal may have children too: its splitword parts (if any).
|
47
|
-
#
|
48
|
-
# So: a syntactic node is a SynNode object, its children are SynNode objects. The edges
|
49
|
-
# to its children are labeled the same way as in the XML structure. If the children
|
50
|
-
# are splitword parts, the edges are unlabeled.
|
51
|
-
#
|
52
|
-
# A frame is a FrameNode object, its children are FeNode objects. The edges to its children
|
53
|
-
# are labeled with the FE name or with "target".
|
54
|
-
#
|
55
|
-
# A frame element is an FeNode object, its children are SynNode objects. The edges to its
|
56
|
-
# children are unlabeled.
|
57
|
-
#
|
58
|
-
# A frame underspecification is an UspNode object, its children are FrameNode objects.
|
59
|
-
# The edges to its children are unlabeled.
|
60
|
-
#
|
61
|
-
# A frame element underspecification is a UspNode object; its children are
|
62
|
-
# FeNode objects. The edges to its children are unlabeled.
|
63
|
-
|
64
|
-
require "frprep/Tree"
|
65
|
-
require "frprep/STXmlTerminalOrder"
|
66
|
-
require "frprep/RegXML"
|
67
|
-
require "frprep/ruby_class_extensions"
|
68
|
-
|
69
|
-
#############
|
70
|
-
# class XMLNode
|
71
|
-
#
|
72
|
-
# node with entries pointing to its children
|
73
|
-
# as well as its parent.
|
74
|
-
# all edges may be labeled.
|
75
|
-
# each node has a unique ID.
|
76
|
-
#
|
77
|
-
# indexes a string with XML data representing the same node,
|
78
|
-
# but does not look into it, just keeps it
|
79
|
-
#
|
80
|
-
# methods:
|
81
|
-
# This class inherits from TreeNode and GraphNode.
|
82
|
-
# See Tree.rb and Graph.rb for the methods they offer.
|
83
|
-
#
|
84
|
-
# new initializes the object
|
85
|
-
#
|
86
|
-
# get returns the XML object representing
|
87
|
-
# the same node as this node object
|
88
|
-
#
|
89
|
-
|
90
|
-
# Tree node that represents either an XML element (name, attribute
# hash, children) or a piece of text, and that can write itself out as
# an XML string. Name, attributes and the text flag are kept through
# the set_f/get_f feature accessors, which are not defined in this
# class (presumably inherited from TreeNode -- confirm in Tree.rb).
class XMLNode < TreeNode

  ###
  # name:      string: element name; or, for text, the whole text
  # attribute: hash: attr_name(string) -> attr_value(string), or nil
  # id:        string: node ID, or nil to have one generated
  # i_am_text: boolean: set to anything but false or nil
  #            to represent not an xml element but text
  #
  # raises RuntimeError if a text node is given attributes
  def initialize(name, attribute, id, i_am_text = false)
    # I wasn't given any ID: take system time for an ID.
    # Use to_f to get fractions of seconds too:
    # if I make several nodes in the same second,
    # they should still have unique IDs
    id = Time.new.to_f.to_s if id.nil?

    super(id)

    # remember values for this element
    set_f("name", name)
    set_f("attributes", attribute)
    set_f("i_am_text", i_am_text)

    # sanity check on the +attribute+ parameter (the previous code
    # referred to an undefined name "attributes" here)
    if i_am_text && attribute
      raise "A text element cannot have attributes"
    end

    # additional XML objects that are written out after the children
    @kith = Array.new
  end

  ###
  # add sanity check:
  # if this is text rather than an xml element,
  # it cannot have children
  def add_child(child, edgelabel, varhash={})
    if get_f("i_am_text")
      raise "A text element cannot have children"
    end
    super(child, edgelabel, varhash)
  end

  ###
  # remember an XML object to be written out verbatim (via to_s)
  # inside this element
  def add_kith(xml) # RegXML object
    @kith << xml
  end

  ###
  # set attribute; the value must be a String
  def set_attribute(name, value)
    unless value.class == String
      raise "I can only set attribute values to strings. Got: #{value.class.to_s}"
    end

    if get_f("attributes").nil?
      set_f("attributes", Hash.new)
    end
    get_f("attributes")[name] = value
  end

  ###
  # returns the value of the attribute, nil if there is none
  def get_attribute(name)
    attrs = get_f("attributes")
    attrs ? attrs[name] : nil
  end

  ###
  # delete attribute
  def del_attribute(name)
    if get_f("attributes")
      get_f("attributes").delete(name)
    end
  end

  ###
  # return XML as string:
  # If this is a text, just return the text
  # which is stored in "name"
  # If this is an XML element,
  # make a tag from its name and attributes,
  # then add tags for all its children,
  # then add an end tag.
  def get
    # text rather than XML element
    return get_f("name") if get_f("i_am_text")

    # XML element, not text
    string = "<" + get_f("name")
    if get_f("attributes")
      string << get_f("attributes").to_a.map { |name, value|
        " " + name + "='" + xml_secure_val(value) + "'"
      }.join
    end
    string << ">\n"
    string << get_xml_embedded
    string << "</#{get_f("name")}>\n"
    string
  end

  #############
  protected

  # everything that goes between start tag and end tag
  def get_xml_embedded
    get_xml_ofchildren + get_xml_ofkith
  end

  # XML of all children, in order
  def get_xml_ofchildren
    children.map { |child| child.get }.join
  end

  # XML of the additional objects, one per line
  def get_xml_ofkith
    @kith.map { |thing| thing.to_s + "\n" }.join
  end

  ###
  def warn_child_ignored(where, xml_node)
    $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
    $stderr.puts "\t" + xml_node.to_s
  end

  ###
  # Makes a value safe for use inside a single-quoted XML attribute:
  # an apostrophe becomes the entity &apos;, and a double quote
  # becomes two such entities (it is written as a pair of apostrophes,
  # as before, but now escaped so the attribute stays well-formed).
  def xml_secure_val(value) # string: value of an attribute
    value.gsub("'", "&apos;").gsub('"', "&apos;&apos;")
  end
end
|
229
|
-
|
230
|
-
#############
|
231
|
-
# class SalsaTigerXmlNode
|
232
|
-
#
|
233
|
-
# additional methods:
|
234
|
-
#
|
235
|
-
# is_terminal? true if this is a Tiger XML terminal node
|
236
|
-
#
|
237
|
-
# is_nonterminal? true if this is a Tiger XML nonterminal node
|
238
|
-
#
|
239
|
-
# is_splitword? true if this is a splitword part
|
240
|
-
#
|
241
|
-
# is_syntactic? true for terminal, nonterminal, splitword
|
242
|
-
#
|
243
|
-
# is_frame? true if this is a Salsa/Tiger XML frame
|
244
|
-
#
|
245
|
-
# is_target? true if this is a Salsa/Tiger XML frame target
|
246
|
-
#
|
247
|
-
# is_fe? true if this is a Salsa/Tiger XML frame element
|
248
|
-
#
|
249
|
-
# is_outside_sentence? returns false -- this node is not a placeholder for
|
250
|
-
# a node that is outside the current sentence
|
251
|
-
# (but see descendant class TSSynNode)
|
252
|
-
#
|
253
|
-
# yield_nodes returns the list of descendants that are leaves of the tree
|
254
|
-
# NOTE: this overwrites the Graph.yield_nodes method
|
255
|
-
# since we have to treat splitwords in a special way
|
256
|
-
# empty array if no yield nodes are present
|
257
|
-
#
|
258
|
-
# yield_nodes_ordered returns those descendants ordered by precedence
|
259
|
-
# in the sentence, i.e. their node IDs.
|
260
|
-
#
|
261
|
-
# sid returns the sentence ID of this node
|
262
|
-
#
|
263
|
-
# to_s returns the yield of this node as a string of space-separated words
|
264
|
-
# words ordered left to right
|
265
|
-
#
|
266
|
-
# XMLNode specialised for Salsa/Tiger XML: built from a RegXML object
# and offering predicates on the element name plus access to the
# leaves ("yield") below the node.
class SalsaTigerXmlNode < XMLNode
  include StringTerminalsInRightOrder

  ###
  # Extracts the ID from a RegXML element: elements that refer to
  # another node carry it in "idref", all others in "id".
  #
  # returns: a string, the ID, or nil if none was found
  def SalsaTigerXmlNode.xmlel_id(xml_obj) # RegXML object
    referring = ["edge", "fenode", "uspitem", "splitword", "other_edge"]
    key = referring.include?(xml_obj.name) ? "idref" : "id"
    xml_obj.attributes()[key]
  end

  ###
  # xml: RegXML object or text
  def initialize(xml)
    if xml.text?
      # text: the object itself is stored as the "name"
      super(xml, nil, nil, true)
    else
      # xml element
      node_id = SalsaTigerXmlNode.xmlel_id(xml)
      super(xml.name(), xml.attributes(), node_id, false)
    end
  end

  ### true for a Tiger XML terminal node
  def is_terminal?
    get_f("name") == "t"
  end

  ### true for a Tiger XML nonterminal node
  def is_nonterminal?
    get_f("name") == "nt"
  end

  ### true for a splitword part
  def is_splitword?
    get_f("name") == "part"
  end

  ### true for terminal, nonterminal or splitword part
  def is_syntactic?
    return true if is_terminal? || is_nonterminal? || is_splitword?

    false
  end

  ### true for a Salsa/Tiger XML frame
  def is_frame?
    get_f("name") == "frame"
  end

  ### true for a Salsa/Tiger XML frame target
  def is_target?
    get_f("name") == "target"
  end

  ### true for a Salsa/Tiger XML frame element
  def is_fe?
    get_f("name") == "fe"
  end

  ###
  # Sentence ID: the part of the node ID before the first underscore,
  # nil if the ID contains no underscore.
  def sid
    id[/^(.*?)_/, 1]
  end

  ### this node is not a placeholder for a node of another sentence
  def is_outside_sentence?
    false
  end

  ###
  # Leaves below this node; the node itself if it has none.
  # special consideration: splitwords do not count as children!
  def yield_nodes
    proper = children.reject { |c| c.is_splitword? }
    return [self] if proper.empty?

    proper.inject([]) do |leaves, c|
      if c.children.reject { |gc| gc.is_splitword? }.empty?
        leaves << c
      else
        leaves.concat(c.yield_nodes)
      end
    end
  end

  ###
  # legacy name
  # The sorting helper cannot deal with nonterminals, so they are
  # taken out first and attached to the end of the sorted chain.
  def yield_nodes_ordered
    words, others = yield_nodes.distribute { |x| x.is_terminal? || x.is_splitword? }
    sort_terminals_and_splitwords_left_to_right(words).concat(others)
  end

  ###
  # name parallel to the method of SalsaTigerSentence
  def terminals_sorted
    yield_nodes_ordered
  end

  ###
  def to_s
    string_for_node(self)
  end
end
|
387
|
-
|
388
|
-
#############
|
389
|
-
# class SynNode
|
390
|
-
#
|
391
|
-
# inherits from SalsaTigerXmlNode,
|
392
|
-
# adds to it methods specific to nodes
|
393
|
-
# that describe the syntactic structure
|
394
|
-
#
|
395
|
-
# additional/changed methods:
|
396
|
-
#
|
397
|
-
# part_of_speech part_of_speech information as a string,
|
398
|
-
# nil for anything but terminal nodes
|
399
|
-
#
|
400
|
-
# word word information for this node as a string,
|
401
|
-
# nil for anything but terminal nodes
|
402
|
-
#
|
403
|
-
# category category information for this node as a string,
|
404
|
-
# nil for anything but nonterminal nodes
|
405
|
-
#
|
406
|
-
# is_punct? true if this is a terminal node and it is a punctuation sign
|
407
|
-
#
|
408
|
-
# get_sem returns the semantic nodes (FeNode objects) linked to this syntactic node by non-tree edges
|
409
|
-
# Idea: this is basically the inverse of the edge pointing from
|
410
|
-
# the FeNode to this SynNode, so you can fetch a node's semantics directly
|
411
|
-
#
|
412
|
-
# add_sem add non-tree edge from this syntactic node to a FeNode
|
413
|
-
|
414
|
-
# Node of the syntactic structure. Besides its tree children it keeps
# two kinds of non-tree edges: links to other syntactic nodes and
# links to the FeNode objects that point at it.
class SynNode < SalsaTigerXmlNode

  ###
  def initialize(xml)
    super(xml)

    @sem = []
    @other_links = []
  end

  ###
  # other_node: SynNode
  # link_label: string: edge label
  # attributes: hash string>string: further attribute-value pairs for the edge
  def add_link(other_node, link_label, attributes = {})
    @other_links << [link_label, other_node, attributes]
  end

  ###
  # label: string/nil: if string, use only links with this link_label
  # returns: array of [label, node, attributes] triples
  def get_linked(label = nil)
    return @other_links unless label

    @other_links.select { |triple| triple.first == label }
  end

  ### "pos" attribute without surrounding whitespace, nil if not set
  def part_of_speech
    pos = get_attribute("pos")
    pos ? pos.strip : nil
  end

  ### "cat" attribute without surrounding whitespace, nil if not set
  def category
    cat = get_attribute("cat")
    cat ? cat.strip : nil
  end

  ### "word" attribute without surrounding whitespace, nil if not set
  def word
    form = get_attribute("word")
    form ? form.strip : nil
  end

  ###
  # true if part of speech or word form mark this node as a
  # punctuation sign; only terminals can be punctuation signs
  def is_punct?
    return false if is_nonterminal?

    # first check part of speech
    # this works at least for TIGER corpus annotation
    pos = part_of_speech
    return true if ['$.', '$,', '$('].include?(pos)
    return true if pos =~ /^PUNC/

    # no luck with part of speech: check word against the known
    # punctuation signs
    signs = [".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"]
    return true if signs.include?(word)

    # not a punctuation sign by any of the tests we have applied
    false
  end

  ### the word for a terminal, otherwise whatever the superclass gives
  def to_s
    is_terminal? ? word : super()
  end

  ### copy of the list of semantic nodes attached to this node
  def get_sem
    @sem.clone
  end

  ###
  # fe_node must be exactly of class FeNode
  def add_sem(fe_node)
    if fe_node.class != FeNode
      raise "Unexpected class of semantic node: was expecting an FeNode"
    end

    @sem << fe_node
  end

  #############
  protected

  # <edge> elements for the tree children, followed by <other_edge>
  # elements for the additional links
  def get_xml_ofchildren
    xml = ""

    each_child_with_edgelabel do |label, child|
      # terminal or nonterminal children only:
      # splitwords are handled separately in the "sem" part of the sentence
      next if child.is_splitword?

      shown = label ? xml_secure_val(label) : "-"
      xml << "<edge label='#{shown}' idref='#{xml_secure_val(child.id)}'/>\n"
    end

    @other_links.each do |label, node, attributes|
      shown = label ? xml_secure_val(label) : "-"
      xml << "<other_edge label='#{shown}'"
      xml << " idref='#{xml_secure_val(node.id)}'"
      if attributes
        pairs = attributes.to_a.map { |attr, val| "#{xml_secure_val(attr)}='#{xml_secure_val(val)}'" }
        xml << " " + pairs.join(" ")
      end
      xml << "/>\n"
    end

    xml
  end
end
|
554
|
-
|
555
|
-
#############
|
556
|
-
# class TSSynNode
|
557
|
-
#
|
558
|
-
# inherits from SynNode
|
559
|
-
#
|
560
|
-
# describes a syntactic node that isn't really there:
|
561
|
-
# a reference to a node in another sentence
|
562
|
-
#
|
563
|
-
# contains that node's ID, but an empty RegXML object,
|
564
|
-
# its string is "<unknown>", and you cannot add
|
565
|
-
# a child to it
|
566
|
-
#
|
567
|
-
# new or changed methods:
|
568
|
-
#-----------------------
|
569
|
-
#
|
570
|
-
# is_outside_sentence? returns true
|
571
|
-
#
|
572
|
-
# word returns "<unknown>"
|
573
|
-
#
|
574
|
-
# add_child raises an error
|
575
|
-
|
576
|
-
# Placeholder for a syntactic node that lives in another sentence:
# it only knows the ID of that node.
class TSSynNode < SynNode

  ###
  # id_string: ID of the node in the other sentence; it is wrapped in
  # an otherwise empty OTHER_SENTENCE element
  def initialize(id_string)
    super(RegXML.new("<OTHER_SENTENCE id='" + id_string + "'/>"))
  end

  ###
  def is_outside_sentence?
    true
  end

  ###
  # word of this node: <unknown>
  def word
    "<unknown>"
  end

  ###
  # Children cannot be added to a placeholder node.
  # The optional third parameter mirrors XMLNode#add_child, so that a
  # caller passing an edge attribute hash gets this error as well
  # rather than an ArgumentError about the number of arguments.
  def add_child(arg1, arg2, varhash = {})
    raise "Not implemented for this class"
  end
end
|
598
|
-
|
599
|
-
#############
|
600
|
-
# class SemNode
|
601
|
-
#
|
602
|
-
# common superclass for FrameNode and FeNode,
|
603
|
-
# with methods that are the same for both:
|
604
|
-
#
|
605
|
-
#
|
606
|
-
# is_usp? returns true if the frame/FE is involved in underspecification,
|
607
|
-
# else false
|
608
|
-
#
|
609
|
-
# flags returns an array of all the frame/FE flags for this node.
|
610
|
-
# members of the array are strings describing the flags
|
611
|
-
# that have been set to true
|
612
|
-
#
|
613
|
-
# add_flag add or remove a frame/FE flag
|
614
|
-
# remove_flag
|
615
|
-
|
616
|
-
# Shared base of the semantic node classes: adds the
# underspecification test and a list of flag names that is written
# out as <flag> elements.
class SemNode < SalsaTigerXmlNode
  attr_reader :flags

  ###
  # xml: RegXML object or text
  def initialize(xml)
    super(xml)
    # flags: array of flag names, as handed to add_flag
    @flags = []
  end

  ### true if the "usp" attribute is "yes"
  def is_usp?
    usp = get_attribute("usp")
    usp == "yes"
  end

  ###
  # name: string: flag name
  def add_flag(name)
    @flags.push(name)
  end

  ###
  # name: string: flag name
  def remove_flag(name)
    @flags.delete(name)
  end

  #############
  protected

  # embedded XML of the superclass, followed by the flags
  def get_xml_embedded
    embedded = super()
    embedded + get_xml_offlags
  end

  # one <flag> element per flag name
  def get_xml_offlags
    lines = @flags.map do |flagname|
      "<flag name='#{xml_secure_val(flagname)}'/>\n"
    end
    lines.join
  end
end
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
#############
|
658
|
-
# class FrameNode
|
659
|
-
#
|
660
|
-
# inherits from SemNode
|
661
|
-
# adds to it methods specific to nodes
|
662
|
-
# that describe a frame
|
663
|
-
#
|
664
|
-
# additional/changed methods:
|
665
|
-
#
|
666
|
-
# name returns the name of the frame
|
667
|
-
# set_name changes the name of the frame to a new name
|
668
|
-
# target returns the target (as a FeNode object)
|
669
|
-
#
|
670
|
-
# each_child() iterates through FEs, children() returns all FEs
|
671
|
-
#
|
672
|
-
# each_fe_by_name A frame node may have several FE children with the same
|
673
|
-
# frame element label. While each_child returns them separately,
|
674
|
-
# each_fe_by_name lumps FE children with the same frame element label
|
675
|
-
# into one FeNode.
|
676
|
-
# Warnings:
|
677
|
-
# - the REXML object of the FeNode is that of the first FE child
|
678
|
-
# with that frame element label.
|
679
|
-
# - Underspecification is ignored! If you have the same FE twice,
|
680
|
-
# and there is underspecification regarding the extent of the FE,
|
681
|
-
# the two FE children will be lumped together anyway.
|
682
|
-
# If you don't want that, use each_child instead.
|
683
|
-
#
|
684
|
-
#
|
685
|
-
# add_fe CAUTION: please do not call this method directly externally,
|
686
|
-
# use SalsaTigerSentence.add_fe, otherwise the node and its ID
|
687
|
-
# will not be recorded in the node list and the node cannot be retrieved
|
688
|
-
# via its ID
|
689
|
-
|
690
|
-
class FrameNode < SemNode
|
691
|
-
|
692
|
-
###
|
693
|
-
# Returns the single child attached with edge label "target".
#
# If there is no such child, a warning listing the existing child labels
# is written to $stderr and nil is returned. If there is more than one,
# a RuntimeError is raised.
def target()
  candidates = children_by_edgelabels(["target"])

  if candidates.empty?
    $stderr.puts "SalsaTigerRegXML warning: Frame #{id()}: No target, but I got: \n" + child_labels().join(", ")
    return nil
  end

  if candidates.length != 1
    raise "target: more than one target to frame "+id()
  end

  candidates.first
end
|
705
|
-
|
706
|
-
###
|
707
|
-
# Returns the value of this node's "name" attribute (the frame name).
def name
  get_attribute("name")
end
|
710
|
-
|
711
|
-
###
|
712
|
-
# Stores +new_name+ as this node's "name" attribute and returns the
# result of set_attribute.
def set_name(new_name)
  attribute_key = "name"
  set_attribute(attribute_key, new_name)
end
|
715
|
-
|
716
|
-
###
|
717
|
-
# each_fe: synonym for each_child
|
718
|
-
# Yields every child of this node, in the order each_child produces them.
def each_fe()
  each_child do |fe|
    yield fe
  end
end
|
721
|
-
|
722
|
-
###
|
723
|
-
# fes: synonym for children
|
724
|
-
# Returns the result of children(), i.e. all children of this node.
def fes()
  children
end
|
727
|
-
|
728
|
-
###
|
729
|
-
# Yields one node per distinct child edge label, skipping "target".
#
# When exactly one child carries a label, that child is yielded as is.
# Otherwise a fresh FeNode (ID: this node's ID + "_" + label) is built,
# the children of all same-labelled children are added to it, and the
# combined node is yielded.
def each_fe_by_name()
  child_labels.uniq.each do |fe_name|
    next if fe_name == "target"

    same_label = children_by_edgelabels([fe_name])

    if same_label.length != 1
      # several frame elements share this label: lump them together
      combined_fe = FeNode.new(fe_name, id() + "_" + fe_name)
      same_label.each do |fe|
        fe.each_child() do |child|
          combined_fe.add_child(child)
        end
      end
      yield combined_fe
    else
      # exactly one frame element with this label
      yield same_label.first
    end
  end
end
|
754
|
-
|
755
|
-
###
|
756
|
-
# Adds +fe_node+ as a child, using the node's name as edge label.
#
# Raises a RuntimeError (after reporting on $stderr) when +fe_node+ is
# named "target" and a child with edge label "target" already exists.
def add_child(fe_node)
  if fe_node.name == "target"
    existing_targets = children_by_edgelabels(["target"])
    unless existing_targets.empty?
      $stderr.puts "Adding second target to frame #{id()}"
      $stderr.puts "I already have: " + existing_targets.map { |t| t.id() }.join(",")
      raise "More than one target."
    end
  end

  super(fe_node, fe_node.name)
end
|
765
|
-
|
766
|
-
###
|
767
|
-
# Removes +fe_node+ from this node's children; the node's name is handed
# to the superclass as the edge label.
def remove_child(fe_node)
  edge_label = fe_node.name
  super(fe_node, edge_label)
end
|
770
|
-
|
771
|
-
###
|
772
|
-
# Creates a new FeNode, attaches it to this node via add_child, and
# adds every entry of +syn_nodes+ as a child of the new node.
#
# fe_name::   string, name of the FE to add
# syn_nodes:: array of syntactic nodes the FE should point to
# fe_id::     string ID for the new FE; when not given, one is built
#             from this node's ID, "_fe" and the current time
#
# Raises a RuntimeError when +fe_name+ is "target" and a child with edge
# label "target" already exists. Returns the new FeNode.
def add_fe(fe_name, syn_nodes, fe_id = nil)
  if fe_name == "target"
    existing_targets = children_by_edgelabels(["target"])
    unless existing_targets.empty?
      $stderr.puts "Adding second target to frame #{id()}"
      $stderr.puts "I already have: " + existing_targets.map { |t| t.id() }.join(",")
      raise "More than one target."
    end
  end

  # no FE ID given: make one from the frame ID and the system time
  fe_id ||= id() + "_fe" + Time.new().to_f.to_s

  new_fe = FeNode.new(fe_name, fe_id)
  add_child(new_fe)

  syn_nodes.each do |syn_node|
    new_fe.add_child(syn_node)
  end

  new_fe
end
|
798
|
-
end
|
799
|
-
|
800
|
-
#############
|
801
|
-
# class FeNode
|
802
|
-
#
|
803
|
-
# inherits from SemNode,
|
804
|
-
# adds to it methods specific to nodes
|
805
|
-
# that describe a frame element or target
|
806
|
-
#
|
807
|
-
# additional/changed methods:
|
808
|
-
#----------------------------
|
809
|
-
#
|
810
|
-
# name returns the name of the frame element, or "target"
|
811
|
-
#
|
812
|
-
# add_child, remove_child
|
813
|
-
|
814
|
-
class FeNode < SemNode
|
815
|
-
|
816
|
-
###
|
817
|
-
# Builds an FE/target node either from a name or from an XML object.
#
# name_or_xml:: a String (FE name, or "target") or a RegXML object;
#               the decision is made on the exact class name
# id_if_name::  ID to put into the generated element when a name was given
#
# Records in @i_am_target whether this node is a target, and starts with
# an empty @child_attr hash (syn node ID -> extra <fenode> attributes).
# Raises a RuntimeError for any other argument class.
def initialize(name_or_xml, id_if_name = nil)
  arg_class = name_or_xml.class.to_s

  if arg_class == "String"
    is_target = (name_or_xml == "target")
    markup =
      if is_target
        "<target id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"
      else
        "<fe name=\'#{xml_secure_val(name_or_xml)}\' id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"
      end
    super(RegXML.new(markup))
    @i_am_target = is_target
  elsif arg_class == "RegXML"
    super(name_or_xml)
    @i_am_target = (name_or_xml.name() == "target")
  else
    raise "Shouldn't be here: " + name_or_xml.class.to_s
  end

  # extra attributes of <fenode> elements, keyed by syn node ID
  @child_attr = Hash.new()
end
|
847
|
-
|
848
|
-
###
|
849
|
-
# Returns "target" when this node was marked as a target at
# construction time, otherwise the value of its "name" attribute.
def name
  return "target" if @i_am_target

  get_attribute("name")
end
|
856
|
-
|
857
|
-
###
|
858
|
-
# Adds +syn_node+ as a child of this node, passing the superclass the
# "pointer_insteadof_edge" option.
#
# xml_obj:: optional XML element describing the link. Its attributes,
#           minus "idref", are remembered in @child_attr under the
#           child's ID when any remain. Note that "idref" is deleted
#           from the hash returned by xml_obj.attributes.
def add_child(syn_node, xml_obj = nil)
  if xml_obj
    extra_attrs = xml_obj.attributes
    # the idref is recoverable from the child's own ID
    extra_attrs.delete("idref")
    @child_attr[syn_node.id] = extra_attrs unless extra_attrs.empty?
  end

  super(syn_node, nil, "pointer_insteadof_edge" => true)
end
|
874
|
-
|
875
|
-
###
|
876
|
-
# Removes +syn_node+ from this node's children, passing the superclass
# the "pointer_insteadof_edge" option. +varhash+ is accepted but unused.
def remove_child(syn_node, varhash={})
  super(syn_node, nil, "pointer_insteadof_edge" => true)
end
|
879
|
-
|
880
|
-
#############
|
881
|
-
protected
|
882
|
-
|
883
|
-
# Serializes the children as <fenode idref='...'/> lines.
#
# When @child_attr holds extra attributes for a child's ID, they are
# appended to that child's element as name='value' pairs. All values
# go through xml_secure_val. Returns the concatenated string.
def get_xml_ofchildren()
  lines = children.map do |child|
    element = "<fenode idref=\'#{xml_secure_val(child.id())}\'"

    extra_attrs = @child_attr[child.id()]
    if extra_attrs
      extra_attrs.to_a.each do |attr, val|
        element += " #{attr}=\'#{xml_secure_val(val)}\'"
      end
    end

    element + "/>\n"
  end

  lines.join()
end
|
897
|
-
end
|
898
|
-
|
899
|
-
#############
|
900
|
-
# class UspNode
|
901
|
-
#
|
902
|
-
# inherits from SalsaTigerXmlNode,
|
903
|
-
# adds to it methods specific to nodes
|
904
|
-
# that describe a frame underspecification or frame element underspecification
|
905
|
-
#
|
906
|
-
# additional/changed methods:
|
907
|
-
#----------------------------
|
908
|
-
#
|
909
|
-
# new initializes the object
|
910
|
-
# rexml_object: underlying XML object for this node
|
911
|
-
# frame_or_fe: string, either "frame" for frame underspecification
|
912
|
-
# or "fe" for frame element underspecification
|
913
|
-
#
|
914
|
-
# add_child, remove_child add, remove underspecification entry
|
915
|
-
|
916
|
-
class UspNode < SalsaTigerXmlNode
|
917
|
-
|
918
|
-
attr_reader :i_am
|
919
|
-
|
920
|
-
###
|
921
|
-
# Initializes the node from +xml_obj+ and records which kind of
# underspecification it describes.
#
# xml_obj::     XML object handed to the superclass constructor
# frame_or_fe:: "frame" or "fe"; stored in @i_am.
#               Any other value raises a RuntimeError.
def initialize(xml_obj, frame_or_fe)
  super(xml_obj)

  if frame_or_fe == "frame"
    @i_am = "frame"
  elsif frame_or_fe == "fe"
    @i_am = "fe"
  else
    raise "new: neither frame nor fe??"
  end
end
|
934
|
-
|
935
|
-
###
|
936
|
-
# Adds +node+ as an underspecification entry (superclass call with the
# "pointer_insteadof_edge" option) and sets its "usp" attribute to "yes".
#
# Raises a RuntimeError when +node+ is nil/false. +varhash+ is unused.
def add_child(node, varhash={})
  raise "Got nil for a node." unless node

  super(node, nil, "pointer_insteadof_edge" => true)

  # mark the child as being involved in underspecification
  node.set_attribute("usp", "yes")
end
|
946
|
-
|
947
|
-
###
|
948
|
-
# Removes +node+ from this underspecification block (superclass call
# with the "pointer_insteadof_edge" option), then deletes the node's
# "usp" attribute.
#
# Deleting the attribute is wrong if the node takes part in more than
# one underspecification block, hence the warning on $stderr.
# +varhash+ is accepted but unused.
def remove_child(node, varhash={})
  super(node, nil, "pointer_insteadof_edge" => true)

  $stderr.puts "Warning: unsafe removal of attribute 'usp'"
  node.del_attribute("usp")
end
|
958
|
-
|
959
|
-
#############
|
960
|
-
protected
|
961
|
-
|
962
|
-
# Serializes each child as a "<uspitem idref='...'/>" line (the ID goes
# through xml_secure_val) and returns the concatenated string.
def get_xml_ofchildren()
  item_lines = children.map do |child|
    "<uspitem idref=\'#{xml_secure_val(child.id)}\'/>\n"
  end
  item_lines.join()
end
|
967
|
-
|
968
|
-
end
|
969
|
-
|
970
|
-
#############
|
971
|
-
class SalsaTigerSentenceGraph < XMLNode
|
972
|
-
include StringTerminalsInRightOrder
|
973
|
-
|
974
|
-
attr_reader :node
|
975
|
-
|
976
|
-
# Reads the syntactic graph of one sentence.
#
# xml_obj::     XML object for the graph element, or nil when the
#               sentence has no syntactic information
# sentence_id:: string, ID of the sentence; the node ID handed to the
#               superclass is sentence_id + "_graph"
#
# @node maps node IDs to node objects. Children named "terminals" and
# "nonterminals" are turned into nodes via make_nodes; every other child
# is kept as kith for output. Afterwards the children of each <nt>
# element are linked to their node via syn_add_children.
def initialize(xml_obj, sentence_id)
  @node = Hash.new
  @sentence_id = sentence_id

  unless xml_obj
    # no syntactic information: record an empty graph anyway
    super("graph", {}, sentence_id + "_graph", false)
    return
  end

  # remember the outermost element's name, attributes and ID
  super(xml_obj.name, xml_obj.attributes, sentence_id + "_graph", false)

  # create the nodes and remember their IDs
  xml_obj.children_and_text.each do |section|
    section_name = section.name
    if section_name == "terminals"
      make_nodes(section, "t", "s/graph/terminals", "all_children_kith")
    elsif section_name == "nonterminals"
      make_nodes(section, "nt", "s/graph/nonterminals")
    else
      # additional information, kept for output only
      add_kith(section)
    end
  end

  # add edges between nodes
  nt_section = xml_obj.children_and_text.detect { |section| section.name == "nonterminals" }
  return unless nt_section

  nt_section.children_and_text.each do |nt|
    # anything that is not an <nt> was already reported by make_nodes
    next unless nt.name == "nt"

    syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(nt)], nt)
  end
end
|
1032
|
-
|
1033
|
-
|
1034
|
-
###
|
1035
|
-
# Integrates split-word information into the graph.
#
# xml_obj:: XML object for a "splitwords" element, or nil (then nothing
#           happens). Each child named "splitword" gets nodes made for
#           its "part" children, which are then linked to the node that
#           is looked up in @node by the splitword's ID. Children with
#           any other name are reported via warn_child_ignored.
def add_splitwords(xml_obj)
  return if xml_obj.nil?

  xml_obj.children_and_text.each do |splitword|
    if splitword.name() != "splitword"
      warn_child_ignored("s/sem/splitwords/", splitword)
      next
    end

    # make nodes for the splitword parts
    make_nodes(splitword, "part", "s/sem/splitwords/splitword", "all_children_kith")

    # link the terminal being split to its new children
    split_terminal = @node[SalsaTigerXmlNode.xmlel_id(splitword)]
    syn_add_children(split_terminal, splitword)
  end
end
|
1055
|
-
|
1056
|
-
###
|
1057
|
-
# Returns string_for_nodes applied to the root nodes of the graph.
def to_s
  roots = syn_roots()
  string_for_nodes(roots)
end
|
1060
|
-
|
1061
|
-
###
|
1062
|
-
# Returns the superclass's get() result for this graph.
#
# Before delegating, the graph element's "root" attribute is set to the
# ID of the first root node, since the Salsa tool needs it. A graph
# without any nodes (created when the sentence has no syntactic
# information) has no root; in that case the attribute is left
# untouched instead of failing on nil.
def get()
  first_root = syn_roots().first
  set_attribute("root", first_root.id()) if first_root
  super()
end
|
1068
|
-
|
1069
|
-
#####
|
1070
|
-
# access methods
|
1071
|
-
|
1072
|
-
###
|
1073
|
-
# Yields every node object stored in @node.
def each_node
  @node.each_value do |node_obj|
    yield node_obj
  end
end
|
1078
|
-
|
1079
|
-
###
|
1080
|
-
# Returns all node objects stored in @node.
def nodes
  @node.values
end
|
1083
|
-
|
1084
|
-
###
|
1085
|
-
# Yields every node in @node whose is_terminal? is true.
def each_terminal
  @node.each_value do |candidate|
    next unless candidate.is_terminal?

    yield candidate
  end
end
|
1092
|
-
|
1093
|
-
###
|
1094
|
-
# Yields the terminals in the order produced by
# sort_terminals_and_splitwords_left_to_right.
def each_terminal_sorted
  ordered = sort_terminals_and_splitwords_left_to_right(terminals)
  ordered.each do |node_obj|
    yield node_obj
  end
end
|
1099
|
-
|
1100
|
-
###
|
1101
|
-
# Returns an array of all nodes in @node whose is_terminal? is true.
def terminals
  @node.values.find_all { |candidate| candidate.is_terminal? }
end
|
1104
|
-
|
1105
|
-
###
|
1106
|
-
# Returns the terminals as ordered by
# sort_terminals_and_splitwords_left_to_right.
def terminals_sorted
  unsorted = terminals
  sort_terminals_and_splitwords_left_to_right(unsorted)
end
|
1109
|
-
|
1110
|
-
###
|
1111
|
-
# Yields every node in @node whose is_nonterminal? is true.
def each_nonterminal
  @node.each_value do |candidate|
    next unless candidate.is_nonterminal?

    yield candidate
  end
end
|
1118
|
-
|
1119
|
-
###
|
1120
|
-
# Returns an array of all nodes in @node whose is_nonterminal? is true.
def nonterminals
  @node.values.find_all { |candidate| candidate.is_nonterminal? }
end
|
1123
|
-
|
1124
|
-
###
|
1125
|
-
# Returns an array of all nodes in @node that have no parent
# (parent() returns nil).
def syn_roots
  @node.values.find_all do |candidate|
    candidate.parent().nil?
  end
end
|
1130
|
-
###
|
1131
|
-
|
1132
|
-
######################3
|
1133
|
-
# adding nodes
|
1134
|
-
|
1135
|
-
###
|
1136
|
-
# Not supported on the graph object itself: always raises a RuntimeError.
def add_child(arg1, arg2, varhash={})
  raise "Not implemented for this class"
end
|
1139
|
-
|
1140
|
-
###
|
1141
|
-
# Not supported on the graph object itself: always raises a RuntimeError.
def remove_child(arg1, arg2, varhash={})
  raise "Not implemented for this class"
end
|
1144
|
-
|
1145
|
-
###
|
1146
|
-
# Creates a new syntactic node and registers it in @node.
#
# sentid:: string, sentence ID (prefix of the new node's ID)
# label::  "t" or "nt"; anything else raises a RuntimeError
# cat::    string, category (optional)
# word::   string, word (optional)
# pos::    string, part of speech (optional)
# syn_id:: string, ID suffix for the new node; when not given, the
#          current system time is used instead
#
# Only the attributes that were given end up in the generated element;
# their values go through xml_secure_val. Returns the new SynNode.
def add_node(sentid, label, cat = nil, word = nil, pos = nil, syn_id = nil)
  unless %w[t nt].include?(label)
    raise "Unknown node label #{label} for new syntactic node. Must be either t or nt."
  end

  # node ID: sentence ID plus either the given ID or the system time
  id_suffix = syn_id ? syn_id : Time.new().to_f.to_s
  new_id = sentid + "_" + id_suffix

  candidate_attrs = [["id", new_id], ["cat", cat], ["word", word], ["pos", pos]]
  attr_text = candidate_attrs.map do |attr_name, content|
    content ? " #{attr_name}=\"#{xml_secure_val(content)}\"" : ""
  end.join

  new_node = SynNode.new(RegXML.new("<#{label}#{attr_text}/>"))
  @node[new_node.id] = new_node

  new_node
end
|
1176
|
-
|
1177
|
-
###
|
1178
|
-
# Removes +node+ from the graph.
#
# The node is dropped from @node and detached from its parent (if any)
# and from its children. When it had a parent, its children are then
# attached to that parent, keeping the edge labels they had towards the
# removed node; the label of the removed node's incoming edge is lost.
def remove_node(node)
  @node.delete(node.id)

  incoming = node.parent_with_edgelabel

  # delete the incoming edge of the removed node
  if incoming
    in_label, parent = incoming
    parent.remove_child(node, in_label)
  end

  # delete the outgoing edges of the removed node
  node.each_child_with_edgelabel do |out_label, child|
    child.remove_parent(node, out_label)
  end

  # glue the removed node's children to its former parent
  if incoming
    _in_label, parent = incoming
    node.each_child_with_edgelabel do |out_label, child|
      parent.add_child(child, out_label)
    end
  end
end
|
1212
|
-
|
1213
|
-
######################
|
1214
|
-
protected
|
1215
|
-
|
1216
|
-
###
|
1217
|
-
# Serializes the graph's content: a <terminals> section holding the
# get() output of every terminal (in sorted order), followed by a
# <nonterminals> section holding the get() output of every nonterminal.
def get_xml_ofchildren()
  pieces = ["<terminals>\n"]
  each_terminal_sorted do |t|
    pieces << t.get()
  end
  pieces << "</terminals>\n"

  pieces << "<nonterminals>\n"
  each_nonterminal do |nt|
    pieces << nt.get()
  end
  pieces << "</nonterminals>\n"

  pieces.join
end
|
1235
|
-
|
1236
|
-
# Creates a SynNode for every child of +xml_obj+ that has the expected
# element name and registers it in @node under its ID.
#
# xml_obj::           XML object whose children are inspected
# expected_obj_name:: string, element name of the children to convert
# where::             string, location description passed to
#                     warn_child_ignored for unexpected children
# all_children_kith:: if non-nil, every child of a converted element is
#                     kept as kith of the new node
def make_nodes(xml_obj, expected_obj_name, where, all_children_kith = nil)
  xml_obj.children_and_text.each do |elt|
    if elt.name != expected_obj_name
      warn_child_ignored(where, elt)
      next
    end

    # this is the kind of child we were expecting to see
    new_node = SynNode.new(elt)
    @node[new_node.id] = new_node

    next unless all_children_kith

    elt.children_and_text.each do |elt_child|
      new_node.add_kith(elt_child)
    end
  end
end
|
1261
|
-
|
1262
|
-
# Links +node+ to the nodes referenced by the children of +xml_obj+.
#
# * "edge" and "part" children become regular child edges, labelled
#   with the element's "label" attribute.
# * "other_edge" children become links (add_link); the "label"
#   attribute is removed from the attribute hash and passed separately.
# * every other child is kept as kith of +node+.
#
# Referenced nodes are looked up in @node. Raises a RuntimeError when
# +node+ is nil/false or when a referenced node is unknown.
def syn_add_children(node, xml_obj)
  raise "Shouldn't be here" unless node

  xml_obj.children_and_text.each do |edge|
    edge_kind = edge.name()

    if edge_kind == "edge" || edge_kind == "part"
      child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
      unless child
        raise "Sentence #{@sentence_id}: I cannot find a node for " + edge.to_s()
      end

      node.add_child(child, edge.attributes()["label"])

    elsif edge_kind == "other_edge"
      child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
      unless child
        raise "Sentence #{@sentence_id}: I cannot find a node for other_edge #{SalsaTigerXmlNode.xmlel_id(edge)} : " + edge.to_s()
      end

      attributes = edge.attributes()
      edgelabel = attributes ? attributes.delete("label") : nil
      node.add_link(child, edgelabel, attributes)

    else
      # something other than an edge: keep for output
      node.add_kith(edge)
    end
  end
end
|
1305
|
-
end
|
1306
|
-
|
1307
|
-
#############
|
1308
|
-
class SalsaTigerSentenceSem < XMLNode
|
1309
|
-
|
1310
|
-
attr_reader :node
|
1311
|
-
|
1312
|
-
###
|
1313
|
-
# Returns the first child of +xml_obj+ whose element name is
# "splitwords", or nil when there is none.
def SalsaTigerSentenceSem.get_splitwords(xml_obj)
  xml_obj.children_and_text.detect do |child|
    child.name == "splitwords"
  end
end
|
1318
|
-
|
1319
|
-
###
|
1320
|
-
# Reads the semantic annotation of one sentence.
#
# xml_obj::     XML object for the semantics element, or nil when the
#               sentence has no semantic information
# sentence_id:: string, sentence ID; the node ID handed to the
#               superclass is sentence_id + "_sem"
# id_to_node::  hash syn_node_id(string) -> SynNode object
#
# Instance data set up here:
# @node::        hash node_id -> node object
# @frame_id::    IDs of all frame nodes
# @uspframe_id:: IDs of all frame underspecification nodes
# @uspfe_id::    IDs of all FE underspecification nodes
# @globals::     XML objects, one per sentence flag
def initialize(xml_obj, sentence_id, id_to_node)
  @node = Hash.new
  @frame_id = Array.new
  @uspframe_id = Array.new
  @uspfe_id = Array.new
  @globals = Array.new

  unless xml_obj
    # no semantic information: record an empty element anyway
    super("sem", {}, sentence_id + "_sem", false)
    return
  end

  super(xml_obj.name, xml_obj.attributes, sentence_id + "_sem", false)

  # sort the top-level children into the three known sections;
  # everything else is kept as kith
  sections = { "globals" => nil, "frames" => nil, "usp" => nil }
  xml_obj.children_and_text.each do |obj|
    if sections.key?(obj.name)
      sections[obj.name] = obj
    else
      add_kith(obj)
    end
  end

  # handle globals
  globals_obj = sections["globals"]
  if globals_obj
    globals_obj.children_and_text.each do |flag_xml|
      @globals.push(flag_xml)
    end
  end

  # index frames
  frames_obj = sections["frames"]
  if frames_obj
    frames_obj.children_and_text.each do |frame|
      if frame.name() != "frame"
        warn_child_ignored("s/sem/frames/", frame)
        next
      end

      frame_node = FrameNode.new(frame)
      semnode_add_flags(frame_node, frame)
      @node[frame_node.id] = frame_node
      @frame_id << frame_node.id
      # add FEs
      frame_add_children(frame_node, frame, id_to_node)
    end
  end

  # index underspecification
  usp_obj = sections["usp"]
  return unless usp_obj

  usp_obj.children_and_text.each do |usp_section|
    usp_kind = usp_section.name
    if usp_kind == "uspframes"
      initialize_usp(usp_section, "frame")
    elsif usp_kind == "uspfes"
      initialize_usp(usp_section, "fe")
    else
      warn_child_ignored("s/sem/usp/", usp_section)
    end
  end
end
|
1405
|
-
|
1406
|
-
################################################3
|
1407
|
-
# access methods
|
1408
|
-
|
1409
|
-
###
|
1410
|
-
# Yields the node registered in @node for every ID in @frame_id.
def each_frame
  @frame_id.each do |frame_id|
    yield @node[frame_id]
  end
end
|
1415
|
-
|
1416
|
-
###
|
1417
|
-
# Returns the nodes registered in @node for the IDs in @frame_id,
# in the order of @frame_id.
def frames
  @frame_id.map do |frame_id|
    @node[frame_id]
  end
end
|
1420
|
-
|
1421
|
-
###
|
1422
|
-
# Yields the node registered in @node for every ID in @uspframe_id.
def each_usp_frameblock
  @uspframe_id.each do |block_id|
    yield @node[block_id]
  end
end
|
1427
|
-
|
1428
|
-
###
|
1429
|
-
# Returns the nodes registered in @node for the IDs in @uspframe_id.
def usp_frameblocks()
  @uspframe_id.map do |block_id|
    @node[block_id]
  end
end
|
1432
|
-
|
1433
|
-
###
|
1434
|
-
# Yields the node registered in @node for every ID in @uspfe_id.
def each_usp_feblock
  @uspfe_id.each do |block_id|
    yield @node[block_id]
  end
end
|
1439
|
-
|
1440
|
-
###
|
1441
|
-
# Returns the nodes registered in @node for the IDs in @uspfe_id.
def usp_feblocks()
  @uspfe_id.map do |block_id|
    @node[block_id]
  end
end
|
1444
|
-
|
1445
|
-
###
|
1446
|
-
# Returns the sentence flags as an array of hashes, one per entry of
# @globals, with the keys "type" and "param" (taken from the element's
# attributes) and "text" (the string forms of the element's children
# and text, concatenated).
def flags
  @globals.map do |glob|
    flag_text = glob.children_and_text.map { |piece| piece.to_s }.join
    {
      "type" => glob.attributes["type"],
      "param" => glob.attributes["param"],
      "text" => flag_text
    }
  end
end
|
1454
|
-
|
1455
|
-
################################################3
|
1456
|
-
# adding and removing things
|
1457
|
-
|
1458
|
-
###
|
1459
|
-
# Creates a new frame node and registers it in @node and @frame_id.
#
# sentid:: string, sentence ID; only used to build an ID when +sem_id+
#          is not given (sentid + "_f" + current system time)
# name::   string, name of the frame
# sem_id:: string, ID for the new node (optional)
#
# The ID and the name are passed through xml_secure_val before being
# placed into the generated <frame> element, as the other node-creating
# methods of this file do for their attribute values; otherwise a name
# containing a quote, '<' or '&' would yield broken markup.
#
# Returns the new FrameNode.
def add_frame(sentid, name, sem_id = nil)
  frameid = sem_id || (sentid + "_f" + Time.new().to_f.to_s)

  markup = "<frame id=\"#{xml_secure_val(frameid)}\" name=\"#{xml_secure_val(name)}\"/>"
  frame_node = FrameNode.new(RegXML.new(markup))

  @node[frame_node.id] = frame_node
  @frame_id << frame_node.id

  frame_node
end
|
1475
|
-
|
1476
|
-
###
|
1477
|
-
# Unregisters +frame_node+: its ID is deleted from @node and from
# @frame_id.
def remove_frame(frame_node)
  gone_id = frame_node.id
  @node.delete(gone_id)
  @frame_id.delete(frame_node.id)
end
|
1481
|
-
|
1482
|
-
###
|
1483
|
-
# Adds a frame element to +frame_node+ and registers the new node in
# @node under its ID.
#
# frame_node::  node that receives the FE (its add_fe is called)
# fe_name::     string, name of the new FE
# fe_children:: array of syntactic nodes, children of the new FE
# sem_id::      optional ID of the new FE
#
# Returns the new FE node.
def add_fe(frame_node, fe_name, fe_children, sem_id = nil)
  created = frame_node.add_fe(fe_name, fe_children, sem_id)
  @node[created.id] = created
  created
end
|
1493
|
-
|
1494
|
-
###
|
1495
|
-
# Unregisters +fe_node+ from @node and removes it from its parent's
# children.
def remove_fe(fe_node)
  @node.delete(fe_node.id)
  owner = fe_node.parent
  owner.remove_child(fe_node)
end
|
1499
|
-
|
1500
|
-
###
|
1501
|
-
# Creates an empty underspecification block and registers it.
#
# frame_or_fe:: "frame" or "fe". The new node's ID is appended to
#               @uspframe_id or @uspfe_id accordingly; any other value
#               raises a RuntimeError.
#
# Returns the new UspNode.
def add_usp(frame_or_fe)
  usp_node = UspNode.new(RegXML.new("<uspblock/>"), frame_or_fe)
  @node[usp_node.id] = usp_node

  if frame_or_fe == "frame"
    @uspframe_id << usp_node.id
  elsif frame_or_fe == "fe"
    @uspfe_id << usp_node.id
  else
    raise "Shouldn't be here"
  end

  usp_node
end
|
1516
|
-
|
1517
|
-
###
|
1518
|
-
# Removes an underspecification block.
#
# Every child of +usp_node+ is removed from it, the node is deleted
# from @node, and its ID is deleted from @uspframe_id or @uspfe_id
# depending on usp_node.i_am ("frame" or "fe"); any other value raises
# a RuntimeError.
def remove_usp(usp_node)
  usp_node.children.each do |member|
    usp_node.remove_child(member)
  end

  @node.delete(usp_node.id)

  block_kind = usp_node.i_am
  if block_kind == "frame"
    @uspframe_id.delete(usp_node.id)
  elsif block_kind == "fe"
    @uspfe_id.delete(usp_node.id)
  else
    raise "Shouldn't be here"
  end
end
|
1532
|
-
|
1533
|
-
|
1534
|
-
###
|
1535
|
-
# Not supported on the semantics object itself: always raises a
# RuntimeError.
def add_child(arg1, arg2)
  raise "Not implemented for this class"
end
|
1538
|
-
|
1539
|
-
###
|
1540
|
-
# Not supported on the semantics object itself: always raises a
# RuntimeError.
def remove_child(arg1, arg2)
  raise "Not implemented for this class"
end
|
1543
|
-
|
1544
|
-
###
|
1545
|
-
# Adds a sentence flag and returns the XML object created for it.
#
# type::  flag type, written to the "type" attribute (not validated
#         against any list of known types)
# param:: optional value for the "param" attribute
# text::  optional element text; it is inserted as given, surrounded by
#         single spaces
#
# +type+ and +param+ go through xml_secure_val. The new object is
# appended to @globals.
def add_flag(type, param=nil, text=nil)
  markup = "<global type=\'#{xml_secure_val(type)}\'"
  markup += " param=\'#{xml_secure_val(param)}\'" if param
  markup += text ? "> #{text} </global>" : "/>"

  flag_xml = RegXML.new(markup)
  @globals << flag_xml
  flag_xml
end
|
1564
|
-
|
1565
|
-
###
|
1566
|
-
# Removes the first sentence flag matching the given description.
#
# type::  flag type that the flag's "type" attribute must equal
# param:: when given, the flag's "param" attribute must equal it
# text::  when given, the concatenated string forms of the flag's
#         children and text must equal it
#
# Attribute values are read by indexing the hash returned by
# attributes, the same way flags() reads them.
#
# Returns the removed XML object, or nil when no flag matched.
def remove_flag(type, param=nil, text=nil)
  remove_ix = @globals.index do |glob|
    attrs = glob.attributes
    attrs["type"] == type &&
      (param.nil? || attrs["param"] == param) &&
      (text.nil? || glob.children_and_text.map { |c| c.to_s }.join == text)
  end

  remove_ix ? @globals.delete_at(remove_ix) : nil
end
|
1587
|
-
|
1588
|
-
############################3
|
1589
|
-
protected
|
1590
|
-
|
1591
|
-
# Serializes the semantic content in three sections:
# <globals> (string form of every entry of @globals, one per line),
# <frames> (get() output of every frame) and <usp>, which contains
# <uspframes> and <uspfes> with the get() output of the respective
# underspecification blocks.
def get_xml_ofchildren()
  pieces = []

  # globals
  pieces << "<globals>\n"
  @globals.each do |glob|
    pieces << glob.to_s + "\n"
  end
  pieces << "</globals>\n"

  # frames
  pieces << "<frames>\n"
  each_frame do |frame_node|
    pieces << frame_node.get()
  end
  pieces << "</frames>\n"

  # underspecification
  pieces << "<usp>\n"
  pieces << "<uspframes>\n"
  each_usp_frameblock do |block|
    pieces << block.get()
  end
  pieces << "</uspframes>\n"
  pieces << "<uspfes>\n"
  each_usp_feblock do |block|
    pieces << block.get()
  end
  pieces << "</uspfes>\n"
  pieces << "</usp>\n"

  pieces.join
end
|
1624
|
-
|
1625
|
-
###
|
1626
|
-
# Transfers the <flag> children of +xml_obj+ to +sem_node+.
#
# sem_node:: node that receives the flags through add_flag
# xml_obj::  XML object whose children are scanned
#
# For each child named "flag", the value of its "name" attribute is
# added as a flag; a flag without a name only produces a warning on
# $stderr. All other children are ignored here.
def semnode_add_flags(sem_node, xml_obj)
  xml_obj.children_and_text.each do |child|
    next unless child.name == "flag"

    flag_name = child.attributes["name"]
    if flag_name
      sem_node.add_flag(flag_name)
    else
      $stderr.puts "Warning: flag without a name"
    end
  end
end
|
1641
|
-
|
1642
|
-
# Creates the FE/target nodes of one frame and links them up.
#
# frame_node:: frame node that receives the new FE nodes
# xml_obj::    XML object of the frame
# id_to_node:: hash syn_node_id(string) -> SynNode object
#
# For every "fe" or "target" child an FeNode is created, registered in
# @node, added to +frame_node+ and given its flags. Each of its
# "fenode" children is resolved through +id_to_node+; an ID that is not
# found there is taken to belong to a different sentence and
# represented by a new TSSynNode. "flag" children are skipped (they are
# handled by semnode_add_flags); everything else is kept as kith.
def frame_add_children(frame_node, xml_obj, id_to_node)
  xml_obj.children_and_text.each do |fe|
    fe_kind = fe.name

    if fe_kind == "fe" || fe_kind == "target"
      # make a node for this and add it as child of the frame node
      fe_node = FeNode.new(fe)
      @node[fe_node.id] = fe_node
      frame_node.add_child(fe_node)

      semnode_add_flags(fe_node, fe)

      # add the FE's children
      fe.children_and_text.each do |fechild|
        fechild_kind = fechild.name

        if fechild_kind == "fenode"
          syn_node = id_to_node[SalsaTigerXmlNode.xmlel_id(fechild)]
          if syn_node
            # normal syntactic node, known to the id_to_node mapping
            fe_node.add_child(syn_node, fechild)
            syn_node.add_sem(fe_node)
          else
            # must be a node in a different sentence:
            # make a dummy graph node for it
            fe_node.add_child(TSSynNode.new(SalsaTigerXmlNode.xmlel_id(fechild)), fechild)
          end
        elsif fechild_kind != "flag"
          fe_node.add_kith(fechild)
        end
      end

    elsif fe_kind != "flag"
      # keep for output
      frame_node.add_kith(fe)
    end
  end
end
|
1692
|
-
|
1693
|
-
###
|
1694
|
-
# Reads the underspecification blocks below +xml_obj+.
#
# xml_obj::     XML object whose "uspblock" children are read
# frame_or_fe:: "frame" or "fe"; decides whether the new block IDs go
#               into @uspframe_id or @uspfe_id (anything else raises a
#               RuntimeError)
#
# Each block becomes a UspNode registered in @node. Each "uspitem" in a
# block is resolved through @node after rewriting its ID with
# gsub(/.*_s/, "s"); items whose node cannot be found are reported on
# $stderr and skipped. Children with unexpected names are reported via
# warn_child_ignored.
def initialize_usp(xml_obj, frame_or_fe)
  xml_obj.children_and_text.each do |uspblock|
    if uspblock.name != "uspblock"
      warn_child_ignored("s/sem/usp/uspframe|uspfe", uspblock)
      next
    end

    # node for this underspecified block
    block_node = UspNode.new(uspblock, frame_or_fe)
    @node[block_node.id] = block_node

    if frame_or_fe == "frame"
      @uspframe_id << block_node.id
    elsif frame_or_fe == "fe"
      @uspfe_id << block_node.id
    else
      raise "Shouldn't be here"
    end

    # add its children
    uspblock.children_and_text.each do |uspitem|
      if uspitem.name != "uspitem"
        warn_child_ignored("s/sem/usp/uspframe|uspfe/uspblock", uspitem)
        next
      end

      usp_id = SalsaTigerXmlNode.xmlel_id(uspitem).gsub(/.*_s/, "s")

      member = @node[usp_id]
      unless member
        $stderr.puts "Error: Underspecification: could not find node with ID #{usp_id}. Skipping."
        next
      end

      block_node.add_child(@node[usp_id])
    end
  end
end
|
1734
|
-
end
|
1735
|
-
|
1736
|
-
|
1737
|
-
#############
|
1738
|
-
# class SalsaTigerSentence
|
1739
|
-
#
|
1740
|
-
# offers access methods to a SalsaTigerXML sentence
|
1741
|
-
# given as a string
|
1742
|
-
#
|
1743
|
-
# Nodes of syntactic structure as well as frames and
|
1744
|
-
# frame elements are kept (and returned) as XMLNode objects,
|
1745
|
-
# or more specifically as SynNode, FrameNode and FeNode objects.
|
1746
|
-
#
|
1747
|
-
# methods:
|
1748
|
-
#
|
1749
|
-
# new initializes the object
|
1750
|
-
#
|
1751
|
-
# id returns the sentence ID
|
1752
|
-
#
|
1753
|
-
# get returns the REXML object describing the same sentence
|
1754
|
-
# as this object
|
1755
|
-
#
|
1756
|
-
# each_terminal yields each terminal of the sentence in turn.
|
1757
|
-
# they are returned as SynNode objects
|
1758
|
-
#
|
1759
|
-
# terminals returns all terminal node objects in an array
|
1760
|
-
#
|
1761
|
-
# each_terminal_sorted yields each terminal of the sentence in turn,
|
1762
|
-
# making sure the terminal with the lowest ID is returned first.
|
1763
|
-
# use this if you need the terminal words in the right order!
|
1764
|
-
# nodes are returned as SynNode objects
|
1765
|
-
#
|
1766
|
-
# each_nonterminal yields each nonterminal of the sentence in turn.
|
1767
|
-
# nodes are returned as SynNode objects
|
1768
|
-
#
|
1769
|
-
# each_frame yields each frame of the sentence in turn.
|
1770
|
-
# nodes are returned as FrameNode objects
|
1771
|
-
#
|
1772
|
-
# frames returns all frame objects in an array
|
1773
|
-
#
|
1774
|
-
# each_usp_frameblock
|
1775
|
-
# yields each group of underspecified frames of the sentence
|
1776
|
-
# in turn, as an UspNode object. To see the frames involved
|
1777
|
-
# in this underspecification, use each_child on the UspNode object
|
1778
|
-
#
|
1779
|
-
#
|
1780
|
-
# usp_frameblocks returns all groups of underspecified frames as an array
|
1781
|
-
# of UspNode objects
|
1782
|
-
#
|
1783
|
-
# each_usp_feblock
|
1784
|
-
# yields each group of underspecified frame elements
|
1785
|
-
# of the sentence in turn, as an UspNode object.
|
1786
|
-
# To see the frame elements involved
|
1787
|
-
# in this underspecification, use each_child on the UspNode object
|
1788
|
-
#
|
1789
|
-
# usp_feblocks returns all groups of underspecified frame elements
|
1790
|
-
# as an array of UspNode objects
|
1791
|
-
#
|
1792
|
-
#
|
1793
|
-
# flags returns a list of the sentence flags, as hashes.
|
1794
|
-
# key "type": a string, either REEXAMINE or WRONGSUBCORPUS
|
1795
|
-
# or INTERESTING or LATER
|
1796
|
-
# key "param": a string, the parameter. important for
|
1797
|
-
# REEXAMINE
|
1798
|
-
# key "text": a string, the text of this flag. Will be
|
1799
|
-
# nonempty only for INTERESTING cases
|
1800
|
-
#
|
1801
|
-
# syn_roots returns a list of all the roots of the syntactic trees
|
1802
|
-
# in this sentence, as node objects. There may be more than
|
1803
|
-
# one, unfortunately.
|
1804
|
-
#
|
1805
|
-
# add_syn add a new syntactic node with the given category, word, POS,
|
1806
|
-
# returns the new node
|
1807
|
-
#
|
1808
|
-
# add_frame add a frame with a given name, returns the new frame node
|
1809
|
-
#
|
1810
|
-
# add_usp add a new underspecification block, either for frames or FEs
|
1811
|
-
#
|
1812
|
-
# add_flag adds a sentence flag to this sentence.
|
1813
|
-
# type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
1814
|
-
# or LATER
|
1815
|
-
# param: optional parameter, a string, describes type of Reexamine
|
1816
|
-
# for REEXAMINE-type flags
|
1817
|
-
# text: optional parameter, a string, arbitrary text commenting
|
1818
|
-
# on the flag, used mainly with INTERESTING
|
1819
|
-
#
|
1820
|
-
# remove_flag removes a sentence flag from this sentence
|
1821
|
-
# only removes flag in case of exact match of type, param, and text
|
1822
|
-
# type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
1823
|
-
# or LATER
|
1824
|
-
# param: optional parameter, a string, describes type of Reexamine
|
1825
|
-
# for REEXAMINE-type flags
|
1826
|
-
# text: optional parameter, a string, arbitrary text commenting
|
1827
|
-
# on the flag, used mainly with INTERESTING
|
1828
|
-
|
1829
|
-
# One SalsaTigerXML sentence (<s> element).
#
# The syntactic part (<graph>) is held in @syn (SalsaTigerSentenceGraph),
# the semantic part (<sem>) in @sem (SalsaTigerSentenceSem). Almost every
# public method simply forwards to one of these two objects.
class SalsaTigerSentence < XMLNode

  # string: the XML text of one <s> element.
  #
  # Missing <graph> or <sem> children are replaced by empty ones.
  # All other children of <s> are kept (add_kith) for later output.
  def initialize(string)
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # initialize this object as an XML node,
    # i.e. remember the outermost element's name, attributes,
    # and ID, and specify that it's not a text but an XML object
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # <graph> holds the syntactic info of the sentence;
    # fake an empty one if the sentence has none
    xml_syn_obj = xml_obj.children_and_text.detect { |thing| thing.name == "graph" }
    xml_syn_obj ||= RegXML.new("<graph/>")

    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # <sem> holds the semantic info of the sentence;
    # fake an empty one if the sentence has none
    xml_sem_obj = xml_obj.children_and_text.detect { |thing| thing.name == "sem" }
    xml_sem_obj ||= RegXML.new("<sem/>")

    # add splitword info to @syn element
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # go through the children of the <s> object again,
    # remembering all children except <graph> and <sem>
    # for later output
    xml_obj.children_and_text.each do |child_or_text|
      case child_or_text.name
      when "graph", "sem"
        # we have handled them already
      else
        add_kith(child_or_text)
      end
    end
  end

  #############
  # Build a sentence that has the given ID and an empty <graph> and <sem>.
  #
  # sentence_id: string. Apostrophes are escaped as &apos; because the ID
  # is written into a single-quoted XML attribute.
  def self.empty_sentence(sentence_id) # string
    sentence_id = sentence_id.gsub(/'/, "&apos;")
    sent_string = "<s id='#{sentence_id}'>\n" \
                  "<graph/>\n" \
                  "<sem/>\n" \
                  "</s>"
    SalsaTigerSentence.new(sent_string)
  end

  ###
  # string form of the syntactic part
  def to_s
    @syn.to_s
  end

  ###
  # yields each terminal of the sentence
  def each_terminal
    @syn.each_terminal { |n| yield n }
  end

  ###
  # yields each terminal, in the order given by @syn.each_terminal_sorted
  def each_terminal_sorted
    @syn.each_terminal_sorted { |n| yield n }
  end

  ###
  def terminals
    @syn.terminals
  end

  ###
  def terminals_sorted
    @syn.terminals_sorted
  end

  ###
  # yields each nonterminal of the sentence
  def each_nonterminal
    @syn.each_nonterminal { |n| yield n }
  end

  ###
  def nonterminals
    @syn.nonterminals
  end

  ###
  # yields each node of the syntactic part
  def each_syn_node
    @syn.each_node { |n| yield n }
  end

  ###
  def syn_nodes
    @syn.nodes
  end

  ###
  # roots of the syntactic trees of this sentence
  def syn_roots
    @syn.syn_roots
  end

  ###
  # syntactic node registered under syn_id in @syn.node
  def syn_node_with_id(syn_id)
    @syn.node[syn_id]
  end

  ###
  # semantic node registered under sem_id in @sem.node
  def sem_node_with_id(sem_id)
    @sem.node[sem_id]
  end

  ###
  # yields each frame of the sentence
  def each_frame
    @sem.each_frame { |f| yield f }
  end

  ###
  def frames
    @sem.frames
  end

  ###
  # yields each block of underspecified frames
  def each_usp_frameblock
    @sem.each_usp_frameblock { |b| yield b }
  end

  ###
  def usp_frameblocks
    @sem.usp_frameblocks
  end

  ###
  # yields each block of underspecified frame elements
  def each_usp_feblock
    @sem.each_usp_feblock { |b| yield b }
  end

  ###
  def usp_feblocks
    @sem.usp_feblocks
  end

  ###
  def flags
    @sem.flags
  end

  ###################################
  # adding and removing things

  ###
  # add syntactic node, specified as terminal(t) or nonterminal(nt)
  #
  # returns the new node
  def add_syn(label,         # string: t or nt
              cat = nil,     # string: category
              word = nil,    # string: word
              pos = nil,     # string: part of speech
              syn_id = nil)  # string: ID for the new node
    @syn.add_node(id, label, cat, word, pos, syn_id)
  end

  ###
  def remove_syn(node)
    @syn.remove_node(node)
  end

  ###
  # returns whatever @sem.add_frame returns for the new frame
  def add_frame(name,          # string: name of the frame
                sem_id = nil)  # string: ID for the new node
    @sem.add_frame(id, name, sem_id)
  end

  ###
  def remove_frame(frame_node) # FrameNode object
    @sem.remove_frame(frame_node)
  end

  ###
  def add_fe(frame_obj,
             name,
             fe_children,
             sem_id = nil)
    @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  ###
  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  ###
  def add_usp(frame_or_fe)
    @sem.add_usp(frame_or_fe)
  end

  ###
  def remove_usp(usp_node) # UspNode object
    @sem.remove_usp(usp_node)
  end

  ###
  def add_flag(type, param = nil, text = nil)
    @sem.add_flag(type, param, text)
  end

  ###
  def remove_flag(type, param = nil, text = nil)
    @sem.remove_flag(type, param, text)
  end

  ###
  # throw away all semantic annotation: @sem is replaced by a fresh,
  # empty semantics object over the same syntactic nodes
  def remove_semantics
    empty_sem = RegXML.new("<sem/>")
    @sem = SalsaTigerSentenceSem.new(empty_sem, id, @syn.node)
  end

  ###################
  # output
  def get_syn
    @syn.get
  end

  ############################
  protected

  # XML of the children: syntactic part followed by semantic part
  def get_xml_ofchildren
    @syn.get + @sem.get
  end
end
|
2079
|
-
|
2080
|
-
#######
|
2081
|
-
# identify the set of maximal constituents covering a set of nodes
|
2082
|
-
#
|
2083
|
-
# Mixin that computes maximal constituents covering a set of nodes.
#
# The including class has to provide syn_roots, returning the root nodes
# of the syntactic trees. Nodes are expected to answer is_splitword?,
# is_punct?, is_terminal?, word, children and yield_nodes.
module MaxConst

  # Breadth-first search from the roots for the highest nodes whose
  # non-punctuation yield lies completely inside the yield of node_list.
  #
  # node_list::              array of SynNode
  # ignore_empty_terminals:: boolean; if true, terminals whose word is nil
  #                          or empty do not count towards a node's yield
  #
  # returns: array of SynNode: the constituents found, then any words that
  # could not be placed under a root, then the splitword nodes of node_list
  # (which are passed through unchanged)
  def max_constituents_for_nodes(node_list,                      # array: SynNode
                                 ignore_empty_terminals = false) # boolean: ignore empty terminals?
    words, splitwords = max_const_split_node_list(node_list)

    constituents = []
    # work on a copy: the queue is consumed with shift and grown with <<,
    # which must never happen to the array owned by the sentence
    nodes_to_check = syn_roots.dup

    until words.empty?
      node = nodes_to_check.shift
      # all nodes checked
      break if node.nil?

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject { |n| n.is_terminal? && (n.word.nil? || n.word.empty?) }
      end
      # this node has no yield, or only punctuation sign yield: skip it
      next if node_yield.empty?

      rest = node_yield - words
      if rest.empty?
        # whole yield of node consists of words we are looking for
        constituents << node
        words -= node_yield
      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    # any leftover words that may not be from this sentence: just keep them
    constituents.concat(words)
    # splitwords stay what they are
    constituents.concat(splitwords)
    constituents
  end

  ###
  # determine maximum constituents covering the nodes in node_list
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is true, a node with exactly one
  # child whose yield is not in node_list and more than one child whose
  # yield is in node_list is treated as if its whole yield were in
  # node_list.
  # NOTE(review): earlier documentation said "at least one" covered child;
  # the code requires more than one -- confirm which is intended.
  #
  # Optionally, a procedure accept_anyway_proc can be given.
  # Like the option include_single_missing_children, it can lead to nodes being
  # included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYAAYNN)
  # even though not all of their yield nodes are yield nodes of the node_list.
  # accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
  # The procedure is called with three arguments:
  #   accept_anyway_proc(node, ch_in, ch_out)
  #   node is a SynNode that would not normally be in NYAAYNN.
  #   ch_in is the list of its children that are in NYAAYNN.
  #   ch_out is the list of its children that are not.
  # If the procedure exists and returns true, node is put into NYAAYNN.
  #
  # returns: an array of SynNodes: the splitword nodes of node_list, the
  # maximal constituents found, and finally any words of node_list that are
  # not in the yield of one of these
  def max_constituents_smc(node_list,                       # array: SynNode
                           include_single_missing_children, # boolean
                           ignore_empty_terminals = false,  # boolean: ignore empty terminals?
                           accept_anyway_proc = nil)        # proc: SynNode, array:SynNode, array:SynNode => boolean
    words, splitwords = max_const_split_node_list(node_list)

    constituents = splitwords

    syn_roots.each do |root|
      node_included, descendants_included =
        max_constituents_aux(root, words,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)

      if node_included == "true"
        constituents << root
      else
        constituents.concat(descendants_included)
      end
    end

    # which words remain to be added?
    constituents.each { |c| words -= c.yield_nodes }
    constituents.concat(words)

    constituents
  end

  ############
  private

  ###
  # sort node_list into splitwords and rest, and filter out punctuation marks
  #
  # returns: pair [words, splitwords]
  #   words:      the non-punctuation yield nodes of all non-splitword nodes
  #   splitwords: the nodes of node_list that are splitwords
  def max_const_split_node_list(node_list)
    words = []
    splitwords = []

    node_list.each do |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat(node.yield_nodes.reject { |t| t.is_punct? })
      end
    end

    [words, splitwords]
  end

  ###
  # recursively determine maximum constituents covering the nodes in 'nodelist',
  # starting at 'node'.
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # include_single_missing_children and accept_anyway_proc work as
  # described for max_constituents_smc.
  #
  # returns: pair [mybool, included_descendants]
  #   where mybool is a string, "true", "false" or "ignoreme" (for ignored
  #   punctuation and empty terminals):
  #   does the yield of this node consist entirely of nodes from nodelist?
  #   and included_descendants is a list of SynNodes: if mybool is "false",
  #   this is a list of descendants of this node whose yield does consist
  #   entirely of nodes from nodelist
  def max_constituents_aux(node,                                    # SynNode
                           nodelist,                                # array:SynNode
                           include_single_missing_children = false, # boolean
                           ignore_empty_terminals = false,          # boolean: ignore empty terminals?
                           accept_anyway_proc = nil)                # proc: SynNode, array:SynNode, array:SynNode => Boolean
    if node.is_terminal? && nodelist.include?(node)
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals && node.is_terminal? &&
          (node.word.nil? || node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    # triples [child, verdict, included descendants of child]
    children_results = node.children.map do |ch|
      fully_included, descendants_included =
        max_constituents_aux(ch, nodelist,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)
      [ch, fully_included, descendants_included]
    end

    res_false = children_results.select { |_ch, fully_included, _desc| fully_included == "false" }
    res_true = children_results.select { |_ch, fully_included, _desc| fully_included == "true" }

    if res_false.empty? && !res_true.empty?
      # all true, or all true and ignoreme
      ["true", []]

    elsif res_false.empty? && res_true.empty?
      # all ignoreme
      ["ignoreme", []]

    elsif res_false.length == 1 && res_true.length > 1 &&
          include_single_missing_children
      # one child not covered, more than one covered:
      # consider the single missing child as covered, too
      ["true", []]

    elsif accept_anyway_proc &&
          accept_anyway_proc.call(node,
                                  res_true.map { |ch, _verdict, _desc| ch },
                                  res_false.map { |ch, _verdict, _desc| ch })
      # some external source tells us that
      # we are to consider the missing children as covered, too
      ["true", []]

    else
      # not all children covered: pass up the covered children themselves
      # and, for the others, whatever they found further down
      [
        "false",
        children_results.map { |ch, fully_included, descendants_included|
          if fully_included == "true"
            [ch]
          else
            descendants_included
          end
        }.flatten
      ]
    end
  end
end
|
2322
|
-
|
2323
|
-
# Mixin that extends a node set to the contiguous stretch of terminals it spans.
#
# The including class has to provide terminals_sorted and
# max_constituents_for_nodes.
module ConvexComp

  # Take the ordered yield of all nodes in node_set, find the leftmost and
  # rightmost of these terminals in terminals_sorted, and return
  # max_constituents_for_nodes for every terminal between them (inclusive).
  #
  # If the yield is empty, or if any yield terminal cannot be found in
  # terminals_sorted, a warning is printed to STDERR and node_set is
  # returned unchanged.
  #
  # Progress messages ("Replacing ...", "By ...") are written to STDERR.
  def convex_complemented(node_set)
    terminals = terminals_sorted

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten
    positions = yield_nodes.map { |t| terminals.index(t) }

    # a single unknown terminal makes min/max meaningless (and comparing
    # nil with an Integer would raise), so bail out on any nil
    if positions.empty? || positions.any?(&:nil?)
      STDERR.puts "Warning: could not complement projected node set #{yield_nodes.map {|t| t.id}}; terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    leftmost, rightmost = positions.minmax

    STDERR.puts "Replacing "+yield_nodes.join(" ")
    new_node_set = terminals[leftmost..rightmost]
    STDERR.puts "By "+new_node_set.join(" ")
    max_constituents_for_nodes(new_node_set)
  end
end
|
2343
|
-
|
2344
|
-
# Reopen SalsaTigerSentence to mix in the constituent helpers.
class SalsaTigerSentence
  # Module#include handles its arguments in reverse order, so MaxConst is
  # mixed in first and ConvexComp second.
  include ConvexComp, MaxConst
end
|