frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,194 @@
1
+ #########
2
+ # module StringTerminalsInRightOrder
3
+ #
4
+ # returns the yield of a node, or a list of nodes, as a string
5
+ # of " "-separated words
6
+ #
7
+ # Words are put into the right order, left to right,
8
+ # under the assumption that their node IDs reflect that order
9
+ #
10
+ # Terminal nodes are assumed to have IDs ending in a number,
11
+ # numbered from left to right
12
+ #
13
+ # Splitword nodes are assumed to have IDs ending in N_sM
14
+ # for numbers N and M, where N orders terminals left to right
15
+ # and M orders the splitword parts left to right
16
+ #
17
+ # If the yield of the node/the list of nodes contains all splitwords of a terminal,
18
+ # the whole terminal is taken instead
19
+ #
20
+ # methods:
21
+ #
22
+ # string_for_node returns the string for the yield of a node
23
+ # node: a node object
24
+ #
25
+ # string_for_nodes returns the string for the yield of a list of nodes
26
+ # nodes: a list of node objects
27
+
28
module StringTerminalsInRightOrder
  # Returns the yield of a single node as a string of words,
  # each word followed by one blank (so the result ends in a blank).
  #
  # node: a node object (see string_for_nodes for what it must respond to)
  def string_for_node(node)
    string_for_nodes([node])
  end

  # Returns the yield of a list of nodes as a string of words in
  # left-to-right order, each word followed by one blank.
  #
  # nodes: array of node objects. The methods below call yield_nodes,
  #        is_splitword?, is_terminal?, parent, each_child, id and word on them.
  def string_for_nodes(nodes)
    a = right_level_terminals_for_nodes(nodes)
    a = sort_terminals_and_splitwords_left_to_right(a)
    return node_array_to_string(a)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  #
  # Nodes that are neither splitwords nor terminals are dropped.
  # Returns an array without duplicates.
  def right_level_terminals_for_nodes(nodes)
    a = nodes.map { |n| n.yield_nodes() }.flatten
    b = Array.new
    a.each { |n|
      if n.is_splitword?
        # see if a contains all parts of this splitword
        # if so, take into b the splitword's parent, the terminal,
        # rather than the individual splitwords

        if n.parent.nil?
          # splitword without a parent
          b << n
        elsif b.include?(n.parent) or a.include?(n.parent)
          # the splitword's parent is already in b, or it is in 'a' itself
          # (and is handled when the loop reaches it): nothing to add here
        else
          # check if all children of n.parent are in 'a'
          all_in = true
          n.parent.each_child { |nsibling|
            unless a.include? nsibling
              all_in = false
              break
            end
          }

          if all_in
            # yes, all children of n.parent are in 'a'
            b << n.parent
          else
            # no, some sibling of n is not in 'a'
            b << n
          end
        end
      elsif n.is_terminal?
        # n is a terminal
        b << n
        # if n is anything but a splitword or a terminal,
        # ignore it
      end
    }
    return b.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  #
  # Returns a new, sorted array.
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort { |a, b|
      if a.is_splitword? and b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? and b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    }
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offer a "word" method
  # string their words together, each word followed by " "
  def node_array_to_string(nodes)
    # append in place instead of building a new string per word
    s = "".dup
    nodes.each { |n|
      s << n.word << " "
    }
    return s
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    if splitword_terminal_i(a) == splitword_terminal_i(b)
      # parts of same terminal?
      # compare parts
      last_i(a) <=> last_i(b)
    else
      # not parts of same terminal?
      # compare terminals
      splitword_terminal_i(a) <=> splitword_terminal_i(b)
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  #
  # For any other combination a warning is written to stderr and the
  # two nodes are treated as equal (0).
  def compare_mixed(a, b)
    if a.is_splitword? and b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)

    elsif a.is_terminal? and b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword?
      # then what?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      # a sort block must return a number: returning the nil of
      # $stderr.print would make Array#sort raise ArgumentError
      0
    end
  end

  # return last number of the ID of a node
  #
  # Terminates the program (exit 1) if the ID contains no usable digits.
  def last_i(n)
    n.id =~ /(\d+)$/ # match final string of digits
    if $1.nil? # this shouldn't happen _in principle_
      # but we might get weird node IDs for splitwords;
      # so we act gracefully and catch the case where there
      # is one final letter behind the digits
      n.id =~ /(\d+)\w$/
    end
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

  # assume the ID of the node includes N_sM
  # return N
  #
  # Terminates the program (exit 1) if the ID does not have that shape.
  def splitword_terminal_i(n)
    n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

end
193
+
194
+
@@ -0,0 +1,2347 @@
1
+ # SalsaTigerRegXML.rb
2
+ #
3
+ # Katrin Erk, June 2005
4
+ #
5
+ # Classes for accessing and managing
6
+ # SalsaTigerXML sentences
7
+ #
8
+ # The interface of the classes in this package
9
+ # is similar to that of SalsaTigerXML.rb
10
+ # but the package is based solely on regular expressions
11
+ # and not on REXML.
12
+ #
13
+ # Main class here: SalsaTigerSentence, keeps a complete sentence
14
+ #
15
+ # Nodes of the syntactic tree, frames and frame elements are all
16
+ # handed around as XMLNode objects, or more specifically
17
+ # SynNode, FrameNode and FeNode objects, respectively.
18
+ #
19
+ # Inheritance between classes in here:
20
+ #
21
+ # GraphNode
22
+ # |
23
+ # XMLNode
24
+ # |
25
+ # SalsaTigerXmlNode
26
+ # / \
27
+ # SynNode SemNode
28
+ # | / \
29
+ # TSSynNode FrameNode FeNode
30
+ #
31
+ #
32
+ # SalsaTigerSentence uses the other classes, but is separate
33
+ #
34
+ # SalsaTigerSentence does _not_ yield a faithful image of the SalsaTiger XML structure of
35
+ # a sentence. With the SalsaTiger XML structure you need to follow "idref" attributes
36
+ # to the elements with matching "id" attributes in other parts of the structure.
37
+ # With the classes in this package, you don't.
38
+ # Wherever in SalsaTiger XML you have an idref, you will have _direct access to the
39
+ # object_ here.
40
+ #
41
+ # Suppose that in the XML structure you have a nonterminal element X with <edge> elements
42
+ # pointing to other (terminal or nonterminal) elements X1,.., Xn. Then you'll have
43
+ # a SynNode object N that contains X as its XML object, and the children N1,..,Nn of N
44
+ # will be SynNode objects that contain X1,..,Xn as their XML objects.
45
+ #
46
+ # A SynNode that is a terminal may have children too: its splitword parts (if any).
47
+ #
48
+ # So: a syntactic node is a SynNode object, its children are SynNode objects. The edges
49
+ # to its children are labeled the same way as in the XML structure. If the children
50
+ # are splitword parts, the edges are unlabeled.
51
+ #
52
+ # A frame is a FrameNode object, its children are FeNode objects. The edges to its children
53
+ # are labeled with the FE name or with "target".
54
+ #
55
+ # A frame element is an FeNode object, its children are SynNode objects. The edges to its
56
+ # children are unlabeled.
57
+ #
58
+ # A frame underspecification is an UspNode object, its children are FrameNode objects.
59
+ # The edges to its children are unlabeled.
60
+ #
61
+ # A frame element underspecification is an UspNode object, its children are
62
+ # FeNode objects. The edges to its children are unlabeled.
63
+
64
+ require "frprep/Tree"
65
+ require "frprep/STXmlTerminalOrder"
66
+ require "frprep/RegXML"
67
+ require "frprep/ruby_class_extensions"
68
+
69
+ #############
70
+ # class XMLNode
71
+ #
72
+ # node with entries pointing to its children
73
+ # as well as its parent.
74
+ # all edges may be labeled.
75
+ # each node has a unique ID.
76
+ #
77
+ # indexes a string with XML data representing the same node,
78
+ # but does not look into it, just keeps it
79
+ #
80
+ # methods:
81
+ # This class inherits from TreeNode and GraphNode.
82
+ # See Tree.rb and Graph.rb for the methods they offer.
83
+ #
84
+ # new initializes the object
85
+ #
86
+ # get returns the XML object representing
87
+ # the same node as this node object
88
+ #
89
+
90
class XMLNode < TreeNode

  ###
  # Create a node for an XML element or for a piece of text.
  #
  # name:      string: element name; or, for text, the whole text
  # attribute: hash: attr_name(string) -> attr_value(string)
  # id:        string: node ID; if nil, an ID is made from the system time
  # i_am_text: boolean: set to anything but false or nil
  #            to represent not an xml element but text
  #
  # Raises a RuntimeError if a text node is given attributes.
  def initialize(name, attribute, id, i_am_text = false)

    if id.nil?
      # I wasn't given any ID
      # take system time for an ID
      # use to_f to get fractions of seconds too:
      # If I make several nodes in the same second,
      # they should still have unique IDs
      # NOTE(review): two nodes created within the clock resolution
      # would still get the same ID -- confirm this cannot matter
      id = Time.new().to_f.to_s
    end

    super(id)

    # remember values for this element
    set_f("name", name)
    set_f("attributes", attribute)
    set_f("i_am_text", i_am_text)

    # sanity check
    # (tests the 'attribute' parameter; the former 'attributes' is not
    # a name defined in this class)
    if i_am_text and attribute
      raise "A text element cannot have attributes"
    end

    # additional XML material that is kept and printed
    # as is, see add_kith / get_xml_ofkith
    @kith = Array.new()
  end

  ###
  # add sanity check:
  # if this is text rather than an xml element,
  # it cannot have children
  def add_child(child, edgelabel, varhash={})
    if get_f("i_am_text")
      raise "A text element cannot have children"
    end
    super(child, edgelabel, varhash)
  end

  ###
  # remember an object that is printed (via to_s) inside
  # this element, after the children
  def add_kith(xml) # RegXML object
    @kith << xml
  end

  ###
  # set attribute
  # value must be a String, otherwise a RuntimeError is raised
  def set_attribute(name, value)
    unless value.instance_of?(String)
      raise "I can only set attribute values to strings. Got: #{value.class.to_s}"
    end

    if get_f("attributes").nil?
      set_f("attributes", Hash.new())
    end
    get_f("attributes")[name] = value
  end

  ###
  # returns the value of the attribute,
  # nil if this node has no attributes stored
  def get_attribute(name)
    if get_f("attributes")
      return get_f("attributes")[name]
    else
      return nil
    end
  end

  ###
  # delete attribute
  def del_attribute(name)
    if get_f("attributes")
      get_f("attributes").delete(name)
    end
  end

  ###
  # return XML as string:
  # If this is a text, just return the text
  # which is stored in "name"
  # If this is an XML element,
  # make a tag from its name and attributes,
  # then add tags for all its children,
  # then add an end tag.
  def get()
    if get_f("i_am_text")
      # text rather than XML element
      return get_f("name")
    else
      # XML element, not text
      string = "<" + get_f("name")
      if get_f("attributes")
        string << get_f("attributes").to_a.map { |name, value|
          " " + name + "=\'" + xml_secure_val(value) + "\'"
        }.join()
      end
      string << ">\n"
      string << get_xml_embedded()
      string << "</#{get_f("name")}>\n"
      return string
    end
  end

  #############
  protected

  # everything that is printed between start tag and end tag
  def get_xml_embedded()
    return get_xml_ofchildren() +
      get_xml_ofkith()
  end


  # XML of all children, concatenated
  def get_xml_ofchildren()
    return children.map { |child|
      child.get()
    }.join()
  end


  # string form of all kith objects, one per line
  def get_xml_ofkith()
    return @kith.map { |thing| thing.to_s + "\n" }.join()
  end


  ###
  # write a warning to stderr about XML material that is skipped
  def warn_child_ignored(where, xml_node)
    $stderr.puts "WARNING: additional material found in #{where}, will be ignored:"
    $stderr.puts "\t" + xml_node.to_s
  end

  ###
  # make an attribute value safe for printing between single quotes:
  # ' becomes &apos; and " becomes &apos;&apos;
  # NOTE(review): " is not mapped to &quot;, and & and < are left
  # alone -- presumably what the reading side expects; confirm
  def xml_secure_val(value) # string: value of an attribute
    return value.gsub(/'/, "&apos;").gsub(/"/, "&apos;&apos;")
  end
end
229
+
230
+ #############
231
+ # class SalsaTigerXmlNode
232
+ #
233
+ # additional methods:
234
+ #
235
+ # is_terminal? true if this is a Tiger XML terminal node
236
+ #
237
+ # is_nonterminal? true if this is a Tiger XML nonterminal node
238
+ #
239
+ # is_splitword? true if this is a splitword part
240
+ #
241
+ # is_syntactic? true for terminal, nonterminal, splitword
242
+ #
243
+ # is_frame? true if this is a Salsa/Tiger XML frame
244
+ #
245
+ # is_target? true if this is a Salsa/Tiger XML frame target
246
+ #
247
+ # is_fe? true if this is a Salsa/Tiger XML frame element
248
+ #
249
+ # is_outside_sentence? returns false -- this node is not a placeholder for
250
+ # a node that is outside the current sentence
251
+ # (but see descendant class TSSynNode)
252
+ #
253
+ # yield_nodes returns the list of descendants that are leaves of the tree
254
+ # NOTE: this overwrites the Graph.yield_nodes method
255
+ # since we have to treat splitwords in a special way
256
+ # empty array if no yield nodes are present
257
+ #
258
+ # yield_nodes_ordered returns those descendants ordered by precedence
259
+ # in the sentence, i.e. their node IDs.
260
+ #
261
+ # sid returns the sentence ID of this node
262
+ #
263
+ # to_s returns the yield of this node as a string of space-separated words
264
+ # words ordered left to right
265
+ #
266
class SalsaTigerXmlNode < XMLNode
  include StringTerminalsInRightOrder

  ###
  # Find the ID that a RegXML element stands for.
  # Elements that merely point to another element carry it in "idref",
  # all other elements (among them "part") carry it in "id".
  #
  # returns: a string, the ID, or nil if none was found
  def self.xmlel_id(xml_obj) # RegXML object
    pointing_elements = ["edge", "fenode", "uspitem", "splitword", "other_edge"]
    attr_name = pointing_elements.include?(xml_obj.name) ? "idref" : "id"
    return xml_obj.attributes()[attr_name]
  end

  ###
  # xml: RegXML object, either an element or text
  def initialize(xml)
    unless xml.text?
      # xml element: name, attributes and ID are read off the element
      super(xml.name(), xml.attributes(), SalsaTigerXmlNode.xmlel_id(xml), false)
    else
      # text: the object itself is stored as the "name", no attributes, no ID
      super(xml, nil, nil, true)
    end
  end

  ###
  # true if the element name is "t"
  def is_terminal?
    get_f("name") == "t"
  end

  ###
  # true if the element name is "nt"
  def is_nonterminal?
    get_f("name") == "nt"
  end

  ###
  # true if the element name is "part"
  def is_splitword?
    get_f("name") == "part"
  end

  ###
  # true for terminal, nonterminal and splitword nodes
  def is_syntactic?
    return true if is_terminal?
    return true if is_nonterminal?
    return true if is_splitword?
    return false
  end

  ###
  # true if the element name is "frame"
  def is_frame?
    get_f("name") == "frame"
  end

  ###
  # true if the element name is "target"
  def is_target?
    get_f("name") == "target"
  end

  ###
  # true if the element name is "fe"
  def is_fe?
    get_f("name") == "fe"
  end

  ###
  # sentence ID: the part of the node ID before the first "_",
  # nil if the node ID contains no "_"
  def sid()
    id =~ /^(.*?)_/
    return $1
  end

  ###
  # nodes of this class are always inside the current sentence
  def is_outside_sentence?
    false
  end

  ###
  # leaves below this node; splitword children are not counted as
  # children, so a node with only splitword children is a leaf itself
  def yield_nodes()
    proper_children = children.reject { |kid| kid.is_splitword? }
    return [ self ] if proper_children.empty?

    proper_children.inject(Array.new) { |leaves, kid|
      if kid.children.all? { |grandkid| grandkid.is_splitword? }
        # kid has nothing but splitword parts below it: it is a leaf
        leaves << kid
      else
        leaves.concat(kid.yield_nodes())
      end
    }
  end

  ###
  # yield nodes with terminals and splitwords sorted left to right;
  # the sorting cannot deal with other nodes, so those are
  # appended at the end
  def yield_nodes_ordered() # legacy name
    sortable, others = yield_nodes().distribute { |nd| nd.is_terminal? or nd.is_splitword? }
    in_order = sort_terminals_and_splitwords_left_to_right(sortable)
    return in_order.concat(others)
  end

  ###
  # same as yield_nodes_ordered
  def terminals_sorted() # name parallel to the method of SalsaTigerSentence
    yield_nodes_ordered()
  end

  ###
  # the yield of this node as a string, see string_for_node
  def to_s
    string_for_node(self)
  end
end
387
+
388
+ #############
389
+ # class SynNode
390
+ #
391
+ # inherits from SalsaTigerXmlNode,
392
+ # adds to it methods specific to nodes
393
+ # that describe the syntactic structure
394
+ #
395
+ # additional/changed methods:
396
+ #
397
+ # part_of_speech part_of_speech information as a string,
398
+ # nil for anything but terminal nodes
399
+ #
400
+ # word word information for this node as a string,
401
+ # nil for anything but terminal nodes
402
+ #
403
+ # category category information for this node as a string,
404
+ # nil for anything but nonterminal nodes
405
+ #
406
+ # is_punct? true if this is a terminal node and it is a punctuation sign
407
+ #
408
+ # get_sem returns the semantic nodes (FeNode objects) attached to this syntactic node via add_sem
409
+ # Idea: this is basically the inverse of the edge pointing from
410
+ # the FeNode to this SynNode, so you can fetch a node's semantics directly
411
+ #
412
+ # add_sem add non-tree edge from this syntactic node to a FeNode
413
+
414
class SynNode < SalsaTigerXmlNode

  ###
  # xml: RegXML object, handed on to SalsaTigerXmlNode
  def initialize(xml)
    super(xml)

    # FeNode objects registered with add_sem
    @sem = []
    # entries [link_label, other_node, attributes], see add_link
    @other_links = []
  end

  ###
  # store a labeled link to another node
  #
  # other_node: SynNode
  # link_label: string: edge label
  # attributes: hash string>string: further attribute-value pairs for the edge
  def add_link(other_node, link_label, attributes = {})
    entry = [link_label, other_node, attributes]
    @other_links << entry
  end

  ###
  # stored links as [link_label, node, attributes] triples
  #
  # label: string/nil: if string, use only links with this link_label
  def get_linked(label = nil)
    return @other_links unless label

    @other_links.select { |entry| entry.first == label }
  end

  ###
  # attribute "pos" without surrounding whitespace, nil if not set
  def part_of_speech
    pos = get_attribute("pos")
    pos ? pos.strip : nil
  end

  ###
  # attribute "cat" without surrounding whitespace, nil if not set
  def category
    cat = get_attribute("cat")
    cat ? cat.strip : nil
  end

  ###
  # attribute "word" without surrounding whitespace, nil if not set
  def word()
    w = get_attribute("word")
    w ? w.strip : nil
  end

  ###
  # true if part of speech or word identify this node
  # as a punctuation sign
  def is_punct?()
    # only terminals can be punctuation signs
    return false if is_nonterminal?

    # first try the part of speech
    # this works at least for TIGER corpus annotation
    pos = part_of_speech
    return true if ['$.', '$,', '$('].include?(pos)
    return true if pos =~ /^PUNC/

    # no luck with part of speech: check the word against the
    # known punctuation signs
    known_signs = [".", ";", ",", ":", "?", "!", "(", ")", "[", "]", "{", "}", "-", "''", "``", "\"", "'"]
    known_signs.include?(word)
  end

  ###
  # terminals: the word; everything else: as in the superclass
  def to_s()
    return word if is_terminal?

    super()
  end

  ###
  # copy of the list of nodes registered with add_sem
  def get_sem()
    @sem.clone()
  end

  ###
  # register a semantic node; anything but an FeNode is refused
  def add_sem(fe_node)
    if fe_node.class != FeNode
      raise "Unexpected class of semantic node: was expecting an FeNode"
    end

    @sem << fe_node
  end

  #############
  protected

  # children are printed as <edge> elements pointing to the child's ID,
  # stored links as <other_edge> elements; a missing label is shown as '-'
  def get_xml_ofchildren()
    xml = ""

    each_child_with_edgelabel do |edge_label, kid|
      # splitwords are handled separately in the "sem" part of the sentence
      next if kid.is_splitword?

      shown = edge_label ? xml_secure_val(edge_label) : "-"
      xml << "<edge label=\'#{shown}\' idref=\'#{xml_secure_val(kid.id)}\'/>\n"
    end

    @other_links.each do |link_label, target, extra|
      shown = link_label ? xml_secure_val(link_label) : "-"
      xml << "<other_edge label=\'#{shown}\'"
      xml << " idref=\'#{xml_secure_val(target.id)}\'"
      if extra
        pairs = extra.to_a.map { |attr, val| "#{xml_secure_val(attr)}=\'#{xml_secure_val(val)}\'" }
        xml << " " + pairs.join(" ")
      end
      xml << "/>\n"
    end

    return xml
  end
end
554
+
555
+ #############
556
+ # class TSSynNode
557
+ #
558
+ # inherits from SynNode
559
+ #
560
+ # describes a syntactic node that isn't really there:
561
+ # a reference to a node in another sentence
562
+ #
563
+ # contains that node's ID, but an empty RegXML object,
564
+ # its string is "<unknown>", and you cannot add
565
+ # a child to it
566
+ #
567
+ # new or changed methods:
568
+ #-----------------------
569
+ #
570
+ # is_outside_sentence? returns true
571
+ #
572
+ # word returns "<unknown>"
573
+ #
574
+ # add_child raises an error
575
+
576
class TSSynNode < SynNode

  ###
  # id_string: string, ID of the node this placeholder stands for;
  #            it is put into an otherwise empty OTHER_SENTENCE element
  def initialize(id_string)
    placeholder_xml = "<OTHER_SENTENCE id='" + id_string + "'/>"
    super(RegXML.new(placeholder_xml))
  end

  ###
  # always true for this class
  def is_outside_sentence?
    true
  end

  ###
  # word of this node: <unknown>
  def word
    "<unknown>"
  end

  # this class does not accept children: always raises
  def add_child(arg1, arg2)
    raise "Not implemented for this class"
  end
end
598
+
599
+ #############
600
+ # class SemNode
601
+ #
602
+ # common superclass for FrameNode and FeNode,
603
+ # with methods that are the same for both:
604
+ #
605
+ #
606
+ # is_usp? returns true if the frame/FE is involved in underspecification,
607
+ # else false
608
+ #
609
+ # flags returns an array of all the frame/FE flags for this node.
610
+ # members of the array are strings describing the flags
611
+ # that have been set to true
612
+ #
613
+ # add_flag add or remove a frame/FE flag
614
+ # remove_flag
615
+
616
class SemNode < SalsaTigerXmlNode
  # flags: the array that add_flag / remove_flag work on
  attr_reader :flags

  # xml: RegXML object or text, handed on to SalsaTigerXmlNode
  def initialize(xml)
    super(xml)
    # starts out without any flags
    @flags = []
  end

  ###
  # true if the attribute "usp" has the value "yes"
  def is_usp?
    get_attribute("usp") == "yes"
  end

  ###
  # name: string: flag name, appended to the list of flags
  def add_flag(name)
    @flags << name
  end

  ###
  # name: string: flag name, all entries equal to it are removed
  def remove_flag(name)
    @flags.delete(name)
  end

  #############
  protected

  # what the superclass embeds, followed by the flags
  def get_xml_embedded()
    inherited_part = super()
    return inherited_part + get_xml_offlags()
  end

  # one <flag> element per stored flag name
  def get_xml_offlags()
    flag_lines = @flags.map do |flagname|
      "<flag name=\'#{xml_secure_val(flagname)}\'/>\n"
    end
    return flag_lines.join
  end
end
654
+
655
+
656
+
657
+ #############
658
+ # class FrameNode
659
+ #
660
+ # inherits from SemNode
661
+ # adds to it methods specific to nodes
662
+ # that describe a frame
663
+ #
664
+ # additional/changed methods:
665
+ #
666
+ # name returns the name of the frame
667
+ # set_name changes the name of the frame to a new name
668
+ # target returns the target (as a FeNode object)
669
+ #
670
+ # each_child() iterates through FEs, children() returns all FEs
671
+ #
672
+ # each_fe_by_name A frame node may have several FE children with the same
673
+ # frame element label. While each_child returns them separately,
674
+ # each_fe_by_name lumps FE children with the same frame element label
675
+ # into one FeNode.
676
+ # Warnings:
677
+ # - the REXML object of the FeNode is that of the first FE child
678
+ # with that frame element label.
679
+ # - Underspecification is ignored! If you have the same FE twice,
680
+ # and there is underspecification regarding the extent of the FE,
681
+ # the two FE children will be lumped together anyway.
682
+ # If you don't want that, use each_child instead.
683
+ #
684
+ #
685
+ # add_fe CAUTION: please do not call this method directly externally,
686
+ # use SalsaTigerSentence.add_fe, otherwise the node and its ID
687
+ # will not be recorded in the node list and the node cannot be retrieved
688
+ # via its ID
689
+
690
class FrameNode < SemNode

  ###
  # Returns the target child of this frame (the single child reached
  # via the edge label "target").
  # If there is no target, prints a warning listing the child labels
  # to stderr and returns nil. Raises if there is more than one target.
  def target()
    targets = children_by_edgelabels(["target"])

    if targets.empty?
      $stderr.puts "SalsaTigerRegXML warning: Frame #{id()}: No target, but I got: \n" + child_labels().join(", ")
      return nil
    end

    if targets.length != 1
      raise "target: more than one target to frame "+id()
    end

    targets.first
  end

  ###
  # name of the frame, read from the "name" attribute
  def name
    get_attribute("name")
  end

  ###
  # change the name of the frame by overwriting the "name" attribute
  def set_name(new_name)
    set_attribute("name", new_name)
  end

  ###
  # each_fe: synonym for each_child
  def each_fe()
    each_child { |fe| yield fe }
  end

  ###
  # fes: synonym for children
  def fes()
    children()
  end

  ###
  # Yields one node per distinct child label, skipping "target".
  # A label that occurs on exactly one child yields that child itself;
  # otherwise a fresh FeNode with ID "<frame id>_<label>" is built,
  # the children of all FEs with that label are added to it,
  # and the combined node is yielded.
  def each_fe_by_name()
    child_labels.uniq.each { |fe_name|
      next if fe_name == "target"

      same_label_fes = children_by_edgelabels([fe_name])

      if same_label_fes.length == 1
        # one frame element with that name
        yield same_label_fes.first
      else
        # several frame elements with that name: combine them
        combined_fe = FeNode.new(fe_name, id() + "_" + fe_name)
        same_label_fes.each { |fe|
          fe.each_child() { |child| combined_fe.add_child(child) }
        }
        yield combined_fe
      end
    }
  end

  ###
  # Adds fe_node as a child, using the node's name as edge label.
  # Refuses (with messages on stderr and an exception) to add
  # a second target to a frame that already has one.
  def add_child(fe_node)
    if fe_node.name == "target"
      existing_targets = children_by_edgelabels(["target"])
      unless existing_targets.empty?
        $stderr.puts "Adding second target to frame #{id()}"
        $stderr.puts "I already have: " + existing_targets.map { |t| t.id() }.join(",")
        raise "More than one target."
      end
    end

    super(fe_node, fe_node.name)
  end

  ###
  # Removes fe_node as a child; the edge label is the node's name.
  def remove_child(fe_node)
    super(fe_node, fe_node.name)
  end

  ###
  # Creates a new FeNode, makes it a child of this frame and
  # points it to the given syntactic nodes.
  #
  # fe_name:   string, name of the FE to add ("target" for a target)
  # syn_nodes: array of syntactic nodes that the new FE should point to
  # fe_id:     string, ID for the new FE; if omitted, an ID is derived
  #            from this frame's ID and the current system time
  #
  # Returns the new FeNode. Raises if a second target would be added.
  def add_fe(fe_name, syn_nodes, fe_id = nil)
    if fe_name == "target"
      existing_targets = children_by_edgelabels(["target"])
      unless existing_targets.empty?
        $stderr.puts "Adding second target to frame #{id()}"
        $stderr.puts "I already have: " + existing_targets.map { |t| t.id() }.join(",")
        raise "More than one target."
      end
    end

    # no FE ID given: make one myself
    fe_id ||= id() + "_fe" + Time.new().to_f.to_s

    # make FE node and list it as this frame's child
    new_fe = FeNode.new(fe_name, fe_id)
    add_child(new_fe)

    # add syn nodes
    syn_nodes.each { |syn_node| new_fe.add_child(syn_node) }

    new_fe
  end
end
799
+
800
+ #############
801
+ # class FeNode
802
+ #
803
+ # inherits from SemNode,
804
+ # adds to it methods specific to nodes
805
+ # that describe a frame element or target
806
+ #
807
+ # additional/changed methods:
808
+ #----------------------------
809
+ #
810
+ # name returns the name of the frame element, or "target"
811
+ #
812
+ # add_child, remove_child
813
+
814
class FeNode < SemNode

  ###
  # Creates a node for a frame element or a target.
  #
  # name_or_xml: either a RegXML object describing the <fe>/<target>
  #              element, or the name of the FE as a string
  #              ("target" makes a target node)
  # id_if_name:  string, ID to use if only the name was given
  #
  # Raises if name_or_xml is neither a String nor a RegXML object.
  def initialize(name_or_xml, id_if_name = nil)

    case name_or_xml.class.to_s
    when "String"
      if name_or_xml == "target"
        super(RegXML.new("<target id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"))
        @i_am_target = true
      else
        super(RegXML.new("<fe name=\'#{xml_secure_val(name_or_xml)}\' id=\'#{xml_secure_val(id_if_name.to_s)}\'/>"))
        @i_am_target = false
      end

    when "RegXML"
      super(name_or_xml)
      @i_am_target = (name_or_xml.name() == "target")

    else
      raise "Shouldn't be here: " + name_or_xml.class.to_s
    end

    # child_attr: keep additional attributes of <fenode> elements,
    # if there are any
    # child_attr: hash syn_node_id(string) -> attributes(hash)
    @child_attr = Hash.new()
  end

  ###
  # "target" for a target node, otherwise the value of the "name" attribute
  def name
    if @i_am_target
      return "target"
    else
      return get_attribute("name")
    end
  end

  ###
  # Makes syn_node a child of this FE (as a pointer rather than an edge).
  #
  # syn_node: the syntactic node to point to
  # xml_obj:  optional <fenode> XML element; any attributes it has
  #           besides "idref" are remembered and written out again
  #           by get_xml_ofchildren
  def add_child(syn_node, xml_obj = nil)
    if xml_obj
      # we've been given the fenode XML element
      # see if there are any attributes that we will need:
      # get attributes, remove the idref (we get that from the
      # child's ID directly).
      # Work on a copy so that the caller's XML element is never changed.
      extra_attr = xml_obj.attributes.dup
      extra_attr.delete("idref")
      unless extra_attr.empty?
        @child_attr[syn_node.id] = extra_attr
      end
    end

    super(syn_node, nil, "pointer_insteadof_edge" => true)
  end

  ###
  # Removes the pointer to syn_node and forgets any <fenode> attributes
  # stored for it, so that they do not reappear if the node is added
  # again later without attributes.
  def remove_child(syn_node, varhash={})
    result = super(syn_node, nil, "pointer_insteadof_edge" => true)
    @child_attr.delete(syn_node.id)
    return result
  end

  #############
  protected

  # One "<fenode idref='...'/>" line per child, including the
  # additional attributes remembered in @child_attr, if any.
  def get_xml_ofchildren()
    return children.map { |child|
      extra_attr = @child_attr[child.id()]
      attr_string =
        if extra_attr
          extra_attr.to_a.map { |attr, val|
            " #{attr}=\'#{xml_secure_val(val)}\'"
          }.join()
        else
          ""
        end

      "<fenode idref=\'#{xml_secure_val(child.id())}\'" + attr_string + "/>\n"
    }.join()
  end
end
898
+
899
+ #############
900
+ # class UspNode
901
+ #
902
+ # inherits from SalsaTigerXmlNode,
903
+ # adds to it methods specific to nodes
904
+ # that describe a frame underspecification or frame element underspecification
905
+ #
906
+ # additional/changed methods:
907
+ #----------------------------
908
+ #
909
+ # new initializes the object
910
+ # rexml_object: underlying XML object for this node
911
+ # frame_or_fe: string, either "frame" for frame underspecification
912
+ # or "fe" for frame element underspecification
913
+ #
914
+ # add_child, remove_child add, remove underspecification entry
915
+
916
class UspNode < SalsaTigerXmlNode

  # i_am: string, "frame" or "fe", the kind of underspecification
  attr_reader :i_am

  ###
  # xml_obj:     RegXML object, the underlying XML element of this node
  # frame_or_fe: string, "frame" for frame underspecification,
  #              "fe" for frame element underspecification;
  #              anything else raises an exception
  def initialize(xml_obj, frame_or_fe)
    super(xml_obj)

    @i_am = ["frame", "fe"].detect { |kind| kind == frame_or_fe }
    if @i_am.nil?
      raise "new: neither frame nor fe??"
    end
  end

  ###
  # Adds an underspecification entry pointing to node and marks
  # the node itself with the attribute usp="yes".
  # Raises if node is nil.
  def add_child(node, varhash={})
    raise "Got nil for a node." unless node

    super(node, nil, "pointer_insteadof_edge" => true)

    # set usp. attribute on child
    node.set_attribute("usp", "yes")
  end

  ###
  # Removes the underspecification entry for node and deletes
  # the "usp" attribute from the node.
  def remove_child(node, varhash={})
    super(node, nil, "pointer_insteadof_edge" => true)

    # removing "usp" attribute on child
    # this will be wrong if the child is involved in more
    # than one instance of underspecification!
    $stderr.puts "Warning: unsafe removal of attribute 'usp'"
    node.del_attribute("usp")
  end

  #############
  protected

  # one "<uspitem idref='...'/>" line per child
  def get_xml_ofchildren()
    uspitems = children.map { |child|
      "<uspitem idref=\'#{xml_secure_val(child.id)}\'/>\n"
    }
    return uspitems.join()
  end

end
969
+
970
+ #############
971
class SalsaTigerSentenceGraph < XMLNode
  include StringTerminalsInRightOrder

  # node: hash node_id -> node object for all syntactic nodes
  attr_reader :node

  ###
  # Reads the syntactic structure of one sentence.
  #
  # xml_obj:     RegXML object for the graph element, or nil
  #              if there is no syntactic information
  # sentence_id: string, ID of this sentence
  def initialize(xml_obj, sentence_id)

    # global data:
    # node: hash node_id -> XMLNode object
    #   maps node IDs to the nodes with that ID
    @node = Hash.new
    @sentence_id = sentence_id

    if xml_obj
      # we actually have syntactic information.
      # read it.

      # initialize this object as an XML node,
      # i.e. remember the outermost element's name, attributes,
      # and ID, and specify that it's not a text but an XML object
      super(xml_obj.name, xml_obj.attributes, sentence_id + "_graph", false)

      # initialize nodes, remember their IDs
      xml_obj.children_and_text.each { |child_or_text|
        case child_or_text.name
        when "terminals"
          make_nodes(child_or_text, "t", "s/graph/terminals", "all_children_kith")
        when "nonterminals"
          make_nodes(child_or_text, "nt", "s/graph/nonterminals")
        else
          # additional info that we don't need for now
          # keep for output
          add_kith(child_or_text)
        end
      }

      # add edges between nodes
      nonterminals_elt = xml_obj.children_and_text.detect { |child| child.name == "nonterminals" }
      if nonterminals_elt
        nonterminals_elt.children_and_text.each { |nt|
          # we've already done the warning bit in make_nodes
          next unless nt.name == "nt"

          syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(nt)], nt)
        }
      end

    else
      # we have no syntactic information
      # record it anyway
      super("graph", {}, sentence_id + "_graph", false)
    end
  end

  ###
  # Adds split word information to the graph.
  # xml_obj: RegXML object for the "splitwords" element, or nil
  def add_splitwords(xml_obj)
    unless xml_obj.nil?
      # splitwords is an XML element with name "splitwords" and
      # children named "splitword", each of which describes a split
      # for one of the terminals we already know
      xml_obj.children_and_text.each { |splitword|
        unless splitword.name() == "splitword"
          warn_child_ignored("s/sem/splitwords/", splitword)
          next
        end

        # make nodes for the splitword parts
        make_nodes(splitword, "part", "s/sem/splitwords/splitword", "all_children_kith")

        # this is the terminal that is being split:
        # add links to its new children
        syn_add_children(@node[SalsaTigerXmlNode.xmlel_id(splitword)], splitword)
      }
    end
  end

  ###
  def to_s
    string_for_nodes(syn_roots())
  end

  ###
  # Returns the XML for this graph (delegating to the superclass).
  # Before that, the 'root' attribute is set to the ID of the first
  # syntactic root, since the Salsa tool needs it. A graph without
  # any nodes has no root; in that case the attribute is left alone
  # instead of failing on nil.
  def get()
    root = syn_roots().first
    if root
      set_attribute("root", root.id())
    end
    super()
  end

  #####
  # access methods

  ###
  # yields each syntactic node in turn
  def each_node
    @node.each_value { |n|
      yield n
    }
  end

  ###
  # all syntactic nodes as an array
  def nodes
    return @node.values()
  end

  ###
  # yields each terminal node (in no particular order)
  def each_terminal
    @node.each_value { |node|
      if node.is_terminal?
        yield node
      end
    }
  end

  ###
  # yields each terminal node, in the order determined by
  # sort_terminals_and_splitwords_left_to_right
  def each_terminal_sorted
    sort_terminals_and_splitwords_left_to_right(terminals).each { |node_obj|
      yield node_obj
    }
  end

  ###
  # all terminal nodes as an array
  def terminals
    return @node.values.select { |node| node.is_terminal? }
  end

  ###
  # all terminal nodes as an array, in the order determined by
  # sort_terminals_and_splitwords_left_to_right
  def terminals_sorted
    return sort_terminals_and_splitwords_left_to_right(terminals)
  end

  ###
  # yields each nonterminal node
  def each_nonterminal
    @node.each_value { |node|
      if node.is_nonterminal?
        yield node
      end
    }
  end

  ###
  # all nonterminal nodes as an array
  def nonterminals
    return @node.values.select { |node| node.is_nonterminal? }
  end

  ###
  # all nodes without a parent, as an array (empty if there are no nodes)
  def syn_roots
    return @node.values.select { |node|
      node.parent().nil?
    }
  end

  ######################
  # adding nodes

  ###
  def add_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  def remove_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  # Creates a new syntactic node and registers it under its ID.
  #
  # sentid: string, sentence ID
  # label:  string, "t" or "nt"; anything else raises an exception
  # cat:    string, category (optional)
  # word:   string, word (optional)
  # pos:    string, part of speech (optional)
  # syn_id: string, ID for the new node (optional); the node ID is
  #         the sentence ID plus "_" plus either syn_id or
  #         an ID generated from the system time
  #
  # Returns the new SynNode.
  def add_node(sentid, label, cat = nil, word = nil, pos = nil, syn_id = nil)

    unless ["t", "nt"].include? label
      raise "Unknown node label #{label} for new syntactic node. Must be either t or nt."
    end

    # make node ID: sentence ID plus ID generated by system time
    if syn_id
      new_id = sentid + "_" + syn_id
    else
      new_id = sentid + "_" + Time.new().to_f.to_s
    end

    elt = "<#{label}"
    # the block parameter is deliberately not called "label":
    # that would shadow (Ruby 1.8: overwrite) the method parameter
    [["id", new_id], ["cat", cat], ["word", word], ["pos", pos]].each { |attr_name, content|
      if content
        elt << " #{attr_name}=\"#{xml_secure_val(content)}\""
      end
    }
    elt << "/>"
    n = SynNode.new(RegXML.new(elt))
    @node[n.id] = n

    return n
  end

  ###
  # Removes a syntactic node from the graph.
  # node: SynNode
  def remove_node(node)
    # remove node from list
    @node.delete(node.id)

    # remove it as child and parent of other nodes;
    # add its own children to the parent.
    # the _edgelabel_ of the new edges will be the edgelabels
    # between the original node and its children;
    # in other words, the label of the removed node's incoming edge
    # is deleted

    pair = node.parent_with_edgelabel
    if pair
      # delete incoming edge for deleted node
      label, parent = pair
      parent.remove_child(node, label)
    end
    # delete outgoing edge for deleted node
    node.each_child_with_edgelabel { |label, child|
      child.remove_parent(node, label)
    }
    # glue deleted node's children to its parent
    if pair
      _plabel, parent = pair
      node.each_child_with_edgelabel { |clabel, child|
        parent.add_child(child, clabel)
      }
    end
  end

  ######################
  protected

  ###
  # XML for the content of the graph element: a <terminals> section
  # with the sorted terminals, then a <nonterminals> section
  def get_xml_ofchildren()
    string = ""

    string << "<terminals>\n"
    each_terminal_sorted { |t|
      string << t.get()
    }
    string << "</terminals>\n"

    string << "<nonterminals>\n"
    each_nonterminal { |nt|
      string << nt.get()
    }
    string << "</nonterminals>\n"

    return string
  end

  # Makes a SynNode for each child of xml_obj that has the expected name
  # and registers it in @node; other children are reported
  # via warn_child_ignored.
  #
  # xml_obj:           RegXML object
  # expected_obj_name: string, element name of the children to turn into nodes
  # where:             string, location description for the warning
  # all_children_kith: object: if non-nil, keep all children
  #                    of the new nodes as kith
  def make_nodes(xml_obj, expected_obj_name, where, all_children_kith = nil)

    xml_obj.children_and_text.each { |elt|

      if elt.name == expected_obj_name
        # this is the kind of child we were expecting to see
        n = SynNode.new(elt)
        @node[n.id] = n

        if all_children_kith
          elt.children_and_text.each { |elt_child|
            n.add_kith(elt_child)
          }
        end

      else
        warn_child_ignored(where, elt)
      end
    }
  end

  # Connects node to the nodes referenced by the children of xml_obj:
  # "edge" and "part" elements become child edges, "other_edge" elements
  # become links, everything else is kept as kith for output.
  # Raises if node is nil or a referenced node is unknown.
  def syn_add_children(node, xml_obj)
    unless node
      raise "Shouldn't be here"
    end

    xml_obj.children_and_text.each { |edge|

      if ["edge", "part"].include? edge.name()

        # add an edge to this child,
        # retrieve the node with the given ID from @node
        child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
        unless child
          raise "Sentence #{@sentence_id}: I cannot find a node for " + edge.to_s()
        end

        edgelabel = edge.attributes()["label"]
        node.add_child(child, edgelabel)

      elsif edge.name() == "other_edge"
        # add link to this node,
        # retrieve the node with the given ID from @node
        child = @node[SalsaTigerXmlNode.xmlel_id(edge)]
        unless child
          raise "Sentence #{@sentence_id}: I cannot find a node for other_edge #{SalsaTigerXmlNode.xmlel_id(edge)} : " + edge.to_s()
        end

        attributes = edge.attributes()
        if attributes
          edgelabel = attributes.delete("label")
        else
          edgelabel = nil
        end
        node.add_link(child, edgelabel, attributes)

      else
        # something other than an edge
        # keep for output
        node.add_kith(edge)
      end
    }
  end
end
1306
+
1307
+ #############
1308
class SalsaTigerSentenceSem < XMLNode

  # node: hash node_id -> node object for all semantic nodes
  attr_reader :node

  ###
  # Returns the child of xml_obj named "splitwords", or nil if there is none.
  def SalsaTigerSentenceSem.get_splitwords(xml_obj)
    return xml_obj.children_and_text.detect { |child|
      child.name == "splitwords"
    }
  end

  ###
  # Reads the semantic information of one sentence.
  #
  # xml_obj:     RegXML object for the sem element, or nil
  #              if there is no semantic information
  # sentence_id: string, sentence ID
  # id_to_node:  hash: syn_node_id(string) -> SynNode object
  def initialize(xml_obj, sentence_id, id_to_node)

    # global data:
    # node: hash node_id -> XMLNode object
    #   maps node IDs to the nodes with that ID
    # frame_id, uspframe_id, uspfe_id: arrays of node IDs,
    #   listing all frame nodes, frame underspecification nodes,
    #   and FE underspecification nodes respectively
    # globals: array of RegXML objects, each representing one sentence flag
    @node = Hash.new
    @frame_id = Array.new
    @uspframe_id = Array.new
    @uspfe_id = Array.new
    @globals = Array.new

    if xml_obj
      # we actually have semantic information.
      # read it.

      super(xml_obj.name, xml_obj.attributes, sentence_id + "_sem", false)

      globals_obj = frames_obj = usp_obj = nil

      xml_obj.children_and_text.each { |obj|
        case obj.name
        when "globals"
          globals_obj = obj
        when "frames"
          frames_obj = obj
        when "usp"
          usp_obj = obj
        else
          add_kith(obj)
        end
      }

      # handle globals
      if globals_obj
        globals_obj.children_and_text.each { |obj|
          @globals << obj
        }
      end

      # index frames
      if frames_obj
        frames_obj.children_and_text.each { |frame|
          unless frame.name() == "frame"
            warn_child_ignored("s/sem/frames/", frame)
            next
          end

          # make a node for the frame.
          node = FrameNode.new(frame)
          semnode_add_flags(node, frame)
          @node[node.id] = node
          @frame_id << node.id
          # add FEs
          frame_add_children(node, frame, id_to_node)
        }
      end

      # index underspecification
      if usp_obj
        usp_obj.children_and_text.each { |uspframe_or_fe|
          case uspframe_or_fe.name
          when "uspframes"
            initialize_usp(uspframe_or_fe, "frame")
          when "uspfes"
            initialize_usp(uspframe_or_fe, "fe")
          else
            warn_child_ignored("s/sem/usp/", uspframe_or_fe)
          end
        }
      end

    else
      # we have no semantic information
      # record it anyway
      super("sem", {}, sentence_id + "_sem", false)
    end
  end

  ################################################
  # access methods

  ###
  # yields each frame node in turn
  def each_frame
    @frame_id.each { |node_id|
      yield @node[node_id]
    }
  end

  ###
  # all frame nodes as an array
  def frames
    return @frame_id.map { |node_id| @node[node_id] }
  end

  ###
  # yields each frame underspecification block in turn
  def each_usp_frameblock
    @uspframe_id.each { |node_id|
      yield @node[node_id]
    }
  end

  ###
  # all frame underspecification blocks as an array
  def usp_frameblocks()
    return @uspframe_id.map { |node_id| @node[node_id] }
  end

  ###
  # yields each FE underspecification block in turn
  def each_usp_feblock
    @uspfe_id.each { |node_id|
      yield @node[node_id]
    }
  end

  ###
  # all FE underspecification blocks as an array
  def usp_feblocks()
    return @uspfe_id.map { |node_id| @node[node_id] }
  end

  ###
  # Returns the sentence flags as an array of hashes with the keys
  # "type" and "param" (attribute values of the flag element)
  # and "text" (the concatenated content of the flag element).
  def flags
    return @globals.map { |xml_obj|
      { "type" => xml_obj.attributes["type"],
        "param" => xml_obj.attributes["param"],
        "text" => xml_obj.children_and_text.map { |c| c.to_s }.join
      }
    }
  end

  ################################################
  # adding and removing things

  ###
  # Creates a new frame node and registers it.
  #
  # sentid: string, sentence ID
  # name:   string, name of the frame
  # sem_id: string, ID for the new node (optional); if omitted,
  #         an ID is made from the sentence ID and the system time
  #
  # Returns the new FrameNode.
  def add_frame(sentid, name, sem_id = nil)

    # make a node for the frame
    if sem_id
      frameid = sem_id
    else
      frameid = sentid + "_f" + Time.new().to_f.to_s
    end
    # escape ID and name like all other attribute values written in this file,
    # so that special characters cannot break the XML handed to RegXML
    n = FrameNode.new(RegXML.new("<frame id=\"#{xml_secure_val(frameid)}\" name=\"#{xml_secure_val(name)}\"/>"))
    @node[n.id] = n
    @frame_id << n.id

    return n
  end

  ###
  # Unregisters the given frame node.
  def remove_frame(frame_node)
    @node.delete(frame_node.id)
    @frame_id.delete(frame_node.id)
  end

  ###
  # Adds a new FE to a frame and registers it under its ID.
  #
  # frame_node:  FrameNode
  # fe_name:     string: name of new FE
  # fe_children: array:SynNode, children of new FE
  # sem_id:      optional: ID of new FE
  #
  # Returns the new FE node.
  def add_fe(frame_node, fe_name, fe_children, sem_id = nil)
    new_fe = frame_node.add_fe(fe_name, fe_children, sem_id)
    @node[new_fe.id] = new_fe
    return new_fe
  end

  ###
  # Unregisters the FE node and removes it from its parent.
  def remove_fe(fe_node)
    @node.delete(fe_node.id)
    fe_node.parent.remove_child(fe_node)
  end

  ###
  # Creates a new, empty underspecification block and registers it.
  # frame_or_fe: string: "frame" or "fe"
  # Returns the new UspNode.
  def add_usp(frame_or_fe)

    n = UspNode.new(RegXML.new("<uspblock/>"), frame_or_fe)
    @node[n.id] = n
    case frame_or_fe
    when "frame"
      @uspframe_id << n.id
    when "fe"
      @uspfe_id << n.id
    else
      raise "Shouldn't be here"
    end

    return n
  end

  ###
  # Removes all entries of an underspecification block,
  # then unregisters the block itself.
  def remove_usp(usp_node)
    usp_node.children.each { |child|
      usp_node.remove_child(child)
    }
    @node.delete(usp_node.id)
    case usp_node.i_am
    when "frame"
      @uspframe_id.delete(usp_node.id)
    when "fe"
      @uspfe_id.delete(usp_node.id)
    else
      raise "Shouldn't be here"
    end
  end

  ###
  # not supported; accepts the same optional third argument as
  # SalsaTigerSentenceGraph#add_child so both classes fail the same way
  def add_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  # not supported; see add_child
  def remove_child(arg1, arg2, varhash={})
    raise "Not implemented for this class"
  end

  ###
  # Adds a sentence flag.
  #
  # type:  string, the flag type
  # param: string, optional parameter of the flag
  # text:  string, optional text of the flag; it is inserted
  #        as given (surrounded by blanks) as content of the element
  #
  # Returns the RegXML object made for the new flag.
  def add_flag(type, param=nil, text=nil)
    # unless ["REEXAMINE", "WRONGSUBCORPUS", "INTERESTING", "LATER"].include? type
    #   raise "add_flag: unknown type "+type
    # end

    newglob = "<global type=\'#{xml_secure_val(type)}\'"
    if param
      newglob << " param=\'#{xml_secure_val(param)}\'"
    end
    if text
      newglob << "> #{text} </global>"
    else
      newglob << "/>"
    end

    newglob = RegXML.new(newglob)
    @globals << newglob
    return newglob
  end

  ###
  # Removes the first sentence flag that matches.
  #
  # type:  string, must equal the flag's "type" attribute
  # param: string; if given, must equal the flag's "param" attribute
  # text:  string; if given, must equal the flag's content
  #
  # Returns the removed RegXML object, or nil if no flag matched.
  def remove_flag(type, param=nil, text=nil)

    remove_ix = nil
    @globals.each_with_index { |glob, ix|
      # attributes is a hash-returning accessor (see flags above),
      # so look the values up with [] rather than passing an argument
      glob_attr = glob.attributes

      next unless glob_attr["type"] == type
      next unless param.nil? or glob_attr["param"] == param
      next unless text.nil? or glob.children_and_text.map { |c| c.to_s }.join == text

      # found it
      remove_ix = ix
      break
    }

    if remove_ix
      return @globals.delete_at(remove_ix)
    else
      return nil
    end
  end

  ############################
  protected

  # XML for the content of the sem element:
  # globals, frames, and underspecification, in that order
  def get_xml_ofchildren()
    string = ""

    # globals
    string << "<globals>\n"
    @globals.each { |glob|
      string << glob.to_s + "\n"
    }
    string << "</globals>\n"

    # frames
    string << "<frames>\n"
    each_frame { |frame_node|
      string << frame_node.get()
    }
    string << "</frames>\n"

    # underspecification
    string << "<usp>\n"
    string << "<uspframes>\n"
    each_usp_frameblock { |block|
      string << block.get()
    }
    string << "</uspframes>\n"
    string << "<uspfes>\n"
    each_usp_feblock { |block|
      string << block.get()
    }
    string << "</uspfes>\n"
    string << "</usp>\n"

    return string
  end

  ###
  # Records each <flag> child of xml_obj on sem_node by its "name"
  # attribute; a flag without a name only produces a warning.
  #
  # sem_node: SemNode object
  # xml_obj:  RegXML object
  def semnode_add_flags(sem_node, xml_obj)

    xml_obj.children_and_text.each { |child|
      if child.name == "flag"
        # found a flag, record it
        name = child.attributes["name"]
        if name
          sem_node.add_flag(name)
        else
          $stderr.puts "Warning: flag without a name"
        end
      end
    }
  end

  # Makes FE/target nodes for the children of a frame element
  # and connects them to the syntactic nodes they point to.
  #
  # frame_node: FrameNode object
  # xml_obj:    RegXML object, the frame element
  # id_to_node: hash: syn_node_id(string) -> SynNode object
  def frame_add_children(frame_node, xml_obj, id_to_node)

    xml_obj.children_and_text.each { |fe|
      case fe.name
      when "fe", "target"

        # make a node for this,
        # and add it as child of this frame node.
        fe_node = FeNode.new(fe)
        @node[fe_node.id] = fe_node
        frame_node.add_child(fe_node)

        semnode_add_flags(fe_node, fe)

        # add the FE's children
        fe.children_and_text.each { |fechild|
          case fechild.name
          when "fenode"

            syn_node = id_to_node[SalsaTigerXmlNode.xmlel_id(fechild)]
            if syn_node
              # normal syntactic node, which the id_to_node mapping knows
              fe_node.add_child(syn_node, fechild)
              syn_node.add_sem(fe_node)

            else
              # must be a node in a different sentence
              # make a dummy graph node for it
              fe_node.add_child(TSSynNode.new(SalsaTigerXmlNode.xmlel_id(fechild)), fechild)
            end

          when "flag"
            # nothing to do, we've handled that already
          else
            fe_node.add_kith(fechild)
          end
        }

      when "flag"
        # nothing to do, we've handled that already

      else
        # keep for output
        frame_node.add_kith(fe)
      end
    }
  end

  ###
  # Makes an UspNode for each <uspblock> child of xml_obj, registers it,
  # and adds the nodes referenced by its <uspitem> children.
  #
  # xml_obj:     RegXML object
  # frame_or_fe: string: "frame" or "fe"
  def initialize_usp(xml_obj, frame_or_fe)

    xml_obj.children_and_text.each { |uspblock|
      unless uspblock.name == "uspblock"
        warn_child_ignored("s/sem/usp/uspframe|uspfe", uspblock)
        next
      end

      # node for this underspecified block
      n = UspNode.new(uspblock, frame_or_fe)
      @node[n.id] = n

      case frame_or_fe
      when "frame"
        @uspframe_id << n.id
      when "fe"
        @uspfe_id << n.id
      else
        raise "Shouldn't be here"
      end

      # add its children
      uspblock.children_and_text.each { |uspitem|
        unless uspitem.name == "uspitem"
          warn_child_ignored("s/sem/usp/uspframe|uspfe/uspblock", uspitem)
          next
        end

        usp_id = SalsaTigerXmlNode.xmlel_id(uspitem)
        # NOTE(review): this drops everything up to the last "_s"
        # of the ID -- presumably a corpus prefix; confirm
        usp_id = usp_id.gsub(/.*_s/, "s")

        unless @node[usp_id]
          $stderr.puts "Error: Underspecification: could not find node with ID #{usp_id}. Skipping."
          next
        end
        n.add_child(@node[usp_id])
      }
    }
  end
end
1735
+
1736
+
1737
+ #############
1738
+ # class SalsaTigerSentence
1739
+ #
1740
+ # offers access methods to a SalsaTigerXML sentence
1741
+ # given as a string
1742
+ #
1743
+ # Nodes of syntactic structure as well as frames and
1744
+ # frame elements are kept (and returned) as XMLNode objects,
1745
+ # or more specifically as SynNode, FrameNode and FeNode objects.
1746
+ #
1747
+ # methods:
1748
+ #
1749
+ # new initializes the object
1750
+ #
1751
+ # id returns the sentence ID
1752
+ #
1753
+ # get returns the REXML object describing the same sentence
1754
+ # as this object
1755
+ #
1756
+ # each_terminal yields each terminal of the sentence in turn.
1757
+ # they are returned as SynNode objects
1758
+ #
1759
+ # terminals returns all terminal node objects in an array
1760
+ #
1761
+ # each_terminal_sorted yields each terminal of the sentence in turn,
1762
+ # making sure the terminal with the lowest ID is returned first.
1763
+ # use this if you need the terminal words in the right order!
1764
+ # nodes are returned as SynNode objects
1765
+ #
1766
+ # each_nonterminal yields each nonterminal of the sentence in turn.
1767
+ # nodes are returned as SynNode objects
1768
+ #
1769
+ # each_frame yields each frame of the sentence in turn.
1770
+ # nodes are returned as FrameNode objects
1771
+ #
1772
+ # frames returns all frame objects in an array
1773
+ #
1774
+ # each_usp_frameblock
1775
+ # yields each group of underspecified frames of the sentence
1776
+ # in turn, as an UspNode object. To see the frames involved
1777
+ # in this underspecification, use each_child on the UspNode object
1778
+ #
1779
+ #
1780
+ # usp_frameblocks returns all groups of underspecified frames as an array
1781
+ # of UspNode objects
1782
+ #
1783
+ # each_usp_feblock
1784
+ # yields each group of underspecified frame elements
1785
+ # of the sentence in turn, as an UspNode object.
1786
+ # To see the frames involved
1787
+ # in this underspecification, use each_child on the UspNode object
1788
+ #
1789
+ # usp_feblocks returns all groups of underspecified frame elements
1790
+ # as an array of UspNode objects
1791
+ #
1792
+ #
1793
+ # flags returns a list of the sentence flags, as hashes.
1794
+ # key "type": a string, either REEXAMINE or WRONGSUBCORPUS
1795
+ # or INTERESTING or LATER
1796
+ # key "param": a string, the parameter. important for
1797
+ # REEXAMINE
1798
+ # key "text": a string, the text of this flag. Will be
1799
+ # nonempty only for INTERESTING cases
1800
+ #
1801
+ # syn_roots returns a list of all the roots of the syntactic trees
1802
+ # in this sentence, as node objects. There may be more than
1803
+ # one, unfortunately.
1804
+ #
1805
+ # add_syn add a new syntactic node with the given category, word, POS,
1806
+ # returns the new node
1807
+ #
1808
+ # add_frame add a frame with a given name, returns the new frame node
1809
+ #
1810
+ # add_usp add a new underspecification block, either for frames or FEs
1811
+ #
1812
+ # add_flag adds a sentence flag to this sentence.
1813
+ # type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
1814
+ # or LATER
1815
+ # param: optional parameter, a string, describes type of Reexamine
1816
+ # for REEXAMINE-type flags
1817
+ # text: optional parameter, a string, arbitrary text commenting
1818
+ # on the flag, used mainly with INTERESTING
1819
+ #
1820
+ # remove_flag removes a sentence flag from this sentence
1821
+ # only removes flag in case of exact match of type, param, and text
1822
+ # type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
1823
+ # or LATER
1824
+ # param: optional parameter, a string, describes type of Reexamine
1825
+ # for REEXAMINE-type flags
1826
+ # text: optional parameter, a string, arbitrary text commenting
1827
+ # on the flag, used mainly with INTERESTING
1828
+
1829
+ class SalsaTigerSentence < XMLNode
1830
+
1831
# Builds the sentence object from a SalsaTigerXML sentence
# given as a string.
def initialize(string)
  # parse string as an XML element
  xml_obj = RegXML.new(string)

  # initialize this object as an XML node,
  # i.e. remember the outermost element's name, attributes,
  # and ID, and specify that it's not a text but an XML object
  super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

  # the <graph> child of the <s> element holds the syntactic info;
  # if this sentence has none, fake an empty one
  xml_syn_obj = xml_obj.children_and_text().detect { |elt| elt.name == "graph" } ||
                RegXML.new("<graph/>")

  @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

  # the <sem> child of the <s> element holds the semantic info;
  # if this sentence has none, fake an empty one
  xml_sem_obj = xml_obj.children_and_text().detect { |elt| elt.name == "sem" } ||
                RegXML.new("<sem/>")

  # add splitword info to @syn element
  @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

  @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

  # go through the children of the <s> object again,
  # remembering all children except <graph> and <sem>
  # (which we have handled already) for later output
  xml_obj.children_and_text.each { |child_or_text|
    unless ["graph", "sem"].include?(child_or_text.name)
      add_kith(child_or_text)
    end
  }
end
1884
+
1885
+ #############
1886
+ def SalsaTigerSentence.empty_sentence(sentence_id) # string
1887
+ sentence_id = sentence_id.gsub(/'/, "&apos;")
1888
+ sent_string = "<s id=\'#{sentence_id}\'>\n" +
1889
+ "<graph/>\n" +
1890
+ "<sem/>\n" +
1891
+ "</s>"
1892
+ return SalsaTigerSentence.new(sent_string)
1893
+ end
1894
+
1895
+ #####
1896
+
1897
+
1898
+ ###
1899
+ def to_s
1900
+ return @syn.to_s
1901
+ end
1902
+
1903
+ ###
1904
+ def each_terminal
1905
+ @syn.each_terminal { |n| yield n }
1906
+ end
1907
+
1908
+ ###
1909
+ def each_terminal_sorted
1910
+ @syn.each_terminal_sorted { |n| yield n }
1911
+ end
1912
+
1913
+ ###
1914
+ def terminals
1915
+ return @syn.terminals()
1916
+ end
1917
+
1918
+ ###
1919
+ def terminals_sorted
1920
+ return @syn.terminals_sorted()
1921
+ end
1922
+
1923
+ ###
1924
+ def each_nonterminal
1925
+ @syn.each_nonterminal { |n| yield n }
1926
+ end
1927
+
1928
+ ###
1929
+ def nonterminals
1930
+ return @syn.nonterminals()
1931
+ end
1932
+
1933
+ ###
1934
+ def each_syn_node
1935
+ @syn.each_node { |n|
1936
+ yield n
1937
+ }
1938
+ end
1939
+
1940
+ ###
1941
+ def syn_nodes
1942
+ return @syn.nodes()
1943
+ end
1944
+
1945
+ ###
1946
+ def syn_roots
1947
+ return @syn.syn_roots()
1948
+ end
1949
+ ###
1950
+
1951
+ ###
1952
+ def syn_node_with_id(syn_id)
1953
+ return @syn.node[syn_id]
1954
+ end
1955
+
1956
+ ###
1957
+ def sem_node_with_id(sem_id)
1958
+ return @sem.node[sem_id]
1959
+ end
1960
+
1961
+ ###
1962
+ def each_frame
1963
+ @sem.each_frame { |f| yield f }
1964
+ end
1965
+
1966
+ ###
1967
+ def frames
1968
+ return @sem.frames
1969
+ end
1970
+
1971
+ ###
1972
+ def each_usp_frameblock
1973
+ @sem.each_usp_frameblock { |b| yield b }
1974
+ end
1975
+
1976
+ ###
1977
+ def usp_frameblocks()
1978
+ return @sem.usp_frameblocks()
1979
+ end
1980
+
1981
+ ###
1982
+ def each_usp_feblock
1983
+ @sem.each_usp_feblock { |b| yield b }
1984
+ end
1985
+
1986
+ ###
1987
+ def usp_feblocks()
1988
+ return @sem.usp_feblocks()
1989
+ end
1990
+
1991
+ ###
1992
+ def flags
1993
+ return @sem.flags()
1994
+ end
1995
+
1996
+ ###################################
1997
+ # adding and removing things
1998
+
1999
+ ###
2000
+ # add syntactic node, specified as terminal(t) or nonterminal(nt)
2001
+ #
2002
+ # returns the new node
2003
+ def add_syn(label, # string: t or nt
2004
+ cat = nil, # string: category
2005
+ word = nil,# string: word
2006
+ pos = nil, # string: part of speech
2007
+ syn_id = nil) # string: ID for the new node
2008
+ return @syn.add_node(id(), label, cat, word, pos, syn_id)
2009
+ end
2010
+
2011
+ ###
2012
+ def remove_syn(node)
2013
+ @syn.remove_node(node)
2014
+ end
2015
+
2016
+ ###
2017
+ def add_frame(name, # string: name of the frame
2018
+ sem_id = nil) # string: ID for the new node
2019
+ return @sem.add_frame(id(), name, sem_id)
2020
+ end
2021
+
2022
+ ###
2023
+ def remove_frame(frame_node) # FrameNode object
2024
+ @sem.remove_frame(frame_node)
2025
+ end
2026
+
2027
+ ###
2028
+ def add_fe(frame_obj,
2029
+ name,
2030
+ fe_children,
2031
+ sem_id = nil)
2032
+ return @sem.add_fe(frame_obj, name, fe_children, sem_id)
2033
+ end
2034
+
2035
+ ###
2036
+ def remove_fe(fe_node)
2037
+ @sem.remove_fe(fe_node)
2038
+ end
2039
+
2040
+ ###
2041
+ def add_usp(frame_or_fe)
2042
+ return @sem.add_usp(frame_or_fe)
2043
+ end
2044
+
2045
+ ###
2046
+ def remove_usp(usp_node) # UspNode object
2047
+ @sem.remove_usp(usp_node)
2048
+ end
2049
+
2050
+ ###
2051
+ def add_flag(type, param=nil, text=nil)
2052
+ @sem.add_flag(type, param, text)
2053
+ end
2054
+
2055
+ ###
2056
+ def remove_flag(type, param=nil, text=nil)
2057
+ @sem.remove_flag(type, param, text)
2058
+ end
2059
+
2060
+ ###
2061
+ def remove_semantics()
2062
+ empty_sem = RegXML.new("<sem/>")
2063
+ @sem = SalsaTigerSentenceSem.new(empty_sem, id(), @syn.node)
2064
+ end
2065
+
2066
+ #################33
2067
+ # output
2068
+ def get_syn()
2069
+ return @syn.get()
2070
+ end
2071
+
2072
+ ############################3
2073
+ protected
2074
+
2075
+ def get_xml_ofchildren()
2076
+ return @syn.get() + @sem.get()
2077
+ end
2078
+ end
2079
+
2080
+ #######
2081
+ # identify the set of maximal constituents covering a set of nodes
2082
+ #
2083
module MaxConst

  ###
  # determine maximal constituents covering the nodes in node_list
  # by a top-down search that starts at the syntactic roots.
  #
  # Punctuation terminals never count towards the yield of a node.
  # If ignore_empty_terminals is set, terminals whose word is nil or
  # empty are disregarded as well.
  #
  # Splitword nodes from node_list are passed through unchanged.
  # Words that are not accounted for by any constituent found
  # (e.g. words that are not below any of the roots) are kept in the result.
  #
  # Uses the syn_roots() method of the including object.
  #
  # returns: array:SynNode, list of maximal constituents covering
  # the input nodes
  def max_constituents_for_nodes(node_list, # array: SynNode
                                 ignore_empty_terminals = false) # boolean: ignore empty terminals?

    # 'words': non-punctuation yield nodes of the non-splitword input nodes
    # 'splitwords': input nodes that are splitwords
    words, splitwords = max_constituents_partition(node_list)

    # 'constituents' collects the constituents found so far,
    # 'nodes_to_check' is the queue of nodes that still have to be inspected
    constituents = Array.new

    # Work on a copy of the root list (there may be more than one root):
    # the queue is consumed with shift() and extended with <<, and that
    # must not alter the array handed out by syn_roots().
    nodes_to_check = syn_roots().dup

    # stop as soon as all words are accounted for
    until words.empty?
      node = nodes_to_check.shift()
      # all nodes checked already? then stop; leftover words are added below
      break if node.nil?

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject { |n| n.is_terminal? and (n.word.nil? or n.word.empty?) }
      end

      # this node has no yield, or only punctuation sign yield: skip it
      next if node_yield.empty?

      rest = node_yield - words
      if rest.size == 0
        # whole yield of node consists of words we are looking for
        constituents << node
        words = words - node_yield

      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    # any leftover words that may not be from that sentence? just keep them.
    constituents.concat(words)
    # splitwords stay what they are
    constituents.concat(splitwords)

    return constituents
  end

  ###
  # determine maximum constituents covering the nodes in node_list
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is set to true,
  # then a node that has more than one child whose yield is in node_list,
  # and has exactly one child whose yield is not in node_list,
  # will be considered as having its yield in node_list.
  #
  # Optionally, a procedure accept_anyway_proc can be given.
  # Like the option include_single_missing_children, it can lead to a node
  # being treated as covered even though not all of its yield nodes
  # are yield nodes of node_list.
  # accept_anyway_proc can implement arbitrary rules for this.
  # The procedure is called with three arguments:
  #   accept_anyway_proc(node, ch_in, ch_out)
  #   node is a SynNode that would not normally count as covered.
  #   ch_in is the list of its children that count as covered.
  #   ch_out is the list of its children that do not.
  # If the procedure exists and returns true, node is treated as covered.
  #
  # Uses the syn_roots() method of the including object.
  #
  # returns: an array of SynNodes: the splitwords from node_list,
  # the maximal constituents found below the roots, and any words
  # from node_list that are not in the yield of one of these
  def max_constituents_smc(node_list, # array: SynNode
                           include_single_missing_children, # boolean
                           ignore_empty_terminals = false, # boolean: ignore empty terminals?
                           accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => boolean

    # 'words': non-punctuation yield nodes of the non-splitword input nodes
    # 'splitwords': input nodes that are splitwords
    words, splitwords = max_constituents_partition(node_list)

    # splitwords stay what they are
    constituents = splitwords

    syn_roots().each { |node|
      node_included, descendants_included = max_constituents_aux(node, words,
                                                                 include_single_missing_children,
                                                                 ignore_empty_terminals,
                                                                 accept_anyway_proc)

      if node_included == "true"
        constituents << node
      else
        constituents.concat descendants_included
      end
    }

    # which words remain to be added?
    constituents.each { |c| words = words - c.yield_nodes() }
    constituents.concat words

    return constituents
  end

  ##########
  private

  ###
  # sort the nodes of node_list into splitwords and rest,
  # and filter out punctuation marks
  #
  # returns: pair [words, splitwords]
  #   words: array:SynNode, the non-punctuation yield nodes
  #          of all nodes in node_list that are not splitwords
  #   splitwords: array, the nodes in node_list that are splitwords
  def max_constituents_partition(node_list) # array: SynNode
    words = Array.new
    splitwords = Array.new

    node_list.each { |node|
      if node.is_splitword?
        splitwords << node
      else
        words.concat node.yield_nodes().reject { |t| t.is_punct? }
      end
    }

    return [words, splitwords]
  end

  ###
  # recursively determine maximum constituents covering the nodes in 'nodelist',
  # starting at 'node'.
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # If include_single_missing_children is set to true,
  # then a node that has more than one child whose yield is in nodelist,
  # and has exactly one child whose yield is not in nodelist,
  # will be considered as having its yield in nodelist.
  #
  # If accept_anyway_proc is nonnil, also use that to decide whether
  # a node will be considered as having its yield in nodelist.
  #
  # returns: pair [mybool, included_descendants]
  #   where mybool is a string, "true", "false" or "ignoreme" (for ignored
  #   punctuation and empty terminals):
  #   does the yield of this node consist entirely of nodes from nodelist?
  #   and included_descendants is a list of SynNodes: if mybool is "false",
  #   this is a list of descendants of this node whose yield does consist
  #   entirely of nodes from nodelist; otherwise it is empty
  def max_constituents_aux(node, # SynNode
                           nodelist, # array:SynNode
                           include_single_missing_children = false, # boolean
                           ignore_empty_terminals = false, # boolean: ignore empty terminals?
                           accept_anyway_proc = nil) # proc: SynNode, array:SynNode, array:SynNode => Boolean

    if node.is_terminal? and nodelist.include? node
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals and node.is_terminal? and
        (node.word.nil? or node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    # nonterminal: evaluate all children first.
    # each entry is a triple [child, mybool of child, included descendants of child]
    children_results = node.children.map { |ch|
      fully_included, descendants_included = max_constituents_aux(ch, nodelist,
                                                                  include_single_missing_children,
                                                                  ignore_empty_terminals,
                                                                  accept_anyway_proc)
      [ch, fully_included, descendants_included]
    }

    res_false = children_results.select { |entry| entry[1] == "false" }
    res_true = children_results.select { |entry| entry[1] == "true" }

    if res_false.empty? and res_true.length() > 0
      # all true, or all true and ignoreme
      return ["true", []]

    elsif res_false.empty? and res_true.empty?
      # all ignoreme
      return ["ignoreme", []]

    elsif res_false.length() == 1 and res_true.length() > 1 and
        include_single_missing_children
      # one child not covered,
      # resulting in all other children (except the ignoremes) being marked individually:
      # consider the single missing child as covered, too
      return ["true", []]

    elsif accept_anyway_proc and
        accept_anyway_proc.call(node,
                                res_true.map { |entry| entry[0] },
                                res_false.map { |entry| entry[0] })
      # some external source tells us that
      # we are to consider the missing children as covered, too
      return ["true", []]

    else
      # not all children covered:
      # pass on the covered children themselves, and for the other
      # children whatever covered descendants they reported
      return [
        "false",
        children_results.map { |ch, fully_included, descendants_included|
          if fully_included == "true"
            [ch]
          else
            descendants_included
          end
        }.flatten
      ]
    end
  end
end
2322
+
2323
module ConvexComp

  ###
  # Complement a set of nodes so that it covers a contiguous stretch of
  # the sentence: take the terminals below the nodes in node_set, find the
  # leftmost and the rightmost of them in terminals_sorted(), and determine
  # the maximal constituents for all terminals in between (inclusive).
  #
  # If node_set has no yield, or if any of its terminals cannot be found
  # in terminals_sorted(), a warning is written to STDERR and node_set
  # is returned unchanged.
  #
  # Uses terminals_sorted() and max_constituents_for_nodes() of the
  # including object. Writes progress messages to STDERR.
  #
  # returns: node_set itself (on failure), or the result of
  # max_constituents_for_nodes for the complemented terminal span
  def convex_complemented(node_set)

    terminals = terminals_sorted()

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten

    # position of each yield terminal in the sentence, nil if it is not there.
    # Computed once; nil entries have to be detected before calling min/max,
    # since a mix of nil and integers cannot be compared.
    positions = yield_nodes.map { |t| terminals.index(t) }

    if positions.empty? or positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set #{yield_nodes.map {|t| t.id}}; terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    leftmost = positions.min
    rightmost = positions.max

    STDERR.puts "Replacing "+yield_nodes.join(" ")
    new_node_set = terminals[leftmost..rightmost]
    STDERR.puts "By "+new_node_set.join(" ")
    return max_constituents_for_nodes(new_node_set)
  end
end
2343
+
2344
# Reopen SalsaTigerSentence to mix in the constituent-finding modules.
# The modules are included one by one in this order, MaxConst first.
class SalsaTigerSentence
  [MaxConst, ConvexComp].each { |mixin| include mixin }
end