shalmaneser-lib 1.2.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,285 @@
|
|
1
|
+
module STXML
|
2
|
+
# RegXML
|
3
|
+
#
|
4
|
+
# Katrin Erk June 2005
|
5
|
+
|
6
|
+
# SalsaTigerRegXML: take control of the data structure, no underlying xml
|
7
|
+
# representation anymore, re-generation of xml on demand
|
8
|
+
|
9
|
+
# Regexp-based view on a single XML element (or a chunk of text) held as a
# string. Nothing is parsed up front beyond two sanity checks; name,
# attributes and children are extracted from the string on every call.
class RegXML
  # @param string [String] string representing a single XML element,
  #   or plain text if +i_am_text+ is true
  # @param i_am_text [Boolean] false: +string+ is an XML element,
  #   true: +string+ is text
  # @raise [RuntimeError] if +string+ is not a String, or (for elements)
  #   if it does not look like exactly one well-bracketed element
  def initialize(string, i_am_text = false)
    # is_a? rather than an exact class comparison, so String subclasses work.
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      # All regexps below assume a single line.
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test
      dyck_test
    end
  end

  # @param child_name [String] element name to look for
  # @return [RegXML, nil] first direct child element with that name
  def first_child_matching(child_name)
    children_and_text.detect { |c| c.name == child_name }
  end

  # Yield every direct child element named +child_name+.
  # Without a block an Enumerator is returned.
  #
  # @param child_name [String]
  # @yieldparam child [RegXML]
  def each_child_matching(child_name)
    return enum_for(:each_child_matching, child_name) unless block_given?

    children_and_text.each do |c|
      yield c if c.name == child_name
    end
  end

  # @return [String] the stored string with a line break after every '>'
  def to_s
    xml_readable(@s)
  end

  # @return [Boolean] true if this object wraps text rather than an element
  def text?
    @i_am_text
  end

  # Return the name of the xml element contained in the string.
  # @return [String, nil] Name of the element, nil for text.
  # @raise [RuntimeError] if no element name can be found
  def name
    return nil if @i_am_text

    found = /^\s*<\s*([\w-]+)[\s\/>]/.match(@s)
    raise "Cannot parse:\n#{xml_readable(@s)}" unless found

    found[1]
  end

  # Return a hash of attributes and their values.
  # @return [Hash<String String>] Attributes of an xml element
  #   (empty for text).
  # @raise [RuntimeError] if the start tag cannot be parsed
  def attributes
    return {} if @i_am_text

    # Everything after "<element_name": (name=value)* followed by > or />
    # (and, for non-unary elements, the rest of the string).
    head = /^\s*<\s*#{Regexp.escape(name)}(.*)$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless head

    retv = {}
    elt_contents = head[1]

    # repeat until only > or /> is left
    while elt_contents !~ /^\s*\/?>/
      # Shave off the next name=value pair. The backreference makes sure a
      # value quoted with ' may contain ", and vice versa.
      pair = /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/.match(elt_contents)
      raise "Cannot parse:\n #{xml_readable(elt_contents)}" unless pair

      retv[pair[1]] = pair[3]
      elt_contents = pair[4]
    end

    retv
  end

  # Split the element's content into child elements and text chunks,
  # in document order.
  #
  # @return [Array<RegXML>] empty for text and for unary elements
  # @raise [RuntimeError] if the content cannot be parsed
  def children_and_text
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element

    # @s has the form <bla...> ... </bla>: capture what is in between.
    mainname = Regexp.escape(name)
    whole = /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless whole

    retv = []
    children_s = whole[3]

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/
      # shave off the next bit of text, keep the rest in children_s
      text = /^\s*(.*?)(<.*$|$)/.match(children_s)
      unless text
        report_whole
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      unless text[1].strip.empty?
        children_s = text[2]
        retv << RegXML.new(text[1], true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # Determine the next child's name and where its start tag ends
      # (just before either "/>" or ">").
      start = /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
      unless start
        report_whole
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = start[1]
      childname = start[2]
      rest = children_s[start.end(0)..-1]

      if (unary = /^\/>(.*)$/.match(rest))
        # next child is a unary element
        children_s = unary[1]
        retv << RegXML.new(child + "/>")
      elsif rest.start_with?(">") && (close_ix = matching_close_end(rest, childname))
        retv << RegXML.new(child + rest[0...close_ix])
        children_s = rest[close_ix..-1]
      else
        report_whole
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Ad-hoc smoke test: parses two sample elements and prints their name,
  # string form, attributes and children to stdout.
  def self.test
    report = lambda do |elt|
      puts "name " + elt.name
      puts
      puts elt.to_s
      puts
      elt.attributes.each { |attr, val| puts "attr " + attr + "=" + val }
      puts
      elt.children_and_text.each do |child_obj|
        prefix = child_obj.text? ? "da text " : "da child "
        puts prefix + child_obj.to_s
      end
      puts
    end

    report.call(RegXML.new(" <bla blupp='a\"b'\n" \
                           "lalala=\"c\">\n" \
                           "<lalala> </lalala>\n" \
                           "texttext\n" \
                           "<lala blupp='b'/>\n" \
                           "nochtext\n" \
                           "<la> <l/> </la>\n" \
                           "</ bla >\n"))

    puts "NEU"
    report.call(RegXML.new(" < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # @return [Boolean] true if @s starts with '<' and ends with '/>'
  def unary_element
    # <bla/>
    @s =~ /^\s*<.*\/>\s*$/ ? true : false
  end

  # Make sure we have a single XML element, either <bla/> or <bla>...</bla>.
  # @raise [RuntimeError] otherwise
  def element_test
    return if unary_element
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check: every prefix of @s must have at least as many
  # '<' as '>', and in total there must be equally many of both.
  # @raise [RuntimeError] otherwise
  def dyck_test
    balance = 0
    @s.scan(/[<>]/) do |bracket|
      balance += bracket == "<" ? 1 : -1
      if balance < 0
        raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
      end
    end

    unless balance.zero?
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # @param string [String]
  # @return [String] +string+ with a newline inserted after every '>'
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # Print the complete element to stderr as context for a parse error.
  def report_whole
    $stderr.puts "Whole was:\n #{xml_readable(@s)}"
    $stderr.puts
  end

  # Find the end of the close tag belonging to an element whose start tag
  # has just been consumed, counting nested elements of the same name.
  #
  # @param str [String] the string following the start tag's attributes
  # @param elt_name [String] name of the element to close
  # @return [Integer, nil] index in +str+ just past the balancing close tag.
  #   If the tags never balance, the end of the first close tag is returned
  #   (the pre-existing behaviour); nil if there is no close tag at all.
  def matching_close_end(str, elt_name)
    # group 1: "/" for a close tag; group 3: "/" for a unary tag
    tag = /<\s*(\/?)\s*#{Regexp.escape(elt_name)}(?![\w-])(?:\s+[\w-]+=(["']).*?\2)*\s*(\/?)>/
    depth = 1
    first_close = nil
    pos = 0

    while (found = tag.match(str, pos))
      if found[1] == "/"
        first_close ||= found.end(0)
        depth -= 1
        return found.end(0) if depth.zero?
      elsif found[3] != "/"
        # a nested, non-unary element of the same name opens
        depth += 1
      end
      pos = found.end(0)
    end

    first_close
  end
end
|
285
|
+
end
|
@@ -0,0 +1,596 @@
|
|
1
|
+
require_relative 'xml_node'
|
2
|
+
require_relative 'salsa_tiger_sentence_graph'
|
3
|
+
require_relative 'salsa_tiger_sentence_sem'
|
4
|
+
require_relative 'reg_xml'
|
5
|
+
|
6
|
+
module STXML
|
7
|
+
#############
|
8
|
+
# class SalsaTigerSentence
|
9
|
+
#
|
10
|
+
# offers access methods to a SalsaTigerXML sentence
|
11
|
+
# given as a string
|
12
|
+
#
|
13
|
+
# Nodes of syntactic structure as well as frames and
|
14
|
+
# frame elements are kept (and returned) as XMLNode objects,
|
15
|
+
# or more specifically as SynNode, FrameNode and FeNode objects.
|
16
|
+
#
|
17
|
+
# methods:
|
18
|
+
#
|
19
|
+
# new initializes the object
|
20
|
+
#
|
21
|
+
# id returns the sentence ID
|
22
|
+
#
|
23
|
+
# get returns the REXML object describing the same sentence
|
24
|
+
# as this object
|
25
|
+
#
|
26
|
+
# each_terminal yields each terminal of the sentence in turn.
|
27
|
+
# they are returned as SynNode objects
|
28
|
+
#
|
29
|
+
# terminals returns all terminal node objects in an array
|
30
|
+
#
|
31
|
+
# each_terminal_sorted yields each terminal of the sentence in turn,
|
32
|
+
# making sure the terminal with the lowest ID is returned first.
|
33
|
+
# use this if you need the terminal words in the right order!
|
34
|
+
# nodes are returned as SynNode objects
|
35
|
+
#
|
36
|
+
# each_nonterminal yields each nonterminal of the sentence in turn.
|
37
|
+
# nodes are returned as SynNode objects
|
38
|
+
#
|
39
|
+
# each_frame yields each frame of the sentence in turn.
|
40
|
+
# nodes are returned as FrameNode objects
|
41
|
+
#
|
42
|
+
# frames returns all frame objects in an array
|
43
|
+
#
|
44
|
+
# each_usp_frameblock
|
45
|
+
# yields each group of underspecified frames of the sentence
|
46
|
+
# in turn, as an UspNode object. To see the frames involved
|
47
|
+
# in this underspecification, use each_child on the UspNode object
|
48
|
+
#
|
49
|
+
#
|
50
|
+
# usp_frameblocks returns all groups of underspecified frames as an array
|
51
|
+
# of UspNode objects
|
52
|
+
#
|
53
|
+
# each_usp_feblock
|
54
|
+
# yields each group of underspecified frame elements
|
55
|
+
# of the sentence in turn, as an UspNode object.
|
56
|
+
# To see the frames involved
|
57
|
+
# in this underspecification, use each_child on the UspNode object
|
58
|
+
#
|
59
|
+
# usp_feblocks returns all groups of underspecified frame elements
|
60
|
+
# as an array of UspNode objects
|
61
|
+
#
|
62
|
+
#
|
63
|
+
# flags returns a list of the sentence flags, as hashes.
|
64
|
+
# key "type": a string, either REEXAMINE or WRONGSUBCORPUS
|
65
|
+
# or INTERESTING or LATER
|
66
|
+
# key "param": a string, the parameter. important for
|
67
|
+
# REEXAMINE
|
68
|
+
# key "text": a string, the text of this flag. Will be
|
69
|
+
# nonempty only for INTERESTING cases
|
70
|
+
#
|
71
|
+
# syn_roots returns a list of all the roots of the syntactic trees
|
72
|
+
# in this sentence, as node objects. There may be more than
|
73
|
+
# one, unfortunately.
|
74
|
+
#
|
75
|
+
# add_syn add a new syntactic node with the given category, word, POS,
|
76
|
+
# returns the new node
|
77
|
+
#
|
78
|
+
# add_frame add a frame with a given name, returns the new frame node
|
79
|
+
#
|
80
|
+
# add_usp add a new underspecification block, either for frames or FEs
|
81
|
+
#
|
82
|
+
# add_flag adds a sentence flag to this sentence.
|
83
|
+
# type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
84
|
+
# or LATER
|
85
|
+
# param: optional parameter, a string, describes type of Reexamine
|
86
|
+
# for REEXAMINE-type flags
|
87
|
+
# text: optional parameter, a string, arbitrary text commenting
|
88
|
+
# on the flag, used mainly with INTERESTING
|
89
|
+
#
|
90
|
+
# remove_flag removes a sentence flag from this sentence
|
91
|
+
# only removes flag in case of exact match of type, param, and text
|
92
|
+
# type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
93
|
+
# or LATER
|
94
|
+
# param: optional parameter, a string, describes type of Reexamine
|
95
|
+
# for REEXAMINE-type flags
|
96
|
+
# text: optional parameter, a string, arbitrary text commenting
|
97
|
+
# on the flag, used mainly with INTERESTING
|
98
|
+
class SalsaTigerSentence < XMLNode
|
99
|
+
# Build a sentence that has the given ID and an empty <graph/> and <sem/>.
#
# @param sentence_id [String] ID for the new sentence
# @return [SalsaTigerSentence]
def self.empty_sentence(sentence_id)
  # The id is written into a single-quoted attribute, so a literal
  # apostrophe must be escaped or the generated element cannot be parsed.
  escaped_id = sentence_id.gsub("'", "&apos;")

  SalsaTigerSentence.new("<s id='#{escaped_id}'>\n<graph/>\n<sem/>\n</s>")
end
|
108
|
+
|
109
|
+
# Parse one sentence element and set up its syntactic and semantic parts.
#
# @param string [String] the sentence as a single XML element
def initialize(string)
  # parse string as an XML element
  xml_obj = RegXML.new(string)

  # Initialize this object as an XML node: remember the outermost element's
  # name, attributes and ID, and mark it as an element rather than text.
  super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

  # RegXML#children_and_text re-parses the sentence string on every call,
  # so split it once and reuse the result for all three passes below.
  children = xml_obj.children_and_text

  # <graph> holds the syntactic info; fake an empty one if it is missing.
  xml_syn_obj = children.detect { |child| child.name == "graph" } || RegXML.new("<graph/>")
  @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

  # <sem> holds the semantic info; fake an empty one if it is missing.
  xml_sem_obj = children.detect { |child| child.name == "sem" } || RegXML.new("<sem/>")

  # add splitword info to @syn element
  @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

  @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

  # Remember all children except <graph> and <sem> for later output
  # (text chunks have a nil name and are kept as well).
  children.each do |child_or_text|
    add_kith(child_or_text) unless %w[graph sem].include?(child_or_text.name)
  end
end
|
161
|
+
|
162
|
+
# @return [String] string form of the syntactic part (@syn) of the sentence
def to_s
  syntax = @syn
  syntax.to_s
end
|
165
|
+
|
166
|
+
###
|
167
|
+
# Yield each terminal node, as provided by @syn.each_terminal.
def each_terminal
  @syn.each_terminal do |terminal|
    yield terminal
  end
end
|
170
|
+
|
171
|
+
###
|
172
|
+
# Yield each terminal node, in the order of @syn.each_terminal_sorted.
def each_terminal_sorted
  @syn.each_terminal_sorted do |terminal|
    yield terminal
  end
end
|
175
|
+
|
176
|
+
###
|
177
|
+
# @return the terminal nodes as reported by @syn.terminals
def terminals
  syntax = @syn
  syntax.terminals
end
|
180
|
+
|
181
|
+
###
|
182
|
+
# @return the terminal nodes as reported by @syn.terminals_sorted
def terminals_sorted
  syntax = @syn
  syntax.terminals_sorted
end
|
185
|
+
|
186
|
+
###
|
187
|
+
# Yield each nonterminal node, as provided by @syn.each_nonterminal.
def each_nonterminal
  @syn.each_nonterminal do |nonterminal|
    yield nonterminal
  end
end
|
190
|
+
|
191
|
+
###
|
192
|
+
# @return the nonterminal nodes as reported by @syn.nonterminals
def nonterminals
  syntax = @syn
  syntax.nonterminals
end
|
195
|
+
|
196
|
+
###
|
197
|
+
# Yield each syntactic node, as provided by @syn.each_node.
def each_syn_node
  @syn.each_node do |syn_node|
    yield syn_node
  end
end
|
200
|
+
|
201
|
+
###
|
202
|
+
# @return the syntactic nodes as reported by @syn.nodes
def syn_nodes
  syntax = @syn
  syntax.nodes
end
|
205
|
+
|
206
|
+
###
|
207
|
+
# @return the roots of the syntactic trees as reported by @syn.syn_roots
def syn_roots
  syntax = @syn
  syntax.syn_roots
end
|
210
|
+
|
211
|
+
###
|
212
|
+
# Look up a syntactic node in @syn.node.
#
# @param syn_id key to look up
# @return whatever @syn.node holds under +syn_id+
def syn_node_with_id(syn_id)
  nodes_by_id = @syn.node
  nodes_by_id[syn_id]
end
|
215
|
+
|
216
|
+
###
|
217
|
+
# Look up a semantic node in @sem.node.
#
# @param sem_id key to look up
# @return whatever @sem.node holds under +sem_id+
def sem_node_with_id(sem_id)
  nodes_by_id = @sem.node
  nodes_by_id[sem_id]
end
|
220
|
+
|
221
|
+
###
|
222
|
+
# Yield each frame, as provided by @sem.each_frame.
def each_frame
  @sem.each_frame do |frame|
    yield frame
  end
end
|
225
|
+
|
226
|
+
###
|
227
|
+
# @return the frames as reported by @sem.frames
def frames
  semantics = @sem
  semantics.frames
end
|
230
|
+
|
231
|
+
###
|
232
|
+
# Yield each underspecification block for frames, as provided by
# @sem.each_usp_frameblock.
def each_usp_frameblock
  @sem.each_usp_frameblock do |block|
    yield block
  end
end
|
235
|
+
|
236
|
+
###
|
237
|
+
# @return the frame underspecification blocks as reported by
#   @sem.usp_frameblocks
def usp_frameblocks
  semantics = @sem
  semantics.usp_frameblocks
end
|
240
|
+
|
241
|
+
###
|
242
|
+
# Yield each underspecification block for frame elements, as provided by
# @sem.each_usp_feblock.
def each_usp_feblock
  @sem.each_usp_feblock do |block|
    yield block
  end
end
|
245
|
+
|
246
|
+
###
|
247
|
+
# @return the frame element underspecification blocks as reported by
#   @sem.usp_feblocks
def usp_feblocks
  semantics = @sem
  semantics.usp_feblocks
end
|
250
|
+
|
251
|
+
###
|
252
|
+
# @return the sentence flags as reported by @sem.flags
def flags
  semantics = @sem
  semantics.flags
end
|
255
|
+
|
256
|
+
###################################
|
257
|
+
# adding and removing things
|
258
|
+
###
|
259
|
+
# add syntactic node, specified as terminal(t) or nonterminal(nt)
|
260
|
+
#
|
261
|
+
# returns the new node
|
262
|
+
# Add a syntactic node by forwarding to @syn.add_node, together with this
# sentence's ID.
#
# @param label [String] t or nt
# @param cat [String, nil] category
# @param word [String, nil] word
# @param pos [String, nil] part of speech
# @param syn_id [String, nil] ID for the new node
# @return the result of @syn.add_node
def add_syn(label, cat = nil, word = nil, pos = nil, syn_id = nil)
  sentence_id = id
  @syn.add_node(sentence_id, label, cat, word, pos, syn_id)
end
|
270
|
+
|
271
|
+
###
|
272
|
+
# Remove a syntactic node by forwarding to @syn.remove_node.
#
# @param node the node to remove
# @return the result of @syn.remove_node
def remove_syn(node)
  syntax = @syn
  syntax.remove_node(node)
end
|
275
|
+
|
276
|
+
###
|
277
|
+
# Add a frame by forwarding to @sem.add_frame, together with this
# sentence's ID.
#
# @param name [String] name of the frame
# @param sem_id [String, nil] ID for the new node
# @return the result of @sem.add_frame
def add_frame(name, sem_id = nil)
  sentence_id = id
  @sem.add_frame(sentence_id, name, sem_id)
end
|
282
|
+
|
283
|
+
###
|
284
|
+
# Remove a frame by forwarding to @sem.remove_frame.
#
# @param frame_node the FrameNode object to remove
# @return the result of @sem.remove_frame
def remove_frame(frame_node)
  semantics = @sem
  semantics.remove_frame(frame_node)
end
|
287
|
+
|
288
|
+
###
|
289
|
+
# Add a frame element by forwarding all arguments to @sem.add_fe.
#
# @param frame_obj the frame the element is added to
# @param name name of the frame element
# @param fe_children children of the new frame element
# @param sem_id ID for the new node (optional)
# @return the result of @sem.add_fe
def add_fe(frame_obj, name, fe_children, sem_id = nil)
  semantics = @sem
  semantics.add_fe(frame_obj, name, fe_children, sem_id)
end
|
296
|
+
|
297
|
+
###
|
298
|
+
# Remove a frame element by forwarding to @sem.remove_fe.
#
# @param fe_node the frame element node to remove
# @return the result of @sem.remove_fe
def remove_fe(fe_node)
  semantics = @sem
  semantics.remove_fe(fe_node)
end
|
301
|
+
|
302
|
+
###
|
303
|
+
# Add an underspecification block by forwarding to @sem.add_usp.
#
# @param frame_or_fe passed through unchanged
# @return the result of @sem.add_usp
def add_usp(frame_or_fe)
  semantics = @sem
  semantics.add_usp(frame_or_fe)
end
|
306
|
+
|
307
|
+
###
|
308
|
+
# Remove an underspecification block by forwarding to @sem.remove_usp.
#
# @param usp_node the UspNode object to remove
# @return the result of @sem.remove_usp
def remove_usp(usp_node)
  semantics = @sem
  semantics.remove_usp(usp_node)
end
|
311
|
+
|
312
|
+
###
|
313
|
+
# Add a sentence flag by forwarding to @sem.add_flag.
#
# @param type flag type
# @param param optional parameter of the flag
# @param text optional text commenting on the flag
# @return the result of @sem.add_flag
def add_flag(type, param = nil, text = nil)
  semantics = @sem
  semantics.add_flag(type, param, text)
end
|
316
|
+
|
317
|
+
###
|
318
|
+
# Remove a sentence flag by forwarding to @sem.remove_flag.
#
# @param type flag type
# @param param optional parameter of the flag
# @param text optional text commenting on the flag
# @return the result of @sem.remove_flag
def remove_flag(type, param = nil, text = nil)
  semantics = @sem
  semantics.remove_flag(type, param, text)
end
|
321
|
+
|
322
|
+
###
|
323
|
+
# Replace @sem with a fresh SalsaTigerSentenceSem built from an empty
# <sem/> element, this sentence's ID and the node table of @syn.
def remove_semantics
  @sem = SalsaTigerSentenceSem.new(RegXML.new("<sem/>"), id, @syn.node)
end
|
327
|
+
|
328
|
+
#################
|
329
|
+
# output
|
330
|
+
# @return the output of @syn.get for the syntactic part of the sentence
def get_syn
  syntax = @syn
  syntax.get
end
|
333
|
+
|
334
|
+
# Extend a node set to the contiguous span of terminals between its
# leftmost and rightmost yield terminal, then re-express that span as
# maximal constituents via max_constituents_for_nodes.
#
# If the yield is empty or any yield terminal cannot be located among
# terminals_sorted, a warning is written to STDERR and +node_set+ is
# returned unchanged.
#
# @param node_set [Array] nodes responding to yield_nodes_ordered
# @return [Array] the complemented constituents, or +node_set+ itself
def convex_complemented(node_set)
  terminals = terminals_sorted

  yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten

  # Position of every yield terminal in sentence order; nil if not found.
  positions = yield_nodes.map { |t| terminals.index(t) }

  # Check for nil *before* taking min/max: comparing an Integer with nil
  # raises, so a partially unknown yield used to crash here instead of
  # reaching the warning.
  if positions.empty? || positions.include?(nil)
    STDERR.puts "Warning: could not complement projected node set "\
                "#{yield_nodes.map(&:id)}. "\
                "Terminals not found in sorted set of sentence terminals!?"
    return node_set
  end

  STDERR.puts "Replacing " + yield_nodes.join(" ")
  new_node_set = terminals[positions.min..positions.max]
  STDERR.puts "By " + new_node_set.join(" ")
  max_constituents_for_nodes(new_node_set)
end
|
353
|
+
|
354
|
+
# returns: array:SynNode, list of maximal constituents covering
|
355
|
+
# the input nodes
|
356
|
+
# Determine maximal constituents covering the given nodes, searching from
# the syntactic roots downwards. Punctuation terminals are ignored.
#
# @param node_list [Array] SynNode objects to cover
# @param ignore_empty_terminals [Boolean] also ignore terminals whose word
#   is nil or empty when computing a node's yield
# @return [Array] constituents found in the trees, followed by leftover
#   words that no visited node covered, followed by the splitword nodes
#   of +node_list+ (which are passed through unchanged)
def max_constituents_for_nodes(node_list, ignore_empty_terminals = false)
  # 'splitwords' are kept as they are; 'words' collects the
  # non-punctuation yield of all other input nodes.
  splitwords, others = node_list.partition { |node| node.is_splitword? }
  words = others.flat_map { |node| node.yield_nodes.reject { |t| t.is_punct? } }

  # 'constituents' holds the constituents found so far,
  # 'nodes_to_check' the nodes that still have to be inspected.
  constituents = []
  # Work on a copy: the queue is consumed with shift and extended with <<,
  # which must not alter the array handed out by syn_roots.
  nodes_to_check = syn_roots.dup # (there may be more than one root)

  loop do
    node = nodes_to_check.shift
    # stop when all nodes have been checked or all words are covered
    break if node.nil? || words.empty?

    # only match nonempty non-punctuation nodes
    node_yield = node.yield_nodes.reject { |n| n.is_punct? }
    if ignore_empty_terminals
      node_yield = node_yield.reject { |n| n.is_terminal? && (n.word.nil? || n.word.empty?) }
    end
    # no yield, or only punctuation: skip this node
    next if node_yield.empty?

    uncovered = node_yield - words
    if uncovered.empty?
      # whole yield of node consists of words we are looking for
      constituents << node
      words -= node_yield
    elsif uncovered.size < node_yield.size
      # some of the words appear below this node: check its children too
      node.children.each { |child| nodes_to_check << child }
    end
  end

  # Leftover words (possibly not from this sentence) are kept as they are,
  # and splitwords stay what they are.
  constituents.concat(words)
  constituents.concat(splitwords)
end
|
427
|
+
|
428
|
+
###
|
429
|
+
# determine maximum constituents covering the nodes in node_list
|
430
|
+
# punctuation terminals (and optionally empty terminals) are ignored.
|
431
|
+
#
|
432
|
+
# If include_single_missing_children is set to true,
|
433
|
+
# then a node that has at least one child whose yield is in nodelist,
|
434
|
+
# and has only one child whose yield is not in nodelist,
|
435
|
+
# will be considered as having its yield in nodelist.
|
436
|
+
#
|
437
|
+
# Optionally, a procedure accept_anyway_proc can be given.
|
438
|
+
# Like the option include_single_missing_children, it can lead to nodes being
|
439
|
+
# included in the list of nodes whose yield nodes are all also yield nodes of node_list (NYAAYNN)
|
440
|
+
# even though not all of their yield nodes are yield nodes of the node_list.
|
441
|
+
# accept_anyway_proc can implement arbitrary rules for including nodes in NYAAYNN.
|
442
|
+
# The procedure is called with three arguments:
|
443
|
+
# accept_anyway_proc(node, ch_in, ch_out)
|
444
|
+
# node is a SynNode that would not normally be in NYAAYNN.
|
445
|
+
# ch_in is the list of its children that are in NYAAYNN.
|
446
|
+
# ch_out is the list of its children that are not.
|
447
|
+
# If the procedure exists and returns true, node is put into NYAAYNN.
|
448
|
+
#
|
449
|
+
# returns: an array of SynNodes: the maximal constituents that together
|
450
|
+
# exactly cover node_list
|
451
|
+
# Compute maximal constituents for +node_list+ by running
# max_constituents_aux on every syntactic root.
#
# @param node_list [Array] SynNode objects to cover
# @param include_single_missing_children [Boolean] passed to max_constituents_aux
# @param ignore_empty_terminals [Boolean] passed to max_constituents_aux
# @param accept_anyway_proc [Proc, nil] passed to max_constituents_aux
# @return [Array] splitword nodes of +node_list+, then the constituents
#   found, then the words not covered by any of them
def max_constituents_smc(node_list, include_single_missing_children,
                         ignore_empty_terminals = false, accept_anyway_proc = nil)
  # splitwords are taken over unchanged; from all other nodes we collect
  # the non-punctuation yield
  splitwords, plain_nodes = node_list.partition { |n| n.is_splitword? }
  words = plain_nodes.flat_map { |n| n.yield_nodes.reject { |t| t.is_punct? } }

  constituents = splitwords

  syn_roots.each do |root|
    verdict, covered_below = max_constituents_aux(root, words,
                                                  include_single_missing_children,
                                                  ignore_empty_terminals,
                                                  accept_anyway_proc)
    if verdict == "true"
      constituents << root
    else
      constituents.concat covered_below
    end
  end

  # which words remain to be added?
  constituents.each { |found| words -= found.yield_nodes }
  constituents.concat words

  constituents
end
|
492
|
+
|
493
|
+
private
|
494
|
+
|
495
|
+
###
|
496
|
+
# recursively determine maximum constituents covering the nodes in 'nodelist',
|
497
|
+
# starting at 'node'.
|
498
|
+
# punctuation terminals (and optionally empty terminals) are ignored.
|
499
|
+
#
|
500
|
+
# If include_single_missing_children is set to true,
|
501
|
+
# then a node that has at least one child whose yield is in nodelist,
|
502
|
+
# and has only one child whose yield is not in nodelist,
|
503
|
+
# will be considered as having its yield in nodelist.
|
504
|
+
#
|
505
|
+
# If accept_anyway_proc is nonnil, also use that to decide whether
|
506
|
+
# a node will be considered as having its yield in nodelist.
|
507
|
+
#
|
508
|
+
# returns: pair [mybool, included_descendants]
|
509
|
+
# where mybool is a string, "true", "false" or "ignoreme" (for ignored
|
510
|
+
# punctuation and empty terminals):
|
511
|
+
# does the yield of this node consist entirely of nodes from nodelist?
|
512
|
+
# and included_descendants is a list of SynNodes: if mybool is "false",
|
513
|
+
# this is a list of descendants of this node whose yield does consist
|
514
|
+
# entirely of nodes from nodelist
|
515
|
+
# Recursive worker for max_constituents_smc.
#
# @param node the SynNode to classify
# @param nodelist [Array] SynNodes that count as covered
# @param include_single_missing_children [Boolean] treat a node with
#   exactly one uncovered child and more than one covered child as covered
# @param ignore_empty_terminals [Boolean] ignore terminals whose word is
#   nil or empty
# @param accept_anyway_proc [Proc, nil] called with (node, covered children,
#   uncovered children); a true result marks the node as covered
# @return [Array] pair [status, included_descendants] where status is
#   "true", "false" or "ignoreme"; included_descendants is only non-empty
#   for "false"
def max_constituents_aux(node, nodelist,
                         include_single_missing_children = false,
                         ignore_empty_terminals = false,
                         accept_anyway_proc = nil)
  if node.is_terminal?
    # terminal listed in nodelist
    return ["true", []] if nodelist.include?(node)
    # punctuation: ignore
    return ["ignoreme", []] if node.is_punct?
    # empty terminal: possibly ignore
    if ignore_empty_terminals && (node.word.nil? || node.word.empty?)
      return ["ignoreme", []]
    end
    # terminal, but not included in nodelist
    return ["false", []]
  elsif node.is_punct?
    return ["ignoreme", []]
  end

  # classify every child: [child, status, included_descendants]
  outcomes = node.children.map do |child|
    status, covered_below = max_constituents_aux(child, nodelist,
                                                 include_single_missing_children,
                                                 ignore_empty_terminals,
                                                 accept_anyway_proc)
    [child, status, covered_below]
  end

  missing = outcomes.select { |_, status, _| status == "false" }
  present = outcomes.select { |_, status, _| status == "true" }

  if missing.empty?
    # nothing but "ignoreme" children => "ignoreme";
    # otherwise all children are covered (or ignored) => "true"
    return [present.empty? ? "ignoreme" : "true", []]
  end

  # one child not covered and several covered:
  # consider the single missing child as covered, too
  if include_single_missing_children && missing.length == 1 && present.length > 1
    return ["true", []]
  end

  # some external source tells us to consider the missing children
  # as covered, too
  if accept_anyway_proc &&
     accept_anyway_proc.call(node, present.map(&:first), missing.map(&:first))
    return ["true", []]
  end

  # not all children covered: report covered children themselves and,
  # for the others, whatever they reported as covered below them
  collected = outcomes.map do |child, status, covered_below|
    status == "true" ? [child] : covered_below
  end
  ["false", collected.flatten]
end
|
589
|
+
|
590
|
+
protected
|
591
|
+
|
592
|
+
# @return the output of @syn.get followed by the output of @sem.get,
#   combined with +
def get_xml_ofchildren
  syntactic_part = @syn.get
  syntactic_part + @sem.get
end
|
595
|
+
end
|
596
|
+
end
|