shalmaneser-lib 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1,285 @@
|
|
1
|
+
module STXML
|
2
|
+
# RegXML
|
3
|
+
#
|
4
|
+
# Katrin Erk June 2005
|
5
|
+
|
6
|
+
# SalsaTigerRegXML: take control of the data structure, no underlying xml
|
7
|
+
# representation anymore, re-generation of xml on demand
|
8
|
+
|
9
|
+
class RegXML
  # Regex-based view of ONE XML element (or of one piece of text).
  #
  # The element is kept as a string; name, attributes and children are
  # extracted with regular expressions each time they are requested.
  #
  # NOTE: a child element is delimited by the FIRST closing tag carrying
  # its name, so an element that directly contains another element of the
  # same name (<a><a></a></a>) is not split correctly.

  # @param string [String] a single XML element, or plain text
  # @param i_am_text [Boolean] true if +string+ is text rather than an element
  # @raise [RuntimeError] if +string+ is not a String, or if it is declared
  #   to be an element but is not a single, bracket-balanced XML element
  def initialize(string, i_am_text = false)
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class}"
    end

    @i_am_text = i_am_text ? true : false

    if @i_am_text
      @s = string
    else
      # All patterns below use '.', which does not match newlines,
      # so the element is flattened onto a single line first.
      @s = string.gsub(/\n/, " ").freeze

      element_test
      dyck_test
    end
  end

  # @param child_name [String] element name to look for
  # @return [RegXML, nil] first direct child with that name, or nil
  def first_child_matching(child_name)
    children_and_text.detect { |c| c.name == child_name }
  end

  # Yields every direct child whose name equals +child_name+.
  # Without a block an Enumerator over those children is returned.
  #
  # @param child_name [String] element name to look for
  def each_child_matching(child_name)
    return enum_for(:each_child_matching, child_name) unless block_given?

    children_and_text.each do |c|
      yield c if c.name == child_name
    end
  end

  # @return [String] the stored string with a newline after every '>'
  def to_s
    xml_readable(@s)
  end

  # @return [Boolean] true if this object represents text, not an element
  def text?
    @i_am_text
  end

  # Return the name of the xml element contained in the string.
  #
  # @return [String, nil] name of the element; nil for text
  # @raise [RuntimeError] if no element name can be found
  def name
    return nil if @i_am_text

    found = /^\s*<\s*([\w-]+)[\s\/>]/.match(@s)
    raise "Cannot parse:\n#{xml_readable(@s)}" unless found

    found[1]
  end

  # Return a hash of attributes and their values.
  #
  # @return [Hash<String, String>] attributes of the element; {} for text
  # @raise [RuntimeError] if the start tag cannot be parsed
  def attributes
    return {} if @i_am_text

    # Drop "<element_name"; what follows is (name=value)* and then > or />
    # (plus the rest of the element, which is never looked at).
    start_tag = /^\s*<\s*#{Regexp.escape(name)}(.*)$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless start_tag

    retv = {}
    elt_contents = start_tag[1]

    # repeat until only > or /> is left
    while elt_contents !~ /^\s*\/?>/
      # Shave off the next name=value pair. The back-reference makes sure
      # a value quoted with ' may contain ", and vice versa.
      pair = /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/.match(elt_contents)
      raise "Cannot parse:\n #{xml_readable(elt_contents)}" unless pair

      retv[pair[1]] = pair[3]
      elt_contents = pair[4]
    end

    retv
  end

  # Split the element content into child elements and pieces of text.
  #
  # @return [Array<RegXML>] children in document order; text pieces are
  #   RegXML objects for which text? is true. Empty for text objects and
  #   for unary elements (<bla/>).
  # @raise [RuntimeError] if the content cannot be parsed
  def children_and_text
    return [] if @i_am_text || unary_element

    # @s has the form <bla ...> ... </bla>: capture what lies in between.
    mainname = Regexp.escape(name)
    whole = /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless whole

    retv = []
    children_s = whole[3]

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/
      # shave off the next bit of text (everything up to the next '<')
      text = /^\s*(.*?)(<.*$|$)/.match(children_s)
      unless text
        report_whole_element
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end

      unless text[1].strip.empty?
        children_s = text[2]
        retv << RegXML.new(text[1], true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # Determine the next child's name and where its start tag stops
      # just before either "/>" or ">".
      start = /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
      unless start
        report_whole_element
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end

      child = start[1]
      childname = Regexp.escape(start[2])
      remainder = children_s[start[0].length..-1]

      if (unary = /^\/>(.*)$/.match(remainder))
        # next child is a unary element
        children_s = unary[1]
        retv << RegXML.new(child + "/>")

      elsif (paired = /^(>.*?<\s*\/\s*#{childname}\s*>)(.*)$/.match(remainder))
        # next child runs up to the first closing tag with its name
        children_s = paired[2]
        retv << RegXML.new(child + paired[1])

      else
        report_whole_element
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Manual smoke test: parses two sample elements and prints their name,
  # string form, attributes and children to standard output.
  def self.test
    dump = lambda do |elt|
      puts "name " + elt.name
      puts
      puts elt.to_s
      puts
      elt.attributes.each { |attr, val| puts "attr " + attr + "=" + val }
      puts
      elt.children_and_text.each do |child_obj|
        label = child_obj.text? ? "da text " : "da child "
        puts label + child_obj.to_s
      end
      puts
    end

    dump.call(RegXML.new(" <bla blupp='a\"b'
lalala=\"c\">
<lalala> </lalala>
texttext
<lala blupp='b'/>
nochtext
<la> <l/> </la>
</ bla >
"))

    puts "NEU"
    dump.call(RegXML.new(" < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # @return [Boolean] true if @s starts with '<' and ends with '/>',
  #   i.e. looks like <bla/>
  def unary_element
    @s =~ /^\s*<.*\/>\s*$/ ? true : false
  end

  # Make sure we have a single XML element, either <bla/> or
  # <bla> ... </bla>.
  #
  # @raise [RuntimeError] otherwise
  def element_test
    return if unary_element
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check: every prefix of @s must have at least as many
  # '<' as '>', and in total @s must have equally many of both.
  #
  # @raise [RuntimeError] if either condition is violated
  def dyck_test
    opening = 0
    closing = 0

    @s.scan(/[<>]/) do |bracket|
      if bracket == "<"
        opening += 1
      else
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    end

    unless opening == closing
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # @param string [String]
  # @return [String] copy of +string+ with a newline after every '>'
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # Print the complete element, followed by an empty line, to standard
  # error; used as context right before a parse error is raised.
  def report_whole_element
    $stderr.puts "Whole was:\n #{xml_readable(@s)}"
    $stderr.puts
  end
end
|
285
|
+
end
|
@@ -0,0 +1,596 @@
|
|
1
|
+
require_relative 'xml_node'
|
2
|
+
require_relative 'salsa_tiger_sentence_graph'
|
3
|
+
require_relative 'salsa_tiger_sentence_sem'
|
4
|
+
require_relative 'reg_xml'
|
5
|
+
|
6
|
+
module STXML
|
7
|
+
#############
|
8
|
+
# class SalsaTigerSentence
|
9
|
+
#
|
10
|
+
# offers access methods to a SalsaTigerXML sentence
|
11
|
+
# given as a string
|
12
|
+
#
|
13
|
+
# Nodes of syntactic structure as well as frames and
|
14
|
+
# frame elements are kept (and returned) as XMLNode objects,
|
15
|
+
# or more specifically as SynNode, FrameNode and FeNode objects.
|
16
|
+
#
|
17
|
+
# methods:
|
18
|
+
#
|
19
|
+
# new initializes the object
|
20
|
+
#
|
21
|
+
# id returns the sentence ID
|
22
|
+
#
|
23
|
+
# get returns the REXML object describing the same sentence
|
24
|
+
# as this object
|
25
|
+
#
|
26
|
+
# each_terminal yields each terminal of the sentence in turn.
|
27
|
+
# they are returned as SynNode objects
|
28
|
+
#
|
29
|
+
# terminals returns all terminal node objects in an array
|
30
|
+
#
|
31
|
+
# each_terminal_sorted yields each terminal of the sentence in turn,
|
32
|
+
# making sure the terminal with the lowest ID is returned first.
|
33
|
+
# use this if you need the terminal words in the right order!
|
34
|
+
# nodes are returned as SynNode objects
|
35
|
+
#
|
36
|
+
# each_nonterminal yields each nonterminal of the sentence in turn.
|
37
|
+
# nodes are returned as SynNode objects
|
38
|
+
#
|
39
|
+
# each_frame yields each frame of the sentence in turn.
|
40
|
+
# nodes are returned as FrameNode objects
|
41
|
+
#
|
42
|
+
# frames returns all frame objects in an array
|
43
|
+
#
|
44
|
+
# each_usp_frameblock
|
45
|
+
# yields each group of underspecified frames of the sentence
|
46
|
+
# in turn, as an UspNode object. To see the frames involved
|
47
|
+
# in this underspecification, use each_child on the UspNode object
|
48
|
+
#
|
49
|
+
#
|
50
|
+
# usp_frameblocks returns all groups of underspecified frames as an array
|
51
|
+
# of UspNode objects
|
52
|
+
#
|
53
|
+
# each_usp_feblock
|
54
|
+
# yields each group of underspecified frame elements
|
55
|
+
# of the sentence in turn, as an UspNode object.
|
56
|
+
# To see the frames involved
|
57
|
+
# in this underspecification, use each_child on the UspNode object
|
58
|
+
#
|
59
|
+
# usp_feblocks returns all groups of underspecified frame elements
|
60
|
+
# as an array of UspNode objects
|
61
|
+
#
|
62
|
+
#
|
63
|
+
# flags returns a list of the sentence flags, as hashes.
|
64
|
+
# key "type": a string, either REEXAMINE or WRONGSUBCORPUS
|
65
|
+
# or INTERESTING or LATER
|
66
|
+
# key "param": a string, the parameter. important for
|
67
|
+
# REEXAMINE
|
68
|
+
# key "text": a string, the text of this flag. Will be
|
69
|
+
# nonempty only for INTERESTING cases
|
70
|
+
#
|
71
|
+
# syn_roots returns a list of all the roots of the syntactic trees
|
72
|
+
# in this sentence, as node objects. There may be more than
|
73
|
+
# one, unfortunately.
|
74
|
+
#
|
75
|
+
# add_syn add a new syntactic node with the given category, word, POS,
|
76
|
+
# returns the new node
|
77
|
+
#
|
78
|
+
# add_frame add a frame with a given name, returns the new frame node
|
79
|
+
#
|
80
|
+
# add_usp add a new underspecification block, either for frames or FEs
|
81
|
+
#
|
82
|
+
# add_flag adds a sentence flag to this sentence.
|
83
|
+
# type: a string, must be REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
84
|
+
# or LATER
|
85
|
+
# param: optional parameter, a string, describes type of Reexamine
|
86
|
+
# for REEXAMINE-type flags
|
87
|
+
# text: optional parameter, a string, arbitrary text commenting
|
88
|
+
# on the flag, used mainly with INTERESTING
|
89
|
+
#
|
90
|
+
# remove_flag removes a sentence flag from this sentence
|
91
|
+
# only removes flag in case of exact match of type, param, and text
|
92
|
+
# type: a string, either REEXAMINE, INTERESTING, WRONGSUBCORPUS,
|
93
|
+
# or LATER
|
94
|
+
# param: optional parameter, a string, describes type of Reexamine
|
95
|
+
# for REEXAMINE-type flags
|
96
|
+
# text: optional parameter, a string, arbitrary text commenting
|
97
|
+
# on the flag, used mainly with INTERESTING
|
98
|
+
class SalsaTigerSentence < XMLNode
  # One SalsaTigerXML sentence (<s> element).
  #
  # The syntactic part (<graph>) is held in a SalsaTigerSentenceGraph, the
  # semantic part (<sem>) in a SalsaTigerSentenceSem; most public methods
  # simply delegate to one of these two objects.

  # Build a sentence that has an empty <graph/> and an empty <sem/>.
  #
  # @param sentence_id [String] ID of the new sentence
  # @return [SalsaTigerSentence]
  def self.empty_sentence(sentence_id)
    # The ID ends up inside a single-quoted attribute, so a literal '
    # has to be escaped or it would terminate the attribute value.
    sentence_id = sentence_id.gsub(/'/, "&apos;")
    sent_string = "<s id='#{sentence_id}'>\n" \
                  "<graph/>\n" \
                  "<sem/>\n" \
                  "</s>"

    SalsaTigerSentence.new(sent_string)
  end

  # @param string [String] the <s> element of one sentence
  def initialize(string)
    # parse string as an XML element
    xml_obj = RegXML.new(string)

    # initialize this object as an XML node,
    # i.e. remember the outermost element's name, attributes,
    # and ID, and specify that it's not a text but an XML object
    super(xml_obj.name, xml_obj.attributes, SalsaTigerXmlNode.xmlel_id(xml_obj), false)

    # children_and_text re-parses the string on every call: do it once
    children = xml_obj.children_and_text

    # <graph> holds the syntactic info of the sentence;
    # fake an empty one if the sentence has none
    xml_syn_obj = children.detect { |thing| thing.name == "graph" } ||
                  RegXML.new("<graph/>")

    @syn = SalsaTigerSentenceGraph.new(xml_syn_obj, id)

    # <sem> holds the semantic info of the sentence;
    # fake an empty one if the sentence has none
    xml_sem_obj = children.detect { |thing| thing.name == "sem" } ||
                  RegXML.new("<sem/>")

    # add splitword info to @syn element
    @syn.add_splitwords(SalsaTigerSentenceSem.get_splitwords(xml_sem_obj))

    @sem = SalsaTigerSentenceSem.new(xml_sem_obj, id, @syn.node)

    # remember all children except <graph> and <sem> for later output
    children.each do |child_or_text|
      add_kith(child_or_text) unless %w[graph sem].include?(child_or_text.name)
    end
  end

  # @return the string form of the syntactic part (@syn.to_s)
  def to_s
    @syn.to_s
  end

  ###################################
  # read access, delegated to the syntactic part

  def each_terminal(&block)
    @syn.each_terminal(&block)
  end

  def each_terminal_sorted(&block)
    @syn.each_terminal_sorted(&block)
  end

  def terminals
    @syn.terminals
  end

  def terminals_sorted
    @syn.terminals_sorted
  end

  def each_nonterminal(&block)
    @syn.each_nonterminal(&block)
  end

  def nonterminals
    @syn.nonterminals
  end

  def each_syn_node(&block)
    @syn.each_node(&block)
  end

  def syn_nodes
    @syn.nodes
  end

  def syn_roots
    @syn.syn_roots
  end

  # @param syn_id ID of a syntactic node
  # @return the entry stored under +syn_id+ in @syn.node
  def syn_node_with_id(syn_id)
    @syn.node[syn_id]
  end

  ###################################
  # read access, delegated to the semantic part

  # @param sem_id ID of a semantic node
  # @return the entry stored under +sem_id+ in @sem.node
  def sem_node_with_id(sem_id)
    @sem.node[sem_id]
  end

  def each_frame(&block)
    @sem.each_frame(&block)
  end

  def frames
    @sem.frames
  end

  def each_usp_frameblock(&block)
    @sem.each_usp_frameblock(&block)
  end

  def usp_frameblocks
    @sem.usp_frameblocks
  end

  def each_usp_feblock(&block)
    @sem.each_usp_feblock(&block)
  end

  def usp_feblocks
    @sem.usp_feblocks
  end

  def flags
    @sem.flags
  end

  ###################################
  # adding and removing things

  # Add a syntactic node, specified as terminal (t) or nonterminal (nt).
  #
  # @param label [String] "t" or "nt"
  # @param cat [String, nil] category
  # @param word [String, nil] word
  # @param pos [String, nil] part of speech
  # @param syn_id [String, nil] ID for the new node
  # @return the new node
  def add_syn(label, cat = nil, word = nil, pos = nil, syn_id = nil)
    @syn.add_node(id, label, cat, word, pos, syn_id)
  end

  def remove_syn(node)
    @syn.remove_node(node)
  end

  # @param name [String] name of the frame
  # @param sem_id [String, nil] ID for the new node
  def add_frame(name, sem_id = nil)
    @sem.add_frame(id, name, sem_id)
  end

  # @param frame_node FrameNode object
  def remove_frame(frame_node)
    @sem.remove_frame(frame_node)
  end

  def add_fe(frame_obj, name, fe_children, sem_id = nil)
    @sem.add_fe(frame_obj, name, fe_children, sem_id)
  end

  def remove_fe(fe_node)
    @sem.remove_fe(fe_node)
  end

  def add_usp(frame_or_fe)
    @sem.add_usp(frame_or_fe)
  end

  # @param usp_node UspNode object
  def remove_usp(usp_node)
    @sem.remove_usp(usp_node)
  end

  def add_flag(type, param = nil, text = nil)
    @sem.add_flag(type, param, text)
  end

  def remove_flag(type, param = nil, text = nil)
    @sem.remove_flag(type, param, text)
  end

  # Replace the semantic part of this sentence by an empty one.
  def remove_semantics
    empty_sem = RegXML.new("<sem/>")
    @sem = SalsaTigerSentenceSem.new(empty_sem, id, @syn.node)
  end

  #################
  # output
  def get_syn
    @syn.get
  end

  # Extend a node set to the contiguous span of sorted terminals between
  # the leftmost and the rightmost terminal it yields, and return the
  # maximal constituents covering that span.
  #
  # If the node set yields no terminals, or yields a terminal that is not
  # among the sorted terminals of this sentence, a warning is printed and
  # +node_set+ is returned unchanged.
  #
  # @param node_set [Array] nodes responding to yield_nodes_ordered
  # @return [Array] node_set itself, or the result of
  #   max_constituents_for_nodes for the completed span
  def convex_complemented(node_set)
    terminals = terminals_sorted

    yield_nodes = node_set.map { |node| node.yield_nodes_ordered }.flatten
    positions = yield_nodes.map { |t| terminals.index(t) }

    # Check for nil BEFORE taking min/max: comparing nil with an Integer
    # raises ArgumentError.
    if positions.empty? || positions.include?(nil)
      STDERR.puts "Warning: could not complement projected node set "\
                  "#{yield_nodes.map(&:id)}. "\
                  "Terminals not found in sorted set of sentence terminals!?"
      return node_set
    end

    leftmost, rightmost = positions.minmax

    STDERR.puts "Replacing " + yield_nodes.join(" ")
    new_node_set = terminals[leftmost..rightmost]
    STDERR.puts "By " + new_node_set.join(" ")

    max_constituents_for_nodes(new_node_set)
  end

  # Determine maximal constituents covering the input nodes by walking
  # the syntactic trees top-down from the roots.
  #
  # @param node_list [Array] SynNodes
  # @param ignore_empty_terminals [Boolean] ignore terminals whose word is
  #   nil or empty when computing the yield of a node?
  # @return [Array] constituents found, followed by words for which no
  #   constituent was found, followed by the splitword nodes of node_list
  def max_constituents_for_nodes(node_list, ignore_empty_terminals = false)
    words, splitwords = words_and_splitwords(node_list)

    constituents = []
    # Work on a copy: the queue is consumed and extended below, and
    # whether syn_roots hands out a fresh array is not visible here.
    nodes_to_check = syn_roots.dup # (there may be more than one root)

    # stop when all words are covered or no node is left to check
    until words.empty? || nodes_to_check.empty?
      node = nodes_to_check.shift

      # only match nonempty non-punctuation nodes
      node_yield = node.yield_nodes.reject { |n| n.is_punct? }
      if ignore_empty_terminals
        node_yield = node_yield.reject do |n|
          n.is_terminal? && (n.word.nil? || n.word.empty?)
        end
      end

      # no yield, or only punctuation yield: skip this node
      next if node_yield.empty?

      rest = node_yield - words
      if rest.empty?
        # whole yield of node consists of words we are looking for
        constituents << node
        words -= node_yield
      elsif rest.size < node_yield.size
        # at least some of the words appear below this node:
        # check this node's children too
        node.children.each { |child| nodes_to_check << child }
      end
    end

    # any leftover words that may not be from this sentence? just keep them.
    constituents.concat(words)
    # splitwords stay what they are
    constituents.concat(splitwords)

    constituents
  end

  # Determine maximal constituents covering the nodes in node_list;
  # punctuation terminals (and optionally empty terminals) are ignored.
  #
  # A node counts as covered when all of its children that are not
  # ignored are covered. Two options relax this:
  #
  # * include_single_missing_children: a node with more than one covered
  #   child and exactly one child that is not covered counts as covered.
  #   NOTE(review): the original description said "at least one" covered
  #   child, the code requires more than one -- confirm which is intended.
  # * accept_anyway_proc: called as accept_anyway_proc(node, ch_in, ch_out)
  #   for a node that would not count as covered, where ch_in are its
  #   covered children and ch_out the children that are not covered.
  #   If it returns true, the node counts as covered.
  #
  # @param node_list [Array] SynNodes
  # @param include_single_missing_children [Boolean]
  # @param ignore_empty_terminals [Boolean] ignore empty terminals?
  # @param accept_anyway_proc [Proc, nil]
  # @return [Array] splitword nodes of node_list, followed by the maximal
  #   constituents found, followed by words not covered by any of them
  def max_constituents_smc(node_list,
                           include_single_missing_children,
                           ignore_empty_terminals = false,
                           accept_anyway_proc = nil)
    words, splitwords = words_and_splitwords(node_list)

    constituents = splitwords

    syn_roots.each do |node|
      node_included, descendants_included =
        max_constituents_aux(node, words,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)

      if node_included == "true"
        constituents << node
      else
        constituents.concat descendants_included
      end
    end

    # which words remain to be added?
    constituents.each { |c| words -= c.yield_nodes }
    constituents.concat words

    constituents
  end

  private

  # Recursive helper of max_constituents_smc, starting at 'node'; see
  # there for include_single_missing_children and accept_anyway_proc.
  #
  # @param node the SynNode to examine
  # @param nodelist [Array] terminals that are to be covered
  # @return [Array] pair [status, included_descendants]:
  #   status is the string "true" (node is covered), "false", or
  #   "ignoreme" (ignored punctuation / empty terminal, or a node with
  #   only such children); included_descendants is nonempty only for
  #   "false" and lists the covered descendants of this node.
  def max_constituents_aux(node,
                           nodelist,
                           include_single_missing_children = false,
                           ignore_empty_terminals = false,
                           accept_anyway_proc = nil)
    if node.is_terminal? && nodelist.include?(node)
      # node is terminal and included in nodelist
      return ["true", []]
    elsif node.is_punct?
      # punctuation: ignore
      return ["ignoreme", []]
    elsif ignore_empty_terminals && node.is_terminal? &&
          (node.word.nil? || node.word.empty?)
      # empty terminal: possibly ignore
      return ["ignoreme", []]
    elsif node.is_terminal?
      # terminal, but not included in nodelist
      return ["false", []]
    end

    children_results = node.children.map do |ch|
      fully_included, descendants_included =
        max_constituents_aux(ch, nodelist,
                             include_single_missing_children,
                             ignore_empty_terminals,
                             accept_anyway_proc)
      [ch, fully_included, descendants_included]
    end

    res_false = children_results.select { |_ch, status, _desc| status == "false" }
    res_true = children_results.select { |_ch, status, _desc| status == "true" }

    if res_false.empty? && !res_true.empty?
      # all true, or all true and ignoreme
      ["true", []]

    elsif res_false.empty? && res_true.empty?
      # all ignoreme
      ["ignoreme", []]

    elsif res_false.length == 1 && res_true.length > 1 &&
          include_single_missing_children
      # one child not covered: consider the single missing child as
      # covered, too
      ["true", []]

    elsif accept_anyway_proc &&
          accept_anyway_proc.call(node,
                                  res_true.map { |ch, _status, _desc| ch },
                                  res_false.map { |ch, _status, _desc| ch })
      # some external source tells us that
      # we are to consider the missing children as covered, too
      ["true", []]

    else
      # not all children covered: pass up whatever is covered below
      covered = children_results.map do |ch, status, descendants_included|
        status == "true" ? [ch] : descendants_included
      end
      ["false", covered.flatten]
    end
  end

  # Sort node_list into splitwords and the rest.
  #
  # @param node_list [Array] SynNodes
  # @return [Array] pair [words, splitwords]: words are the
  #   non-punctuation yield nodes of all nodes that are not splitwords,
  #   splitwords are the nodes for which is_splitword? is true
  def words_and_splitwords(node_list)
    splitwords, others = node_list.partition { |node| node.is_splitword? }
    words = others.flat_map { |node| node.yield_nodes.reject { |t| t.is_punct? } }
    [words, splitwords]
  end

  protected

  # @return concatenation of the output of the syntactic and the
  #   semantic part (@syn.get + @sem.get)
  def get_xml_ofchildren
    @syn.get + @sem.get
  end
end
|
596
|
+
end
|