shalmaneser-prep 1.2.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,269 @@
1
+ # RegXML
2
+ #
3
+ # Katrin Erk June 2005
4
+
5
+ # SalsaTigerRegXML: take control of the data structure, no underlying xml
6
+ # representation anymore, re-generation of xml on demand
7
+
8
# RegXML: a lightweight, regular-expression based view on a single XML
# element (or on a piece of text between elements).
#
# The object keeps the XML as a string and extracts the element name,
# the attributes and the children on demand. It is not a validating XML
# parser: attribute values must be quoted with ' or ", element and
# attribute names are restricted to [\w-]+, and comments, CDATA sections and
# processing instructions are not supported.
class RegXML
  # string:    string representing a single XML element (or text)
  # i_am_text: boolean: xml element (false) or text (true)
  #
  # For XML elements, newlines are replaced by blanks, the string is frozen
  # and two sanity checks are run (single element, balanced brackets).
  # Text is stored as given, without any check.
  #
  # Raises RuntimeError if 'string' is not a String or if the element
  # checks fail.
  def initialize(string, i_am_text = false)
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test
      dyck_test
    end
  end

  # Returns the stored string with a newline inserted after every '>'.
  def to_s
    xml_readable(@s)
  end

  # Returns true if this object represents text rather than an XML element.
  def text?
    @i_am_text
  end

  # Returns the element name, or nil for text.
  # Raises RuntimeError if no element name can be found.
  def name
    return nil if @i_am_text

    found = /^\s*<\s*([\w-]+)[\s\/>]/.match(@s)
    raise "Cannot parse:\n#{xml_readable(@s)}" unless found

    found[1]
  end

  # Returns a hash attribute name => attribute value (both strings) for the
  # start tag of this element; an empty hash for text.
  # Raises RuntimeError if the start tag cannot be parsed.
  def attributes
    return {} if @i_am_text

    # remove <element_name from the beginning of @s; what is left starts
    # with a string of the form
    # - either (name=value)*>
    # - or (name=value)*/>
    head = /^\s*<\s*#{Regexp.escape(name)}(.*)$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless head

    retv = {}
    elt_contents = head[1]

    # repeat until only > or /> is left at the front
    until elt_contents =~ /^\s*\/?>/
      # shave off the next name=value pair, keep the rest.
      # The backreference makes sure that a value quoted with '
      # may contain ", and vice versa.
      pair = /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/.match(elt_contents)
      raise "Cannot parse:\n #{xml_readable(elt_contents)}" unless pair

      retv[pair[1]] = pair[3]
      elt_contents = pair[4]
    end

    retv
  end

  # Returns an array of RegXML objects, one per child element and one per
  # stretch of text between child elements, in document order.
  # Text and unary elements (<bla/>) have no children.
  # Leading whitespace of a text stretch is dropped, trailing whitespace
  # is kept.
  # Raises RuntimeError if the element content cannot be parsed.
  def children_and_text
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element

    # @s has the form <bla...> ... </bla>:
    # capture everything between the start tag and the final end tag
    mainname = Regexp.escape(name)
    whole = /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/.match(@s)
    raise "Cannot parse:\n #{xml_readable(@s)}" unless whole

    retv = []
    children_s = whole[3]

    # repeat until only whitespace is left
    until children_s =~ /^\s*$/
      # shave off the next bit of text (everything up to the next '<')
      text = /^\s*(.*?)(<.*$|$)/.match(children_s)
      unless text
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      unless text[1].strip.empty?
        children_s = text[2]
        retv << RegXML.new(text[1], true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # determine the next child's name and start tag, up to but excluding
      # the final /> or >
      start = /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/.match(children_s)
      unless start
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = start[1]
      childname = start[2]
      rest = children_s[start.end(0)..-1]

      # shave off the child, keep what follows it
      close_ix = rest.start_with?(">") ? matching_close_index(rest, childname) : nil
      if rest.start_with?("/>")
        # next child is a unary element
        children_s = rest[2..-1]
        retv << RegXML.new(child + "/>")
      elsif close_ix
        # next child has the form <bla ...> ... </bla>
        children_s = rest[close_ix..-1]
        retv << RegXML.new(child + rest[0...close_ix])
      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Small demo: parses two hand-written elements and prints name,
  # attributes and children to stdout.
  def self.test
    bla = RegXML.new(" <bla blupp='a\"b'
lalala=\"c\">
<lalala> </lalala>
texttext
<lala blupp='b'/>
nochtext
<la> <l/> </la>
</ bla >
")
    puts "name " + bla.name()
    puts
    puts bla.to_s()
    puts
    bla.attributes.each { |attr, val|
      puts "attr " + attr + "=" + val
    }
    puts
    bla.children_and_text.each { |child_obj|
      if child_obj.text?
        puts "da text " + child_obj.to_s
      else
        puts "da child " + child_obj.to_s
      end
    }
    puts

    puts "NEU"
    bla = RegXML.new(" < bla blupp='a\"'/> ")
    puts "name " + bla.name()
    puts
    puts bla.to_s()
    puts
    bla.attributes.each { |attr, val|
      puts "attr " + attr + "=" + val
    }
    puts
    bla.children_and_text.each { |child_obj|
      if child_obj.text?
        puts "da text " + child_obj.to_s
      else
        puts "da child " + child_obj.to_s
      end
    }
    puts
  end

  ##############
  protected

  # Returns true if @s starts with '<' and ends in '/>' (modulo surrounding
  # whitespace), i.e. looks like <bla/>. Nothing in between is inspected.
  def unary_element
    @s =~ /^\s*<.*\/>\s*$/ ? true : false
  end

  # Makes sure we have a single XML element, either <bla/> or
  # <bla>...</bla>. Raises RuntimeError otherwise.
  def element_test
    # <bla/>
    return if unary_element
    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check: every prefix of @s must have at least as many
  # < as >, and in total there must be equally many of both.
  # Raises RuntimeError otherwise.
  def dyck_test
    balance = 0 # number of < seen minus number of > seen
    @s.scan(/[<>]/) do |bracket|
      balance += (bracket == "<" ? 1 : -1)
      if balance < 0
        raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
      end
    end

    unless balance.zero?
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # Returns 'string' with a newline after every '>'.
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # 'str' is what follows the start tag of an element called 'elt_name'
  # (beginning with the '>' that closes the start tag).
  # Returns the index in 'str' just behind the end tag that closes this
  # element, or nil if there is none. Start tags of the same name nested
  # inside are counted, so the end tag returned is the matching one rather
  # than simply the first one. Unary tags of the same name do not count.
  def matching_close_index(str, elt_name)
    quoted = Regexp.escape(elt_name)
    # first alternative: end tag; second alternative: start or unary tag,
    # with group 1 holding "/" for a unary tag and "" for a start tag
    tag = /<\s*\/\s*#{quoted}\s*>|<\s*#{quoted}(?:\s+[\w-]+=(?:"[^"]*"|'[^']*'))*\s*(\/?)>/

    depth = 1
    pos = 0
    while (found = tag.match(str, pos))
      if found[1].nil?
        # end tag
        depth -= 1
        return found.end(0) if depth.zero?
      elsif found[1].empty?
        # nested start tag of the same name
        depth += 1
      end
      pos = found.end(0)
    end

    nil
  end
end
267
+
268
+ # RegXML.test()
269
+
@@ -0,0 +1,194 @@
1
+ #########
2
+ # module StringTerminalsInRightOrder
3
+ #
4
+ # returns the yield of a node, or a list of nodes, as a string
5
+ # of " "-separated words
6
+ #
7
+ # Words are put into the right order, left to right,
8
+ # under the assumption that their node IDs reflect that order
9
+ #
10
+ # Terminal nodes are assumed to have IDs ending in a number,
11
+ # numbered from left to right
12
+ #
13
+ # Splitword nodes are assumed to have IDs ending in N_sM
14
+ # for numbers N and M, where N orders terminals left to right
15
+ # and M orders the splitword parts left to right
16
+ #
17
+ # If the yield of the node/the list of nodes contains all splitwords of a terminal,
18
+ # the whole terminal is taken instead
19
+ #
20
+ # methods:
21
+ #
22
+ # string_for_node returns the string for the yield of a node
23
+ # node: a node object
24
+ #
25
+ # string_for_nodes returns the string for the yield of a list of nodes
26
+ # nodes: a list of node objects
27
+
28
# Mixin that computes the yield of a node, or of a list of nodes, as a
# string of words in left-to-right order.
#
# Node objects are expected to respond to: yield_nodes, is_splitword?,
# is_terminal?, parent, each_child, id and word.
module StringTerminalsInRightOrder
  # Returns the string for the yield of a single node.
  # node: a node object
  def string_for_node(node)
    string_for_nodes([node])
  end

  # Returns the string for the yield of a list of nodes.
  # nodes: a list of node objects
  #
  # Every word is followed by one blank, so a non-empty result ends in " ".
  def string_for_nodes(nodes)
    terminals = right_level_terminals_for_nodes(nodes)
    ordered = sort_terminals_and_splitwords_left_to_right(terminals)
    node_array_to_string(ordered)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # Nodes that are neither splitwords nor terminals are dropped.
  # Returns an array without duplicates.
  def right_level_terminals_for_nodes(nodes)
    a = nodes.map { |n| n.yield_nodes }.flatten
    b = []

    a.each do |n|
      if n.is_splitword?
        parent = n.parent

        if parent.nil?
          # splitword without a parent
          b << n
        elsif b.include?(parent) || a.include?(parent)
          # the splitword's parent is already in b, or is in the yield
          # itself: nothing to add for this splitword
        else
          # check if all children of the parent are in 'a'
          all_in = true
          parent.each_child { |nsibling|
            unless a.include? nsibling
              all_in = false
              break
            end
          }

          # all parts present: take the terminal; otherwise the part itself
          b << (all_in ? parent : n)
        end
      elsif n.is_terminal?
        b << n
      end
      # if n is anything but a splitword or a terminal, ignore it
    end

    b.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort do |a, b|
      if a.is_splitword? && b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? && b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    end
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method.
  # String their words together, each word followed by " ".
  def node_array_to_string(nodes)
    nodes.inject("") { |s, n| s + n.word + " " }
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    terminal_a = splitword_terminal_i(a)
    terminal_b = splitword_terminal_i(b)

    if terminal_a == terminal_b
      # parts of same terminal: compare parts
      last_i(a) <=> last_i(b)
    else
      # not parts of same terminal: compare terminals
      terminal_a <=> terminal_b
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  # If the pair is not one terminal plus one splitword, a warning is printed
  # to stderr and the two nodes are treated as equal (0). A sort block has
  # to return a number; returning nil would make Array#sort raise.
  def compare_mixed(a, b)
    if a.is_splitword? && b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)
    elsif a.is_terminal? && b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      0
    end
  end

  # return last number of the ID of a node
  # Prints a message to stderr and exits with status 1 if the ID contains
  # no such number.
  def last_i(n)
    # match the final string of digits; this should always work _in
    # principle_, but we might get weird node IDs for splitwords, so we
    # also accept one final letter behind the digits
    found = /(\d+)$/.match(n.id) || /(\d+)\w$/.match(n.id)

    if found.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end

    found[1].to_i # and return it as number
  end

  # assume the ID of the node includes N_sM
  # return N
  # Prints a message to stderr and exits with status 1 if the ID contains
  # no such number.
  def splitword_terminal_i(n)
    # match string of digits before splitword ID
    found = /(\d+)_s\d*/.match(n.id)

    if found.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end

    found[1].to_i # and return it as number
  end
end
193
+
194
+