shalmaneser-prep 1.2.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,269 @@
1
+ # RegXML
2
+ #
3
+ # Katrin Erk June 2005
4
+
5
+ # SalsaTigerRegXML: take control of the data structure, no underlying xml
6
+ # representation anymore, re-generation of xml on demand
7
+
8
# RegXML: a lightweight, regular-expression based view on one XML element
# (or one piece of text). The element is kept as a string; name, attributes
# and children are extracted from that string on demand.
#
# Limitations visible in the code: attribute names and element names must
# match [\w-]+, attributes must be written name="value" or name='value'
# without whitespace around '=', and the string must contain equally many
# '<' and '>' characters.
class RegXML
  # Wrap a string as an XML element or as plain text.
  #
  # @param string [String] a single XML element, or text if i_am_text is true
  # @param i_am_text [Boolean] false: string is an XML element (it is
  #   validated and its newlines are replaced by spaces); true: string is text
  #   and is stored unchanged
  # @raise [RuntimeError] if string is not a String, or if an element string
  #   fails the well-formedness checks
  def initialize(string, i_am_text = false)
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      # Newlines are turned into spaces so that the line-anchored regular
      # expressions below always see the whole element as one line.
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test()
      dyck_test()
    end
  end

  # @return [String] the stored string with a newline inserted after every '>'
  def to_s()
    xml_readable(@s)
  end

  # @return [Boolean] true if this object wraps text rather than an element
  def text?
    @i_am_text
  end

  # @return [String, nil] the element name, or nil for text
  # @raise [RuntimeError] if no element name can be found
  def name()
    return nil if @i_am_text

    unless @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
      raise "Cannot parse:\n#{xml_readable(@s)}"
    end
    $1
  end

  # @return [Hash{String => String}] attribute names mapped to their values
  #   (without the surrounding quotes); empty for text
  # @raise [RuntimeError] if the start tag cannot be parsed
  def attributes()
    return {} if @i_am_text

    # Remove "<element_name" from the beginning of @s; the remainder has
    # the form (name=value)* followed by ">" or "/>" and the rest of @s.
    unless @s =~ /^\s*<\s*#{name()}(.*)$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end
    remainder = $1
    retv = {}

    # Repeat until the remainder starts with ">" or "/>".
    until remainder =~ /^\s*\/?>/
      # Shave off the next name=value pair. The backreference makes sure
      # that a value quoted with ' may contain ", and vice versa.
      unless remainder =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
        raise "Cannot parse:\n #{xml_readable(remainder)}"
      end
      retv[$1] = $3
      remainder = $4
    end

    retv
  end

  # Split the element content into child elements and text pieces,
  # in document order.
  #
  # @return [Array<RegXML>] children and texts; empty for text objects and
  #   for unary elements (<bla/>)
  # @raise [RuntimeError] if the content cannot be parsed
  def children_and_text()
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element()

    # @s has the form <bla...> ... </bla>: cut off the start tag and the
    # final end tag, keeping what is in between.
    mainname = name()
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = []
    children_s = $3

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/
      # Shave off the text (if any) in front of the next "<".
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text = $1
      after_text = $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # Determine the next child's name and its start tag up to, but not
      # including, the closing "/>" or ">".
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      childname = $2
      child = $1
      rest = children_s[$&.length..-1]

      if rest.start_with?("/>")
        # next child is a unary element
        children_s = rest[2..-1]
        retv << RegXML.new(child + "/>")

      elsif rest.start_with?(">") && (close_end = matching_close_end(rest, childname))
        # The child runs up to the end tag that balances its start tag,
        # so that nested elements with the same name stay inside the child.
        children_s = rest[close_end..-1]
        retv << RegXML.new(child + rest[0...close_end])

      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Manual smoke test: parses two sample elements and prints their name,
  # attributes, children and texts to standard output.
  def self.test()
    [
      [nil, " <bla blupp='a\"b'\n" \
            " lalala=\"c\">\n" \
            " <lalala> </lalala>\n" \
            " texttext\n" \
            " <lala blupp='b'/>\n" \
            " nochtext\n" \
            " <la> <l/> </la>\n" \
            " </ bla >\n" \
            " "],
      ["NEU", " < bla blupp='a\"'/> "]
    ].each do |headline, xml|
      puts headline if headline
      bla = RegXML.new(xml)
      puts "name " + bla.name()
      puts
      puts bla.to_s()
      puts
      bla.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      bla.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    end
  end

  ##############
  protected

  # @return [Boolean] true if @s starts with "<" and ends with "/>"
  #   (ignoring surrounding whitespace), i.e. looks like <bla/>
  def unary_element()
    @s =~ /^\s*<.*\/>\s*$/ ? true : false
  end

  # Make sure we have a single XML element, either <bla/> or <bla>...</bla>.
  #
  # @raise [RuntimeError] if @s has neither form
  def element_test()
    # <bla/>
    return if unary_element()
    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check: every prefix of @s must have at least as many
  # "<" as ">", and in total both must occur equally often.
  #
  # @raise [RuntimeError] if either condition is violated
  def dyck_test()
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      if bracket == "<"
        opening += 1
      else
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # @param string [String]
  # @return [String] copy of string with a newline after every ">"
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # Find the end tag that balances an already opened element.
  #
  # @param str [String] the text following an element's start tag
  # @param elt_name [String] name of the element that has been opened
  # @return [Integer, nil] index in str just behind the balancing end tag,
  #   or nil if there is none
  def matching_close_end(str, elt_name)
    n = Regexp.escape(elt_name)
    # group 1 is set for an end tag, group 2 for a unary tag (<name ... />)
    tag = /<\s*(?:(\/)\s*#{n}\s*>|#{n}(?:\s+[\w-]+=(?:"[^"]*"|'[^']*'))*\s*(\/)?>)/
    depth = 1
    pos = 0
    while (m = tag.match(str, pos))
      if m[1]
        depth -= 1
        return m.end(0) if depth.zero?
      elsif m[2].nil?
        # nested start tag of the same name: one more end tag to skip
        depth += 1
      end
      pos = m.end(0)
    end
    nil
  end
end
267
+
268
+ # RegXML.test()
269
+
@@ -0,0 +1,194 @@
1
+ #########
2
+ # module StringTerminalsInRightOrder
3
+ #
4
+ # returns the yield of a node, or a list of nodes, as a string
5
+ # of " "-separated words
6
+ #
7
+ # Words are put into the right order, left to right,
8
+ # under the assumption that their node IDs reflect that order
9
+ #
10
+ # Terminal nodes are assumed to have IDs ending in a number,
11
+ # numbered from left to right
12
+ #
13
+ # Splitword nodes are assumed to have IDs ending in N_sM
14
+ # for numbers N and M, where N orders terminals left to right
15
+ # and M orders the splitword parts left to right
16
+ #
17
+ # If the yield of the node/the list of nodes contains all splitwords of a terminal,
18
+ # the whole terminal is taken instead
19
+ #
20
+ # methods:
21
+ #
22
+ # string_for_node returns the string for the yield of a node
23
+ # node: a node object
24
+ #
25
+ # string_for_nodes returns the string for the yield of a list of nodes
26
+ # nodes: a list of node objects
27
+
28
# Mixin that renders the yield of one node, or of a list of nodes, as a
# string of words in left-to-right order.
#
# The node objects must respond to: yield_nodes, is_splitword?, is_terminal?,
# parent, each_child, id and word. The order is derived from the numbers
# embedded in the node IDs (see the comparison helpers below).
module StringTerminalsInRightOrder
  # @param node [Object] a node object
  # @return [String] the words of the node's yield, each followed by " "
  def string_for_node(node)
    string_for_nodes([node])
  end

  # @param nodes [Array] a list of node objects
  # @return [String] the words of the nodes' yields in left-to-right order,
  #   each word followed by " " (so a non-empty result ends in a space)
  def string_for_nodes(nodes)
    terminals = right_level_terminals_for_nodes(nodes)
    ordered = sort_terminals_and_splitwords_left_to_right(terminals)
    node_array_to_string(ordered)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # Nodes that are neither splitwords nor terminals are dropped.
  #
  # @return [Array] the selected nodes without duplicates
  def right_level_terminals_for_nodes(nodes)
    a = nodes.map { |n| n.yield_nodes() }.flatten
    b = []
    a.each do |n|
      if n.is_splitword?
        parent = n.parent
        if parent.nil?
          # splitword without a parent
          b << n
        elsif b.include?(parent) || a.include?(parent)
          # the splitword's parent is already accounted for:
          # nothing to add
        else
          # check if all children of the parent are in 'a'
          all_in = true
          parent.each_child do |nsibling|
            unless a.include?(nsibling)
              all_in = false
              break
            end
          end
          # all parts present: take the whole terminal,
          # otherwise keep just this splitword
          b << (all_in ? parent : n)
        end
      elsif n.is_terminal?
        b << n
      end
    end
    b.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort do |a, b|
      if a.is_splitword? && b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? && b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    end
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method;
  # string their words together, each one followed by " "
  def node_array_to_string(nodes)
    nodes.inject("") { |s, n| s + n.word + " " }
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    term_a = splitword_terminal_i(a)
    term_b = splitword_terminal_i(b)
    if term_a == term_b
      # parts of the same terminal: compare the parts
      last_i(a) <=> last_i(b)
    else
      # parts of different terminals: compare the terminals
      term_a <=> term_b
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  #
  # @return [Integer] -1, 0 or 1; 0 (after a warning on stderr) if the pair
  #   is not one terminal plus one splitword
  def compare_mixed(a, b)
    if a.is_splitword? && b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)
    elsif a.is_terminal? && b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword? Warn, then treat the two nodes as
      # equal: a sort block must not return nil, or Array#sort raises
      # ArgumentError.
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      0
    end
  end

  # return last number of the ID of a node
  #
  # Terminates the process with exit status 1 if the ID contains no
  # usable digits.
  def last_i(n)
    id = n.id
    # Normally the ID ends in a string of digits. We might get weird node
    # IDs for splitwords, so we act gracefully and also accept one final
    # letter behind the digits.
    found = /(\d+)$/.match(id) || /(\d+)\w$/.match(id)
    if found.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print id, "\n"
      exit 1
    end
    found[1].to_i
  end

  # assume the ID of the node includes N_sM
  # return N
  #
  # Terminates the process with exit status 1 if the ID has no such part.
  def splitword_terminal_i(n)
    id = n.id
    # string of digits before the splitword ID
    found = /(\d+)_s\d*/.match(id)
    if found.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print id, "\n"
      exit 1
    end
    found[1].to_i
  end
end
193
+
194
+