shalmaneser-prep 1.2.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,269 @@
|
|
1
|
+
# RegXML
|
2
|
+
#
|
3
|
+
# Katrin Erk June 2005
|
4
|
+
|
5
|
+
# SalsaTigerRegXML: take control of the data structure, no underlying xml
|
6
|
+
# representation anymore, re-generation of xml on demand
|
7
|
+
|
8
|
+
# RegXML: a tiny regular-expression based view on one XML element (or on
# one piece of text between elements). The object keeps the raw string and
# extracts name, attributes and children from it on demand.
#
# This is not a general XML parser: no comments, CDATA, entities or
# processing instructions are handled, and attribute assignments must be
# written without whitespace around "=".
class RegXML

  # string:    string representing a single XML element (or plain text)
  # i_am_text: boolean: xml element (false) or text (true)
  #
  # Raises RuntimeError if 'string' is not a String, or if it is meant to be
  # an element but does not look like exactly one element with balanced
  # angle brackets.
  def initialize(string, i_am_text = false)
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      # elements are kept on a single line so that ^ and $ in the
      # regular expressions below span the whole element
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test()
      dyck_test()
    end
  end

  # Returns the stored string with a line break inserted after every ">".
  def to_s()
    return xml_readable(@s)
  end

  # true if this object represents text rather than an XML element
  def text?
    return @i_am_text
  end

  # Returns the element name, or nil for text.
  # Raises RuntimeError if no element name can be found.
  def name()
    # text has no name
    return nil if @i_am_text

    # xml element
    if @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
      return $1
    else
      raise "Cannot parse:\n#{xml_readable(@s)}"
    end
  end

  # Returns a hash attribute name => attribute value (both strings) for the
  # start tag of this element; {} for text.
  # Raises RuntimeError if the start tag cannot be parsed.
  def attributes()
    # text has no attributes
    return {} if @i_am_text

    # remove <element_name from the beginning of @s,
    # place the rest into elt_contents:
    # this starts with a string of the form
    # - either (name=value)* >
    # - or (name=value)* />
    tagname = name()
    unless @s =~ /^\s*<\s*#{tagname}(.*)$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end
    elt_contents = $1
    retv = {}

    # repeat until only > or /> is left at the front
    while elt_contents !~ /^\s*\/?>/
      # shave off the next name=value pair,
      # put the rest into elt_contents.
      # The backreference makes sure that if the value is quoted with ',
      # we accept " inside the value, and vice versa.
      unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
        raise "Cannot parse:\n #{xml_readable(elt_contents)}"
      end
      retv[$1] = $3
      elt_contents = $4
    end

    return retv
  end

  # Returns an array of RegXML objects, one per child element and one per
  # non-empty stretch of text between child elements, in document order.
  # Text objects and unary elements (<bla/>) have no children.
  #
  # Child elements may contain elements of the same name as themselves:
  # the end of a child is the closing tag that balances its start tag.
  #
  # Raises RuntimeError if the content cannot be parsed.
  def children_and_text()
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element()

    # @s has the form <bla...> ... </bla>.
    # remove <bla ...> from the beginning of @s,
    # place the rest up to </bla> into children_s:
    mainname = name()
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end
    children_s = $3
    retv = []

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/

      # shave off the next bit of text (everything before the next "<"),
      # put the rest into children_s
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text = $1
      after_text = $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # shave off the next child
      # and put the rest into children_s

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = $1
      childname = $2
      rest = children_s[$&.length()..-1]

      if rest.start_with?("/>")
        # next child is a unary element
        children_s = rest[2..-1]
        retv << RegXML.new(child + "/>")

      elsif rest.start_with?(">") && (close_ix = matching_close_index(rest, childname))
        # next child runs up to its balancing closing tag
        children_s = rest[close_ix..-1]
        retv << RegXML.new(child + rest[0...close_ix])

      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    return retv
  end

  # Small demo: builds two elements and prints their name, string form,
  # attributes and children to stdout.
  def RegXML.test()
    show = lambda { |elt|
      puts "name " + elt.name()
      puts
      puts elt.to_s()
      puts
      elt.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      elt.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    }

    show.call(RegXML.new(" <bla blupp='a\"b'\n" \
                         "lalala=\"c\">\n" \
                         "<lalala> </lalala>\n" \
                         "texttext\n" \
                         "<lala blupp='b'/>\n" \
                         "nochtext\n" \
                         "<la> <l/> </la>\n" \
                         "</ bla >\n"))

    puts "NEU"
    show.call(RegXML.new(" < bla blupp='a\"'/> "))
  end

  ##############
  protected

  # true if @s has the form <bla ... />
  def unary_element()
    if @s =~ /^\s*<.*\/>\s*$/
      return true
    else
      return false
    end
  end

  # make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises RuntimeError otherwise
  def element_test()
    # <bla/>
    return if unary_element()
    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check; raises RuntimeError if it fails.
  def dyck_test()
    # every prefix of @s must have at least as many < as >
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      if bracket == "<"
        opening += 1
      else
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    # and in total, @s must have equally many < and >
    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # 'rest' starts with the ">" that ends the start tag of an element called
  # 'childname'. Returns the index in 'rest' just behind the closing tag
  # that balances this start tag, or nil if there is none.
  #
  # Start tags of the same name met on the way increase the nesting depth,
  # closing tags decrease it, unary tags (<name .../>) leave it unchanged.
  def matching_close_index(rest, childname)
    same_name_tag = /<\s*\/\s*#{childname}\s*>|<\s*#{childname}(?:\s+[\w-]+=(?:"[^"]*"|'[^']*'))*\s*(\/)?>/
    depth = 1
    pos = 1 # skip the ">" of the child's own start tag

    while (m = same_name_tag.match(rest, pos))
      if m[0] =~ /\A<\s*\//
        # closing tag
        depth -= 1
        return m.end(0) if depth == 0
      elsif m[1].nil?
        # non-unary start tag of the same name: one level deeper
        depth += 1
      end
      pos = m.end(0)
    end

    return nil
  end

  # make the string easier to read: line break after each ">"
  def xml_readable(string)
    return string.gsub(/>/, ">\n")
  end
end
|
267
|
+
|
268
|
+
# RegXML.test()
|
269
|
+
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#########
|
2
|
+
# module StringTerminalsInRightOrder
|
3
|
+
#
|
4
|
+
# returns the yield of a node, or a list of nodes, as a string
|
5
|
+
# of " "-separated words
|
6
|
+
#
|
7
|
+
# Words are put into the right order, left to right,
|
8
|
+
# under the assumption that their node IDs reflect that order
|
9
|
+
#
|
10
|
+
# Terminal nodes are assumed to have IDs ending in a number,
|
11
|
+
# numbered from left to right
|
12
|
+
#
|
13
|
+
# Splitword nodes are assumed to have IDs ending in N_sM
|
14
|
+
# for numbers N and M, where N orders terminals left to right
|
15
|
+
# and M orders the splitword parts left to right
|
16
|
+
#
|
17
|
+
# If the yield of the node/the list of nodes contains all splitwords of a terminal,
|
18
|
+
# the whole terminal is taken instead
|
19
|
+
#
|
20
|
+
# methods:
|
21
|
+
#
|
22
|
+
# string_for_node returns the string for the yield of a node
|
23
|
+
# node: a node object
|
24
|
+
#
|
25
|
+
# string_for_nodes returns the string for the yield of a list of nodes
|
26
|
+
# nodes: a list of node objects
|
27
|
+
|
28
|
+
# Mixin that turns the yield of one node, or of a list of nodes, into a
# string of " "-separated words in left-to-right order.
#
# Node objects are expected to offer: id, word, parent, yield_nodes,
# each_child, is_terminal? and is_splitword?.
module StringTerminalsInRightOrder
  # returns the string for the yield of a single node
  def string_for_node(node)
    string_for_nodes([node])
  end

  # returns the string for the yield of a list of nodes;
  # every word, including the last one, is followed by one " "
  def string_for_nodes(nodes)
    picked = right_level_terminals_for_nodes(nodes)
    ordered = sort_terminals_and_splitwords_left_to_right(picked)
    return node_array_to_string(ordered)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # Returns an array of nodes without duplicates.
  def right_level_terminals_for_nodes(nodes)
    a = nodes.map { |n| n.yield_nodes() }.flatten
    b = []
    a.each { |n|
      if n.is_splitword?
        # see if a contains all parts of this splitword
        # if so, take into b the splitword's parent, the terminal,
        # rather than the individual splitwords
        parent = n.parent

        if parent.nil?
          # splitword without a parent
          b << n
        elsif b.include?(parent) || a.include?(parent)
          # the splitword's parent is already in b, or is itself part of
          # the yield: then we're done
        else
          # check if all children of the parent are in 'a'
          all_in = true
          parent.each_child { |nsibling|
            unless a.include?(nsibling)
              all_in = false
              break
            end
          }

          # all children present: take the parent;
          # some sibling of n missing: take n itself
          b << (all_in ? parent : n)
        end
      elsif n.is_terminal?
        # n is a terminal
        b << n
      end
      # if n is anything but a splitword or a terminal, ignore it
    }
    return b.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and sort them using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort { |a, b|
      if a.is_splitword? && b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? && b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    }
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method
  # string their words together, each word followed by " "
  def node_array_to_string(nodes)
    return nodes.inject("") { |s, n| s + n.word + " " }
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    term_a = splitword_terminal_i(a)
    term_b = splitword_terminal_i(b)

    if term_a == term_b
      # parts of same terminal? compare parts
      last_i(a) <=> last_i(b)
    else
      # not parts of same terminal? compare terminals
      term_a <=> term_b
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  # - anything else: warn on stderr and treat the two nodes as equal
  #   (returning nil here would make Array#sort raise an ArgumentError)
  def compare_mixed(a, b)
    if a.is_splitword? && b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)

    elsif a.is_terminal? && b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      0
    end
  end

  # return last number of the ID of a node, as an integer.
  # Terminates the program with exit status 1 if the ID has no such number.
  def last_i(n)
    n.id =~ /(\d+)$/ # match final string of digits
    if $1.nil? # this shouldn't happen _in principle_
      # but we might get weird node IDs for splitwords;
      # so we act gracefully and catch the case where there
      # is one final letter behind the digits
      n.id =~ /(\d+)\w$/
    end
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

  # assume the ID of the node includes N_sM
  # return N as an integer.
  # Terminates the program with exit status 1 if the ID has no such part.
  def splitword_terminal_i(n)
    n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end
    return $1.to_i # and return it as number
  end

end
|
193
|
+
|
194
|
+
|