shalmaneser-prep 1.2.0.rc4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +93 -0
- data/lib/frprep/Ampersand.rb +39 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/TreetaggerInterface.rb +327 -0
- data/lib/frprep/do_parses.rb +143 -0
- data/lib/frprep/frprep.rb +693 -0
- data/lib/frprep/interfaces/berkeley_interface.rb +372 -0
- data/lib/frprep/interfaces/stanford_interface.rb +353 -0
- data/lib/frprep/interpreters/berkeley_interpreter.rb +22 -0
- data/lib/frprep/interpreters/stanford_interpreter.rb +22 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +58 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +99 -0
- data/test/functional/test_rosy.rb +40 -0
- metadata +85 -0
@@ -0,0 +1,269 @@
|
|
1
|
+
# RegXML
|
2
|
+
#
|
3
|
+
# Katrin Erk June 2005
|
4
|
+
|
5
|
+
# SalsaTigerRegXML: take control of the data structure, no underlying xml
|
6
|
+
# representation anymore, re-generation of xml on demand
|
7
|
+
|
8
|
+
# RegXML wraps a string that holds either exactly one XML element or a
# piece of text, and offers regular-expression based access to the
# element's name, attributes and children. No XML library is involved.
class RegXML
  # string::    String holding a single XML element (<bla/> or
  #             <bla ...> ... </bla>), or text if i_am_text is true
  # i_am_text:: boolean: xml element (false, the default) or text (true)
  #
  # An element has its newlines replaced by blanks, is frozen, and is
  # checked with element_test and dyck_test. Text is stored unchanged
  # and unchecked.
  #
  # Raises RuntimeError if string is not a String, or if an element
  # fails one of the two checks.
  def initialize(string, i_am_text = false)
    unless string.is_a?(String)
      raise "First argument to RegXML.new must be string. I got #{string.class.to_s}"
    end

    if i_am_text
      @s = string
      @i_am_text = true
    else
      @s = string.gsub(/\n/, " ").freeze
      @i_am_text = false

      element_test
      dyck_test
    end
  end

  # Returns the stored string with a newline inserted after every ">".
  def to_s
    xml_readable(@s)
  end

  # Returns true if this object was created as text, false for an element.
  def text?
    @i_am_text
  end

  # Returns the element name as a String, or nil for text.
  # Raises RuntimeError if no element name can be found.
  def name
    return nil if @i_am_text

    unless @s =~ /^\s*<\s*([\w-]+)[\s\/>]/
      raise "Cannot parse:\n#{xml_readable(@s)}"
    end

    $1
  end

  # Returns a Hash mapping attribute names to attribute values (both
  # Strings) for the element's start tag; an empty Hash for text.
  # Raises RuntimeError if the start tag cannot be parsed.
  def attributes
    return {} if @i_am_text

    # remove <element_name from the beginning of @s,
    # place the rest into elt_contents: it starts with
    # - either (name=value)*>
    # - or (name=value)*/>
    elt_name = name
    unless @s =~ /^\s*<\s*#{elt_name}(.*)$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = {}
    elt_contents = $1

    # repeat until > or /> is next
    while elt_contents !~ /^\s*\/?>/
      # shave off the next name=value pair, keep the rest in elt_contents.
      # The backreference makes sure that a value quoted with ' may
      # contain ", and vice versa.
      unless elt_contents =~ /^\s*([\w-]+)=(['"])(.*?)\2(.*)$/
        raise "Cannot parse:\n #{xml_readable(elt_contents)}"
      end

      retv[$1] = $3
      elt_contents = $4
    end

    retv
  end

  # Returns an Array of RegXML objects, one per direct child element and
  # one per stretch of non-whitespace text between them, in document order.
  # Returns an empty Array for text and for unary elements (<bla/>).
  # Raises RuntimeError if the contents cannot be parsed.
  def children_and_text
    return [] if @i_am_text
    # <bla/>, no children
    return [] if unary_element

    # @s has the form <bla...> ... </bla>.
    # remove <bla ...> from the beginning of @s,
    # place the rest up to </bla> into children_s:
    mainname = name
    unless @s =~ /^\s*<\s*#{mainname}(\s+[\w-]+=(["']).*?\2)*\s*>(.*?)<\/\s*#{mainname}\s*>\s*$/
      raise "Cannot parse:\n #{xml_readable(@s)}"
    end

    retv = []
    children_s = $3

    # repeat until only whitespace is left
    while children_s !~ /^\s*$/
      # shave off the next bit of text, put the rest into children_s
      unless children_s =~ /^\s*(.*?)(<.*$|$)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      text = $1
      after_text = $2
      unless text.strip.empty?
        children_s = after_text
        retv << RegXML.new(text, true)
      end

      # anything left after we've parsed text?
      break if children_s =~ /^\s*$/

      # determine the next child's name, and the string index at which
      # the element start tag ends with either / or >
      unless children_s =~ /^\s*(<\s*([\w-]+)(\s+[\w-]+=(["']).*?\4)*\s*)/
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n #{xml_readable(children_s)}"
      end
      child = $1
      childname = $2
      rest = children_s[$&.length..-1]

      # for a non-unary child: index in rest just behind its end tag
      close_ix = rest.start_with?(">") ? matching_close_tag_end(rest, childname) : nil

      # shave off the child and put the rest into children_s
      if rest =~ /^\/>(.*)$/
        # next child is a unary element
        children_s = $1
        retv << RegXML.new(child + "/>")
      elsif close_ix
        # next child has the form <name ...> ... </name>
        children_s = rest[close_ix..-1]
        retv << RegXML.new(child + rest[0...close_ix])
      else
        $stderr.puts "Whole was:\n #{xml_readable(@s)}"
        $stderr.puts
        raise "Cannot parse:\n#{xml_readable(children_s)}"
      end
    end

    retv
  end

  # Manual smoke test: parses two sample elements and prints their name,
  # readable form, attributes and children to standard output.
  def self.test
    samples = [
      [" <bla blupp='a\"b'",
       "lalala=\"c\">",
       "<lalala> </lalala>",
       "texttext",
       "<lala blupp='b'/>",
       "nochtext",
       "<la> <l/> </la>",
       "</ bla >",
       ""].join("\n"),
      " < bla blupp='a\"'/> "
    ]

    samples.each_with_index do |sample, ix|
      puts "NEU" if ix > 0
      bla = RegXML.new(sample)
      puts "name " + bla.name
      puts
      puts bla.to_s
      puts
      bla.attributes.each { |attr, val|
        puts "attr " + attr + "=" + val
      }
      puts
      bla.children_and_text.each { |child_obj|
        if child_obj.text?
          puts "da text " + child_obj.to_s
        else
          puts "da child " + child_obj.to_s
        end
      }
      puts
    end
  end

  ##############
  protected

  # Returns true if @s starts with "<" and ends with "/>" (ignoring
  # surrounding whitespace), i.e. looks like <bla/>; false otherwise.
  def unary_element
    if @s =~ /^\s*<.*\/>\s*$/
      true
    else
      false
    end
  end

  # make sure we have a single XML element, either <bla/> or
  # <bla>...</bla>; raises RuntimeError otherwise
  def element_test
    # <bla/>
    return if unary_element
    # <bla > ... </bla>
    return if @s =~ /^\s*<\s*([\w-]+)\W.*?<\/\s*\1\s*>\s*$/

    raise "Cannot parse:\n #{xml_readable(@s)}"
  end

  # Bracket balance check; raises RuntimeError on failure.
  def dyck_test
    # every prefix of @s must have at least as many < as >
    opening = 0
    closing = 0
    @s.scan(/[<>]/) { |bracket|
      case bracket
      when "<"
        opening += 1
      when ">"
        closing += 1
        if closing > opening
          raise "More closing than opening brackets in prefix of:\n #{xml_readable(@s)}"
        end
      end
    }

    # and in total, @s must have equally many < and >
    unless @s.count("<") == @s.count(">")
      raise "Inequal number of brackets in:\n #{xml_readable(@s)}"
    end
  end

  # Returns a copy of string with a newline inserted after every ">".
  def xml_readable(string)
    string.gsub(/>/, ">\n")
  end

  # rest::      what follows a child's start tag, beginning with its ">"
  # childname:: name of that child element
  #
  # Returns the index in rest just behind the end tag that closes the
  # child, or nil if there is none. Start tags of the same name that
  # occur in between are counted, so that a child containing elements
  # of its own name is not cut off at the first end tag. Unary tags
  # (<name .../>) do not change the nesting depth.
  def matching_close_tag_end(rest, childname)
    escaped = Regexp.escape(childname)
    # first alternative: end tag; second alternative: start or unary tag,
    # where group 2 holds the "/" of a unary tag (nil for an end tag)
    tag = /<\s*\/\s*#{escaped}\s*>|<\s*#{escaped}(?:\s+[\w-]+=(["']).*?\1)*\s*(\/?)>/

    depth = 1
    pos = 0
    while (m = tag.match(rest, pos))
      if m[2].nil?
        depth -= 1
        return m.end(0) if depth.zero?
      elsif m[2].empty?
        depth += 1
      end
      pos = m.end(0)
    end

    nil
  end
end
|
267
|
+
|
268
|
+
# RegXML.test()
|
269
|
+
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#########
|
2
|
+
# module StringTerminalsInRightOrder
|
3
|
+
#
|
4
|
+
# returns the yield of a node, or a list of nodes, as a string
|
5
|
+
# of " "-separated words
|
6
|
+
#
|
7
|
+
# Words are put into the right order, left to right,
|
8
|
+
# under the assumption that their node IDs reflect that order
|
9
|
+
#
|
10
|
+
# Terminal nodes are assumed to have IDs ending in a number,
|
11
|
+
# numbered from left to right
|
12
|
+
#
|
13
|
+
# Splitword nodes are assumed to have IDs ending in N_sM
|
14
|
+
# for numbers N and M, where N orders terminals left to right
|
15
|
+
# and M orders the splitword parts left to right
|
16
|
+
#
|
17
|
+
# If the yield of the node/the list of nodes contains all splitwords of a terminal,
|
18
|
+
# the whole terminal is taken instead
|
19
|
+
#
|
20
|
+
# methods:
|
21
|
+
#
|
22
|
+
# string_for_node returns the string for the yield of a node
|
23
|
+
# node: a node object
|
24
|
+
#
|
25
|
+
# string_for_nodes returns the string for the yield of a list of nodes
|
26
|
+
# nodes: a list of node objects
|
27
|
+
|
28
|
+
# Mixin that turns the yield of one or more nodes into a string of
# " "-separated words, ordered left to right by the numbers in the node IDs.
#
# Node objects are expected to respond to yield_nodes, is_splitword?,
# is_terminal?, parent, each_child, id and word.
module StringTerminalsInRightOrder
  # Returns the string for the yield of a single node.
  #
  # node:: a node object
  def string_for_node(node)
    string_for_nodes([node])
  end

  # Returns the string for the yield of a list of nodes: the words of
  # the terminals/splitwords, each followed by one blank.
  #
  # nodes:: a list of node objects
  def string_for_nodes(nodes)
    terminals = right_level_terminals_for_nodes(nodes)
    ordered = sort_terminals_and_splitwords_left_to_right(terminals)
    node_array_to_string(ordered)
  end

  #####
  private

  # right_level_terminals_for_nodes:
  # - compute the yield for each element of 'nodes'
  # - then consider all splitwords in the yield:
  #   if all splitwords of a terminal are in the yield,
  #   then use the terminal rather than its splitwords
  # Returns the resulting nodes without duplicates.
  def right_level_terminals_for_nodes(nodes)
    in_yield = nodes.map { |n| n.yield_nodes }.flatten
    chosen = []

    in_yield.each { |n|
      if n.is_splitword?
        # see if the yield contains all parts of this splitword.
        # if so, take the splitword's parent, the terminal,
        # rather than the individual splitwords
        parent = n.parent

        if parent.nil?
          # splitword without a parent
          chosen << n
        elsif chosen.include?(parent) || in_yield.include?(parent)
          # the parent is already chosen, or is itself part of the yield:
          # nothing to add for this splitword
        else
          # check if all children of the parent are in the yield
          all_in = true
          parent.each_child { |nsibling|
            unless in_yield.include?(nsibling)
              all_in = false
              break
            end
          }

          # all parts present: take the terminal; otherwise the splitword
          chosen << (all_in ? parent : n)
        end
      elsif n.is_terminal?
        chosen << n
      end
      # anything that is neither a splitword nor a terminal is ignored
    }

    chosen.uniq
  end

  # sort_terminals_and_splitwords_left_to_right:
  # take an array of nodes that consists of terminals and splitwords
  # and return a new array sorted using the following comparison:
  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  def sort_terminals_and_splitwords_left_to_right(nodes)
    nodes.sort { |a, b|
      if a.is_splitword? && b.is_splitword?
        compare_splitwords(a, b)
      elsif a.is_terminal? && b.is_terminal?
        compare_terminals(a, b)
      else
        compare_mixed(a, b)
      end
    }
  end

  # node_array_to_string:
  # 'nodes' is an array of node objects, each of which offers a "word" method.
  # String their words together, each word followed by " "
  # (so a non-empty result ends in a blank).
  def node_array_to_string(nodes)
    s = "".dup
    nodes.each { |n|
      # append in place instead of building a new string per word
      s << n.word << " "
    }
    s
  end

  # - when comparing two terminals, use the
  #   last numbers in their respective IDs
  def compare_terminals(a, b)
    last_i(a) <=> last_i(b)
  end

  # - when comparing two splitwords, their IDs end in _N_sM
  #   for numbers N and M.
  #   If they coincide in N, compare them by M,
  #   else compare them by N
  def compare_splitwords(a, b)
    terminal_a = splitword_terminal_i(a)
    terminal_b = splitword_terminal_i(b)

    if terminal_a == terminal_b
      # parts of same terminal: compare parts
      last_i(a) <=> last_i(b)
    else
      # not parts of same terminal: compare terminals
      terminal_a <=> terminal_b
    end
  end

  # - when comparing a terminal and a splitword,
  #   compare the terminal's last number to the splitword's N
  # If the pair is not one terminal and one splitword, a warning is
  # written to $stderr and the two nodes are treated as equal (0), so
  # that the surrounding sort gets a valid comparison result.
  def compare_mixed(a, b)
    if a.is_splitword? && b.is_terminal?
      splitword_terminal_i(a) <=> last_i(b)
    elsif a.is_terminal? && b.is_splitword?
      last_i(a) <=> splitword_terminal_i(b)
    else
      # not one terminal, one splitword?
      $stderr.print "SalsaTigerSentence, compare_mixed: confused by "
      $stderr.print a.id, ", ", b.id, "\n"
      0
    end
  end

  # return last number of the ID of a node, as an Integer.
  # Writes a message to $stderr and exits with status 1 if the ID
  # contains no suitable digits.
  def last_i(n)
    n.id =~ /(\d+)$/ # match final string of digits
    if $1.nil?
      # shouldn't happen _in principle_,
      # but we might get weird node IDs for splitwords;
      # so we act gracefully and catch the case where there
      # is one final letter behind the digits
      n.id =~ /(\d+)\w$/
    end

    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, last_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end

    $1.to_i
  end

  # assume the ID of the node includes N_sM
  # return N, as an Integer.
  # Writes a message to $stderr and exits with status 1 if the ID
  # does not contain such a part.
  def splitword_terminal_i(n)
    n.id =~ /(\d+)_s\d*/ # match string of digits before splitword ID
    if $1.nil? # this shouldn't ever happen
      $stderr.print "SalsaTigerSentence, splitword_terminal_i: Couldn't extract digits from: "
      $stderr.print n.id, "\n"
      exit 1
    end

    $1.to_i
  end
end
|
193
|
+
|
194
|
+
|