treebank 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +59 -0
- data/examples/penntb-words +49 -0
- data/lib/treebank.rb +446 -0
- data/test/test_treebank.rb +238 -0
- metadata +61 -0
data/README
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
= Tree Module
|
2
|
+
|
3
|
+
This module supports the creation, search, manipulation, and
|
4
|
+
serialization of tree structures.
|
5
|
+
|
6
|
+
Trees are implemented with Node objects. Each Node has a writable
|
7
|
+
_label_ that may be any arbitrary object and a list of other child
|
8
|
+
Node objects. Node objects support breadth and depth first iteration.
|
9
|
+
|
10
|
+
irb(main):001:0> require 'treebank'
|
11
|
+
=> true
|
12
|
+
irb(main):002:0> p = Treebank::Node.new('parent')
|
13
|
+
=> <Treebank::Node parent []>
|
14
|
+
irb(main):003:0> p.create_child!('child1')
|
15
|
+
=> <Treebank::Node child1 []>
|
16
|
+
irb(main):004:0> p.create_child!('child2')
|
17
|
+
=> <Treebank::Node child2 []>
|
18
|
+
|
19
|
+
Node has a subclass ParentedNode that keeps track of the parent of the
|
20
|
+
given node and has methods for iterating up the ancestor tree.
|
21
|
+
|
22
|
+
The default stringification method writes a node and all its children
|
23
|
+
in a bracketed tree format.
|
24
|
+
|
25
|
+
irb(main):005:0> puts p
|
26
|
+
(parent (child1) (child2))
|
27
|
+
=> nil
|
28
|
+
|
29
|
+
Bracketed tree strings can be used to create Node trees.
|
30
|
+
|
31
|
+
irb(main):006:0> t = Treebank::Node.new.from_s('(parent (child1) (child2))')
|
32
|
+
=> <Treebank::Node parent [child1 child2]>
|
33
|
+
irb(main):007:0> puts t
|
34
|
+
(parent (child1) (child2))
|
35
|
+
=> nil
|
36
|
+
|
37
|
+
The bracketed tree format is the one used by the Penn
|
38
|
+
Treebank[http://www.cis.upenn.edu/~treebank/] Project to annotate
|
39
|
+
linguistic structure.
|
40
|
+
|
41
|
+
= History
|
42
|
+
|
43
|
+
* 1-0-0 ... First release
|
44
|
+
|
45
|
+
= See Also
|
46
|
+
|
47
|
+
Lingua::Treebank[http://search.cpan.org/~kahn/Lingua-Treebank-0.14/Treebank.pm]
|
48
|
+
implements similar functionality in Perl.
|
49
|
+
|
50
|
+
= Copyright
|
51
|
+
|
52
|
+
Copyright 2006, William Patrick McNeill
|
53
|
+
|
54
|
+
This program is distributed under the GNU General Public License.
|
55
|
+
|
56
|
+
= Author
|
57
|
+
|
58
|
+
W.P. McNeill mailto:billmcn@u.washington.edu
|
59
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Treebank.
|
7
|
+
#
|
8
|
+
# Treebank is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Treebank is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with Treebank; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Print all the text in Penn Treebank parse files.
|
25
|
+
|
26
|
+
require 'treebank'
|
27
|
+
|
28
|
+
# A Penn Treebank File
|
29
|
+
#
|
30
|
+
# This class omits any comment lines when enumerating the lines in the
|
31
|
+
# file.
|
32
|
+
class TreebankFile < File
  # Matches a comment: the "*x" marker and everything after it up to (but
  # not including) the line terminator.
  COMMENT = /\*x.*/

  # Enumerate the lines of the file with comments removed.
  #
  # Each line has its comment text stripped.  Lines that contain nothing but
  # whitespace once the comment is removed (including lines that were
  # entirely a comment) are skipped, so comment lines really are omitted.
  #
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    super() do |line|
      text = line.sub(COMMENT, '')
      yield text unless text.strip.empty?
    end
  end
end
|
40
|
+
|
41
|
+
# Enumerate all the file names specified on the command line, opening
|
42
|
+
# each one and printing the strings in the trees it contains.
|
43
|
+
# For every path given on the command line, parse the trees in that file
# and print the labels of each tree's leaves on a single line.
ARGV.each do |path|
  TreebankFile.open(path) do |source|
    parser = Treebank::Parser.new(source)
    parser.each do |tree|
      words = tree.leaves { |leaf| leaf.label }
      puts words.join(' ')
    end
  end
end
|
data/lib/treebank.rb
ADDED
@@ -0,0 +1,446 @@
|
|
1
|
+
# Copyright 2006 William Patrick McNeill
|
2
|
+
#
|
3
|
+
# Treebank is free software; you can redistribute it and/or modify it
|
4
|
+
# under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation; either version 2 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# Treebank is distributed in the hope that it will be useful, but
|
9
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with Treebank; if not, write to the Free Software Foundation,
|
15
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
|
18
|
+
# Treebank is the namespace that contains all tree-related functions.
|
19
|
+
module Treebank
|
20
|
+
|
21
|
+
# An enumerable list of tokens in a string representation of a tree
|
22
|
+
#
|
23
|
+
# This class provides a way of enumerating over a source to produce
|
24
|
+
# tokens that can be used in parsing a string representation of a
|
25
|
+
# tree. The source is an enumerable object whose _each_ function
|
26
|
+
# returns a sequence of String objects, for example a file or a
|
27
|
+
# single String. Each returned string is delimited by left and
|
28
|
+
# right brackets and whitespace. The default brackets are '(' and
|
29
|
+
# ')', but different delimiters may be specified in the constructor.
|
30
|
+
#
|
31
|
+
# Treebank::TokenStream.new('(A (B c) (D))').collect
|
32
|
+
# => ["(", "A", "(", "B", "c", ")", "(", "D", ")", ")"]
|
33
|
+
class TokenStream
  include Enumerable

  # The left delimiter
  attr_reader :left

  # The right delimiter
  attr_reader :right

  # Constructor
  #
  # * source ... the string stream to tokenize: either an object whose
  #   _each_ yields strings (a file, an array of lines) or a single String
  # * left ... left bracket symbol (a single character)
  # * right ... right bracket symbol (a single character)
  def initialize(source, left = '(', right = ')')
    @source = source
    @left = left
    @right = right
    # Regexp.escape makes the delimiters literal both as alternatives and
    # inside the negated character class, whatever characters they are.
    l = Regexp.escape(left)
    r = Regexp.escape(right)
    # Delimit by left and right brackets, e.g. /\(|\)|[^\(\)]+/
    @s_regex = Regexp.new("#{l}|#{r}|[^#{l}#{r}]+")
  end

  # Enumerate the tokens in the source
  #
  # Returns an Enumerator when called without a block.
  def each(&block)
    return enum_for(:each) unless block_given?

    # A String has no #each on Ruby 1.9 and later, so fall back to its
    # lines.  Sources that do define #each (files, arrays, custom
    # enumerables) are used as they are so that overridden #each methods
    # are honoured.
    strings = @source.respond_to?(:each) ? @source : @source.each_line
    strings.each { |string| tokenize_string(string, &block) }
  end

  # Tokenize the source string
  #
  # * string ... the string to tokenize
  #
  # Each bracket is yielded as its own token; the text between brackets is
  # split on whitespace and each piece is yielded.
  def tokenize_string(string)
    string.scan(@s_regex) do |bracket_delimited|
      bracket_delimited.split.each { |token| yield token }
    end
  end

  protected :tokenize_string

end # TokenStream
|
78
|
+
|
79
|
+
# A parser for string representations of trees
|
80
|
+
#
|
81
|
+
# This class uses a simplified shift-reduce parser to convert a
|
82
|
+
# string into a list of tree structures.
|
83
|
+
#
|
84
|
+
# Treebank::Parser.new('(A) (B (C) (D))').collect
|
85
|
+
# => [<Treebank::Node A []>, <Treebank::Node B [C D]>]
|
86
|
+
#
|
87
|
+
# The string representation of a list of trees has the following BNF
|
88
|
+
# definition
|
89
|
+
#
|
90
|
+
# * trees -> node*
|
91
|
+
# * node -> (label? children)
|
92
|
+
# * label -> word
|
93
|
+
# * children -> node*|word
|
94
|
+
# * word -> \w+
|
95
|
+
#
|
96
|
+
# Note that the BNF definition of children allows a shortcut in
|
97
|
+
# which the labels of terminal nodes may be specified without
|
98
|
+
# brackets. So, for example, <tt>(A (B))</tt> and <tt>(A B)</tt>
|
99
|
+
# are equivalent.
|
100
|
+
#
|
101
|
+
# The trees returned by this class are caller-defined node objects,
|
102
|
+
# where each node has a list of child nodes.
|
103
|
+
class Parser
  include Enumerable

  # Constructor
  #
  # * tokens ... stream of tokens to be converted into trees
  # * node_class ... class of node to create
  #
  # If _tokens_ is not a kind of TokenStream object it will be used
  # as the source stream of one.
  def initialize(tokens, node_class = Node)
    tokens = TokenStream.new(tokens) unless tokens.kind_of?(TokenStream)
    @tokens = tokens
    @node_class = node_class
  end

  # Enumerate the tokens yielding trees
  #
  # Raises a RuntimeError when a right bracket has no matching left
  # bracket, when a left bracket is never closed, or when text is left
  # over outside any bracket.  Returns an Enumerator when called without
  # a block.
  def each # :yields: tree
    return enum_for(:each) unless block_given?

    # The parse stack holds :left markers, label strings and reduced nodes.
    parse = []
    @tokens.each do |token|
      case token
      when @tokens.left
        parse << :left
      when @tokens.right
        # Reduce the end of the parse stack.
        left_index = parse.rindex(:left)
        raise "Extra #{@tokens.right}" if left_index.nil?
        parse[left_index..-1] = [reduce(parse[(left_index + 1)..-1])]
        # If the reduced stack consists of a single node, it must be
        # a complete tree.
        yield parse.pop if parse.length == 1
      else
        parse << token
      end
    end
    return if parse.empty?

    # Only blame an unmatched left bracket when one is actually pending;
    # otherwise the leftovers are words or trees outside any bracket.
    raise "Extra #{@tokens.left}: #{parse}" if parse.include?(:left)
    raise "Unbracketed text: #{parse}"
  end

  # Convert the end of the parse list into a single node
  #
  # * node_parse ... a list of labels and nodes
  #
  # A leading string becomes the node's label.  A single remaining string
  # is the bracketless terminal shortcut, e.g. '(V ran)', and becomes a
  # child created from that label.  Anything else is attached as a child.
  def reduce(node_parse)
    node = @node_class.new
    if node_parse.first.is_a?(String)
      node.label = node_parse.shift
      if node_parse.length == 1 && node_parse.last.is_a?(String)
        node.create_child!(node_parse.last)
        return node
      end
    end
    node_parse.each { |child| node.attach_child!(child) }
    node
  end

  protected :reduce

end # Parser
|
163
|
+
|
164
|
+
|
165
|
+
# A node in a tree
|
166
|
+
#
|
167
|
+
# A Node consists of a label, which may be any arbitrary Object, and
|
168
|
+
# a list of children, which are also Node objects.
|
169
|
+
class Node
  include Enumerable

  # Iterates a tree breadth-first
  class BFSIterator
    include Enumerable

    # Constructor
    #
    # * node ... the start node of the enumeration
    # * visit ... optional enumeration control procedure
    #
    # The optional _visit_ argument can be used to control which
    # children are visited by this iterator.  If specified, it is
    # called for every child node, and only those nodes returning +true+
    # will be visited.  The start node itself is always visited.
    def initialize(node, visit = nil)
      @node = node
      @visit = visit
    end

    # Enumerate the nodes
    #
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @agenda = [@node]
      while (node = @agenda.shift)
        yield node
        # to_a (rather than a blockless collect, which returns an
        # Enumerator on Ruby 1.9 and later) gives a real Array of children.
        children = @visit ? node.select { |n| @visit.call(n) } : node.to_a
        recurse(children)
      end
    end

    # Function that controls enumeration recursion
    #
    # * children ... a list of child nodes of the current node
    #
    # Children go to the back of the agenda, giving breadth-first order.
    def recurse(children)
      @agenda += children
    end
  end # BFSIterator

  # Iterates a tree depth-first
  class DFSIterator < BFSIterator

    # Function that controls enumeration recursion
    #
    # * children ... a list of child nodes of the current node
    #
    # Children go to the front of the agenda, giving depth-first order.
    def recurse(children)
      @agenda = children + @agenda
    end
  end # DFSIterator

  # This node's label
  attr_accessor :label

  # Constructor
  #
  # * label ... the label of this node
  # * child_labels ... list of labels for children of this node
  def initialize(label = nil, child_labels = [])
    @label = label
    @children = []
    child_labels.each { |child_label| create_child!(child_label) }
  end

  # Read the tree from a bracketed string
  #
  # * s ... bracketed string
  # * left ... left bracket symbol
  # * right ... right bracket symbol
  #
  # This function uses a Treebank::Parser object to create the tree from
  # _s_.  This raises an error if this node is not empty, or if _s_ does
  # not define exactly one tree.
  def from_s(s, left = '(', right = ')')
    raise 'This node is not empty.' unless empty?
    nodes = Parser.new(TokenStream.new(s, left, right), self.class).to_a
    raise "#{s} defines no tree" if nodes.empty?
    raise "#{s} defines multiple trees" if nodes.length != 1
    root = nodes.first
    @label = root.label
    # Move the children over through detach/attach rather than copying the
    # list, so that subclasses which track extra per-child state in those
    # methods see this node, not the temporary parsed root, as the owner.
    root.to_a.each do |child|
      root.detach_child!(child)
      attach_child!(child)
    end
    self
  end

  # Stringify
  #
  # This writes to a bracketed string representation that can be
  # read by the Parser object.
  def to_s
    space = leaf? ? '' : ' '
    "(#{label}#{space}#{@children.map { |child| child.to_s }.join(' ')})"
  end

  # Interactive stringification
  def inspect
    child_labels = @children.map { |n| n.label }
    "<#{self.class} #{@label} [#{child_labels.join(' ')}]>"
  end

  # Tree equivalence operator
  #
  # Two trees are equivalent if the other object is a tree, the labels
  # match, and the child lists are pairwise equivalent.  Unlike a comparison
  # of flattened depth-first label lists, this distinguishes trees of
  # different shape and trees where one has extra nodes.
  def ==(other)
    return true if equal?(other)
    return false unless other.kind_of?(self.class)
    label == other.label && to_a == other.to_a
  end

  # Create a new node and add it as a child of this node
  #
  # * label ... the label of a node to create
  # * index ... optional insertion index
  #
  # If _index_ is not specified, the node is added to the end of the
  # child list.
  #
  # This function returns the added Node object.
  def create_child!(label, index = nil)
    attach_child!(self.class.new(label), index)
  end

  # Attach an existing node as the child of this node
  #
  # * node ... the node to add
  # * index ... optional insertion index
  #
  # _node_ must be the same type as this node.
  #
  # If _index_ is not specified, the node is added to the end of the
  # child list.
  #
  # This function returns the added Node object.
  def attach_child!(node, index = nil)
    raise "#{node} is not class #{self.class}" if node.class != self.class
    if index.nil?
      @children << node
    else
      @children[index, 0] = node
    end
    node
  end

  # Detach a child node
  #
  # * node ... the node to detach
  #
  # This removes the specified node from this node's child list.  The
  # child is located by object identity, so siblings that merely compare
  # equal to it are left in place.  Raises an error if _node_ is not a
  # child of this node.
  def detach_child!(node)
    index = @children.index { |child| child.equal?(node) }
    raise "#{node} is not a child of #{self}" if index.nil?
    @children.delete_at(index)
    nil
  end

  # Enumerate the children of this node.
  #
  # Returns an Enumerator when called without a block.
  def each(&block)
    return enum_for(:each) unless block_given?
    @children.each(&block)
  end

  # Enumerate all the nodes beneath this one breadth-first
  #
  # * visit ... optional enumeration control procedure
  #
  # The _visit_ parameter is passed down to the BFSIterator.
  def each_breadth_first(visit = nil)
    BFSIterator.new(self, visit)
  end

  # Enumerate all the nodes beneath this one depth-first
  #
  # * visit ... optional enumeration control procedure
  #
  # The _visit_ parameter is passed down to the DFSIterator.
  def each_depth_first(visit = nil)
    DFSIterator.new(self, visit)
  end

  # Is this a leaf node?
  def leaf?
    @children.empty?
  end

  # Is this node empty?
  def empty?
    @label.nil? && @children.empty?
  end

  # All the leaf nodes beneath this node
  #
  # * block ... an optional block to run on each leaf
  #
  # Without a block the leaf nodes are returned in depth-first order; with
  # a block, the block's result for each leaf is returned instead.
  def leaves(&block)
    found = each_depth_first.select { |node| node.leaf? }
    block.nil? ? found : found.map { |leaf| block.call(leaf) }
  end

end # Node
|
371
|
+
|
372
|
+
|
373
|
+
# A Node in a Tree that can locate its parent
|
374
|
+
#
|
375
|
+
# The ParentedNode adds a pointer back to the parent node to
|
376
|
+
# the Node class.
|
377
|
+
class ParentedNode < Node

  # Walks from a start node up through its chain of parents
  class ParentIterator
    include Enumerable

    # * node ... the node the walk starts from
    def initialize(node)
      @node = node
    end

    # Yield the start node, then its parent, and so on until a node
    # without a parent has been yielded.
    def each
      current = @node
      until current.nil?
        yield current
        current = current.parent
      end
    end

  end # ParentIterator

  # This node's parent
  attr_reader :parent

  # Set the parent of this node
  #
  # This is a protected utility function.  It does not change the
  # child list of the parent.
  attr_writer :parent
  protected :parent=

  # Constructor
  #
  # * label ... the label of this node
  # * child_labels ... list of labels for children of this node
  # * parent ... the parent of this node
  def initialize(label = nil, child_labels = [], parent = nil)
    super(label, child_labels)
    @parent = parent
  end

  # Attach _node_ as in Treebank::Node.attach_child!, then record this
  # node as its parent.  Returns the attached node.
  def attach_child!(node, index = nil)
    attached = super(node, index)
    attached.parent = self
    attached
  end

  # Detach _node_ as in Treebank::Node.detach_child!, then clear its
  # parent.
  def detach_child!(node)
    super(node)
    node.parent = nil
  end

  # Enumerate this node followed by its ancestors
  def each_parent
    ParentIterator.new(self)
  end

end # ParentedNode
|
445
|
+
|
446
|
+
end
|
@@ -0,0 +1,238 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Treebank.
|
7
|
+
#
|
8
|
+
# Treebank is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Treebank is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with Treebank; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Test cases for the Treebank module
|
25
|
+
|
26
|
+
require 'test/unit'
|
27
|
+
require 'treebank'
|
28
|
+
|
29
|
+
# Tokenization of bracketed strings with default and custom delimiters.
class TokenStreamTest < Test::Unit::TestCase
  def test_basic_token_stream
    assert_stream ['(', 'A', '(', 'B', 'c', ')', '(', 'D', ')', ')'],
                  Treebank::TokenStream.new('(A (B c) (D))'), '(', ')'
  end

  def test_non_alphanum_token_stream
    assert_stream ['(', 'A!Node', '(', 'B!Node', '.', ')', '(', '14', ')', ')'],
                  Treebank::TokenStream.new('(A!Node (B!Node .) (14))'), '(', ')'
  end

  def test_different_delimiter
    assert_stream ['<', 'A', '<', 'B', 'c', '>', '<', 'D', '>', '>'],
                  Treebank::TokenStream.new('<A <B c> <D>>', '<', '>'), '<', '>'
  end

  def test_bracket_delimiter
    assert_stream ['[', 'A', '[', 'B', 'c', ']', '[', 'D', ']', ']'],
                  Treebank::TokenStream.new('[A [B c] [D]]', '[', ']'), '[', ']'
  end

  private

  # Check the type, delimiters and token list of a stream.
  #
  # The tokens are read with to_a: a blockless collect returns an
  # Enumerator on Ruby 1.9 and later, which never equals an Array.
  def assert_stream(tokens, stream, left, right)
    assert_kind_of Treebank::TokenStream, stream
    assert_equal left, stream.left
    assert_equal right, stream.right
    assert_equal tokens, stream.to_a
  end

end
|
63
|
+
|
64
|
+
|
65
|
+
# Parser tests shared by every node class.  The including test case must
# set @node_class in its setup method.
module ParseTreeMixin

  # Parse tree string
  def test_tree_parse
    trees = parse_trees('(A) (B)')
    assert_equal [tree_from('(A)'), tree_from('(B)')], trees
    assert_kind_of @node_class, trees[0]
    assert_kind_of @node_class, trees[1]
    t = parse_trees('(A (B) (C))').first
    # to_a gives a real Array of children; a blockless collect returns an
    # Enumerator on Ruby 1.9 and later, which has no #last.
    children = t.to_a
    assert_equal 'A', t.label
    assert_equal 'B', children.first.label
    assert_equal 'C', children.last.label
    assert_equal [@node_class.new], parse_trees('()')
  end

  # A plain string may be handed straight to the parser.
  def test_string_in_constructor
    assert_equal [tree_from('(A)'), tree_from('(B)')],
                 Treebank::Parser.new('(A) (B)', @node_class).to_a
  end

  private

  # All trees parsed from _s_ through an explicit TokenStream.
  def parse_trees(s)
    Treebank::Parser.new(Treebank::TokenStream.new(s), @node_class).to_a
  end

  # A single tree of the class under test read from _s_.
  def tree_from(s)
    @node_class.new.from_s(s)
  end

end
|
86
|
+
|
87
|
+
|
88
|
+
# Runs the shared parser tests against plain Node trees.
class TreeParserTest < Test::Unit::TestCase
  include ParseTreeMixin

  # Select the node class exercised by the mixin's tests.
  def setup
    @node_class = Treebank::Node
  end
end
|
97
|
+
|
98
|
+
|
99
|
+
# Runs the shared parser tests against ParentedNode trees.
class ParentedTreeParserTest < Test::Unit::TestCase
  include ParseTreeMixin

  # Select the node class exercised by the mixin's tests.
  def setup
    @node_class = Treebank::ParentedNode
  end
end
|
108
|
+
|
109
|
+
|
110
|
+
# Node tests shared by every node class.  The including test case must set
# @node_class in its setup method.
#
# Child and iterator contents are read with to_a/map throughout: a
# blockless collect returns an Enumerator on Ruby 1.9 and later, which
# never compares equal to an Array.
module NodeTestMixin

  # Empty tree
  def test_empty_tree
    t = @node_class.new
    assert_kind_of @node_class, t, 'Empty tree type'
    assert_nil t.label, 'Empty tree nil head'
    assert t.empty?, 'Empty empty?'
    assert_equal [], t.to_a, 'Empty child list'
    assert_equal [t], t.each_breadth_first.to_a, 'Empty breadth first'
    assert_equal [t], t.each_depth_first.to_a, 'Empty depth first'
  end

  # Test a single node tree
  def test_single_node_tree
    t = @node_class.new('a')
    assert_equal 'a', t.label, 'Single node label'
    assert !t.empty?, 'Single node not empty?'
    assert_equal [], t.to_a, 'Empty child list'
    assert_equal [t], t.each_breadth_first.to_a, 'Single node breadth first'
    assert_equal [t], t.each_depth_first.to_a, 'Single depth first'
  end

  # Test adding children in the constructor
  def test_constructor_children
    t = @node_class.new('a', ['b', 'c', 'd'])
    assert_equal @node_class.new.from_s('(a (b) (c) (d) )'), t, 'Children in constructor'
  end

  # Add children
  def test_add_children
    # Add a child of the head node.
    a = @node_class.new('a')
    b = a.create_child!('b')
    assert_kind_of @node_class, b, 'create_child! return value type'
    assert_equal 'b', b.label, 'create_child! return value label'
    assert_equal [b], a.to_a, 'a children'
    assert_equal [], b.to_a, 'b children'
    assert !a.empty?, 'create_child! not empty?'
    # Insert children at explicit positions.
    a = @node_class.new('a', ['b', 'c', 'd'])
    a.create_child!('z', 0)
    assert_equal ['z', 'b', 'c', 'd'], a.map { |n| n.label }
    a.create_child!('x', 2)
    assert_equal ['z', 'b', 'x', 'c', 'd'], a.map { |n| n.label }
  end

  # Read from/to a string
  def test_stringify
    s = '(S (NP (D (the)) (N (boy))) (VP (V (ran))))'
    multiline_s = "(S\n  (NP\n    (D (the))\n    (N (boy)))\n  (VP\n    (V (ran))))"
    t = @node_class.new.from_s(s)
    assert_kind_of @node_class, t, 'from_s'
    assert_equal s, "#{t}", 'to_s'
    m = @node_class.new.from_s(multiline_s)
    assert_equal t, m, 'Single-/multi-line equal'
  end

  # Simple enumeration
  def test_enumeration
    # Enumerate all children.
    t = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    assert_equal ['a', 'b', 'c', 'R', 'S', 'T', 'U'], labels(t.each_breadth_first), 'Full breadth first'
    assert_equal ['a', 'b', 'R', 'S', 'c', 'T', 'U'], labels(t.each_depth_first), 'Full depth first'
    # Enumerate children beneath a node.
    b = t.find { |node| node.label == 'b' }
    assert_equal ['b', 'R', 'S'], labels(b.each_breadth_first), 'Partial breadth first'
    assert_equal ['b', 'R', 'S'], labels(b.each_depth_first), 'Partial depth first'
    # Customize visitation.
    visit = proc { |n| n.label != 'c' and n.label != 'S' }
    assert_equal ['a', 'b', 'R'], labels(t.each_breadth_first(visit)), 'Filtered breadth first'
    assert_equal ['a', 'b', 'R'], labels(t.each_depth_first(visit)), 'Filtered depth first'
  end

  # Tree equivalence
  def test_equivalence
    t1 = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    t2 = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    s1 = @node_class.new.from_s('(a (b (R) (S) ) (c (T) ) )')
    s2 = @node_class.new.from_s('(a (c (R) (S) ) (b (T) (U) ) )')
    assert_equal t1, t2, 'Tree equivalence'
    assert_not_equal t1, s1, 'Tree non-equivalence: different terminals'
    assert_not_equal t1, s2, 'Tree non-equivalence: reversed non-terminal labels'
    assert_not_equal t1, 'non-tree', 'Tree non-equivalence: not a tree'
  end

  # Leaf collection, with and without a block
  def test_leaves
    t = @node_class.new.from_s('(a (b c) (d e))')
    # Depth-first order is a, b, c, d, e, so the leaves sit at 2 and 4.
    nodes = t.each_depth_first.to_a
    c = nodes[2]
    e = nodes[4]
    assert_equal [c, e], t.leaves, 'Tree leaves'
    assert_equal ['c', 'e'], t.leaves { |n| n.label }, 'Tree leaves with block'
  end

  private

  # The labels of the nodes produced by an iterator, in iteration order.
  def labels(nodes)
    nodes.map { |node| node.label }
  end

end
|
210
|
+
|
211
|
+
|
212
|
+
# Runs the shared node tests against plain Node objects.
class NodeTest < Test::Unit::TestCase
  include NodeTestMixin

  # Select the node class exercised by the mixin's tests.
  def setup
    @node_class = Treebank::Node
  end
end
|
221
|
+
|
222
|
+
|
223
|
+
# Runs the shared node tests against ParentedNode objects and adds the
# ancestor-enumeration test that only applies to them.
class ParentedNodeTest < Test::Unit::TestCase

  include NodeTestMixin

  # Select the node class exercised by the mixin's tests.
  def setup
    @node_class = Treebank::ParentedNode
  end

  # each_parent yields the start node first, then each ancestor in turn.
  def test_ancestor_enumeration
    t = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    # to_a/map rather than a blockless collect, which returns an
    # Enumerator on Ruby 1.9 and later and never equals an Array.
    assert_equal [t], t.each_parent.to_a, 'Ancestors from head'
    u = t.each_depth_first.find { |node| node.label == 'U' }
    assert_equal ['U', 'c', 'a'], u.each_parent.map { |node| node.label }, 'Ancestors from leaf'
  end

end
|
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
!ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: treebank
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2006-06-08 00:00:00 -07:00
|
8
|
+
summary: Treebank implements support for ordered n-ary branching tree structures
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: billmcn@gmail.com
|
12
|
+
homepage: http://rubyforge.org/projects/treebank
|
13
|
+
rubyforge_project:
|
14
|
+
description: This module implements ordered n-ary branching tree structures. It includes support for breadth- and depth- first iteration, and serialization to and from a bracketed tree string.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- W.P. McNeill
|
30
|
+
files:
|
31
|
+
- test/test_treebank.rb
|
32
|
+
- lib/treebank.rb
|
33
|
+
- examples/penntb-words
|
34
|
+
- README
|
35
|
+
test_files:
|
36
|
+
- test/test_treebank.rb
|
37
|
+
rdoc_options:
|
38
|
+
- --title
|
39
|
+
- Treebank -- Ruby Tree
|
40
|
+
- --main
|
41
|
+
- README
|
42
|
+
- --line-numbers
|
43
|
+
- --inline-source
|
44
|
+
extra_rdoc_files:
|
45
|
+
- README
|
46
|
+
executables: []
|
47
|
+
|
48
|
+
extensions: []
|
49
|
+
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
dependencies:
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: fsa
|
55
|
+
version_requirement:
|
56
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 0.0.0
|
61
|
+
version:
|