treebank 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +59 -0
- data/examples/penntb-words +49 -0
- data/lib/treebank.rb +446 -0
- data/test/test_treebank.rb +238 -0
- metadata +61 -0
data/README
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
= Tree Module
|
2
|
+
|
3
|
+
This module supports the creation, search, manipulation, and
|
4
|
+
serialization of tree structures.
|
5
|
+
|
6
|
+
Trees are implemented with Node objects. Each Node has a writable
|
7
|
+
_label_ that may be any arbitrary object and a list of other child
|
8
|
+
Node objects. Node objects support breadth and depth first iteration.
|
9
|
+
|
10
|
+
irb(main):001:0> require 'treebank'
|
11
|
+
=> true
|
12
|
+
irb(main):002:0> p = Treebank::Node.new('parent')
|
13
|
+
=> <Treebank::Node parent []>
|
14
|
+
irb(main):003:0> p.create_child!('child1')
|
15
|
+
=> <Treebank::Node child1 []>
|
16
|
+
irb(main):004:0> p.create_child!('child2')
|
17
|
+
=> <Treebank::Node child2 []>
|
18
|
+
|
19
|
+
Node has a subclass ParentedNode that keeps track of the parent of the
|
20
|
+
given node and has methods for iterating up the ancestor tree.
|
21
|
+
|
22
|
+
The default stringification method writes a node and all its children
|
23
|
+
in a bracketed tree format.
|
24
|
+
|
25
|
+
irb(main):005:0> puts p
|
26
|
+
(parent (child1) (child2))
|
27
|
+
=> nil
|
28
|
+
|
29
|
+
Bracketed tree strings can be used to create Node trees.
|
30
|
+
|
31
|
+
irb(main):006:0> t = Treebank::Node.new.from_s('(parent (child1) (child2))')
|
32
|
+
=> <Treebank::Node parent [child1 child2]>
|
33
|
+
irb(main):007:0> puts t
|
34
|
+
(parent (child1) (child2))
|
35
|
+
=> nil
|
36
|
+
|
37
|
+
The bracketed tree format is the one used by the Penn
|
38
|
+
Treebank[http://www.cis.upenn.edu/~treebank/] Project to annotate
|
39
|
+
linguistic structure.
|
40
|
+
|
41
|
+
= History
|
42
|
+
|
43
|
+
* 1-0-0 ... First release
|
44
|
+
|
45
|
+
= See Also
|
46
|
+
|
47
|
+
Lingua::Treebank[http://search.cpan.org/~kahn/Lingua-Treebank-0.14/Treebank.pm]
|
48
|
+
implements similar functionality in Perl.
|
49
|
+
|
50
|
+
= Copyright
|
51
|
+
|
52
|
+
Copyright 2006, William Patrick McNeill
|
53
|
+
|
54
|
+
This program is distributed under the GNU General Public License.
|
55
|
+
|
56
|
+
= Author
|
57
|
+
|
58
|
+
W.P. McNeill mailto:billmcn@u.washington.edu
|
59
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Treebank.
|
7
|
+
#
|
8
|
+
# Treebank is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Treebank is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with Treebank; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Print all the text in Penn Treebank parse files.
|
25
|
+
|
26
|
+
require 'treebank'
|
27
|
+
|
28
|
+
# A Penn Treebank File
|
29
|
+
#
|
30
|
+
# This class omits any comment lines when enumerating the lines in the
|
31
|
+
# file.
|
32
|
+
class TreebankFile < File
  # Matches a comment: the "*x" marker and everything after it on the line.
  COMMENT = /\*x.*/

  # Enumerate the lines of the file with comments removed.
  #
  # * args ... optional arguments passed unchanged to File#each
  #
  # The comment text is stripped from every line.  A line that holds
  # nothing but whitespace once its comment is gone is skipped entirely;
  # this matters because the regular expression never consumes the trailing
  # newline, so a comment-only line is reduced to "\n" rather than "".
  #
  # Returns an Enumerator when called without a block.
  def each(*args)
    return enum_for(:each, *args) unless block_given?

    super(*args) do |line|
      text = line.sub(COMMENT, '')
      yield text unless text.strip.empty?
    end
  end
end
|
40
|
+
|
41
|
+
# Enumerate all the file names specified on the command line, opening
|
42
|
+
# each one and printing the strings in the trees it contains.
|
43
|
+
ARGV.each do |filename|
  TreebankFile.open(filename) do |file|
    # Every tree parsed from the file is printed as one line of output.
    parser = Treebank::Parser.new(file)
    parser.each do |tree|
      # The words of a tree are the labels of its leaf nodes.
      words = tree.leaves { |leaf| leaf.label }
      puts words.join(' ')
    end
  end
end
|
data/lib/treebank.rb
ADDED
@@ -0,0 +1,446 @@
|
|
1
|
+
# Copyright 2006 William Patrick McNeill
|
2
|
+
#
|
3
|
+
# Treebank is free software; you can redistribute it and/or modify it
|
4
|
+
# under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation; either version 2 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# Treebank is distributed in the hope that it will be useful, but
|
9
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
11
|
+
# General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with Treebank; if not, write to the Free Software Foundation,
|
15
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
16
|
+
|
17
|
+
|
18
|
+
# Treebank is the namespace that contains all tree-related functions.
|
19
|
+
module Treebank
|
20
|
+
|
21
|
+
# An enumerable list of tokens in a string representation of a tree
|
22
|
+
#
|
23
|
+
# This class provides a way of enumerating over a source to produce
|
24
|
+
# tokens that can be used in parsing a string representation of a
|
25
|
+
# tree. The source is an enumerable object whose _each_ function
|
26
|
+
# returns a sequence of String objects, for example a file or a
|
27
|
+
# single String. Each returned string is delimited by left and
|
28
|
+
# right brackets and whitespace. The default brackets are '(' and
|
29
|
+
# ')', but different delimiters may be specified in the constructor.
|
30
|
+
#
|
31
|
+
# Treebank::TokenStream.new('(A (B c) (D))').collect
|
32
|
+
# => ["(", "A", "(", "B", "c", ")", "(", "D", ")", ")"]
|
33
|
+
class TokenStream
  include Enumerable

  # The left delimiter
  attr_reader :left

  # The right delimiter
  attr_reader :right

  # Constructor
  #
  # * source ... the string stream to tokenize
  # * left ... left bracket symbol
  # * right ... right bracket symbol
  #
  # _source_ may be a single String or any object whose _each_ method
  # yields String objects (an IO, an Array of strings, ...).
  def initialize(source, left = '(', right = ')')
    @source = source
    @left = left
    @right = right
    # Regexp.escape makes every delimiter literal, whatever characters it
    # contains (regular expression metacharacters, letters, several
    # characters).
    delimiters = "#{Regexp.escape(left)}|#{Regexp.escape(right)}"
    # A match is either one delimiter or the longest run of text that
    # contains no delimiter, e.g. /\(|\)|(?:(?!\(|\)).)+/m.  MULTILINE lets
    # a run extend across newlines.
    @s_regex = Regexp.new("#{delimiters}|(?:(?!#{delimiters}).)+", Regexp::MULTILINE)
  end

  # Enumerate the tokens in the source
  #
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    # String lost its _each_ method in Ruby 1.9, so fall back to
    # _each_line_ for sources without one.  _each_ is preferred whenever it
    # exists so that sources which override it keep their behavior.
    iterator = @source.respond_to?(:each) ? :each : :each_line
    @source.public_send(iterator) do |string|
      tokenize_string(string) { |token| yield token }
    end
  end

  # Tokenize the source string
  #
  # * string ... the string to tokenize
  #
  # Yields each delimiter and each whitespace-separated word found
  # between delimiters, in order.
  def tokenize_string(string)
    string.scan(@s_regex) do |bracket_delimited|
      bracket_delimited.split.each { |token| yield token }
    end
  end

  protected :tokenize_string

end # TokenStream
|
78
|
+
|
79
|
+
# A parser for string representations of trees
|
80
|
+
#
|
81
|
+
# This class uses a simplified shift-reduce parser to convert a
|
82
|
+
# string into a list of tree structures.
|
83
|
+
#
|
84
|
+
# Treebank::Parser.new('(A) (B (C) (D))').collect
|
85
|
+
# => [<Treebank::Node A []>, <Treebank::Node B [C D]>]
|
86
|
+
#
|
87
|
+
# The string representation of a list of trees has the following BNF
|
88
|
+
# definition
|
89
|
+
#
|
90
|
+
# * trees -> node*
|
91
|
+
# * node -> (label? children)
|
92
|
+
# * label -> word
|
93
|
+
# * children -> node*|word
|
94
|
+
# * word -> \w+
|
95
|
+
#
|
96
|
+
# Note that the BNF definition of children allows a shortcut in
|
97
|
+
# which the labels of terminal nodes may be specified without
|
98
|
+
# brackets. So, for example, <tt>(A (B))</tt> and <tt>(A B)</tt>
|
99
|
+
# are equivalent.
|
100
|
+
#
|
101
|
+
# The trees returned by this class are caller-defined node objects,
|
102
|
+
# where each node has a list of child nodes.
|
103
|
+
class Parser
  include Enumerable

  # Constructor
  #
  # * tokens ... stream of tokens to be converted into trees
  # * node_class ... class of node to create
  #
  # If _tokens_ is not a kind of TokenStream object it will be used
  # as the source stream of one.
  def initialize(tokens, node_class = Node)
    tokens = TokenStream.new(tokens) unless tokens.kind_of?(TokenStream)
    @tokens = tokens
    @node_class = node_class
  end

  # Enumerate the tokens yielding trees
  #
  # Returns an Enumerator when called without a block.  Raises a
  # RuntimeError when a right delimiter has no matching left delimiter,
  # or when unreduced items remain after the last token.
  def each # :yields: tree
    return enum_for(:each) unless block_given?

    # The parse stack holds :left markers, label strings and reduced nodes.
    parse = []
    @tokens.each do |token|
      case token
      when @tokens.left
        parse << :left
      when @tokens.right
        # Reduce everything after the most recent :left marker to a node
        # that replaces the marker and the items it consumed.
        left_index = parse.rindex(:left)
        raise "Extra #{@tokens.right}" if left_index.nil?
        parse[left_index..-1] = [reduce(parse[left_index + 1..-1])]
        # If the reduced stack consists of a single node, it must be
        # a complete tree.
        yield parse.pop if parse.length == 1
      else
        parse << token
      end
    end
    raise "Extra #{@tokens.left}: #{parse}" unless parse.empty?
  end

  # Convert the end of the parse list into a single node
  #
  # * node_parse ... a list of labels and nodes
  #
  # Returns a new node of the class given to the constructor.
  def reduce(node_parse)
    node = @node_class.new
    # The first item in the list may be a label.
    if node_parse.first.is_a?(String)
      node.label = node_parse.shift
      # Special case: terminals without brackets, e.g. '(V ran)'
      if node_parse.length == 1 && node_parse.last.is_a?(String)
        node.create_child!(node_parse.last)
        return node
      end
    end
    # The remaining items are child nodes.
    node_parse.each { |child| node.attach_child!(child) }
    node
  end

  protected :reduce

end # Parser
|
163
|
+
|
164
|
+
|
165
|
+
# A node in a tree
|
166
|
+
#
|
167
|
+
# A Node consists of a label, which may be any arbitrary Object, and
|
168
|
+
# a list of children, which are also Node objects.
|
169
|
+
class Node
  include Enumerable

  # Iterates a tree breadth-first
  class BFSIterator
    include Enumerable

    # Constructor
    #
    # * node ... the start node of the enumeration
    # * visit ... optional enumeration control procedure
    #
    # The optional _visit_ argument can be used to control which
    # children are visited by this iterator.  If specified, it is
    # called for every child node, and only those nodes returning +true+
    # will be visited.
    def initialize(node, visit = nil)
      @node = node
      @visit = visit
    end

    # Enumerate the nodes
    #
    # The start node is always yielded first.  Returns an Enumerator
    # when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @agenda = [@node]
      while (node = @agenda.shift)
        yield node
        # to_a (rather than a block-less collect, which returns an
        # Enumerator on modern Ruby) guarantees an Array for recurse.
        children = @visit ? node.find_all { |n| @visit.call(n) } : node.to_a
        recurse(children)
      end
    end

    # Function that controls enumeration recursion
    #
    # * children ... a list of child nodes of the current node
    #
    # The only difference between the breadth-first and depth-first
    # searches is this function.  Here the children are queued behind
    # everything already on the agenda.
    def recurse(children)
      @agenda += children
    end
  end # BFSIterator

  # Iterates a tree depth-first
  class DFSIterator < BFSIterator

    # Function that controls enumeration recursion
    #
    # * children ... a list of child nodes of the current node
    #
    # The only difference between the breadth-first and depth-first
    # searches is this function.  Here the children are placed ahead of
    # everything already on the agenda.
    def recurse(children)
      @agenda = children + @agenda
    end
  end # DFSIterator

  # This node's label
  attr_accessor :label

  # Constructor
  #
  # * label ... the label of this node
  # * child_labels ... list of labels for children of this node
  def initialize(label = nil, child_labels = [])
    @label = label
    @children = []
    child_labels.each { |child_label| create_child!(child_label) }
  end

  # Read the tree from a bracketed string
  #
  # * s ... bracketed string
  # * left ... left bracket symbol
  # * right ... right bracket symbol
  #
  # This function uses a Treebank::Parser object to create the tree from
  # _s_.  This raises an error if this node is not empty, or if _s_ does
  # not define exactly one tree.  Returns this node.
  def from_s(s, left = '(', right = ')')
    raise 'This node is not empty.' unless empty?
    nodes = Parser.new(TokenStream.new(s, left, right), self.class).to_a
    raise "#{s} does not define a tree" if nodes.empty?
    raise "#{s} defines multiple trees" if nodes.length != 1
    root = nodes.first
    @label = root.label
    # Adopt the children through attach_child! so that subclasses which
    # override it see every child become a child of this node rather than
    # of the temporary parsed root.
    root.to_a.each { |child| attach_child!(child) }
    self
  end

  # Stringify
  #
  # This writes to a bracketed string representation that can be
  # read by the Parser object.
  def to_s
    space = leaf? ? '' : ' '
    "(#{label}#{space}#{@children.join(' ')})"
  end

  # Interactive stringification
  #
  # Shows this node's class and label and the labels of its direct
  # children.
  def inspect
    child_labels = @children.collect { |n| n.label }
    "<#{self.class} #{@label} [#{child_labels.join(' ')}]>"
  end

  # Tree equivalence operator
  #
  # Two trees are equivalent if the other object is a kind of this
  # node's class, the labels match, and both have the same number of
  # children with each pair of corresponding children equivalent in turn.
  # Two trees therefore never compare equal when one is only a prefix of
  # the other or when the same labels are arranged in a different shape.
  def ==(other)
    return false unless other.kind_of?(self.class)
    @label == other.label && to_a == other.to_a
  end

  # Create a new node and add it as a child of this node
  #
  # * label ... the label of a node to create
  # * index ... optional insertion index
  #
  # If _index_ is not specified, the node is added to the end of the
  # child list.
  #
  # This function returns the added Node object.
  def create_child!(label, index = nil)
    attach_child!(self.class.new(label), index)
  end

  # Attach an existing node as the child of this node
  #
  # * node ... the node to add
  # * index ... optional insertion index
  #
  # _node_ must be the same type as this node; an error is raised
  # otherwise.
  #
  # If _index_ is not specified, the node is added to the end of the
  # child list.
  #
  # This function returns the added Node object.
  def attach_child!(node, index = nil)
    raise "#{node} is not class #{self.class}" if node.class != self.class
    if index.nil?
      @children << node
    else
      @children[index, 0] = node
    end
    node
  end

  # Detach a child node
  #
  # * node ... the node to detach
  #
  # This removes the specified node from this node's child list.  The
  # child is located by object identity, so sibling subtrees that merely
  # compare equal to _node_ are left in place.  An error is raised if
  # _node_ is not a child of this node.
  def detach_child!(node)
    index = @children.index { |child| child.equal?(node) }
    raise "#{node} is not a child of #{self}" if index.nil?
    @children.delete_at(index)
    nil
  end

  # Enumerate the children of this node.
  #
  # Returns an Enumerator when called without a block.
  def each(&block)
    return enum_for(:each) unless block_given?
    @children.each(&block)
  end

  # Enumerate all the nodes beneath this one breadth-first
  #
  # * visit ... optional enumeration control procedure
  #
  # The _visit_ parameter is passed down to the BFSIterator.
  def each_breadth_first(visit = nil)
    BFSIterator.new(self, visit)
  end

  # Enumerate all the nodes beneath this one depth-first
  #
  # * visit ... optional enumeration control procedure
  #
  # The _visit_ parameter is passed down to the DFSIterator.
  def each_depth_first(visit = nil)
    DFSIterator.new(self, visit)
  end

  # Is this a leaf node?
  def leaf?
    @children.empty?
  end

  # Is this node empty?
  #
  # A node is empty when it has neither a label nor children.
  def empty?
    @label.nil? && @children.empty?
  end

  # All the leaf nodes beneath this node
  #
  # * block ... an optional block to run on each leaf
  #
  # Without a block this returns the leaf nodes in depth-first order;
  # with a block it returns the block's result for each leaf.
  def leaves(&block)
    found = each_depth_first.find_all { |node| node.leaf? }
    block.nil? ? found : found.collect { |leaf| block.call(leaf) }
  end

end # Node
|
371
|
+
|
372
|
+
|
373
|
+
# A Node in a Tree that can locate its parent
|
374
|
+
#
|
375
|
+
# The ParentedNode adds a pointer back to the parent node to
|
376
|
+
# the Node class.
|
377
|
+
class ParentedNode < Node

  # Walks from a start node up through its chain of parents
  class ParentIterator
    include Enumerable

    # Constructor
    #
    # * node ... the start node of the enumeration
    def initialize(node)
      @node = node
    end

    # Enumerate the ancestor chain
    #
    # The start node itself is yielded first, then its parent, and so
    # on until a node without a parent has been yielded.
    def each
      current = @node
      until current.nil?
        yield current
        current = current.parent
      end
    end

  end # ParentIterator

  # This node's parent
  attr_reader :parent

  # Set the parent of this node
  #
  # This is a protected utility writer.  It does not change the child
  # list of the parent it is given.
  attr_writer :parent
  protected :parent=

  # Constructor
  #
  # * label ... the label of this node
  # * child_labels ... list of labels for children of this node
  # * parent ... the parent of this node
  def initialize(label = nil, child_labels = [], parent = nil)
    super(label, child_labels)
    @parent = parent
  end

  # See Treebank::Node.attach_child!
  #
  # In addition the attached node's parent is set to this node.
  def attach_child!(node, index = nil)
    attached = super(node, index)
    attached.parent = self
    attached
  end

  # See Treebank::Node.detach_child!
  #
  # In addition the detached node's parent is cleared.
  def detach_child!(node)
    super(node)
    node.parent = nil
  end

  # Enumerate the ancestors of this node
  #
  # Returns a ParentIterator that starts at this node.
  def each_parent
    ParentIterator.new(self)
  end

end # ParentedNode
|
445
|
+
|
446
|
+
end
|
@@ -0,0 +1,238 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright 2006 William Patrick McNeill
|
5
|
+
#
|
6
|
+
# This file is part of Treebank.
|
7
|
+
#
|
8
|
+
# Treebank is free software; you can redistribute it and/or modify it
|
9
|
+
# under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation; either version 2 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# Treebank is distributed in the hope that it will be useful, but
|
14
|
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
16
|
+
# General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with Treebank; if not, write to the Free Software Foundation,
|
20
|
+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
+
#
|
22
|
+
#++
|
23
|
+
|
24
|
+
# Test cases for the Treebank module
|
25
|
+
|
26
|
+
require 'test/unit'
|
27
|
+
require 'treebank'
|
28
|
+
|
29
|
+
# Test cases for Treebank::TokenStream
#
# The token lists are materialized with to_a: a block-less collect
# returns an Enumerator on modern Ruby, which never equals an Array.
class TokenStreamTest < Test::Unit::TestCase
  # Default delimiters
  def test_basic_token_stream
    t = Treebank::TokenStream.new('(A (B c) (D))')
    assert_kind_of Treebank::TokenStream, t
    assert_equal '(', t.left
    assert_equal ')', t.right
    assert_equal ['(', 'A', '(', 'B', 'c', ')', '(', 'D', ')', ')'], t.to_a
  end

  # Labels need not be alphanumeric
  def test_non_alphanum_token_stream
    t = Treebank::TokenStream.new('(A!Node (B!Node .) (14))')
    assert_kind_of Treebank::TokenStream, t
    assert_equal '(', t.left
    assert_equal ')', t.right
    assert_equal ['(', 'A!Node', '(', 'B!Node', '.', ')', '(', '14', ')', ')'], t.to_a
  end

  # Caller-specified delimiters
  def test_different_delimiter
    t = Treebank::TokenStream.new('<A <B c> <D>>', '<', '>')
    assert_kind_of Treebank::TokenStream, t
    assert_equal '<', t.left
    assert_equal '>', t.right
    assert_equal ['<', 'A', '<', 'B', 'c', '>', '<', 'D', '>', '>'], t.to_a
  end

  # Square brackets are special inside a regular expression character class
  def test_bracket_delimiter
    t = Treebank::TokenStream.new('[A [B c] [D]]', '[', ']')
    assert_kind_of Treebank::TokenStream, t
    assert_equal '[', t.left
    assert_equal ']', t.right
    assert_equal ['[', 'A', '[', 'B', 'c', ']', '[', 'D', ']', ']'], t.to_a
  end

end
|
63
|
+
|
64
|
+
|
65
|
+
# Parser test cases shared by every node class
#
# The including test case must set @node_class in its setup method.
module ParseTreeMixin

  # Parse tree string
  def test_tree_parse
    p = Treebank::Parser.new(Treebank::TokenStream.new('(A) (B)'), @node_class)
    # to_a rather than a block-less collect, which returns an Enumerator
    # on modern Ruby.
    trees = p.to_a
    assert_equal [@node_class.new.from_s('(A)'), @node_class.new.from_s('(B)')], trees
    assert_kind_of @node_class, trees[0]
    assert_kind_of @node_class, trees[1]
    t = Treebank::Parser.new(Treebank::TokenStream.new('(A (B) (C))'), @node_class).first
    # Expected value first, so that failure messages read correctly.
    assert_equal 'A', t.label
    assert_equal 'B', t.to_a.first.label
    assert_equal 'C', t.to_a.last.label
    assert_equal [@node_class.new], Treebank::Parser.new(Treebank::TokenStream.new('()'), @node_class).to_a
  end

  # A plain string may be passed in place of a TokenStream
  def test_string_in_constructor
    assert_equal [@node_class.new.from_s('(A)'), @node_class.new.from_s('(B)')], Treebank::Parser.new('(A) (B)', @node_class).to_a
  end

end
|
86
|
+
|
87
|
+
|
88
|
+
# Runs the shared parser test cases against Treebank::Node
class TreeParserTest < Test::Unit::TestCase
  include ParseTreeMixin

  # Select the node class exercised by the shared test cases.
  def setup
    @node_class = Treebank::Node
  end
end
|
97
|
+
|
98
|
+
|
99
|
+
# Runs the shared parser test cases against Treebank::ParentedNode
class ParentedTreeParserTest < Test::Unit::TestCase
  include ParseTreeMixin

  # Select the node class exercised by the shared test cases.
  def setup
    @node_class = Treebank::ParentedNode
  end
end
|
108
|
+
|
109
|
+
|
110
|
+
# Node test cases shared by every node class
#
# The including test case must set @node_class in its setup method.
# Node lists are materialized with to_a: a block-less collect returns an
# Enumerator on modern Ruby, which never equals an Array.
module NodeTestMixin

  # Empty tree
  def test_empty_tree
    t = @node_class.new
    assert_kind_of @node_class, t, 'Empty tree type'
    assert_nil t.label, 'Empty tree nil head'
    assert t.empty?, 'Empty empty?'
    assert_equal [], t.to_a, 'Empty child list'
    assert_equal [t], t.each_breadth_first.to_a, 'Empty breadth first'
    assert_equal [t], t.each_depth_first.to_a, 'Empty depth first'
  end

  # Test a single node tree
  def test_single_node_tree
    t = @node_class.new('a')
    assert_equal 'a', t.label, 'Single node label'
    assert !t.empty?, 'Single node not empty?'
    assert_equal [], t.to_a, 'Empty child list'
    assert_equal [t], t.each_breadth_first.to_a, 'Single node breadth first'
    assert_equal [t], t.each_depth_first.to_a, 'Single depth first'
  end

  # Test adding children in the constructor
  def test_constructor_children
    t = @node_class.new('a', ['b', 'c', 'd'])
    assert_equal @node_class.new.from_s('(a (b) (c) (d) )'), t, 'Children in constructor'
  end

  # Add children
  def test_add_children
    # Add a child of the head node.
    a = @node_class.new('a')
    b = a.create_child!('b')
    assert_kind_of @node_class, b, 'create_child! return value type'
    assert_equal 'b', b.label, 'create_child! return value label'
    # Expected value first, so that failure messages read correctly.
    assert_equal [b], a.to_a, 'a children'
    assert_equal [], b.to_a, 'b children'
    assert !a.empty?, 'create_child! not empty?'
    # Insert children at explicit positions.
    a = @node_class.new('a', ['b', 'c', 'd'])
    a.create_child!('z', 0)
    assert_equal ['z', 'b', 'c', 'd'], a.collect { |n| n.label }
    a.create_child!('x', 2)
    assert_equal ['z', 'b', 'x', 'c', 'd'], a.collect { |n| n.label }
  end

  # Read from/to a string
  def test_stringify
    s = '(S (NP (D (the)) (N (boy))) (VP (V (ran))))'
    multiline_s = \
    '(S
       (NP
         (D (the))
         (N (boy)))
       (VP
         (V (ran))))'
    t = @node_class.new.from_s(s)
    assert_kind_of @node_class, t, 'from_s'
    assert_equal s, "#{t}", 'to_s'
    m = @node_class.new.from_s(multiline_s)
    assert_equal t, m, 'Single-/multi-line equal'
  end

  # Simple enumeration
  def test_enumeration
    # Enumerate all children.
    t = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    assert_equal ['a', 'b', 'c', 'R', 'S', 'T', 'U'], t.each_breadth_first.collect { |node| node.label }, 'Full breadth first'
    assert_equal ['a', 'b', 'R', 'S', 'c', 'T', 'U'], t.each_depth_first.collect { |node| node.label }, 'Full depth first'
    # Enumerate children beneath a node.
    b = t.find { |node| node.label == 'b' }
    assert_equal ['b', 'R', 'S'], b.each_breadth_first.collect { |node| node.label }, 'Partial breadth first'
    assert_equal ['b', 'R', 'S'], b.each_depth_first.collect { |node| node.label }, 'Partial depth first'
    # Customize visitation.
    visit = proc { |n| n.label != 'c' and n.label != 'S' }
    assert_equal ['a', 'b', 'R'], t.each_breadth_first(visit).collect { |node| node.label }, 'Filtered breadth first'
    assert_equal ['a', 'b', 'R'], t.each_depth_first(visit).collect { |node| node.label }, 'Filtered depth first'
  end

  # Tree equivalence
  def test_equivalence
    t1 = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    t2 = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    s1 = @node_class.new.from_s('(a (b (R) (S) ) (c (T) ) )')
    s2 = @node_class.new.from_s('(a (c (R) (S) ) (b (T) (U) ) )')
    assert_equal t1, t2, 'Tree equivalence'
    assert_not_equal t1, s1, 'Tree non-equivalence: different terminals'
    assert_not_equal t1, s2, 'Tree non-equivalence: reversed non-terminal labels'
    assert_not_equal t1, 'non-tree', 'Tree non-equivalence: not a tree'
  end

  # Leaf enumeration
  def test_leaves
    t = @node_class.new.from_s('(a (b c) (d e))')
    # Depth-first order is a, b, c, d, e: the leaves sit at indexes 2 and 4.
    nodes = t.each_depth_first.to_a
    c = nodes[2]
    e = nodes[4]
    assert_equal [c, e], t.leaves, 'Tree leaves'
    assert_equal ['c', 'e'], t.leaves { |n| n.label }, 'Tree leaves with block'
  end
end
|
210
|
+
|
211
|
+
|
212
|
+
# Runs the shared node test cases against Treebank::Node
class NodeTest < Test::Unit::TestCase
  include NodeTestMixin

  # Select the node class exercised by the shared test cases.
  def setup
    @node_class = Treebank::Node
  end
end
|
221
|
+
|
222
|
+
|
223
|
+
# Runs the shared node test cases against Treebank::ParentedNode and adds
# the test cases that only apply to nodes with a parent pointer.
class ParentedNodeTest < Test::Unit::TestCase

  include NodeTestMixin

  # Select the node class exercised by the shared test cases.
  def setup
    @node_class = Treebank::ParentedNode
  end

  # Walk up the tree from the head and from a leaf
  def test_ancestor_enumeration
    t = @node_class.new.from_s('(a (b (R) (S) ) (c (T) (U)) )')
    # to_a rather than a block-less collect, which returns an Enumerator
    # on modern Ruby.
    assert_equal [t], t.each_parent.to_a, 'Ancestors from head'
    u = t.each_depth_first.find { |node| node.label == 'U' }
    assert_equal ['U', 'c', 'a'], u.each_parent.collect { |node| node.label }, 'Ancestors from leaf'
  end

end
|
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
!ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: treebank
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2006-06-08 00:00:00 -07:00
|
8
|
+
summary: Treebank implements support for ordered n-ary branching tree structures
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: billmcn@gmail.com
|
12
|
+
homepage: http://rubyforge.org/projects/treebank
|
13
|
+
rubyforge_project:
|
14
|
+
description: This module implements ordered n-ary branching tree structures. It includes support for breadth- and depth- first iteration, and serialization to and from a bracketed tree string.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- W.P. McNeill
|
30
|
+
files:
|
31
|
+
- test/test_treebank.rb
|
32
|
+
- lib/treebank.rb
|
33
|
+
- examples/penntb-words
|
34
|
+
- README
|
35
|
+
test_files:
|
36
|
+
- test/test_treebank.rb
|
37
|
+
rdoc_options:
|
38
|
+
- --title
|
39
|
+
- Treebank -- Ruby Tree
|
40
|
+
- --main
|
41
|
+
- README
|
42
|
+
- --line-numbers
|
43
|
+
- --inline-source
|
44
|
+
extra_rdoc_files:
|
45
|
+
- README
|
46
|
+
executables: []
|
47
|
+
|
48
|
+
extensions: []
|
49
|
+
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
dependencies:
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: fsa
|
55
|
+
version_requirement:
|
56
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 0.0.0
|
61
|
+
version:
|