treebank 2.1.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +31 -28
- data/examples/penntb-words +1 -1
- data/lib/treebank.rb +189 -120
- data/test/test_treebank.rb +46 -27
- metadata +2 -2
data/README
CHANGED
@@ -1,64 +1,67 @@
|
|
1
|
-
= Tree Module
|
1
|
+
= Tree Representation Module
|
2
2
|
|
3
|
-
This module supports the creation, search, manipulation, and
|
4
|
-
serialization of tree structures.
|
3
|
+
This module supports the creation, search, manipulation, and serialization of tree structures.
|
5
4
|
|
6
|
-
Trees are implemented with Treebank::Node objects. Each Node has a writable
|
7
|
-
_label_ that may be any arbitrary object and a list of other child
|
8
|
-
Node objects. Node objects support breadth and depth first iteration.
|
5
|
+
Trees are implemented with Treebank::Node objects. Each Node has a writable _label_ that may be any arbitrary object and a list of other child Node objects.
|
9
6
|
|
10
|
-
|
7
|
+
> require 'treebank'
|
11
8
|
=> true
|
12
|
-
|
9
|
+
> p = Treebank::Node.new('parent')
|
13
10
|
=> <Treebank::Node parent []>
|
14
|
-
|
11
|
+
> p.create_child!('child1')
|
15
12
|
=> <Treebank::Node child1 []>
|
16
|
-
|
13
|
+
> p.create_child!('child2')
|
17
14
|
=> <Treebank::Node child2 []>
|
18
15
|
|
19
|
-
Node has a subclass Treebank::ParentedNode that keeps track of the parent of the
|
20
|
-
given node and has methods for iterating up the ancestor tree.
|
16
|
+
Node has a subclass Treebank::ParentedNode that keeps track of the parent of the given node and has methods for iterating up the ancestor tree.
|
21
17
|
|
22
|
-
The
|
23
|
-
in a bracketed tree format.
|
18
|
+
Node objects support breadth- and depth-first iteration. The functions each_depth_first and each_breadth_first yield a node and its descendants in the specified order. The functions depth_first_enumerator and breadth_first_enumerator wrap these functions inside Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html] objects. See the function documentation for details as to how the enumeration may be controlled.
|
24
19
|
|
25
|
-
|
20
|
+
The default stringification method writes a node and all its children in a bracketed tree format.
|
21
|
+
|
22
|
+
> puts p
|
26
23
|
(parent
|
27
24
|
(child1 )
|
28
25
|
(child2 ))
|
29
26
|
|
30
27
|
Bracketed tree strings can be used to create Node trees.
|
31
28
|
|
32
|
-
|
29
|
+
> t = Treebank::Node.from_s('(parent (child1) (child2))')
|
33
30
|
=> <Treebank::Node parent [child1 child2]>
|
34
|
-
|
31
|
+
> puts t
|
35
32
|
(parent
|
36
33
|
(child1 )
|
37
34
|
(child2 ))
|
38
35
|
|
39
|
-
The bracketed tree format is the one used by the Penn
|
40
|
-
|
41
|
-
|
36
|
+
The bracketed tree format is the one used by the {Penn Treebank Project}[http://www.cis.upenn.edu/~treebank] to annotate linguistic structure.
|
37
|
+
|
38
|
+
The children of a node can be dereferenced with the array operator using a list of child offsets. For example:
|
39
|
+
|
40
|
+
> t = Treebank::Node.from_s("(A (B b) (C (D d) (E e)))")
|
41
|
+
=> <Treebank::Node A [B C]>
|
42
|
+
> t[1,0]
|
43
|
+
=> <Treebank::Node D [d]>
|
44
|
+
|
45
|
+
Here the index (1,0) finds the 0th child of the node's 1st child, which is the node <tt>(D d)</tt>.
|
42
46
|
|
43
47
|
= History
|
44
48
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
+
1-0-0:: First release
|
50
|
+
1-1-0:: Removed unnecessary fsa dependency from gemspec
|
51
|
+
2-0-0:: Changed from_s initialization
|
52
|
+
2-1-0:: Add indented multiline stringification; Add preterminal?
|
53
|
+
3-0-0:: Add Node.children function. Add child dereferencing by index. The each_depth_first and each_breadth_first functions now yield nodes instead of returning private enumerator objects. To get enumerator objects, use depth_first_enumerator and breadth_first_enumerator instead. Likewise, the each_parent function in the ParentedNode class yields nodes, while parent_enumerator returns an enumerator.
|
49
54
|
|
50
55
|
= See Also
|
51
56
|
|
52
|
-
Lingua::Treebank[http://search.cpan.org/~kahn/Lingua-Treebank
|
53
|
-
implements similar functionality in Perl.
|
57
|
+
Lingua::Treebank[http://search.cpan.org/~kahn/Lingua-Treebank/] implements similar functionality in Perl.
|
54
58
|
|
55
59
|
= Copyright
|
56
60
|
|
57
|
-
Copyright 2006, William Patrick McNeill
|
61
|
+
Copyright 2006-2008, William Patrick McNeill
|
58
62
|
|
59
63
|
This program is distributed under the GNU General Public License.
|
60
64
|
|
61
65
|
= Author
|
62
66
|
|
63
67
|
W.P. McNeill mailto:billmcn@u.washington.edu
|
64
|
-
|
data/examples/penntb-words
CHANGED
data/lib/treebank.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2006 William Patrick McNeill
|
1
|
+
# Copyright 2006-2008 William Patrick McNeill
|
2
2
|
#
|
3
3
|
# Treebank is free software; you can redistribute it and/or modify it
|
4
4
|
# under the terms of the GNU General Public License as published by
|
@@ -14,21 +14,22 @@
|
|
14
14
|
# along with Treebank; if not, write to the Free Software Foundation,
|
15
15
|
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
+
require "enumerator"
|
17
18
|
|
18
19
|
# Treebank is the namespace that contains all tree-related functions.
|
19
20
|
module Treebank
|
20
21
|
|
21
|
-
VERSION = "
|
22
|
+
VERSION = "3.0.0"
|
22
23
|
|
23
24
|
# An enumerable list of tokens in a string representation of a tree
|
24
25
|
#
|
25
|
-
# This class provides a way of enumerating over a source to produce
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
26
|
+
# This class provides a way of enumerating over a source to produce tokens
|
27
|
+
# that can be used in parsing a string representation of a tree. The source
|
28
|
+
# is an enumerable object whose _each_ function returns a sequence of String
|
29
|
+
# objects, for example a file or a single String. Each returned string is
|
30
|
+
# delimited by left and right brackets and whitespace. The default brackets
|
31
|
+
# are '(' and ')', but different delimiters may be specified in the
|
32
|
+
# constructor.
|
32
33
|
#
|
33
34
|
# Treebank::TokenStream.new('(A (B c) (D))').collect
|
34
35
|
# => ["(", "A", "(", "B", "c", ")", "(", "D", ")", ")"]
|
@@ -80,8 +81,8 @@ module Treebank
|
|
80
81
|
|
81
82
|
# A parser for string representations of trees.
|
82
83
|
#
|
83
|
-
# This class uses a simplified shift-reduce parser to convert a
|
84
|
-
#
|
84
|
+
# This class uses a simplified shift-reduce parser to convert a string into
|
85
|
+
# a list of tree structures.
|
85
86
|
#
|
86
87
|
# Treebank::Parser.new('(A) (B (C) (D))').collect
|
87
88
|
# => [<Treebank::Node A []>, <Treebank::Node B [C D]>]
|
@@ -95,13 +96,12 @@ module Treebank
|
|
95
96
|
# * children -> node*|word
|
96
97
|
# * word -> \w+
|
97
98
|
#
|
98
|
-
# Note that the BNF definition of children allows a shortcut in
|
99
|
-
#
|
100
|
-
#
|
101
|
-
# are equivalent.
|
99
|
+
# Note that the BNF definition of children allows a shortcut in which the
|
100
|
+
# labels of terminal nodes may be specified without brackets. So, for
|
101
|
+
# example, <tt>(A (B))</tt> and <tt>(A B)</tt> are equivalent.
|
102
102
|
#
|
103
|
-
# The trees returned by this class are caller-defined node objects,
|
104
|
-
#
|
103
|
+
# The trees returned by this class are caller-defined node objects, where
|
104
|
+
# each node has a list of child nodes.
|
105
105
|
class Parser
|
106
106
|
include Enumerable
|
107
107
|
|
@@ -164,61 +164,13 @@ module Treebank
|
|
164
164
|
|
165
165
|
# A node in a tree
|
166
166
|
#
|
167
|
-
# A Node consists of a label, which may be any arbitrary Object, and
|
168
|
-
#
|
167
|
+
# A Node consists of a label, which may be any arbitrary Object, and a list
|
168
|
+
# of children, which are also Node objects.
|
169
169
|
class Node
|
170
170
|
include Enumerable
|
171
171
|
|
172
|
-
#
|
173
|
-
|
174
|
-
include Enumerable
|
175
|
-
|
176
|
-
# _node_:: The start node of the enumeration
|
177
|
-
# _visit_:: Optional enumeration control procedure
|
178
|
-
#
|
179
|
-
# The optional _visit_ argument can be used to control which
|
180
|
-
# children are visited by this iterator. If specified, it is
|
181
|
-
# called for every node, and only those nodes returning _true_
|
182
|
-
# will be visited.
|
183
|
-
def initialize(node, visit = nil)
|
184
|
-
@node = node
|
185
|
-
@visit = visit
|
186
|
-
end
|
187
|
-
|
188
|
-
# Enumerate the nodes.
|
189
|
-
def each
|
190
|
-
@agenda = [@node]
|
191
|
-
while node = @agenda.shift
|
192
|
-
yield node
|
193
|
-
children = @visit ? node.find_all {|n| @visit.call(n)} : node.collect
|
194
|
-
recurse(children)
|
195
|
-
end
|
196
|
-
end
|
197
|
-
|
198
|
-
# Function that controls enumeration recursion.
|
199
|
-
#
|
200
|
-
# _children_:: A list of child nodes of the current node
|
201
|
-
#
|
202
|
-
# The only difference between the breadth-first and depth-first
|
203
|
-
# searches is this function.
|
204
|
-
def recurse(children)
|
205
|
-
@agenda += children
|
206
|
-
end
|
207
|
-
end # BFSIterator
|
208
|
-
|
209
|
-
# Iterates a tree depth-first
|
210
|
-
class DFSIterator < BFSIterator
|
211
|
-
|
212
|
-
# Function that controls enumeration recursion
|
213
|
-
#
|
214
|
-
# _children_:: A list of child nodes of the current node
|
215
|
-
#
|
216
|
-
# The only difference between the breadth-first and depth-first
|
217
|
-
# searches is this function.
|
218
|
-
def recurse(children)
|
219
|
-
@agenda = children + @agenda
|
220
|
-
end
|
221
|
-
end # DFSIterator
|
172
|
+
# The children of this node.
|
173
|
+
attr_reader :children
|
222
174
|
|
223
175
|
# This node's label
|
224
176
|
attr_accessor :label
|
@@ -262,15 +214,15 @@ module Treebank
|
|
262
214
|
end
|
263
215
|
|
264
216
|
# Returns a string representation of this node and its children over
|
265
|
-
# multiple lines with indenting. This matches the format used in the Penn
|
266
|
-
# Treebank files.
|
217
|
+
# multiple lines with indenting. This matches the format used in the {Penn
|
218
|
+
# Treebank}[http://www.cis.upenn.edu/~treebank] files.
|
267
219
|
#
|
268
220
|
# _indent_:: The number of spaces to indent this node in the output
|
269
221
|
# _parent_:: The node's parent
|
270
222
|
# _left_sibling_:: The node's left sibling
|
271
223
|
#
|
272
|
-
# This algorithm is based on the stringification algorithm in the Stanford
|
273
|
-
# Natural Language Parser.
|
224
|
+
# This algorithm is based on the stringification algorithm in the {Stanford
|
225
|
+
# Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
274
226
|
def multiline_to_s(indent, parent, left_sibling)
|
275
227
|
# Insert a new line and indenting before this node as needed. Stay on
|
276
228
|
# the same line if this is the top of the tree, if the parent label is
|
@@ -299,13 +251,15 @@ module Treebank
|
|
299
251
|
"<#{self.class} #{@label} [#{child_labels.join(' ')}]>"
|
300
252
|
end
|
301
253
|
|
302
|
-
# If the other object is a tree and every node label in the
|
303
|
-
#
|
304
|
-
#
|
254
|
+
# If the other object is a tree and every node label in the corresponding
|
255
|
+
# nodes of the two depth first enumerations match, the trees are
|
256
|
+
# equivalent.
|
305
257
|
def ==(other)
|
306
258
|
return false if not other.kind_of? self.class
|
307
259
|
return true if self.empty? and other.empty?
|
308
|
-
|
260
|
+
self_enum = Enumerable::Enumerator.new(self, :each_depth_first)
|
261
|
+
other_enum = Enumerable::Enumerator.new(other, :each_depth_first)
|
262
|
+
mismatch = self_enum.zip(other_enum).find \
|
309
263
|
{|self_node, other_node| self_node.nil? or \
|
310
264
|
other_node.nil? or \
|
311
265
|
self_node.label != other_node.label}
|
@@ -351,24 +305,6 @@ module Treebank
|
|
351
305
|
def each
|
352
306
|
@children.each {|node| yield node}
|
353
307
|
end
|
354
|
-
|
355
|
-
# Enumerate all the nodes beneath this one breadth-first
|
356
|
-
#
|
357
|
-
# _visit_:: Optional enumeration control procedure
|
358
|
-
#
|
359
|
-
# The _visit_ parameter is passed down to the BFSIterator.
|
360
|
-
def each_breadth_first(visit = nil)
|
361
|
-
BFSIterator.new(self, visit)
|
362
|
-
end
|
363
|
-
|
364
|
-
# Enumerate all the nodes beneath this one depth-first
|
365
|
-
#
|
366
|
-
# _visit_:: Optional enumeration control procedure
|
367
|
-
#
|
368
|
-
# The _visit_ parameter is passed down to the DFSIterator.
|
369
|
-
def each_depth_first(visit = nil)
|
370
|
-
DFSIterator.new(self, visit)
|
371
|
-
end
|
372
308
|
|
373
309
|
# A leaf node has no children.
|
374
310
|
def leaf?
|
@@ -389,17 +325,160 @@ module Treebank
|
|
389
325
|
#
|
390
326
|
# _block_:: An optional block to run on each leaf
|
391
327
|
def leaves(&block)
|
392
|
-
leaves =
|
328
|
+
leaves = depth_first_enumerator.find_all {|node| node.leaf?}
|
393
329
|
leaves = leaves.collect {|leaf| block.call(leaf)} if not block.nil?
|
394
330
|
leaves
|
395
331
|
end
|
396
332
|
|
397
|
-
|
333
|
+
# Return the child node specified by the coordinate index or nil if the
|
334
|
+
# coordinate is out of range. An empty argument list returns this node.
|
335
|
+
#
|
336
|
+
# _index_:: A list of integers that indexes the child node
|
337
|
+
#
|
338
|
+
# t = Treebank::Node.from_s("(A (B b) (C (D d) (E e)))")
|
339
|
+
# <Treebank::Node A [B C]>
|
340
|
+
# t[1,0]
|
341
|
+
# <Treebank::Node D [d]>
|
342
|
+
def [](*index)
|
343
|
+
child = self
|
344
|
+
index.each do |i|
|
345
|
+
child = child.children[i]
|
346
|
+
break if child.nil?
|
347
|
+
end
|
348
|
+
child
|
349
|
+
end
|
350
|
+
|
351
|
+
# Enumerate this node and its descendants depth-first.
|
352
|
+
#
|
353
|
+
# _indexed_:: Return indexes along with nodes?
|
354
|
+
# _visit_:: Optional enumeration control procedure
|
355
|
+
#
|
356
|
+
# > t = Treebank::Node.from_s("(A (B b) (C c))")
|
357
|
+
# > t.each_depth_first() {|n,index| puts n.label}
|
358
|
+
# A
|
359
|
+
# B
|
360
|
+
# b
|
361
|
+
# C
|
362
|
+
# c
|
363
|
+
# > t.each_depth_first(true) {|n,index| puts "#{n.label} #{index.inspect}"}
|
364
|
+
# A []
|
365
|
+
# B [0]
|
366
|
+
# b [0, 0]
|
367
|
+
# C [1]
|
368
|
+
# c [1, 0]
|
369
|
+
#
|
370
|
+
# See the each_descendant documentation for more details on the function
|
371
|
+
# arguments.
|
372
|
+
def each_depth_first(indexed = false, visit = nil)
|
373
|
+
each_descendant(indexed, :depth_first, visit) {|node| yield node}
|
374
|
+
end
|
375
|
+
|
376
|
+
# Return an Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html]
|
377
|
+
# object that enumerates this node and its descendants depth-first.
|
378
|
+
#
|
379
|
+
# See the each_descendant documentation for more details on the function
|
380
|
+
# arguments.
|
381
|
+
def depth_first_enumerator(indexed = false, visit = nil)
|
382
|
+
Enumerable::Enumerator.new(self, :each_depth_first, indexed, visit)
|
383
|
+
end
|
384
|
+
|
385
|
+
# Enumerate this node and its descendants breadth-first.
|
386
|
+
#
|
387
|
+
# _indexed_:: Return indexes along with nodes?
|
388
|
+
# _visit_:: Optional enumeration control procedure
|
389
|
+
#
|
390
|
+
# > t = Treebank::Node.from_s("(A (B b) (C c))")
|
391
|
+
# > t.each_breadth_first() {|n,index| puts n.label}
|
392
|
+
# A
|
393
|
+
# B
|
394
|
+
# C
|
395
|
+
# b
|
396
|
+
# c
|
397
|
+
# > t.each_breadth_first(true) {|n,index| puts "#{n.label} #{index.inspect}"}
|
398
|
+
# A []
|
399
|
+
# B [0]
|
400
|
+
# C [1]
|
401
|
+
# b [0, 0]
|
402
|
+
# c [1, 0]
|
403
|
+
#
|
404
|
+
# See the each_descendant documentation for more details on the function
|
405
|
+
# arguments.
|
406
|
+
def each_breadth_first(indexed = false, visit = nil)
|
407
|
+
each_descendant(indexed, :breadth_first, visit) {|node| yield node}
|
408
|
+
end
|
409
|
+
|
410
|
+
# Return an Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html]
|
411
|
+
# object that enumerates this node and its descendants breadth-first.
|
412
|
+
#
|
413
|
+
# See the each_descendant documentation for more details on the function
|
414
|
+
# arguments.
|
415
|
+
def breadth_first_enumerator(indexed = false, visit = nil)
|
416
|
+
Enumerable::Enumerator.new(self, :each_breadth_first, indexed, visit)
|
417
|
+
end
|
418
|
+
|
419
|
+
# Enumerate the initial node and its children in the order specified,
|
420
|
+
# yielding either just nodes or nodes and indexes.
|
421
|
+
#
|
422
|
+
# _indexed_:: Return indexes of the nodes?
|
423
|
+
# _order_:: :depth_first or :breadth_first
|
424
|
+
# _visit_:: Optional enumeration control procedure
|
425
|
+
#
|
426
|
+
# If _indexed_ is _true_ this function yields <tt>[node, index]</tt> pairs
|
427
|
+
# where _index_ is the index used by the [] operator. Otherwise, it
|
428
|
+
# yields just the nodes.
|
429
|
+
#
|
430
|
+
# The optional _visit_ argument can be used to control which children are
|
431
|
+
# visited by this iterator. If specified, it is called for every node,
|
432
|
+
# and only those nodes returning _true_ will be visited.
|
433
|
+
def each_descendant(indexed, order, visit)
|
434
|
+
agenda = indexed ? [ [self, []] ] : [self]
|
435
|
+
while item = agenda.shift
|
436
|
+
yield item
|
437
|
+
# If we're enumerating over nodes and their indexes, the agenda items
|
438
|
+
# are (node, index) pairs. Otherwise they are just nodes.
|
439
|
+
new_items = indexed ? indexed_new_agenda_items(item, visit) :
|
440
|
+
non_indexed_new_agenda_items(item, visit)
|
441
|
+
# The difference between depth- and breadth-first search orders lies
|
442
|
+
# in the way the agenda is updated.
|
443
|
+
if order == :depth_first
|
444
|
+
agenda = new_items + agenda
|
445
|
+
elsif order == :breadth_first
|
446
|
+
agenda = agenda + new_items
|
447
|
+
else
|
448
|
+
raise "Invalid search order #{order}"
|
449
|
+
end
|
450
|
+
end
|
451
|
+
end
|
452
|
+
|
453
|
+
# Agenda items consist of a tree node and a node index. Get new agenda
|
454
|
+
# items by recursing into children of the node. Append child index to the
|
455
|
+
# node indexes in the new agenda items.
|
456
|
+
def indexed_new_agenda_items(item, visit)
|
457
|
+
node, index = item
|
458
|
+
new_items = []
|
459
|
+
if visit
|
460
|
+
node.children.each_with_index do |n, i|
|
461
|
+
new_items << [n, index + [i]] if visit.call(n)
|
462
|
+
end
|
463
|
+
else
|
464
|
+
node.children.each_with_index{|n, i| new_items << [n, index + [i]]}
|
465
|
+
end
|
466
|
+
new_items
|
467
|
+
end
|
468
|
+
|
469
|
+
# Agenda items consist of a tree node. Get new agenda items by recursing
|
470
|
+
# into the children.
|
471
|
+
def non_indexed_new_agenda_items(node, visit)
|
472
|
+
visit ? node.find_all {|n| visit.call(n)} : node.children
|
473
|
+
end
|
474
|
+
|
475
|
+
private :indexed_new_agenda_items, :non_indexed_new_agenda_items
|
476
|
+
protected :multiline_to_s, :each_descendant
|
398
477
|
|
399
478
|
end # Node
|
400
479
|
|
401
480
|
|
402
|
-
# A Node in a
|
481
|
+
# A Node in a tree that can locate its parent.
|
403
482
|
#
|
404
483
|
# The ParentedNode adds a pointer back to the parent node to
|
405
484
|
# the Node class.
|
@@ -408,26 +487,6 @@ module Treebank
|
|
408
487
|
# This node's parent
|
409
488
|
attr_reader :parent
|
410
489
|
|
411
|
-
# Iterates up a tree
|
412
|
-
class ParentIterator
|
413
|
-
include Enumerable
|
414
|
-
|
415
|
-
# _node_:: The start node of the enumeration
|
416
|
-
def initialize(node)
|
417
|
-
@node = node
|
418
|
-
end
|
419
|
-
|
420
|
-
# Enumerate the ancestor chain.
|
421
|
-
def each
|
422
|
-
node = @node
|
423
|
-
while not node.nil?
|
424
|
-
yield node
|
425
|
-
node = node.parent
|
426
|
-
end
|
427
|
-
end
|
428
|
-
|
429
|
-
end # ParentIterator
|
430
|
-
|
431
490
|
# Create a node specifying its parent, its label, and its children's
|
432
491
|
# labels.
|
433
492
|
#
|
@@ -460,9 +519,19 @@ module Treebank
|
|
460
519
|
@parent = parent
|
461
520
|
end
|
462
521
|
|
463
|
-
# Enumerate
|
522
|
+
# Enumerate this node and its ancestors.
|
464
523
|
def each_parent
|
465
|
-
|
524
|
+
node = self
|
525
|
+
while not node.nil?
|
526
|
+
yield node
|
527
|
+
node = node.parent
|
528
|
+
end
|
529
|
+
end
|
530
|
+
|
531
|
+
# Return an Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html]
|
532
|
+
# object that enumerates this node and its parents.
|
533
|
+
def parent_enumerator
|
534
|
+
Enumerable::Enumerator.new(self, :each_parent)
|
466
535
|
end
|
467
536
|
|
468
537
|
protected :parent=
|
data/test/test_treebank.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/bin/env ruby
|
2
2
|
|
3
3
|
#--
|
4
|
-
# Copyright 2006 William Patrick McNeill
|
4
|
+
# Copyright 2006-2008 William Patrick McNeill
|
5
5
|
#
|
6
6
|
# This file is part of Treebank.
|
7
7
|
#
|
@@ -86,24 +86,20 @@ end
|
|
86
86
|
|
87
87
|
|
88
88
|
class TreeParserTest < Test::Unit::TestCase
|
89
|
-
|
90
89
|
include ParseTreeMixin
|
91
|
-
|
90
|
+
|
92
91
|
def setup
|
93
92
|
@node_class = Treebank::Node
|
94
93
|
end
|
95
|
-
|
96
94
|
end
|
97
95
|
|
98
96
|
|
99
97
|
class ParentedTreeParserTest < Test::Unit::TestCase
|
100
|
-
|
101
98
|
include ParseTreeMixin
|
102
|
-
|
99
|
+
|
103
100
|
def setup
|
104
101
|
@node_class = Treebank::ParentedNode
|
105
102
|
end
|
106
|
-
|
107
103
|
end
|
108
104
|
|
109
105
|
|
@@ -116,8 +112,8 @@ module NodeTestMixin
|
|
116
112
|
assert_nil t.label, 'Empty tree nil head'
|
117
113
|
assert t.empty?, 'Empty empty?'
|
118
114
|
assert_equal [], t.collect, 'Empty child list'
|
119
|
-
assert_equal [t], t.
|
120
|
-
assert_equal [t], t.
|
115
|
+
assert_equal [t], t.breadth_first_enumerator.collect, 'Empty breadth first'
|
116
|
+
assert_equal [t], t.depth_first_enumerator.collect, 'Empty depth first'
|
121
117
|
end
|
122
118
|
|
123
119
|
# Test a single node tree
|
@@ -126,8 +122,8 @@ module NodeTestMixin
|
|
126
122
|
assert_equal 'a', t.label, 'Single node label'
|
127
123
|
assert !t.empty?, 'Single node not empty?'
|
128
124
|
assert_equal [], t.collect, 'Empty child list'
|
129
|
-
assert_equal [t], t.
|
130
|
-
assert_equal [t], t.
|
125
|
+
assert_equal [t], t.breadth_first_enumerator.collect, 'Single node breadth first'
|
126
|
+
assert_equal [t], t.depth_first_enumerator.collect, 'Single depth first'
|
131
127
|
end
|
132
128
|
|
133
129
|
# Test adding children in the constructor
|
@@ -168,7 +164,7 @@ module NodeTestMixin
|
|
168
164
|
|
169
165
|
# Read to and from a typical Wall Street Journal treebank string
|
170
166
|
def test_wsj_sentence_stringify
|
171
|
-
s =
|
167
|
+
s =
|
172
168
|
"( (S
|
173
169
|
(NP-SBJ (CD Two) (VBG leading) (NN constitutional-law) (NNS experts))
|
174
170
|
(VP (VBD said)
|
@@ -191,16 +187,16 @@ module NodeTestMixin
|
|
191
187
|
def test_enumeration
|
192
188
|
# Enumerate all children.
|
193
189
|
t = @node_class.from_s('(a (b (R) (S) ) (c (T) (U)) )')
|
194
|
-
assert_equal ['a', 'b', 'c', 'R', 'S', 'T', 'U'], t.
|
195
|
-
assert_equal ['a', 'b', 'R', 'S', 'c', 'T', 'U'], t.
|
190
|
+
assert_equal ['a', 'b', 'c', 'R', 'S', 'T', 'U'], t.breadth_first_enumerator.collect {|node| node.label}, 'Full breadth first'
|
191
|
+
assert_equal ['a', 'b', 'R', 'S', 'c', 'T', 'U'], t.depth_first_enumerator.collect {|node| node.label}, 'Full depth first'
|
196
192
|
# Enumerate children beneath a node.
|
197
193
|
b = t.find {|node| node.label == 'b'}
|
198
|
-
assert_equal ['b', 'R', 'S'], b.
|
199
|
-
assert_equal ['b', 'R', 'S'], b.
|
194
|
+
assert_equal ['b', 'R', 'S'], b.breadth_first_enumerator.collect {|node| node.label}, 'Partial breadth first'
|
195
|
+
assert_equal ['b', 'R', 'S'], b.depth_first_enumerator.collect {|node| node.label}, 'Partial depth first'
|
200
196
|
# Customize visitation.
|
201
197
|
visit = proc{|n| n.label != 'c' and n.label != 'S'}
|
202
|
-
assert_equal ['a', 'b', 'R'], t.
|
203
|
-
assert_equal ['a', 'b', 'R'], t.
|
198
|
+
assert_equal ['a', 'b', 'R'], t.breadth_first_enumerator(false, visit).collect {|node| node.label}, 'Custom breadth first'
|
199
|
+
assert_equal ['a', 'b', 'R'], t.depth_first_enumerator(false, visit).collect {|node| node.label}, 'Custom depth first'
|
204
200
|
end
|
205
201
|
|
206
202
|
# Tree equivalence
|
@@ -215,41 +211,64 @@ module NodeTestMixin
|
|
215
211
|
assert_not_equal t1, 'non-tree', 'Tree non-equivalence: not a tree'
|
216
212
|
end
|
217
213
|
|
214
|
+
# Test leaves function
|
218
215
|
def test_leaves
|
219
216
|
t = @node_class.from_s('(a (b c) (d e))')
|
220
|
-
leaves = t.
|
217
|
+
leaves = t.depth_first_enumerator.collect
|
221
218
|
c = leaves[2]
|
222
219
|
e = leaves[4]
|
223
220
|
assert_equal [c, e], t.leaves, 'Tree leaves'
|
224
221
|
assert_equal ['c', 'e'], t.leaves {|n| n.label}, 'Tree leaves with block'
|
225
222
|
end
|
223
|
+
|
224
|
+
# Test dereferencing children by index
|
225
|
+
def test_child_dereference
|
226
|
+
t = @node_class.from_s("(A (B b) (C (D d) (E e)))")
|
227
|
+
nA, nB, b, nC, nD, d, nE, e = t.depth_first_enumerator.collect
|
228
|
+
assert_equal nA, t[]
|
229
|
+
assert_equal nB, t[0]
|
230
|
+
assert_equal b, t[0,0]
|
231
|
+
assert_equal nil, t[0,1]
|
232
|
+
assert_equal nC, t[1]
|
233
|
+
assert_equal nil, t[2]
|
234
|
+
assert_equal nD, t[1,0]
|
235
|
+
assert_equal d, t[1,0,0]
|
236
|
+
assert_equal nil, t[1,0,1]
|
237
|
+
assert_equal nE, t[1,1]
|
238
|
+
assert_equal nil, t[1,2]
|
239
|
+
assert_equal e, t[1,1,0]
|
240
|
+
assert_equal nil, t[1,1,1]
|
241
|
+
end
|
242
|
+
|
243
|
+
def test_indexed_enumeration
|
244
|
+
t = @node_class.from_s("(A (B b) (C (D d) (E e)))")
|
245
|
+
assert t.depth_first_enumerator(true).all?{|n,i| n == t[*i]}
|
246
|
+
end
|
247
|
+
|
226
248
|
end
|
227
249
|
|
228
250
|
|
229
251
|
class NodeTest < Test::Unit::TestCase
|
230
|
-
|
231
252
|
include NodeTestMixin
|
232
|
-
|
253
|
+
|
233
254
|
def setup
|
234
255
|
@node_class = Treebank::Node
|
235
256
|
end
|
236
|
-
|
237
257
|
end
|
238
258
|
|
239
259
|
|
240
260
|
class ParentedNodeTest < Test::Unit::TestCase
|
241
|
-
|
242
261
|
include NodeTestMixin
|
243
|
-
|
262
|
+
|
244
263
|
def setup
|
245
264
|
@node_class = Treebank::ParentedNode
|
246
265
|
end
|
247
266
|
|
248
267
|
def test_ancestor_enumeration
|
249
268
|
t = @node_class.from_s('(a (b (R) (S) ) (c (T) (U)) )')
|
250
|
-
assert_equal [t], t.
|
251
|
-
u = t.
|
252
|
-
assert_equal ['U', 'c', 'a'], u.
|
269
|
+
assert_equal [t], t.parent_enumerator.collect, 'Ancestors from head'
|
270
|
+
u = t.depth_first_enumerator.find {|node| node.label == 'U'}
|
271
|
+
assert_equal ['U', 'c', 'a'], u.parent_enumerator.collect {|node| node.label}, 'Ancestors from leaf'
|
253
272
|
end
|
254
273
|
|
255
274
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: treebank
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version:
|
7
|
-
date:
|
6
|
+
version: 3.0.0
|
7
|
+
date: 2008-04-16 00:00:00 -07:00
|
8
8
|
summary: Treebank implements support for ordered n-ary branching tree structures
|
9
9
|
require_paths:
|
10
10
|
- lib
|