treebank 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +31 -28
- data/examples/penntb-words +1 -1
- data/lib/treebank.rb +189 -120
- data/test/test_treebank.rb +46 -27
- metadata +2 -2
data/README
CHANGED
@@ -1,64 +1,67 @@
|
|
1
|
-
= Tree Module
|
1
|
+
= Tree Representation Module
|
2
2
|
|
3
|
-
This module supports the creation, search, manipulation, and
|
4
|
-
serialization of tree structures.
|
3
|
+
This module supports the creation, search, manipulation, and serialization of tree structures.
|
5
4
|
|
6
|
-
Trees are implemented with Treebank::Node objects. Each Node has a writable
|
7
|
-
_label_ that may be any arbitrary object and a list of other child
|
8
|
-
Node objects. Node objects support breadth and depth first iteration.
|
5
|
+
Trees are implemented with Treebank::Node objects. Each Node has a writable _label_ that may be any arbitrary object and a list of other child Node objects.
|
9
6
|
|
10
|
-
|
7
|
+
> require 'treebank'
|
11
8
|
=> true
|
12
|
-
|
9
|
+
> p = Treebank::Node.new('parent')
|
13
10
|
=> <Treebank::Node parent []>
|
14
|
-
|
11
|
+
> p.create_child!('child1')
|
15
12
|
=> <Treebank::Node child1 []>
|
16
|
-
|
13
|
+
> p.create_child!('child2')
|
17
14
|
=> <Treebank::Node child2 []>
|
18
15
|
|
19
|
-
Node has a subclass Treebank::ParentedNode that keeps track of the parent of the
|
20
|
-
given node and has methods for iterating up the ancestor tree.
|
16
|
+
Node has a subclass Treebank::ParentedNode that keeps track of the parent of the given node and has methods for iterating up the ancestor tree.
|
21
17
|
|
22
|
-
The
|
23
|
-
in a bracketed tree format.
|
18
|
+
Node objects support breadth- and depth-first iteration. The functions each_depth_first and each_breadth_first yield a node and its descendants in the specified order. The functions depth_first_enumerator and breadth_first_enumerator wrap these functions inside Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html] objects. See the function documentation for details as to how the enumeration may be controlled.
|
24
19
|
|
25
|
-
|
20
|
+
The default stringification method writes a node and all its children in a bracketed tree format.
|
21
|
+
|
22
|
+
> puts p
|
26
23
|
(parent
|
27
24
|
(child1 )
|
28
25
|
(child2 ))
|
29
26
|
|
30
27
|
Bracketed tree strings can be used to create Node trees.
|
31
28
|
|
32
|
-
|
29
|
+
> t = Treebank::Node.from_s('(parent (child1) (child2))')
|
33
30
|
=> <Treebank::Node parent [child1 child2]>
|
34
|
-
|
31
|
+
> puts t
|
35
32
|
(parent
|
36
33
|
(child1 )
|
37
34
|
(child2 ))
|
38
35
|
|
39
|
-
The bracketed tree format is the one used by the Penn
|
40
|
-
|
41
|
-
|
36
|
+
The bracketed tree format is the one used by the {Penn Treebank Project}[http://www.cis.upenn.edu/~treebank] to annotate linguistic structure.
|
37
|
+
|
38
|
+
The children of a node can be dereferenced with the array operator using a list of child offsets. For example:
|
39
|
+
|
40
|
+
> t = Treebank::Node.from_s("(A (B b) (C (D d) (E e)))")
|
41
|
+
=> <Treebank::Node A [B C]>
|
42
|
+
> t[1,0]
|
43
|
+
=> <Treebank::Node D [d]>
|
44
|
+
|
45
|
+
Here the index (1,0) finds the 0th child of the node's 1st child, which is the node <tt>(D d)</tt>.
|
42
46
|
|
43
47
|
= History
|
44
48
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
+
1-0-0:: First release
|
50
|
+
1-1-0:: Removed unnecessary fsa dependency from gemspec
|
51
|
+
2-0-0:: Changed from_s initialization
|
52
|
+
2-1-0:: Add indented multiline stringification; Add preterminal?
|
53
|
+
3-0-0:: Add Node.children function. Add child dereferencing by index. The each_depth_first and each_breadth_first functions now yield nodes instead of returning private enumerator objects. To get enumerator objects, use depth_first_enumerator and breadth_first_enumerator instead. Likewise, the each_parent function in the ParentedNode class yields nodes, while parent_enumerator returns an enumerator.
|
49
54
|
|
50
55
|
= See Also
|
51
56
|
|
52
|
-
Lingua::Treebank[http://search.cpan.org/~kahn/Lingua-Treebank
|
53
|
-
implements similar functionality in Perl.
|
57
|
+
Lingua::Treebank[http://search.cpan.org/~kahn/Lingua-Treebank/] implements similar functionality in Perl.
|
54
58
|
|
55
59
|
= Copyright
|
56
60
|
|
57
|
-
Copyright 2006, William Patrick McNeill
|
61
|
+
Copyright 2006-2008, William Patrick McNeill
|
58
62
|
|
59
63
|
This program is distributed under the GNU General Public License.
|
60
64
|
|
61
65
|
= Author
|
62
66
|
|
63
67
|
W.P. McNeill mailto:billmcn@u.washington.edu
|
64
|
-
|
data/examples/penntb-words
CHANGED
data/lib/treebank.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2006 William Patrick McNeill
|
1
|
+
# Copyright 2006-2008 William Patrick McNeill
|
2
2
|
#
|
3
3
|
# Treebank is free software; you can redistribute it and/or modify it
|
4
4
|
# under the terms of the GNU General Public License as published by
|
@@ -14,21 +14,22 @@
|
|
14
14
|
# along with Treebank; if not, write to the Free Software Foundation,
|
15
15
|
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
|
+
require "enumerator"
|
17
18
|
|
18
19
|
# Treebank is the namespace that contains all tree-related functions.
|
19
20
|
module Treebank
|
20
21
|
|
21
|
-
VERSION = "
|
22
|
+
VERSION = "3.0.0"
|
22
23
|
|
23
24
|
# An enumerable list of tokens in a string representation of a tree
|
24
25
|
#
|
25
|
-
# This class provides a way of enumerating over a source to produce
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
26
|
+
# This class provides a way of enumerating over a source to produce tokens
|
27
|
+
# that can be used in parsing a string representation of a tree. The source
|
28
|
+
# is an enumerable object whose _each_ function returns a sequence of String
|
29
|
+
# objects, for example a file or a single String. Each returned string is
|
30
|
+
# delimited by left and right brackets and whitespace. The default brackets
|
31
|
+
# are '(' and ')', but different delimiters may be specified in the
|
32
|
+
# constructor.
|
32
33
|
#
|
33
34
|
# Treebank::TokenStream.new('(A (B c) (D))').collect
|
34
35
|
# => ["(", "A", "(", "B", "c", ")", "(", "D", ")", ")"]
|
@@ -80,8 +81,8 @@ module Treebank
|
|
80
81
|
|
81
82
|
# A parser for string representations of trees.
|
82
83
|
#
|
83
|
-
# This class uses a simplified shift-reduce parser to convert a
|
84
|
-
#
|
84
|
+
# This class uses a simplified shift-reduce parser to convert a string into
|
85
|
+
# a list of tree structures.
|
85
86
|
#
|
86
87
|
# Treebank::Parser.new('(A) (B (C) (D))').collect
|
87
88
|
# => [<Treebank::Node A []>, <Treebank::Node B [C D]>]
|
@@ -95,13 +96,12 @@ module Treebank
|
|
95
96
|
# * children -> node*|word
|
96
97
|
# * word -> \w+
|
97
98
|
#
|
98
|
-
# Note that the BNF definition of children allows a shortcut in
|
99
|
-
#
|
100
|
-
#
|
101
|
-
# are equivalent.
|
99
|
+
# Note that the BNF definition of children allows a shortcut in which the
|
100
|
+
# labels of terminal nodes may be specified without brackets. So, for
|
101
|
+
# example, <tt>(A (B))</tt> and <tt>(A B)</tt> are equivalent.
|
102
102
|
#
|
103
|
-
# The trees returned by this class are caller-defined node objects,
|
104
|
-
#
|
103
|
+
# The trees returned by this class are caller-defined node objects, where
|
104
|
+
# each node has a list of child nodes.
|
105
105
|
class Parser
|
106
106
|
include Enumerable
|
107
107
|
|
@@ -164,61 +164,13 @@ module Treebank
|
|
164
164
|
|
165
165
|
# A node in a tree
|
166
166
|
#
|
167
|
-
# A Node consists of a label, which may be any arbitrary Object, and
|
168
|
-
#
|
167
|
+
# A Node consists of a label, which may be any arbitrary Object, and a list
|
168
|
+
# of children, which are also Node objects.
|
169
169
|
class Node
|
170
170
|
include Enumerable
|
171
171
|
|
172
|
-
#
|
173
|
-
|
174
|
-
include Enumerable
|
175
|
-
|
176
|
-
# _node_:: The start node of the enumeration
|
177
|
-
# _visit_:: Optional enumeration control procedure
|
178
|
-
#
|
179
|
-
# The optional _visit_ argument can be used to control which
|
180
|
-
# children are visited by this iterator. If specified, it is
|
181
|
-
# called for every node, and only those nodes returning _true_
|
182
|
-
# will be visited.
|
183
|
-
def initialize(node, visit = nil)
|
184
|
-
@node = node
|
185
|
-
@visit = visit
|
186
|
-
end
|
187
|
-
|
188
|
-
# Enumerate the nodes.
|
189
|
-
def each
|
190
|
-
@agenda = [@node]
|
191
|
-
while node = @agenda.shift
|
192
|
-
yield node
|
193
|
-
children = @visit ? node.find_all {|n| @visit.call(n)} : node.collect
|
194
|
-
recurse(children)
|
195
|
-
end
|
196
|
-
end
|
197
|
-
|
198
|
-
# Function that controls enumeration recursion.
|
199
|
-
#
|
200
|
-
# _children_:: A list of child nodes of the current node
|
201
|
-
#
|
202
|
-
# The only difference between the breadth-first and depth-first
|
203
|
-
# searches is this function.
|
204
|
-
def recurse(children)
|
205
|
-
@agenda += children
|
206
|
-
end
|
207
|
-
end # BFSIterator
|
208
|
-
|
209
|
-
# Iterates a tree depth-first
|
210
|
-
class DFSIterator < BFSIterator
|
211
|
-
|
212
|
-
# Function that controls enumeration recursion
|
213
|
-
#
|
214
|
-
# _children_:: A list of child nodes of the current node
|
215
|
-
#
|
216
|
-
# The only difference between the breadth-first and depth-first
|
217
|
-
# searches is this function.
|
218
|
-
def recurse(children)
|
219
|
-
@agenda = children + @agenda
|
220
|
-
end
|
221
|
-
end # DFSIterator
|
172
|
+
# The children of this node.
|
173
|
+
attr_reader :children
|
222
174
|
|
223
175
|
# This node's label
|
224
176
|
attr_accessor :label
|
@@ -262,15 +214,15 @@ module Treebank
|
|
262
214
|
end
|
263
215
|
|
264
216
|
# Returns a string representation of this node and its children over
|
265
|
-
# multiple lines with indenting. This matches the format used in the Penn
|
266
|
-
# Treebank files.
|
217
|
+
# multiple lines with indenting. This matches the format used in the {Penn
|
218
|
+
# Treebank}[http://www.cis.upenn.edu/~treebank] files.
|
267
219
|
#
|
268
220
|
# _indent_:: The number of spaces to indent this node in the output
|
269
221
|
# _parent_:: The node's parent
|
270
222
|
# _left_sibling_:: The node's left sibling
|
271
223
|
#
|
272
|
-
# This algorithm is based on the stringification algorithm in the Stanford
|
273
|
-
# Natural Language Parser.
|
224
|
+
# This algorithm is based on the stringification algorithm in the {Stanford
|
225
|
+
# Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
274
226
|
def multiline_to_s(indent, parent, left_sibling)
|
275
227
|
# Insert a new line and indenting before this node as needed. Stay on
|
276
228
|
# the same line if this is the top of the tree, if the parent label is
|
@@ -299,13 +251,15 @@ module Treebank
|
|
299
251
|
"<#{self.class} #{@label} [#{child_labels.join(' ')}]>"
|
300
252
|
end
|
301
253
|
|
302
|
-
# If the other object is a tree and every node label in the
|
303
|
-
#
|
304
|
-
#
|
254
|
+
# If the other object is a tree and every node label in the corresponding
|
255
|
+
# nodes of the two depth first enumerations match, the trees are
|
256
|
+
# equivalent.
|
305
257
|
def ==(other)
|
306
258
|
return false if not other.kind_of? self.class
|
307
259
|
return true if self.empty? and other.empty?
|
308
|
-
|
260
|
+
self_enum = Enumerable::Enumerator.new(self, :each_depth_first)
|
261
|
+
other_enum = Enumerable::Enumerator.new(other, :each_depth_first)
|
262
|
+
mismatch = self_enum.zip(other_enum).find \
|
309
263
|
{|self_node, other_node| self_node.nil? or \
|
310
264
|
other_node.nil? or \
|
311
265
|
self_node.label != other_node.label}
|
@@ -351,24 +305,6 @@ module Treebank
|
|
351
305
|
def each
|
352
306
|
@children.each {|node| yield node}
|
353
307
|
end
|
354
|
-
|
355
|
-
# Enumerate all the nodes beneath this one breadth-first
|
356
|
-
#
|
357
|
-
# _visit_:: Optional enumeration control procedure
|
358
|
-
#
|
359
|
-
# The _visit_ parameter is passed down to the BFSIterator.
|
360
|
-
def each_breadth_first(visit = nil)
|
361
|
-
BFSIterator.new(self, visit)
|
362
|
-
end
|
363
|
-
|
364
|
-
# Enumerate all the nodes beneath this one depth-first
|
365
|
-
#
|
366
|
-
# _visit_:: Optional enumeration control procedure
|
367
|
-
#
|
368
|
-
# The _visit_ parameter is passed down to the DFSIterator.
|
369
|
-
def each_depth_first(visit = nil)
|
370
|
-
DFSIterator.new(self, visit)
|
371
|
-
end
|
372
308
|
|
373
309
|
# A leaf node has no children.
|
374
310
|
def leaf?
|
@@ -389,17 +325,160 @@ module Treebank
|
|
389
325
|
#
|
390
326
|
# _block_:: An optional block to run on each leaf
|
391
327
|
def leaves(&block)
|
392
|
-
leaves =
|
328
|
+
leaves = depth_first_enumerator.find_all {|node| node.leaf?}
|
393
329
|
leaves = leaves.collect {|leaf| block.call(leaf)} if not block.nil?
|
394
330
|
leaves
|
395
331
|
end
|
396
332
|
|
397
|
-
|
333
|
+
# Return the child node specified by the coordinate index or nil if the
|
334
|
+
# coordinate is out of range. An empty argument list returns this node.
|
335
|
+
#
|
336
|
+
# _index_:: A list of integers that indexes the child node
|
337
|
+
#
|
338
|
+
# t = Treebank::Node.from_s("(A (B b) (C (D d) (E e)))")
|
339
|
+
# <Treebank::Node A [B C]>
|
340
|
+
# t[1,0]
|
341
|
+
# <Treebank::Node D [d]>
|
342
|
+
def [](*index)
|
343
|
+
child = self
|
344
|
+
index.each do |i|
|
345
|
+
child = child.children[i]
|
346
|
+
break if child.nil?
|
347
|
+
end
|
348
|
+
child
|
349
|
+
end
|
350
|
+
|
351
|
+
# Enumerate this node and its children depth-first.
|
352
|
+
#
|
353
|
+
# _indexed_:: Return indexes along with nodes?
|
354
|
+
# _visit_:: Optional enumeration control procedure
|
355
|
+
#
|
356
|
+
# > t = Treebank::Node.from_s("(A (B b) (C c))")
|
357
|
+
# > t.each_depth_first() {|n,index| puts n.label}
|
358
|
+
# A
|
359
|
+
# B
|
360
|
+
# b
|
361
|
+
# C
|
362
|
+
# c
|
363
|
+
# > t.each_depth_first(true) {|n,index| puts "#{n.label} #{index.inspect}"}
|
364
|
+
# A []
|
365
|
+
# B [0]
|
366
|
+
# b [0, 0]
|
367
|
+
# C [1]
|
368
|
+
# c [1, 0]
|
369
|
+
#
|
370
|
+
# See the each_descendant documentation for more details on the function
|
371
|
+
# arguments.
|
372
|
+
def each_depth_first(indexed = false, visit = nil)
|
373
|
+
each_descendant(indexed, :depth_first, visit) {|node| yield node}
|
374
|
+
end
|
375
|
+
|
376
|
+
# Return an Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html]
|
377
|
+
# object that enumerates this node and its descendants depth-first.
|
378
|
+
#
|
379
|
+
# See the each_descendant documentation for more details on the function
|
380
|
+
# arguments.
|
381
|
+
def depth_first_enumerator(indexed = false, visit = nil)
|
382
|
+
Enumerable::Enumerator.new(self, :each_depth_first, indexed, visit)
|
383
|
+
end
|
384
|
+
|
385
|
+
# Enumerate this node and its children breadth-first.
|
386
|
+
#
|
387
|
+
# _indexed_:: Return indexes along with nodes?
|
388
|
+
# _visit_:: Optional enumeration control procedure
|
389
|
+
#
|
390
|
+
# > t = Treebank::Node.from_s("(A (B b) (C c))")
|
391
|
+
# > t.each_breadth_first() {|n,index| puts n.label}
|
392
|
+
# A
|
393
|
+
# B
|
394
|
+
# C
|
395
|
+
# b
|
396
|
+
# c
|
397
|
+
# > t.each_breadth_first(true) {|n,index| puts "#{n.label} #{index.inspect}"}
|
398
|
+
# A []
|
399
|
+
# B [0]
|
400
|
+
# C [1]
|
401
|
+
# b [0, 0]
|
402
|
+
# c [1, 0]
|
403
|
+
#
|
404
|
+
# See the each_descendant documentation for more details on the function
|
405
|
+
# arguments.
|
406
|
+
def each_breadth_first(indexed = false, visit = nil)
|
407
|
+
each_descendant(indexed, :breadth_first, visit) {|node| yield node}
|
408
|
+
end
|
409
|
+
|
410
|
+
# Return an Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html]
|
411
|
+
# object that enumerates this node and its descendants breadth-first.
|
412
|
+
#
|
413
|
+
# See the each_descendant documentation for more details on the function
|
414
|
+
# arguments.
|
415
|
+
def breadth_first_enumerator(indexed = false, visit = nil)
|
416
|
+
Enumerable::Enumerator.new(self, :each_breadth_first, indexed, visit)
|
417
|
+
end
|
418
|
+
|
419
|
+
# Enumerate the initial node and its children in the order specified,
|
420
|
+
# yielding either just nodes or nodes and indexes.
|
421
|
+
#
|
422
|
+
# _indexed_:: Return indexes of the nodes?
|
423
|
+
# _order_:: :depth_first or :breadth_first
|
424
|
+
# _visit_:: Optional enumeration control procedure
|
425
|
+
#
|
426
|
+
# If _indexed_ is _true_ this method yields <tt>[node, index]</tt> pairs
|
427
|
+
# where _index_ is the index used by the [] operator. Otherwise, it
|
428
|
+
# yields just the nodes.
|
429
|
+
#
|
430
|
+
# The optional _visit_ argument can be used to control which children are
|
431
|
+
# visited by this iterator. If specified, it is called for every node,
|
432
|
+
# and only those nodes returning _true_ will be visited.
|
433
|
+
def each_descendant(indexed, order, visit)
|
434
|
+
agenda = indexed ? [ [self, []] ] : [self]
|
435
|
+
while item = agenda.shift
|
436
|
+
yield item
|
437
|
+
# If we're enumerating over nodes and their indexes, the agenda items
|
438
|
+
# are (node, index) pairs. Otherwise they are just nodes.
|
439
|
+
new_items = indexed ? indexed_new_agenda_items(item, visit) :
|
440
|
+
non_indexed_new_agenda_items(item, visit)
|
441
|
+
# The difference between depth- and breadth-first search orders lies
|
442
|
+
# in the way the agenda is updated.
|
443
|
+
if order == :depth_first
|
444
|
+
agenda = new_items + agenda
|
445
|
+
elsif order == :breadth_first
|
446
|
+
agenda = agenda + new_items
|
447
|
+
else
|
448
|
+
raise "Invalid search order #{order}"
|
449
|
+
end
|
450
|
+
end
|
451
|
+
end
|
452
|
+
|
453
|
+
# Agenda items consist of a tree node and a node index. Get new agenda
|
454
|
+
# items by recursing into children of the node. Append child index to the
|
455
|
+
# node indexes in the new agenda items.
|
456
|
+
def indexed_new_agenda_items(item, visit)
|
457
|
+
node, index = item
|
458
|
+
new_items = []
|
459
|
+
if visit
|
460
|
+
node.children.each_with_index do |n, i|
|
461
|
+
new_items << [n, index + [i]] if visit.call(n)
|
462
|
+
end
|
463
|
+
else
|
464
|
+
node.children.each_with_index{|n, i| new_items << [n, index + [i]]}
|
465
|
+
end
|
466
|
+
new_items
|
467
|
+
end
|
468
|
+
|
469
|
+
# Agenda items consist of a tree node. Get new agenda items by recursing
|
470
|
+
# into the children.
|
471
|
+
def non_indexed_new_agenda_items(node, visit)
|
472
|
+
visit ? node.find_all {|n| visit.call(n)} : node.children
|
473
|
+
end
|
474
|
+
|
475
|
+
private :indexed_new_agenda_items, :non_indexed_new_agenda_items
|
476
|
+
protected :multiline_to_s, :each_descendant
|
398
477
|
|
399
478
|
end # Node
|
400
479
|
|
401
480
|
|
402
|
-
# A Node in a
|
481
|
+
# A Node in a tree that can locate its parent.
|
403
482
|
#
|
404
483
|
# The ParentedNode adds a pointer back to the parent node to
|
405
484
|
# the Node class.
|
@@ -408,26 +487,6 @@ module Treebank
|
|
408
487
|
# This node's parent
|
409
488
|
attr_reader :parent
|
410
489
|
|
411
|
-
# Iterates up a tree
|
412
|
-
class ParentIterator
|
413
|
-
include Enumerable
|
414
|
-
|
415
|
-
# _node_:: The start node of the enumeration
|
416
|
-
def initialize(node)
|
417
|
-
@node = node
|
418
|
-
end
|
419
|
-
|
420
|
-
# Enumerate the ancestor chain.
|
421
|
-
def each
|
422
|
-
node = @node
|
423
|
-
while not node.nil?
|
424
|
-
yield node
|
425
|
-
node = node.parent
|
426
|
-
end
|
427
|
-
end
|
428
|
-
|
429
|
-
end # ParentIterator
|
430
|
-
|
431
490
|
# Create a node specifying its parent, its label, and its children's
|
432
491
|
# labels.
|
433
492
|
#
|
@@ -460,9 +519,19 @@ module Treebank
|
|
460
519
|
@parent = parent
|
461
520
|
end
|
462
521
|
|
463
|
-
# Enumerate
|
522
|
+
# Enumerate this node and its ancestors.
|
464
523
|
def each_parent
|
465
|
-
|
524
|
+
node = self
|
525
|
+
while not node.nil?
|
526
|
+
yield node
|
527
|
+
node = node.parent
|
528
|
+
end
|
529
|
+
end
|
530
|
+
|
531
|
+
# Return an Enumerator[http://www.ruby-doc.org/core/classes/Enumerable/Enumerator.html]
|
532
|
+
# object that enumerates this node and its parents.
|
533
|
+
def parent_enumerator
|
534
|
+
Enumerable::Enumerator.new(self, :each_parent)
|
466
535
|
end
|
467
536
|
|
468
537
|
protected :parent=
|
data/test/test_treebank.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/bin/env ruby
|
2
2
|
|
3
3
|
#--
|
4
|
-
# Copyright 2006 William Patrick McNeill
|
4
|
+
# Copyright 2006-2008 William Patrick McNeill
|
5
5
|
#
|
6
6
|
# This file is part of Treebank.
|
7
7
|
#
|
@@ -86,24 +86,20 @@ end
|
|
86
86
|
|
87
87
|
|
88
88
|
class TreeParserTest < Test::Unit::TestCase
|
89
|
-
|
90
89
|
include ParseTreeMixin
|
91
|
-
|
90
|
+
|
92
91
|
def setup
|
93
92
|
@node_class = Treebank::Node
|
94
93
|
end
|
95
|
-
|
96
94
|
end
|
97
95
|
|
98
96
|
|
99
97
|
class ParentedTreeParserTest < Test::Unit::TestCase
|
100
|
-
|
101
98
|
include ParseTreeMixin
|
102
|
-
|
99
|
+
|
103
100
|
def setup
|
104
101
|
@node_class = Treebank::ParentedNode
|
105
102
|
end
|
106
|
-
|
107
103
|
end
|
108
104
|
|
109
105
|
|
@@ -116,8 +112,8 @@ module NodeTestMixin
|
|
116
112
|
assert_nil t.label, 'Empty tree nil head'
|
117
113
|
assert t.empty?, 'Empty empty?'
|
118
114
|
assert_equal [], t.collect, 'Empty child list'
|
119
|
-
assert_equal [t], t.
|
120
|
-
assert_equal [t], t.
|
115
|
+
assert_equal [t], t.breadth_first_enumerator.collect, 'Empty breadth first'
|
116
|
+
assert_equal [t], t.depth_first_enumerator.collect, 'Empty depth first'
|
121
117
|
end
|
122
118
|
|
123
119
|
# Test a single node tree
|
@@ -126,8 +122,8 @@ module NodeTestMixin
|
|
126
122
|
assert_equal 'a', t.label, 'Single node label'
|
127
123
|
assert !t.empty?, 'Single node not empty?'
|
128
124
|
assert_equal [], t.collect, 'Empty child list'
|
129
|
-
assert_equal [t], t.
|
130
|
-
assert_equal [t], t.
|
125
|
+
assert_equal [t], t.breadth_first_enumerator.collect, 'Single node breadth first'
|
126
|
+
assert_equal [t], t.depth_first_enumerator.collect, 'Single depth first'
|
131
127
|
end
|
132
128
|
|
133
129
|
# Test adding children in the constructor
|
@@ -168,7 +164,7 @@ module NodeTestMixin
|
|
168
164
|
|
169
165
|
# Read to and from a typical Wall Street Journal treebank string
|
170
166
|
def test_wsj_sentence_stringify
|
171
|
-
s =
|
167
|
+
s =
|
172
168
|
"( (S
|
173
169
|
(NP-SBJ (CD Two) (VBG leading) (NN constitutional-law) (NNS experts))
|
174
170
|
(VP (VBD said)
|
@@ -191,16 +187,16 @@ module NodeTestMixin
|
|
191
187
|
def test_enumeration
|
192
188
|
# Enumerate all children.
|
193
189
|
t = @node_class.from_s('(a (b (R) (S) ) (c (T) (U)) )')
|
194
|
-
assert_equal ['a', 'b', 'c', 'R', 'S', 'T', 'U'], t.
|
195
|
-
assert_equal ['a', 'b', 'R', 'S', 'c', 'T', 'U'], t.
|
190
|
+
assert_equal ['a', 'b', 'c', 'R', 'S', 'T', 'U'], t.breadth_first_enumerator.collect {|node| node.label}, 'Full breadth first'
|
191
|
+
assert_equal ['a', 'b', 'R', 'S', 'c', 'T', 'U'], t.depth_first_enumerator.collect {|node| node.label}, 'Full depth first'
|
196
192
|
# Enumerate children beneath a node.
|
197
193
|
b = t.find {|node| node.label == 'b'}
|
198
|
-
assert_equal ['b', 'R', 'S'], b.
|
199
|
-
assert_equal ['b', 'R', 'S'], b.
|
194
|
+
assert_equal ['b', 'R', 'S'], b.breadth_first_enumerator.collect {|node| node.label}, 'Partial breadth first'
|
195
|
+
assert_equal ['b', 'R', 'S'], b.depth_first_enumerator.collect {|node| node.label}, 'Partial depth first'
|
200
196
|
# Customize visitation.
|
201
197
|
visit = proc{|n| n.label != 'c' and n.label != 'S'}
|
202
|
-
assert_equal ['a', 'b', 'R'], t.
|
203
|
-
assert_equal ['a', 'b', 'R'], t.
|
198
|
+
assert_equal ['a', 'b', 'R'], t.breadth_first_enumerator(false, visit).collect {|node| node.label}, 'Custom breadth first'
|
199
|
+
assert_equal ['a', 'b', 'R'], t.depth_first_enumerator(false, visit).collect {|node| node.label}, 'Custom depth first'
|
204
200
|
end
|
205
201
|
|
206
202
|
# Tree equivalence
|
@@ -215,41 +211,64 @@ module NodeTestMixin
|
|
215
211
|
assert_not_equal t1, 'non-tree', 'Tree non-equivalence: not a tree'
|
216
212
|
end
|
217
213
|
|
214
|
+
# Test leaves function
|
218
215
|
def test_leaves
|
219
216
|
t = @node_class.from_s('(a (b c) (d e))')
|
220
|
-
leaves = t.
|
217
|
+
leaves = t.depth_first_enumerator.collect
|
221
218
|
c = leaves[2]
|
222
219
|
e = leaves[4]
|
223
220
|
assert_equal [c, e], t.leaves, 'Tree leaves'
|
224
221
|
assert_equal ['c', 'e'], t.leaves {|n| n.label}, 'Tree leaves with block'
|
225
222
|
end
|
223
|
+
|
224
|
+
# Test dereferencing children by index
|
225
|
+
def test_child_dereference
|
226
|
+
t = @node_class.from_s("(A (B b) (C (D d) (E e)))")
|
227
|
+
nA, nB, b, nC, nD, d, nE, e = t.depth_first_enumerator.collect
|
228
|
+
assert_equal nA, t[]
|
229
|
+
assert_equal nB, t[0]
|
230
|
+
assert_equal b, t[0,0]
|
231
|
+
assert_equal nil, t[0,1]
|
232
|
+
assert_equal nC, t[1]
|
233
|
+
assert_equal nil, t[2]
|
234
|
+
assert_equal nD, t[1,0]
|
235
|
+
assert_equal d, t[1,0,0]
|
236
|
+
assert_equal nil, t[1,0,1]
|
237
|
+
assert_equal nE, t[1,1]
|
238
|
+
assert_equal nil, t[1,2]
|
239
|
+
assert_equal e, t[1,1,0]
|
240
|
+
assert_equal nil, t[1,1,1]
|
241
|
+
end
|
242
|
+
|
243
|
+
def test_indexed_enumeration
|
244
|
+
t = @node_class.from_s("(A (B b) (C (D d) (E e)))")
|
245
|
+
assert t.depth_first_enumerator(true).all?{|n,i| n == t[*i]}
|
246
|
+
end
|
247
|
+
|
226
248
|
end
|
227
249
|
|
228
250
|
|
229
251
|
class NodeTest < Test::Unit::TestCase
|
230
|
-
|
231
252
|
include NodeTestMixin
|
232
|
-
|
253
|
+
|
233
254
|
def setup
|
234
255
|
@node_class = Treebank::Node
|
235
256
|
end
|
236
|
-
|
237
257
|
end
|
238
258
|
|
239
259
|
|
240
260
|
class ParentedNodeTest < Test::Unit::TestCase
|
241
|
-
|
242
261
|
include NodeTestMixin
|
243
|
-
|
262
|
+
|
244
263
|
def setup
|
245
264
|
@node_class = Treebank::ParentedNode
|
246
265
|
end
|
247
266
|
|
248
267
|
def test_ancestor_enumeration
|
249
268
|
t = @node_class.from_s('(a (b (R) (S) ) (c (T) (U)) )')
|
250
|
-
assert_equal [t], t.
|
251
|
-
u = t.
|
252
|
-
assert_equal ['U', 'c', 'a'], u.
|
269
|
+
assert_equal [t], t.parent_enumerator.collect, 'Ancestors from head'
|
270
|
+
u = t.depth_first_enumerator.find {|node| node.label == 'U'}
|
271
|
+
assert_equal ['U', 'c', 'a'], u.parent_enumerator.collect {|node| node.label}, 'Ancestors from leaf'
|
253
272
|
end
|
254
273
|
|
255
274
|
end
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: treebank
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version:
|
7
|
-
date:
|
6
|
+
version: 3.0.0
|
7
|
+
date: 2008-04-16 00:00:00 -07:00
|
8
8
|
summary: Treebank implements support for ordered n-ary branching tree structures
|
9
9
|
require_paths:
|
10
10
|
- lib
|