treebank 2.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +9 -6
- data/lib/treebank.rb +108 -81
- data/test/test_treebank.rb +29 -12
- metadata +4 -3
data/README
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
This module supports the creation, search, manipulation, and
|
4
4
|
serialization of tree structures.
|
5
5
|
|
6
|
-
Trees are implemented with Node objects. Each Node has a writable
|
6
|
+
Trees are implemented with Treebank::Node objects. Each Node has a writable
|
7
7
|
_label_ that may be any arbitrary object and a list of other child
|
8
8
|
Node objects. Node objects support breadth and depth first iteration.
|
9
9
|
|
@@ -16,23 +16,25 @@ Node objects. Node objects support breadth and depth first iteration.
|
|
16
16
|
irb(main):004:0> p.create_child!('child2')
|
17
17
|
=> <Treebank::Node child2 []>
|
18
18
|
|
19
|
-
Node has a subclass ParentedNode that keeps track of the parent of the
|
19
|
+
Node has a subclass Treebank::ParentedNode that keeps track of the parent of the
|
20
20
|
given node and has methods for iterating up the ancestor tree.
|
21
21
|
|
22
22
|
The default stringification method writes a node and all its children
|
23
23
|
in a bracketed tree format.
|
24
24
|
|
25
25
|
irb(main):005:0> puts p
|
26
|
-
(parent
|
27
|
-
|
26
|
+
(parent
|
27
|
+
(child1 )
|
28
|
+
(child2 ))
|
28
29
|
|
29
30
|
Bracketed tree strings can be used to create Node trees.
|
30
31
|
|
31
32
|
irb(main):006:0> t = Treebank::Node.from_s('(parent (child1) (child2))')
|
32
33
|
=> <Treebank::Node parent [child1 child2]>
|
33
34
|
irb(main):007:0> puts t
|
34
|
-
(parent
|
35
|
-
|
35
|
+
(parent
|
36
|
+
(child1 )
|
37
|
+
(child2 ))
|
36
38
|
|
37
39
|
The bracketed tree format is the one used by the Penn
|
38
40
|
Treebank[http://www.cis.upenn.edu/~treebank/] Project to annotate
|
@@ -43,6 +45,7 @@ linguistic structure.
|
|
43
45
|
* 1-0-0 ... First release
|
44
46
|
* 1-1-0 ... Removed unnecessary fsa dependency from gemspec
|
45
47
|
* 2-0-0 ... Changed from_s initialization
|
48
|
+
* 2-1-0 ... Add indented multiline stringification; Add preterminal?
|
46
49
|
|
47
50
|
= See Also
|
48
51
|
|
data/lib/treebank.rb
CHANGED
@@ -18,7 +18,7 @@
|
|
18
18
|
# Treebank is the namespace that contains all tree-related functions.
|
19
19
|
module Treebank
|
20
20
|
|
21
|
-
VERSION = "2.0.0"
|
21
|
+
VERSION = "2.1.0"
|
22
22
|
|
23
23
|
# An enumerable list of tokens in a string representation of a tree
|
24
24
|
#
|
@@ -41,11 +41,11 @@ module Treebank
|
|
41
41
|
# The right delimiter
|
42
42
|
attr_reader :right
|
43
43
|
|
44
|
-
#
|
44
|
+
# Create a stream of tokens from an enumerable source.
|
45
45
|
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
46
|
+
# _source_:: The string stream to tokenize
|
47
|
+
# _left_:: Left bracket symbol
|
48
|
+
# _right_:: Right bracket symbol
|
49
49
|
def initialize(source, left = '(', right = ')')
|
50
50
|
@source = source
|
51
51
|
@left = left
|
@@ -58,16 +58,16 @@ module Treebank
|
|
58
58
|
@s_regex = Regexp.new("\\#{@left}|\\#{@right}|[^#{cc_left}#{cc_right}]+")
|
59
59
|
end
|
60
60
|
|
61
|
-
# Enumerate the tokens in the source
|
61
|
+
# Enumerate the tokens in the source.
|
62
62
|
def each
|
63
63
|
@source.each do |string|
|
64
64
|
tokenize_string(string) {|token| yield token}
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
|
-
# Tokenize the source string
|
68
|
+
# Tokenize the source string.
|
69
69
|
#
|
70
|
-
#
|
70
|
+
# _string_:: The string to tokenize
|
71
71
|
def tokenize_string(string)
|
72
72
|
string.scan(@s_regex) do |bracket_delimited|
|
73
73
|
bracket_delimited.split.each {|token| yield token}
|
@@ -78,7 +78,7 @@ module Treebank
|
|
78
78
|
|
79
79
|
end # TokenStream
|
80
80
|
|
81
|
-
# A parser for string representations of trees
|
81
|
+
# A parser for string representations of trees.
|
82
82
|
#
|
83
83
|
# This class uses a simplified shift-reduce parser to convert a
|
84
84
|
# string into a list of tree structures.
|
@@ -87,7 +87,7 @@ module Treebank
|
|
87
87
|
# => [<Treebank::Node A []>, <Treebank::Node B [C D]>]
|
88
88
|
#
|
89
89
|
# The string representation of a list of trees has the following BNF
|
90
|
-
# definition
|
90
|
+
# definition:
|
91
91
|
#
|
92
92
|
# * trees -> node*
|
93
93
|
# * node -> (label? children)
|
@@ -105,10 +105,8 @@ module Treebank
|
|
105
105
|
class Parser
|
106
106
|
include Enumerable
|
107
107
|
|
108
|
-
#
|
109
|
-
#
|
110
|
-
# * tokens ... stream of tokens to be converted into trees
|
111
|
-
# * node_class ... class of node to create
|
108
|
+
# _tokens_:: Stream of tokens to be converted into trees
|
109
|
+
# _node_class_:: Class of node to create
|
112
110
|
#
|
113
111
|
# If _tokens_ is not a kind of TokenStream object it will be used
|
114
112
|
# as the source stream of one.
|
@@ -118,7 +116,7 @@ module Treebank
|
|
118
116
|
@node_class = node_class
|
119
117
|
end
|
120
118
|
|
121
|
-
# Enumerate the tokens yielding trees
|
119
|
+
# Enumerate the tokens yielding trees.
|
122
120
|
def each # :yields: tree
|
123
121
|
parse = []
|
124
122
|
@tokens.each do |token|
|
@@ -140,9 +138,9 @@ module Treebank
|
|
140
138
|
raise "Extra #{@tokens.left}: #{parse}" if not parse.empty?
|
141
139
|
end
|
142
140
|
|
143
|
-
# Convert the end of the parse list into a single node
|
141
|
+
# Convert the end of the parse list into a single node.
|
144
142
|
#
|
145
|
-
#
|
143
|
+
# _node_parse_:: A list of labels and nodes
|
146
144
|
def reduce(node_parse)
|
147
145
|
node = @node_class.new
|
148
146
|
# The first item in the list may be a label.
|
@@ -175,21 +173,19 @@ module Treebank
|
|
175
173
|
class BFSIterator
|
176
174
|
include Enumerable
|
177
175
|
|
178
|
-
#
|
179
|
-
#
|
180
|
-
# * node ... the start node of the enumeration
|
181
|
-
# * visit ... optional enumeration control procedure
|
176
|
+
# _node_:: The start node of the enumeration
|
177
|
+
# _visit_:: Optional enumeration control procedure
|
182
178
|
#
|
183
179
|
# The optional _visit_ argument can be used to control which
|
184
180
|
# children are visited by this iterator. If specified, it is
|
185
|
-
# called for every node, and only those nodes returning
|
181
|
+
# called for every node, and only those nodes returning _true_
|
186
182
|
# will be visited.
|
187
183
|
def initialize(node, visit = nil)
|
188
184
|
@node = node
|
189
185
|
@visit = visit
|
190
186
|
end
|
191
187
|
|
192
|
-
# Enumerate the nodes
|
188
|
+
# Enumerate the nodes.
|
193
189
|
def each
|
194
190
|
@agenda = [@node]
|
195
191
|
while node = @agenda.shift
|
@@ -199,9 +195,9 @@ module Treebank
|
|
199
195
|
end
|
200
196
|
end
|
201
197
|
|
202
|
-
# Function that controls enumeration recursion
|
198
|
+
# Function that controls enumeration recursion.
|
203
199
|
#
|
204
|
-
#
|
200
|
+
# _children_:: A list of child nodes of the current node
|
205
201
|
#
|
206
202
|
# The only difference between the breadth-first and depth-first
|
207
203
|
# searches is this function.
|
@@ -215,7 +211,7 @@ module Treebank
|
|
215
211
|
|
216
212
|
# Function that controls enumeration recursion
|
217
213
|
#
|
218
|
-
#
|
214
|
+
# _children_:: A list of child nodes of the current node
|
219
215
|
#
|
220
216
|
# The only difference between the breadth-first and depth-first
|
221
217
|
# searches is this function.
|
@@ -227,21 +223,21 @@ module Treebank
|
|
227
223
|
# This node's label
|
228
224
|
attr_accessor :label
|
229
225
|
|
230
|
-
#
|
226
|
+
# Create a node, specifying its label and its children's labels.
|
231
227
|
#
|
232
|
-
#
|
233
|
-
#
|
228
|
+
# _label_:: The label of this node
|
229
|
+
# _child_labels_:: List of labels for children of this node
|
234
230
|
def initialize(label = nil, child_labels = [])
|
235
231
|
@label = label
|
236
232
|
@children = []
|
237
233
|
child_labels.each {|label| create_child!(label)}
|
238
234
|
end
|
239
235
|
|
240
|
-
# Read the tree from a bracketed string
|
236
|
+
# Read the tree from a bracketed string.
|
241
237
|
#
|
242
|
-
#
|
243
|
-
#
|
244
|
-
#
|
238
|
+
# _s_:: Bracketed string
|
239
|
+
# _left_:: Left bracket symbol
|
240
|
+
# _right_:: Right bracket symbol
|
245
241
|
#
|
246
242
|
# This function uses a Treebank::Parser object to create the tree from
|
247
243
|
# _s_.
|
@@ -251,23 +247,58 @@ module Treebank
|
|
251
247
|
nodes.first
|
252
248
|
end
|
253
249
|
|
254
|
-
#
|
250
|
+
# This writes to a bracketed string representation that can be read by the
|
251
|
+
# Parser object.
|
255
252
|
#
|
256
|
-
#
|
257
|
-
#
|
258
|
-
def to_s
|
259
|
-
|
260
|
-
|
253
|
+
# _multiline_:: The string representation is in indented multiline format
|
254
|
+
# if this is _true_ and on a single line if it is _false_.
|
255
|
+
def to_s(multiline = true)
|
256
|
+
if multiline
|
257
|
+
multiline_to_s(0, nil, nil)
|
258
|
+
else
|
259
|
+
"(#{label} #{preterminal? ? @children.first.label :
|
260
|
+
@children.join(' ')})"
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
# Returns a string representation of this node and its children over
|
265
|
+
# multiple lines with indenting. This matches the format used in the Penn
|
266
|
+
# Treebank files.
|
267
|
+
#
|
268
|
+
# _indent_:: The number of spaces to indent this node in the output
|
269
|
+
# _parent_:: The node's parent
|
270
|
+
# _left_sibling_:: The node's left sibling
|
271
|
+
#
|
272
|
+
# This algorithm is based on the stringification algorithm in the Stanford
|
273
|
+
# Natural Language Parser.
|
274
|
+
def multiline_to_s(indent, parent, left_sibling)
|
275
|
+
# Insert a new line and indenting before this node as needed. Stay on
|
276
|
+
# the same line if this is the top of the tree, if the parent label is
|
277
|
+
# empty or if this node and all preceding siblings are preterminals.
|
278
|
+
parent_empty = (parent.nil? or parent.label.to_s.empty?)
|
279
|
+
left_preterm = (left_sibling.nil? or left_sibling.preterminal?)
|
280
|
+
same_line = (parent_empty or (preterminal? and left_preterm))
|
281
|
+
s = parent.nil? ? "" : (same_line ? " " : "\n" + " " * indent)
|
282
|
+
# Recursively stringify this node and its children.
|
283
|
+
if leaf? or preterminal?
|
284
|
+
s << to_s(false)
|
285
|
+
else
|
286
|
+
s << "(#{label}"
|
287
|
+
@children.each_with_index do |child, index|
|
288
|
+
left_sibling = index.zero? ? nil : @children[index-1]
|
289
|
+
s << child.multiline_to_s(indent+2, self, left_sibling)
|
290
|
+
end
|
291
|
+
s << ")"
|
292
|
+
end
|
293
|
+
s
|
261
294
|
end
|
262
295
|
|
263
|
-
#
|
296
|
+
# Show this node's label and the labels of its children.
|
264
297
|
def inspect
|
265
298
|
child_labels = @children.collect {|n| n.label}
|
266
299
|
"<#{self.class} #{@label} [#{child_labels.join(' ')}]>"
|
267
300
|
end
|
268
301
|
|
269
|
-
# Tree equivalence operator
|
270
|
-
#
|
271
302
|
# If the other object is a tree and every node label in the
|
272
303
|
# corresponding nodes of the two depth first enumerations match,
|
273
304
|
# the trees are equivalent.
|
@@ -281,28 +312,22 @@ module Treebank
|
|
281
312
|
mismatch.nil?
|
282
313
|
end
|
283
314
|
|
284
|
-
# Create a new node and add it as a child of this node
|
285
|
-
#
|
286
|
-
# * label ... the label of a node to create
|
287
|
-
# * index ... optional insertion index
|
315
|
+
# Create a new node and add it as a child of this node.
|
288
316
|
#
|
289
|
-
#
|
290
|
-
#
|
317
|
+
# _label_:: The label of a node to create
|
318
|
+
# _index_:: Optional insertion index. If unspecified, the node is added
|
319
|
+
# to the end of the child list.
|
291
320
|
#
|
292
321
|
# This function returns the added Node object.
|
293
322
|
def create_child!(label, index = nil)
|
294
323
|
attach_child!(self.class.new(label), index)
|
295
324
|
end
|
296
325
|
|
297
|
-
# Attach an existing node as the child of this node
|
326
|
+
# Attach an existing node as the child of this node.
|
298
327
|
#
|
299
|
-
#
|
300
|
-
#
|
301
|
-
#
|
302
|
-
# _node_ must be the same type as this node.
|
303
|
-
#
|
304
|
-
# If _index_ is not specified, the node is added to the end of the
|
305
|
-
# child list.
|
328
|
+
# _node_:: The node to add. It must be the same type as this node.
|
329
|
+
# _index_:: Optional insertion index. If unspecified, the node is added
|
330
|
+
# to the end of the child list.
|
306
331
|
#
|
307
332
|
# This function returns the added Node object.
|
308
333
|
def attach_child!(node, index = nil)
|
@@ -315,11 +340,9 @@ module Treebank
|
|
315
340
|
node
|
316
341
|
end
|
317
342
|
|
318
|
-
#
|
319
|
-
#
|
320
|
-
# * node ... the node to detach
|
343
|
+
# Removes the specified node from this node's child list.
|
321
344
|
#
|
322
|
-
#
|
345
|
+
# _node_:: The node to detach
|
323
346
|
def detach_child!(node)
|
324
347
|
raise "#{node} is not a child of #{self}" if @children.delete(node).nil?
|
325
348
|
end
|
@@ -331,7 +354,7 @@ module Treebank
|
|
331
354
|
|
332
355
|
# Enumerate all the nodes beneath this one breadth-first
|
333
356
|
#
|
334
|
-
#
|
357
|
+
# _visit_:: Optional enumeration control procedure
|
335
358
|
#
|
336
359
|
# The _visit_ parameter is passed down to the BFSIterator.
|
337
360
|
def each_breadth_first(visit = nil)
|
@@ -340,36 +363,43 @@ module Treebank
|
|
340
363
|
|
341
364
|
# Enumerate all the nodes beneath this one depth-first
|
342
365
|
#
|
343
|
-
#
|
366
|
+
# _visit_:: Optional enumeration control procedure
|
344
367
|
#
|
345
368
|
# The _visit_ parameter is passed down to the DFSIterator.
|
346
369
|
def each_depth_first(visit = nil)
|
347
370
|
DFSIterator.new(self, visit)
|
348
371
|
end
|
349
372
|
|
350
|
-
#
|
373
|
+
# A leaf node has no children.
|
351
374
|
def leaf?
|
352
375
|
@children.empty?
|
353
376
|
end
|
354
377
|
|
355
|
-
#
|
378
|
+
# An empty node has no label and no children.
|
356
379
|
def empty?
|
357
380
|
@label.nil? and @children.empty?
|
358
381
|
end
|
359
382
|
|
360
|
-
#
|
383
|
+
# A preterminal node dominates a single leaf node.
|
384
|
+
def preterminal?
|
385
|
+
@children.length == 1 and @children.first.leaf?
|
386
|
+
end
|
387
|
+
|
388
|
+
# Return all the leaf nodes beneath this node.
|
361
389
|
#
|
362
|
-
#
|
390
|
+
# _block_:: An optional block to run on each leaf
|
363
391
|
def leaves(&block)
|
364
392
|
leaves = each_depth_first.find_all {|node| node.leaf?}
|
365
393
|
leaves = leaves.collect {|leaf| block.call(leaf)} if not block.nil?
|
366
394
|
leaves
|
367
395
|
end
|
368
396
|
|
397
|
+
protected :multiline_to_s
|
398
|
+
|
369
399
|
end # Node
|
370
400
|
|
371
401
|
|
372
|
-
# A Node in a Tree that can locate its parent
|
402
|
+
# A Node in a Tree that can locate its parent.
|
373
403
|
#
|
374
404
|
# The ParentedNode adds a pointer back to the parent node to
|
375
405
|
# the Node class.
|
@@ -382,14 +412,12 @@ module Treebank
|
|
382
412
|
class ParentIterator
|
383
413
|
include Enumerable
|
384
414
|
|
385
|
-
#
|
386
|
-
#
|
387
|
-
# * node ... the start node of the enumeration
|
415
|
+
# _node_:: The start node of the enumeration
|
388
416
|
def initialize(node)
|
389
417
|
@node = node
|
390
418
|
end
|
391
419
|
|
392
|
-
# Enumerate the ancestor chain
|
420
|
+
# Enumerate the ancestor chain.
|
393
421
|
def each
|
394
422
|
node = @node
|
395
423
|
while not node.nil?
|
@@ -400,11 +428,12 @@ module Treebank
|
|
400
428
|
|
401
429
|
end # ParentIterator
|
402
430
|
|
403
|
-
#
|
431
|
+
# Create a node specifying its parent, its label, and its children's
|
432
|
+
# labels.
|
404
433
|
#
|
405
|
-
#
|
406
|
-
#
|
407
|
-
#
|
434
|
+
# _label_:: The label of this node
|
435
|
+
# _child_labels_:: List of labels for children of this node
|
436
|
+
# _parent_:: The parent of this node
|
408
437
|
def initialize(label = nil, child_labels = [], parent = nil)
|
409
438
|
super(label, child_labels)
|
410
439
|
@parent = parent
|
@@ -423,17 +452,15 @@ module Treebank
|
|
423
452
|
node.parent = nil
|
424
453
|
end
|
425
454
|
|
426
|
-
# Set the parent of this node
|
427
|
-
#
|
428
|
-
# * parent ... the parent node
|
455
|
+
# Set the parent of this node. This does not change the child list of
|
456
|
+
# _parent_.
|
429
457
|
#
|
430
|
-
#
|
431
|
-
# child list of _parent_.
|
458
|
+
# _parent_:: The parent node
|
432
459
|
def parent=(parent)
|
433
460
|
@parent = parent
|
434
461
|
end
|
435
462
|
|
436
|
-
# Enumerate the ancestors of this node
|
463
|
+
# Enumerate the ancestors of this node.
|
437
464
|
def each_parent
|
438
465
|
ParentIterator.new(self)
|
439
466
|
end
|
data/test/test_treebank.rb
CHANGED
@@ -155,19 +155,36 @@ module NodeTestMixin
|
|
155
155
|
|
156
156
|
# Read from/to a string
|
157
157
|
def test_stringify
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
(NP
|
162
|
-
(D (the))
|
163
|
-
(N (boy)))
|
164
|
-
(VP
|
165
|
-
(V (ran))))'
|
166
|
-
t = @node_class.from_s(s)
|
158
|
+
singleline = "(S (NP (D the) (N boy)) (VP (V ran)))"
|
159
|
+
multiline = "(S\n (NP (D the) (N boy))\n (VP (V ran)))"
|
160
|
+
t = @node_class.from_s(singleline)
|
167
161
|
assert_kind_of @node_class, t, 'from_s'
|
168
|
-
assert_equal
|
169
|
-
|
170
|
-
assert_equal
|
162
|
+
assert_equal t.to_s, t.to_s(true), 'to_s == to_s(true)'
|
163
|
+
assert_equal multiline, t.to_s(true), 'to_s(true)'
|
164
|
+
assert_equal singleline, t.to_s(false), 'to_s(false)'
|
165
|
+
m = @node_class.from_s(multiline)
|
166
|
+
assert_equal t, m, 'Single-/multi-line initialization equal'
|
167
|
+
end
|
168
|
+
|
169
|
+
# Read to and from a typical Wall Street Journal treebank string
|
170
|
+
def test_wsj_sentence_stringify
|
171
|
+
s =
|
172
|
+
"( (S
|
173
|
+
(NP-SBJ (CD Two) (VBG leading) (NN constitutional-law) (NNS experts))
|
174
|
+
(VP (VBD said)
|
175
|
+
(SBAR (-NONE- 0)
|
176
|
+
(S
|
177
|
+
(NP-SBJ (NNP President) (NNP Bush))
|
178
|
+
(VP (VBZ does) (RB n't)
|
179
|
+
(VP (VB have)
|
180
|
+
(NP (DT the) (JJ legal) (NN authority)
|
181
|
+
(S
|
182
|
+
(NP-SBJ (-NONE- *))
|
183
|
+
(VP (TO to)
|
184
|
+
(VP (VB exercise)
|
185
|
+
(NP (DT a) (JJ line-item) (NN veto)))))))))))
|
186
|
+
(. .)))"
|
187
|
+
assert_equal s, @node_class.from_s(s).to_s, 'Stringify WSJ sentence'
|
171
188
|
end
|
172
189
|
|
173
190
|
# Simple enumeration
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.9.2
|
3
3
|
specification_version: 1
|
4
4
|
name: treebank
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 2.0.0
|
7
|
-
date: 2007-
|
6
|
+
version: 2.1.0
|
7
|
+
date: 2007-12-11 00:00:00 -08:00
|
8
8
|
summary: Treebank implements support for ordered n-ary branching tree structures
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- W.P. McNeill
|
30
31
|
files:
|