treebank 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +9 -6
- data/lib/treebank.rb +108 -81
- data/test/test_treebank.rb +29 -12
- metadata +4 -3
data/README
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
This module supports the creation, search, manipulation, and
|
4
4
|
serialization of tree structures.
|
5
5
|
|
6
|
-
Trees are implemented with Node objects. Each Node has a writable
|
6
|
+
Trees are implemented with Treebank::Node objects. Each Node has a writable
|
7
7
|
_label_ that may be any arbitrary object and a list of other child
|
8
8
|
Node objects. Node objects support breadth and depth first iteration.
|
9
9
|
|
@@ -16,23 +16,25 @@ Node objects. Node objects support breadth and depth first iteration.
|
|
16
16
|
irb(main):004:0> p.create_child!('child2')
|
17
17
|
=> <Treebank::Node child2 []>
|
18
18
|
|
19
|
-
Node has a subclass ParentedNode that keeps track of the parent of the
|
19
|
+
Node has a subclass Treebank::ParentedNode that keeps track of the parent of the
|
20
20
|
given node and has methods for iterating up the ancestor tree.
|
21
21
|
|
22
22
|
The default stringification method writes a node and all its children
|
23
23
|
in a bracketed tree format.
|
24
24
|
|
25
25
|
irb(main):005:0> puts p
|
26
|
-
(parent
|
27
|
-
|
26
|
+
(parent
|
27
|
+
(child1 )
|
28
|
+
(child2 ))
|
28
29
|
|
29
30
|
Bracketed tree strings can be used to create Node trees.
|
30
31
|
|
31
32
|
irb(main):006:0> t = Treebank::Node.from_s('(parent (child1) (child2))')
|
32
33
|
=> <Treebank::Node parent [child1 child2]>
|
33
34
|
irb(main):007:0> puts t
|
34
|
-
(parent
|
35
|
-
|
35
|
+
(parent
|
36
|
+
(child1 )
|
37
|
+
(child2 ))
|
36
38
|
|
37
39
|
The bracketed tree format is the one used by the Penn
|
38
40
|
Treebank[http://www.cis.upenn.edu/~treebank/] Project to annotate
|
@@ -43,6 +45,7 @@ linguistic structure.
|
|
43
45
|
* 1-0-0 ... First release
|
44
46
|
* 1-1-0 ... Removed unnecessary fsa dependency from gemspec
|
45
47
|
* 2-0-0 ... Changed from_s initialization
|
48
|
+
* 2-1-0 ... Add indented multiline stringification; Add preterminal?
|
46
49
|
|
47
50
|
= See Also
|
48
51
|
|
data/lib/treebank.rb
CHANGED
@@ -18,7 +18,7 @@
|
|
18
18
|
# Treebank is the namespace that contains all tree-related functions.
|
19
19
|
module Treebank
|
20
20
|
|
21
|
-
VERSION = "2.0.0"
|
21
|
+
VERSION = "2.1.0"
|
22
22
|
|
23
23
|
# An enumerable list of tokens in a string representation of a tree
|
24
24
|
#
|
@@ -41,11 +41,11 @@ module Treebank
|
|
41
41
|
# The right delimiter
|
42
42
|
attr_reader :right
|
43
43
|
|
44
|
-
#
|
44
|
+
# Create a stream of tokens from an enumerable source.
|
45
45
|
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
46
|
+
# _source_:: The string stream to tokenize
|
47
|
+
# _left_:: Left bracket symbol
|
48
|
+
# _right_:: Right bracket symbol
|
49
49
|
def initialize(source, left = '(', right = ')')
|
50
50
|
@source = source
|
51
51
|
@left = left
|
@@ -58,16 +58,16 @@ module Treebank
|
|
58
58
|
@s_regex = Regexp.new("\\#{@left}|\\#{@right}|[^#{cc_left}#{cc_right}]+")
|
59
59
|
end
|
60
60
|
|
61
|
-
# Enumerate the tokens in the source
|
61
|
+
# Enumerate the tokens in the source.
|
62
62
|
def each
|
63
63
|
@source.each do |string|
|
64
64
|
tokenize_string(string) {|token| yield token}
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
68
|
-
# Tokenize the source string
|
68
|
+
# Tokenize the source string.
|
69
69
|
#
|
70
|
-
#
|
70
|
+
# _string_:: The string to tokenize
|
71
71
|
def tokenize_string(string)
|
72
72
|
string.scan(@s_regex) do |bracket_delimited|
|
73
73
|
bracket_delimited.split.each {|token| yield token}
|
@@ -78,7 +78,7 @@ module Treebank
|
|
78
78
|
|
79
79
|
end # TokenStream
|
80
80
|
|
81
|
-
# A parser for string representations of trees
|
81
|
+
# A parser for string representations of trees.
|
82
82
|
#
|
83
83
|
# This class uses a simplified shift-reduce parser to convert a
|
84
84
|
# string into a list of tree structures.
|
@@ -87,7 +87,7 @@ module Treebank
|
|
87
87
|
# => [<Treebank::Node A []>, <Treebank::Node B [C D]>]
|
88
88
|
#
|
89
89
|
# The string representation of a list of trees has the following BNF
|
90
|
-
# definition
|
90
|
+
# definition:
|
91
91
|
#
|
92
92
|
# * trees -> node*
|
93
93
|
# * node -> (label? children)
|
@@ -105,10 +105,8 @@ module Treebank
|
|
105
105
|
class Parser
|
106
106
|
include Enumerable
|
107
107
|
|
108
|
-
#
|
109
|
-
#
|
110
|
-
# * tokens ... stream of tokens to be converted into trees
|
111
|
-
# * node_class ... class of node to create
|
108
|
+
# _tokens_:: Stream of tokens to be converted into trees
|
109
|
+
# _node_class_:: Class of node to create
|
112
110
|
#
|
113
111
|
# If _tokens_ is not a kind of TokenStream object it will be used
|
114
112
|
# as the source stream of one.
|
@@ -118,7 +116,7 @@ module Treebank
|
|
118
116
|
@node_class = node_class
|
119
117
|
end
|
120
118
|
|
121
|
-
# Enumerate the tokens yielding trees
|
119
|
+
# Enumerate the tokens yielding trees.
|
122
120
|
def each # :yields: tree
|
123
121
|
parse = []
|
124
122
|
@tokens.each do |token|
|
@@ -140,9 +138,9 @@ module Treebank
|
|
140
138
|
raise "Extra #{@tokens.left}: #{parse}" if not parse.empty?
|
141
139
|
end
|
142
140
|
|
143
|
-
# Convert the end of the parse list into a single node
|
141
|
+
# Convert the end of the parse list into a single node.
|
144
142
|
#
|
145
|
-
#
|
143
|
+
# _node_parse_:: A list of labels and nodes
|
146
144
|
def reduce(node_parse)
|
147
145
|
node = @node_class.new
|
148
146
|
# The first item in the list may be a label.
|
@@ -175,21 +173,19 @@ module Treebank
|
|
175
173
|
class BFSIterator
|
176
174
|
include Enumerable
|
177
175
|
|
178
|
-
#
|
179
|
-
#
|
180
|
-
# * node ... the start node of the enumeration
|
181
|
-
# * visit ... optional enumeration control procedure
|
176
|
+
# _node_:: The start node of the enumeration
|
177
|
+
# _visit_:: Optional enumeration control procedure
|
182
178
|
#
|
183
179
|
# The optional _visit_ argument can be used to control which
|
184
180
|
# children are visited by this iterator. If specified, it is
|
185
|
-
# called for every node, and only those nodes returning
|
181
|
+
# called for every node, and only those nodes returning _true_
|
186
182
|
# will be visited.
|
187
183
|
def initialize(node, visit = nil)
|
188
184
|
@node = node
|
189
185
|
@visit = visit
|
190
186
|
end
|
191
187
|
|
192
|
-
# Enumerate the nodes
|
188
|
+
# Enumerate the nodes.
|
193
189
|
def each
|
194
190
|
@agenda = [@node]
|
195
191
|
while node = @agenda.shift
|
@@ -199,9 +195,9 @@ module Treebank
|
|
199
195
|
end
|
200
196
|
end
|
201
197
|
|
202
|
-
# Function that controls enumeration recursion
|
198
|
+
# Function that controls enumeration recursion.
|
203
199
|
#
|
204
|
-
#
|
200
|
+
# _children_:: A list of child nodes of the current node
|
205
201
|
#
|
206
202
|
# The only difference between the breadth-first and depth-first
|
207
203
|
# searches is this function.
|
@@ -215,7 +211,7 @@ module Treebank
|
|
215
211
|
|
216
212
|
# Function that controls enumeration recursion
|
217
213
|
#
|
218
|
-
#
|
214
|
+
# _children_:: A list of child nodes of the current node
|
219
215
|
#
|
220
216
|
# The only difference between the breadth-first and depth-first
|
221
217
|
# searches is this function.
|
@@ -227,21 +223,21 @@ module Treebank
|
|
227
223
|
# This node's label
|
228
224
|
attr_accessor :label
|
229
225
|
|
230
|
-
#
|
226
|
+
# Create a node, specifying its label and its children's labels.
|
231
227
|
#
|
232
|
-
#
|
233
|
-
#
|
228
|
+
# _label_:: The label of this node
|
229
|
+
# _child_labels_:: List of labels for children of this node
|
234
230
|
def initialize(label = nil, child_labels = [])
|
235
231
|
@label = label
|
236
232
|
@children = []
|
237
233
|
child_labels.each {|label| create_child!(label)}
|
238
234
|
end
|
239
235
|
|
240
|
-
# Read the tree from a bracketed string
|
236
|
+
# Read the tree from a bracketed string.
|
241
237
|
#
|
242
|
-
#
|
243
|
-
#
|
244
|
-
#
|
238
|
+
# _s_:: Bracketed string
|
239
|
+
# _left_:: Left bracket symbol
|
240
|
+
# _right_:: Right bracket symbol
|
245
241
|
#
|
246
242
|
# This function uses a Treebank::Parser object to create the tree from
|
247
243
|
# _s_.
|
@@ -251,23 +247,58 @@ module Treebank
|
|
251
247
|
nodes.first
|
252
248
|
end
|
253
249
|
|
254
|
-
#
|
250
|
+
# This writes to a bracketed string representation that can be read by the
|
251
|
+
# Parser object.
|
255
252
|
#
|
256
|
-
#
|
257
|
-
#
|
258
|
-
def to_s
|
259
|
-
|
260
|
-
|
253
|
+
# _multiline_:: The string representation is in indented multiline format
|
254
|
+
# if this is _true_ and on a single line if it is _false_.
|
255
|
+
def to_s(multiline = true)
|
256
|
+
if multiline
|
257
|
+
multiline_to_s(0, nil, nil)
|
258
|
+
else
|
259
|
+
"(#{label} #{preterminal? ? @children.first.label :
|
260
|
+
@children.join(' ')})"
|
261
|
+
end
|
262
|
+
end
|
263
|
+
|
264
|
+
# Returns a string representation of this node and its children over
|
265
|
+
# multiple lines with indenting. This matches the format used in the Penn
|
266
|
+
# Treebank files.
|
267
|
+
#
|
268
|
+
# _indent_:: The number of spaces to indent this node in the output
|
269
|
+
# _parent_:: The node's parent
|
270
|
+
# _left_sibling_:: The node's left sibling
|
271
|
+
#
|
272
|
+
# This algorithm is based on the stringification algorithm in the Stanford
|
273
|
+
# Natural Language Parser.
|
274
|
+
def multiline_to_s(indent, parent, left_sibling)
|
275
|
+
# Insert a new line and indenting before this node as needed. Stay on
|
276
|
+
# the same line if this is the top of the tree, if the parent label is
|
277
|
+
# empty or if this node and all preceding siblings are preterminals.
|
278
|
+
parent_empty = (parent.nil? or parent.label.to_s.empty?)
|
279
|
+
left_preterm = (left_sibling.nil? or left_sibling.preterminal?)
|
280
|
+
same_line = (parent_empty or (preterminal? and left_preterm))
|
281
|
+
s = parent.nil? ? "" : (same_line ? " " : "\n" + " " * indent)
|
282
|
+
# Recursively stringify this node and its children.
|
283
|
+
if leaf? or preterminal?
|
284
|
+
s << to_s(false)
|
285
|
+
else
|
286
|
+
s << "(#{label}"
|
287
|
+
@children.each_with_index do |child, index|
|
288
|
+
left_sibling = index.zero? ? nil : @children[index-1]
|
289
|
+
s << child.multiline_to_s(indent+2, self, left_sibling)
|
290
|
+
end
|
291
|
+
s << ")"
|
292
|
+
end
|
293
|
+
s
|
261
294
|
end
|
262
295
|
|
263
|
-
#
|
296
|
+
# Show this node's label and the labels of its children.
|
264
297
|
def inspect
|
265
298
|
child_labels = @children.collect {|n| n.label}
|
266
299
|
"<#{self.class} #{@label} [#{child_labels.join(' ')}]>"
|
267
300
|
end
|
268
301
|
|
269
|
-
# Tree equivalence operator
|
270
|
-
#
|
271
302
|
# If the other object is a tree and every node label in the
|
272
303
|
# corresponding nodes of the two depth first enumerations match,
|
273
304
|
# the trees are equivalent.
|
@@ -281,28 +312,22 @@ module Treebank
|
|
281
312
|
mismatch.nil?
|
282
313
|
end
|
283
314
|
|
284
|
-
# Create a new node and add it as a child of this node
|
285
|
-
#
|
286
|
-
# * label ... the label of a node to create
|
287
|
-
# * index ... optional insertion index
|
315
|
+
# Create a new node and add it as a child of this node.
|
288
316
|
#
|
289
|
-
#
|
290
|
-
#
|
317
|
+
# _label_:: The label of a node to create
|
318
|
+
# _index_:: Optional insertion index. If unspecified, the node is added
|
319
|
+
# to the end of the child list.
|
291
320
|
#
|
292
321
|
# This function returns the added Node object.
|
293
322
|
def create_child!(label, index = nil)
|
294
323
|
attach_child!(self.class.new(label), index)
|
295
324
|
end
|
296
325
|
|
297
|
-
# Attach an existing node as the child of this node
|
326
|
+
# Attach an existing node as the child of this node.
|
298
327
|
#
|
299
|
-
#
|
300
|
-
#
|
301
|
-
#
|
302
|
-
# _node_ must be the same type as this node.
|
303
|
-
#
|
304
|
-
# If _index_ is not specified, the node is added to the end of the
|
305
|
-
# child list.
|
328
|
+
# _node_:: The node to add. It must be the same type as this node.
|
329
|
+
# _index_:: Optional insertion index. If unspecified, the node is added
|
330
|
+
# to the end of the child list.
|
306
331
|
#
|
307
332
|
# This function returns the added Node object.
|
308
333
|
def attach_child!(node, index = nil)
|
@@ -315,11 +340,9 @@ module Treebank
|
|
315
340
|
node
|
316
341
|
end
|
317
342
|
|
318
|
-
#
|
319
|
-
#
|
320
|
-
# * node ... the node to detach
|
343
|
+
# Removes the specified node from this node's child list.
|
321
344
|
#
|
322
|
-
#
|
345
|
+
# _node_:: The node to detach
|
323
346
|
def detach_child!(node)
|
324
347
|
raise "#{node} is not a child of #{self}" if @children.delete(node).nil?
|
325
348
|
end
|
@@ -331,7 +354,7 @@ module Treebank
|
|
331
354
|
|
332
355
|
# Enumerate all the nodes beneath this one breadth-first
|
333
356
|
#
|
334
|
-
#
|
357
|
+
# _visit_:: Optional enumeration control procedure
|
335
358
|
#
|
336
359
|
# The _visit_ parameter is passed down to the BFSIterator.
|
337
360
|
def each_breadth_first(visit = nil)
|
@@ -340,36 +363,43 @@ module Treebank
|
|
340
363
|
|
341
364
|
# Enumerate all the nodes beneath this one depth-first
|
342
365
|
#
|
343
|
-
#
|
366
|
+
# _visit_:: Optional enumeration control procedure
|
344
367
|
#
|
345
368
|
# The _visit_ parameter is passed down to the DFSIterator.
|
346
369
|
def each_depth_first(visit = nil)
|
347
370
|
DFSIterator.new(self, visit)
|
348
371
|
end
|
349
372
|
|
350
|
-
#
|
373
|
+
# A leaf node has no children.
|
351
374
|
def leaf?
|
352
375
|
@children.empty?
|
353
376
|
end
|
354
377
|
|
355
|
-
#
|
378
|
+
# An empty node has no label and no children.
|
356
379
|
def empty?
|
357
380
|
@label.nil? and @children.empty?
|
358
381
|
end
|
359
382
|
|
360
|
-
#
|
383
|
+
# A preterminal node dominates a single leaf node.
|
384
|
+
def preterminal?
|
385
|
+
@children.length == 1 and @children.first.leaf?
|
386
|
+
end
|
387
|
+
|
388
|
+
# Return all the leaf nodes beneath this node.
|
361
389
|
#
|
362
|
-
#
|
390
|
+
# _block_:: An optional block to run on each leaf
|
363
391
|
def leaves(&block)
|
364
392
|
leaves = each_depth_first.find_all {|node| node.leaf?}
|
365
393
|
leaves = leaves.collect {|leaf| block.call(leaf)} if not block.nil?
|
366
394
|
leaves
|
367
395
|
end
|
368
396
|
|
397
|
+
protected :multiline_to_s
|
398
|
+
|
369
399
|
end # Node
|
370
400
|
|
371
401
|
|
372
|
-
# A Node in a Tree that can locate its parent
|
402
|
+
# A Node in a Tree that can locate its parent.
|
373
403
|
#
|
374
404
|
# The ParentedNode adds a pointer back to the parent node to
|
375
405
|
# the Node class.
|
@@ -382,14 +412,12 @@ module Treebank
|
|
382
412
|
class ParentIterator
|
383
413
|
include Enumerable
|
384
414
|
|
385
|
-
#
|
386
|
-
#
|
387
|
-
# * node ... the start node of the enumeration
|
415
|
+
# _node_:: The start node of the enumeration
|
388
416
|
def initialize(node)
|
389
417
|
@node = node
|
390
418
|
end
|
391
419
|
|
392
|
-
# Enumerate the ancestor chain
|
420
|
+
# Enumerate the ancestor chain.
|
393
421
|
def each
|
394
422
|
node = @node
|
395
423
|
while not node.nil?
|
@@ -400,11 +428,12 @@ module Treebank
|
|
400
428
|
|
401
429
|
end # ParentIterator
|
402
430
|
|
403
|
-
#
|
431
|
+
# Create a node specifying its parent, its label, and its children's
|
432
|
+
# labels.
|
404
433
|
#
|
405
|
-
#
|
406
|
-
#
|
407
|
-
#
|
434
|
+
# _label_:: The label of this node
|
435
|
+
# _child_labels_:: List of labels for children of this node
|
436
|
+
# _parent_:: The parent of this node
|
408
437
|
def initialize(label = nil, child_labels = [], parent = nil)
|
409
438
|
super(label, child_labels)
|
410
439
|
@parent = parent
|
@@ -423,17 +452,15 @@ module Treebank
|
|
423
452
|
node.parent = nil
|
424
453
|
end
|
425
454
|
|
426
|
-
# Set the parent of this node
|
427
|
-
#
|
428
|
-
# * parent ... the parent node
|
455
|
+
# Set the parent of this node. This does not change the child list of
|
456
|
+
# _parent_.
|
429
457
|
#
|
430
|
-
#
|
431
|
-
# child list of _parent_.
|
458
|
+
# _parent_:: The parent node
|
432
459
|
def parent=(parent)
|
433
460
|
@parent = parent
|
434
461
|
end
|
435
462
|
|
436
|
-
# Enumerate the ancestors of this node
|
463
|
+
# Enumerate the ancestors of this node.
|
437
464
|
def each_parent
|
438
465
|
ParentIterator.new(self)
|
439
466
|
end
|
data/test/test_treebank.rb
CHANGED
@@ -155,19 +155,36 @@ module NodeTestMixin
|
|
155
155
|
|
156
156
|
# Read from/to a string
|
157
157
|
def test_stringify
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
(NP
|
162
|
-
(D (the))
|
163
|
-
(N (boy)))
|
164
|
-
(VP
|
165
|
-
(V (ran))))'
|
166
|
-
t = @node_class.from_s(s)
|
158
|
+
singleline = "(S (NP (D the) (N boy)) (VP (V ran)))"
|
159
|
+
multiline = "(S\n (NP (D the) (N boy))\n (VP (V ran)))"
|
160
|
+
t = @node_class.from_s(singleline)
|
167
161
|
assert_kind_of @node_class, t, 'from_s'
|
168
|
-
assert_equal
|
169
|
-
|
170
|
-
assert_equal
|
162
|
+
assert_equal t.to_s, t.to_s(true), 'to_s == to_s(true)'
|
163
|
+
assert_equal multiline, t.to_s(true), 'to_s(true)'
|
164
|
+
assert_equal singleline, t.to_s(false), 'to_s(false)'
|
165
|
+
m = @node_class.from_s(multiline)
|
166
|
+
assert_equal t, m, 'Single-/multi-line initialization equal'
|
167
|
+
end
|
168
|
+
|
169
|
+
# Read to and from a typical Wall Street Journal treebank string
|
170
|
+
def test_wsj_sentence_stringify
|
171
|
+
s =
|
172
|
+
"( (S
|
173
|
+
(NP-SBJ (CD Two) (VBG leading) (NN constitutional-law) (NNS experts))
|
174
|
+
(VP (VBD said)
|
175
|
+
(SBAR (-NONE- 0)
|
176
|
+
(S
|
177
|
+
(NP-SBJ (NNP President) (NNP Bush))
|
178
|
+
(VP (VBZ does) (RB n't)
|
179
|
+
(VP (VB have)
|
180
|
+
(NP (DT the) (JJ legal) (NN authority)
|
181
|
+
(S
|
182
|
+
(NP-SBJ (-NONE- *))
|
183
|
+
(VP (TO to)
|
184
|
+
(VP (VB exercise)
|
185
|
+
(NP (DT a) (JJ line-item) (NN veto)))))))))))
|
186
|
+
(. .)))"
|
187
|
+
assert_equal s, @node_class.from_s(s).to_s, 'Stringify WSJ sentence'
|
171
188
|
end
|
172
189
|
|
173
190
|
# Simple enumeration
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.9.2
|
3
3
|
specification_version: 1
|
4
4
|
name: treebank
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 2.0.0
|
7
|
-
date: 2007-
|
6
|
+
version: 2.1.0
|
7
|
+
date: 2007-12-11 00:00:00 -08:00
|
8
8
|
summary: Treebank implements support for ordered n-ary branching tree structures
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- W.P. McNeill
|
30
31
|
files:
|