opener-constituent-parser-nl 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ea12eef17f8f3e0bbf6cdeb7d7f85c1708bdf363
4
+ data.tar.gz: 436a65c189c21f054b8105393b1a6f4238780907
5
+ SHA512:
6
+ metadata.gz: 16d4fff1cb2d8eb1f26f008e8af562e1bbeb0e1d2455f3089e276e6d3f1804c524321d1f107e5820f32187a3d6ae5f65d6d8c3bad97c693f93b9c0a9107fe232
7
+ data.tar.gz: 47c9c43d790a5bc9bc90dbb6833437021f9015ff976efceead9056dd56269fbedfa5ffde3b42d7f3d0e303eabc524c958517445ad144e15c05a68611122367f4
data/README.md ADDED
@@ -0,0 +1,41 @@
1
+ [![Build Status](https://drone.io/github.com/opener-project/constituent-parser-nl/status.png)](https://drone.io/github.com/opener-project/constituent-parser-nl/latest)
2
+
3
+ Constituent-parser-nl
4
+ =======
5
+
6
+ Introduction
7
+ ------------
8
+
9
+ This is a parser for Dutch text using the Alpino parser (http://www.let.rug.nl/vannoord/alp/Alpino/). The input for this module has to be a valid
10
+ KAF file with at least the text layer. The output will be the constituent trees in pennTreebank format for each of the sentences in the input KAF.
11
+ The tokenization and sentence splitting are taken from the input KAF file, so if your input file has a wrong tokenization/splitting, the output could
12
+ contain errors. The number of output constituent trees will be exactly the same as the number of sentences in your input KAF
13
+
14
+ Requirements
15
+ -----------
16
+ * VUKafParserPy: parser in python for KAF files (https://github.com/opener-project/VU-kaf-parser)
17
+ * lxml: library for processing xml in python
18
+ * Alpino parser: http://www.let.rug.nl/vannoord/alp/Alpino/
19
+
20
+ Installation
21
+ -----------
22
+ Clone the repository to your local machine and set the variable ALPINO_HOME in the file core/alpino_parser.py
23
+ to point to your local folder of the Alpino parser.
24
+
25
+ How to run the module with Python
26
+ ---------------------------------
27
+
28
+ You can run this module from the command line using Python. The main script is core/alpino_parser.py. This script reads the KAF from the standard input
29
+ and writes the output to the standard output, generating some log information in the standard error output. To process one file just run:
30
+ ````shell
31
+ cat input.kaf | core/alpino_parser.py > input.tree
32
+ ````
33
+
34
+ This will read the KAF file in "input.kaf" and will store the constituent trees in "input.tree"
35
+
36
+
37
+ Contact
38
+ ------
39
+ * Ruben Izquierdo
40
+ * Vrije University of Amsterdam
41
+ * ruben.izquierdobevia@vu.nl
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby

require_relative '../lib/opener/constituent_parsers/nl'

# Input is only consumed when something is piped into this process; on an
# interactive terminal (STDIN.tty? is true) the kernel is run without input.
input = STDIN.tty? ? nil : STDIN.read

kernel = Opener::ConstituentParsers::NL.new(:args => ARGV)

# The kernel hands back three values; only the captured standard output is
# echoed by this launcher.
stdout, stderr, process = kernel.run(input)

puts stdout
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import getopt
5
+ import os
6
+
7
+ this_folder = os.path.dirname(os.path.realpath(__file__))
8
+
9
+ # This updates the load path to ensure that the local site-packages directory
10
+ # can be used to load packages (e.g. a locally installed copy of lxml).
11
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
12
+
13
+ import codecs
14
+ from VUKafParserPy import KafParser
15
+ from lxml import etree
16
+ import tempfile
17
+ from subprocess import Popen,PIPE
18
+ import shutil
19
+ import glob
20
+ import logging
21
+
22
+ from convert_penn_to_kaf import convert_penn_to_kaf_with_numtokens
23
+ ## LAST CHANGES ##
24
+ # 20-dec-2013: modified to generate KAF output
25
+ # 15-jan-2014: order in alpino XML does not match the order of tokens
26
+ # so the label "begin" in the xml is used to know which is the number of token of each <node>
27
+
28
+
29
+ last_modified='21Jan2014'
30
+ version="1.4"
31
+ this_name = 'alpino kaf constituency parser'
32
+ this_layer = 'constituents'
33
+
34
+ #### SET THIS VARIABLE TO YOUR LOCAL FOLDER OF ALPINO
35
+ ALPINO_HOME = os.environ['ALPINO_HOME']
36
+
37
+ logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)
38
+
39
+ __module_dir = os.path.dirname(__file__)
40
+
41
+ ## Function to convert to penn treebank bracketed format
42
def node_to_penn(node):
    """Convert an Alpino XML ``<node>`` into a Penn Treebank bracketed string.

    A node without children that has a ``word`` attribute becomes
    ``(POS[=H] N#word)``: ``POS`` is its ``pos`` attribute, ``=H`` is added
    when its ``rel`` attribute is ``hd`` and ``N`` is its ``begin`` attribute
    (the token number, needed later because the order of the nodes in the
    Alpino XML does not match the order of the tokens). A childless node
    without ``word`` yields the empty string. Any other node becomes
    ``(CAT ...)`` with its ``cat`` attribute followed by the conversion of
    all its children.

    Literal parentheses inside a word are replaced by ``-LRB-`` and ``-RRB-``
    so that they cannot be mistaken for tree brackets.

    :param node: element offering ``getchildren()`` and ``get(name, default)``.
    :return: the bracketed representation of ``node``.
    """
    children = node.getchildren()
    if children:
        # Non terminal: category label followed by all converted children
        inner = ''.join(node_to_penn(child) for child in children)
        return '(' + node.get('cat') + ' ' + inner + ')'

    word = node.get('word', None)
    if word is None:
        return ''

    # Both escapes need the trailing dash ("-LRB-" / "-RRB-")
    word = word.replace('(', '-LRB-').replace(')', '-RRB-')

    # The attribute begin gives the number of the token
    word = node.get('begin') + '#' + word

    # Only a Python 2 unicode object needs encoding; a native str is kept
    # untouched, which also avoids mixing text and bytes on Python 3.
    if not isinstance(word, str):
        word = word.encode('utf-8')

    head = '=H' if node.get('rel') == 'hd' else ''
    return '(' + node.get('pos') + head + ' ' + word + ')'
68
+
69
+
70
def xml_to_penn(filename):
    """Read an Alpino XML file and return its tree in Penn Treebank format.

    The file is always decoded as UTF-8, whatever encoding its XML header
    declares: under certain conditions Alpino writes iso-8859-1 in the header
    although the real encoding is UTF-8 (known Alpino bug).

    :param filename: path of the XML file written by Alpino.
    :return: bracketed string built by ``node_to_penn`` from the first
        ``node`` element found under the document root.
    """
    utf8_parser = etree.XMLParser(encoding='UTF-8')
    document = etree.parse(filename, utf8_parser)
    return node_to_penn(document.find('node'))
80
+
81
# The KAF document has to arrive through a pipe: when standard input is an
# interactive terminal the usage is shown and the script stops.
if sys.stdin.isatty():
    print>>sys.stderr,'Input stream required in KAF format at least with the text layer.'
    print>>sys.stderr,'The language encoded in the KAF has to be Dutch, otherwise it will raise an error.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    sys.exit(-1)

# The only recognised option is --no-time, which turns my_time_stamp off.
# A command line that getopt rejects is ignored on purpose.
my_time_stamp = True
try:
    opts, args = getopt.getopt(sys.argv[1:],"",["no-time"])
except getopt.GetoptError:
    pass
else:
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
98
+
99
+
100
logging.debug('Loading and parsing KAF file ...')
my_kaf = KafParser(sys.stdin)

lang = my_kaf.getLanguage()
if lang != 'nl':
    print>>sys.stdout,'ERROR! Language is ',lang,' and must be nl (Dutch)'
    sys.exit(-1)

logging.debug('Extracting sentences from the KAF')

# sentences[i] holds the tokens of the i-th sentence and term_ids[i] the term
# identifier linked to each of those tokens, in the same order.
sentences = []
term_ids = []
current_sent = []
current_sent_tid = []

lemma_for_termid = {}
termid_for_token = {}

for term in my_kaf.getTerms():
    term_id = term.getId()
    lemma_for_termid[term_id] = term.getLemma()
    for token_id in term.get_list_span():
        termid_for_token[token_id] = term_id


previous_sent = None
for token,sent,token_id in my_kaf.getTokens():
    ##To avoid using tokens that have no term linked
    if token_id not in termid_for_token:
        continue

    # A change of sentence identifier closes the sentence collected so far
    if previous_sent is not None and sent != previous_sent:
        sentences.append(current_sent)
        term_ids.append(current_sent_tid)
        current_sent = []
        current_sent_tid = []

    current_sent.append(token)
    current_sent_tid.append(termid_for_token[token_id])
    previous_sent = sent

# Do not lose the last sentence
if current_sent:
    sentences.append(current_sent)
    term_ids.append(current_sent_tid)
143
+
144
+
145
out_folder_alp = tempfile.mkdtemp()


logging.debug('Calling to Alpino parser in '+ALPINO_HOME)
logging.debug('Temporary folder: '+out_folder_alp)


## CALL TO ALPINO
alpino_bin = os.path.join(ALPINO_HOME,'bin','Alpino')
# The command is given as an argument list and run without a shell, so that
# folders containing spaces or shell metacharacters reach Alpino unchanged.
cmd = [alpino_bin,'end_hook=xml','-flag','treebank',out_folder_alp,'-parse']
alpino_pro = Popen(cmd,stdout=PIPE,stdin=PIPE,stderr=PIPE)

# One line per sentence, every token followed by a blank. The characters
# [ ] and | are escaped with a backslash before being sent to Alpino.
alpino_lines = []
for sentence in sentences:
    pieces = []
    for token in sentence:
        token = token.replace('[','\\[')
        token = token.replace(']','\\]')
        token = token.replace('|','\\|')
        pieces.append(token.encode('utf-8')+' ')
    alpino_lines.append(''.join(pieces)+'\n')

# communicate() sends the input, drains stdout and stderr at the same time
# and returns only when Alpino has finished, so the XML files are complete
# afterwards. Writing everything to stdin first and reading stderr later can
# block forever as soon as Alpino fills one of the pipes (its stdout was
# piped but never read).
alpino_stdout, error_log = alpino_pro.communicate(''.join(alpino_lines))
175
+
176
+
177
+ ## There should be as many files as number of sentences in the KAF
178
+
179
const = etree.Element('constituency')

# Running counters for terminal, non-terminal and edge identifiers; they are
# threaded through every conversion call to keep the ids unique in the layer.
cnt_t = cnt_nt = cnt_edge = 0
some_error = False
try:
    # The parse of sentence number N (starting at 1) is looked up in N.xml
    for num_sent, sent_term_ids in enumerate(term_ids, 1):
        xml_file = os.path.join(out_folder_alp,str(num_sent)+'.xml')
        if os.path.exists(xml_file):
            logging.debug('Converting alpino XML to pennTreebank, sentence num '+str(num_sent))
            penn_str = xml_to_penn(xml_file)
            tree_node,cnt_t,cnt_nt,cnt_edge = convert_penn_to_kaf_with_numtokens(penn_str,sent_term_ids,logging,lemma_for_termid,cnt_t,cnt_nt,cnt_edge)
        else:
            # No output for this sentence: keep an empty tree so that the
            # number of trees still equals the number of sentences.
            tree_node = etree.Element('tree') #empty
            some_error = True
        const.append(tree_node)

    if some_error:
        print>>sys.stderr,'POSSIBLE ERROR',error_log
        value = -1
    else:
        value = 0

    my_kaf.tree.getroot().append(const)
    my_kaf.addLinguisticProcessor(this_name, version+'_'+last_modified, this_layer, my_time_stamp)
    my_kaf.saveToFile(sys.stdout)


    logging.debug('Number of sentences in the input KAF: '+str(len(sentences)))
    logging.debug('PROCESS DONE')
finally:
    ##Remove temporary stuff, also when the conversion or the output fails
    shutil.rmtree(out_folder_alp)

sys.exit(value)
@@ -0,0 +1,161 @@
1
+ from lxml import etree
2
+ from tree import Tree
3
+ import logging
4
+
5
+
6
+
7
+ ## will be used as global variables to generate recursively the KAF constituent nodes
8
+ NOTER='nonter'
9
+ TER='ter'
10
+ EDGE='edge'
11
+ noter_cnt=0
12
+ ter_cnt=0
13
+ edge_cnt=0
14
+
15
+ ##This function generates a "tree" xml element as defined in KAF from a string containing
16
+ ##the penntreebank format and a list of term ids to do the linking
17
+ '''
18
+ s = '(S (NP (DET The) (NN dog)) (VP (V ate) (NP (DET the) (NN cat))) (. .))'
19
+ ids = ['t0 t1','t2','t3','t4','t5','t6']
20
+ tree_node = create_constituency_layer(s, ids)
21
+ e = etree.ElementTree(element=tree_node)
22
+ e.write(sys.stdout,pretty_print=True)
23
+ '''
24
+
25
+ list_t = []
26
+ cnt_t = 0
27
+ list_nt = []
28
+ cnt_nt =0
29
+ list_edge = []
30
+ cnt_edge =0
31
+
32
def convert_penn_to_kaf_with_numtokens(tree_str,term_ids,logging,lemma_for_termid,off_t=0,off_nt=0,off_edge=0):
    """Build a KAF ``<tree>`` element from a Penn Treebank string with numbered leaves.

    :param tree_str: bracketed tree whose leaves have the form ``N#token``;
        ``N`` is the position in ``term_ids`` of the term linked to the leaf
        (the token text itself is not used).
    :param term_ids: list with the term identifiers of the sentence.
    :param logging: object offering ``debug(message)``, e.g. the ``logging``
        module. ``None`` switches the debug output off.
    :param lemma_for_termid: dict from term identifier to lemma; only used for
        the debug messages and for the XML comment written before each edge.
    :param off_t: first number for the terminal identifiers.
    :param off_nt: first number for the non-terminal identifiers.
    :param off_edge: first number for the edge identifiers.
    :return: tuple ``(tree, cnt_t, cnt_nt, cnt_edge)``: the new element and the
        next free number of every counter, to be used as offsets for the next
        sentence.
    """
    global list_t, list_nt,list_edge,cnt_t, cnt_nt, cnt_edge
    list_t = []
    list_nt = []
    list_edge = []
    cnt_t = off_t
    cnt_nt = off_nt
    cnt_edge = off_edge

    this_tree = Tree(tree_str)
    if logging is not None:
        logging.debug('\n'+str(this_tree)) ##It has been already encoded using UTF8

    # Replace every "N#token" leaf by the identifier of its term
    for num, num_token_and_token in enumerate(this_tree.leaves()):
        p = num_token_and_token.find('#')
        num_token = int(num_token_and_token[:p])
        position = this_tree.leaf_treeposition(num)
        token_id = term_ids[num_token]
        this_tree[position] = token_id
        if logging is not None:
            # A term without lemma must not break the conversion
            lemma = lemma_for_termid.get(token_id)
            if lemma is None:
                lemma = ''
            logging.debug('Matching '+num_token_and_token+' with term id='+token_id+' according to KAF lemma='+str(lemma.encode('utf-8')))

    ##Create the ROOT
    create_extra_root = False
    nt_id = None
    if create_extra_root:
        nt_id = 'nter'+str(cnt_nt)
        cnt_nt +=1
        list_nt.append((nt_id,'ROOT'))

    # Fills list_nt, list_t and list_edge
    visit_node(this_tree, nt_id)

    root = etree.Element('tree')
    nonter_heads = set()
    #Nonter
    labels_for_nt = {}
    for nt_id, label in list_nt:
        ##Checking the head: a label ending in "=H" marks a head node
        if len(label)>=2 and label[-1]=='H' and label[-2]=='=':
            nonter_heads.add(nt_id)
            label = label[:-2]
        ele = etree.Element('nt', attrib={'id':nt_id,'label':label})
        labels_for_nt[nt_id] = label
        root.append(ele)

    ## Terminals
    lemma_for_ter = {}
    for ter_id, span_ids in list_t:
        ele = etree.Element('t',attrib={'id':ter_id})
        span = etree.Element('span')
        ele.append(span)
        for termid in span_ids.split(' '):
            target = etree.Element('target',attrib={'id':termid})
            span.append(target)
            lemma_for_ter[ter_id] = lemma_for_termid.get(termid)
        root.append(ele)

    ##Edges
    for edge_id, node_from, node_to in list_edge:
        ele = etree.Element('edge',attrib={'id':edge_id,'from':node_from,'to':node_to})

        ## For the comment
        ##Only non-ter
        label_to = labels_for_nt.get(node_to)

        ##Could be ter or nonter
        label_from = labels_for_nt.get(node_from)
        if label_from is None:
            label_from = lemma_for_ter.get(node_from)
        if label_from is None:
            # Unknown terminal or terminal without lemma
            label_from = 'kk'

        comment = ' '+(edge_id)+' '+(label_to)+' <- '+(label_from)+' '
        # "--" is not allowed inside an XML comment. One replace pass is not
        # enough for runs of three or more hyphens, so repeat until none is left.
        while '--' in comment:
            comment = comment.replace('--','-')
        if node_from in nonter_heads:
            ele.set('head','yes')
        root.append(etree.Comment(comment))
        root.append(ele)

    return root,cnt_t,cnt_nt,cnt_edge
110
+
111
+
112
+
113
def visit_node(node,id_parent=None):
    """Walk a tree recursively, recording KAF terminals, non-terminals and edges.

    Results are accumulated in the module level lists ``list_t``, ``list_nt``
    and ``list_edge``; identifiers are numbered with the module level counters
    ``cnt_t``, ``cnt_nt`` and ``cnt_edge``, which are increased on every use.

    :param node: a ``str`` leaf (stored as terminal ``terN``) or a tree node
        with a ``node`` label that can be iterated over its children (stored
        as non-terminal ``nterN``).
    :param id_parent: identifier of the parent non-terminal, or ``None`` for
        the root, which gets no edge.
    """
    global list_t, list_nt,list_edge,cnt_t, cnt_nt, cnt_edge

    if not isinstance(node,str):
        # Non terminal: register it
        nt_id = 'nter'+str(cnt_nt)
        cnt_nt += 1
        list_nt.append((nt_id,node.node))

        # Link it with its parent, if there is one
        if id_parent is not None:
            edge_id = 'tre'+str(cnt_edge)
            cnt_edge += 1
            list_edge.append((edge_id,nt_id,id_parent))

        # Go down into the children
        for child in node:
            visit_node(child,nt_id)
        return

    # Terminal: register it
    t_id = 'ter'+str(cnt_t)
    cnt_t += 1
    list_t.append((t_id,str(node)))

    # A terminal is always linked with its parent
    edge_id = 'tre'+str(cnt_edge)
    cnt_edge += 1
    list_edge.append((edge_id,t_id,id_parent))
141
+
142
+
143
+
144
if __name__ == '__main__':
    # Small manual demo: converts one hand-written sentence and prints the
    # resulting <tree> element.
    import sys

    s = "(S (NP (DET 0#The) (NN 1#dog)) (VP (V 2#ate) (NP (DET 3#the) (NN 4#cat))) (. 5#.))"
    ids = ['t0' ,'t1','t2','t3','t4','t5']
    t = {
        't0': 'The',
        't1': 'dog',
        't2': 'ate',
        't3': 'the',
        't4': 'cat',
        't5': '.',
    }
    # The converter writes debug messages through its third argument and
    # returns (tree, cnt_t, cnt_nt, cnt_edge): pass the logging module and
    # keep only the tree element.
    root, _, _, _ = convert_penn_to_kaf_with_numtokens(s,ids,logging,t)
    etree.ElementTree(element=root).write(sys.stdout,pretty_print=1)
157
+
158
+
159
+
160
+
161
+
Binary file
Binary file
data/core/tree.py ADDED
@@ -0,0 +1,1438 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Natural Language Toolkit: Text Trees
3
+ #
4
+ # Copyright (C) 2001-2012 NLTK Project
5
+ # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
6
+ # Steven Bird <sb@csse.unimelb.edu.au>
7
+ # Peter Ljunglöf <peter.ljunglof@gu.se>
8
+ # Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms)
9
+ # URL: <http://www.nltk.org/>
10
+ # For license information, see LICENSE.TXT
11
+
12
+ """
13
+ Class for representing hierarchical language structures, such as
14
+ syntax trees and morphological trees.
15
+ """
16
+
17
+ # TODO: add LabelledTree (can be used for dependency trees)
18
+
19
+ import re
20
+ import string
21
+
22
+ ######################################################################
23
+ ## Trees
24
+ ######################################################################
25
+
26
+ class Tree(list):
27
+ """
28
+ A Tree represents a hierarchical grouping of leaves and subtrees.
29
+ For example, each constituent in a syntax tree is represented by a single Tree.
30
+
31
+ A tree's children are encoded as a list of leaves and subtrees,
32
+ where a leaf is a basic (non-tree) value; and a subtree is a
33
+ nested Tree.
34
+
35
+ >>> from nltk.tree import Tree
36
+ >>> print Tree(1, [2, Tree(3, [4]), 5])
37
+ (1 2 (3 4) 5)
38
+ >>> vp = Tree('VP', [Tree('V', ['saw']),
39
+ ... Tree('NP', ['him'])])
40
+ >>> s = Tree('S', [Tree('NP', ['I']), vp])
41
+ >>> print s
42
+ (S (NP I) (VP (V saw) (NP him)))
43
+ >>> print s[1]
44
+ (VP (V saw) (NP him))
45
+ >>> print s[1,1]
46
+ (NP him)
47
+ >>> t = Tree("(S (NP I) (VP (V saw) (NP him)))")
48
+ >>> s == t
49
+ True
50
+ >>> t[1][1].node = "X"
51
+ >>> print t
52
+ (S (NP I) (VP (V saw) (X him)))
53
+ >>> t[0], t[1,1] = t[1,1], t[0]
54
+ >>> print t
55
+ (S (X him) (VP (V saw) (NP I)))
56
+
57
+ The length of a tree is the number of children it has.
58
+
59
+ >>> len(t)
60
+ 2
61
+
62
+ Any other properties that a Tree defines are known as node
63
+ properties, and are used to add information about individual
64
+ hierarchical groupings. For example, syntax trees use a NODE
65
+ property to label syntactic constituents with phrase tags, such as
66
+ "NP" and "VP".
67
+
68
+ Several Tree methods use "tree positions" to specify
69
+ children or descendants of a tree. Tree positions are defined as
70
+ follows:
71
+
72
+ - The tree position *i* specifies a Tree's *i*\ th child.
73
+ - The tree position ``()`` specifies the Tree itself.
74
+ - If *p* is the tree position of descendant *d*, then
75
+ *p+i* specifies the *i*\ th child of *d*.
76
+
77
+ I.e., every tree position is either a single index *i*,
78
+ specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*,
79
+ specifying ``tree[i1][i2]...[iN]``.
80
+
81
+ Construct a new tree. This constructor can be called in one
82
+ of two ways:
83
+
84
+ - ``Tree(node, children)`` constructs a new tree with the
85
+ specified node value and list of children.
86
+
87
+ - ``Tree(s)`` constructs a new tree by parsing the string ``s``.
88
+ It is equivalent to calling the class method ``Tree.parse(s)``.
89
+ """
90
+ def __init__(self, node_or_str, children=None):
91
+ if children is None:
92
+ if not isinstance(node_or_str, basestring):
93
+ raise TypeError("%s: Expected a node value and child list "
94
+ "or a single string" % type(self).__name__)
95
+ tree = type(self).parse(node_or_str)
96
+ list.__init__(self, tree)
97
+ self.node = tree.node
98
+ elif isinstance(children, basestring):
99
+ raise TypeError("%s() argument 2 should be a list, not a "
100
+ "string" % type(self).__name__)
101
+ else:
102
+ list.__init__(self, children)
103
+ self.node = node_or_str
104
+
105
+ #////////////////////////////////////////////////////////////
106
+ # Comparison operators
107
+ #////////////////////////////////////////////////////////////
108
+
109
+ def __eq__(self, other):
110
+ if not isinstance(other, Tree): return False
111
+ return self.node == other.node and list.__eq__(self, other)
112
+ def __ne__(self, other):
113
+ return not (self == other)
114
+ def __lt__(self, other):
115
+ if not isinstance(other, Tree): return False
116
+ return self.node < other.node or list.__lt__(self, other)
117
+ def __le__(self, other):
118
+ if not isinstance(other, Tree): return False
119
+ return self.node <= other.node or list.__le__(self, other)
120
+ def __gt__(self, other):
121
+ if not isinstance(other, Tree): return True
122
+ return self.node > other.node or list.__gt__(self, other)
123
+ def __ge__(self, other):
124
+ if not isinstance(other, Tree): return False
125
+ return self.node >= other.node or list.__ge__(self, other)
126
+
127
+ #////////////////////////////////////////////////////////////
128
+ # Disabled list operations
129
+ #////////////////////////////////////////////////////////////
130
+
131
+ def __mul__(self, v):
132
+ raise TypeError('Tree does not support multiplication')
133
+ def __rmul__(self, v):
134
+ raise TypeError('Tree does not support multiplication')
135
+ def __add__(self, v):
136
+ raise TypeError('Tree does not support addition')
137
+ def __radd__(self, v):
138
+ raise TypeError('Tree does not support addition')
139
+
140
+ #////////////////////////////////////////////////////////////
141
+ # Indexing (with support for tree positions)
142
+ #////////////////////////////////////////////////////////////
143
+
144
+ def __getitem__(self, index):
145
+ if isinstance(index, (int, slice)):
146
+ return list.__getitem__(self, index)
147
+ elif isinstance(index, (list, tuple)):
148
+ if len(index) == 0:
149
+ return self
150
+ elif len(index) == 1:
151
+ return self[index[0]]
152
+ else:
153
+ return self[index[0]][index[1:]]
154
+ else:
155
+ raise TypeError("%s indices must be integers, not %s" %
156
+ (type(self).__name__, type(index).__name__))
157
+
158
+ def __setitem__(self, index, value):
159
+ if isinstance(index, (int, slice)):
160
+ return list.__setitem__(self, index, value)
161
+ elif isinstance(index, (list, tuple)):
162
+ if len(index) == 0:
163
+ raise IndexError('The tree position () may not be '
164
+ 'assigned to.')
165
+ elif len(index) == 1:
166
+ self[index[0]] = value
167
+ else:
168
+ self[index[0]][index[1:]] = value
169
+ else:
170
+ raise TypeError("%s indices must be integers, not %s" %
171
+ (type(self).__name__, type(index).__name__))
172
+
173
+ def __delitem__(self, index):
174
+ if isinstance(index, (int, slice)):
175
+ return list.__delitem__(self, index)
176
+ elif isinstance(index, (list, tuple)):
177
+ if len(index) == 0:
178
+ raise IndexError('The tree position () may not be deleted.')
179
+ elif len(index) == 1:
180
+ del self[index[0]]
181
+ else:
182
+ del self[index[0]][index[1:]]
183
+ else:
184
+ raise TypeError("%s indices must be integers, not %s" %
185
+ (type(self).__name__, type(index).__name__))
186
+
187
+ #////////////////////////////////////////////////////////////
188
+ # Basic tree operations
189
+ #////////////////////////////////////////////////////////////
190
+
191
+ def leaves(self):
192
+ """
193
+ Return the leaves of the tree.
194
+
195
+ >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
196
+ >>> t.leaves()
197
+ ['the', 'dog', 'chased', 'the', 'cat']
198
+
199
+ :return: a list containing this tree's leaves.
200
+ The order reflects the order of the
201
+ leaves in the tree's hierarchical structure.
202
+ :rtype: list
203
+ """
204
+ leaves = []
205
+ for child in self:
206
+ if isinstance(child, Tree):
207
+ leaves.extend(child.leaves())
208
+ else:
209
+ leaves.append(child)
210
+ return leaves
211
+
212
+ def flatten(self):
213
+ """
214
+ Return a flat version of the tree, with all non-root non-terminals removed.
215
+
216
+ >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
217
+ >>> print t.flatten()
218
+ (S the dog chased the cat)
219
+
220
+ :return: a tree consisting of this tree's root connected directly to
221
+ its leaves, omitting all intervening non-terminal nodes.
222
+ :rtype: Tree
223
+ """
224
+ return Tree(self.node, self.leaves())
225
+
226
+ def height(self):
227
+ """
228
+ Return the height of the tree.
229
+
230
+ >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
231
+ >>> t.height()
232
+ 5
233
+ >>> print t[0,0]
234
+ (D the)
235
+ >>> t[0,0].height()
236
+ 2
237
+
238
+ :return: The height of this tree. The height of a tree
239
+ containing no children is 1; the height of a tree
240
+ containing only leaves is 2; and the height of any other
241
+ tree is one plus the maximum of its children's
242
+ heights.
243
+ :rtype: int
244
+ """
245
+ max_child_height = 0
246
+ for child in self:
247
+ if isinstance(child, Tree):
248
+ max_child_height = max(max_child_height, child.height())
249
+ else:
250
+ max_child_height = max(max_child_height, 1)
251
+ return 1 + max_child_height
252
+
253
+ def treepositions(self, order='preorder'):
254
+ """
255
+ >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
256
+ >>> t.treepositions() # doctest: +ELLIPSIS
257
+ [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...]
258
+ >>> for pos in t.treepositions('leaves'):
259
+ ... t[pos] = t[pos][::-1].upper()
260
+ >>> print t
261
+ (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC))))
262
+
263
+ :param order: One of: ``preorder``, ``postorder``, ``bothorder``,
264
+ ``leaves``.
265
+ """
266
+ positions = []
267
+ if order in ('preorder', 'bothorder'): positions.append( () )
268
+ for i, child in enumerate(self):
269
+ if isinstance(child, Tree):
270
+ childpos = child.treepositions(order)
271
+ positions.extend((i,)+p for p in childpos)
272
+ else:
273
+ positions.append( (i,) )
274
+ if order in ('postorder', 'bothorder'): positions.append( () )
275
+ return positions
276
+
277
+ def subtrees(self, filter=None):
278
+ """
279
+ Generate all the subtrees of this tree, optionally restricted
280
+ to trees matching the filter function.
281
+
282
+ >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
283
+ >>> for s in t.subtrees(lambda t: t.height() == 2):
284
+ ... print s
285
+ (D the)
286
+ (N dog)
287
+ (V chased)
288
+ (D the)
289
+ (N cat)
290
+
291
+ :type filter: function
292
+ :param filter: the function to filter all local trees
293
+ """
294
+ if not filter or filter(self):
295
+ yield self
296
+ for child in self:
297
+ if isinstance(child, Tree):
298
+ for subtree in child.subtrees(filter):
299
+ yield subtree
300
+
301
def productions(self):
    """
    Generate the productions that correspond to the non-terminal nodes of the tree.
    For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
    form P -> C1 C2 ... Cn.

        >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
        >>> t.productions()
        [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased',
        NP -> D N, D -> 'the', N -> 'cat']

    :raise TypeError: if the node label of this tree is not a string.
    :rtype: list(Production)
    """
    # NOTE(review): Production, Nonterminal and _child_names are not imported
    # in the part of this file that is visible here -- confirm that they are
    # defined before relying on this method.

    if not isinstance(self.node, basestring):
        # Call form of raise: same behaviour as the old "raise E, msg"
        # statement, but also accepted by the Python 3 parser.
        raise TypeError('Productions can only be generated from trees having node labels that are strings')

    prods = [Production(Nonterminal(self.node), _child_names(self))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions()
    return prods
323
+
324
+ def pos(self):
325
+ """
326
+ Return a sequence of pos-tagged words extracted from the tree.
327
+
328
+ >>> t = Tree("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
329
+ >>> t.pos()
330
+ [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]
331
+
332
+ :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
333
+ The order reflects the order of the leaves in the tree's hierarchical structure.
334
+ :rtype: list(tuple)
335
+ """
336
+ pos = []
337
+ for child in self:
338
+ if isinstance(child, Tree):
339
+ pos.extend(child.pos())
340
+ else:
341
+ pos.append((child, self.node))
342
+ return pos
343
+
344
+ def leaf_treeposition(self, index):
345
+ """
346
+ :return: The tree position of the ``index``-th leaf in this
347
+ tree. I.e., if ``tp=self.leaf_treeposition(i)``, then
348
+ ``self[tp]==self.leaves()[i]``.
349
+
350
+ :raise IndexError: If this tree contains fewer than ``index+1``
351
+ leaves, or if ``index<0``.
352
+ """
353
+ if index < 0: raise IndexError('index must be non-negative')
354
+
355
+ stack = [(self, ())]
356
+ while stack:
357
+ value, treepos = stack.pop()
358
+ if not isinstance(value, Tree):
359
+ if index == 0: return treepos
360
+ else: index -= 1
361
+ else:
362
+ for i in range(len(value)-1, -1, -1):
363
+ stack.append( (value[i], treepos+(i,)) )
364
+
365
+ raise IndexError('index must be less than or equal to len(self)')
366
+
367
+ def treeposition_spanning_leaves(self, start, end):
368
+ """
369
+ :return: The tree position of the lowest descendant of this
370
+ tree that dominates ``self.leaves()[start:end]``.
371
+ :raise ValueError: if ``end <= start``
372
+ """
373
+ if end <= start:
374
+ raise ValueError('end must be greater than start')
375
+ # Find the tree positions of the start & end leaves, and
376
+ # take the longest common subsequence.
377
+ start_treepos = self.leaf_treeposition(start)
378
+ end_treepos = self.leaf_treeposition(end-1)
379
+ # Find the first index where they mismatch:
380
+ for i in range(len(start_treepos)):
381
+ if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
382
+ return start_treepos[:i]
383
+ return start_treepos
384
+
385
+ #////////////////////////////////////////////////////////////
386
+ # Transforms
387
+ #////////////////////////////////////////////////////////////
388
+
389
+ def chomsky_normal_form(self, factor = "right", horzMarkov = None, vertMarkov = 0, childChar = "|", parentChar = "^"):
390
+ """
391
+ This method can modify a tree in three ways:
392
+
393
+ 1. Convert a tree into its Chomsky Normal Form (CNF)
394
+ equivalent -- Every subtree has either two non-terminals
395
+ or one terminal as its children. This process requires
396
+ the creation of more"artificial" non-terminal nodes.
397
+ 2. Markov (vertical) smoothing of children in new artificial
398
+ nodes
399
+ 3. Horizontal (parent) annotation of nodes
400
+
401
+ :param factor: Right or left factoring method (default = "right")
402
+ :type factor: str = [left|right]
403
+ :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings)
404
+ :type horzMarkov: int | None
405
+ :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation)
406
+ :type vertMarkov: int | None
407
+ :param childChar: A string used in construction of the artificial nodes, separating the head of the
408
+ original subtree from the child nodes that have yet to be expanded (default = "|")
409
+ :type childChar: str
410
+ :param parentChar: A string used to separate the node representation from its vertical annotation
411
+ :type parentChar: str
412
+ """
413
+ from treetransforms import chomsky_normal_form
414
+ chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
415
+
416
+ def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"):
417
+ """
418
+ This method modifies the tree in three ways:
419
+
420
+ 1. Transforms a tree in Chomsky Normal Form back to its
421
+ original structure (branching greater than two)
422
+ 2. Removes any parent annotation (if it exists)
423
+ 3. (optional) expands unary subtrees (if previously
424
+ collapsed with collapseUnary(...) )
425
+
426
+ :param expandUnary: Flag to expand unary or not (default = True)
427
+ :type expandUnary: bool
428
+ :param childChar: A string separating the head node from its children in an artificial node (default = "|")
429
+ :type childChar: str
430
+ :param parentChar: A sting separating the node label from its parent annotation (default = "^")
431
+ :type parentChar: str
432
+ :param unaryChar: A string joining two non-terminals in a unary production (default = "+")
433
+ :type unaryChar: str
434
+ """
435
+ from treetransforms import un_chomsky_normal_form
436
+ un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
437
+
438
def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"):
    """
    Merge unary productions (subtrees with a single child) into one
    new non-terminal (Tree node) whose value is joined by 'joinChar'.

    This helps with algorithms that do not allow unary productions,
    where simply dropping the unary productions would lose useful
    information.  The work is delegated to
    ``treetransforms.collapse_unary``; the Tree is modified directly
    and this method returns no value.

    :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
                        Part-of-Speech tags) since they are always unary productions
    :type  collapsePOS: bool
    :param collapseRoot: 'False' (default) will not modify the root production
                         if it is unary.  For the Penn WSJ treebank corpus, this corresponds
                         to the TOP -> productions.
    :type collapseRoot: bool
    :param joinChar: A string used to connect collapsed node values (default = "+")
    :type  joinChar: str
    """
    # Imported inside the method, as in the rest of these wrappers.
    from treetransforms import collapse_unary as _collapse
    _collapse(self, collapsePOS, collapseRoot, joinChar)
459
+
460
+ #////////////////////////////////////////////////////////////
461
+ # Convert, copy
462
+ #////////////////////////////////////////////////////////////
463
+
464
@classmethod
def convert(cls, tree):
    """
    Rebuild ``tree`` as an instance of ``cls``, converting every
    subtree recursively.  This is how a tree is moved between the
    different subtypes of Tree.

    :type tree: Tree
    :param tree: The tree that should be converted.  A value that is
        not a ``Tree`` is handed back unchanged.
    :return: The new tree of type ``cls``, or ``tree`` itself when it
        is not a ``Tree``.
    """
    if not isinstance(tree, Tree):
        # Leaves (and any other non-tree value) pass through untouched.
        return tree
    converted = []
    for subtree in tree:
        converted.append(cls.convert(subtree))
    return cls(tree.node, converted)
479
+
480
def copy(self, deep=False):
    """
    Return a copy of this tree, built with this tree's own class.

    :param deep: If false (default), the new tree is constructed from
        this tree's node value and its existing children.  If true,
        the copy is produced by ``convert``, which rebuilds every
        subtree as well.
    :type deep: bool
    """
    tree_class = type(self)
    if deep:
        return tree_class.convert(self)
    return tree_class(self.node, self)
483
+
484
def _frozen_class(self):
    """Return the class that ``freeze()`` converts this tree into."""
    return ImmutableTree
485
def freeze(self, leaf_freezer=None):
    """
    Return a copy of this tree converted to the class given by
    ``self._frozen_class()``.

    :param leaf_freezer: Optional function applied to every leaf of a
        deep copy of this tree before the conversion; each leaf is
        replaced by the function's return value.  If None (default),
        the tree is converted as-is.
    :return: The converted copy.  ``hash()`` is called on it before
        it is returned, so unhashable leaves surface as an error here.
    """
    target_class = self._frozen_class()
    if leaf_freezer is not None:
        # Work on a deep copy so the original leaves stay untouched.
        thawed = self.copy(deep=True)
        for leaf_pos in thawed.treepositions('leaves'):
            thawed[leaf_pos] = leaf_freezer(thawed[leaf_pos])
        frozen = target_class.convert(thawed)
    else:
        frozen = target_class.convert(self)
    hash(frozen) # Make sure the leaves are hashable.
    return frozen
496
+
497
+ #////////////////////////////////////////////////////////////
498
+ # Parsing
499
+ #////////////////////////////////////////////////////////////
500
+
501
@classmethod
def parse(cls, s, brackets='()', parse_node=None, parse_leaf=None,
          node_pattern=None, leaf_pattern=None,
          remove_empty_top_bracketing=False):
    """
    Parse a bracketed tree string and return the resulting tree.
    Trees are represented as nested bracketings, such as::

      (S (NP (NNP John)) (VP (V runs)))

    :type s: str
    :param s: The string to parse

    :type brackets: str (length=2)
    :param brackets: The bracket characters used to mark the
        beginning and end of trees and subtrees.

    :type parse_node: function
    :type parse_leaf: function
    :param parse_node, parse_leaf: If specified, these functions
        are applied to the substrings of ``s`` corresponding to
        nodes and leaves (respectively) to obtain the values for
        those nodes and leaves.  They should have the following
        signature:

           parse_node(str) -> value

        For example, these functions could be used to parse nodes
        and leaves whose values should be some type other than
        string (such as ``FeatStruct``).
        Note that by default, node strings and leaf strings are
        delimited by whitespace and brackets; to override this
        default, use the ``node_pattern`` and ``leaf_pattern``
        arguments.

    :type node_pattern: str
    :type leaf_pattern: str
    :param node_pattern, leaf_pattern: Regular expression patterns
        used to find node and leaf substrings in ``s``.  By
        default, both patterns are defined to match any
        sequence of non-whitespace non-bracket characters.

    :type remove_empty_top_bracketing: bool
    :param remove_empty_top_bracketing: If the resulting tree has
        an empty node label, and is length one, then return its
        single child instead.  This is useful for treebank trees,
        which sometimes contain an extra level of bracketing.

    :return: A tree corresponding to the string representation ``s``.
        If this class method is called using a subclass of Tree,
        then it will return a tree of that type.
    :rtype: Tree
    :raise TypeError: If ``brackets`` is not a length-2 string, or
        contains whitespace.
    :raise ValueError: If ``s`` does not contain exactly one
        well-formed tree (raised through ``cls._parse_error``).
    """
    if not isinstance(brackets, basestring) or len(brackets) != 2:
        raise TypeError('brackets must be a length-2 string')
    # Raw strings keep ``\s`` a regex escape instead of relying on
    # Python leaving an unknown string escape untouched.
    if re.search(r'\s', brackets):
        raise TypeError('whitespace brackets not allowed')
    # Construct a regexp that will tokenize the string.
    open_b, close_b = brackets
    open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
    if node_pattern is None:
        node_pattern = r'[^\s%s%s]+' % (open_pattern, close_pattern)
    if leaf_pattern is None:
        leaf_pattern = r'[^\s%s%s]+' % (open_pattern, close_pattern)
    token_re = re.compile(r'%s\s*(%s)?|%s|(%s)' % (
        open_pattern, node_pattern, close_pattern, leaf_pattern))
    # Walk through each token, updating a stack of trees.
    stack = [(None, [])] # list of (node, children) tuples
    for match in token_re.finditer(s):
        token = match.group()
        # Beginning of a tree/subtree
        if token[0] == open_b:
            # A second top-level tree is not allowed.
            if len(stack) == 1 and len(stack[0][1]) > 0:
                cls._parse_error(s, match, 'end-of-string')
            node = token[1:].lstrip()
            if parse_node is not None: node = parse_node(node)
            stack.append((node, []))
        # End of a tree/subtree
        elif token == close_b:
            if len(stack) == 1:
                if len(stack[0][1]) == 0:
                    cls._parse_error(s, match, open_b)
                else:
                    cls._parse_error(s, match, 'end-of-string')
            node, children = stack.pop()
            stack[-1][1].append(cls(node, children))
        # Leaf node
        else:
            # A leaf outside of any bracket is an error.
            if len(stack) == 1:
                cls._parse_error(s, match, open_b)
            if parse_leaf is not None: token = parse_leaf(token)
            stack[-1][1].append(token)

    # check that we got exactly one complete tree.
    if len(stack) > 1:
        cls._parse_error(s, 'end-of-string', close_b)
    elif len(stack[0][1]) == 0:
        cls._parse_error(s, 'end-of-string', open_b)
    else:
        assert stack[0][0] is None
        assert len(stack[0][1]) == 1
    tree = stack[0][1][0]

    # If the tree has an extra level with node='', then get rid of
    # it.  E.g.: "((S (NP ...) (VP ...)))"
    if remove_empty_top_bracketing and tree.node == '' and len(tree) == 1:
        tree = tree[0]
    # return the tree.
    return tree
610
+
611
@classmethod
def _parse_error(cls, s, match, expecting):
    """
    Raise a ``ValueError`` carrying a friendly message that explains
    why parsing a tree string failed.  This method never returns.

    :param s: The string we're parsing.
    :param match: regexp match of the problem token, or the string
        ``'end-of-string'`` when the input ended too early.
    :param expecting: what we expected to see instead.
    """
    # Work out where the problem is and what was found there.
    if match == 'end-of-string':
        where, found = len(s), 'end-of-string'
    else:
        where, found = match.start(), match.group()
    report = '%s.parse(): expected %r but got %r\n%sat index %d.' % (
        cls.__name__, expecting, found, ' '*12, where)
    # Show an excerpt of the input with a caret under the bad token.
    excerpt = s.replace('\n', ' ').replace('\t', ' ')
    caret_col = where
    if len(excerpt) > where+10:
        # Trim everything more than ten characters past the token.
        excerpt = excerpt[:where+10]+'...'
    if where > 10:
        # Trim the left side too; the token then sits right after the
        # '...' marker plus ten characters of leading context.
        excerpt = '...'+excerpt[where-10:]
        caret_col = 13
    report += '\n%s"%s"\n%s^' % (' '*16, excerpt, ' '*(17+caret_col))
    raise ValueError(report)
636
+
637
+ #////////////////////////////////////////////////////////////
638
+ # Visualization & String Representation
639
+ #////////////////////////////////////////////////////////////
640
+
641
def draw(self):
    """
    Display a graphical diagram of this tree in a new window, by
    handing the tree to ``nltk.draw.tree.draw_trees``.
    """
    # Imported inside the method, as in the original code.
    from nltk.draw.tree import draw_trees as _draw_trees
    _draw_trees(self)
647
+
648
def __repr__(self):
    """
    Return a constructor-style representation of the form
    ``ClassName(node, [child, child, ...])``.
    """
    child_reprs = [repr(child) for child in self]
    return '%s(%r, [%s])' % (
        type(self).__name__, self.node, ", ".join(child_reprs))
651
+
652
def __str__(self):
    """Return this tree as rendered by ``pprint()`` with default arguments."""
    rendered = self.pprint()
    return rendered
654
+
655
def pprint(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
    """
    :return: A pretty-printed string representation of this tree.
    :rtype: str
    :param margin: The right margin at which to do line-wrapping.
    :type margin: int
    :param indent: The indentation level at which printing
        begins.  This number is used to decide how far to indent
        subsequent lines.
    :type indent: int
    :param nodesep: A string that is used to separate the node
        from the children (default: the empty string).  E.g.,
        ``nodesep=':'`` gives trees like
        ``(S: (NP: I) (VP: (V: saw) (NP: it)))``.
    :param parens: A pair whose first item is written before each
        node and whose second item closes each tree.
    :param quotes: If false (default), string leaves are written
        with ``%s``; otherwise they are written with ``%r``.
    """

    # Prefer the single-line rendering whenever it fits in the margin.
    one_line = self._pprint_flat(nodesep, parens, quotes)
    if len(one_line)+indent < margin:
        return one_line

    # Otherwise put the node on the first line and every child on a
    # line of its own, two columns deeper than this tree.
    if isinstance(self.node, basestring):
        head = '%s%s%s' % (parens[0], self.node, nodesep)
    else:
        head = '%s%r%s' % (parens[0], self.node, nodesep)
    child_prefix = '\n'+' '*(indent+2)
    pieces = [head]
    for child in self:
        if isinstance(child, Tree):
            rendered = child.pprint(margin, indent+2,
                                    nodesep, parens, quotes)
        elif isinstance(child, tuple):
            # Tuple leaves are written as their items joined by '/'.
            rendered = "/".join(child)
        elif isinstance(child, basestring) and not quotes:
            rendered = '%s' % child
        else:
            rendered = '%r' % child
        pieces.append(child_prefix + rendered)
    return ''.join(pieces)+parens[1]
691
+
692
def pprint_latex_qtree(self):
    r"""
    Render this tree for the LaTeX qtree package: the string
    ``\Tree`` followed by the parse tree in bracketed notation.

    For example, the following result was generated from a parse tree of
    the sentence ``The announcement astounded us``::

      \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
          [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]

    See http://www.ling.upenn.edu/advice/latex.html for the LaTeX
    style file for the qtree package.

    :return: A latex qtree representation of this tree.
    :rtype: str
    """
    # qtree opens each node with '[.' and closes it with ' ]'.
    bracketed = self.pprint(indent=6, nodesep='', parens=('[.', ' ]'))
    return r'\Tree ' + bracketed
711
+
712
def _pprint_flat(self, nodesep, parens, quotes):
    """
    Return a single-line string representation of this tree.

    :param nodesep: String written between the node and its children.
    :param parens: A pair whose first item is written before the
        node and whose second item closes the tree.
    :param quotes: If false, string leaves are written with ``%s``;
        otherwise they are written with ``%r``.
    :rtype: str
    """
    childstrs = []
    for child in self:
        if isinstance(child, Tree):
            childstrs.append(child._pprint_flat(nodesep, parens, quotes))
        elif isinstance(child, tuple):
            # Tuple leaves are written as their items joined by '/'.
            childstrs.append("/".join(child))
        elif isinstance(child, basestring) and not quotes:
            childstrs.append('%s' % child)
        else:
            childstrs.append('%r' % child)
    # String nodes are written verbatim, any other node via repr().
    if isinstance(self.node, basestring):
        template = '%s%s%s %s%s'
    else:
        template = '%s%r%s %s%s'
    # ' '.join() replaces the deprecated string.join() module function,
    # whose default separator is also a single space.
    return template % (parens[0], self.node, nodesep,
                       ' '.join(childstrs), parens[1])
729
+
730
+
731
class ImmutableTree(Tree):
    """
    A ``Tree`` whose node value and child list may not be changed
    after construction.  Every mutating list operation raises
    ``ValueError``, and the hash value is computed once in
    ``__init__`` and returned by ``__hash__``.
    """
    def __init__(self, node_or_str, children=None):
        super(ImmutableTree, self).__init__(node_or_str, children)
        # Precompute our hash value.  This ensures that we're really
        # immutable.  It also means we only have to calculate it once.
        try:
            self._hash = hash( (self.node, tuple(self)) )
        except (TypeError, ValueError):
            raise ValueError("%s: node value and children "
                             "must be immutable" % type(self).__name__)

    # Every operation that would change the child list is rejected.
    def __setitem__(self, index, value):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def __setslice__(self, i, j, value):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def __delitem__(self, index):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def __delslice__(self, i, j):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def __iadd__(self, other):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def __imul__(self, other):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def append(self, v):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def extend(self, v):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def insert(self, index, value):
        # Without this override the inherited insert() would change
        # the child list and leave the cached hash stale.
        raise ValueError('%s may not be modified' % type(self).__name__)
    def pop(self, v=None):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def remove(self, v):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def reverse(self):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def sort(self):
        raise ValueError('%s may not be modified' % type(self).__name__)
    def __hash__(self):
        # The value was computed once in __init__.
        return self._hash

    def _get_node(self):
        """Get the node value"""
        return self._node
    def _set_node(self, value):
        """
        Set the node value.  This will only succeed the first time the
        node value is set, which should occur in ImmutableTree.__init__().
        """
        # hasattr() is false until ``_node`` has been assigned once.
        if hasattr(self, 'node'):
            raise ValueError('%s may not be modified' % type(self).__name__)
        self._node = value
    node = property(_get_node, _set_node)
781
+
782
+
783
+ ######################################################################
784
+ ## Parented trees
785
+ ######################################################################
786
+
787
class AbstractParentedTree(Tree):
    """
    An abstract base class for a ``Tree`` that automatically maintains
    pointers to parent nodes.  These parent pointers are updated
    whenever any change is made to a tree's structure.  Two subclasses
    are currently defined:

      - ``ParentedTree`` is used for tree structures where each subtree
        has at most one parent.  This class should be used in cases
        where there is no "sharing" of subtrees.

      - ``MultiParentedTree`` is used for tree structures where a
        subtree may have zero or more parents.  This class should be
        used in cases where subtrees may be shared.

    Subclassing
    ===========
    The ``AbstractParentedTree`` class redefines all operations that
    modify a tree's structure to call two methods, which are used by
    subclasses to update parent information:

      - ``_setparent()`` is called whenever a new child is added.
      - ``_delparent()`` is called whenever a child is removed.
    """

    def __init__(self, node_or_str, children=None):
        super(AbstractParentedTree, self).__init__(node_or_str, children)
        # If children is None, the tree is parsed from node_or_str, and
        # all parents will be set during parsing.
        if children is not None:
            # Otherwise we have to set the parent of the children.
            # Iterate over self, and *not* children, because children
            # might be an iterator.
            # First pass: dry run only, so that every child is checked
            # before any parent pointer is actually changed.
            for i, child in enumerate(self):
                if isinstance(child, Tree):
                    self._setparent(child, i, dry_run=True)
            # Second pass: really set the parent pointers.
            for i, child in enumerate(self):
                if isinstance(child, Tree):
                    self._setparent(child, i)

    #////////////////////////////////////////////////////////////
    # Parent management
    #////////////////////////////////////////////////////////////

    def _setparent(self, child, index, dry_run=False):
        """
        Update the parent pointer of ``child`` to point to ``self``.  This
        method is only called if the type of ``child`` is ``Tree``;
        i.e., it is not called when adding a leaf to a tree.  This method
        is always called before the child is actually added to the
        child list of ``self``.

        Subclasses must override this method; the base implementation
        raises ``NotImplementedError``.

        :type child: Tree
        :type index: int
        :param index: The index of ``child`` in ``self``.
        :raise TypeError: If ``child`` is a tree with an inappropriate
            type.  Typically, if ``child`` is a tree, then its type needs
            to match the type of ``self``.  This prevents mixing of
            different tree types (single-parented, multi-parented, and
            non-parented).
        :param dry_run: If true, then don't actually set the child's
            parent pointer; just check for any error conditions, and
            raise an exception if one is found.
        """
        raise NotImplementedError()

    def _delparent(self, child, index):
        """
        Update the parent pointer of ``child`` to not point to self.  This
        method is only called if the type of ``child`` is ``Tree``; i.e., it
        is not called when removing a leaf from a tree.  This method
        is always called before the child is actually removed from the
        child list of ``self``.

        Subclasses must override this method; the base implementation
        raises ``NotImplementedError``.

        :type child: Tree
        :type index: int
        :param index: The index of ``child`` in ``self``.
        """
        raise NotImplementedError()

    #////////////////////////////////////////////////////////////
    # Methods that add/remove children
    #////////////////////////////////////////////////////////////
    # Every method that adds or removes a child must make
    # appropriate calls to _setparent() and _delparent().

    def __delitem__(self, index):
        """
        Delete the child (or children) selected by ``index``, which may
        be an int, a slice, or a tree position (list or tuple of ints).
        ``_delparent()`` is called for every removed ``Tree`` child
        before the child list itself is changed.

        :raise IndexError: For an int that is still negative after
            adding ``len(self)``, or for the empty tree position.
        :raise TypeError: For any other kind of index.
        """
        # del ptree[start:stop]
        if isinstance(index, slice):
            # NOTE(review): slice_bounds is defined elsewhere in this
            # file; presumably it resolves the slice to concrete
            # (start, stop, step) values -- confirm.
            start, stop, step = slice_bounds(self, index, allow_step=True)
            # Clear all the children pointers.
            for i in xrange(start, stop, step):
                if isinstance(self[i], Tree):
                    self._delparent(self[i], i)
            # Delete the children from our child list.
            super(AbstractParentedTree, self).__delitem__(index)

        # del ptree[i]
        elif isinstance(index, int):
            if index < 0: index += len(self)
            if index < 0: raise IndexError('index out of range')
            # Clear the child's parent pointer.
            if isinstance(self[index], Tree):
                self._delparent(self[index], index)
            # Remove the child from our child list.
            super(AbstractParentedTree, self).__delitem__(index)

        elif isinstance(index, (list, tuple)):
            # del ptree[()]
            if len(index) == 0:
                raise IndexError('The tree position () may not be deleted.')
            # del ptree[(i,)]
            elif len(index) == 1:
                del self[index[0]]
            # del ptree[i1, i2, i3]
            else:
                del self[index[0]][index[1:]]

        else:
            raise TypeError("%s indices must be integers, not %s" %
                            (type(self).__name__, type(index).__name__))

    def __setitem__(self, index, value):
        """
        Replace the child (or children) selected by ``index``, which
        may be an int, a slice, or a tree position (list or tuple of
        ints).  Parent pointers of the old and new ``Tree`` children
        are updated before the child list itself is changed.

        :raise IndexError: For an int that is still negative after
            adding ``len(self)``, or for the empty tree position.
        :raise TypeError: For any other kind of index.
        """
        # ptree[start:stop] = value
        if isinstance(index, slice):
            # NOTE(review): slice_bounds is defined elsewhere in this
            # file; presumably it resolves the slice to concrete
            # (start, stop, step) values -- confirm.
            start, stop, step = slice_bounds(self, index, allow_step=True)
            # make a copy of value, in case it's an iterator
            if not isinstance(value, (list, tuple)):
                value = list(value)
            # Check for any error conditions, so we can avoid ending
            # up in an inconsistent state if an error does occur.
            for i, child in enumerate(value):
                if isinstance(child, Tree):
                    self._setparent(child, start + i*step, dry_run=True)
            # clear the child pointers of all parents we're removing
            for i in xrange(start, stop, step):
                if isinstance(self[i], Tree):
                    self._delparent(self[i], i)
            # set the child pointers of the new children.  We do this
            # after clearing *all* child pointers, in case we're e.g.
            # reversing the elements in a tree.
            for i, child in enumerate(value):
                if isinstance(child, Tree):
                    self._setparent(child, start + i*step)
            # finally, update the content of the child list itself.
            super(AbstractParentedTree, self).__setitem__(index, value)

        # ptree[i] = value
        elif isinstance(index, int):
            if index < 0: index += len(self)
            if index < 0: raise IndexError('index out of range')
            # if the value is not changing, do nothing.
            if value is self[index]:
                return
            # Set the new child's parent pointer.
            if isinstance(value, Tree):
                self._setparent(value, index)
            # Remove the old child's parent pointer
            if isinstance(self[index], Tree):
                self._delparent(self[index], index)
            # Update our child list.
            super(AbstractParentedTree, self).__setitem__(index, value)

        elif isinstance(index, (list, tuple)):
            # ptree[()] = value
            if len(index) == 0:
                raise IndexError('The tree position () may not be assigned to.')
            # ptree[(i,)] = value
            elif len(index) == 1:
                self[index[0]] = value
            # ptree[i1, i2, i3] = value
            else:
                self[index[0]][index[1:]] = value

        else:
            raise TypeError("%s indices must be integers, not %s" %
                            (type(self).__name__, type(index).__name__))

    def append(self, child):
        """Add ``child`` at the end, setting its parent pointer first if it is a ``Tree``."""
        if isinstance(child, Tree):
            self._setparent(child, len(self))
        super(AbstractParentedTree, self).append(child)

    def extend(self, children):
        """Append each item of ``children`` in turn, setting parent pointers of ``Tree`` items."""
        # Items are appended one at a time so that len(self) is the
        # correct index for each _setparent() call.
        for child in children:
            if isinstance(child, Tree):
                self._setparent(child, len(self))
            super(AbstractParentedTree, self).append(child)

    def insert(self, index, child):
        """Insert ``child`` before position ``index``, setting its parent pointer first if it is a ``Tree``."""
        # Handle negative indexes.  Note that if index < -len(self),
        # we do *not* raise an IndexError, unlike __getitem__.  This
        # is done for consistency with list.insert, which clamps
        # out-of-range positions instead of raising.
        if index < 0: index += len(self)
        if index < 0: index = 0
        # Set the child's parent, and update our child list.
        if isinstance(child, Tree):
            self._setparent(child, index)
        super(AbstractParentedTree, self).insert(index, child)

    def pop(self, index=-1):
        """
        Remove and return the child at ``index`` (default: the last
        one), clearing its parent pointer first if it is a ``Tree``.

        :raise IndexError: If ``index`` is still negative after adding
            ``len(self)``.
        """
        if index < 0: index += len(self)
        if index < 0: raise IndexError('index out of range')
        if isinstance(self[index], Tree):
            self._delparent(self[index], index)
        return super(AbstractParentedTree, self).pop(index)

    # n.b.: like `list`, this is done by equality, not identity!
    # To remove a specific child, use del ptree[i].
    def remove(self, child):
        """Remove the first child equal to ``child``, clearing its parent pointer first if it is a ``Tree``."""
        index = self.index(child)
        if isinstance(self[index], Tree):
            self._delparent(self[index], index)
        super(AbstractParentedTree, self).remove(child)

    # We need to implement __getslice__ and friends, even though
    # they're deprecated, because otherwise list.__getslice__ will get
    # called (since we're subclassing from list).  Just delegate to
    # __getitem__ etc., but use max(0, start) and max(0, stop)
    # because negative indices are already handled *before*
    # __getslice__ is called; and we don't want to double-count them.
    if hasattr(list, '__getslice__'):
        def __getslice__(self, start, stop):
            return self.__getitem__(slice(max(0, start), max(0, stop)))
        def __delslice__(self, start, stop):
            return self.__delitem__(slice(max(0, start), max(0, stop)))
        def __setslice__(self, start, stop, value):
            return self.__setitem__(slice(max(0, start), max(0, stop)), value)
1015
+
1016
class ParentedTree(AbstractParentedTree):
    """
    A ``Tree`` that automatically maintains parent pointers for
    single-parented trees.  The structure of a parented tree can be
    queried with the methods ``parent()``, ``parent_index()``,
    ``left_sibling()``, ``right_sibling()``, ``root()`` and
    ``treeposition()``.

    Each ``ParentedTree`` may have at most one parent.  In
    particular, subtrees may not be shared.  Any attempt to reuse a
    single ``ParentedTree`` as a child of more than one parent (or
    as multiple children of the same parent) will cause a
    ``ValueError`` exception to be raised.

    ``ParentedTrees`` should never be used in the same tree as ``Trees``
    or ``MultiParentedTrees``.  Mixing tree implementations may result
    in incorrect parent pointers and in ``TypeError`` exceptions.
    """
    def __init__(self, node_or_str, children=None):
        # The parent of this Tree, or None if it has no parent.
        self._parent = None
        super(ParentedTree, self).__init__(node_or_str, children)
        if children is None:
            # The tree was parsed from node_or_str.  After parsing,
            # the immediate children still point to an intermediate
            # tree rather than to self, so re-parent them by brute
            # force.
            for position, subtree in enumerate(self):
                if isinstance(subtree, Tree):
                    subtree._parent = None
                    self._setparent(subtree, position)

    def _frozen_class(self):
        return ImmutableParentedTree

    #/////////////////////////////////////////////////////////////////
    # Methods
    #/////////////////////////////////////////////////////////////////

    def parent(self):
        """The parent of this tree, or None if it has no parent."""
        return self._parent

    def parent_index(self):
        """
        The index of this tree in its parent.  I.e.,
        ``ptree.parent()[ptree.parent_index()] is ptree``.  Note that
        ``ptree.parent_index()`` is not necessarily equal to
        ``ptree.parent().index(ptree)``, since the ``index()`` method
        returns the first child that is equal to its argument.
        Returns None if this tree has no parent.
        """
        container = self._parent
        if container is None:
            return None
        # Search by identity, not equality.
        for position, sibling in enumerate(container):
            if sibling is self:
                return position
        assert False, 'expected to find self in self._parent!'

    def left_sibling(self):
        """The left sibling of this tree, or None if it has none."""
        position = self.parent_index()
        if not self._parent or position <= 0:
            return None # no left sibling
        return self._parent[position-1]

    def right_sibling(self):
        """The right sibling of this tree, or None if it has none."""
        position = self.parent_index()
        if not self._parent or position >= (len(self._parent)-1):
            return None # no right sibling
        return self._parent[position+1]

    def root(self):
        """
        The root of this tree.  I.e., the unique ancestor of this tree
        whose parent is None.  If ``ptree.parent()`` is None, then
        ``ptree`` is its own root.
        """
        current = self
        # Climb until a tree without a parent is reached.
        while True:
            above = current.parent()
            if above is None:
                return current
            current = above

    def treeposition(self):
        """
        The tree position of this tree, relative to the root of the
        tree.  I.e., ``ptree.root()[ptree.treeposition()] is ptree``.
        """
        above = self.parent()
        if above is None:
            return ()
        # The parent's own position, extended by our index inside it.
        return above.treeposition() + (self.parent_index(),)


    #/////////////////////////////////////////////////////////////////
    # Parent Management
    #/////////////////////////////////////////////////////////////////

    def _delparent(self, child, index):
        # Sanity checks
        assert isinstance(child, ParentedTree)
        assert self[index] is child
        assert child._parent is self

        # Detach the child from this tree.
        child._parent = None

    def _setparent(self, child, index, dry_run=False):
        # Only ParentedTrees may be nested inside a ParentedTree.
        if not isinstance(child, ParentedTree):
            raise TypeError('Can not insert a non-ParentedTree '
                            'into a ParentedTree')

        # A subtree that is already attached somewhere is rejected.
        if child._parent is not None:
            raise ValueError('Can not insert a subtree that already '
                             'has a parent.')

        # A dry run stops after the checks above.
        if dry_run:
            return
        child._parent = self
1132
+
1133
+ class MultiParentedTree(AbstractParentedTree):
1134
+ """
1135
+ A ``Tree`` that automatically maintains parent pointers for
1136
+ multi-parented trees. The following are methods for querying the
1137
+ structure of a multi-parented tree: ``parents()``, ``parent_indices()``,
1138
+ ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``.
1139
+
1140
+ Each ``MultiParentedTree`` may have zero or more parents. In
1141
+ particular, subtrees may be shared. If a single
1142
+ ``MultiParentedTree`` is used as multiple children of the same
1143
+ parent, then that parent will appear multiple times in its
1144
+ ``parents()`` method.
1145
+
1146
+ ``MultiParentedTrees`` should never be used in the same tree as
1147
+ ``Trees`` or ``ParentedTrees``. Mixing tree implementations may
1148
+ result in incorrect parent pointers and in ``TypeError`` exceptions.
1149
+ """
1150
+ def __init__(self, node_or_str, children=None):
1151
+ self._parents = []
1152
+ """A list of this tree's parents. This list should not
1153
+ contain duplicates, even if a parent contains this tree
1154
+ multiple times."""
1155
+ super(MultiParentedTree, self).__init__(node_or_str, children)
1156
+ if children is None:
1157
+ # If children is None, the tree is parsed from node_or_str.
1158
+ # After parsing, the parent(s) of the immediate children
1159
+ # will point to an intermediate tree, not self.
1160
+ # We fix this by brute force:
1161
+ for i, child in enumerate(self):
1162
+ if isinstance(child, Tree):
1163
+ child._parents = []
1164
+ self._setparent(child, i)
1165
+
1166
+ def _frozen_class(self): return ImmutableMultiParentedTree
1167
+
1168
+ #/////////////////////////////////////////////////////////////////
1169
+ # Methods
1170
+ #/////////////////////////////////////////////////////////////////
1171
+
1172
+ def parents(self):
1173
+ """
1174
+ The set of parents of this tree. If this tree has no parents,
1175
+ then ``parents`` is the empty set. To check if a tree is used
1176
+ as multiple children of the same parent, use the
1177
+ ``parent_indices()`` method.
1178
+
1179
+ :type: list(MultiParentedTree)
1180
+ """
1181
+ return list(self._parents)
1182
+
1183
+ def left_siblings(self):
1184
+ """
1185
+ A list of all left siblings of this tree, in any of its parent
1186
+ trees. A tree may be its own left sibling if it is used as
1187
+ multiple contiguous children of the same parent. A tree may
1188
+ appear multiple times in this list if it is the left sibling
1189
+ of this tree with respect to multiple parents.
1190
+
1191
+ :type: list(MultiParentedTree)
1192
+ """
1193
+ return [parent[index-1]
1194
+ for (parent, index) in self._get_parent_indices()
1195
+ if index > 0]
1196
+
1197
+ def right_siblings(self):
1198
+ """
1199
+ A list of all right siblings of this tree, in any of its parent
1200
+ trees. A tree may be its own right sibling if it is used as
1201
+ multiple contiguous children of the same parent. A tree may
1202
+ appear multiple times in this list if it is the right sibling
1203
+ of this tree with respect to multiple parents.
1204
+
1205
+ :type: list(MultiParentedTree)
1206
+ """
1207
+ return [parent[index+1]
1208
+ for (parent, index) in self._get_parent_indices()
1209
+ if index < (len(parent)-1)]
1210
+
1211
+ def _get_parent_indices(self):
1212
+ return [(parent, index)
1213
+ for parent in self._parents
1214
+ for index, child in enumerate(parent)
1215
+ if child is self]
1216
+
1217
+ def roots(self):
1218
+ """
1219
+ The set of all roots of this tree. This set is formed by
1220
+ tracing all possible parent paths until trees with no parents
1221
+ are found.
1222
+
1223
+ :type: list(MultiParentedTree)
1224
+ """
1225
+ return self._get_roots_helper({}).values()
1226
+
1227
+ def _get_roots_helper(self, result):
1228
+ if self._parents:
1229
+ for parent in self._parents:
1230
+ parent._get_roots_helper(result)
1231
+ else:
1232
+ result[id(self)] = self
1233
+ return result
1234
+
1235
def parent_indices(self, parent):
    """
    List the positions at which this tree appears as a child of
    ``parent``.  When ``parent`` is not one of this tree's parents
    the result is the empty list.  For every returned value the
    following holds::

        for parent_index in ptree.parent_indices(parent):
            parent[parent_index] is ptree
    """
    if parent in self._parents:
        positions = []
        for position, node in enumerate(parent):
            if node is self:
                positions.append(position)
        return positions
    return []
1248
+
1249
def treepositions(self, root):
    """
    List every tree position through which this multi-parented tree
    can be reached when starting at ``root``.  In other words, the
    following always holds::

        for treepos in ptree.treepositions(root):
            root[treepos] is ptree
    """
    if self is root:
        # The root reaches itself through the empty path.
        return [()]

    positions = []
    for parent in self._parents:
        # Extend each path that reaches the parent with every slot of
        # that parent occupied by this tree.
        for prefix in parent.treepositions(root):
            for slot, node in enumerate(parent):
                if node is self:
                    positions.append(prefix + (slot,))
    return positions
1265
+
1266
+
1267
+ #/////////////////////////////////////////////////////////////////
1268
+ # Parent Management
1269
+ #/////////////////////////////////////////////////////////////////
1270
+
1271
def _delparent(self, child, index):
    """
    Update ``child``'s parent list to reflect that the copy of
    ``child`` stored at ``self[index]`` is about to be removed.

    ``self`` is dropped from ``child._parents`` only when ``index``
    is the sole slot of ``self`` holding ``child``; if ``child`` also
    occupies another slot, ``self`` remains one of its parents.

    :param child: the tree being detached; must be a
        ``MultiParentedTree`` currently stored at ``self[index]``.
    :param index: the slot of ``self`` that ``child`` is leaving.
    """
    # Sanity checks
    assert isinstance(child, MultiParentedTree)
    assert self[index] is child
    assert len([p for p in child._parents if p is self]) == 1

    # If another slot of self still holds child, self stays a parent.
    if any(c is child and i != index for i, c in enumerate(self)):
        return

    # Remove self by identity.  ``list.remove`` matches by equality,
    # so it could delete a *different* parent that merely compares
    # equal to self (presumably trees compare by value, being
    # list-like -- confirm against Tree.__eq__).  Identity matching
    # is correct either way and agrees with the assertion above.
    for position, parent in enumerate(child._parents):
        if parent is self:
            del child._parents[position]
            break
1283
+
1284
def _setparent(self, child, index, dry_run=False):
    """
    Register ``self`` as a parent of ``child``.

    :param child: the tree being inserted into ``self``.
    :param index: accepted as part of the signature; not used by
        this implementation.
    :param dry_run: when true, only the type check is performed and
        ``child`` is left unmodified.
    :raise TypeError: if ``child`` is not a ``MultiParentedTree``.
    """
    # Reject children of the wrong type.
    if not isinstance(child, MultiParentedTree):
        raise TypeError('Can not insert a non-MultiParentedTree '+
                        'into a MultiParentedTree')

    if dry_run:
        return

    # Record self as a parent at most once (compared by identity).
    already_listed = any(existing is self for existing in child._parents)
    if not already_listed:
        child._parents.append(self)
1296
+
1297
class ImmutableParentedTree(ImmutableTree, ParentedTree):
    # Combines ``ImmutableTree`` and ``ParentedTree`` through multiple
    # inheritance; no attributes or methods are added or overridden here.
    pass
1299
+
1300
class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
    # Combines ``ImmutableTree`` and ``MultiParentedTree`` through multiple
    # inheritance; no attributes or methods are added or overridden here.
    pass
1302
+
1303
+
1304
def _child_names(tree):
    """
    Return one entry per child of ``tree``: a ``Nonterminal`` built
    from the child's ``node`` attribute when the child is a ``Tree``,
    and the child itself otherwise.
    """
    return [Nonterminal(child.node) if isinstance(child, Tree) else child
            for child in tree]
1312
+
1313
+ ######################################################################
1314
+ ## Parsing
1315
+ ######################################################################
1316
+
1317
def bracket_parse(s):
    """
    Retired entry point; calling it always fails.
    Use Tree.parse(s, remove_empty_top_bracketing=True) instead.

    :raise NameError: unconditionally, with a message naming the
        replacement call.
    """
    message = "Use Tree.parse(s, remove_empty_top_bracketing=True) instead."
    raise NameError(message)
1322
+
1323
def sinica_parse(s):
    """
    Parse a Sinica Treebank string and return a tree.  Trees are represented as nested bracketings,
    as shown in the following example (X represents a Chinese character):
    S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)

    The string is rewritten token by token into a bracketed treebank
    string, which is then handed to ``Tree.parse``.

    :return: A tree corresponding to the string representation.
    :rtype: Tree
    :param s: The string to be converted
    :type s: str
    """
    # The capturing group keeps the delimiters as tokens of their own.
    tokens = re.split(r'([()| ])', s)
    # An index loop is required: the '(' case swaps with the previous token.
    for i in range(len(tokens)):
        if tokens[i] == '(':
            tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens
        elif ':' in tokens[i]:
            fields = tokens[i].split(':')
            if len(fields) == 2: # non-terminal
                tokens[i] = fields[1]
            else:
                # Three or more fields: keep the last two as "(tag word)".
                tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
        elif tokens[i] == '|':
            tokens[i] = ''

    # ``string.join(tokens)`` joined with a single space but exists only
    # in Python 2's ``string`` module; ``str.join`` is the equivalent
    # that works on every Python version.
    treebank_string = ' '.join(tokens)
    return Tree.parse(treebank_string, remove_empty_top_bracketing=True)
1349
+
1350
+ # s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier
1351
+ # s = re.sub(r'\w+:', '', s) # remove role tags
1352
+
1353
+ # return s
1354
+
1355
+ ######################################################################
1356
+ ## Demonstration
1357
+ ######################################################################
1358
+
1359
def demo():
    """
    A demonstration showing how Trees and ProbabilisticTrees can be
    used.  This demonstration builds a Tree from a bracketed string,
    modifies and transforms it, creates a ProbabilisticTree, and
    shows the results of calling several of their methods.

    All output is written to standard output; nothing is returned.
    """
    # Every ``print`` call takes exactly one argument so that the output
    # is identical whether ``print`` is a statement (Python 2) or a
    # function (Python 3); ``print("")`` emits the blank separator line.

    from nltk import tree

    # Demonstrate tree parsing.
    # NOTE(review): relies on Tree(...) accepting a bracketed string
    # directly -- confirm against Tree.__init__.
    s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
    t = Tree(s)
    print("Convert bracketed string into tree:")
    print(t)
    print(repr(t))

    print("Display tree properties:")
    print(t.node)           # tree's constituent type
    print(t[0])             # tree's first child
    print(t[1])             # tree's second child
    print(t.height())
    print(t.leaves())
    print(t[1])
    print(t[1,1])
    print(t[1,1,0])

    # Demonstrate tree modification.
    the_cat = t[0]
    the_cat.insert(1, tree.Tree.parse('(JJ big)'))
    print("Tree modification:")
    print(t)
    t[1,1,1] = tree.Tree.parse('(NN cake)')
    print(t)
    print("")

    # Tree transforms
    print("Collapse unary:")
    t.collapse_unary()
    print(t)
    print("Chomsky normal form:")
    t.chomsky_normal_form()
    print(t)
    print("")

    # Demonstrate probabilistic trees.
    pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5)
    print("Probabilistic Tree:")
    print(pt)
    print("")

    # Demonstrate parsing of treebank output format.
    t = tree.Tree.parse(t.pprint())
    print("Convert tree to bracketed string and back again:")
    print(t)
    print("")

    # Demonstrate LaTeX output
    print("LaTeX output:")
    print(t.pprint_latex_qtree())
    print("")

    # Demonstrate Productions
    print("Production output:")
    print(t.productions())
    print("")

    # Demonstrate tree nodes containing objects other than strings
    t.node = ('test', 3)
    print(t)
1429
+
1430
# Names exported by ``from <module> import *``.
__all__ = [
    'ImmutableProbabilisticTree',
    'ImmutableTree',
    'ProbabilisticMixIn',
    'ProbabilisticTree',
    'Tree',
    'bracket_parse',
    'sinica_parse',
    'ParentedTree',
    'MultiParentedTree',
    'ImmutableParentedTree',
    'ImmutableMultiParentedTree',
]

# Running the module as a script executes its doctests, ignoring
# differences in whitespace between expected and actual output.
if __name__ == "__main__":
    import doctest
    doctest.testmod(
        optionflags=doctest.NORMALIZE_WHITESPACE,
    )
1438
+