bscampp 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bscampp/__init__.py +68 -0
- bscampp/configs.py +169 -0
- bscampp/default.config +5 -0
- bscampp/functions.py +409 -0
- bscampp/init_configs.py +93 -0
- bscampp/jobs.py +198 -0
- bscampp/pipeline.py +249 -0
- bscampp/tools/epa-ng +0 -0
- bscampp/tools/hamming_distance/CMakeLists.txt +13 -0
- bscampp/tools/hamming_distance/fragment_hamming +0 -0
- bscampp/tools/hamming_distance/hamming +0 -0
- bscampp/tools/hamming_distance/homology +0 -0
- bscampp/tools/hamming_distance/src/fragment_hamming.cpp +180 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming.cpp +183 -0
- bscampp/tools/hamming_distance/src/fragment_tree_hamming_new.cpp +214 -0
- bscampp/tools/hamming_distance/src/homology.cpp +179 -0
- bscampp/tools/hamming_distance/src/new_hamming.cpp +161 -0
- bscampp/tools/pplacer +0 -0
- bscampp/utils.py +914 -0
- bscampp-1.0.1.dist-info/LICENSE +21 -0
- bscampp-1.0.1.dist-info/METADATA +234 -0
- bscampp-1.0.1.dist-info/RECORD +25 -0
- bscampp-1.0.1.dist-info/WHEEL +5 -0
- bscampp-1.0.1.dist-info/entry_points.txt +3 -0
- bscampp-1.0.1.dist-info/top_level.txt +1 -0
bscampp/utils.py
ADDED
@@ -0,0 +1,914 @@
#from dendropy import *
import numpy as np
import heapq
import treeswift
import itertools
from collections import deque
from os.path import expanduser,isfile
import random
import statistics
import copy
import gzip

import argparse
# reformat argparse help text formatting
class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
    def add_text(self, text):
        if text is not None:
            text = text.replace("\\n", "\n").replace("\\t", "\t")
        super().add_text(text)
    def _split_lines(self, text, width):
        if '\n' in text:
            temp = text.split('\n')
            ret = []
            for _splice in [argparse.RawDescriptionHelpFormatter._split_lines(self, x, width)
                            for x in temp]:
                ret.extend(_splice)
            return ret
        return argparse.RawDescriptionHelpFormatter._split_lines(self, text, width)

# store bracket open/close for convenience in label parsing
BRACKET = {
    '[': ']',    # square bracket
    '{': '}',    # curly bracket
    "'": "'",    # single-quote
    '"': '"',    # double-quote
}


def write_fasta(aln, aln_dict, aligned=True):
    """ Write given dictionary as FASTA file out

    Parameters
    ----------
    aln : FASTA file path
    aln_dict : MSA in the form of a dict
    aligned : whether the sequences are aligned

    Returns
    -------
    None

    """

    f = open(aln, 'w')
    for label, seq in aln_dict.items():
        if label != '':
            f.write(f'>{label}\n')
            if aligned:
                f.write(f'{seq}\n')
            else:
                f.write(seq.replace('-', '') + '\n')
    f.close()

#separate the query and ref sequences from the alignment file

def read_data(aln):
    """ Load the query and reference sequences from the alignment file

    Parameters
    ----------
    aln : multiple sequence alignment containing reference taxa and query sequences

    Returns
    -------
    dictionary containing sequences with taxon label keys

    """

    f = open(aln)
    result = dict()

    taxa = ""
    seq = ""
    for line in f:
        if line[0] == '>':
            if taxa != "":
                result[taxa] = seq
            taxa = line[1:-1]
            seq = ""

        elif line == "\n":
            continue
        else:
            seq += line[:-1]

    if taxa != "":
        result[taxa] = seq


    return result

def seperate(aln_dict, leaf_dict):
    """ Separate the query sequences from the reference sequences

    Parameters
    ----------
    aln_dict : Sequence dictionary with taxon label keys
    leaf_dict : Sequence dictionary with leaf label keys (queries are not in backbone tree)

    Returns
    -------
    separate dictionaries containing query sequences and reference sequences with taxon label keys

    """
    ref = dict()
    query = dict()

    for key, value in aln_dict.items():
        if key not in leaf_dict:
            query[key] = value
        else:
            ref[key] = value

    return ref, query

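For orientation, a minimal sketch of how the I/O helpers above might be combined; the file names and the treeswift calls outside this module are illustrative assumptions, not part of the package:

    # Hypothetical usage sketch; paths are placeholders
    import treeswift
    from bscampp.utils import read_data, seperate, write_fasta

    tree = treeswift.read_tree_newick('backbone.nwk')     # assumed backbone tree file
    leaf_dict = tree.label_to_node(selection='leaves')    # leaf label -> treeswift node
    aln_dict = read_data('alignment.fasta')               # all aligned sequences
    ref, query = seperate(aln_dict, leaf_dict)            # backbone vs. query split
    write_fasta('query.fasta', query, aligned=False)      # queries written unaligned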
def hamming(seq1, seq2):
    """ Returns hamming distance between two sequences

    Parameters
    ----------
    seq1 : query sequence
    seq2 : reference sequence

    Returns
    -------
    integer hamming distance between query sequence and reference sequence

    """
    return sum(1 for ch1, ch2 in zip(seq1, seq2) if ch1 != ch2)


def find_y(x,ref):
    """ Returns leaf label for closest sister taxon l (no longer used)

    Parameters
    ----------
    x : aligned query sequence
    ref : reference multiple sequence alignment dictionary

    Returns
    -------
    leaf label for taxon with smallest hamming distance to query sequence

    """
    low = len(x)
    y = ""
    for name, seq in ref.items():
        h_dist = hamming(x, seq)
        if h_dist < low:
            low = h_dist
            y = name
    return y

def set_hamming_dict(args):
    dict_items, other_args = args
    name, seq = dict_items
    ref, n, fragment_flag, y_dict = other_args

    y_dict[name] = find_closest_hamming(seq, ref, n, fragment_flag)


def find_closest_hamming(x, ref, n, fragment_flag):
    ''' Returns leaf name for n closest sister taxa to sequence x

    Parameters
    ----------
    x : aligned query sequence
    ref : reference multiple sequence alignment dictionary
    n : number of nodes to return
    fragment_flag : True if the query is not full length

    Returns
    -------
    list of nodes with n smallest hamming distances to query sequence

    '''
    queue = []
    closest = []

    counter = 0
    if fragment_flag == 'True':
        [si, ei] = set_fragment_indicies(x)
    else:
        [si, ei] = [0, len(x)]

    c = 200 # size of the substring compared at once

    for name, seq in ref.items():
        heapq.heappush(queue,(hamming(seq[si:si+c],x[si:si+c]), ei - si - c, counter, name))
        counter += 1

    while queue:
        (ham_dist, sites_left, cnt, name) = heapq.heappop(queue)
        if sites_left < 0:
            closest.append(name)
            if len(closest) >= n:
                return closest
        else:
            ind = ei - sites_left
            new_ham = hamming(ref[name][ind:ind+c],x[ind:ind+c])
            heapq.heappush(queue,(ham_dist + new_ham, sites_left - c, cnt, name))

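A small worked call for the heap-based search above, assuming the reference alignment is a plain dict (as returned by read_data) and that fragment_flag is passed as the string 'True'/'False', which is what the comparison in the function suggests; the toy sequences are made up:

    # Toy data, illustrative only
    from bscampp.utils import find_closest_hamming

    ref = {'A': 'ACGTACGT', 'B': 'ACGTTTTT', 'C': 'TTTTTTTT'}
    query = 'ACGTACGA'
    # nearest two reference taxa by Hamming distance; should be ['A', 'B']
    print(find_closest_hamming(query, ref, 2, 'False'))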
def set_fragment_indicies(x):
    """ Returns the indices without leading and trailing gaps.

    Parameters
    ----------
    x : string sequence

    Returns
    -------
    list of start index and end index with the first and last non gap character

    """
    e = len(x)
    ei = e
    si = 0
    for i in range(ei):
        if x[i] == '-' and si == i:
            si = i + 1
        if x[e - i - 1] == '-' and ei == e - i:
            ei = e - i - 1
        if ei == si:
            break
    return [si, ei]

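A quick check of the gap trimming above on a hypothetical aligned fragment (indices verified by hand):

    from bscampp.utils import set_fragment_indicies

    # '--AC-GT--': first non-gap at index 2, last at index 6,
    # so the retained column range is [2, 7)
    print(set_fragment_indicies('--AC-GT--'))   # [2, 7]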
def find_closest(x, visited, y=None):
    """ Returns leaf label for closest leaf to the node x through path not travelling through visited.
        If y is populated returns path from x to y not travelling through nodes in visited.

    Parameters
    ----------
    x : dendropy node object
    visited : dictionary containing dendropy node objects as keys
    y : dendropy node object

    Returns
    -------
    If y == None : dendropy node object of closest leaf y to the node x through path not travelling through nodes in visited,
                   list containing dendropy node objects on path to that leaf y from node x
    If y != None : dendropy node object y,
                   list containing dendropy node objects on path from node x to leaf y not travelling through nodes in visited

    """
    queue = []
    cnt = 1
    visited.add(x)

    if x.get_parent() and x.get_parent() not in visited:
        tmp = []
        tmp.append(x)
        heapq.heappush(queue, [x.get_edge_length(), cnt, tmp, x.get_parent()])
        cnt += 1

    for child in x.child_nodes():
        if child and child not in visited:
            tmp = []
            tmp.append(child)
            heapq.heappush(queue, [child.get_edge_length(), cnt, tmp, child])
            cnt += 1

    while len(queue) > 0:
        try:
            [length, _, path, node] = heapq.heappop(queue)
        except IndexError:
            break

        visited.add(node)
        if node.is_leaf():
            if (not y) or node.get_label()==y.get_label():
                return node, path
            else:
                continue

        if node.get_parent() and node.get_parent() not in visited:
            tmp = path.copy()
            tmp.append(node)
            heapq.heappush(queue, [length+node.get_edge_length(), cnt, tmp, node.get_parent()])
            cnt += 1

        for child in node.child_nodes():
            if child and child not in visited:
                tmp = path.copy()
                tmp.append(child)
                heapq.heappush(queue, [length+child.get_edge_length(), cnt, tmp, child])
                cnt += 1

    return x, [x]


def find_closest_testing(x, visited, y=None, valid_leaves=None):
    """ Returns leaf label for closest leaf to the node x through path not travelling through visited.
        If y is populated returns path from x to y not travelling through nodes in visited.

    Parameters
    ----------
    x : dendropy node object
    visited : dictionary containing dendropy node objects as keys
    y : dendropy node object

    Returns
    -------
    If y == None : dendropy node object of closest leaf y to the node x through path not travelling through nodes in visited,
                   list containing dendropy node objects on path to that leaf y from node x
                   distance from node x to leaf y
    If y != None : dendropy node object y,
                   list containing dendropy node objects on path from node x to leaf y not travelling through nodes in visited,
                   distance from node x to leaf y

    """
    queue = []
    cnt = 1
    visited.add(x)

    if x.get_parent() and x.get_parent() not in visited:
        tmp = []
        tmp.append(x)
        heapq.heappush(queue, [x.get_edge_length(), cnt, tmp, x.get_parent()])
        cnt += 1

    for child in x.child_nodes():
        if child and child not in visited:
            tmp = []
            tmp.append(child)
            heapq.heappush(queue, [child.get_edge_length(), cnt, tmp, child])
            cnt += 1

    while len(queue) > 0:
        try:
            [length, _, path, node] = heapq.heappop(queue)
        except IndexError:
            break

        visited.add(node)
        if node.is_leaf():
            if (not y) or node.get_label() == y.get_label():
                if (valid_leaves != None and node.get_label() not in valid_leaves):
                    continue
                else:
                    return node, path, length
            else:
                continue

        if node.get_parent() and node.get_parent() not in visited:
            tmp = path.copy()
            tmp.append(node)
            heapq.heappush(queue, [length+node.get_edge_length(), cnt, tmp, node.get_parent()])
            cnt += 1

        for child in node.child_nodes():
            if child and child not in visited:
                tmp = path.copy()
                tmp.append(child)
                heapq.heappush(queue, [length+child.get_edge_length(), cnt, tmp, child])
                cnt += 1

    return x, [x], 0

def build_subtrees(sister_taxon_dict, leaf_dict, tree, nbr_subtrees, subtree_size):
    tree_building_taxa = len(list(sister_taxon_dict))
    cluster_dict = dict()
    cluster_index = 0
    cluster_list = dict()



    #tmp
    #draw(tree.extract_tree_with(list(sister_taxon_dict)))
    #tmp

    for taxon, _ in sister_taxon_dict.items():
        cluster_dict[taxon] = cluster_index
        cluster_list[cluster_index] = [taxon]
        cluster_index += 1

    queue = []
    for l1, _ in sister_taxon_dict.items():
        node_l2, _, length = find_closest_testing(leaf_dict[l1],{leaf_dict[l1]},valid_leaves=sister_taxon_dict)
        heapq.heappush(queue, [length, l1, node_l2.get_label()])

    while tree_building_taxa > nbr_subtrees:
        [length, l1, l2] = heapq.heappop(queue)

        idx = cluster_dict[l1]
        old_idx = cluster_dict[l2]

        if idx == old_idx:
            continue

        for leaf in cluster_list[old_idx]:
            cluster_dict[leaf] = idx
        #[cluster_dict[leaf] = idx for leaf in cluster_list[old_idx]]
        cluster_list[idx].extend(cluster_list[old_idx])
        cluster_list.pop(old_idx)

        tree_building_taxa -= 1

        best_leaf = cluster_list[idx][0]
        for leaf in cluster_list[idx]:
            best_leaf = centered_leaf(tree, cluster_list[idx])
        cluster_node_list = [leaf_dict[leaf] for leaf in cluster_list[idx]]
        cluster_set = {*cluster_node_list}
        node_l3, _, length = find_closest_testing(leaf_dict[best_leaf], cluster_set, valid_leaves=sister_taxon_dict)
        heapq.heappush(queue, [length, best_leaf, node_l3.get_label()])

    trees = []
    query_decomp_dict = []

    for idx, cluster in cluster_list.items():
        best_leaf = centered_leaf(tree, cluster)

        subtree_leaves, tree2 = subtree_with_edge_length(tree, leaf_dict[best_leaf], subtree_size)
        tree2 = tree2.extract_tree_with(subtree_leaves)
        trees.append(tree2)
        query_list = []
        for leaf in cluster:
            query_list.extend(sister_taxon_dict[leaf])
        query_decomp_dict.append(query_list)

    return trees, query_decomp_dict


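A minimal call-shape sketch for build_subtrees, inferred from how the function uses its arguments; the tree, the leaf-to-query assignment, and the sizes below are invented and only meant to show the expected shapes:

    # Hypothetical inputs, shaped after the accesses in build_subtrees:
    #   leaf_dict         : leaf label -> treeswift node
    #   sister_taxon_dict : leaf label -> list of query names assigned to that leaf
    import treeswift
    from bscampp.utils import build_subtrees

    tree = treeswift.read_tree_newick('((A:1,B:2):4,(C:8,D:16):32);')
    leaf_dict = tree.label_to_node(selection='leaves')
    sister_taxon_dict = {'A': ['q1', 'q2'], 'C': ['q3']}
    subtrees, query_groups = build_subtrees(sister_taxon_dict, leaf_dict, tree, 2, 2)
    # expect two small subtrees and [['q1', 'q2'], ['q3']] as the query grouping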
def subtree_nodes(tree, leaf_y, n):
    """ Returns list of length n of leaves closest to sister taxon

    Parameters
    ----------
    tree : treeswift tree object
    leaf_y : treeswift node for closest sister taxon
    n : number of taxa contained in subtree

    Returns
    -------
    list of taxon labels corresponding to leaves in the subtree

    """
    queue = [(0, 0, leaf_y.get_parent())]

    leaves = [leaf_y]
    visited = {leaf_y}

    counter = 1

    while len(leaves) < n:
        try:
            (length, _, node) = heapq.heappop(queue)
        except IndexError:
            break

        visited.add(node)
        if node.is_leaf():
            leaves.append(node)

        adjacent = node.child_nodes()
        if not node.is_root():
            adjacent.append(node.get_parent())

        for neighbor in adjacent:
            if neighbor not in visited:
                heapq.heappush(queue, (length+1, counter, neighbor))
                counter += 1


    result = []
    for item in leaves:
        result.append(item.get_label())

    return result

def subtree_nodes_with_edge_length(tree, leaf_y, n):
    """ Returns list of length n of leaves closest to sister taxon (minimizing edge weights)

    Parameters
    ----------
    tree : treeswift tree object
    leaf_y : treeswift node for closest sister taxon
    n : number of taxa contained in subtree

    Returns
    -------
    list of taxon labels corresponding to leaves in the subtree
    """
    queue = [(leaf_y.get_edge_length(), leaf_y.get_parent())]

    leaves = [leaf_y]
    visited = {leaf_y}

    while len(leaves) < n:
        try:
            (length, node) = heapq.heappop(queue)
        except IndexError:
            break

        visited.add(node)
        if node.is_leaf() and node.get_label() != '':
            leaves.append(node)

        adjacent_nodes = node.child_nodes()
        if not node.is_root():
            adjacent_nodes.append(node.get_parent())

        for neighbor in adjacent_nodes:
            if neighbor not in visited:
                if neighbor == node.get_parent():
                    heapq.heappush(queue, (length+node.get_edge_length(), neighbor))
                else:
                    heapq.heappush(queue, (length+neighbor.get_edge_length(), neighbor))

    result = []
    for item in leaves:
        result.append(item.get_label())

    return result

def subtree_with_edge_length(tree, leaf_y, n):
    """ Returns a subtree and list of length n of leaves closest to sister taxon (minimizing edge weights)

    Parameters
    ----------
    tree : treeswift tree object
    leaf_y : treeswift node for closest sister taxon
    n : number of taxa contained in subtree

    Returns
    -------
    a subtree and a list of taxon labels corresponding to leaves in the subtree
    """
    if leaf_y.get_parent() == None:
        return None, None

    subtree = treeswift.Tree(); subtree.root.label = None; subtree.root.edge_length = 0
    subtreeNode = treeswift.Node(label=leaf_y.get_label(), edge_length=0); subtree.root.add_child(subtreeNode)

    queue = [(leaf_y.get_edge_length(), leaf_y.get_parent(), leaf_y.get_edge_length(), subtree.root)]

    leaves = [leaf_y]
    visited = {leaf_y}

    while len(leaves) < n:
        try:
            (length, node, n_edge_length, parent) = heapq.heappop(queue)
        except IndexError:
            break

        visited.add(node)
        subtreeNode = treeswift.Node(label=node.get_label(), edge_length=n_edge_length); parent.add_child(subtreeNode)
        if node.is_leaf():
            leaves.append(node)

        adjacent_nodes = node.child_nodes()
        if not node.is_root():
            adjacent_nodes.append(node.get_parent())

        for neighbor in adjacent_nodes:
            if neighbor not in visited:
                if neighbor == node.get_parent():
                    heapq.heappush(queue, (length+node.get_edge_length(), neighbor, node.get_edge_length(), subtreeNode))
                else:
                    heapq.heappush(queue, (length+neighbor.get_edge_length(), neighbor, neighbor.get_edge_length(), subtreeNode))

    result = []
    for item in leaves:
        result.append(item.get_label())

    #subtree.deroot()
    subtree.suppress_unifurcations()
    return result, subtree

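As an illustration of the weighted subtree extraction, a toy call on a four-leaf tree (the newick string is invented; distinct branch lengths keep the heap ordering unambiguous):

    import treeswift
    from bscampp.utils import subtree_with_edge_length

    tree = treeswift.read_tree_newick('((A:1,B:2):1,(C:3,D:4):2);')
    leaf_dict = tree.label_to_node(selection='leaves')
    labels, sub = subtree_with_edge_length(tree, leaf_dict['A'], 3)
    print(labels)        # should be ['A', 'B', 'C'] for this toy tree
    print(sub.newick())  # the pruned copy rooted near 'A'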
def extract_taxa_from_tree(a_tree, labels, leaf_dict):
    queue = []
    closest = []

    counter = 0

    for label in labels:
        heapq.heappush(queue,(counter,leaf_dict[label]))
        counter += 1

    while queue:
        (cnt, node) = heapq.heappop(queue)
        if node.num_children() == 0:
            parent = node.get_parent()
            if parent != None:
                parent.remove_child(node)
                heapq.heappush(queue,(counter, parent))

    a_tree.resolve_polytomies()
    a_tree.suppress_unifurcations()

    return a_tree

def min_tree_extract_disjoint(a_tree, max_size, tmp_leaves):
    labels, t2 = subtree_with_edge_length(a_tree, tmp_leaves[random.choice(list(tmp_leaves))], max_size)

    t2 = t2.extract_tree_with(labels)
    t1 = extract_taxa_from_tree(a_tree, labels, tmp_leaves)
    for label in labels:
        del tmp_leaves[label]
    return t1, t2, tmp_leaves


def min_tree_extract_non_disjoint(a_tree, max_size, tmp_leaves):
    labels, t2 = subtree_with_edge_length(a_tree, tmp_leaves[random.choice(list(tmp_leaves))], max_size)

    t2 = a_tree.extract_tree_with(labels)
    for label in labels:
        if label in tmp_leaves:
            del tmp_leaves[label]
    return a_tree, t2, tmp_leaves

def decompose_tree(a_tree,max_size):
    tree_list = []
    tmp_leaves = a_tree.label_to_node(selection='leaves')
    if '' in tmp_leaves:
        del tmp_leaves['']
    #t1, t2, tmp_leaves = min_tree_extract_disjoint(a_tree, max_size, tmp_leaves)
    t1, t2, tmp_leaves = min_tree_extract_non_disjoint(a_tree, max_size, tmp_leaves)
    while len(tmp_leaves) > 0:
        tree_list.append(t2)

        #t1, t2, tmp_leaves = min_tree_extract_disjoint(t1, max_size, tmp_leaves)
        t1, t2, tmp_leaves = min_tree_extract_non_disjoint(t1, max_size, tmp_leaves)
    tree_list.append(t2)
    print("nbr subtrees: ", len(tree_list))
    return tree_list

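A toy decomposition run, using the same kind of invented tree; with max_size 2 the two cherries should come back as separate subtrees:

    import treeswift
    from bscampp.utils import decompose_tree

    tree = treeswift.read_tree_newick('((A:1,B:2):4,(C:8,D:16):32);')
    subtrees = decompose_tree(tree, 2)     # prints "nbr subtrees:  2"
    for t in subtrees:
        print(sorted(t.label_to_node(selection='leaves')))   # ['A', 'B'] and ['C', 'D']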
def decompose_tree_min_clust(a_tree,max_size):
    tree_list = []
    t1, t2 = min_cluster_size_bisect(a_tree,max_size)
    while t2 != None:
        tree_list.append(t2)
        t1, t2 = min_cluster_size_bisect(t1, max_size)
    tree_list.append(t1) #this might be a bug
    return tree_list

def min_cluster_size_bisect(a_tree,max_size):
    '''
    modified from PASTA to use treeswift
    '''
    nleaf = dict()

    print("before extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
    #a_tree.draw()

    for node in a_tree.traverse_postorder():
        if node.is_leaf():
            nleaf[node] = 1
        else:
            nleaf[node] = 0
            max_child = None
            max_nleaf = 0
            for ch in node.child_nodes():
                nleaf[node] += nleaf[ch]
                if nleaf[ch] > max_nleaf:
                    max_nleaf = nleaf[ch]
                    max_child = ch
            if nleaf[node] >= max_size:
                node.remove_child(max_child)
                t1 = a_tree.extract_subtree(max_child)
                print("subtree size: " + str(len(t1.label_to_node(selection='leaves'))))
                print("after extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
                #t1.deroot()
                #t1.draw()
                t1.resolve_polytomies()
                return a_tree,t1

    print("after extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
    return a_tree,None

def centered_leaf(tree, cluster):
    #pick best sister leaf for cluster center
    best_leaf = cluster[0]
    if len(cluster) > 1:
        tmp_tree = tree.extract_tree_with(cluster)
        min_distance = 99999999999
        tmp_label_dict = tmp_tree.label_to_node(selection='leaves')
        for leaf in cluster:
            total = total_distance(tmp_tree, tmp_label_dict[leaf])

            if total < min_distance:
                min_distance = total
                best_leaf = leaf

    return best_leaf

def max_distance(tree, node):
    maximum = 0
    for leaf in tree.traverse_leaves():
        distance = tree.distance_between(node, leaf)
        if distance >= maximum:
            maximum = distance
    return maximum

def total_distance(tree, node):
    total = 0
    for leaf in tree.traverse_leaves():
        total += tree.distance_between(node, leaf)
    return total


def avg_distance(tree, node):
    total = 0
    leaf_cnt = 0
    for leaf in tree.traverse_leaves():
        total += tree.distance_between(node, leaf)
        leaf_cnt += 1
    return total/leaf_cnt

def median_distance(tree, node):
    total = 0
    leaf_cnt = 0
    distances = []
    for leaf in tree.traverse_leaves():
        distance = tree.distance_between(node, leaf)
        total += distance
        distances.append(distance)
        leaf_cnt += 1
    return statistics.median(distances)

def add_edge_nbrs(tree):
    counter = 0
    for node in tree.traverse_postorder():
        #if not node.is_root():
        counter += 1
        label = node.get_label()
        if label == None:
            node.set_label('%%{}'.format(counter))
        else:
            node.set_label('{}%%{}'.format(label, counter))

def remove_edge_nbrs(tree):
    for node in tree.traverse_postorder():
        #if not node.is_root():
        label_list = node.get_label().split('%%',1)
        if label_list[0] == '':
            node.set_label(None)
        else:
            node.set_label(label_list[0])

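A round-trip sketch for the edge-numbering helpers above, on an invented tree:

    import treeswift
    from bscampp.utils import add_edge_nbrs, remove_edge_nbrs

    tree = treeswift.read_tree_newick('((A:1,B:1)X:1,C:2);')
    add_edge_nbrs(tree)
    print(tree.newick())    # labels now carry postorder numbers, e.g. 'A%%1', 'X%%3'
    remove_edge_nbrs(tree)
    print(tree.newick())    # original labels restored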
'''
The following three functions are modified from treeswift to
read and write newick files with jplace tokens
'''
def newick_edge_tokens(tree):
    '''
    Modified from treeswift tree.newick()
    Output this ``Tree`` as a Newick string with labels
    Returns:
        ``str``: Newick string of this ``Tree``
    '''
    label_list = tree.root.get_label().split('%%',1)

    if tree.root.edge_length is None:
        suffix = ';'
    elif isinstance(tree.root.edge_length,int):
        suffix = '%s:%d{%d};' % (str(label_list[0]), int(tree.root.edge_length), int(label_list[1]))
    elif isinstance(tree.root.edge_length,float) and tree.root.edge_length.is_integer():
        suffix = '%s:%d{%d};' % (str(label_list[0]), float(tree.root.edge_length), int(label_list[1]))
    else:
        suffix = '%s:%s{%d};' % (str(label_list[0]), str(tree.root.edge_length), int(label_list[1]))

    if tree.is_rooted:
        return '[&R] %s%s' % (newick_edge_tokens_node(tree.root),suffix)
    else:
        return '%s%s' % (newick_edge_tokens_node(tree.root),suffix)

def newick_edge_tokens_node(node):
    '''
    Modified from treeswift node.newick()
    Newick string conversion starting at this ``Node`` object
    Returns:
        ``str``: Newick string conversion starting at this ``Node`` object
    '''
    node_to_str = dict()
    for node in node.traverse_postorder():
        node_label = node.get_label()
        [label, edge_nbr] = node_label.split('%%',1)
        #node.set_label(label_list[0])
        if node.is_leaf():
            if label is None:
                node_to_str[node] = ''
            else:
                node_to_str[node] = str(label)
        else:
            out = ['(']
            for c in node.children:
                c_label = c.get_label()
                [label_c, edge_nbr_c] = c_label.split('%%',1)
                out.append(node_to_str[c])
                if c.edge_length is not None:
                    if isinstance(c.edge_length,int):
                        l_str = str(c.edge_length)
                    elif isinstance(c.edge_length,float) and c.edge_length.is_integer():
                        l_str = str(int(c.edge_length))
                    else:
                        l_str = str(c.edge_length)
                    out.append(':%s{%d}' % (l_str, int(edge_nbr_c)))
                out.append(',')
                del node_to_str[c]
            out.pop() # trailing comma
            out.append(')')
            if label is not None:
                out.append(str(label))
            node_to_str[node] = ''.join(out)
    return node_to_str[node]

#def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
#    '''
#    Modified from treeswift tree.write_tree_newick()
#    Write this ``Tree`` to a Newick file
#    Args:
#        ``filename`` (``str``): Path to desired output file (plain-text or gzipped)
#    '''
#    if not isinstance(filename, str):
#        raise TypeError("filename must be a str")
#    treestr = newick_edge_nbr_string(tree)
#    if hide_rooted_prefix:
#        if treestr.startswith('[&R]'):
#            treestr = treestr[4:].strip()
#        else:
#            warn("Specified hide_rooted_prefix, but tree was not rooted")
#    if filename.lower().endswith('.gz'): # gzipped file
#        f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
#    else: # plain-text file
#        f = open(expanduser(filename),'w'); f.write(treestr); f.close()

def read_tree_newick_edge_tokens(newick):
    '''
    Modified from treeswift.read_tree_newick(newick)
    Read a tree from a Newick string or file
    Args:
        ``newick`` (``str``): Either a Newick string or the path to a Newick file (plain-text or gzipped)

    Returns:
        ``Tree``: The tree represented by ``newick``. If the Newick file has multiple trees (one per line), a ``list`` of ``Tree`` objects will be returned
    '''
    place_edge_dict = dict()
    if not isinstance(newick, str):
        try:
            newick = str(newick)
        except:
            raise TypeError("newick must be a str")
    if newick.lower().endswith('.gz'): # gzipped file
        f = gzip.open(expanduser(newick)); ts = f.read().decode().strip(); f.close()
    elif isfile(expanduser(newick)): # plain-text file
        f = open(expanduser(newick)); ts = f.read().strip(); f.close()
    else:
        ts = newick.strip()
    lines = ts.splitlines()
    if len(lines) != 1:
        return [read_tree_newick_edge_tokens(l) for l in lines]
    try:
        t = treeswift.Tree(); t.is_rooted = ts.startswith('[&R]')
        if ts[0] == '[':
            ts = ']'.join(ts.split(']')[1:]).strip(); ts = ts.replace(', ',',')
        n = t.root; i = 0
        while i < len(ts):
            # end of Newick string
            if ts[i] == ';':
                if i != len(ts)-1 or n != t.root:
                    raise RuntimeError("INVALID NEWICK")

            # go to new child
            elif ts[i] == '(':
                c = treeswift.Node(); n.add_child(c); n = c

            # go to parent
            elif ts[i] == ')':
                n = n.parent

            # go to new sibling
            elif ts[i] == ',':
                n = n.parent; c = treeswift.Node(); n.add_child(c); n = c

            # edge length
            elif ts[i] == ':':
                i += 1; ls = ''
                while ts[i] != ',' and ts[i] != ')' and ts[i] != ';' and ts[i] != '{':
                    ls += ts[i]; i += 1
                if ls[0] == '[':
                    n.edge_params = ']'.join(ls.split(']')[:-1]); ls = ls.split(']')[-1]
                n.edge_length = float(ls); i -= 1

            # edge token
            elif ts[i] == '{':
                i += 1; ls = ''
                while ts[i] != '}':
                    ls += ts[i]; i += 1
                place_edge_dict[ls] = n

            # node label
            else:
                label = ''; bracket = None
                while bracket is not None or ts[i] in BRACKET or (ts[i] != ':' and ts[i] != ',' and ts[i] != ';' and ts[i] != ')'):
                    if ts[i] in BRACKET and bracket is None:
                        bracket = ts[i]
                    elif bracket is not None and ts[i] == BRACKET[bracket]:
                        bracket = None
                    label += ts[i]; i += 1
                i -= 1; n.label = label
            i += 1
    except Exception as e:
        raise RuntimeError("Failed to parse string as Newick")
    return t, place_edge_dict
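Finally, a small example of the jplace-token reader above; the edge numbers in braces follow the jplace reference-tree convention, and the input string here is invented:

    from bscampp.utils import read_tree_newick_edge_tokens

    nwk = '((A:1{0},B:2{1}):1{2},C:3{3}):0{4};'
    tree, edge_map = read_tree_newick_edge_tokens(nwk)
    print(sorted(edge_map))             # ['0', '1', '2', '3', '4']
    print(edge_map['0'].get_label())    # 'A'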