bscampp 1.0.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/utils.py ADDED
@@ -0,0 +1,913 @@
1
+ #from dendropy import *
2
+ import numpy as np
3
+ import heapq
4
+ import treeswift
5
+ import itertools
6
+ from collections import deque
7
+ from os.path import expanduser,isfile
8
+ import random
9
+ import statistics
10
+ import copy
11
+
12
+ import argparse
13
+ # reformat argparse help text formatting
14
class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
    """argparse help formatter that honours explicit line breaks.

    Literal backslash-n / backslash-t sequences in description text are
    turned into real newline / tab characters, and help strings that contain
    newlines are wrapped one segment at a time so the breaks survive.
    """

    def add_text(self, text):
        """Register description text after expanding escaped ``\\n``/``\\t``."""
        if text is None:
            super().add_text(text)
            return
        expanded = text.replace("\\n", "\n").replace("\\t", "\t")
        super().add_text(expanded)

    def _split_lines(self, text, width):
        """Wrap ``text`` to ``width``, treating each newline as a hard break."""
        wrap = argparse.RawDescriptionHelpFormatter._split_lines
        if '\n' not in text:
            return wrap(self, text, width)
        wrapped = []
        for segment in text.split('\n'):
            wrapped.extend(wrap(self, segment, width))
        return wrapped
28
+
29
+ # store bracket open/close for convenience in label parsing
30
# Maps each opening delimiter to its closing counterpart; used by the Newick
# label parser to know when a bracketed/quoted label ends.
BRACKET = dict([
    ('[', ']'),  # square bracket
    ('{', '}'),  # curly bracket
    ("'", "'"),  # single-quote
    ('"', '"'),  # double-quote
])
36
+
37
+
38
def write_fasta(aln, aln_dict, aligned=True):
    """ Write given dictionary as FASTA file out

    Parameters
    ----------
    aln : FASTA file path (opened for writing, truncating any existing file)
    aln_dict : MSA in the form of a dict (label -> sequence string)
    aligned : whether the sequences are aligned; when False every '-' is
              removed from each sequence before writing

    Returns
    -------
    None

    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open(aln, 'w') as f:
        for label, seq in aln_dict.items():
            # entries with an empty label are skipped entirely
            if label == '':
                continue
            f.write(f'>{label}\n')
            if aligned:
                f.write(f'{seq}\n')
            else:
                f.write(seq.replace('-', '') + '\n')
62
+
63
+ # Separate the query and reference sequences from the alignment file
64
+
65
def read_data(aln):
    """ Load the query and reference sequences from the alignment file

    Parameters
    ----------
    aln : path to a FASTA multiple sequence alignment containing reference
          taxa and query sequences

    Returns
    -------
    dictionary containing sequences with taxon label keys

    """
    result = dict()

    taxa = ""
    chunks = []  # pieces of the current record, joined once per record
    with open(aln) as f:
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would corrupt the final line of a file without a trailing newline.
            line = line.rstrip('\n')
            if line.startswith('>'):
                if taxa != "":
                    result[taxa] = ''.join(chunks)
                taxa = line[1:]
                chunks = []
            elif line:
                # blank lines carry no sequence data and are ignored
                chunks.append(line)

    # flush the last record (records with an empty label are dropped)
    if taxa != "":
        result[taxa] = ''.join(chunks)

    return result
100
+
101
def seperate(aln_dict, leaf_dict):
    """ Split an alignment into reference and query sequences

    Parameters
    ----------
    aln_dict : Sequence dictionary with taxon label keys
    leaf_dict : container of backbone leaf labels; any alignment label found
                in it is treated as a reference, everything else as a query

    Returns
    -------
    (ref, query) : two dictionaries with taxon label keys, in the iteration
                   order of aln_dict

    """
    ref = {label: seq for label, seq in aln_dict.items() if label in leaf_dict}
    query = {label: seq for label, seq in aln_dict.items() if label not in leaf_dict}
    return ref, query
124
+
125
def hamming(seq1, seq2):
    """ Count the positions at which two sequences differ

    Parameters
    ----------
    seq1 : query sequence
    seq2 : reference sequence

    Returns
    -------
    integer number of mismatching positions; only the overlapping prefix is
    compared when the sequences have different lengths (zip truncates)

    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches
139
+
140
+
141
def find_y(x, ref):
    """ Returns leaf label for the closest sister taxon (no longer used)

    Parameters
    ----------
    x : aligned query sequence
    ref : reference multiple sequence alignment dictionary

    Returns
    -------
    label of the reference with the smallest hamming distance to x; the
    empty string when no reference is strictly closer than len(x)

    """
    best_name = ""
    best_dist = len(x)  # a candidate must beat "every site differs"
    for name, seq in ref.items():
        dist = hamming(x, seq)
        if dist < best_dist:
            best_dist, best_name = dist, name
    return best_name
162
+
163
def set_hamming_dict(args):
    """ Compute the closest references for one query and store them

    Parameters
    ----------
    args : pair ((name, seq), (ref, n, fragment_flag, y_dict)); the single
           packed argument presumably exists so the function can be mapped
           over a list of work items -- confirm against the caller

    Returns
    -------
    None; the result of find_closest_hamming is written to y_dict[name]

    """
    (name, seq), (ref, n, fragment_flag, y_dict) = args
    y_dict[name] = find_closest_hamming(seq, ref, n, fragment_flag)
169
+
170
+
171
def find_closest_hamming(x, ref, n, fragment_flag):
    ''' Returns leaf names for the n closest sister taxa to sequence x

    Distances are accumulated lazily in windows of 200 sites: a heap holds
    the partial distance of every reference, and only the currently best
    candidate is extended, so most references are never compared in full.

    Parameters
    ----------
    x : aligned query sequence
    ref : reference multiple sequence alignment dictionary
    n : number of nodes to return
    fragment_flag : True (or the string 'True') if the query is not full
                    length; leading/trailing gaps are then skipped

    Returns
    -------
    list of reference labels with the n smallest hamming distances to the
    query sequence (fewer than n if ref holds fewer than n sequences)

    '''
    queue = []
    closest = []

    # Accept a real boolean as well as the string form callers pass.
    if fragment_flag is True or fragment_flag == 'True':
        si, ei = set_fragment_indicies(x)
    else:
        si, ei = 0, len(x)

    c = 200 # size of the substring compared at once

    # Heap entries: (distance so far, sites still to compare, insertion
    # counter as a deterministic tie-breaker, reference label).
    for counter, (name, seq) in enumerate(ref.items()):
        first_window = hamming(seq[si:si+c], x[si:si+c])
        heapq.heappush(queue, (first_window, ei - si - c, counter, name))

    while queue:
        ham_dist, sites_left, cnt, name = heapq.heappop(queue)
        if sites_left < 0:
            # fully compared and still the smallest: it is the next closest
            closest.append(name)
            if len(closest) >= n:
                return closest
        else:
            # NOTE(review): the window [ind, ind+c) is not clamped to ei, so
            # the last window of a fragment can extend into its trailing
            # gaps -- confirm whether that is intended before changing it.
            ind = ei - sites_left
            new_ham = hamming(ref[name][ind:ind+c], x[ind:ind+c])
            heapq.heappush(queue, (ham_dist + new_ham, sites_left - c, cnt, name))

    # Fewer than n references were available: return what was found instead
    # of falling off the end and returning None.
    return closest
211
+
212
def set_fragment_indicies(x):
    """ Returns the indices delimiting x without leading and trailing gaps.

    Parameters
    ----------
    x : string sequence ('-' is the gap character)

    Returns
    -------
    list [si, ei] such that x[si:ei] starts at the first and ends at the
    last non-gap character. For an empty or all-gap sequence an empty range
    with si == ei == len(x) is returned, so si never exceeds ei.

    """
    # number of leading gaps
    si = len(x) - len(x.lstrip('-'))
    # one past the last non-gap character; clamped so an all-gap sequence
    # yields a consistent empty range regardless of its length
    ei = max(len(x.rstrip('-')), si)
    return [si, ei]
235
+
236
def find_closest(x, visited, y=None):
    """ Finds the closest leaf to node x by summed edge length, never walking
    through a node already in visited. If y is given, only the leaf whose
    label equals y's label is accepted.

    Parameters
    ----------
    x : tree node exposing get_parent(), child_nodes(), get_edge_length(),
        is_leaf() and get_label() (presumably a treeswift node)
    visited : set of nodes that must not be traversed; it is mutated, every
              node expanded during the search is added to it
    y : optional target leaf node

    Returns
    -------
    (leaf, path) : the leaf that was reached and the list of nodes recorded
                   on the way to it; (x, [x]) when no leaf is reachable

    """
    heap = []
    serial = 1  # unique per entry, so ties on length never compare paths/nodes
    visited.add(x)

    # seed the search with the edge above x ...
    if x.get_parent() and x.get_parent() not in visited:
        heapq.heappush(heap, [x.get_edge_length(), serial, [x], x.get_parent()])
        serial += 1

    # ... and with every edge below it
    for child in x.child_nodes():
        if child and child not in visited:
            heapq.heappush(heap, [child.get_edge_length(), serial, [child], child])
            serial += 1

    while heap:
        dist, _, path, node = heapq.heappop(heap)
        visited.add(node)

        if node.is_leaf():
            # a leaf is never expanded; reject it only when a specific
            # target was requested and this is not it
            if y and node.get_label() != y.get_label():
                continue
            return node, path

        up = node.get_parent()
        if up and up not in visited:
            # moving upward costs the edge above the current node
            heapq.heappush(heap, [dist + node.get_edge_length(), serial, path + [node], up])
            serial += 1

        for child in node.child_nodes():
            if child and child not in visited:
                heapq.heappush(heap, [dist + child.get_edge_length(), serial, path + [child], child])
                serial += 1

    return x, [x]
298
+
299
+
300
def find_closest_testing(x, visited, y=None, valid_leaves=None):
    """ Same search as find_closest, but also reports the distance and can
    restrict the accepted leaves to a given set of labels.

    Parameters
    ----------
    x : tree node exposing get_parent(), child_nodes(), get_edge_length(),
        is_leaf() and get_label() (presumably a treeswift node)
    visited : set of nodes that must not be traversed; mutated during search
    y : optional target leaf node (matched by label)
    valid_leaves : optional container of leaf labels; leaves whose label is
                   not in it are skipped

    Returns
    -------
    (leaf, path, distance) : the accepted leaf, the list of nodes recorded on
                             the way to it and the summed edge length;
                             (x, [x], 0) when no acceptable leaf is reachable

    """
    heap = []
    ticket = itertools.count(1)  # unique tie-breaker for equal distances

    def enqueue(dist, trail, target):
        heapq.heappush(heap, [dist, next(ticket), trail, target])

    visited.add(x)

    if x.get_parent() and x.get_parent() not in visited:
        enqueue(x.get_edge_length(), [x], x.get_parent())

    for child in x.child_nodes():
        if child and child not in visited:
            enqueue(child.get_edge_length(), [child], child)

    while heap:
        length, _, path, node = heapq.heappop(heap)
        visited.add(node)

        if node.is_leaf():
            is_target = (not y) or node.get_label() == y.get_label()
            if is_target and (valid_leaves is None or node.get_label() in valid_leaves):
                return node, path, length
            # leaves are never expanded further
            continue

        if node.get_parent() and node.get_parent() not in visited:
            enqueue(length + node.get_edge_length(), path + [node], node.get_parent())

        for child in node.child_nodes():
            if child and child not in visited:
                enqueue(length + child.get_edge_length(), path + [child], child)

    return x, [x], 0
367
+
368
def build_subtrees(sister_taxon_dict, leaf_dict, tree, nbr_subtrees, subtree_size):
    """ Cluster the sister taxa and build one subtree per cluster

    Every sister taxon starts as its own cluster. Clusters are merged with
    their nearest neighbouring cluster (shortest tree path first) until only
    nbr_subtrees clusters remain or no further merge is possible. A subtree
    of subtree_size leaves is then grown around the central leaf of each
    cluster.

    Parameters
    ----------
    sister_taxon_dict : dict keyed by leaf label; each value is a list that
                        is concatenated into the query list of the cluster
                        containing that leaf
    leaf_dict : dict mapping leaf label to its node in tree
    tree : backbone tree (passed to centered_leaf / subtree_with_edge_length)
    nbr_subtrees : number of clusters to stop merging at
    subtree_size : number of leaves requested for each subtree

    Returns
    -------
    (trees, query_decomp_dict) : list of subtrees and, at the same index, the
                                 list of queries assigned to each subtree

    """
    tree_building_taxa = len(sister_taxon_dict)
    cluster_dict = dict()   # leaf label -> cluster index
    cluster_list = dict()   # cluster index -> list of leaf labels

    for cluster_index, taxon in enumerate(sister_taxon_dict):
        cluster_dict[taxon] = cluster_index
        cluster_list[cluster_index] = [taxon]

    # Heap of [distance, leaf, nearest other sister leaf].
    queue = []
    for l1 in sister_taxon_dict:
        node_l2, _, length = find_closest_testing(leaf_dict[l1], {leaf_dict[l1]}, valid_leaves=sister_taxon_dict)
        heapq.heappush(queue, [length, l1, node_l2.get_label()])

    # Stop when the heap runs dry as well: popping an empty heap would raise
    # IndexError when the requested number of clusters cannot be reached.
    while tree_building_taxa > nbr_subtrees and queue:
        length, l1, l2 = heapq.heappop(queue)

        idx = cluster_dict[l1]
        old_idx = cluster_dict[l2]

        # stale entry: both leaves were merged into one cluster already
        if idx == old_idx:
            continue

        for leaf in cluster_list[old_idx]:
            cluster_dict[leaf] = idx
        cluster_list[idx].extend(cluster_list.pop(old_idx))

        tree_building_taxa -= 1

        # The centre depends only on the merged cluster, so compute it once
        # rather than once per member.
        best_leaf = centered_leaf(tree, cluster_list[idx])
        # Block the whole cluster so the search leaves it before it may stop.
        cluster_set = {leaf_dict[leaf] for leaf in cluster_list[idx]}
        node_l3, _, length = find_closest_testing(leaf_dict[best_leaf], cluster_set, valid_leaves=sister_taxon_dict)
        heapq.heappush(queue, [length, best_leaf, node_l3.get_label()])

    trees = []
    query_decomp_dict = []

    for idx, cluster in cluster_list.items():
        best_leaf = centered_leaf(tree, cluster)

        subtree_leaves, tree2 = subtree_with_edge_length(tree, leaf_dict[best_leaf], subtree_size)
        tree2 = tree2.extract_tree_with(subtree_leaves)
        trees.append(tree2)
        query_list = []
        for leaf in cluster:
            query_list.extend(sister_taxon_dict[leaf])
        query_decomp_dict.append(query_list)

    return trees, query_decomp_dict
430
+
431
+
432
def subtree_nodes(tree, leaf_y, n):
    """ Returns up to n leaf labels closest to the sister taxon, where
    distance is the number of edges (edge lengths are ignored)

    Parameters
    ----------
    tree : tree object (not used by the search itself)
    leaf_y : node for the closest sister taxon; the search starts from it
    n : number of taxa contained in the subtree

    Returns
    -------
    list of taxon labels, leaf_y first, then in order of discovery; shorter
    than n when the tree has fewer reachable leaves

    """
    # entries are (hop count, insertion counter, node)
    heap = [(0, 0, leaf_y.get_parent())]
    picked = [leaf_y]
    seen = {leaf_y}
    serial = 1

    while len(picked) < n and heap:
        hops, _, node = heapq.heappop(heap)
        seen.add(node)
        if node.is_leaf():
            picked.append(node)

        around = node.child_nodes()
        if not node.is_root():
            around.append(node.get_parent())

        for neighbor in around:
            if neighbor not in seen:
                heapq.heappush(heap, (hops + 1, serial, neighbor))
                serial += 1

    return [leaf.get_label() for leaf in picked]
478
+
479
def subtree_nodes_with_edge_length(tree, leaf_y, n):
    """ Returns up to n leaf labels closest to the sister taxon, where
    distance is the summed edge length along the tree path

    Parameters
    ----------
    tree : tree object (not used by the search itself)
    leaf_y : node for the closest sister taxon; the search starts from it
    n : number of taxa contained in the subtree

    Returns
    -------
    list of taxon labels, leaf_y first, then by increasing distance; leaves
    with an empty-string label are not collected
    """
    # entries are (distance, node); equal distances fall back to comparing
    # the node objects themselves
    heap = [(leaf_y.get_edge_length(), leaf_y.get_parent())]
    picked = [leaf_y]
    seen = {leaf_y}

    while len(picked) < n and heap:
        dist, node = heapq.heappop(heap)
        seen.add(node)
        if node.is_leaf() and node.get_label() != '':
            picked.append(node)

        around = node.child_nodes()
        if not node.is_root():
            around.append(node.get_parent())

        for neighbor in around:
            if neighbor in seen:
                continue
            # going up costs this node's own edge, going down the child's
            if neighbor == node.get_parent():
                step = node.get_edge_length()
            else:
                step = neighbor.get_edge_length()
            heapq.heappush(heap, (dist + step, neighbor))

    return [leaf.get_label() for leaf in picked]
523
+
524
def subtree_with_edge_length(tree, leaf_y, n):
    """ Grows a subtree around the sister taxon by repeatedly adding the
    nearest unvisited node (summed edge length) until n leaves are reached

    Parameters
    ----------
    tree : treeswift tree object (not used by the search itself)
    leaf_y : treeswift node for the closest sister taxon
    n : number of taxa contained in the subtree

    Returns
    -------
    (labels, subtree) : list of leaf labels that were collected (leaf_y
                        first) and the treeswift.Tree mirroring every node
                        visited by the search; (None, None) when leaf_y has
                        no parent
    """
    start = leaf_y.get_parent()
    if start is None:
        return None, None

    # The copy is rooted at a fresh unlabeled node holding leaf_y's mirror.
    subtree = treeswift.Tree()
    subtree.root.label = None
    subtree.root.edge_length = 0
    subtreeNode = treeswift.Node(label=leaf_y.get_label(), edge_length=0)
    subtree.root.add_child(subtreeNode)

    # entries: (distance from leaf_y, original node, edge length to give the
    # mirrored node, mirrored node to attach it under)
    first_edge = leaf_y.get_edge_length()
    heap = [(first_edge, start, first_edge, subtree.root)]

    picked = [leaf_y]
    seen = {leaf_y}

    while len(picked) < n and heap:
        dist, node, branch, attach_to = heapq.heappop(heap)

        seen.add(node)
        subtreeNode = treeswift.Node(label=node.get_label(), edge_length=branch)
        attach_to.add_child(subtreeNode)
        if node.is_leaf():
            picked.append(node)

        around = node.child_nodes()
        if not node.is_root():
            around.append(node.get_parent())

        for neighbor in around:
            if neighbor in seen:
                continue
            # going up costs this node's own edge, going down the child's
            if neighbor == node.get_parent():
                step = node.get_edge_length()
            else:
                step = neighbor.get_edge_length()
            heapq.heappush(heap, (dist + step, neighbor, step, subtreeNode))

    result = [leaf.get_label() for leaf in picked]

    #subtree.deroot()
    subtree.suppress_unifurcations()
    return result, subtree
577
+
578
def extract_taxa_from_tree(a_tree, labels, leaf_dict):
    """ Remove the given taxa from a tree in place

    Parameters
    ----------
    a_tree : tree to prune; must provide resolve_polytomies() and
             suppress_unifurcations()
    labels : iterable of leaf labels to remove
    leaf_dict : dict mapping each label to its node in a_tree

    Returns
    -------
    a_tree, after the leaves (and any ancestor left without children) have
    been detached and the tree has been cleaned up

    """
    # A plain FIFO is enough here: the processing order does not change the
    # outcome, and it avoids comparing node objects to break heap ties.
    pending = deque(leaf_dict[label] for label in labels)

    while pending:
        node = pending.popleft()
        # a node that still has children must stay in the tree
        if node.num_children() != 0:
            continue
        parent = node.get_parent()
        if parent is not None:
            parent.remove_child(node)
            # the parent may have just become childless; re-examine it
            pending.append(parent)

    a_tree.resolve_polytomies()
    a_tree.suppress_unifurcations()

    return a_tree
600
+
601
def min_tree_extract_disjoint(a_tree, max_size, tmp_leaves):
    """ Split off a subtree of up to max_size leaves around a random leaf
    and remove those leaves from the remaining tree

    Parameters
    ----------
    a_tree : tree to extract from (pruned in place)
    max_size : number of leaves requested for the extracted subtree
    tmp_leaves : dict mapping the remaining leaf labels to nodes; the
                 extracted labels are deleted from it

    Returns
    -------
    (t1, t2, tmp_leaves) : pruned tree, extracted subtree, updated dict

    """
    seed_label = random.choice(list(tmp_leaves))
    labels, grown = subtree_with_edge_length(a_tree, tmp_leaves[seed_label], max_size)

    t2 = grown.extract_tree_with(labels)
    t1 = extract_taxa_from_tree(a_tree, labels, tmp_leaves)
    for label in labels:
        del tmp_leaves[label]
    return t1, t2, tmp_leaves
609
+
610
+
611
def min_tree_extract_non_disjoint(a_tree, max_size, tmp_leaves):
    """ Extract a subtree of up to max_size leaves around a random leaf
    without modifying the source tree (subtrees may therefore overlap)

    Parameters
    ----------
    a_tree : tree to extract from (left untouched)
    max_size : number of leaves requested for the extracted subtree
    tmp_leaves : dict mapping not-yet-covered leaf labels to nodes; labels
                 covered by the new subtree are removed from it

    Returns
    -------
    (a_tree, t2, tmp_leaves) : the unchanged tree, the extracted subtree and
                               the updated dict

    """
    seed_label = random.choice(list(tmp_leaves))
    # only the labels are used; the subtree itself is re-extracted from a_tree
    labels, _ = subtree_with_edge_length(a_tree, tmp_leaves[seed_label], max_size)

    t2 = a_tree.extract_tree_with(labels)
    for label in labels:
        tmp_leaves.pop(label, None)
    return a_tree, t2, tmp_leaves
619
+
620
def decompose_tree(a_tree,max_size):
    """ Cover a tree with (possibly overlapping) subtrees of up to max_size
    leaves

    Parameters
    ----------
    a_tree : tree to decompose
    max_size : number of leaves requested per subtree

    Returns
    -------
    list of subtrees; extraction is repeated until every labeled leaf has
    appeared in at least one of them

    """
    tree_list = []
    uncovered = a_tree.label_to_node(selection='leaves')
    # leaves without a usable label cannot be tracked
    uncovered.pop('', None)

    remainder = a_tree
    while True:
        remainder, piece, uncovered = min_tree_extract_non_disjoint(remainder, max_size, uncovered)
        tree_list.append(piece)
        if len(uncovered) == 0:
            break

    print("nbr subtrees: ", len(tree_list))
    return tree_list
635
+
636
def decompose_tree_min_clust(a_tree,max_size):
    """ Decompose a tree by repeatedly bisecting it with
    min_cluster_size_bisect

    Parameters
    ----------
    a_tree : tree to decompose (modified by the bisection)
    max_size : size threshold passed on to min_cluster_size_bisect

    Returns
    -------
    list of the split-off subtrees followed by whatever remains of a_tree

    """
    tree_list = []
    remainder = a_tree
    while True:
        remainder, piece = min_cluster_size_bisect(remainder, max_size)
        if piece is None:
            break
        tree_list.append(piece)
    # the original author was unsure whether appending the remainder is
    # correct ("this might be a bug")
    tree_list.append(remainder)
    return tree_list
644
+
645
def min_cluster_size_bisect(a_tree,max_size):
    '''
    modified from PASTA to use treeswift

    Walks the tree bottom-up counting leaves; at the first internal node
    whose leaf count reaches max_size, its largest child clade is detached
    and returned as a separate tree.

    Returns (a_tree, detached subtree), or (a_tree, None) when no node
    reaches max_size. Leaf counts are printed before and after.
    '''
    leaf_count = dict()

    print("before extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))

    for node in a_tree.traverse_postorder():
        if node.is_leaf():
            leaf_count[node] = 1
            continue

        # NOTE(review): the size check below is assumed to apply to internal
        # nodes only (the source's indentation was lost) -- confirm.
        leaf_count[node] = 0
        biggest_child = None
        biggest = 0
        for ch in node.child_nodes():
            leaf_count[node] += leaf_count[ch]
            if leaf_count[ch] > biggest:
                biggest = leaf_count[ch]
                biggest_child = ch

        if leaf_count[node] < max_size:
            continue

        node.remove_child(biggest_child)
        t1 = a_tree.extract_subtree(biggest_child)
        print("subtree size: " + str(len(t1.label_to_node(selection='leaves'))))
        print("after extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
        t1.resolve_polytomies()
        return a_tree,t1

    print("after extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
    return a_tree,None
678
+
679
def centered_leaf(tree, cluster):
    """ Pick the most central leaf of a cluster

    Parameters
    ----------
    tree : tree containing every leaf of the cluster
    cluster : non-empty list of leaf labels

    Returns
    -------
    label whose summed distance to all cluster leaves (measured in the tree
    restricted to the cluster) is smallest; the first label wins ties

    """
    best_leaf = cluster[0]
    if len(cluster) <= 1:
        return best_leaf

    restricted = tree.extract_tree_with(cluster)
    label_to_leaf = restricted.label_to_node(selection='leaves')
    min_distance = 99999999999
    for leaf in cluster:
        total = total_distance(restricted, label_to_leaf[leaf])
        if total < min_distance:
            min_distance, best_leaf = total, leaf

    return best_leaf
694
+
695
def max_distance(tree, node):
    """ Largest distance from node to any leaf of tree (0 for no leaves) """
    farthest = 0
    for leaf in tree.traverse_leaves():
        d = tree.distance_between(node, leaf)
        farthest = d if d >= farthest else farthest
    return farthest
702
+
703
def total_distance(tree, node):
    """ Sum of the distances from node to every leaf of tree """
    distances = [tree.distance_between(node, leaf) for leaf in tree.traverse_leaves()]
    # plain left-to-right accumulation, kept explicit so float rounding stays
    # exactly as before
    running = 0
    for d in distances:
        running += d
    return running
708
+
709
+
710
def avg_distance(tree, node):
    """ Mean distance from node to the leaves of tree

    Raises ZeroDivisionError when the tree yields no leaves.
    """
    distances = [tree.distance_between(node, leaf) for leaf in tree.traverse_leaves()]
    running = 0
    for d in distances:
        running += d
    return running/len(distances)
717
+
718
def median_distance(tree, node):
    """ Median of the distances from node to every leaf of tree """
    return statistics.median(
        [tree.distance_between(node, leaf) for leaf in tree.traverse_leaves()]
    )
728
+
729
def add_edge_nbrs(tree):
    """ Append a running number to every node label

    Nodes are numbered from 1 in postorder. Each label becomes
    '<label>%%<number>' ('%%<number>' for unlabeled nodes), which
    remove_edge_nbrs and the Newick writers later split apart again.
    """
    for number, node in enumerate(tree.traverse_postorder(), 1):
        label = node.get_label()
        prefix = '' if label == None else '{}'.format(label)
        node.set_label(f'{prefix}%%{number}')
739
+
740
def remove_edge_nbrs(tree):
    """ Strip the '%%<number>' suffix from every node label

    Everything from the first '%%' onward is dropped; a label that becomes
    empty is reset to None.
    """
    for node in tree.traverse_postorder():
        name = node.get_label().partition('%%')[0]
        node.set_label(name if name != '' else None)
748
+
749
+ '''
750
+ The following three functions are modified from treeswift to
751
+ read and write newick files with jplace tokens
752
+ '''
753
def newick_edge_tokens(tree):
    '''
    Modified from treeswift tree.newick()
    Output this ``Tree`` as a Newick string in which every edge length is
    followed by its ``{edge number}`` token. Node labels must carry the
    ``%%<number>`` suffix.
    Returns:
        ``str``: Newick string of this ``Tree``
    '''
    root = tree.root
    label_list = root.get_label().split('%%',1)
    root_length = root.edge_length

    # NOTE(review): when the root has an edge length its label is emitted
    # here and also by newick_edge_tokens_node, so a non-empty root label
    # would appear twice -- confirm this is intended.
    if root_length is None:
        suffix = ';'
    else:
        name = str(label_list[0])
        edge_nbr = int(label_list[1])
        if isinstance(root_length,int):
            suffix = '%s:%d{%d};' % (name, int(root_length), edge_nbr)
        elif isinstance(root_length,float) and root_length.is_integer():
            suffix = '%s:%d{%d};' % (name, float(root_length), edge_nbr)
        else:
            suffix = '%s:%s{%d};' % (name, str(root_length), edge_nbr)

    prefix = '[&R] ' if tree.is_rooted else ''
    return '%s%s%s' % (prefix, newick_edge_tokens_node(root), suffix)
775
+
776
def newick_edge_tokens_node(node):
    '''
    Modified from treeswift node.newick()
    Newick string conversion starting at this ``Node`` object. Every label
    must have the form ``<label>%%<edge number>``; the number is written as
    a ``{n}`` token after the edge length of each child that has one.
    Returns:
        ``str``: Newick string conversion starting at this ``Node`` object
    '''
    rendered = dict()    # node -> Newick text of its finished subtree
    edge_token = dict()  # node -> edge number parsed from its label
    current = node
    for current in node.traverse_postorder():
        label, edge_nbr = current.get_label().split('%%',1)
        edge_token[current] = edge_nbr

        if current.is_leaf():
            rendered[current] = '' if label is None else str(label)
            continue

        parts = []
        for c in current.children:
            # children are complete by now (postorder); consume their text
            piece = rendered.pop(c)
            length = c.edge_length
            if length is not None:
                # whole-number floats are printed without a decimal part
                if isinstance(length,float) and length.is_integer():
                    l_str = str(int(length))
                else:
                    l_str = str(length)
                piece += ':%s{%d}' % (l_str, int(edge_token[c]))
            parts.append(piece)

        text = '(' + ','.join(parts) + ')'
        if label is not None:
            text += str(label)
        rendered[current] = text

    # postorder ends at the starting node, so this is the full string
    return rendered[current]
815
+
816
def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
    '''
    Modified from treeswift tree.write_tree_newick()
    Write this ``Tree`` to a Newick file with ``{edge number}`` tokens
    Args:
        ``tree``: tree whose labels carry the ``%%<number>`` suffix

        ``filename`` (``str``): Path to desired output file (plain-text or gzipped)

        ``hide_rooted_prefix`` (``bool``): strip a leading ``[&R]``; a warning is issued if the string has no such prefix
    Raises:
        ``TypeError``: if ``filename`` is not a ``str``
    '''
    # Imported here because the module does not import them at the top level.
    import gzip
    import warnings

    if not isinstance(filename, str):
        raise TypeError("filename must be a str")
    # newick_edge_tokens is the serializer defined in this module; the name
    # previously called here does not exist anywhere in the file.
    treestr = newick_edge_tokens(tree)
    if hide_rooted_prefix:
        if treestr.startswith('[&R]'):
            treestr = treestr[4:].strip()
        else:
            warnings.warn("Specified hide_rooted_prefix, but tree was not rooted")
    path = expanduser(filename)
    if filename.lower().endswith('.gz'): # gzipped file
        with gzip.open(path, 'wb', 9) as f:
            f.write(treestr.encode())
    else: # plain-text file
        with open(path, 'w') as f:
            f.write(treestr)
835
+
836
def read_tree_newick_edge_tokens(newick):
    '''
    Modified from treeswift.read_tree_newick(newick)
    Read a tree from a Newick string or file whose edges may carry
    ``{edge number}`` tokens after the edge length
    Args:
        ``newick`` (``str``): Either a Newick string or the path to a Newick file (plain-text or gzipped)

    Returns:
        ``(Tree, dict)``: The tree represented by ``newick`` and a dictionary mapping each edge token (as a ``str``) to the node below that edge. If the input does not consist of exactly one line, a ``list`` with one such result per line is returned instead
    Raises:
        ``RuntimeError``: if the string cannot be parsed as Newick
    '''
    # Imported here because the module does not import gzip at the top level.
    import gzip

    invalid_newick = "Failed to parse string as Newick"
    place_edge_dict = dict()
    if not isinstance(newick, str):
        try:
            newick = str(newick)
        except Exception as e:
            raise TypeError("newick must be a str") from e
    if newick.lower().endswith('.gz'): # gzipped file
        with gzip.open(expanduser(newick)) as f:
            ts = f.read().decode().strip()
    elif isfile(expanduser(newick)): # plain-text file
        with open(expanduser(newick)) as f:
            ts = f.read().strip()
    else:
        ts = newick.strip()
    lines = ts.splitlines()
    if len(lines) != 1:
        return [read_tree_newick_edge_tokens(l) for l in lines]
    try:
        t = treeswift.Tree()
        t.is_rooted = ts.startswith('[&R]')
        if ts[0] == '[':
            # drop the leading [...] annotation
            ts = ']'.join(ts.split(']')[1:]).strip()
            ts = ts.replace(', ',',')
        n = t.root
        i = 0
        while i < len(ts):
            # end of Newick string
            if ts[i] == ';':
                if i != len(ts)-1 or n != t.root:
                    raise RuntimeError(invalid_newick)

            # go to new child
            elif ts[i] == '(':
                c = treeswift.Node()
                n.add_child(c)
                n = c

            # go to parent
            elif ts[i] == ')':
                n = n.parent

            # go to new sibling
            elif ts[i] == ',':
                n = n.parent
                c = treeswift.Node()
                n.add_child(c)
                n = c

            # edge length
            elif ts[i] == ':':
                i += 1
                ls = ''
                while ts[i] != ',' and ts[i] != ')' and ts[i] != ';' and ts[i] != '{':
                    ls += ts[i]
                    i += 1
                if ls[0] == '[':
                    n.edge_params = ']'.join(ls.split(']')[:-1])
                    ls = ls.split(']')[-1]
                n.edge_length = float(ls)
                # step back so the shared increment lands on the terminator
                i -= 1

            # edge token
            elif ts[i] == '{':
                i += 1
                ls = ''
                while ts[i] != '}':
                    ls += ts[i]
                    i += 1
                place_edge_dict[ls] = n

            # node label
            else:
                label = ''
                bracket = None
                while bracket is not None or ts[i] in BRACKET or (ts[i] != ':' and ts[i] != ',' and ts[i] != ';' and ts[i] != ')'):
                    if ts[i] in BRACKET and bracket is None:
                        bracket = ts[i]
                    elif bracket is not None and ts[i] == BRACKET[bracket]:
                        bracket = None
                    label += ts[i]
                    i += 1
                i -= 1
                n.label = label
            i += 1
    except Exception as e:
        # keep the original failure attached for debugging
        raise RuntimeError(invalid_newick) from e
    return t, place_edge_dict