bscampp 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bscampp/utils.py ADDED
@@ -0,0 +1,914 @@
1
+ #from dendropy import *
2
+ import numpy as np
3
+ import heapq
4
+ import treeswift
5
+ import itertools
6
+ from collections import deque
7
+ from os.path import expanduser,isfile
8
+ import random
9
+ import statistics
10
+ import copy
11
+ import gzip
12
+
13
+ import argparse
14
+ # reformat argparse help text formatting
15
class SmartHelpFormatter(argparse.RawDescriptionHelpFormatter):
    """Help formatter that expands escaped newline/tab sequences and keeps
    explicit line breaks when wrapping help text."""

    def add_text(self, text):
        """Turn literal backslash-n / backslash-t pairs into real newline / tab
        characters, then hand the text to the base formatter."""
        expanded = text
        if expanded is not None:
            expanded = expanded.replace("\\n", "\n").replace("\\t", "\t")
        super().add_text(expanded)

    def _split_lines(self, text, width):
        """Wrap text to width; each newline-separated piece is wrapped on its own."""
        base_split = argparse.RawDescriptionHelpFormatter._split_lines
        if '\n' not in text:
            return base_split(self, text, width)
        wrapped = []
        for piece in text.split('\n'):
            wrapped.extend(base_split(self, piece, width))
        return wrapped
29
+
30
# Opening delimiter -> matching closing delimiter, used when parsing node
# labels (quote characters close themselves).
BRACKET = {
    "[": "]",  # square bracket
    "{": "}",  # curly bracket
    "'": "'",  # single-quote
    "\"": "\"",  # double-quote
}
37
+
38
+
39
def write_fasta(aln, aln_dict, aligned=True):
    """ Write the given sequence dictionary out as a FASTA file

    Parameters
    ----------
    aln : FASTA file path to create (an existing file is overwritten)
    aln_dict : MSA in the form of a dict mapping label -> sequence string
    aligned : if True sequences are written unchanged; otherwise every '-'
              is removed before writing

    Returns
    -------
    None

    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open(aln, 'w') as f:
        for label, seq in aln_dict.items():
            # entries with an empty label are skipped entirely
            if label == '':
                continue
            f.write(f'>{label}\n')
            if aligned:
                f.write(f'{seq}\n')
            else:
                f.write(seq.replace('-', '') + '\n')
63
+
64
+ # separate the query and ref sequences from the alignment file
65
+
66
def read_data(aln):
    """ Load the query and reference sequences from a FASTA alignment file

    Parameters
    ----------
    aln : path to the multiple sequence alignment containing reference taxa
          and query sequences

    Returns
    -------
    dictionary containing sequences with taxon label keys. Sequences that
    span several lines are concatenated; records with an empty label and
    any text before the first header are ignored.

    """
    result = dict()
    taxa = ""
    pieces = []

    with open(aln) as f:
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would eat a residue when the file lacks a trailing newline.
            line = line.rstrip('\n')
            if line.startswith('>'):
                if taxa != "":
                    result[taxa] = ''.join(pieces)
                taxa = line[1:]
                pieces = []
            elif line:
                # blank lines contribute nothing to the sequence
                pieces.append(line)

    if taxa != "":
        result[taxa] = ''.join(pieces)

    return result
101
+
102
def seperate(aln_dict, leaf_dict):
    """ Split an alignment into reference and query sequences

    Parameters
    ----------
    aln_dict : Sequence dictionary with taxon label keys
    leaf_dict : container of backbone-tree leaf labels; a label found here is
                treated as a reference, every other label as a query

    Returns
    -------
    (ref, query) : two dictionaries with taxon label keys, holding the
                   reference sequences and the query sequences respectively

    """
    ref = {name: seq for name, seq in aln_dict.items() if name in leaf_dict}
    query = {name: seq for name, seq in aln_dict.items() if name not in leaf_dict}
    return ref, query
125
+
126
def hamming(seq1, seq2):
    """ Count the positions at which two sequences differ

    Parameters
    ----------
    seq1 : query sequence
    seq2 : reference sequence

    Returns
    -------
    integer number of mismatching positions; the comparison stops at the end
    of the shorter sequence

    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches
140
+
141
+
142
def find_y(x, ref):
    """ Return the label of the reference closest to x by hamming distance (no longer used)

    Parameters
    ----------
    x : aligned query sequence
    ref : reference multiple sequence alignment dictionary

    Returns
    -------
    label of the first reference whose hamming distance to x is strictly
    below len(x) and below every earlier candidate; "" if there is none

    """
    best_name = ""
    best_dist = len(x)
    for name in ref:
        dist = hamming(x, ref[name])
        if dist < best_dist:
            best_dist, best_name = dist, name
    return best_name
163
+
164
def set_hamming_dict(args):
    """ Compute the closest references for one query and store them in y_dict

    Parameters
    ----------
    args : pair ((name, seq), (ref, n, fragment_flag, y_dict)) packed into a
           single argument (presumably so the function can be used with
           map-style helpers; confirm against the caller)

    Returns
    -------
    None; y_dict[name] is set to find_closest_hamming(seq, ref, n, fragment_flag)

    """
    (name, seq), (ref, n, fragment_flag, y_dict) = args
    y_dict[name] = find_closest_hamming(seq, ref, n, fragment_flag)
170
+
171
+
172
def find_closest_hamming(x, ref, n, fragment_flag):
    ''' Returns leaf names for the n closest sister taxa to sequence x

    The hamming distance is accumulated lazily in windows of 200 sites: a
    priority queue always extends the reference with the smallest partial
    distance, so most references are never compared in full.

    Parameters
    ----------
    x : aligned query sequence
    ref : reference multiple sequence alignment dictionary
    n : number of nodes to return
    fragment_flag : True (or the string 'True') if the query is not full
                    length; leading and trailing gaps are then excluded from
                    the compared range

    Returns
    -------
    list of labels with the n smallest hamming distances to the query
    sequence, closest first. If ref holds fewer than n sequences, all of
    them are returned (previously None was returned in that case).

    '''
    # Accept both the boolean and the legacy string spelling of the flag.
    if fragment_flag is True or fragment_flag == 'True':
        [si, ei] = set_fragment_indicies(x)
    else:
        [si, ei] = [0, len(x)]

    c = 200 # size of the substring compared at once

    # Heap entries: (partial distance, sites still to compare, insertion
    # counter, label). The unique counter keeps ties deterministic.
    queue = []
    for counter, (name, seq) in enumerate(ref.items()):
        heapq.heappush(queue, (hamming(seq[si:si+c], x[si:si+c]), ei - si - c, counter, name))

    closest = []
    while queue:
        (ham_dist, sites_left, cnt, name) = heapq.heappop(queue)
        if sites_left < 0:
            # the whole range has been compared for this reference
            closest.append(name)
            if len(closest) >= n:
                return closest
        else:
            ind = ei - sites_left
            new_ham = hamming(ref[name][ind:ind+c], x[ind:ind+c])
            heapq.heappush(queue, (ham_dist + new_ham, sites_left - c, cnt, name))

    # fewer than n references were available
    return closest
212
+
213
def set_fragment_indicies(x):
    """ Returns the indices delimiting x without its leading and trailing gaps.

    Parameters
    ----------
    x : string sequence

    Returns
    -------
    [si, ei] : si is the index of the first non-gap character and ei is one
    past the last non-gap character, so x[si:ei] is the ungapped core.
    For an empty or all-gap sequence the range is empty (si == ei == len(x)).

    """
    si = len(x) - len(x.lstrip('-'))
    # max() keeps the range well-formed (si <= ei) when x consists only of gaps
    ei = max(si, len(x.rstrip('-')))
    return [si, ei]
236
+
237
def find_closest(x, visited, y=None):
    """ Search outward from node x for the nearest leaf without passing through visited nodes.

    Candidates are expanded in order of accumulated edge length, ties being
    broken by insertion order. If y is given, only a leaf whose label equals
    y's label is accepted.

    Parameters
    ----------
    x : tree node to start from
    visited : set of nodes that may not be traversed; x and every node
              expanded by the search are added to it
    y : optional target leaf node

    Returns
    -------
    (leaf, path) : the accepted leaf and the list of nodes recorded on the way
                   to it (moving up records the node being left, moving down
                   records the child being entered)
    (x, [x]) : when no acceptable leaf can be reached

    """
    frontier = []
    order = 1
    visited.add(x)

    above = x.get_parent()
    if above and above not in visited:
        heapq.heappush(frontier, [x.get_edge_length(), order, [x], above])
        order += 1

    for below in x.child_nodes():
        if below and below not in visited:
            heapq.heappush(frontier, [below.get_edge_length(), order, [below], below])
            order += 1

    while frontier:
        dist, _, route, current = heapq.heappop(frontier)
        visited.add(current)

        if current.is_leaf():
            if (not y) or current.get_label() == y.get_label():
                return current, route
            # a non-matching leaf is a dead end
            continue

        above = current.get_parent()
        if above and above not in visited:
            heapq.heappush(frontier, [dist + current.get_edge_length(), order, route + [current], above])
            order += 1

        for below in current.child_nodes():
            if below and below not in visited:
                heapq.heappush(frontier, [dist + below.get_edge_length(), order, route + [below], below])
                order += 1

    return x, [x]
299
+
300
+
301
def find_closest_testing(x, visited, y=None, valid_leaves=None):
    """ Search outward from node x for the nearest acceptable leaf and report its distance.

    Candidates are expanded in order of accumulated edge length, ties being
    broken by insertion order. A leaf is accepted when its label equals y's
    label (if y is given) and it is contained in valid_leaves (if given).

    Parameters
    ----------
    x : tree node to start from
    visited : set of nodes that may not be traversed; x and every node
              expanded by the search are added to it
    y : optional target leaf node
    valid_leaves : optional container of leaf labels that may be returned

    Returns
    -------
    (leaf, path, length) : the accepted leaf, the list of nodes recorded on
                           the way to it, and the accumulated edge length
    (x, [x], 0) : when no acceptable leaf can be reached

    """
    frontier = []
    order = 1
    visited.add(x)

    above = x.get_parent()
    if above and above not in visited:
        heapq.heappush(frontier, [x.get_edge_length(), order, [x], above])
        order += 1

    for below in x.child_nodes():
        if below and below not in visited:
            heapq.heappush(frontier, [below.get_edge_length(), order, [below], below])
            order += 1

    while frontier:
        dist, _, route, current = heapq.heappop(frontier)
        visited.add(current)

        if current.is_leaf():
            label_ok = (not y) or current.get_label() == y.get_label()
            if label_ok and (valid_leaves is None or current.get_label() in valid_leaves):
                return current, route, dist
            # rejected leaves are dead ends
            continue

        above = current.get_parent()
        if above and above not in visited:
            heapq.heappush(frontier, [dist + current.get_edge_length(), order, route + [current], above])
            order += 1

        for below in current.child_nodes():
            if below and below not in visited:
                heapq.heappush(frontier, [dist + below.get_edge_length(), order, route + [below], below])
                order += 1

    return x, [x], 0
368
+
369
def build_subtrees(sister_taxon_dict, leaf_dict, tree, nbr_subtrees, subtree_size):
    """ Cluster the sister taxa into at most nbr_subtrees groups and build one subtree per group

    Every sister taxon starts as its own cluster. Clusters are merged
    greedily, shortest connecting path first, until nbr_subtrees clusters
    remain or no merge candidate is left. For each final cluster a subtree of
    subtree_size leaves is grown around the cluster's centered leaf.

    Parameters
    ----------
    sister_taxon_dict : dict keyed by sister-taxon leaf label; each value is
                        an iterable of the queries assigned to that leaf
    leaf_dict : dict mapping leaf label -> tree node
    tree : treeswift tree object (backbone tree)
    nbr_subtrees : target number of clusters / subtrees
    subtree_size : number of leaves requested for each subtree

    Returns
    -------
    (trees, query_decomp_dict) : list of subtrees and, at the same index, the
                                 list of queries assigned to that subtree

    """
    tree_building_taxa = len(sister_taxon_dict)
    cluster_dict = dict()   # leaf label -> cluster index
    cluster_list = dict()   # cluster index -> list of leaf labels

    for cluster_index, taxon in enumerate(sister_taxon_dict):
        cluster_dict[taxon] = cluster_index
        cluster_list[cluster_index] = [taxon]

    # candidate merges: [path length, leaf in cluster, nearest sister leaf outside]
    queue = []
    for l1 in sister_taxon_dict:
        node_l2, _, length = find_closest_testing(leaf_dict[l1], {leaf_dict[l1]}, valid_leaves=sister_taxon_dict)
        heapq.heappush(queue, [length, l1, node_l2.get_label()])

    # Stop as well when the candidates run out instead of popping an empty heap.
    while tree_building_taxa > nbr_subtrees and queue:
        [length, l1, l2] = heapq.heappop(queue)

        idx = cluster_dict[l1]
        old_idx = cluster_dict[l2]

        # stale candidate: both ends already share a cluster
        if idx == old_idx:
            continue

        for leaf in cluster_list[old_idx]:
            cluster_dict[leaf] = idx
        cluster_list[idx].extend(cluster_list[old_idx])
        cluster_list.pop(old_idx)

        tree_building_taxa -= 1

        # The center depends only on the cluster, so compute it once.
        best_leaf = centered_leaf(tree, cluster_list[idx])
        cluster_set = {leaf_dict[leaf] for leaf in cluster_list[idx]}
        node_l3, _, length = find_closest_testing(leaf_dict[best_leaf], cluster_set, valid_leaves=sister_taxon_dict)
        heapq.heappush(queue, [length, best_leaf, node_l3.get_label()])

    trees = []
    query_decomp_dict = []

    for idx, cluster in cluster_list.items():
        best_leaf = centered_leaf(tree, cluster)

        subtree_leaves, tree2 = subtree_with_edge_length(tree, leaf_dict[best_leaf], subtree_size)
        tree2 = tree2.extract_tree_with(subtree_leaves)
        trees.append(tree2)

        query_list = []
        for leaf in cluster:
            query_list.extend(sister_taxon_dict[leaf])
        query_decomp_dict.append(query_list)

    return trees, query_decomp_dict
431
+
432
+
433
def subtree_nodes(tree, leaf_y, n):
    """ Collect up to n leaves nearest to leaf_y, measured in number of edges

    Parameters
    ----------
    tree : treeswift tree object (not used by the search itself)
    leaf_y : treeswift node for closest sister taxon
    n : number of taxa contained in subtree

    Returns
    -------
    list of taxon labels corresponding to leaves in the subtree, starting
    with leaf_y; shorter than n if the search runs out of nodes

    """
    frontier = [(0, 0, leaf_y.get_parent())]
    picked = [leaf_y]
    seen = {leaf_y}
    tiebreak = 1

    while len(picked) < n and frontier:
        hops, _, current = heapq.heappop(frontier)

        seen.add(current)
        if current.is_leaf():
            picked.append(current)

        # neighbours are the children followed by the parent (if any)
        neighbours = current.child_nodes()
        if not current.is_root():
            neighbours.append(current.get_parent())

        for nxt in neighbours:
            if nxt not in seen:
                heapq.heappush(frontier, (hops + 1, tiebreak, nxt))
                tiebreak += 1

    return [leaf.get_label() for leaf in picked]
479
+
480
def subtree_nodes_with_edge_length(tree, leaf_y, n):
    """ Collect up to n leaves nearest to leaf_y, measured by summed edge length

    Parameters
    ----------
    tree : treeswift tree object (not used by the search itself)
    leaf_y : treeswift node for closest sister taxon
    n : number of taxa contained in subtree

    Returns
    -------
    list of taxon labels corresponding to leaves in the subtree, starting
    with leaf_y; leaves with an empty-string label are not collected
    """
    frontier = [(leaf_y.get_edge_length(), leaf_y.get_parent())]
    picked = [leaf_y]
    seen = {leaf_y}

    while len(picked) < n and frontier:
        dist, current = heapq.heappop(frontier)

        seen.add(current)
        if current.is_leaf() and current.get_label() != '':
            picked.append(current)

        # neighbours are the children followed by the parent (if any)
        neighbours = current.child_nodes()
        if not current.is_root():
            neighbours.append(current.get_parent())

        for nxt in neighbours:
            if nxt in seen:
                continue
            # the edge to the parent belongs to current, the edge to a child
            # belongs to that child
            if nxt == current.get_parent():
                step = current.get_edge_length()
            else:
                step = nxt.get_edge_length()
            heapq.heappush(frontier, (dist + step, nxt))

    return [leaf.get_label() for leaf in picked]
524
+
525
def subtree_with_edge_length(tree, leaf_y, n):
    """ Grow a subtree of up to n leaves around leaf_y, nearest (by summed edge length) first

    While the search expands the original tree, a mirror tree is assembled:
    every expanded node is copied and attached beneath the copy of the node
    it was reached from.

    Parameters
    ----------
    tree : treeswift tree object (not used by the search itself)
    leaf_y : treeswift node for closest sister taxon
    n : number of taxa contained in subtree

    Returns
    -------
    (labels, subtree) : list of taxon labels of the collected leaves and the
                        mirror tree with unifurcations suppressed
    (None, None) : if leaf_y has no parent
    """
    if leaf_y.get_parent() is None:
        return None, None

    subtree = treeswift.Tree()
    subtree.root.label = None
    subtree.root.edge_length = 0
    subtree.root.add_child(treeswift.Node(label=leaf_y.get_label(), edge_length=0))

    # heap entries: (distance from leaf_y, node, edge length for the copy,
    # copy to attach the new node to)
    frontier = [(leaf_y.get_edge_length(), leaf_y.get_parent(), leaf_y.get_edge_length(), subtree.root)]

    picked = [leaf_y]
    seen = {leaf_y}

    while len(picked) < n and frontier:
        dist, current, branch_len, attach_to = heapq.heappop(frontier)

        seen.add(current)
        mirror = treeswift.Node(label=current.get_label(), edge_length=branch_len)
        attach_to.add_child(mirror)
        if current.is_leaf():
            picked.append(current)

        # neighbours are the children followed by the parent (if any)
        neighbours = current.child_nodes()
        if not current.is_root():
            neighbours.append(current.get_parent())

        for nxt in neighbours:
            if nxt in seen:
                continue
            if nxt == current.get_parent():
                step = current.get_edge_length()
            else:
                step = nxt.get_edge_length()
            heapq.heappush(frontier, (dist + step, nxt, step, mirror))

    labels = [leaf.get_label() for leaf in picked]

    subtree.suppress_unifurcations()
    return labels, subtree
578
+
579
def extract_taxa_from_tree(a_tree, labels, leaf_dict):
    """ Remove the given leaves from a_tree in place and tidy up the result

    Parameters
    ----------
    a_tree : treeswift tree object to prune (modified in place)
    labels : iterable of leaf labels to remove
    leaf_dict : dict mapping leaf label -> node of a_tree

    Returns
    -------
    a_tree, after removing the leaves, removing every ancestor left without
    children, resolving polytomies and suppressing unifurcations

    """
    # A plain work list is enough: a parent is re-queued each time one of its
    # children is removed, so it is examined again after its last child goes
    # and the processing order does not affect the outcome.
    pending = [leaf_dict[label] for label in labels]

    while pending:
        node = pending.pop()
        # only childless nodes are detached
        if node.num_children() != 0:
            continue
        parent = node.get_parent()
        if parent is not None:
            parent.remove_child(node)
            pending.append(parent)

    a_tree.resolve_polytomies()
    a_tree.suppress_unifurcations()

    return a_tree
601
+
602
def min_tree_extract_disjoint(a_tree, max_size, tmp_leaves):
    """ Cut a subtree of up to max_size leaves around a random remaining leaf out of a_tree

    Parameters
    ----------
    a_tree : treeswift tree object; the chosen leaves are removed from it
    max_size : number of leaves requested for the extracted subtree
    tmp_leaves : dict mapping leaf label -> node for leaves not yet extracted;
                 the extracted labels are deleted from it

    Returns
    -------
    (t1, t2, tmp_leaves) : the pruned tree, the extracted subtree and the
                           updated leaf dictionary

    """
    seed_label = random.choice(list(tmp_leaves))
    labels, t2 = subtree_with_edge_length(a_tree, tmp_leaves[seed_label], max_size)

    t2 = t2.extract_tree_with(labels)
    t1 = extract_taxa_from_tree(a_tree, labels, tmp_leaves)
    for extracted in labels:
        del tmp_leaves[extracted]
    return t1, t2, tmp_leaves
610
+
611
+
612
def min_tree_extract_non_disjoint(a_tree, max_size, tmp_leaves):
    """ Extract a subtree of up to max_size leaves around a random remaining leaf, leaving a_tree intact

    Parameters
    ----------
    a_tree : treeswift tree object (leaves are not removed from it)
    max_size : number of leaves requested for the extracted subtree
    tmp_leaves : dict mapping leaf label -> node for leaves not yet covered;
                 labels covered by the new subtree are deleted from it

    Returns
    -------
    (a_tree, t2, tmp_leaves) : the original tree, the extracted subtree and
                               the updated leaf dictionary

    """
    seed_label = random.choice(list(tmp_leaves))
    labels, _ = subtree_with_edge_length(a_tree, tmp_leaves[seed_label], max_size)

    t2 = a_tree.extract_tree_with(labels)
    for covered in labels:
        # subtrees may overlap, so a label can already be gone
        tmp_leaves.pop(covered, None)
    return a_tree, t2, tmp_leaves
620
+
621
def decompose_tree(a_tree,max_size):
    """ Cover a_tree with (possibly overlapping) subtrees of up to max_size leaves

    Parameters
    ----------
    a_tree : treeswift tree object
    max_size : number of leaves requested for each subtree

    Returns
    -------
    list of subtrees; extraction repeats until every leaf with a non-empty
    label has appeared in some subtree. The number of subtrees is printed.

    """
    remaining = a_tree.label_to_node(selection='leaves')
    remaining.pop('', None)

    tree_list = []
    current = a_tree
    while True:
        current, piece, remaining = min_tree_extract_non_disjoint(current, max_size, remaining)
        tree_list.append(piece)
        if len(remaining) == 0:
            break

    print ("nbr subtrees: ", len(tree_list))
    return tree_list
636
+
637
def decompose_tree_min_clust(a_tree,max_size):
    """ Repeatedly bisect a_tree with min_cluster_size_bisect and collect the pieces

    Parameters
    ----------
    a_tree : treeswift tree object
    max_size : leaf-count threshold passed to min_cluster_size_bisect

    Returns
    -------
    list of the split-off subtrees followed by the remainder of the tree

    """
    pieces = []
    remainder, piece = min_cluster_size_bisect(a_tree, max_size)
    while piece is not None:
        pieces.append(piece)
        remainder, piece = min_cluster_size_bisect(remainder, max_size)
    # NOTE(review): the original author flagged this append as a possible bug
    pieces.append(remainder)
    return pieces
645
+
646
def min_cluster_size_bisect(a_tree,max_size):
    '''
    modified from PASTA to use treeswift

    Walk the tree in postorder counting leaves. At the first internal node
    whose leaf count reaches max_size, detach its largest child clade and
    return it as a separate tree.

    Parameters
    ----------
    a_tree : treeswift tree object (modified in place when a clade is detached)
    max_size : leaf-count threshold that triggers the split

    Returns
    -------
    (a_tree, t1) : the remaining tree and the detached clade with polytomies
                   resolved, or (a_tree, None) if no node reaches max_size.
                   Leaf counts before and after are printed.
    '''
    nleaf = dict()

    print("before extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))

    for node in a_tree.traverse_postorder():
        if node.is_leaf():
            # A leaf has no child clade to split off, so it can never
            # trigger the bisection.
            nleaf[node] = 1
            continue

        total = 0
        max_child = None
        max_nleaf = 0
        for ch in node.child_nodes():
            total += nleaf[ch]
            if nleaf[ch] > max_nleaf:
                max_nleaf = nleaf[ch]
                max_child = ch
        nleaf[node] = total

        if total >= max_size:
            node.remove_child(max_child)
            t1 = a_tree.extract_subtree(max_child)
            print("subtree size: " + str(len(t1.label_to_node(selection='leaves'))))
            print("after extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
            t1.resolve_polytomies()
            return a_tree,t1

    print("after extracting subtree: " + str(len(a_tree.label_to_node(selection='leaves'))))
    return a_tree,None
679
+
680
def centered_leaf(tree, cluster):
    """ Pick the leaf of a cluster with the smallest total distance to the other cluster leaves

    Parameters
    ----------
    tree : treeswift tree object containing the cluster leaves
    cluster : non-empty list of leaf labels

    Returns
    -------
    label of the most central leaf; distances are measured on the tree
    restricted to the cluster, and the first leaf wins ties. A single-leaf
    cluster returns that leaf.

    """
    if len(cluster) <= 1:
        return cluster[0]

    tmp_tree = tree.extract_tree_with(cluster)
    tmp_label_dict = tmp_tree.label_to_node(selection='leaves')

    # min() returns the first smallest entry and needs no magic "infinity"
    return min(cluster, key=lambda leaf: total_distance(tmp_tree, tmp_label_dict[leaf]))
695
+
696
def max_distance(tree, node):
    """ Return the largest distance from node to any leaf of tree (never below 0) """
    farthest = 0
    for tip in tree.traverse_leaves():
        span = tree.distance_between(node, tip)
        if not span < farthest:
            farthest = span
    return farthest
703
+
704
def total_distance(tree, node):
    """ Return the sum of the distances from node to every leaf of tree """
    running = 0
    for tip in tree.traverse_leaves():
        running = running + tree.distance_between(node, tip)
    return running
709
+
710
+
711
def avg_distance(tree, node):
    """ Return the mean distance from node to the leaves of tree

    Raises ZeroDivisionError if the tree yields no leaves.
    """
    running = 0
    n_tips = 0
    for n_tips, tip in enumerate(tree.traverse_leaves(), start=1):
        running = running + tree.distance_between(node, tip)
    return running/n_tips
718
+
719
def median_distance(tree, node):
    """ Return the median of the distances from node to the leaves of tree """
    spans = [tree.distance_between(node, tip) for tip in tree.traverse_leaves()]
    return statistics.median(spans)
729
+
730
def add_edge_nbrs(tree):
    """ Append '%%<k>' to every node label, k being the node's 1-based postorder position

    Unlabeled nodes receive just '%%<k>'. The tree is modified in place.
    """
    for position, node in enumerate(tree.traverse_postorder(), start=1):
        current = node.get_label()
        if current is None:
            tagged = '%%{}'.format(position)
        else:
            tagged = '{}%%{}'.format(current, position)
        node.set_label(tagged)
740
+
741
def remove_edge_nbrs(tree):
    """ Strip the '%%<k>' suffix added by add_edge_nbrs from every node label

    Labels that end up empty are reset to None; nodes without a label are
    left untouched. The tree is modified in place.
    """
    for node in tree.traverse_postorder():
        label = node.get_label()
        if label is None:
            # nothing to strip (e.g. a node that was never numbered)
            continue
        # The number is appended at the end, so split from the right: a label
        # that itself contains '%%' is then restored intact.
        base = label.rsplit('%%', 1)[0]
        node.set_label(None if base == '' else base)
749
+
750
+ '''
751
+ The following three functions are modified from treeswift to
752
+ read and write newick files with jplace tokens
753
+ '''
754
def newick_edge_tokens(tree):
    '''
    Modified from treeswift tree.newick()
    Output this ``Tree`` as a Newick string with ``{edge number}`` tokens.
    Node labels are expected to carry a ``%%<edge number>`` suffix.

    The root label is written once, by ``newick_edge_tokens_node``; the
    suffix built here only adds the root's edge length and edge number
    (previously the root label was repeated in the suffix).

    Returns:
        ``str``: Newick string of this ``Tree``, prefixed with ``[&R] `` when the tree is rooted
    '''
    root = tree.root
    body = newick_edge_tokens_node(root)

    length = root.edge_length
    if length is None:
        suffix = ';'
    else:
        # the edge number is appended last, so split from the right
        edge_nbr = int(root.get_label().rsplit('%%', 1)[1])
        if isinstance(length, int) or (isinstance(length, float) and length.is_integer()):
            suffix = ':%d{%d};' % (length, edge_nbr)
        else:
            suffix = ':%s{%d};' % (str(length), edge_nbr)

    if tree.is_rooted:
        return '[&R] %s%s' % (body, suffix)
    return '%s%s' % (body, suffix)
776
+
777
def newick_edge_tokens_node(node):
    '''
    Modified from treeswift node.newick()
    Newick string conversion starting at this ``Node`` object. Every label in
    the subtree must have the form ``<label>%%<edge number>``; each child
    edge that has a length is written as ``:<length>{<edge number>}``.

    Returns:
        ``str``: Newick string conversion starting at this ``Node`` object
        (without the edge of ``node`` itself and without a trailing ``;``)
    '''
    node_to_str = dict()
    # A separate loop variable keeps the ``node`` argument intact for the
    # final lookup.
    for cur in node.traverse_postorder():
        # the edge number is appended last, so split from the right
        label, _ = cur.get_label().rsplit('%%', 1)
        if cur.is_leaf():
            node_to_str[cur] = label
            continue

        out = ['(']
        for c in cur.children:
            edge_nbr_c = c.get_label().rsplit('%%', 1)[1]
            # a child's string is needed only once; drop it after use
            out.append(node_to_str.pop(c))
            if c.edge_length is not None:
                if isinstance(c.edge_length, float) and c.edge_length.is_integer():
                    l_str = str(int(c.edge_length))
                else:
                    l_str = str(c.edge_length)
                out.append(':%s{%d}' % (l_str, int(edge_nbr_c)))
            out.append(',')
        out.pop() # trailing comma
        out.append(')')
        out.append(label)
        node_to_str[cur] = ''.join(out)
    return node_to_str[node]
816
+
817
+ #def write_tree_newick_edge_tokens(tree, filename, hide_rooted_prefix=False):
818
+ # '''
819
+ # Modified from treeswift tree.write_tree_newick()
820
+ # Write this ``Tree`` to a Newick file
821
+ # Args:
822
+ # ``filename`` (``str``): Path to desired output file (plain-text or gzipped)
823
+ # '''
824
+ # if not isinstance(filename, str):
825
+ # raise TypeError("filename must be a str")
826
+ # treestr = newick_edge_nbr_string(tree)
827
+ # if hide_rooted_prefix:
828
+ # if treestr.startswith('[&R]'):
829
+ # treestr = treestr[4:].strip()
830
+ # else:
831
+ # warn("Specified hide_rooted_prefix, but tree was not rooted")
832
+ # if filename.lower().endswith('.gz'): # gzipped file
833
+ # f = gopen(expanduser(filename),'wb',9); f.write(treestr.encode()); f.close()
834
+ # else: # plain-text file
835
+ # f = open(expanduser(filename),'w'); f.write(treestr); f.close()
836
+
837
def read_tree_newick_edge_tokens(newick):
    '''
    Modified from treeswift.read_tree_newick(newick)
    Read a tree from a Newick string or file whose edges carry ``{token}`` annotations

    Args:
        ``newick`` (``str``): Either a Newick string or the path to a Newick file (plain-text or gzipped)

    Returns:
        ``(Tree, dict)``: The tree represented by ``newick`` and a dictionary mapping the
        text inside each ``{...}`` token to the node being parsed when the token was read.
        If the input does not consist of exactly one line, a ``list`` with one such
        pair per line is returned (empty for empty input).

    Raises:
        ``TypeError``: if ``newick`` cannot be converted to ``str``
        ``RuntimeError``: if the text cannot be parsed as Newick
    '''
    place_edge_dict = dict()
    if not isinstance(newick, str):
        try:
            newick = str(newick)
        except Exception as e:
            raise TypeError("newick must be a str") from e
    if newick.lower().endswith('.gz'): # gzipped file
        with gzip.open(expanduser(newick)) as f:
            ts = f.read().decode().strip()
    elif isfile(expanduser(newick)): # plain-text file
        with open(expanduser(newick)) as f:
            ts = f.read().strip()
    else:
        ts = newick.strip()
    lines = ts.splitlines()
    if len(lines) != 1:
        return [read_tree_newick_edge_tokens(l) for l in lines]
    try:
        t = treeswift.Tree()
        t.is_rooted = ts.startswith('[&R]')
        if ts[0] == '[':
            # drop the leading [...] comment (e.g. the rooting flag)
            ts = ']'.join(ts.split(']')[1:]).strip()
            ts = ts.replace(', ',',')
        n = t.root
        i = 0
        while i < len(ts):
            # end of Newick string
            if ts[i] == ';':
                if i != len(ts)-1 or n != t.root:
                    raise RuntimeError("INVALID NEWICK")

            # go to new child
            elif ts[i] == '(':
                c = treeswift.Node()
                n.add_child(c)
                n = c

            # go to parent
            elif ts[i] == ')':
                n = n.parent

            # go to new sibling
            elif ts[i] == ',':
                n = n.parent
                c = treeswift.Node()
                n.add_child(c)
                n = c

            # edge length
            elif ts[i] == ':':
                i += 1
                ls = ''
                while ts[i] != ',' and ts[i] != ')' and ts[i] != ';' and ts[i] != '{':
                    ls += ts[i]
                    i += 1
                if ls[0] == '[':
                    n.edge_params = ']'.join(ls.split(']')[:-1])
                    ls = ls.split(']')[-1]
                n.edge_length = float(ls)
                i -= 1

            # edge token
            elif ts[i] == '{':
                i += 1
                ls = ''
                while ts[i] != '}':
                    ls += ts[i]
                    i += 1
                place_edge_dict[ls] = n

            # node label
            else:
                label = ''
                bracket = None
                while bracket is not None or ts[i] in BRACKET or (ts[i] != ':' and ts[i] != ',' and ts[i] != ';' and ts[i] != ')'):
                    if ts[i] in BRACKET and bracket is None:
                        bracket = ts[i]
                    elif bracket is not None and ts[i] == BRACKET[bracket]:
                        bracket = None
                    label += ts[i]
                    i += 1
                i -= 1
                n.label = label
            i += 1
    except Exception as e:
        # keep the underlying error attached for debugging
        raise RuntimeError("Failed to parse string as Newick") from e
    return t, place_edge_dict