phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import copy
|
|
3
|
+
from typing import List
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
import os
|
|
6
|
+
import hashlib
|
|
7
|
+
|
|
8
|
+
from Bio import Phylo
|
|
9
|
+
|
|
10
|
+
from ..base import BaseService
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Tree(BaseService):
|
|
14
|
+
def __init__(
    self,
    *args,
    tree_file_path=None,
    idmap=None,
    alignment_file_path=None,
    tree1_file_path=None,
    outgroup_taxa_file_path=None,
    output_file_path=None,
    factor=None,
    remove=None,
    verbose=None,
    reference=None,
    list_of_taxa=None,
    trees=None,
    groups=None,
    support=None,
    tip_1=None,
    tip_2=None,
    clade=None,
    keep=None,
    exclude_gaps=None,
):
    """Store the tree-command options on the instance.

    Each keyword argument is kept under an attribute of the same name.
    Positional ``*args`` are accepted but not used, and ``tree_format``
    is always set to ``"newick"``.
    """
    # fixed settings
    self.tree_format = "newick"

    # input and output locations
    self.tree_file_path = tree_file_path
    self.tree1_file_path = tree1_file_path
    self.reference = reference
    self.alignment_file_path = alignment_file_path
    self.outgroup_taxa_file_path = outgroup_taxa_file_path
    self.output_file_path = output_file_path
    self.idmap = idmap
    self.trees = trees

    # taxon / clade selections
    self.list_of_taxa = list_of_taxa
    self.groups = groups
    self.clade = clade
    self.tip_1 = tip_1
    self.tip_2 = tip_2

    # numeric parameters and switches
    self.factor = factor
    self.support = support
    self.remove = remove
    self.keep = keep
    self.exclude_gaps = exclude_gaps
    self.verbose = verbose
|
|
57
|
+
|
|
58
|
+
@staticmethod
@lru_cache(maxsize=32)
def _cached_tree_read(file_path: str, tree_format: str, file_hash: str):
    """Parse a tree file, memoized on (file_path, tree_format, file_hash).

    ``file_hash`` is not used in the body. It only takes part in the
    ``lru_cache`` key, so a different hash for the same path makes the
    file get parsed again instead of being served from the cache (at most
    32 entries are kept).
    """
    parsed_tree = Phylo.read(file_path, tree_format)
    return parsed_tree
|
|
63
|
+
|
|
64
|
+
@staticmethod
def _get_file_hash(file_path: str) -> str:
    """Return an MD5 hex digest built from path, size and modification time.

    The digest is a cheap change-detection key (it does not read the file
    contents). An empty string is returned when the key cannot be built,
    e.g. the file does not exist or ``file_path`` is not a usable path.
    """
    try:
        stat = os.stat(file_path)
        cache_key = f"{file_path}_{stat.st_size}_{stat.st_mtime}"
        return hashlib.md5(cache_key.encode()).hexdigest()
    except (OSError, TypeError, ValueError):
        # Only data-related failures are treated as "no hash"; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return ""
|
|
73
|
+
|
|
74
|
+
def read_tree_file(self):
    """Return a private copy of the tree stored at ``self.tree_file_path``.

    The parsed tree comes from the memoized reader; a deep copy is handed
    back so callers can modify it without touching the cached object.
    When the file does not exist, a message is printed and the process
    exits with status 2.
    """
    path = self.tree_file_path
    try:
        digest = self._get_file_hash(path)
        cached = self._cached_tree_read(path, self.tree_format, digest)
        private_copy = copy.deepcopy(cached)
    except FileNotFoundError:
        print(f"{self.tree_file_path} corresponds to no such file or directory.")
        print("Please check filename and pathing")
        sys.exit(2)
    return private_copy
|
|
84
|
+
|
|
85
|
+
def read_tree1_file(self):
    """Return a private copy of the tree stored at ``self.tree1_file_path``.

    The parsed tree comes from the memoized reader; a deep copy is handed
    back so callers can modify it without touching the cached object.
    When the file does not exist, a message is printed and the process
    exits with status 2.
    """
    path = self.tree1_file_path
    try:
        digest = self._get_file_hash(path)
        cached = self._cached_tree_read(path, self.tree_format, digest)
        private_copy = copy.deepcopy(cached)
    except FileNotFoundError:
        print(f"{self.tree1_file_path} corresponds to no such file or directory.")
        print("Please check filename and pathing")
        sys.exit(2)
    return private_copy
|
|
95
|
+
|
|
96
|
+
def read_reference_tree_file(self):
    """Return a private copy of the tree stored at ``self.reference``.

    The parsed tree comes from the memoized reader; a deep copy is handed
    back so callers can modify it without touching the cached object.
    When the file does not exist, a message is printed and the process
    exits with status 2.
    """
    path = self.reference
    try:
        digest = self._get_file_hash(path)
        cached = self._cached_tree_read(path, self.tree_format, digest)
        private_copy = copy.deepcopy(cached)
    except FileNotFoundError:
        print(f"{self.reference} corresponds to no such file or directory.")
        print("Please check filename and pathing")
        sys.exit(2)
    return private_copy
|
|
106
|
+
|
|
107
|
+
def write_tree_file(self, tree, output_file_path):
    """Write ``tree`` to ``output_file_path`` in ``self.tree_format``.

    Returns whatever ``Phylo.write`` returns.
    """
    write_result = Phylo.write(tree, output_file_path, self.tree_format)
    return write_result
|
|
109
|
+
|
|
110
|
+
def get_tip_names_from_tree(self, tree) -> list:
    """Return the ``name`` of every terminal of ``tree``, in traversal order."""
    names = []
    for terminal in tree.get_terminals():
        names.append(terminal.name)
    return names
|
|
116
|
+
|
|
117
|
+
def shared_tips(self, a, b):
    """Return the tip names present in both ``a`` and ``b``.

    Args:
        a: tip names from one tree.
        b: tip names from a second tree.

    Returns:
        A list of the shared names (set order, i.e. not sorted).

    When the two collections have nothing in common, ``"no common tips"``
    is printed and the process exits with status 2.
    """
    # Compute the intersection once and reuse it for the test and result.
    common = set(a) & set(b)
    if not common:
        print("no common tips")
        sys.exit(2)
    return list(common)
|
|
136
|
+
|
|
137
|
+
def prune_tree_using_taxa_list(self, tree, taxa_to_prune: list):
    """Prune each taxon in ``taxa_to_prune`` from ``tree`` and return ``tree``.

    The tree is modified in place; the same object is returned.
    """
    pending = iter(taxa_to_prune)
    for name in pending:
        tree.prune(name)
    return tree
|
|
145
|
+
|
|
146
|
+
def calculate_treeness(self, tree=None, print_value=False):
    """Return treeness: summed internal branch length / total branch length.

    Args:
        tree: tree to evaluate; when ``None`` the tree is loaded with
            ``self.read_tree_file()``.
        print_value: when true, the value is also printed.

    Returns:
        The treeness as a float, or ``None`` when the total branch length
        is zero (a message is printed in that case).
    """
    # Identity check: only a missing argument triggers a file read.
    if tree is None:
        tree = self.read_tree_file()

    # Internal nodes without a branch length are left out of the sum.
    internal_len = sum(
        (
            node.branch_length
            for node in tree.get_nonterminals()
            if node.branch_length is not None
        ),
        0.0,
    )
    total_len = tree.total_branch_length()

    try:
        treeness = float(internal_len / total_len)
    except ZeroDivisionError:
        try:
            print("Invalid tree. Tree should contain branch lengths")
        except BrokenPipeError:
            pass
        return None

    if print_value:
        try:
            print(f"{treeness}")
        except BrokenPipeError:
            # A closed stdout must not discard the computed value.
            pass
    return treeness
|
|
173
|
+
|
|
174
|
+
def get_gap_chars(is_protein: bool) -> List[str]:
    """Return the characters treated as gaps or ambiguous.

    The list always holds ``-``, ``?``, ``*``, ``X`` and ``x``; ``N`` and
    ``n`` are added when ``is_protein`` is false.
    """
    gap_chars = ["-", "?", "*", "X", "x"]
    if not is_protein:
        gap_chars += ["N", "n"]
    return gap_chars
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple
|
|
2
|
+
|
|
3
|
+
from Bio.Phylo import Newick
|
|
4
|
+
|
|
5
|
+
from .base import Tree
|
|
6
|
+
from ...helpers.stats_summary import (
|
|
7
|
+
calculate_summary_statistics_from_arr,
|
|
8
|
+
print_summary_statistics,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BipartitionSupportStats(Tree):
|
|
13
|
+
def __init__(self, args) -> None:
    """Initialize the parent with the keyword arguments built by ``process_args``."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
15
|
+
|
|
16
|
+
def run(self) -> None:
    """Report bipartition support values for the input tree.

    In verbose mode one line is printed per supported internal node: the
    support value followed by the node's tip names joined with ``;``.
    Otherwise summary statistics of the support values are printed.
    """
    tree = self.read_tree_file()
    bs_vals, term_names = self.get_bipartition_support_vals(tree)

    if self.verbose:
        try:
            # Walk the two parallel lists together instead of by index.
            for bs_val, names in zip(bs_vals, term_names):
                print(bs_val, ";".join(names))
        except BrokenPipeError:
            pass
    else:
        stats = calculate_summary_statistics_from_arr(bs_vals)
        print_summary_statistics(stats)
|
|
29
|
+
|
|
30
|
+
def process_args(self, args) -> Dict[str, str]:
    """Map parsed arguments to initializer keywords (tree path and verbosity)."""
    return {
        "tree_file_path": args.tree,
        "verbose": args.verbose,
    }
|
|
32
|
+
|
|
33
|
+
def get_bipartition_support_vals(
    self,
    tree: Newick.Tree,
) -> Tuple[List[float], List[List[str]]]:
    """Collect support values and tip names of supported internal nodes.

    Internal nodes whose ``confidence`` is ``None`` are skipped. The two
    returned lists are parallel: entry ``i`` of the second list holds the
    terminal names below the node whose support is entry ``i`` of the
    first list.
    """
    support_values = []
    clade_tips = []

    for clade in tree.get_nonterminals():
        support = clade.confidence
        if support is None:
            continue
        support_values.append(support)
        clade_tips.append([tip.name for tip in clade.get_terminals()])

    return support_values, clade_tips
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo import Newick
|
|
5
|
+
|
|
6
|
+
from .base import Tree
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BranchLengthMultiplier(Tree):
|
|
10
|
+
def __init__(self, args) -> None:
    """Initialize the parent with the keyword arguments built by ``process_args``."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
12
|
+
|
|
13
|
+
def run(self) -> None:
    """Scale every branch length by ``self.factor`` and write the tree.

    The result is written to ``self.output_file_path``.
    """
    # read_tree_file() already hands back a private deep copy of the
    # cached tree, so it can be modified directly; copying it a second
    # time only costs time and memory.
    tree = self.read_tree_file()
    self.multiply_branch_lengths_by_factor(tree, self.factor)
    self.write_tree_file(tree, self.output_file_path)
|
|
19
|
+
|
|
20
|
+
def process_args(self, args) -> Dict[str, str]:
    """Map parsed arguments to initializer keywords.

    When ``args.output`` is empty or missing, the output path defaults to
    the input tree path with a ``.factor_<factor>.tre`` suffix.
    """
    destination = args.output
    if not destination:
        destination = f"{args.tree}.factor_{args.factor}.tre"
    return {
        "tree_file_path": args.tree,
        "factor": args.factor,
        "output_file_path": destination,
    }
|
|
28
|
+
|
|
29
|
+
def multiply_branch_lengths_by_factor(
    self,
    tree: Newick.Tree,
    factor: float,
) -> Newick.Tree:
    """Multiply every defined branch length of ``tree`` by ``factor``.

    Nodes whose ``branch_length`` is ``None`` are left untouched. The tree
    is modified in place and the same object is returned.
    """
    internal_nodes = tree.get_nonterminals()
    tip_nodes = tree.get_terminals()
    for group in (internal_nodes, tip_nodes):
        for node in group:
            if node.branch_length is None:
                continue
            node.branch_length *= factor
    return tree
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
from .base import Tree
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CollapseBranches(Tree):
|
|
8
|
+
def __init__(self, args) -> None:
    """Initialize the parent with the keyword arguments built by ``process_args``."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
10
|
+
|
|
11
|
+
def run(self):
    """Collapse branches whose support is below ``self.support`` and write the tree.

    Clades without a support value (``confidence is None``) are kept.
    The result is written to ``self.output_file_path``.
    """
    # read_tree_file() already returns a private deep copy of the cached
    # tree, so it can be modified directly.
    tree = self.read_tree_file()
    tree.collapse_all(
        # Compare against None explicitly: a support value of 0 is falsy,
        # and a plain truthiness test would never collapse such branches.
        lambda c: c.confidence is not None and c.confidence < self.support
    )
    self.write_tree_file(tree, self.output_file_path)
|
|
19
|
+
|
|
20
|
+
def process_args(self, args) -> Dict[str, str]:
    """Map parsed arguments to initializer keywords.

    When ``args.output`` is empty or missing, the output path defaults to
    the input tree path with a ``.collapsed_<support>.tre`` suffix.
    """
    destination = args.output
    if not destination:
        destination = f"{args.tree}.collapsed_{args.support}.tre"
    return {
        "tree_file_path": args.tree,
        "support": args.support,
        "output_file_path": destination,
    }
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import numpy as np
|
|
3
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
import pickle
|
|
6
|
+
|
|
7
|
+
from scipy.stats import pearsonr, zscore
|
|
8
|
+
|
|
9
|
+
from .base import Tree
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CovaryingEvolutionaryRates(Tree):
|
|
13
|
+
def __init__(self, args) -> None:
    """Initialize the parent with the keyword arguments built by ``process_args``."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
15
|
+
|
|
16
|
+
def run(self):
    """Correlate reference-corrected branch lengths of two gene trees.

    Steps: load the two gene trees and the reference tree, prune all three
    to the tips the two gene trees share, divide gene-tree branch lengths
    by the matching reference branch length, drop outliers, z-score both
    series and compute their Pearson correlation.

    Verbose mode prints one line per retained branch (both standardized
    values and the tip names); otherwise the correlation coefficient and
    p-value are printed.
    """
    gene_tree_a = self.read_tree_file()
    gene_tree_b = self.read_tree1_file()
    species_tree = self.read_reference_tree_file()

    # tip names of every tree
    tips_a = self.get_tip_names_from_tree(gene_tree_a)
    tips_b = self.get_tip_names_from_tree(gene_tree_b)
    tips_sp = self.get_tip_names_from_tree(species_tree)

    # everything outside the tips shared by the two gene trees is pruned
    common = set(self.shared_tips(tips_a, tips_b))
    drop_a = list(set(tips_a) - common)
    drop_b = list(set(tips_b) - common)
    drop_sp = list(set(tips_sp) - common)

    gene_tree_a = self.prune_tips(gene_tree_a, drop_a)
    gene_tree_b = self.prune_tips(gene_tree_b, drop_b)
    species_tree = self.prune_tips(species_tree, drop_sp)

    # gene-tree branch lengths relative to the reference tree
    rates_a, rates_b, tip_names = self.correct_branch_lengths(
        gene_tree_a, gene_tree_b, species_tree
    )

    # a branch flagged as an outlier in either series is dropped from all
    flagged = []
    for series in (rates_a, rates_b):
        flagged = self.get_indices_of_outlier_branch_lengths(series, flagged)

    rates_a = self.remove_outliers_based_on_indices(rates_a, flagged)
    rates_b = self.remove_outliers_based_on_indices(rates_b, flagged)
    tip_names = self.remove_outliers_based_on_indices(tip_names, flagged)

    # standardize, then correlate; corr holds [coefficient, p-value]
    rates_a = zscore(rates_a)
    rates_b = zscore(rates_b)
    corr = list(pearsonr(rates_a, rates_b))

    try:
        if not self.verbose:
            print(f"{round(corr[0], 4)}\t{round(corr[1], 6)}")
        else:
            for val_zero, val_one, tip_name in zip(rates_a, rates_b, tip_names):
                print(
                    f"{round(val_zero, 4)}\t{round(val_one, 4)}\t{';'.join(tip_name)}"
                )
    except BrokenPipeError:
        pass
|
|
93
|
+
|
|
94
|
+
def process_args(self, args):
    """Map parsed arguments to initializer keywords.

    ``tree_zero`` and ``tree_one`` become the two tree paths; ``reference``
    and ``verbose`` are passed through unchanged.
    """
    return {
        "tree_file_path": args.tree_zero,
        "tree1_file_path": args.tree_one,
        "reference": args.reference,
        "verbose": args.verbose,
    }
|
|
101
|
+
|
|
102
|
+
def get_indices_of_outlier_branch_lengths(
    self, corr_branch_lengths, outlier_indices
):
    """Return indices of outlier values merged with ``outlier_indices``.

    A value is an outlier when its absolute value is greater than 5 or
    when it is NaN after conversion to float.

    Args:
        corr_branch_lengths: sequence of corrected branch lengths.
        outlier_indices: indices already flagged; they are kept.

    Returns:
        A sorted list of unique indices, so the result does not depend on
        set iteration order.
    """
    arr = np.array(corr_branch_lengths, dtype=float)
    new_outliers = np.where((np.abs(arr) > 5) | np.isnan(arr))[0]

    all_outliers = set(outlier_indices)
    all_outliers.update(new_outliers.tolist())

    return sorted(all_outliers)
|
|
120
|
+
|
|
121
|
+
def remove_outliers_based_on_indices(self, corr_branch_lengths, outlier_indices):
    """Return ``corr_branch_lengths`` without the entries at ``outlier_indices``.

    Works for any sequence (numeric values or lists of tip names); the
    relative order of the kept entries is preserved. When
    ``outlier_indices`` is empty the input object is returned unchanged.
    Indices outside the sequence are ignored.
    """
    if not outlier_indices:
        return corr_branch_lengths

    # One membership set replaces the previous numpy mask and the two
    # identical per-type branches.
    dropped = set(outlier_indices)
    return [
        item for i, item in enumerate(corr_branch_lengths) if i not in dropped
    ]
|
|
139
|
+
|
|
140
|
+
def prune_tips(self, tree, tips):
    """Prune every tip in ``tips`` from ``tree`` and return ``tree``.

    The tree is modified in place; the same object is returned.
    """
    remaining = iter(tips)
    for tip_name in remaining:
        tree.prune(tip_name)
    return tree
|
|
149
|
+
|
|
150
|
+
@staticmethod
def _process_terminal_batch(tree0_pickle, tree1_pickle, terminals_data):
    """Compute corrected terminal branch lengths for one batch.

    Args:
        tree0_pickle: pickled first gene tree.
        tree1_pickle: pickled second gene tree.
        terminals_data: iterable of ``(terminal_name, terminal_bl, sp_tips)``
            tuples taken from the reference tree.

    Returns:
        A list of ``(bl0, bl1, sp_tips)`` tuples, where ``bl0``/``bl1`` are
        the gene-tree branch lengths divided by ``terminal_bl`` and rounded
        to 6 decimals. Terminals for which either gene tree has no (or a
        zero) branch length, or whose lookup fails, are skipped.
    """
    # NOTE: pickle.loads must only ever receive bytes produced by this
    # program itself; never feed it external data.
    t0 = pickle.loads(tree0_pickle)
    t1 = pickle.loads(tree1_pickle)

    results = []
    for terminal_name, terminal_bl, sp_tips in terminals_data:
        try:
            gene_bl0 = t0.common_ancestor(terminal_name).branch_length
            gene_bl1 = t1.common_ancestor(terminal_name).branch_length
            if not gene_bl0 or not gene_bl1:
                continue
            results.append(
                (
                    round(gene_bl0 / terminal_bl, 6),
                    round(gene_bl1 / terminal_bl, 6),
                    sp_tips,
                )
            )
        except (ValueError, TypeError, ZeroDivisionError):
            # Best effort: skip entries with failed lookups or unusable
            # branch lengths, without hiding unrelated errors.
            continue
    return results
|
|
170
|
+
|
|
171
|
+
@staticmethod
def _process_nonterminal_batch(tree0_pickle, tree1_pickle, nonterminals_data):
    """Compute corrected internal branch lengths for one batch.

    Args:
        tree0_pickle: pickled first gene tree.
        tree1_pickle: pickled second gene tree.
        nonterminals_data: iterable of ``(sp_tips, nonterminal_bl)`` tuples
            taken from the reference tree.

    Returns:
        A list of ``(bl0, bl1, sp_tips)`` tuples, where ``bl0``/``bl1`` are
        the branch lengths of the common ancestor of ``sp_tips`` in each
        gene tree divided by ``nonterminal_bl`` and rounded to 6 decimals.
        Entries with a missing or zero branch length in any of the three
        trees, or whose lookup fails, are skipped.
    """
    # NOTE: pickle.loads must only ever receive bytes produced by this
    # program itself; never feed it external data.
    t0 = pickle.loads(tree0_pickle)
    t1 = pickle.loads(tree1_pickle)

    results = []
    for sp_tips, nonterminal_bl in nonterminals_data:
        try:
            gene_bl0 = t0.common_ancestor(sp_tips).branch_length
            gene_bl1 = t1.common_ancestor(sp_tips).branch_length
            if not (gene_bl0 and gene_bl1 and nonterminal_bl):
                continue
            results.append(
                (
                    round(gene_bl0 / nonterminal_bl, 6),
                    round(gene_bl1 / nonterminal_bl, 6),
                    sp_tips,
                )
            )
        except (ValueError, TypeError, ZeroDivisionError):
            # Best effort: skip entries with failed lookups or unusable
            # branch lengths, without hiding unrelated errors.
            continue
    return results
|
|
190
|
+
|
|
191
|
+
def correct_branch_lengths(self, t0, t1, sp):
    """Return gene-tree branch lengths divided by reference branch lengths.

    For every terminal and internal node of the reference tree ``sp`` the
    matching branch is looked up in the gene trees ``t0`` and ``t1`` and
    its length is divided by the reference branch length (rounded to 6
    decimals). A node contributes only when all three branch lengths are
    present and non-zero and both lookups succeed.

    Reference trees with fewer than 50 nodes are processed in this
    process; larger ones are split into batches for a process pool.

    Returns:
        ``(l0, l1, tip_names)`` - three lists of equal length; entry ``i``
        of ``tip_names`` holds the reference tip names below the branch
        whose corrected lengths are ``l0[i]`` and ``l1[i]``.
    """
    l0 = []
    l1 = []
    tip_names = []

    terminals = sp.get_terminals()
    nonterminals = sp.get_nonterminals()

    if len(terminals) + len(nonterminals) < 50:
        # Small input: process sequentially. Terminals are looked up by
        # name, internal nodes by the list of tips below them.
        clades = [(clade, True) for clade in terminals]
        clades += [(clade, False) for clade in nonterminals]
        for clade, is_terminal in clades:
            sp_tips = self.get_tip_names_from_tree(clade)
            species_bl = clade.branch_length
            if not species_bl:
                continue
            target = clade.name if is_terminal else sp_tips
            try:
                gene_bl0 = t0.common_ancestor(target).branch_length
                gene_bl1 = t1.common_ancestor(target).branch_length
                if not gene_bl0 or not gene_bl1:
                    continue
                corrected0 = round(gene_bl0 / species_bl, 6)
                corrected1 = round(gene_bl1 / species_bl, 6)
            except (ValueError, TypeError, ZeroDivisionError):
                # Best effort: skip branches that cannot be resolved.
                continue
            # Append to all three lists together so they stay aligned.
            l0.append(corrected0)
            l1.append(corrected1)
            tip_names.append(sp_tips)
        return (l0, l1, tip_names)

    # Large input: ship pickled gene trees to worker processes.
    tree0_pickle = pickle.dumps(t0)
    tree1_pickle = pickle.dumps(t1)

    terminals_data = [
        (i.name, i.branch_length, self.get_tip_names_from_tree(i))
        for i in terminals
        if i.branch_length
    ]
    nonterminals_data = [
        (self.get_tip_names_from_tree(i), i.branch_length)
        for i in nonterminals
        if i.branch_length
    ]

    total = len(terminals_data) + len(nonterminals_data)
    if total == 0:
        # Nothing to correct (reference tree without branch lengths);
        # also avoids creating a pool with zero workers.
        return (l0, l1, tip_names)

    batch_size = max(10, total // 4)
    # One worker per ~10 items, at least 1 and at most 4.
    max_workers = max(1, min(4, total // 10))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []

        for start in range(0, len(terminals_data), batch_size):
            batch = terminals_data[start:start + batch_size]
            futures.append(
                executor.submit(
                    self._process_terminal_batch, tree0_pickle, tree1_pickle, batch
                )
            )

        for start in range(0, len(nonterminals_data), batch_size):
            batch = nonterminals_data[start:start + batch_size]
            futures.append(
                executor.submit(
                    self._process_nonterminal_batch, tree0_pickle, tree1_pickle, batch
                )
            )

        # Collect in submission order so the output order is reproducible.
        for future in futures:
            for bl0, bl1, sp_tips in future.result():
                l0.append(bl0)
                l1.append(bl1)
                tip_names.append(sp_tips)

    return (l0, l1, tip_names)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from typing import Dict
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from Bio.Phylo import Newick
|
|
6
|
+
|
|
7
|
+
from .base import Tree
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DVMC(Tree):
|
|
11
|
+
def __init__(self, args) -> None:
    """Initialize the parent with the keyword arguments built by ``process_args``."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
13
|
+
|
|
14
|
+
def run(self):
    """Print the DVMC value of the input tree, rounded to 4 decimals."""
    dvmc_value = self.determine_dvmc(self.read_tree_file())
    print(round(dvmc_value, 4))
|
|
18
|
+
|
|
19
|
+
def process_args(self, args) -> Dict[str, str]:
    """Map parsed arguments to initializer keywords (only the tree path)."""
    return {"tree_file_path": args.tree}
|
|
21
|
+
|
|
22
|
+
def determine_dvmc(self, tree: Newick.Tree) -> float:
    """Return the sample standard deviation of the tip distances.

    The distance of every terminal is taken from ``tree.distance`` and the
    standard deviation is computed with ``n - 1`` in the denominator.
    With fewer than two terminals the result is NaN.
    """
    distances = np.array([tree.distance(term) for term in tree.get_terminals()])

    # np.std subtracts the mean before squaring. The former one-pass form
    # (sum of squares minus n * mean**2) cancels catastrophically and can
    # turn slightly negative, which made the square root NaN for trees
    # whose tip distances are (nearly) equal.
    return np.std(distances, ddof=1)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from .base import Tree
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EvolutionaryRate(Tree):
|
|
7
|
+
def __init__(self, args) -> None:
    """Initialize the parent with the keyword arguments built by ``process_args``."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
9
|
+
|
|
10
|
+
def run(self) -> None:
    """Print total branch length divided by the number of terminals.

    The value is rounded to 4 decimals.
    """
    tree = self.read_tree_file()
    rate = tree.total_branch_length() / tree.count_terminals()
    print(round(rate, 4))
|
|
15
|
+
|
|
16
|
+
def process_args(self, args) -> Dict[str, str]:
    """Map parsed arguments to initializer keywords (only the tree path)."""
    return {"tree_file_path": args.tree}
|