phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict, List, Union
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
import multiprocessing as mp
|
|
5
|
+
from functools import partial
|
|
6
|
+
|
|
7
|
+
from Bio import Phylo
|
|
8
|
+
|
|
9
|
+
from .base import Tree
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HiddenParalogyCheck(Tree):
    """Report, for each clade in a clade file, whether it is monophyletic.

    Each line of the clade file is a whitespace-separated list of tip names.
    One status line is printed per clade:

    * ``insufficient_taxon_representation`` - at most one of the listed
      names is present in the tree;
    * ``monophyletic`` / ``not_monophyletic`` - outcome of the check;
    * ``processing_error`` - rooting or ancestor lookup raised
      ``ValueError`` or ``AttributeError``.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    @staticmethod
    def _classify_clade(clade, tree_file_path, master_tree_tips):
        """Classify a single clade against the tree stored at ``tree_file_path``.

        Parameters
        ----------
        clade:
            Iterable of tip names making up the clade of interest.
        tree_file_path:
            Path to the Newick tree; it is re-read here because rooting
            modifies the tree in place.
        master_tree_tips:
            Set-like collection of every tip name in the tree.

        Returns
        -------
        list
            ``[status]`` for the insufficient-representation and error
            cases, otherwise ``[status, differing_tips]`` where
            ``differing_tips`` is the symmetric difference between the
            clade and the tips under its common ancestor.
        """
        clade_of_interest = set(clade).intersection(master_tree_tips)
        if len(clade_of_interest) <= 1:
            return ["insufficient_taxon_representation"]

        # Only parse the tree once we know the clade is worth checking.
        tree = Phylo.read(tree_file_path, "newick")
        outgroup = list(master_tree_tips - clade_of_interest)

        try:
            tree.root_with_outgroup(outgroup)
            subtree = tree.common_ancestor(clade_of_interest)
            common_ancestor_tips = {tip.name for tip in subtree.get_terminals()}
        except (ValueError, AttributeError):
            # Rooting / ancestor lookup failed for this clade; report it and
            # let the remaining clades be processed.
            return ["processing_error"]

        differing_tips = clade_of_interest.symmetric_difference(common_ancestor_tips)
        status = "not_monophyletic" if differing_tips else "monophyletic"
        return [status, list(differing_tips)]

    @staticmethod
    def _process_clade_batch(clade_batch, tree_file_path, master_tree_tips):
        """Classify every clade in ``clade_batch``, preserving input order.

        Used both directly (small inputs) and as the worker function for the
        multiprocessing pool (large inputs), so that both paths produce the
        same result for the same clade.
        """
        return [
            HiddenParalogyCheck._classify_clade(
                clade, tree_file_path, master_tree_tips
            )
            for clade in clade_batch
        ]

    def run(self) -> None:
        """Read the tree and clade file, classify each clade, print results."""
        # Read the master tree once to get all tip names.
        master_tree = self.read_tree_file()
        master_tree_tips = frozenset(self.get_tip_names_from_tree(master_tree))

        clades = self.read_clades_file(self.clade)

        if len(clades) < 10:
            # Small input: process in this process, through the same code
            # path as the workers so error handling is identical.
            res_arr = self._process_clade_batch(
                clades, self.tree_file_path, master_tree_tips
            )
        else:
            # Larger input: split the clades into contiguous batches and
            # classify the batches in a process pool.
            num_workers = min(mp.cpu_count(), 8)
            batch_size = max(1, len(clades) // num_workers)
            clade_batches = [
                clades[start:start + batch_size]
                for start in range(0, len(clades), batch_size)
            ]

            process_func = partial(
                self._process_clade_batch,
                tree_file_path=self.tree_file_path,
                master_tree_tips=master_tree_tips,
            )

            with mp.Pool(processes=num_workers) as pool:
                batch_results = pool.map(process_func, clade_batches)

            # pool.map keeps batch order, so flattening keeps clade order.
            res_arr = [
                result
                for batch_result in batch_results
                for result in batch_result
            ]

        self.print_results(res_arr)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments to the keyword arguments of ``Tree``."""
        return dict(
            tree_file_path=args.tree,
            clade=args.clade,
        )

    def read_clades_file(self, clades: str) -> List[List[str]]:
        """Return one list of tip names per line of the file at ``clades``.

        Prints a message and exits with status 2 if the file does not exist.
        """
        try:
            with open(clades, 'r') as file:
                return [line.split() for line in file]
        except FileNotFoundError:
            print("Clade file not found. Please check the path.")
            sys.exit(2)

    def print_results(self, res_arr: List[List[Union[List, str]]]) -> None:
        """Print the status string (first element) of every result row."""
        for res in res_arr:
            print(res[0])
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import (
|
|
3
|
+
Dict,
|
|
4
|
+
List,
|
|
5
|
+
Tuple,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
from Bio.Phylo import Newick
|
|
9
|
+
|
|
10
|
+
from .base import Tree
|
|
11
|
+
|
|
12
|
+
from ...helpers.stats_summary import calculate_summary_statistics_from_arr, print_summary_statistics
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class InternalBranchStats(Tree):
    """Report statistics on the internal branch lengths of a tree.

    In verbose mode every internal branch is printed together with the
    names of the tips below it; otherwise summary statistics are printed.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Compute the internal branch statistics and print them."""
        summary, per_branch = self.calculate_internal_branch_stats(
            self.read_tree_file()
        )

        if not self.verbose:
            print_summary_statistics(summary)
            return

        try:
            for branch_length, tip_names in per_branch:
                print(round(branch_length, 4), ";".join(tip_names))
        except BrokenPipeError:
            # Output was piped to a consumer that closed early (e.g. head).
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments to the keyword arguments of ``Tree``."""
        return {"tree_file_path": args.tree, "verbose": args.verbose}

    def get_internal_branch_lengths(
        self,
        tree: Newick.Tree
    ) -> Tuple[
        List[float],
        List[Tuple[float, List[str]]]
    ]:
        """Collect the lengths of all internal branches that have one.

        Returns a pair: the list of branch lengths, and a parallel list of
        ``(branch_length, tip_names)`` tuples where ``tip_names`` are the
        names of the terminals under that internal node. Nodes whose
        ``branch_length`` is ``None`` are skipped.
        """
        per_branch = [
            (node.branch_length, [leaf.name for leaf in node.get_terminals()])
            for node in tree.get_nonterminals()
            if node.branch_length is not None
        ]
        lengths = [branch_length for branch_length, _ in per_branch]

        return lengths, per_branch

    def calculate_internal_branch_stats(
        self,
        tree: Newick.Tree
    ) -> Tuple[
        Dict[str, float],
        List[Tuple[float, List[str]]],
    ]:
        """Return ``(summary_statistics, lengths_and_names)`` for ``tree``.

        Prints a message and exits with status 2 when the tree has no
        internal branch lengths at all.
        """
        lengths, per_branch = self.get_internal_branch_lengths(tree)

        if len(lengths) == 0:
            print("Calculating internal branch statistics requires a phylogeny with branch lengths.")
            sys.exit(2)

        return calculate_summary_statistics_from_arr(lengths), per_branch
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo import Newick
|
|
5
|
+
|
|
6
|
+
from .base import Tree
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class InternodeLabeler(Tree):
    """Number the internal nodes of a tree and write the labelled tree out."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Label a copy of the input tree and write it to the output path."""
        # Work on a deep copy so the tree returned by read_tree_file
        # (which may be cached) is left untouched.
        labelled = copy.deepcopy(self.read_tree_file())
        self.add_labels_to_tree(labelled)
        self.write_tree_file(labelled, self.output_file_path)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments to the keyword arguments of ``Tree``.

        When no output path is given, it defaults to the tree path with
        ``.internode_labels.tre`` appended.
        """
        output_file_path = args.output
        if not output_file_path:
            output_file_path = f"{args.tree}.internode_labels.tre"

        return {
            "tree_file_path": args.tree,
            "output_file_path": output_file_path,
        }

    def add_labels_to_tree(
        self,
        tree: Newick.Tree
    ) -> None:
        """Set ``confidence`` of each non-terminal node to 1, 2, 3, ...

        Nodes are numbered in the order ``tree.get_nonterminals()`` yields
        them. The tree is modified in place.
        """
        label = 0
        for node in tree.get_nonterminals():
            label += 1
            node.confidence = label
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import copy
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
from .base import Tree
|
|
6
|
+
|
|
7
|
+
from ...helpers.files import read_single_column_file_to_list
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LastCommonAncestorSubtree(Tree):
    """Write out the subtree rooted at the common ancestor of listed taxa."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Find the common ancestor of the listed taxa and write its subtree.

        Prints a message and exits with status 2 if the taxa list file
        does not exist.
        """
        # Work on a deep copy so the tree returned by read_tree_file
        # (which may be cached) is left untouched.
        working_tree = copy.deepcopy(self.read_tree_file())

        try:
            taxa = read_single_column_file_to_list(self.list_of_taxa)
        except FileNotFoundError:
            print("Taxa list file is not found. Please check pathing.")
            sys.exit(2)

        self.write_tree_file(
            working_tree.common_ancestor(taxa),
            self.output_file_path,
        )

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments to the keyword arguments of ``Tree``.

        When no output path is given, it defaults to the tree path with
        ``.subtree.tre`` appended.
        """
        tree_file_path = args.tree

        output_file_path = args.output
        if not output_file_path:
            output_file_path = f"{tree_file_path}.subtree.tre"

        return {
            "tree_file_path": tree_file_path,
            "output_file_path": output_file_path,
            "list_of_taxa": args.list_of_taxa,
        }
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List, Tuple
|
|
4
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pickle
|
|
8
|
+
from functools import lru_cache
|
|
9
|
+
try:
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
HAS_TQDM = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
HAS_TQDM = False
|
|
14
|
+
|
|
15
|
+
from Bio.Phylo import Newick
|
|
16
|
+
|
|
17
|
+
from .base import Tree
|
|
18
|
+
|
|
19
|
+
from ...helpers.stats_summary import (
|
|
20
|
+
calculate_summary_statistics_from_arr,
|
|
21
|
+
print_summary_statistics,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class LBScore(Tree):
    """Compute a long-branch (LB) score for every tip of a tree.

    For each tip the score is ``(PDi / avg_dist - 1) * 100`` where ``PDi``
    is the tip's average distance to the other tips and ``avg_dist`` is the
    average distance over all pairs of tips. In verbose mode one line per
    tip is printed; otherwise summary statistics of the scores are printed.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Read the tree, compute the LB scores and print them."""
        tree = self.read_tree_file()
        tips, LBis = self.calculate_lb_score(tree)
        if self.verbose:
            try:
                for tip, LBi in zip(tips, LBis):
                    print(f"{tip}\t{round(LBi, 4)}")
            except BrokenPipeError:
                # Output was piped to a consumer that closed early.
                pass
        else:
            stats = calculate_summary_statistics_from_arr(LBis)
            print_summary_statistics(stats)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments to the keyword arguments of ``Tree``."""
        return dict(tree_file_path=args.tree, verbose=args.verbose)

    @staticmethod
    def _calculate_distances_batch(tree_pickle, tip_pairs):
        """Worker: return ``tree.distance`` for each pair in ``tip_pairs``.

        ``tree_pickle`` is the pickled tree produced by this class in the
        parent process; it is never read from external input.
        """
        tree = pickle.loads(tree_pickle)
        return [tree.distance(tip1, tip2) for tip1, tip2 in tip_pairs]

    def calculate_average_distance_between_tips(
        self,
        tips: List[str],
        tree: Newick.Tree,
    ) -> float:
        """Return the mean distance over all unordered pairs of ``tips``.

        Returns 0 when fewer than two tips are given. Fewer than 100 pairs
        are processed in this process; larger inputs are split into batches
        and handed to a process pool.
        """
        num_tips = len(tips)
        if num_tips < 2:
            return 0

        all_pairs = list(itertools.combinations(tips, 2))
        num_combos = len(all_pairs)

        if num_combos < 100:
            total_dist = sum(
                tree.distance(tip1, tip2)
                for tip1, tip2 in all_pairs
            )
        else:
            tree_pickle = pickle.dumps(tree)
            batch_size = max(50, num_combos // mp.cpu_count())

            with ProcessPoolExecutor(max_workers=min(mp.cpu_count(), 8)) as executor:
                # Remember which batch each future belongs to so the batch
                # sums can be added in submission order below.
                future_to_slot = {}
                for slot, start in enumerate(range(0, num_combos, batch_size)):
                    batch = all_pairs[start:start + batch_size]
                    future = executor.submit(
                        self._calculate_distances_batch, tree_pickle, batch
                    )
                    future_to_slot[future] = slot

                completed = as_completed(future_to_slot)
                # Add progress bar if available and dataset is large.
                if HAS_TQDM and num_combos > 1000:
                    completed = tqdm(
                        completed,
                        total=len(future_to_slot),
                        desc="Computing distances",
                    )

                batch_sums = [0] * len(future_to_slot)
                for future in completed:
                    batch_sums[future_to_slot[future]] = sum(future.result())

            # Floating-point addition is order dependent; summing in batch
            # order (not completion order) keeps the result reproducible.
            total_dist = sum(batch_sums)

        return total_dist / num_combos if num_combos else 0

    @staticmethod
    def _mean_distance_to_others(tree, tip, other_tips):
        """Return the mean of ``tree.distance(tip, other)`` over ``other_tips``.

        Returns 0 when ``other_tips`` is empty.
        """
        distances = [tree.distance(tip, other_tip) for other_tip in other_tips]
        return sum(distances) / len(distances) if distances else 0

    @staticmethod
    def _calculate_tip_distances_batch(tree_pickle, tips_data):
        """Worker: average distance for each ``(tip, other_tips)`` in the batch.

        ``tree_pickle`` is the pickled tree produced by this class in the
        parent process; it is never read from external input.
        """
        tree = pickle.loads(tree_pickle)
        return [
            LBScore._mean_distance_to_others(tree, tip, other_tips)
            for tip, other_tips in tips_data
        ]

    def calculate_average_distance_of_taxon_to_other_taxa(
        self,
        tips: List[str],
        tree: Newick.Tree,
    ) -> List[float]:
        """Return, for each tip in order, its average distance to other tips.

        NOTE: the "other tips" are computed as ``set(tips) - set(tip)``.
        ``set(tip)`` is the set of *characters* of the tip name, not a set
        containing the tip, so the current tip is generally not removed and
        takes part in its own average. This long-standing behaviour is kept
        on purpose so results stay compatible with earlier versions.

        Up to 50 tips are processed in this process; larger inputs are
        split into batches and handed to a process pool.
        """
        all_tips = set(tips)  # loop invariant, build once

        if len(tips) <= 50:
            return [
                # Intentional quirk: set(tip) is a set of characters.
                self._mean_distance_to_others(
                    tree, tip, list(all_tips - set(tip))
                )
                for tip in tips
            ]

        # Intentional quirk preserved here as well (see docstring).
        tips_data = [(tip, list(all_tips - set(tip))) for tip in tips]

        batch_size = max(10, len(tips) // mp.cpu_count())
        tree_pickle = pickle.dumps(tree)

        with ProcessPoolExecutor(max_workers=min(mp.cpu_count(), 8)) as executor:
            futures = [
                executor.submit(
                    self._calculate_tip_distances_batch,
                    tree_pickle,
                    tips_data[start:start + batch_size],
                )
                for start in range(0, len(tips_data), batch_size)
            ]

            # Reading the futures in submission order keeps the results
            # aligned with ``tips``.
            avg_PDis = []
            for future in futures:
                avg_PDis.extend(future.result())

        return avg_PDis

    def calculate_lb_score_per_taxa(
        self,
        avg_PDis: List[float],
        avg_dist: float
    ) -> List[float]:
        """Convert per-tip average distances into LB scores.

        Each score is ``(PDi / avg_dist - 1) * 100``. When ``avg_dist`` is 0
        a message is printed and the program exits with status 2.
        """
        if avg_dist == 0:
            try:
                print("Invalid tree. Tree should contain branch lengths")
            except BrokenPipeError:
                pass
            # Exit even if the message could not be written; otherwise the
            # caller would go on with an empty score list.
            sys.exit(2)
            return []  # not reached in normal operation; keeps the return type

        # Use NumPy for vectorized computation.
        PDis_array = np.array(avg_PDis)
        LBis = ((PDis_array / avg_dist) - 1) * 100

        return LBis.tolist()

    def calculate_lb_score(
        self,
        tree: Newick.Tree
    ) -> Tuple[List[str], List[float]]:
        """Return ``(tip_names, lb_scores)`` for ``tree``, in matching order."""
        tips = self.get_tip_names_from_tree(tree)

        avg_dist = self.calculate_average_distance_between_tips(tips, tree)

        avg_PDis = \
            self.calculate_average_distance_of_taxon_to_other_taxa(tips, tree)

        LBis = self.calculate_lb_score_per_taxa(avg_PDis, avg_dist)

        return tips, LBis
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import Dict, List, Union
|
|
3
|
+
|
|
4
|
+
from Bio.Phylo import Newick
|
|
5
|
+
|
|
6
|
+
from .base import Tree
|
|
7
|
+
|
|
8
|
+
from ...helpers.stats_summary import calculate_summary_statistics_from_arr
|
|
9
|
+
from ...helpers.files import read_single_column_file_to_list
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MonophylyCheck(Tree):
    """Check whether a list of taxa forms a monophyletic group in a tree.

    Prints one tab-separated line: the status (``monophyletic`` or
    ``not_monophyletic``), the mean, maximum, minimum and standard
    deviation of the support values in the clade, and - when the group is
    not monophyletic - the ``;``-joined tips that differ. When at most one
    listed taxon is present in the tree,
    ``insufficient_taxon_representation`` is printed and the program exits
    with status 2.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Run the monophyly check and print the result."""
        tree = self.read_tree_file()
        taxa = read_single_column_file_to_list(self.list_of_taxa)

        res_arr = []

        tree_tips = frozenset(self.get_tip_names_from_tree(tree))
        taxa_of_interest = frozenset(taxa).intersection(tree_tips)

        if len(taxa_of_interest) <= 1:
            res_arr.append(["insufficient_taxon_representation"])
            # Report the reason before exiting; without this the program
            # would terminate without printing anything.
            self.print_results(res_arr)
            sys.exit(2)

        # shared_tips expects lists.
        shared_tree_tips = self.shared_tips(
            list(taxa_of_interest), list(tree_tips)
        )

        # Root on everything outside the group, then take the group's
        # common ancestor.
        diff_tips = list(tree_tips - frozenset(shared_tree_tips))
        tree.root_with_outgroup(diff_tips)
        tree = tree.common_ancestor(shared_tree_tips)

        common_ancestor_tips = frozenset(self.get_tip_names_from_tree(tree))
        diff_tips_between_clade_and_curr_tree = list(
            taxa_of_interest.symmetric_difference(common_ancestor_tips)
        )

        stats = self.get_bootstrap_statistics(tree)

        res_arr = self.populate_res_arr(
            diff_tips_between_clade_and_curr_tree, stats, res_arr
        )

        self.print_results(res_arr)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI arguments to the keyword arguments of ``Tree``."""
        return dict(
            tree_file_path=args.tree,
            list_of_taxa=args.list_of_taxa,
        )

    def get_bootstrap_statistics(
        self,
        clade: Newick.Clade
    ) -> Dict[str, Union[int, float]]:
        """Return summary statistics of the support values within ``clade``.

        Only non-terminal nodes whose ``confidence`` is not ``None``
        contribute.
        """
        bs_vals = [
            node.confidence for node in clade.get_nonterminals()
            if node.confidence is not None
        ]

        return calculate_summary_statistics_from_arr(bs_vals)

    def populate_res_arr(
        self,
        diff_tips_between_clade_and_curr_tree: List[str],
        stats: Dict[str, float],
        res_arr: List,
    ) -> List[List[Union[str, int, float]]]:
        """Append one result row to ``res_arr`` and return ``res_arr``.

        The row is ``[status, mean, maximum, minimum, standard_deviation,
        differing_tips]``; the status is ``monophyletic`` when there are no
        differing tips and ``not_monophyletic`` otherwise.
        """
        if len(diff_tips_between_clade_and_curr_tree) == 0:
            status = "monophyletic"
        else:
            status = "not_monophyletic"

        res_arr.append([
            status,
            stats["mean"],
            stats["maximum"],
            stats["minimum"],
            stats["standard_deviation"],
            diff_tips_between_clade_and_curr_tree,
        ])

        return res_arr

    def print_results(self, res_arr: List[List[Union[str, int, float]]]) -> None:
        """Print one line per result row.

        Rows with fewer than six elements (such as the single-element
        ``insufficient_taxon_representation`` row) print only their first
        element. Full rows print the status and the four statistics rounded
        to 4 decimals, followed by the sorted, ``;``-joined differing tips
        when there are any.
        """
        for res in res_arr:
            if len(res) < 6:
                print(f"{res[0]}")
                continue

            line = f"{res[0]}\t{round(res[1], 4)}\t{round(res[2], 4)}\t{round(res[3], 4)}\t{round(res[4], 4)}"
            if res[5]:
                # sorted() leaves the caller's list untouched.
                line = f"{line}\t{';'.join(sorted(res[5]))}"
            print(line)
|