phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from typing import Dict, List, Generator
|
|
3
|
+
import pickle
|
|
4
|
+
|
|
5
|
+
from Bio import Phylo
|
|
6
|
+
from Bio.Phylo import Newick
|
|
7
|
+
|
|
8
|
+
from .base import Tree
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class NearestNeighborInterchange(Tree):
    """Write a tree together with all of its nearest-neighbor-interchange
    (NNI) rearrangements to a Newick file.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Read the input tree and write it, followed by its NNI neighbors."""
        tree = self.read_tree_file()

        # The unmodified input tree is always the first entry in the output.
        all_nnis = [tree]
        all_nnis.extend(self.get_neighbors(tree))
        Phylo.write(all_nnis, self.output_file_path, "newick")

    def process_args(self, args) -> Dict[str, str]:
        """Turn parsed CLI arguments into constructor keyword arguments.

        When ``args.output`` is falsy the output path defaults to the input
        tree path with ``.nnis`` appended.
        """
        tree_file_path = args.tree
        output_file_path = \
            f"{args.output}" if args.output else f"{tree_file_path}.nnis"

        return dict(
            tree_file_path=tree_file_path,
            output_file_path=output_file_path
        )

    def _fast_tree_copy(self, tree: Newick.Tree) -> Newick.Tree:
        """Return an independent copy of ``tree`` via a pickle round trip.

        Used in place of ``copy.deepcopy`` to snapshot each rearrangement.
        """
        return pickle.loads(pickle.dumps(tree, protocol=pickle.HIGHEST_PROTOCOL))

    def get_neighbors(
        self,
        tree: Newick.Tree
    ) -> List[Newick.Tree]:
        """Return every NNI neighbor of ``tree``.

        Adapted from Biopython's neighbor generation. Currently only for
        binary rooted trees.

        Each rearrangement is applied to ``tree`` in place, snapshotted with
        ``_fast_tree_copy``, and then undone before moving on, so the
        returned trees are independent copies.
        """
        # Child -> parent lookup, built in a single pass over the tree.
        # Keys are the clade objects themselves, so lookups are by identity.
        parents = {
            child: clade
            for clade in tree.find_clades()
            for child in clade.clades
        }

        neighbors = []
        root_children = []
        # Level order visits the root first, so root_children is filled in
        # before any other internal clade is examined.
        for clade in tree.get_nonterminals(order="level"):
            if clade == tree.root:
                left = clade.clades[0]
                right = clade.clades[1]
                root_children.append(left)
                root_children.append(right)
                # Swaps across the root only make sense when both of its
                # children have children of their own.
                if not left.is_terminal() and not right.is_terminal():
                    neighbors.extend(
                        self._neighbors_around_root(tree, left, right)
                    )
            elif clade in root_children:
                # skip root child
                continue
            else:
                neighbors.extend(
                    self._neighbors_around_clade(tree, clade, parents[clade])
                )

        return neighbors

    def _neighbors_around_root(
        self,
        tree: Newick.Tree,
        left: Newick.Clade,
        right: Newick.Clade,
    ) -> List[Newick.Tree]:
        """Return the two neighbors obtained by swapping across the root."""
        # left.clades[0] (left_left) stays in place for both rearrangements.
        left_right = left.clades[1]
        right_left = right.clades[0]
        right_right = right.clades[1]
        found = []

        # neighbor 1 (left_left + right_right)
        del left.clades[1]
        del right.clades[1]
        left.clades.append(right_right)
        right.clades.append(left_right)
        found.append(self._fast_tree_copy(tree))

        # neighbor 2 (left_left + right_left)
        del left.clades[1]
        del right.clades[0]
        left.clades.append(right_left)
        right.clades.append(right_right)
        found.append(self._fast_tree_copy(tree))

        # change back (left_left + left_right)
        del left.clades[1]
        del right.clades[0]
        left.clades.append(left_right)
        right.clades.insert(0, right_left)

        return found

    def _neighbors_around_clade(
        self,
        tree: Newick.Tree,
        clade: Newick.Clade,
        parent: Newick.Clade,
    ) -> List[Newick.Tree]:
        """Return the two neighbors obtained by swapping a child of ``clade``
        with the sister of ``clade`` under ``parent``.
        """
        left = clade.clades[0]
        right = clade.clades[1]
        found = []

        if clade == parent.clades[0]:
            # clade is the first child, so its sister sits at index 1.
            sister = parent.clades[1]

            # neighbor 1 (parent + right)
            del parent.clades[1]
            del clade.clades[1]
            parent.clades.append(right)
            clade.clades.append(sister)
            found.append(self._fast_tree_copy(tree))

            # neighbor 2 (parent + left)
            del parent.clades[1]
            del clade.clades[0]
            parent.clades.append(left)
            clade.clades.append(right)
            found.append(self._fast_tree_copy(tree))

            # change back (parent + sister)
            del parent.clades[1]
            del clade.clades[0]
            parent.clades.append(sister)
            clade.clades.insert(0, left)
        else:
            # Otherwise the sister is taken from index 0.
            sister = parent.clades[0]

            # neighbor 1 (parent + right)
            del parent.clades[0]
            del clade.clades[1]
            parent.clades.insert(0, right)
            clade.clades.append(sister)
            found.append(self._fast_tree_copy(tree))

            # neighbor 2 (parent + left)
            del parent.clades[0]
            del clade.clades[0]
            parent.clades.insert(0, left)
            clade.clades.append(right)
            found.append(self._fast_tree_copy(tree))

            # change back (parent + sister)
            del parent.clades[0]
            del clade.clades[0]
            parent.clades.insert(0, sister)
            clade.clades.insert(0, left)

        return found
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple
|
|
2
|
+
import itertools
|
|
3
|
+
import multiprocessing as mp
|
|
4
|
+
from functools import partial
|
|
5
|
+
import pickle
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from Bio.Phylo import Newick
|
|
9
|
+
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *_args, **_kwargs):
        """Stand-in used when tqdm is not installed.

        Accepts and ignores any extra arguments and hands the iterable
        back unchanged, so callers can wrap iterables unconditionally.
        """
        return iterable
|
|
15
|
+
|
|
16
|
+
from .base import Tree
|
|
17
|
+
|
|
18
|
+
from ...helpers.stats_summary import (
|
|
19
|
+
calculate_summary_statistics_from_arr,
|
|
20
|
+
print_summary_statistics,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PatristicDistances(Tree):
    """Compute the distance between every pair of tips in a tree and print
    either the per-pair values or summary statistics.
    """

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Print per-pair distances when verbose, summary statistics otherwise."""
        tree = self.read_tree_file()
        patristic_distances, combos, stats = \
            self.calculate_patristic_distances(tree)

        if not self.verbose:
            print_summary_statistics(stats)
            return

        try:
            for combo, patristic_distance in zip(combos, patristic_distances):
                print(f"{combo[0]}\t{combo[1]}\t{round(patristic_distance, 4)}")
        except BrokenPipeError:
            # Deliberately ignored: a closed pipe on stdout ends the listing
            # quietly.
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Turn parsed CLI arguments into constructor keyword arguments."""
        return {"tree_file_path": args.tree, "verbose": args.verbose}

    def _calculate_distance_batch(self, tree_pickle, combo_batch):
        """Unpickle the tree and return the distance for each pair in the batch."""
        tree = pickle.loads(tree_pickle)
        batch_distances = []
        for combo in combo_batch:
            batch_distances.append(tree.distance(combo[0], combo[1]))
        return batch_distances

    def calculate_distance_between_pairs(
        self,
        tips: List[str],
        tree
    ) -> Tuple[
        List[Tuple[str, str]],
        List[float],
    ]:
        """Return all tip pairs and the matching list of distances.

        Fewer than 100 pairs are handled in the current process; otherwise
        the pairs are split into chunks and farmed out to a worker pool.
        """
        combos = list(itertools.combinations(tips, 2))

        # Small inputs: compute everything in this process.
        if len(combos) < 100:
            return combos, [
                tree.distance(combo[0], combo[1]) for combo in combos
            ]

        # Larger inputs: pickle the tree once and hand the bytes to workers.
        tree_pickle = pickle.dumps(tree)

        # At most 8 worker processes.
        num_workers = min(mp.cpu_count(), 8)

        # Chunk size targets about four chunks per worker, never below 1.
        chunk_size = max(1, len(combos) // (num_workers * 4))
        combo_chunks = [
            combos[start:start + chunk_size]
            for start in range(0, len(combos), chunk_size)
        ]

        calc_func = partial(self._calculate_distance_batch, tree_pickle)

        with mp.Pool(processes=num_workers) as pool:
            if sys.stderr.isatty():
                # Progress bar only when stderr is a terminal.
                progress = tqdm(
                    pool.imap(calc_func, combo_chunks),
                    total=len(combo_chunks),
                    desc="Calculating patristic distances",
                    unit="batch"
                )
                results = list(progress)
            else:
                results = pool.map(calc_func, combo_chunks)

        # Flatten per-chunk results back into one list, preserving pair order.
        patristic_distances = list(itertools.chain.from_iterable(results))

        return combos, patristic_distances

    def calculate_patristic_distances(
        self,
        tree: Newick.Tree,
    ) -> Tuple[
        List[float],
        List[Tuple[str, str]],
        Dict[str, float],
    ]:
        """Return ``(distances, tip pairs, summary statistics)`` for ``tree``."""
        tips = self.get_tip_names_from_tree(tree)
        combos, patristic_distances = \
            self.calculate_distance_between_pairs(tips, tree)
        stats = calculate_summary_statistics_from_arr(patristic_distances)
        return patristic_distances, combos, stats
|