phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,178 @@
1
+ import sys
2
+ import copy
3
+ from typing import List
4
+ from functools import lru_cache
5
+ import os
6
+ import hashlib
7
+
8
+ from Bio import Phylo
9
+
10
+ from ..base import BaseService
11
+
12
+
13
+ class Tree(BaseService):
14
def __init__(
    self,
    *args,
    tree_file_path=None,
    idmap=None,
    alignment_file_path=None,
    tree1_file_path=None,
    outgroup_taxa_file_path=None,
    output_file_path=None,
    factor=None,
    remove=None,
    verbose=None,
    reference=None,
    list_of_taxa=None,
    trees=None,
    groups=None,
    support=None,
    tip_1=None,
    tip_2=None,
    clade=None,
    keep=None,
    exclude_gaps=None,
):
    """Store every command option on the instance.

    Positional arguments are accepted but never read. Each keyword
    option (all default to None) is copied to the attribute of the
    same name; tree_format is always set to "newick".
    """
    # Format string handed to the tree reader/writer helpers.
    self.tree_format = "newick"

    # File locations.
    self.tree_file_path = tree_file_path
    self.tree1_file_path = tree1_file_path
    self.reference = reference
    self.alignment_file_path = alignment_file_path
    self.outgroup_taxa_file_path = outgroup_taxa_file_path
    self.output_file_path = output_file_path

    # Taxon / tip selection.
    self.idmap = idmap
    self.list_of_taxa = list_of_taxa
    self.trees = trees
    self.groups = groups
    self.tip_1 = tip_1
    self.tip_2 = tip_2
    self.clade = clade

    # Numeric settings and switches.
    self.factor = factor
    self.support = support
    self.remove = remove
    self.keep = keep
    self.exclude_gaps = exclude_gaps
    self.verbose = verbose
57
+
58
@staticmethod
@lru_cache(maxsize=32)
def _cached_tree_read(file_path: str, tree_format: str, file_hash: str):
    """Parse a tree file, memoized on the full argument triple.

    file_hash is never read in the body: it only takes part in the
    cache key, so a new hash for the same path forces a fresh parse.
    The cache keeps at most 32 results.
    """
    parsed_tree = Phylo.read(file_path, tree_format)
    return parsed_tree
63
+
64
@staticmethod
def _get_file_hash(file_path: str) -> str:
    """Return a cache token built from the path, size and mtime of a file.

    The token is the hex MD5 digest of "<path>_<size>_<mtime>", so it
    changes when the file is resized or its modification time moves.
    MD5 serves only as a fingerprint here.

    Returns:
        The 32-character hex digest, or "" when no token can be
        produced (file missing or unreadable, path is None or otherwise
        unusable).
    """
    try:
        stat = os.stat(file_path)
        cache_key = f"{file_path}_{stat.st_size}_{stat.st_mtime}"
        return hashlib.md5(cache_key.encode()).hexdigest()
    except (OSError, TypeError, ValueError):
        # OSError: file missing/unreadable; TypeError: path is None;
        # ValueError: path not encodable or rejected by the OS call.
        # Anything else (e.g. KeyboardInterrupt) must propagate.
        return ""
73
+
74
def read_tree_file(self):
    """Load the tree stored at self.tree_file_path.

    The parse goes through the memoized reader; the caller receives a
    deep copy so edits never reach the cached object. If the reader
    raises FileNotFoundError, a two-line message is printed and the
    process exits with status 2.
    """
    path = self.tree_file_path
    try:
        cached = self._cached_tree_read(
            path, self.tree_format, self._get_file_hash(path)
        )
    except FileNotFoundError:
        print(f"{path} corresponds to no such file or directory.")
        print("Please check filename and pathing")
        sys.exit(2)
    # Hand out a private copy; the cached tree must stay pristine.
    return copy.deepcopy(cached)
84
+
85
def read_tree1_file(self):
    """Load the tree stored at self.tree1_file_path.

    The parse goes through the memoized reader; the caller receives a
    deep copy so edits never reach the cached object. If the reader
    raises FileNotFoundError, a two-line message is printed and the
    process exits with status 2.
    """
    path = self.tree1_file_path
    try:
        cached = self._cached_tree_read(
            path, self.tree_format, self._get_file_hash(path)
        )
    except FileNotFoundError:
        print(f"{path} corresponds to no such file or directory.")
        print("Please check filename and pathing")
        sys.exit(2)
    # Hand out a private copy; the cached tree must stay pristine.
    return copy.deepcopy(cached)
95
+
96
def read_reference_tree_file(self):
    """Load the tree stored at self.reference.

    The parse goes through the memoized reader; the caller receives a
    deep copy so edits never reach the cached object. If the reader
    raises FileNotFoundError, a two-line message is printed and the
    process exits with status 2.
    """
    path = self.reference
    try:
        cached = self._cached_tree_read(
            path, self.tree_format, self._get_file_hash(path)
        )
    except FileNotFoundError:
        print(f"{path} corresponds to no such file or directory.")
        print("Please check filename and pathing")
        sys.exit(2)
    # Hand out a private copy; the cached tree must stay pristine.
    return copy.deepcopy(cached)
106
+
107
def write_tree_file(self, tree, output_file_path):
    """Write tree to output_file_path in self.tree_format.

    Returns whatever Phylo.write returns.
    """
    write_result = Phylo.write(tree, output_file_path, self.tree_format)
    return write_result
109
+
110
def get_tip_names_from_tree(self, tree) -> list:
    """
    collect the name of every terminal of a tree, in the
    order tree.get_terminals() returns them
    """
    names = []
    for terminal in tree.get_terminals():
        names.append(terminal.name)
    return names
116
+
117
def shared_tips(self, a, b):
    """
    Determines what tips are shared between two trees
    -------------------------------------------------
    argv: a
        list of tips from one tree
    argv: b
        list of tips from a second tree

    Returns the shared tips as a list without duplicates, in the
    order they first appear in a. When the two lists share nothing,
    prints "no common tips" and exits with status 2.
    """
    b_set = set(b)
    # dict.fromkeys de-duplicates a while keeping its order, so the
    # result no longer depends on set iteration order (hash seed), and
    # the intersection is only computed once.
    common = [tip for tip in dict.fromkeys(a) if tip in b_set]

    if not common:
        print("no common tips")
        sys.exit(2)
    return common
136
+
137
def prune_tree_using_taxa_list(self, tree, taxa_to_prune: list):
    """
    remove the listed taxa from the tree, one tree.prune() call
    per entry, and return the same (modified) tree object
    """
    for taxon_name in taxa_to_prune:
        tree.prune(taxon_name)
    return tree
145
+
146
def calculate_treeness(self, tree=None, print_value=False):
    """Return treeness: summed internal branch length / total tree length.

    Args:
        tree: tree to score; when None the tree at self.tree_file_path
            is read via self.read_tree_file().
        print_value: when true, the value is also printed to stdout.

    Returns:
        The ratio as a float, or None (after printing a message) when
        the total branch length is zero.
    """
    if tree is None:
        tree = self.read_tree_file()

    # Internal branches without a length contribute nothing.
    inter_len = sum(
        (
            internal.branch_length
            for internal in tree.get_nonterminals()
            if internal.branch_length is not None
        ),
        0.0,
    )
    total_len = tree.total_branch_length()

    try:
        treeness = float(inter_len / total_len)
    except ZeroDivisionError:
        try:
            print("Invalid tree. Tree should contain branch lengths")
        except BrokenPipeError:
            pass
        return None

    if print_value:
        try:
            print(f"{treeness}")
        except BrokenPipeError:
            # A closed stdout must not cost the caller the result.
            pass
    return treeness
173
+
174
def get_gap_chars(is_protein: bool) -> List[str]:
    """Return the list of characters treated as gaps.

    "-", "?", "*", "X" and "x" are always included; "N" and "n" are
    appended when is_protein is false. A new list is built per call.
    """
    gap_chars = ["-", "?", "*", "X", "x"]
    if not is_protein:
        gap_chars += ["N", "n"]
    return gap_chars
@@ -0,0 +1,48 @@
1
+ from typing import Dict, List, Tuple
2
+
3
+ from Bio.Phylo import Newick
4
+
5
+ from .base import Tree
6
+ from ...helpers.stats_summary import (
7
+ calculate_summary_statistics_from_arr,
8
+ print_summary_statistics,
9
+ )
10
+
11
+
12
+ class BipartitionSupportStats(Tree):
13
def __init__(self, args) -> None:
    """Translate parsed CLI args via process_args and pass them to the base initializer."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
15
+
16
def run(self) -> None:
    """Report bipartition support for the input tree.

    Verbose mode prints one line per supported internal node: the
    support value, a space, then the node's tip names joined by ";".
    Otherwise summary statistics of the support values are printed.
    """
    tree = self.read_tree_file()
    bs_vals, term_names = self.get_bipartition_support_vals(tree)

    if not self.verbose:
        stats = calculate_summary_statistics_from_arr(bs_vals)
        print_summary_statistics(stats)
        return

    try:
        # Both lists are filled in lockstep, so pairing them is safe.
        for support_value, names in zip(bs_vals, term_names):
            print(support_value, ";".join(names))
    except BrokenPipeError:
        pass
29
+
30
def process_args(self, args) -> Dict[str, str]:
    """Map parsed CLI args onto the keyword arguments of the base initializer."""
    return {
        "tree_file_path": args.tree,
        "verbose": args.verbose,
    }
32
+
33
def get_bipartition_support_vals(
    self,
    tree: Newick.Tree,
) -> Tuple[List[float], List[List[str]]]:
    """Collect support values and tip names of the supported internal nodes.

    Only nonterminals whose confidence is not None are reported. The two
    returned lists are parallel: entry i of the second list holds the tip
    names found under the node whose confidence is entry i of the first.
    """
    supported = [
        clade
        for clade in tree.get_nonterminals()
        if clade.confidence is not None
    ]
    bs_vals = [clade.confidence for clade in supported]
    term_names = [
        [tip.name for tip in clade.get_terminals()] for clade in supported
    ]
    return bs_vals, term_names
@@ -0,0 +1,37 @@
1
+ from typing import Dict
2
+ import copy
3
+
4
+ from Bio.Phylo import Newick
5
+
6
+ from .base import Tree
7
+
8
+
9
+ class BranchLengthMultiplier(Tree):
10
def __init__(self, args) -> None:
    """Translate parsed CLI args via process_args and pass them to the base initializer."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
12
+
13
def run(self) -> None:
    """Scale all branch lengths by self.factor and write the tree out.

    The scaled tree is written to self.output_file_path.
    """
    # read_tree_file() already returns a private deep copy of the cached
    # tree, so it can be modified directly; a second copy is wasted work.
    tree = self.read_tree_file()
    self.multiply_branch_lengths_by_factor(tree, self.factor)
    self.write_tree_file(tree, self.output_file_path)
19
+
20
def process_args(self, args) -> Dict[str, str]:
    """Map parsed CLI args onto the keyword arguments of the base initializer.

    When no output path is given, the name
    "<tree>.factor_<factor>.tre" is used.
    """
    destination = args.output
    if not destination:
        destination = f"{args.tree}.factor_{args.factor}.tre"
    return {
        "tree_file_path": args.tree,
        "factor": args.factor,
        "output_file_path": destination,
    }
28
+
29
def multiply_branch_lengths_by_factor(
    self,
    tree: Newick.Tree,
    factor: float,
) -> Newick.Tree:
    """Multiply every defined branch length of tree by factor, in place.

    Nonterminals are visited first, then terminals. Nodes whose branch
    length is None are left untouched. The same tree object is returned.
    """
    internal_nodes = tree.get_nonterminals()
    tip_nodes = tree.get_terminals()
    for group in (internal_nodes, tip_nodes):
        for clade in group:
            current = clade.branch_length
            if current is None:
                continue
            clade.branch_length = current * factor
    return tree
@@ -0,0 +1,27 @@
1
+ from typing import Dict
2
+ import copy
3
+
4
+ from .base import Tree
5
+
6
+
7
+ class CollapseBranches(Tree):
8
def __init__(self, args) -> None:
    """Translate parsed CLI args via process_args and pass them to the base initializer."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
10
+
11
def run(self):
    """Collapse every branch whose support is below self.support.

    Branches without a support value (confidence is None) are kept.
    The resulting tree is written to self.output_file_path.
    """
    # read_tree_file() already returns a private deep copy of the cached
    # tree, so it can be modified directly; a second copy is wasted work.
    tree = self.read_tree_file()
    threshold = self.support
    tree.collapse_all(
        # Compare against None explicitly: a support value of 0 is a real
        # value below any positive threshold, not a missing one.
        lambda clade: clade.confidence is not None
        and clade.confidence < threshold
    )
    self.write_tree_file(tree, self.output_file_path)
19
+
20
def process_args(self, args) -> Dict[str, str]:
    """Map parsed CLI args onto the keyword arguments of the base initializer.

    When no output path is given, the name
    "<tree>.collapsed_<support>.tre" is used.
    """
    destination = args.output
    if not destination:
        destination = f"{args.tree}.collapsed_{args.support}.tre"
    return {
        "tree_file_path": args.tree,
        "support": args.support,
        "output_file_path": destination,
    }
@@ -0,0 +1,272 @@
1
+ import copy
2
+ import numpy as np
3
+ from concurrent.futures import ProcessPoolExecutor, as_completed
4
+ from functools import lru_cache
5
+ import pickle
6
+
7
+ from scipy.stats import pearsonr, zscore
8
+
9
+ from .base import Tree
10
+
11
+
12
+ class CovaryingEvolutionaryRates(Tree):
13
def __init__(self, args) -> None:
    """Translate parsed CLI args via process_args and pass them to the base initializer."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
15
+
16
def run(self):
    """Correlate the corrected branch lengths of two gene trees.

    Steps: read both gene trees and the reference tree, prune each to
    the tips shared by the two gene trees, correct branch lengths by
    the reference tree, drop outliers, z-score both series and compute
    their Pearson correlation.

    Verbose mode prints one tab-separated line per retained branch
    (both standardized values, then the ";"-joined tip names).
    Otherwise a single line with the coefficient and the p value.
    """
    gene_tree_a = self.read_tree_file()
    gene_tree_b = self.read_tree1_file()
    species_tree = self.read_reference_tree_file()

    tips_a = self.get_tip_names_from_tree(gene_tree_a)
    tips_b = self.get_tip_names_from_tree(gene_tree_b)
    tips_ref = self.get_tip_names_from_tree(species_tree)

    # Only tips present in both gene trees are kept; everything else
    # is pruned from all three trees.
    shared = set(self.shared_tips(tips_a, tips_b))
    gene_tree_a = self.prune_tips(gene_tree_a, list(set(tips_a) - shared))
    gene_tree_b = self.prune_tips(gene_tree_b, list(set(tips_b) - shared))
    species_tree = self.prune_tips(species_tree, list(set(tips_ref) - shared))

    # Branch lengths corrected by the species tree branch length.
    rates_a, rates_b, tip_names = self.correct_branch_lengths(
        gene_tree_a, gene_tree_b, species_tree
    )

    # An index flagged in either series is removed from both series
    # and from the tip names, keeping the three lists aligned.
    flagged = self.get_indices_of_outlier_branch_lengths(rates_a, [])
    flagged = self.get_indices_of_outlier_branch_lengths(rates_b, flagged)
    rates_a = self.remove_outliers_based_on_indices(rates_a, flagged)
    rates_b = self.remove_outliers_based_on_indices(rates_b, flagged)
    tip_names = self.remove_outliers_based_on_indices(tip_names, flagged)

    # Standardize values for the final correction.
    rates_a = zscore(rates_a)
    rates_b = zscore(rates_b)

    # corr[0] is the coefficient, corr[1] the p value.
    corr = list(pearsonr(rates_a, rates_b))

    try:
        if not self.verbose:
            print(f"{round(corr[0], 4)}\t{round(corr[1], 6)}")
            return
        for val_a, val_b, names in zip(rates_a, rates_b, tip_names):
            print(f"{round(val_a, 4)}\t{round(val_b, 4)}\t{';'.join(names)}")
    except BrokenPipeError:
        pass
93
+
94
def process_args(self, args):
    """Map parsed CLI args onto the keyword arguments of the base initializer."""
    return {
        "tree_file_path": args.tree_zero,
        "tree1_file_path": args.tree_one,
        "reference": args.reference,
        "verbose": args.verbose,
    }
101
+
102
def get_indices_of_outlier_branch_lengths(
    self, corr_branch_lengths, outlier_indices
):
    """
    extend outlier_indices with the positions of branch lengths
    whose absolute value is greater than 5 or that are NaN;
    returns the combined indices as a list without duplicates
    """
    values = np.array(corr_branch_lengths, dtype=float)

    too_large = np.abs(values) > 5
    not_a_number = np.isnan(values)
    flagged = np.where(too_large | not_a_number)[0]

    # A set merges the previous and the new indices without duplicates.
    merged = set(outlier_indices)
    merged.update(flagged.tolist())
    return list(merged)
120
+
121
def remove_outliers_based_on_indices(self, corr_branch_lengths, outlier_indices):
    """
    remove value if the value is an outlier according
    to the outlier indices list

    Works the same for numeric lists and for lists of tip-name lists.
    With no outlier indices the input list itself is returned;
    otherwise a new list without the flagged positions is returned.
    """
    if not outlier_indices:
        return corr_branch_lengths

    # One filter serves every element type; the former type check
    # selected between two identical branches.
    dropped = set(outlier_indices)
    return [
        item
        for position, item in enumerate(corr_branch_lengths)
        if position not in dropped
    ]
139
+
140
def prune_tips(self, tree, tips):
    """
    remove the given tips from a tree, one tree.prune() call
    per tip, and return the same (modified) tree object
    """
    for tip_name in tips:
        tree.prune(tip_name)
    return tree
149
+
150
@staticmethod
def _process_terminal_batch(tree0_pickle, tree1_pickle, terminals_data):
    """Process a batch of terminals in parallel.

    Args:
        tree0_pickle, tree1_pickle: pickled gene trees.
        terminals_data: iterable of (tip name, reference branch length,
            tip-name list) tuples.

    Returns:
        A list of (corrected length in tree 0, corrected length in
        tree 1, tip-name list) tuples. Best effort: an entry is skipped
        when either gene-tree branch length is missing or zero, or when
        its lookup or division fails.
    """
    # NOTE: pickle.loads is only safe on bytes this program produced
    # itself (correct_branch_lengths pickles the trees before
    # submitting); never pass externally supplied data here.
    t0 = pickle.loads(tree0_pickle)
    t1 = pickle.loads(tree1_pickle)

    results = []
    for terminal_name, terminal_bl, sp_tips in terminals_data:
        try:
            length0 = t0.common_ancestor(terminal_name).branch_length
            length1 = t1.common_ancestor(terminal_name).branch_length
            if not (length0 and length1):
                continue
            corrected0 = round(length0 / terminal_bl, 6)
            corrected1 = round(length1 / terminal_bl, 6)
        except Exception:
            # Skip this entry only; KeyboardInterrupt and SystemExit
            # are no longer swallowed.
            continue
        results.append((corrected0, corrected1, sp_tips))
    return results
170
+
171
@staticmethod
def _process_nonterminal_batch(tree0_pickle, tree1_pickle, nonterminals_data):
    """Process a batch of nonterminals in parallel.

    Args:
        tree0_pickle, tree1_pickle: pickled gene trees.
        nonterminals_data: iterable of (tip-name list, reference branch
            length) tuples.

    Returns:
        A list of (corrected length in tree 0, corrected length in
        tree 1, tip-name list) tuples. Best effort: an entry is skipped
        when any of the three branch lengths is missing or zero, or
        when its lookup or division fails.
    """
    # NOTE: pickle.loads is only safe on bytes this program produced
    # itself (correct_branch_lengths pickles the trees before
    # submitting); never pass externally supplied data here.
    t0 = pickle.loads(tree0_pickle)
    t1 = pickle.loads(tree1_pickle)

    results = []
    for sp_tips, nonterminal_bl in nonterminals_data:
        try:
            length0 = t0.common_ancestor(sp_tips).branch_length
            length1 = t1.common_ancestor(sp_tips).branch_length
            if not (length0 and length1 and nonterminal_bl):
                continue
            corrected0 = round(length0 / nonterminal_bl, 6)
            corrected1 = round(length1 / nonterminal_bl, 6)
        except Exception:
            # Skip this entry only; KeyboardInterrupt and SystemExit
            # are no longer swallowed.
            continue
        results.append((corrected0, corrected1, sp_tips))
    return results
190
+
191
def correct_branch_lengths(self, t0, t1, sp):
    """
    obtain a list of corrected branch lengths with parallel processing

    Every terminal and nonterminal of the reference tree sp is looked up
    in both gene trees (t0, t1) and the gene-tree branch length is
    divided by the reference branch length, rounded to 6 decimals.

    Returns (l0, l1, tip_names): three lists of equal length where
    entry i holds the corrected length in t0, the corrected length in
    t1 and the tip names of that branch. A branch is only reported
    when all three branch lengths are present and non-zero and the
    lookup succeeds, so the lists can never drift out of step.
    Terminals come first, then nonterminals, each in reference order.
    """
    l0 = []
    l1 = []
    tip_names = []

    terminals = sp.get_terminals()
    nonterminals = sp.get_nonterminals()

    # Small inputs are handled in-process; a worker pool only pays off
    # for larger reference trees.
    if len(terminals) + len(nonterminals) < 50:

        def record(target, ref_length, sp_tips):
            """Append one corrected branch to all three lists, or to none."""
            if not ref_length:
                return
            try:
                length0 = t0.common_ancestor(target).branch_length
                length1 = t1.common_ancestor(target).branch_length
                if not (length0 and length1):
                    return
                corrected0 = round(length0 / ref_length, 6)
                corrected1 = round(length1 / ref_length, 6)
            except Exception:
                # Best effort: a branch that cannot be resolved in
                # either gene tree is skipped.
                return
            # Appending only after both values exist keeps the lists
            # aligned.
            l0.append(corrected0)
            l1.append(corrected1)
            tip_names.append(sp_tips)

        for node in terminals:
            record(
                node.name,
                node.branch_length,
                self.get_tip_names_from_tree(node),
            )
        for node in nonterminals:
            sp_tips = self.get_tip_names_from_tree(node)
            record(sp_tips, node.branch_length, sp_tips)

        return (l0, l1, tip_names)

    # Parallel processing for large datasets: the trees are pickled once
    # and handed to every worker.
    tree0_pickle = pickle.dumps(t0)
    tree1_pickle = pickle.dumps(t1)

    terminals_data = [
        (node.name, node.branch_length, self.get_tip_names_from_tree(node))
        for node in terminals
        if node.branch_length
    ]
    nonterminals_data = [
        (self.get_tip_names_from_tree(node), node.branch_length)
        for node in nonterminals
        if node.branch_length
    ]

    total_items = len(terminals_data) + len(nonterminals_data)
    if total_items == 0:
        # Nothing has a usable reference branch length; do not start a
        # pool (a worker count of 0 would raise ValueError).
        return (l0, l1, tip_names)

    batch_size = max(10, total_items // 4)
    # One worker per ten items, between 1 and 4.
    max_workers = max(1, min(4, total_items // 10))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []

        for start in range(0, len(terminals_data), batch_size):
            batch = terminals_data[start:start + batch_size]
            futures.append(
                executor.submit(
                    self._process_terminal_batch,
                    tree0_pickle,
                    tree1_pickle,
                    batch,
                )
            )

        for start in range(0, len(nonterminals_data), batch_size):
            batch = nonterminals_data[start:start + batch_size]
            futures.append(
                executor.submit(
                    self._process_nonterminal_batch,
                    tree0_pickle,
                    tree1_pickle,
                    batch,
                )
            )

        # Collect in submission order, not completion order, so the
        # output order is reproducible from run to run.
        for future in futures:
            for bl0, bl1, sp_tips in future.result():
                l0.append(bl0)
                l1.append(bl1)
                tip_names.append(sp_tips)

    return (l0, l1, tip_names)
@@ -0,0 +1,37 @@
1
+ import math
2
+ from typing import Dict
3
+ import numpy as np
4
+
5
+ from Bio.Phylo import Newick
6
+
7
+ from .base import Tree
8
+
9
+
10
+ class DVMC(Tree):
11
def __init__(self, args) -> None:
    """Translate parsed CLI args via process_args and pass them to the base initializer."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
13
+
14
def run(self):
    """Print the DVMC value of the input tree, rounded to 4 decimals."""
    dvmc = self.determine_dvmc(self.read_tree_file())
    print(round(dvmc, 4))
18
+
19
def process_args(self, args) -> Dict[str, str]:
    """Map parsed CLI args onto the keyword arguments of the base initializer."""
    return {"tree_file_path": args.tree}
21
+
22
def determine_dvmc(self, tree: Newick.Tree) -> float:
    """Return the sample standard deviation of the tree's tip distances.

    For every terminal, tree.distance(terminal) is collected; the result
    is the standard deviation of those distances with an n - 1
    denominator.

    With fewer than two terminals the value is undefined and NaN is
    returned (numpy emits a RuntimeWarning).
    """
    distances = np.array(
        [tree.distance(term) for term in tree.get_terminals()],
        dtype=float,
    )

    # np.std subtracts the mean before squaring. The former one-pass
    # form sum(x^2) - n * mean^2 loses precision when all distances are
    # nearly equal and could turn slightly negative, producing NaN
    # under the square root.
    return np.std(distances, ddof=1)
@@ -0,0 +1,17 @@
1
+ from typing import Dict
2
+
3
+ from .base import Tree
4
+
5
+
6
+ class EvolutionaryRate(Tree):
7
def __init__(self, args) -> None:
    """Translate parsed CLI args via process_args and pass them to the base initializer."""
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
9
+
10
def run(self) -> None:
    """Print total branch length divided by the number of terminals, rounded to 4 decimals."""
    tree = self.read_tree_file()
    rate = tree.total_branch_length() / tree.count_terminals()
    print(round(rate, 4))
15
+
16
def process_args(self, args) -> Dict[str, str]:
    """Map parsed CLI args onto the keyword arguments of the base initializer."""
    return {"tree_file_path": args.tree}