phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import itertools
|
|
3
|
+
from scipy.stats import chisquare
|
|
4
|
+
from scipy.stats import _stats_py
|
|
5
|
+
from typing import Dict, List, Tuple, Union
|
|
6
|
+
import multiprocessing as mp
|
|
7
|
+
from functools import partial, lru_cache
|
|
8
|
+
import hashlib
|
|
9
|
+
import pickle
|
|
10
|
+
|
|
11
|
+
from Bio import Phylo
|
|
12
|
+
from Bio.Phylo import Newick
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
from .base import Tree
|
|
16
|
+
from ...helpers.files import read_single_column_file_to_list
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PolytomyTest(Tree):
|
|
20
|
+
def __init__(self, args) -> None:
    """Initialise the polytomy test from parsed command-line arguments.

    ``process_args`` reduces ``args`` to keyword arguments, which are
    forwarded unchanged to the parent ``Tree`` initialiser.
    """
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
|
|
22
|
+
|
|
23
|
+
def run(self):
    """Run the polytomy test end to end and print the gene-level result.

    Steps: parse the groups file, read the list of tree files, tally the
    sister relationship shown by every triplet in every tree, run the
    chi-square tests and print the gene support frequency summary.
    """
    # parse the groups file, then split the records into the three test
    # groups and the outgroup taxa
    groups_of_groups, outgroup_taxa = self.determine_groups_of_groups(
        self.read_in_groups()
    )

    # presumably one tree file path per line of the trees file
    tree_paths = read_single_column_file_to_list(self.trees)

    # per tree: how often each sister pairing is seen among all triplets
    sister_summary = self.loop_through_trees_and_examine_sister_support_among_triplets(
        tree_paths, groups_of_groups, outgroup_taxa
    )

    # totals per pairing, counted by triplet and by gene (tree)
    (
        triplet_group_counts,
        gene_support_freq,
    ) = self.get_triplet_and_gene_support_freq_counts(sister_summary)

    # chi-square test on both sets of counts
    _triplet_res, gene_support_freq_res = self.chisquare_tests(
        triplet_group_counts, gene_support_freq
    )

    # only the gene-based result is reported; the triplet-based report
    # (print_triplet_based_res) is currently disabled
    self.print_gene_support_freq_res(
        gene_support_freq_res, gene_support_freq, tree_paths
    )
|
|
55
|
+
|
|
56
|
+
def process_args(self, args) -> Dict[str, str]:
    """Pick the arguments this service needs out of the parsed CLI args.

    Returns a dict with the keys ``trees`` and ``groups`` taken from the
    attributes of the same name on ``args``.
    """
    return {"trees": args.trees, "groups": args.groups}
|
|
58
|
+
|
|
59
|
+
def read_in_groups(
    self
) -> List[
    List[
        Union[str, List[str]]
    ]
]:
    """Parse the groups file (``self.groups``) into a list of test records.

    Each non-comment, non-blank line must hold at least five tab-separated
    columns. A line becomes
    ``[test_name, group1, group2, group3, outgroup]`` where the last four
    entries are the ``;``-separated tip names of columns 2-5.

    Lines starting with ``#`` and blank lines are skipped.

    Exits the program with status 2 (after printing a hint) when the file
    does not exist or a line has fewer than five columns.
    """
    groups_arr = []
    try:
        # the context manager closes the handle even on an early exit
        with open(self.groups) as groups_file:
            for line in groups_file:
                line = line.strip()
                # blank lines (e.g. a trailing newline) are not an error
                if not line or line.startswith("#"):
                    continue
                fields = line.split("\t")
                try:
                    groups_arr.append(
                        [fields[0]]
                        + [fields[col].split(";") for col in range(1, 5)]
                    )
                except IndexError:
                    try:
                        print(f"{self.groups} contains an indexing error.")
                        print(
                            "Please format the groups file (-g) as a five column tab-delimited file with column 1 being the name of the test"
                        )
                        print("col2: the tip names of one group (; separated)")
                        print("col3: the tip names of a second group (; separated)")
                        print("col4: the tip names of a third group (; separated)")
                        print(
                            "col5: the tip names of the outgroup taxa (; separated)"
                        )
                    except BrokenPipeError:
                        pass
                    # exit outside the inner try so a broken pipe while
                    # printing cannot turn the failure into a silent skip
                    sys.exit(2)

    except FileNotFoundError:
        try:
            print(f"{self.groups} corresponds to no such file.")
            print("Please check filename and pathing again.")
        except BrokenPipeError:
            pass
        # always exit: continuing would return an empty group list
        sys.exit(2)

    return groups_arr
|
|
105
|
+
|
|
106
|
+
def _process_tree_batch(
    self,
    tree_files_batch: List[str],
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, Dict[str, int]]]:
    """Tally sister relationships for one batch of tree files.

    Used as the worker function of the multiprocessing pool. Every tree in
    ``tree_files_batch`` is read as Newick and passed through
    ``examine_all_triplets_and_sister_pairing``; the accumulated summary
    for the batch is returned.

    A tree that fails to load or process is skipped (best effort), unlike
    the sequential path, which exits on a missing file.
    """
    batch_summary = {}
    for tree_file in tree_files_batch:
        try:
            tree = Phylo.read(tree_file, "newick")
            tips = self.get_tip_names_from_tree(tree)
            batch_summary = self.examine_all_triplets_and_sister_pairing(
                tips, tree_file, batch_summary, groups_of_groups, outgroup_taxa
            )
        except Exception:
            # Skip unreadable/unprocessable trees, but let
            # KeyboardInterrupt and SystemExit propagate (a bare
            # ``except:`` would swallow those as well).
            continue
    return batch_summary
|
|
124
|
+
|
|
125
|
+
def loop_through_trees_and_examine_sister_support_among_triplets(
    self,
    trees_file_path: str,
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[
    str, Dict[str, Dict[str, int]]
]:
    """
    Visit every tree and every triplet in it and record, per tree file,
    how often each sister pairing is observed.

    ``trees_file_path`` is used as a sequence of tree file paths (it is
    measured with ``len`` and sliced), despite its ``str`` annotation.
    Fewer than 10 trees are handled in this process; otherwise the paths
    are split into batches and handed to a multiprocessing pool of at
    most 8 workers, and the per-batch summaries are merged.
    """
    summary = dict()
    tree_total = len(trees_file_path)

    if tree_total >= 10:
        # larger inputs: spread batches of tree files over worker processes
        worker_count = min(mp.cpu_count(), 8)
        chunk = max(1, tree_total // worker_count)
        batches = [
            trees_file_path[start:start + chunk]
            for start in range(0, tree_total, chunk)
        ]

        worker = partial(
            self._process_tree_batch,
            groups_of_groups=groups_of_groups,
            outgroup_taxa=outgroup_taxa,
        )

        with mp.Pool(processes=worker_count) as pool:
            partial_summaries = pool.map(worker, batches)

        # fold every batch summary into the overall one
        for partial_summary in partial_summaries:
            for tree_file, sister_counts in partial_summary.items():
                merged = summary.setdefault(tree_file, {})
                for sisters, count in sister_counts.items():
                    merged[sisters] = merged.get(sisters, 0) + count

        return summary

    # small inputs: process the trees one after another
    for tree_file in trees_file_path:
        try:
            tree = Phylo.read(tree_file, "newick")
            tips = self.get_tip_names_from_tree(tree)
            summary = self.examine_all_triplets_and_sister_pairing(
                tips, tree_file, summary, groups_of_groups, outgroup_taxa
            )
        except FileNotFoundError:
            print(f"{tree_file} corresponds to no such file.")
            print("Please check file name and pathing")
            sys.exit(2)

    return summary
|
|
178
|
+
|
|
179
|
+
def determine_groups_of_groups(
    self,
    groups_arr: List[Union[str, List[str]]],
) -> Tuple[
    Dict[str, List[List[str]]],
    List[str],
]:
    """Split parsed group records into test groups and outgroup taxa.

    Returns ``(groups_of_groups, outgroup_taxa)``: a dict mapping each
    record's name (index 0) to copies of its three group lists (indices
    1-3), and a copy of the outgroup list (index 4) of the last record.

    Side effect: ``self._group_sets_cache`` is reset and filled with the
    same groups as frozensets, keyed by record name.
    """
    groups_of_groups = {}

    # frozenset view of every group, kept on the instance
    self._group_sets_cache = {}

    for group in groups_arr:
        test_name = group[0]
        member_lists = []
        member_sets = []
        for column in (1, 2, 3):
            members = list(group[column])
            member_lists.append(members)
            member_sets.append(frozenset(members))
        groups_of_groups[test_name] = member_lists
        self._group_sets_cache[test_name] = member_sets

    # ``group`` is the record the loop saw last, so only the outgroup of
    # the final record is used
    outgroup_taxa = list(group[4])

    return groups_of_groups, outgroup_taxa
|
|
204
|
+
|
|
205
|
+
@lru_cache(maxsize=1024)
def _get_triplet_tree_cached(self, tips_tuple: tuple, triplet: tuple,
                             tree_file: str, outgroup_tuple: tuple):
    """Memoised wrapper around ``get_triplet_tree``.

    Takes the tips and outgroup taxa as tuples so the arguments are
    hashable, converts them back to lists and delegates.

    NOTE(review): the cache key includes ``self``, so up to 1024 results
    (and the instance) stay referenced by the cache. Calls with equal
    arguments also receive the same result object, which callers in this
    class go on to modify - confirm this is intended.
    """
    return self.get_triplet_tree(
        list(tips_tuple), triplet, tree_file, list(outgroup_tuple)
    )
|
|
212
|
+
|
|
213
|
+
def _process_triplet_batch(
    self,
    triplet_batch: List[Tuple],
    tips: List[str],
    tree_file: str,
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, int]]:
    """Tally sister relationships for one batch of triplets of a tree.

    Each triplet is reduced to a pruned tree via the cached helper. A
    triplet is counted only when the pruned tree has exactly three
    terminals and the triplet touches all three groups of a test.
    Returns the summary accumulated for this batch.
    """
    batch_summary = {}

    # the converted arguments are the same for every triplet
    tips_key = tuple(tips)
    outgroup_key = tuple(outgroup_taxa)

    for triplet in triplet_batch:
        pruned = self._get_triplet_tree_cached(
            tips_key, triplet, tree_file, outgroup_key
        )

        # pruning failed (falsy result) or gave something that is not a tree
        if not (pruned and hasattr(pruned, 'get_terminals')):
            continue
        if len(list(pruned.get_terminals())) != 3:
            continue

        for groups in groups_of_groups.values():
            represented = self.count_number_of_groups_in_triplet(triplet, groups)
            if represented != 3:
                continue
            tip_names = self.get_tip_names_from_tree(pruned)
            self.set_branch_lengths_in_tree_to_one(pruned)
            batch_summary = self.determine_sisters_and_add_to_counter(
                tip_names, pruned, tree_file, groups, batch_summary
            )

    return batch_summary
|
|
246
|
+
|
|
247
|
+
def examine_all_triplets_and_sister_pairing(
    self,
    tips: List[str],
    tree_file: str,
    summary: Dict[
        str, Dict[str, Dict[str, int]]
    ],
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, int]]:
    """
    Evaluate every triplet of one tree for its sister relationship and
    add the observations to ``summary``.

    Triplets are the Cartesian product of the three groups of the FIRST
    entry in ``groups_of_groups``. Fewer than 50 triplets are evaluated
    directly; otherwise they are handled in slices through
    ``_process_triplet_batch`` (still in this process) and the per-slice
    counts are merged into ``summary``. Polytomous triplets are detected
    downstream and not counted.
    """
    # one tip from each group of the first test
    first_test = list(groups_of_groups.keys())[0]
    triplet_tips = list(itertools.product(*groups_of_groups[first_test]))
    triplet_total = len(triplet_tips)

    if triplet_total >= 50:
        # larger inputs: work through the triplets slice by slice
        chunk = max(10, triplet_total // (mp.cpu_count() * 2))

        handle_chunk = partial(
            self._process_triplet_batch,
            tips=tips,
            tree_file=tree_file,
            groups_of_groups=groups_of_groups,
            outgroup_taxa=outgroup_taxa
        )

        for start in range(0, triplet_total, chunk):
            chunk_summary = handle_chunk(triplet_tips[start:start + chunk])
            for tree_file_key, sister_counts in chunk_summary.items():
                merged = summary.setdefault(tree_file_key, {})
                for sisters, count in sister_counts.items():
                    merged[sisters] = merged.get(sisters, 0) + count

        return summary

    # small inputs: evaluate each triplet directly
    for triplet in triplet_tips:
        tree = self.get_triplet_tree(tips, triplet, tree_file, outgroup_taxa)

        # pruning failed (falsy result) or gave something that is not a tree
        if not (tree and hasattr(tree, 'get_terminals')):
            continue
        if len(list(tree.get_terminals())) != 3:
            continue

        for groups in groups_of_groups.values():
            represented = self.count_number_of_groups_in_triplet(triplet, groups)
            if represented != 3:
                continue
            tip_names = self.get_tip_names_from_tree(tree)
            self.set_branch_lengths_in_tree_to_one(tree)
            summary = self.determine_sisters_and_add_to_counter(
                tip_names, tree, tree_file, groups, summary
            )

    return summary
|
|
308
|
+
|
|
309
|
+
@lru_cache(maxsize=4096)
|
|
310
|
+
def _count_groups_cached(self, triplet_tuple: tuple, groups_tuple: tuple) -> int:
|
|
311
|
+
"""Cached version of group counting."""
|
|
312
|
+
triplet_set = set(triplet_tuple)
|
|
313
|
+
num_groups_represented = sum(
|
|
314
|
+
1 for group in groups_tuple if triplet_set.intersection(group)
|
|
315
|
+
)
|
|
316
|
+
return num_groups_represented
|
|
317
|
+
|
|
318
|
+
def count_number_of_groups_in_triplet(
    self,
    triplet: Tuple[str, str, str],
    groups: List[List[str]]
) -> int:
    """
    Return how many of ``groups`` are represented in ``triplet``.

    The groups are frozen into a hashable tuple of frozensets so the
    memoised helper can be used for the actual counting.
    """
    frozen_groups = tuple(map(frozenset, groups))
    return self._count_groups_cached(triplet, frozen_groups)
|
|
329
|
+
|
|
330
|
+
def set_branch_lengths_in_tree_to_one(
    self,
    tree: Newick.Tree
) -> None:
    """Set the branch length of every clade in ``tree`` to 1, in place.

    With unit branch lengths, the distance between two tips is the number
    of branches on the path that joins them.
    """
    unit_length = 1
    for node in tree.find_clades():
        node.branch_length = unit_length
|
|
337
|
+
|
|
338
|
+
def check_if_triplet_is_a_polytomy(self, tree: Newick.Tree) -> bool:
    """
    Report whether a triplet tree is a polytomy.

    Counts the non-terminal clades of ``tree``; exactly one means all
    tips hang off a single internal node, i.e. a polytomy.
    """
    internal_nodes = 0
    for _ in tree.get_nonterminals():
        internal_nodes += 1
    return internal_nodes == 1
|
|
345
|
+
|
|
346
|
+
def sister_relationship_counter(
    self,
    tree_file: str,
    summary: Dict[str, Dict[str, int]],
    sisters: str,
) -> Dict[str, Dict[str, int]]:
    """
    Add one observation of ``sisters`` for ``tree_file`` to ``summary``.

    ``summary`` maps ``str(tree_file)`` to a dict of sister pairing ->
    count. Missing entries are created on demand. The dict is updated in
    place and also returned.
    """
    # Normalise the key once. Testing membership with the raw value while
    # storing under ``str(tree_file)`` would reset the per-tree counts on
    # every call for non-str keys such as pathlib.Path.
    tree_key = str(tree_file)
    per_tree = summary.setdefault(tree_key, {})
    per_tree[sisters] = per_tree.get(sisters, 0) + 1

    return summary
|
|
365
|
+
|
|
366
|
+
def get_triplet_tree(
    self,
    tips: List[str],
    triplet: Tuple[str, str, str],
    tree_file: str,
    outgroup_taxa: List[str],
) -> Newick.Tree:
    """
    Build a tree that contains only the tips of ``triplet``.

    The tree is read afresh from ``tree_file``, rooted on the outgroup
    taxa present among ``tips`` and pruned of every tip outside the
    triplet. Returns ``False`` instead of a tree when rooting or pruning
    raises ``ValueError``.
    """
    # everything outside the triplet has to go
    triplet_members = set(list(triplet))
    tips_to_prune = list(set(tips) - triplet_members)
    # outgroup taxa that actually occur in this tree, in tip order
    outgroup_present = [tip for tip in tips if tip in outgroup_taxa]
    tree = Phylo.read(tree_file, "newick")

    try:
        # root first, then prune down to the triplet
        tree.root_with_outgroup(outgroup_present)
        return self.prune_tree_using_taxa_list(tree, tips_to_prune)
    except ValueError:
        return False
|
|
393
|
+
|
|
394
|
+
@lru_cache(maxsize=4096)
|
|
395
|
+
def _determine_sisters_cached(self, groups_tuple: tuple, pair: tuple) -> str:
|
|
396
|
+
"""Cached version of sister determination."""
|
|
397
|
+
# Find which group each pair member belongs to
|
|
398
|
+
idx0 = next(i for i, group in enumerate(groups_tuple) if pair[0] in group)
|
|
399
|
+
idx1 = next(i for i, group in enumerate(groups_tuple) if pair[1] in group)
|
|
400
|
+
|
|
401
|
+
# Sort and format the result
|
|
402
|
+
sisters = sorted([idx0, idx1])
|
|
403
|
+
return f"{sisters[0]}-{sisters[1]}"
|
|
404
|
+
|
|
405
|
+
def determine_sisters_from_triplet(
    self,
    groups: List[List[str]],
    pair: Tuple[str]
) -> str:
    """
    Return the sister label (e.g. ``"0-2"``) for a pair of tips.

    The groups are frozen into a hashable tuple of frozensets so the
    memoised helper can do the lookup.
    """
    frozen_groups = tuple(map(frozenset, groups))
    return self._determine_sisters_cached(frozen_groups, pair)
|
|
416
|
+
|
|
417
|
+
def determine_sisters_and_add_to_counter(
    self,
    tip_names: List[str],
    tree: Newick.Tree,
    tree_file: str,
    groups: List[List[str]],
    summary: Dict[str, Dict[str, int]],
) -> Dict[str, Dict[str, int]]:
    """
    Find the sister pair of a triplet tree and count it in ``summary``.

    Every pair of ``tip_names`` is inspected. A pair counts as sisters
    when the distance between its tips is 2 and the triplet is not a
    polytomy; callers set all branch lengths to 1 beforehand, so a
    distance of 2 means the tips share their parent. Returns the updated
    summary.
    """
    # The polytomy status belongs to the tree, not to a pair, so it is
    # computed once instead of once per pair.
    is_polytomy = self.check_if_triplet_is_a_polytomy(tree)

    for pair in itertools.combinations(tip_names, 2):
        if tree.distance(pair[0], pair[1]) == 2 and not is_polytomy:
            # translate the two tips into their group indices ...
            sisters = self.determine_sisters_from_triplet(groups, pair)
            # ... and record one more observation of that pairing
            summary = self.sister_relationship_counter(tree_file, summary, sisters)
    return summary
|
|
443
|
+
|
|
444
|
+
def get_triplet_and_gene_support_freq_counts(
    self,
    summary: Dict[str, Dict[str, int]]
) -> Tuple[
    Dict[str, int], Dict[str, int]
]:
    """
    Total the support for each sister pairing, by triplet and by gene.

    Returns ``(triplet_group_counts, gene_support_freq)``. The first sums
    the per-tree counts of "0-1", "0-2" and "1-2"; the second counts, per
    pairing, the trees in which that pairing has the highest count (the
    first maximum in dict order wins ties).

    Side effect: pairings missing from a tree's entry in ``summary`` are
    added there with a count of 0.
    """
    triplet_group_counts = {"g0g1_count": 0, "g0g2_count": 0, "g1g2_count": 0}
    gene_support_freq = {"0-1": 0, "1-2": 0, "0-2": 0}

    for sister_counts in summary.values():
        # make sure all three pairings exist, even if never observed
        for pairing in ("0-1", "0-2", "1-2"):
            sister_counts.setdefault(pairing, 0)

        # running totals of triplets supporting each pairing
        triplet_group_counts["g0g1_count"] += sister_counts["0-1"]
        triplet_group_counts["g0g2_count"] += sister_counts["0-2"]
        triplet_group_counts["g1g2_count"] += sister_counts["1-2"]

        # the best supported pairing of this gene gets one vote
        best_pairing = max(sister_counts, key=sister_counts.get)
        gene_support_freq[best_pairing] += 1

    return triplet_group_counts, gene_support_freq
|
|
478
|
+
|
|
479
|
+
def chisquare_tests(
    self,
    triplet_group_counts: dict,
    gene_support_freq: dict
) -> Tuple[
    _stats_py.Power_divergenceResult,
    _stats_py.Power_divergenceResult,
]:
    """Run a chi-square test on the triplet counts and on the gene counts.

    Both tests call ``scipy.stats.chisquare`` with the three observed
    counts only, in the order 0-1, 0-2, 1-2. Returns
    ``(triplet_res, gene_support_freq_res)``.
    """
    triplet_observed = [
        triplet_group_counts[key]
        for key in ("g0g1_count", "g0g2_count", "g1g2_count")
    ]
    triplet_res = chisquare(triplet_observed)

    gene_observed = [gene_support_freq[key] for key in ("0-1", "0-2", "1-2")]
    gene_support_freq_res = chisquare(gene_observed)

    return triplet_res, gene_support_freq_res
|
|
504
|
+
|
|
505
|
+
# def print_triplet_based_res(
|
|
506
|
+
# self,
|
|
507
|
+
# triplet_res,
|
|
508
|
+
# triplet_group_counts: dict
|
|
509
|
+
# ) -> None:
|
|
510
|
+
# """
|
|
511
|
+
# print results to stdout for user
|
|
512
|
+
# """
|
|
513
|
+
# try:
|
|
514
|
+
# print(f"\nTriplet Results")
|
|
515
|
+
# print(f"===============")
|
|
516
|
+
# print(f"chi-squared: {round(triplet_res.statistic, 4)}")
|
|
517
|
+
# print(f"p-value: {round(triplet_res.pvalue, 6)}")
|
|
518
|
+
# print(f"total triplets: {sum(triplet_group_counts.values())}")
|
|
519
|
+
# print(f"0-1: {triplet_group_counts['g0g1_count']}")
|
|
520
|
+
# print(f"0-2: {triplet_group_counts['g0g2_count']}")
|
|
521
|
+
# print(f"1-2: {triplet_group_counts['g1g2_count']}")
|
|
522
|
+
# except BrokenPipeError:
|
|
523
|
+
# pass
|
|
524
|
+
|
|
525
|
+
def print_gene_support_freq_res(
    self,
    gene_support_freq_res,
    gene_support_freq: Dict[str, int],
    trees_file_path: List[str],
) -> None:
    """
    Print the gene support frequency report to stdout.

    Shows the chi-squared statistic (4 decimals), the p-value (6
    decimals), the number of genes counted and the count per pairing.
    ``trees_file_path`` is accepted but not used. A ``BrokenPipeError``
    while printing is ignored.
    """
    try:
        print("Gene Support Frequency Results")
        print("==============================")
        print(f"chi-squared: {round(gene_support_freq_res.statistic, 4)}")
        print(f"p-value: {round(gene_support_freq_res.pvalue, 6)}")
        total_genes = (
            gene_support_freq['0-1']
            + gene_support_freq['0-2']
            + gene_support_freq['1-2']
        )
        print(f"total genes: {total_genes}")
        for pairing in ("0-1", "0-2", "1-2"):
            print(f"{pairing}: {gene_support_freq[pairing]}")
    except BrokenPipeError:
        pass
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
from Bio import Phylo
|
|
5
|
+
|
|
6
|
+
from .base import Tree
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PrintTree(Tree):
    """Draw a tree as ASCII art, optionally without branch lengths."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self):
        """Read the tree and print an ASCII rendering of it.

        When ``self.remove`` is truthy, the branch lengths of all clades
        are cleared on a copy of the tree before drawing. A
        ``BrokenPipeError`` while drawing is ignored.
        """
        tree = self.read_tree_file()

        if self.remove:
            # work on a deep copy so the tree returned by read_tree_file
            # (presumably cached) is left untouched
            tree = copy.deepcopy(tree)
            every_node = [*tree.get_terminals(), *tree.get_nonterminals()]
            for node in every_node:
                node.branch_length = None

        try:
            Phylo.draw_ascii(tree)
        except BrokenPipeError:
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI args to the keyword arguments of the initialiser."""
        return {"tree_file_path": args.tree, "remove": args.remove}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
from .base import Tree
|
|
5
|
+
|
|
6
|
+
from ...helpers.files import read_single_column_file_to_list
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PruneTree(Tree):
    """Prune tips from a tree and write the result to a file."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Prune the listed taxa (or all others) and write the tree.

        The taxa file is read into a list of names. With ``self.keep``
        set, the selection is inverted: every tip NOT in the list is
        pruned. The pruned copy is written to ``self.output_file_path``.
        """
        tree = self.read_tree_file()
        # Make a deep copy to avoid modifying the cached tree
        tree_copy = copy.deepcopy(tree)

        taxa = read_single_column_file_to_list(self.list_of_taxa)

        if self.keep:
            # Build the lookup set once; testing every tip against the
            # list would cost O(tips * taxa).
            taxa_to_keep = set(taxa)
            taxa = [
                term.name
                for term in tree_copy.get_terminals()
                if term.name not in taxa_to_keep
            ]

        tree_copy = self.prune_tree_using_taxa_list(tree_copy, taxa)

        self.write_tree_file(tree_copy, self.output_file_path)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI args to the keyword arguments of the initialiser.

        The output path defaults to ``<tree>.pruned``; ``keep`` defaults
        to ``True`` when ``args.keep`` is ``None``.
        """
        tree_file_path = args.tree
        output_file_path = \
            f"{args.output}" if args.output else f"{tree_file_path}.pruned"

        keep = True if args.keep is None else args.keep

        return dict(
            tree_file_path=tree_file_path,
            list_of_taxa=args.list_of_taxa,
            output_file_path=output_file_path,
            keep=keep,
        )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import copy
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
from Bio.Phylo import Newick
|
|
6
|
+
|
|
7
|
+
from .base import Tree
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RenameTreeTips(Tree):
    """Rename the tips of a tree according to an id-map file."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Rename tips on a copy of the tree and write it out."""
        tree = self.read_tree_file()
        # Make a deep copy to avoid modifying the cached tree
        tree_copy = copy.deepcopy(tree)

        idmap = self.read_id_map()

        tree_copy = self.replace_tip_names(tree_copy, idmap)

        self.write_tree_file(tree_copy, self.output_file_path)

    def process_args(self, args) -> Dict[str, str]:
        """Map parsed CLI args to the keyword arguments of the initialiser.

        The output path defaults to ``<tree>.renamed``.
        """
        tree_file_path = args.tree

        output_file_path = \
            f"{args.output}" if args.output else f"{tree_file_path}.renamed"

        return dict(
            tree_file_path=tree_file_path,
            idmap=args.idmap,
            output_file_path=output_file_path,
        )

    def read_id_map(self) -> Dict[str, str]:
        """Read ``self.idmap`` into a dict of old name -> new name.

        Every non-blank line must consist of exactly two
        whitespace-separated fields; any other line raises ``ValueError``.
        Blank lines are skipped. Exits with status 2 when the file does
        not exist.
        """
        idmap = dict()
        try:
            with open(self.idmap) as identifiers:
                for line in identifiers:
                    # a blank line (e.g. at the end of the file) is not an
                    # entry and must not break the two-field unpacking
                    if not line.strip():
                        continue
                    (key, val) = line.split()
                    idmap[key] = val
        except FileNotFoundError:
            try:
                print(f"{self.idmap} corresponds to no such file.")
                print("Please check file name and pathing")
            except BrokenPipeError:
                pass
            # exit outside the inner try: a broken pipe while printing must
            # not let the run continue with an empty map
            sys.exit(2)

        return idmap

    def replace_tip_names(
        self,
        tree: Newick.Tree,
        idmap: Dict[str, str]
    ) -> Newick.Tree:
        """Rename, in place, every terminal whose name is a key of ``idmap``.

        Terminals without an entry keep their name. Returns the same tree
        object.
        """
        for term in tree.get_terminals():
            name = term.name
            if name in idmap:
                term.name = idmap[name]

        return tree
|