phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,546 @@
1
+ import sys
2
+ import itertools
3
+ from scipy.stats import chisquare
4
+ from scipy.stats import _stats_py
5
+ from typing import Dict, List, Tuple, Union
6
+ import multiprocessing as mp
7
+ from functools import partial, lru_cache
8
+ import hashlib
9
+ import pickle
10
+
11
+ from Bio import Phylo
12
+ from Bio.Phylo import Newick
13
+ import numpy as np
14
+
15
+ from .base import Tree
16
+ from ...helpers.files import read_single_column_file_to_list
17
+
18
+
19
+ class PolytomyTest(Tree):
20
def __init__(self, args) -> None:
    """Initialise the service from parsed command-line arguments.

    ``args`` is converted to keyword arguments by ``process_args`` and
    those are forwarded to the parent initialiser.
    """
    init_kwargs = self.process_args(args)
    super().__init__(**init_kwargs)
22
+
23
def run(self):
    """Run the polytomy test end to end and print the gene-level result.

    Reads the groups file, derives the test groups and outgroup taxa,
    reads the list of tree files, tallies sister relationships per tree,
    runs the chi-square tests and prints the gene support frequency
    report to stdout.
    """
    # groups file -> {test name: [group0, group1, group2]} plus outgroup taxa
    test_groups, outgroup = self.determine_groups_of_groups(
        self.read_in_groups()
    )

    # one tree file path per entry
    tree_paths = read_single_column_file_to_list(self.trees)

    # per tree, how often each pair of groups was recovered as sisters
    sister_summary = (
        self.loop_through_trees_and_examine_sister_support_among_triplets(
            tree_paths, test_groups, outgroup
        )
    )

    counts = self.get_triplet_and_gene_support_freq_counts(sister_summary)
    triplet_counts, gene_counts = counts

    # Both chi-square results are computed, but only the gene-level one is
    # reported; the triplet-level printer is currently disabled.
    _triplet_res, gene_res = self.chisquare_tests(triplet_counts, gene_counts)

    self.print_gene_support_freq_res(gene_res, gene_counts, tree_paths)
55
+
56
def process_args(self, args) -> Dict[str, str]:
    """Pick the ``trees`` and ``groups`` options out of ``args``."""
    return {"trees": args.trees, "groups": args.groups}
58
+
59
def read_in_groups(
    self
) -> List[
    List[
        Union[str, List[str]]
    ]
]:
    """Parse the groups file found at ``self.groups``.

    Every data line is split on tabs into five columns: the test name,
    three ``;``-separated groups of tip names and the ``;``-separated
    outgroup tip names. Lines starting with ``#`` and blank lines are
    skipped.

    Returns one ``[name, group0, group1, group2, outgroup]`` entry per
    data line.

    Exits with status 2 when the file does not exist or when a data line
    has fewer than five columns.
    """
    groups_arr = []
    try:
        # context manager: the handle is closed even when we exit early
        with open(self.groups) as groups_file:
            for line in groups_file:
                line = line.strip()
                # a blank line (e.g. a trailing newline) is not a test
                if not line or line.startswith("#"):
                    continue

                fields = line.split("\t")
                if len(fields) < 5:
                    try:
                        print(f"{self.groups} contains an indexing error.")
                        print(
                            "Please format the groups file (-g) as a four column tab-delimited file with column 1 being the name of the test"
                        )
                        print("col2: the tip names of one group (; separated)")
                        print("col3: the tip names of a second group (; separated)")
                        print("col4: the tip names of a third group (; separated)")
                        print(
                            "col5: the tip names of the outgroup taxa (; separated)"
                        )
                    except BrokenPipeError:
                        pass
                    # exit even if stdout is gone; a malformed file must
                    # never be processed silently
                    sys.exit(2)

                entry = [fields[0]]
                entry.extend(fields[col].split(";") for col in range(1, 5))
                groups_arr.append(entry)

    except FileNotFoundError:
        try:
            print(f"{self.groups} corresponds to no such file.")
            print("Please check filename and pathing again.")
        except BrokenPipeError:
            pass
        sys.exit(2)

    return groups_arr
105
+
106
def _process_tree_batch(
    self,
    tree_files_batch: List[str],
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, int]]:
    """Tally sister relationships for one batch of tree files.

    Trees that cannot be read or processed are skipped so that one bad
    file does not discard the rest of the batch; every skip is reported
    on stderr.

    Returns a mapping of tree file -> sister pair label -> count.
    """
    batch_summary = {}
    for tree_file in tree_files_batch:
        try:
            tree = Phylo.read(tree_file, "newick")
            tips = self.get_tip_names_from_tree(tree)
            batch_summary = self.examine_all_triplets_and_sister_pairing(
                tips, tree_file, batch_summary, groups_of_groups, outgroup_taxa
            )
        except (Exception, SystemExit) as error:
            # NOTE(review): SystemExit is still caught, as the previous bare
            # ``except`` did, because an exit escaping a pool task is
            # presumably not reported back to the parent - confirm.
            # KeyboardInterrupt is no longer swallowed.
            print(f"Skipping {tree_file}: {error!r}", file=sys.stderr)
            continue
    return batch_summary
124
+
125
def loop_through_trees_and_examine_sister_support_among_triplets(
    self,
    trees_file_path: List[str],
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, int]]:
    """
    go through all trees and all triplets of all trees. For each triplet,
    determine which two taxa are sister to one another

    ``trees_file_path`` is the list of tree file paths. Fewer than ten
    trees are processed in this process; larger inputs are split into
    batches and handed to a multiprocessing pool of at most eight workers.

    Returns a mapping of tree file -> sister pair label -> count.
    Exits with status 2 when a tree file does not exist.
    """
    # local import: os is only needed for the up-front existence check
    import os

    summary = dict()

    # For small datasets, process sequentially
    if len(trees_file_path) < 10:
        for tree_file in trees_file_path:
            try:
                tree = Phylo.read(tree_file, "newick")
                tips = self.get_tip_names_from_tree(tree)
                summary = self.examine_all_triplets_and_sister_pairing(
                    tips, tree_file, summary, groups_of_groups, outgroup_taxa
                )
            except FileNotFoundError:
                print(f"{tree_file} corresponds to no such file.")
                print("Please check file name and pathing")
                sys.exit(2)
        return summary

    # The batch worker skips trees it cannot read, so a missing file would
    # silently shrink the gene count. Check up front so a missing file is
    # fatal here exactly as it is on the sequential path.
    for tree_file in trees_file_path:
        if not os.path.exists(tree_file):
            print(f"{tree_file} corresponds to no such file.")
            print("Please check file name and pathing")
            sys.exit(2)

    # Use multiprocessing for larger datasets
    num_workers = min(mp.cpu_count(), 8)
    batch_size = max(1, len(trees_file_path) // num_workers)
    tree_batches = [
        trees_file_path[start:start + batch_size]
        for start in range(0, len(trees_file_path), batch_size)
    ]

    process_func = partial(
        self._process_tree_batch,
        groups_of_groups=groups_of_groups,
        outgroup_taxa=outgroup_taxa,
    )

    with mp.Pool(processes=num_workers) as pool:
        batch_results = pool.map(process_func, tree_batches)

    # Merge the per-batch tallies into one summary
    for batch_summary in batch_results:
        for tree_file, tree_data in batch_summary.items():
            merged = summary.setdefault(tree_file, {})
            for sisters, count in tree_data.items():
                merged[sisters] = merged.get(sisters, 0) + count

    return summary
178
+
179
def determine_groups_of_groups(
    self,
    groups_arr: List[List[Union[str, List[str]]]],
) -> Tuple[
    Dict[str, List[List[str]]],
    List[str],
]:
    """Split parsed group entries into test groups and outgroup taxa.

    Each entry of ``groups_arr`` is ``[name, group0, group1, group2,
    outgroup]``. Returns ``(groups_of_groups, outgroup_taxa)`` where
    ``groups_of_groups`` maps each test name to its three tip-name lists
    and ``outgroup_taxa`` comes from the last entry. An empty
    ``groups_arr`` yields ``({}, [])``.

    Also stores the groups as frozensets, keyed by test name, in
    ``self._group_sets_cache``.
    """
    groups_of_groups = {}
    self._group_sets_cache = {}
    # stays empty when there are no entries; previously the code read the
    # loop variable after the loop and failed on empty input
    outgroup_taxa = []

    for group in groups_arr:
        test_name = group[0]
        taxa_lists = [list(group[column]) for column in range(1, 4)]
        groups_of_groups[test_name] = taxa_lists
        self._group_sets_cache[test_name] = [
            frozenset(taxa) for taxa in taxa_lists
        ]
        # the last entry wins, as before
        outgroup_taxa = list(group[4])

    return groups_of_groups, outgroup_taxa
204
+
205
@lru_cache(maxsize=1024)
def _get_triplet_tree_cached(self, tips_tuple: tuple, triplet: tuple,
                             tree_file: str, outgroup_tuple: tuple):
    """Memoised front end for ``get_triplet_tree``.

    Tip names and outgroup taxa arrive as tuples so that the arguments are
    hashable; they are turned back into lists before delegating. Repeated
    calls with identical arguments receive the same cached object.
    """
    return self.get_triplet_tree(
        list(tips_tuple), triplet, tree_file, list(outgroup_tuple)
    )
212
+
213
def _process_triplet_batch(
    self,
    triplet_batch: List[Tuple],
    tips: List[str],
    tree_file: str,
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, int]]:
    """Tally sister relationships for a batch of triplets of one tree.

    For each triplet the tree is pruned down to that triplet (through the
    cached pruning helper). Triplets whose pruned tree is unavailable or
    does not have exactly three terminals are ignored. For every test
    whose three groups are all represented in the triplet, the sister pair
    is determined and counted.

    Returns a mapping of tree file -> sister pair label -> count.
    """
    tallies = {}
    tips_key = tuple(tips)
    outgroup_key = tuple(outgroup_taxa)

    for triplet in triplet_batch:
        pruned = self._get_triplet_tree_cached(
            tips_key, triplet, tree_file, outgroup_key
        )

        # the pruning helper hands back a falsy value when it fails
        if not pruned or not hasattr(pruned, 'get_terminals'):
            continue
        if len(list(pruned.get_terminals())) != 3:
            continue

        for groups in groups_of_groups.values():
            if self.count_number_of_groups_in_triplet(triplet, groups) != 3:
                continue
            names = self.get_tip_names_from_tree(pruned)
            # unit branch lengths are set before sister pairs are determined
            self.set_branch_lengths_in_tree_to_one(pruned)
            tallies = self.determine_sisters_and_add_to_counter(
                names, pruned, tree_file, groups, tallies
            )

    return tallies
246
+
247
def examine_all_triplets_and_sister_pairing(
    self,
    tips: List[str],
    tree_file: str,
    summary: Dict[str, Dict[str, int]],
    groups_of_groups: Dict[str, List[List[str]]],
    outgroup_taxa: List[str],
) -> Dict[str, Dict[str, int]]:
    """
    evaluate all triplets for sister relationships. Polytomies
    in input trees are accounted for

    Triplets are formed by taking one tip from each of the three groups
    of the first test in ``groups_of_groups``. ``summary`` (tree file ->
    sister pair label -> count) is updated and returned. When
    ``groups_of_groups`` is empty there is nothing to evaluate and
    ``summary`` is returned unchanged.
    """
    # no test defined -> no triplets can be formed
    if not groups_of_groups:
        return summary

    # get all combinations of three tips, one per group of the first test
    identifier = next(iter(groups_of_groups))
    triplet_tips = list(itertools.product(*groups_of_groups[identifier]))

    # For small datasets, process triplets one by one
    if len(triplet_tips) < 50:
        for triplet in triplet_tips:
            tree = self.get_triplet_tree(tips, triplet, tree_file, outgroup_taxa)
            # get_triplet_tree returns False when rooting or pruning fails
            if not tree or not hasattr(tree, 'get_terminals'):
                continue
            if len(list(tree.get_terminals())) != 3:
                continue
            for groups in groups_of_groups.values():
                if self.count_number_of_groups_in_triplet(triplet, groups) != 3:
                    continue
                tip_names = self.get_tip_names_from_tree(tree)
                self.set_branch_lengths_in_tree_to_one(tree)
                summary = self.determine_sisters_and_add_to_counter(
                    tip_names, tree, tree_file, groups, summary
                )
        return summary

    # Larger datasets go through the batch helper. The batches are still
    # processed one after another in this process; the CPU count is only
    # used to size them.
    batch_size = max(10, len(triplet_tips) // (mp.cpu_count() * 2))
    for start in range(0, len(triplet_tips), batch_size):
        batch_summary = self._process_triplet_batch(
            triplet_tips[start:start + batch_size],
            tips=tips,
            tree_file=tree_file,
            groups_of_groups=groups_of_groups,
            outgroup_taxa=outgroup_taxa,
        )
        # merge the batch tallies into the running summary
        for tree_file_key, tree_data in batch_summary.items():
            merged = summary.setdefault(tree_file_key, {})
            for sisters, count in tree_data.items():
                merged[sisters] = merged.get(sisters, 0) + count

    return summary
308
+
309
@lru_cache(maxsize=4096)
def _count_groups_cached(self, triplet_tuple: tuple, groups_tuple: tuple) -> int:
    """Return how many groups share at least one member with the triplet.

    Both arguments must be hashable because results are memoised.
    """
    members = frozenset(triplet_tuple)
    represented = [
        group for group in groups_tuple if not members.isdisjoint(group)
    ]
    return len(represented)
317
+
318
def count_number_of_groups_in_triplet(
    self,
    triplet: Tuple[str, str, str],
    groups: List[List[str]]
) -> int:
    """
    determine how many groups are represented in a triplet

    The groups are frozen into a tuple of frozensets so the memoised
    counting helper can use them as a cache key.
    """
    hashable_groups = tuple(map(frozenset, groups))
    return self._count_groups_cached(triplet, hashable_groups)
329
+
330
def set_branch_lengths_in_tree_to_one(
    self,
    tree: Newick.Tree
) -> None:
    """Overwrite the branch length of every clade of ``tree`` with 1.

    The tree is modified in place; nothing is returned.
    """
    unit_length = 1
    for node in tree.find_clades():
        node.branch_length = unit_length
337
+
338
def check_if_triplet_is_a_polytomy(self, tree: Newick.Tree) -> bool:
    """
    report whether the triplet tree is a polytomy, i.e. whether it has
    exactly one non-terminal clade
    """
    internal_nodes = list(tree.get_nonterminals())
    return len(internal_nodes) == 1
345
+
346
def sister_relationship_counter(
    self,
    tree_file: str,
    summary: Dict[str, Dict[str, int]],
    sisters: str,
) -> Dict[str, Dict[str, int]]:
    """
    counter for how many times a particular sister relationship is observed

    Adds one to ``summary[str(tree_file)][sisters]``, creating the nested
    entries when needed, and returns ``summary``.
    """
    # Normalise the key once so that lookup and storage agree. Previously
    # the membership test used the raw value while storage used its string
    # form, so a non-string tree_file reset its counts on every call.
    tree_key = str(tree_file)
    per_tree = summary.setdefault(tree_key, {})
    per_tree[sisters] = per_tree.get(sisters, 0) + 1

    return summary
365
+
366
def get_triplet_tree(
    self,
    tips: List[str],
    triplet: Tuple[str, str, str],
    tree_file: str,
    outgroup_taxa: List[str],
) -> Newick.Tree:
    """
    get a tree object of only the triplet of interest

    The tree is read afresh from ``tree_file``, rooted on the outgroup
    taxa that are present among ``tips`` and pruned of every tip outside
    ``triplet``. Returns ``False`` when rooting or pruning raises
    ``ValueError``.
    """
    # everything that is not part of the triplet has to go
    triplet_members = set(triplet)
    tips_to_prune = list(set(tips) - triplet_members)

    # outgroup taxa that actually occur in this tree
    outgroup_present = [tip for tip in tips if tip in outgroup_taxa]

    tree = Phylo.read(tree_file, "newick")

    try:
        # root first, then reduce the tree to the three tips of interest
        tree.root_with_outgroup(outgroup_present)
        return self.prune_tree_using_taxa_list(tree, tips_to_prune)
    except ValueError:
        return False
393
+
394
@lru_cache(maxsize=4096)
def _determine_sisters_cached(self, groups_tuple: tuple, pair: tuple) -> str:
    """Return the ``"<low>-<high>"`` label for the groups of a sister pair.

    Each member of ``pair`` is looked up in ``groups_tuple``; the index of
    the first group containing it is used. The two indices are reported
    in ascending order. Results are memoised.
    """
    first = next(
        index for index, members in enumerate(groups_tuple)
        if pair[0] in members
    )
    second = next(
        index for index, members in enumerate(groups_tuple)
        if pair[1] in members
    )

    if first <= second:
        low, high = first, second
    else:
        low, high = second, first
    return f"{low}-{high}"
404
+
405
def determine_sisters_from_triplet(
    self,
    groups: List[List[str]],
    pair: Tuple[str]
) -> str:
    """
    determine sister taxa from a triplet

    The groups are frozen into a tuple of frozensets so the memoised
    helper can use them as a cache key; its label is returned as is.
    """
    hashable_groups = tuple(map(frozenset, groups))
    return self._determine_sisters_cached(hashable_groups, pair)
416
+
417
def determine_sisters_and_add_to_counter(
    self,
    tip_names: List[str],
    tree: Newick.Tree,
    tree_file: str,
    groups: List[List[str]],
    summary: Dict[str, Dict[str, int]],
) -> Dict[str, Dict[str, int]]:
    """
    determine which pair of taxa are sister to one another
    and add 1 to the counter for the sister pair

    A pair counts as sisters when the distance between the two tips in
    ``tree`` equals 2 and the tree is not a polytomy. ``summary`` is
    updated through ``sister_relationship_counter`` and returned.
    """
    # The polytomy check does not depend on the pair, so it is done once.
    # A polytomy never yields a sister pair, so no distances are computed.
    if self.check_if_triplet_is_a_polytomy(tree):
        return summary

    for pair in itertools.combinations(tip_names, 2):
        if tree.distance(pair[0], pair[1]) == 2:
            # determine which two groups the sister tips belong to
            sisters = self.determine_sisters_from_triplet(groups, pair)
            # record one more observation of that sister relationship
            summary = self.sister_relationship_counter(tree_file, summary, sisters)
    return summary
443
+
444
def get_triplet_and_gene_support_freq_counts(
    self,
    summary: Dict[str, Dict[str, int]]
) -> Tuple[
    Dict[str, int], Dict[str, int]
]:
    """
    count how many triplets and genes support the various sister relationships

    Returns ``(triplet_group_counts, gene_support_freq)``. The first sums,
    over all trees, the triplet counts per sister pair; the second counts,
    per sister pair, the trees in which that pair has the highest count.
    ``summary`` is modified in place: pairs never observed for a tree are
    added with a count of 0.
    """
    # sister pair label -> key in the triplet totals
    total_key_for = (
        ("0-1", "g0g1_count"),
        ("0-2", "g0g2_count"),
        ("1-2", "g1g2_count"),
    )
    triplet_group_counts = {"g0g1_count": 0, "g0g2_count": 0, "g1g2_count": 0}
    gene_support_freq = {"0-1": 0, "1-2": 0, "0-2": 0}

    for per_tree in summary.values():
        # make sure every pair has an entry, even if it was never seen
        for label, _ in total_key_for:
            per_tree.setdefault(label, 0)

        # running totals of triplets supporting each sister pair
        for label, total_key in total_key_for:
            triplet_group_counts[total_key] += per_tree[label]

        # the best supported pair of this gene gets one vote; on a tie
        # the pair that comes first in the tree's dict wins
        best_pair = max(per_tree, key=per_tree.get)
        gene_support_freq[best_pair] += 1

    return triplet_group_counts, gene_support_freq
478
+
479
def chisquare_tests(
    self,
    triplet_group_counts: dict,
    gene_support_freq: dict
) -> Tuple[
    _stats_py.Power_divergenceResult,
    _stats_py.Power_divergenceResult,
]:
    """Run a chi-square test on the triplet counts and on the gene counts.

    The observed frequencies are passed to ``scipy.stats.chisquare`` in
    the order 0-1, 0-2, 1-2 without explicit expected frequencies.
    Returns ``(triplet result, gene support result)``.
    """
    triplet_observed = [
        triplet_group_counts[key]
        for key in ("g0g1_count", "g0g2_count", "g1g2_count")
    ]
    triplet_res = chisquare(triplet_observed)

    gene_observed = [gene_support_freq[key] for key in ("0-1", "0-2", "1-2")]
    gene_support_freq_res = chisquare(gene_observed)

    return triplet_res, gene_support_freq_res
504
+
505
+ # def print_triplet_based_res(
506
+ # self,
507
+ # triplet_res,
508
+ # triplet_group_counts: dict
509
+ # ) -> None:
510
+ # """
511
+ # print results to stdout for user
512
+ # """
513
+ # try:
514
+ # print(f"\nTriplet Results")
515
+ # print(f"===============")
516
+ # print(f"chi-squared: {round(triplet_res.statistic, 4)}")
517
+ # print(f"p-value: {round(triplet_res.pvalue, 6)}")
518
+ # print(f"total triplets: {sum(triplet_group_counts.values())}")
519
+ # print(f"0-1: {triplet_group_counts['g0g1_count']}")
520
+ # print(f"0-2: {triplet_group_counts['g0g2_count']}")
521
+ # print(f"1-2: {triplet_group_counts['g1g2_count']}")
522
+ # except BrokenPipeError:
523
+ # pass
524
+
525
def print_gene_support_freq_res(
    self,
    gene_support_freq_res,
    gene_support_freq: Dict[str, int],
    trees_file_path: List[str],
) -> None:
    """
    print results to stdout for user

    Reports the chi-squared statistic (4 decimals), the p-value
    (6 decimals), the total number of genes and the number of genes
    supporting each sister pair. ``trees_file_path`` is not used.
    A broken pipe while printing is ignored.
    """
    try:
        print("Gene Support Frequency Results")
        print("==============================")

        statistic = round(gene_support_freq_res.statistic, 4)
        print(f"chi-squared: {statistic}")

        pvalue = round(gene_support_freq_res.pvalue, 6)
        print(f"p-value: {pvalue}")

        total_genes = (
            gene_support_freq['0-1']
            + gene_support_freq['0-2']
            + gene_support_freq['1-2']
        )
        print(f"total genes: {total_genes}")

        for label in ("0-1", "0-2", "1-2"):
            print(f"{label}: {gene_support_freq[label]}")
    except BrokenPipeError:
        pass
@@ -0,0 +1,28 @@
1
+ from typing import Dict
2
+ import copy
3
+
4
+ from Bio import Phylo
5
+
6
+ from .base import Tree
7
+
8
+
9
class PrintTree(Tree):
    """Draw a tree as ASCII art on stdout, optionally without branch lengths."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self):
        """Read the tree and print it with ``Phylo.draw_ascii``.

        When ``self.remove`` is set, the branch length of every terminal
        and non-terminal node is cleared first. A broken pipe while
        drawing is ignored.
        """
        tree = self.read_tree_file()

        if self.remove:
            # work on a private deep copy so the cached tree is not modified
            tree = copy.deepcopy(tree)
            tip_nodes = tree.get_terminals()
            internal_nodes = tree.get_nonterminals()
            for node in tip_nodes + internal_nodes:
                node.branch_length = None

        try:
            Phylo.draw_ascii(tree)
        except BrokenPipeError:
            pass

    def process_args(self, args) -> Dict[str, str]:
        """Map the ``tree`` and ``remove`` options onto keyword arguments."""
        return {"tree_file_path": args.tree, "remove": args.remove}
@@ -0,0 +1,40 @@
1
+ from typing import Dict
2
+ import copy
3
+
4
+ from .base import Tree
5
+
6
+ from ...helpers.files import read_single_column_file_to_list
7
+
8
+
9
class PruneTree(Tree):
    """Prune tips from a tree and write the result to a file."""

    def __init__(self, args) -> None:
        init_kwargs = self.process_args(args)
        super().__init__(**init_kwargs)

    def run(self) -> None:
        """Prune the tree according to the taxa list and write it out.

        The taxa list is read from ``self.list_of_taxa``. With
        ``self.keep`` set, the listed taxa are kept and every other tip is
        pruned; otherwise the listed taxa themselves are pruned.
        """
        # deep copy so the cached tree is not modified
        tree_copy = copy.deepcopy(self.read_tree_file())

        taxa = read_single_column_file_to_list(self.list_of_taxa)

        if self.keep:
            # invert the selection: prune every tip that is not listed
            tip_names = [tip.name for tip in tree_copy.get_terminals()]
            taxa = [name for name in tip_names if name not in taxa]

        tree_copy = self.prune_tree_using_taxa_list(tree_copy, taxa)

        self.write_tree_file(tree_copy, self.output_file_path)

    def process_args(self, args) -> Dict[str, str]:
        """Translate parsed arguments into keyword arguments.

        The output path defaults to ``<tree>.pruned`` when no output is
        given. ``keep`` becomes ``True`` when ``args.keep`` is ``None``.
        """
        tree_file_path = args.tree

        if args.output:
            output_file_path = f"{args.output}"
        else:
            output_file_path = f"{tree_file_path}.pruned"

        keep = args.keep
        if keep is None:
            keep = True

        return {
            "tree_file_path": tree_file_path,
            "list_of_taxa": args.list_of_taxa,
            "output_file_path": output_file_path,
            "keep": keep,
        }
@@ -0,0 +1,64 @@
1
+ import sys
2
+ import copy
3
+ from typing import Dict
4
+
5
+ from Bio.Phylo import Newick
6
+
7
+ from .base import Tree
8
+
9
+
10
class RenameTreeTips(Tree):
    """Rename the tips of a tree using an identifier map file."""

    def __init__(self, args) -> None:
        super().__init__(**self.process_args(args))

    def run(self):
        """Read the tree, rename its tips and write the result."""
        tree = self.read_tree_file()
        # Make a deep copy to avoid modifying the cached tree
        tree_copy = copy.deepcopy(tree)

        idmap = self.read_id_map()

        tree_copy = self.replace_tip_names(tree_copy, idmap)

        self.write_tree_file(tree_copy, self.output_file_path)

    def process_args(self, args) -> Dict[str, str]:
        """Translate parsed arguments into keyword arguments.

        The output path defaults to ``<tree>.renamed`` when no output is
        given.
        """
        tree_file_path = args.tree

        output_file_path = \
            f"{args.output}" if args.output else f"{tree_file_path}.renamed"

        return dict(
            tree_file_path=tree_file_path,
            idmap=args.idmap,
            output_file_path=output_file_path,
        )

    def read_id_map(self) -> Dict[str, str]:
        """Read the identifier map from ``self.idmap``.

        Each non-blank line must hold exactly two whitespace-separated
        fields: the current tip name and its replacement. Blank lines are
        skipped.

        Returns a mapping of current name -> new name.
        Raises ``ValueError`` for a non-blank line that does not have
        exactly two fields. Exits with status 2 when the file does not
        exist.
        """
        idmap = dict()
        try:
            with open(self.idmap) as identifiers:
                for line in identifiers:
                    fields = line.split()
                    # a blank line (e.g. a trailing newline) has no
                    # fields and used to abort with ValueError
                    if not fields:
                        continue
                    (key, val) = fields
                    idmap[key] = val
        except FileNotFoundError:
            try:
                print(f"{self.idmap} corresponds to no such file.")
                print("Please check file name and pathing")
            except BrokenPipeError:
                pass
            # exit even if stdout is gone, otherwise an empty map would be
            # returned and an unrenamed tree written
            sys.exit(2)

        return idmap

    def replace_tip_names(
        self,
        tree: Newick.Tree,
        idmap: Dict[str, str]
    ) -> Newick.Tree:
        """Rename, in place, every terminal whose name is a key of ``idmap``.

        Terminals without an entry keep their name. Returns ``tree``.
        """
        for term in tree.get_terminals():
            name = term.name
            if name in idmap:
                term.name = idmap[name]

        return tree