phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,140 @@
1
+ import copy
2
+ from typing import Dict, List, Generator
3
+ import pickle
4
+
5
+ from Bio import Phylo
6
+ from Bio.Phylo import Newick
7
+
8
+ from .base import Tree
9
+
10
+
11
class NearestNeighborInterchange(Tree):
    """Write a tree together with its nearest-neighbor-interchange trees.

    The input tree is written first, followed by every rearrangement
    returned by ``get_neighbors``; all of them go into a single Newick
    file.
    """

    def __init__(self, args) -> None:
        # process_args converts the parsed arguments into the keyword
        # arguments handed to the base-class initializer.
        super().__init__(**self.process_args(args))

    def run(self) -> None:
        """Read the tree, enumerate its NNI neighbors and write them out."""
        # read_tree_file and output_file_path are not defined in this
        # class; presumably both come from the Tree base class.
        tree = self.read_tree_file()

        # The unmodified input tree leads the output, then its neighbors.
        all_nnis = [tree]
        all_nnis.extend(self.get_neighbors(tree))
        Phylo.write(all_nnis, self.output_file_path, "newick")

    def process_args(self, args) -> Dict[str, str]:
        """Extract the input and output paths from the parsed arguments.

        Args:
            args: object exposing ``tree`` and ``output`` attributes.

        Returns:
            A dict with ``tree_file_path`` and ``output_file_path``. When
            ``args.output`` is falsy the output path defaults to the
            tree path with a ``.nnis`` suffix appended.
        """
        tree_file_path = args.tree
        output_file_path = \
            f"{args.output}" if args.output else f"{tree_file_path}.nnis"

        return dict(
            tree_file_path=tree_file_path,
            output_file_path=output_file_path
        )

    def _fast_tree_copy(self, tree: Newick.Tree) -> Newick.Tree:
        """Fast tree copying using pickle instead of deep copy."""
        # The pickle round trip only ever sees the in-memory tree object
        # built by this process, never externally supplied bytes.
        return pickle.loads(pickle.dumps(tree, protocol=pickle.HIGHEST_PROTOCOL))

    def get_neighbors(
        self,
        tree: Newick.Tree
    ) -> List[Newick.Tree]:
        """Return every NNI rearrangement of ``tree``.

        The algorithm is adapted from BioPython and, as its original
        comment states, is only meant for binary rooted trees: each
        internal node handled here is indexed as having a first and a
        second child.

        ``tree`` is edited in place while neighbors are generated. Each
        rearrangement is snapshotted with ``_fast_tree_copy`` and the
        edit is undone before moving on, so for binary nodes the tree is
        back in its input shape when the method returns.

        Two neighbors are produced for the root when both of its first
        two children are internal, and two for every internal clade that
        is neither the root nor one of those two root children.

        Args:
            tree: the tree whose neighbors are enumerated.

        Returns:
            Independent tree copies, ordered by a level-order walk of
            the internal clades.
        """
        # Child -> parent lookup built in one pass over the tree. Every
        # clade except the root shows up exactly once as somebody's
        # child, which avoids a root-to-clade path search per clade.
        parents = {}
        for parent_clade in tree.find_clades():
            for child_clade in parent_clade.clades:
                parents[child_clade] = parent_clade

        neighbors = []
        root_childs = []
        for clade in tree.get_nonterminals(order="level"):
            if clade == tree.root:
                left = clade.clades[0]
                right = clade.clades[1]
                # Remember the root's children: the edge between them is
                # handled right here, so they are skipped further down.
                root_childs.append(left)
                root_childs.append(right)
                if not left.is_terminal() and not right.is_terminal():
                    # left's first child stays put; the other three
                    # grandchildren are rotated around it.
                    left_right = left.clades[1]
                    right_left = right.clades[0]
                    right_right = right.clades[1]
                    # neighbor 1 (left_left + right_right)
                    del left.clades[1]
                    del right.clades[1]
                    left.clades.append(right_right)
                    right.clades.append(left_right)
                    temp_tree = self._fast_tree_copy(tree)
                    neighbors.append(temp_tree)
                    # neighbor 2 (left_left + right_left)
                    del left.clades[1]
                    del right.clades[0]
                    left.clades.append(right_left)
                    right.clades.append(right_right)
                    temp_tree = self._fast_tree_copy(tree)
                    neighbors.append(temp_tree)
                    # change back (left_left + left_right)
                    del left.clades[1]
                    del right.clades[0]
                    left.clades.append(left_right)
                    right.clades.insert(0, right_left)
            elif clade in root_childs:
                # skip root child
                continue
            else:
                # Every other internal clade: exchange one of its
                # children with its sister (the parent's other child).
                left = clade.clades[0]
                right = clade.clades[1]
                parent = parents[clade]
                if clade == parent.clades[0]:
                    # clade is the first child, so the sister sits at
                    # index 1 and replacements are appended there.
                    sister = parent.clades[1]
                    # neighbor 1 (parent + right)
                    del parent.clades[1]
                    del clade.clades[1]
                    parent.clades.append(right)
                    clade.clades.append(sister)
                    temp_tree = self._fast_tree_copy(tree)
                    neighbors.append(temp_tree)
                    # neighbor 2 (parent + left)
                    del parent.clades[1]
                    del clade.clades[0]
                    parent.clades.append(left)
                    clade.clades.append(right)
                    temp_tree = self._fast_tree_copy(tree)
                    neighbors.append(temp_tree)
                    # change back (parent + sister)
                    del parent.clades[1]
                    del clade.clades[0]
                    parent.clades.append(sister)
                    clade.clades.insert(0, left)
                else:
                    # clade is not the first child, so the parent's
                    # first child is used as the sister and replacements
                    # are inserted at index 0.
                    sister = parent.clades[0]
                    # neighbor 1 (parent + right)
                    del parent.clades[0]
                    del clade.clades[1]
                    parent.clades.insert(0, right)
                    clade.clades.append(sister)
                    temp_tree = self._fast_tree_copy(tree)
                    neighbors.append(temp_tree)
                    # neighbor 2 (parent + left)
                    del parent.clades[0]
                    del clade.clades[0]
                    parent.clades.insert(0, left)
                    clade.clades.append(right)
                    temp_tree = self._fast_tree_copy(tree)
                    neighbors.append(temp_tree)
                    # change back (parent + sister)
                    del parent.clades[0]
                    del clade.clades[0]
                    parent.clades.insert(0, sister)
                    clade.clades.insert(0, left)

        return neighbors
@@ -0,0 +1,113 @@
1
+ from typing import Dict, List, Tuple
2
+ import itertools
3
+ import multiprocessing as mp
4
+ from functools import partial
5
+ import pickle
6
+ import sys
7
+
8
+ from Bio.Phylo import Newick
9
+ try:
10
+ from tqdm import tqdm
11
+ except ImportError:
12
+ # Fallback if tqdm is not installed
13
+ def tqdm(iterable, *args, **kwargs):
14
+ return iterable
15
+
16
+ from .base import Tree
17
+
18
+ from ...helpers.stats_summary import (
19
+ calculate_summary_statistics_from_arr,
20
+ print_summary_statistics,
21
+ )
22
+
23
+
24
+ class PatristicDistances(Tree):
25
+ def __init__(self, args) -> None:
26
+ super().__init__(**self.process_args(args))
27
+
28
+ def run(self):
29
+ tree = self.read_tree_file()
30
+ patristic_distances, combos, stats = \
31
+ self.calculate_patristic_distances(tree)
32
+
33
+ if self.verbose:
34
+ try:
35
+ for combo, patristic_distance in zip(combos, patristic_distances):
36
+ print(f"{combo[0]}\t{combo[1]}\t{round(patristic_distance, 4)}")
37
+ except BrokenPipeError:
38
+ pass
39
+ else:
40
+ print_summary_statistics(stats)
41
+
42
+ def process_args(self, args) -> Dict[str, str]:
43
+ return dict(tree_file_path=args.tree, verbose=args.verbose)
44
+
45
+ def _calculate_distance_batch(self, tree_pickle, combo_batch):
46
+ """Helper function to calculate distances for a batch of combinations."""
47
+ tree = pickle.loads(tree_pickle)
48
+ return [tree.distance(combo[0], combo[1]) for combo in combo_batch]
49
+
50
+ def calculate_distance_between_pairs(
51
+ self,
52
+ tips: List[str],
53
+ tree
54
+ ) -> Tuple[
55
+ List[Tuple[str, str]],
56
+ List[float],
57
+ ]:
58
+ combos = list(itertools.combinations(tips, 2))
59
+
60
+ # For small datasets, use the original single-threaded approach
61
+ if len(combos) < 100:
62
+ patristic_distances = [
63
+ tree.distance(combo[0], combo[1]) for combo in combos
64
+ ]
65
+ else:
66
+ # Use multiprocessing for larger datasets
67
+ # Serialize the tree once to avoid repeated serialization
68
+ tree_pickle = pickle.dumps(tree)
69
+
70
+ # Determine optimal number of workers
71
+ num_workers = min(mp.cpu_count(), 8)
72
+
73
+ # Split combos into chunks for parallel processing
74
+ chunk_size = max(1, len(combos) // (num_workers * 4))
75
+ combo_chunks = [combos[i:i + chunk_size] for i in range(0, len(combos), chunk_size)]
76
+
77
+ # Create partial function with the pickled tree
78
+ calc_func = partial(self._calculate_distance_batch, tree_pickle)
79
+
80
+ # Process in parallel with progress bar
81
+ with mp.Pool(processes=num_workers) as pool:
82
+ # Only show progress bar if stderr is a tty (not redirected)
83
+ if sys.stderr.isatty():
84
+ results = list(tqdm(
85
+ pool.imap(calc_func, combo_chunks),
86
+ total=len(combo_chunks),
87
+ desc="Calculating patristic distances",
88
+ unit="batch"
89
+ ))
90
+ else:
91
+ results = pool.map(calc_func, combo_chunks)
92
+
93
+ # Flatten the results
94
+ patristic_distances = [dist for chunk_result in results for dist in chunk_result]
95
+
96
+ return combos, patristic_distances
97
+
98
+ def calculate_patristic_distances(
99
+ self,
100
+ tree: Newick.Tree,
101
+ ) -> Tuple[
102
+ List[float],
103
+ List[Tuple[str, str]],
104
+ Dict[str, float],
105
+ ]:
106
+ tips = self.get_tip_names_from_tree(tree)
107
+
108
+ combos, patristic_distances = \
109
+ self.calculate_distance_between_pairs(tips, tree)
110
+
111
+ stats = calculate_summary_statistics_from_arr(patristic_distances)
112
+
113
+ return patristic_distances, combos, stats