phykit 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. phykit/__init__.py +0 -0
  2. phykit/__main__.py +6 -0
  3. phykit/helpers/__init__.py +0 -0
  4. phykit/helpers/boolean_argument_parsing.py +12 -0
  5. phykit/helpers/caching.py +201 -0
  6. phykit/helpers/files.py +125 -0
  7. phykit/helpers/parallel.py +305 -0
  8. phykit/helpers/stats_summary.py +64 -0
  9. phykit/helpers/streaming.py +152 -0
  10. phykit/phykit.py +2862 -0
  11. phykit/services/__init__.py +0 -0
  12. phykit/services/alignment/__init__.py +17 -0
  13. phykit/services/alignment/alignment_length.py +16 -0
  14. phykit/services/alignment/alignment_length_no_gaps.py +69 -0
  15. phykit/services/alignment/alignment_recoding.py +89 -0
  16. phykit/services/alignment/base.py +103 -0
  17. phykit/services/alignment/column_score.py +66 -0
  18. phykit/services/alignment/compositional_bias_per_site.py +98 -0
  19. phykit/services/alignment/create_concatenation_matrix.py +254 -0
  20. phykit/services/alignment/dna_threader.py +145 -0
  21. phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
  22. phykit/services/alignment/faidx.py +21 -0
  23. phykit/services/alignment/gc_content.py +94 -0
  24. phykit/services/alignment/pairwise_identity.py +159 -0
  25. phykit/services/alignment/parsimony_informative_sites.py +81 -0
  26. phykit/services/alignment/rcv.py +14 -0
  27. phykit/services/alignment/rcvt.py +47 -0
  28. phykit/services/alignment/rename_fasta_entries.py +53 -0
  29. phykit/services/alignment/sum_of_pairs_score.py +157 -0
  30. phykit/services/alignment/variable_sites.py +54 -0
  31. phykit/services/base.py +9 -0
  32. phykit/services/tree/__init__.py +29 -0
  33. phykit/services/tree/base.py +178 -0
  34. phykit/services/tree/bipartition_support_stats.py +48 -0
  35. phykit/services/tree/branch_length_multiplier.py +37 -0
  36. phykit/services/tree/collapse_branches.py +27 -0
  37. phykit/services/tree/covarying_evolutionary_rates.py +272 -0
  38. phykit/services/tree/dvmc.py +37 -0
  39. phykit/services/tree/evolutionary_rate.py +17 -0
  40. phykit/services/tree/hidden_paralogy_check.py +128 -0
  41. phykit/services/tree/internal_branch_stats.py +77 -0
  42. phykit/services/tree/internode_labeler.py +33 -0
  43. phykit/services/tree/last_common_ancestor_subtree.py +35 -0
  44. phykit/services/tree/lb_score.py +196 -0
  45. phykit/services/tree/monophyly_check.py +106 -0
  46. phykit/services/tree/nearest_neighbor_interchange.py +140 -0
  47. phykit/services/tree/patristic_distances.py +113 -0
  48. phykit/services/tree/polytomy_test.py +546 -0
  49. phykit/services/tree/print_tree.py +28 -0
  50. phykit/services/tree/prune_tree.py +40 -0
  51. phykit/services/tree/rename_tree_tips.py +64 -0
  52. phykit/services/tree/rf_distance.py +136 -0
  53. phykit/services/tree/root_tree.py +35 -0
  54. phykit/services/tree/saturation.py +209 -0
  55. phykit/services/tree/spurious_sequence.py +75 -0
  56. phykit/services/tree/terminal_branch_stats.py +87 -0
  57. phykit/services/tree/tip_labels.py +18 -0
  58. phykit/services/tree/tip_to_tip_distance.py +41 -0
  59. phykit/services/tree/tip_to_tip_node_distance.py +41 -0
  60. phykit/services/tree/total_tree_length.py +25 -0
  61. phykit/services/tree/treeness.py +16 -0
  62. phykit/services/tree/treeness_over_rcv.py +40 -0
  63. phykit/version.py +1 -0
  64. phykit-2.1.0.dist-info/METADATA +150 -0
  65. phykit-2.1.0.dist-info/RECORD +69 -0
  66. phykit-2.1.0.dist-info/WHEEL +5 -0
  67. phykit-2.1.0.dist-info/entry_points.txt +121 -0
  68. phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
  69. phykit-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,64 @@
1
+ import statistics as stat
2
+
3
+ import numpy as np
4
+
5
+
6
def calculate_summary_statistics_from_arr(arr):
    """
    Calculate summary statistics for an input list.

    Args:
        arr: Sequence of numeric values.

    Returns:
        dict with the keys ``mean``, ``median``, ``twenty_fifth``,
        ``seventy_fifth``, ``minimum``, ``maximum``,
        ``standard_deviation`` and ``variance``.

    Raises:
        statistics.StatisticsError: If ``arr`` is empty, or holds fewer
            than the two values that the sample standard deviation and
            variance require. A hint is printed before the error is
            re-raised.
    """
    try:
        stats = dict(
            mean=stat.mean(arr),
            median=stat.median(arr),
            twenty_fifth=np.percentile(arr, 25),
            seventy_fifth=np.percentile(arr, 75),
            minimum=np.min(arr),
            maximum=np.max(arr),
            standard_deviation=stat.stdev(arr),
            variance=stat.variance(arr),
        )
    except stat.StatisticsError:
        print("There are no values to calculate summary statistics for.\n")
        print("Double check that the input alignment/phylogeny contains")
        print("the properties you want to calculate summary statistics for.")
        # Propagate the real failure. Falling through to ``return stats``
        # would raise an unrelated UnboundLocalError because ``stats`` was
        # never assigned.
        raise

    return stats
27
+
28
+
29
def calculate_summary_statistics_from_dict(dat: dict):
    """
    Calculate summary statistics for the values of a dictionary.

    Args:
        dat: Mapping whose values are numeric; the keys are ignored.

    Returns:
        dict with the keys ``mean``, ``median``, ``twenty_fifth``,
        ``seventy_fifth``, ``minimum``, ``maximum``,
        ``standard_deviation`` and ``variance``.

    Raises:
        statistics.StatisticsError: If ``dat`` is empty, or holds fewer
            than the two values that the sample standard deviation and
            variance require. A hint is printed before the error is
            re-raised.
    """
    # Materialize the values once instead of rebuilding the list for
    # every statistic.
    values = list(dat.values())
    try:
        stats = dict(
            mean=stat.mean(values),
            median=stat.median(values),
            twenty_fifth=np.percentile(values, 25),
            seventy_fifth=np.percentile(values, 75),
            minimum=np.min(values),
            maximum=np.max(values),
            standard_deviation=stat.stdev(values),
            variance=stat.variance(values),
        )
    except stat.StatisticsError:
        print("There are no values to calculate summary statistics for.\n")
        print("Double check that the input alignment/phylogeny contains")
        print("the properties you want to calculate summary statistics for.")
        # Propagate the real failure. Falling through to ``return stats``
        # would raise an unrelated UnboundLocalError because ``stats`` was
        # never assigned.
        raise

    return stats
50
+
51
+
52
def print_summary_statistics(stats: dict):
    """
    Print each summary statistic on its own line, rounded to 4 decimals.

    Args:
        stats: Mapping with the keys ``mean``, ``median``,
            ``twenty_fifth``, ``seventy_fifth``, ``minimum``,
            ``maximum``, ``standard_deviation`` and ``variance``.

    Raises:
        KeyError: If one of the expected keys is missing from ``stats``.
    """
    # (label shown to the user, key looked up in ``stats``), in print order.
    labelled_keys = (
        ("mean", "mean"),
        ("median", "median"),
        ("25th percentile", "twenty_fifth"),
        ("75th percentile", "seventy_fifth"),
        ("minimum", "minimum"),
        ("maximum", "maximum"),
        ("standard deviation", "standard_deviation"),
        ("variance", "variance"),
    )
    try:
        for label, key in labelled_keys:
            print(f"{label}: {round(stats[key], 4)}")
    except BrokenPipeError:
        # The reader of stdout went away (presumably output piped into a
        # command that exits early); stop printing without a traceback.
        pass
@@ -0,0 +1,152 @@
1
+ """
2
+ Streaming utilities for memory-efficient processing of large files
3
+ """
4
+
5
+ from typing import Iterator, Tuple, Optional
6
+ import mmap
7
+ import os
8
+ from Bio import SeqIO
9
+ from Bio.SeqRecord import SeqRecord
10
+
11
+
12
class StreamingFastaReader:
    """
    Memory-efficient streaming reader for large FASTA files.

    Records are parsed lazily with ``Bio.SeqIO`` so only the record being
    yielded is held by the reader; counting sequences scans a
    memory-mapped view of the file instead of parsing it.
    """

    def __init__(self, file_path: str, chunk_size: int = 1000):
        """
        Initialize streaming reader.

        Args:
            file_path: Path to FASTA file
            chunk_size: Number of sequences to yield at once from
                ``stream_chunks``

        Raises:
            OSError: If ``file_path`` does not exist or is inaccessible
                (raised by ``os.path.getsize``).
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        # Size in bytes at construction time; not refreshed afterwards.
        self.file_size = os.path.getsize(file_path)

    def stream_sequences(self) -> Iterator["SeqRecord"]:
        """
        Stream sequences one at a time.

        Yields:
            Each FASTA record in file order. The file is reopened on every
            call, so the method can be iterated repeatedly.
        """
        with open(self.file_path, 'r') as handle:
            yield from SeqIO.parse(handle, "fasta")

    def stream_chunks(self) -> Iterator[list]:
        """
        Stream sequences in chunks for batch processing.

        Yields:
            Lists of up to ``chunk_size`` records; the final list may be
            shorter.
        """
        chunk = []
        for record in self.stream_sequences():
            chunk.append(record)
            if len(chunk) >= self.chunk_size:
                yield chunk
                chunk = []

        # Yield remaining sequences
        if chunk:
            yield chunk

    def get_sequence_count(self) -> int:
        """
        Count sequences without loading entire file.

        Returns:
            Number of lines starting with ``>``; 0 for an empty file.
        """
        # mmap cannot map a zero-length file (it raises ValueError), so
        # check the current on-disk size before mapping.
        if os.path.getsize(self.file_path) == 0:
            return 0

        count = 0
        with open(self.file_path, 'rb') as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
                for line in iter(mmapped_file.readline, b""):
                    if line.startswith(b'>'):
                        count += 1
        return count

    def get_sequence_at_position(self, position: int) -> Optional["SeqRecord"]:
        """
        Get a specific sequence by position without loading entire file.

        Args:
            position: Zero-based index of the record in file order.

        Returns:
            The record at ``position``, or ``None`` when the index is
            negative or past the last record.
        """
        # A negative index can never match an enumerate() counter; answer
        # immediately instead of parsing the whole file first.
        if position < 0:
            return None

        for i, record in enumerate(self.stream_sequences()):
            if i == position:
                return record
        return None
73
+
74
+
75
class MemoryEfficientAlignmentProcessor:
    """
    Process large alignments with minimal memory footprint.
    """

    @staticmethod
    def calculate_column_stats_streaming(file_path: str) -> dict:
        """
        Calculate column statistics without loading entire alignment.

        The file is read once. For every column only the set of distinct
        (upper-cased) characters and a gap counter are kept, rather than
        re-reading the file for each column.

        Args:
            file_path: Path to a FASTA alignment.

        Returns:
            Dictionary with column statistics, one list entry per column
            of the first sequence:
              - 'variable_sites': True when the column holds more than one
                distinct character
              - 'gap_counts': number of '-', '?', 'X' or 'N' characters
              - 'conservation': 1 - (distinct characters / sequences)
            All three lists are empty when the file has no sequences.

        Raises:
            IndexError: If a sequence is shorter than the first one.
        """
        reader = StreamingFastaReader(file_path)
        gap_chars = {'-', '?', 'X', 'N'}

        num_seqs = 0
        seq_length = None
        chars_per_column = []
        gap_counts = []

        for record in reader.stream_sequences():
            sequence = str(record.seq)
            if seq_length is None:
                # The first record fixes the number of columns.
                seq_length = len(sequence)
                chars_per_column = [set() for _ in range(seq_length)]
                gap_counts = [0] * seq_length
            elif len(sequence) < seq_length:
                raise IndexError(
                    f"sequence '{record.id}' has {len(sequence)} columns; "
                    f"expected at least {seq_length}"
                )
            num_seqs += 1

            # Columns beyond the first sequence's length are ignored.
            # NOTE(review): longer sequences are silently truncated here;
            # confirm whether ragged input should be rejected instead.
            for col_idx in range(seq_length):
                char = sequence[col_idx].upper()
                chars_per_column[col_idx].add(char)
                if char in gap_chars:
                    gap_counts[col_idx] += 1

        return {
            'variable_sites': [len(chars) > 1 for chars in chars_per_column],
            'gap_counts': gap_counts,
            'conservation': [
                1 - (len(chars) / num_seqs) for chars in chars_per_column
            ],
        }

    @staticmethod
    def process_large_alignment_in_batches(
        file_path: str,
        processing_func,
        batch_size: int = 100
    ):
        """
        Process large alignment in batches to avoid memory issues.

        Args:
            file_path: Path to alignment file
            processing_func: Function to apply to each batch (a list of
                records)
            batch_size: Number of sequences per batch

        Returns:
            List with the return value of ``processing_func`` for each
            batch, in file order.
        """
        reader = StreamingFastaReader(file_path, chunk_size=batch_size)
        return [processing_func(batch) for batch in reader.stream_chunks()]