phykit 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import statistics as stat
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def calculate_summary_statistics_from_arr(arr):
    """
    Calculate summary statistics for an input list.

    Args:
        arr: Sequence of numeric values to summarize.

    Returns:
        dict with the keys ``mean``, ``median``, ``twenty_fifth``,
        ``seventy_fifth``, ``minimum``, ``maximum``,
        ``standard_deviation`` and ``variance``.

    Raises:
        statistics.StatisticsError: If the ``statistics`` functions used
            here reject ``arr`` (for example because it is empty). A hint
            for the user is printed before the error is re-raised.
    """
    try:
        stats = dict(
            mean=stat.mean(arr),
            median=stat.median(arr),
            twenty_fifth=np.percentile(arr, 25),
            seventy_fifth=np.percentile(arr, 75),
            minimum=np.min(arr),
            maximum=np.max(arr),
            standard_deviation=stat.stdev(arr),
            variance=stat.variance(arr),
        )
    except stat.StatisticsError:
        print("There are no values to calculate summary statistics for.\n")
        print("Double check that the input alignment/phylogeny contains")
        print("the properties you want to calculate summary statistics for.")
        # Re-raise: there is no result to hand back, and falling through to
        # ``return stats`` would fail with an unrelated UnboundLocalError.
        raise

    return stats
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def calculate_summary_statistics_from_dict(dat: dict):
    """
    Calculate summary statistics for the values of a dictionary.

    Args:
        dat: Dictionary whose values are the numbers to summarize; the
            keys are ignored.

    Returns:
        dict with the keys ``mean``, ``median``, ``twenty_fifth``,
        ``seventy_fifth``, ``minimum``, ``maximum``,
        ``standard_deviation`` and ``variance``.

    Raises:
        statistics.StatisticsError: If the ``statistics`` functions used
            here reject the values (for example because ``dat`` is empty).
            A hint for the user is printed before the error is re-raised.
    """
    # Materialize the values once instead of rebuilding the list per statistic.
    values = [*dat.values()]

    try:
        stats = dict(
            mean=stat.mean(values),
            median=stat.median(values),
            twenty_fifth=np.percentile(values, 25),
            seventy_fifth=np.percentile(values, 75),
            minimum=np.min(values),
            maximum=np.max(values),
            standard_deviation=stat.stdev(values),
            variance=stat.variance(values),
        )
    except stat.StatisticsError:
        print("There are no values to calculate summary statistics for.\n")
        print("Double check that the input alignment/phylogeny contains")
        print("the properties you want to calculate summary statistics for.")
        # Re-raise: there is no result to hand back, and falling through to
        # ``return stats`` would fail with an unrelated UnboundLocalError.
        raise

    return stats
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def print_summary_statistics(stats: dict):
    """
    Print summary statistics, one per line, rounded to four decimal places.

    Args:
        stats: Mapping that provides the keys ``mean``, ``median``,
            ``twenty_fifth``, ``seventy_fifth``, ``minimum``, ``maximum``,
            ``standard_deviation`` and ``variance``.

    A ``BrokenPipeError`` raised while printing is swallowed, which stops
    the output at that point without a traceback.
    """
    # (label shown to the user, key looked up in ``stats``), in output order.
    rows = (
        ("mean", "mean"),
        ("median", "median"),
        ("25th percentile", "twenty_fifth"),
        ("75th percentile", "seventy_fifth"),
        ("minimum", "minimum"),
        ("maximum", "maximum"),
        ("standard deviation", "standard_deviation"),
        ("variance", "variance"),
    )
    try:
        for label, key in rows:
            print(f"{label}: {round(stats[key], 4)}")
    except BrokenPipeError:
        # Deliberate best effort -- presumably so that piping the output
        # into a command that exits early does not end in a traceback.
        pass
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Streaming utilities for memory-efficient processing of large files
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Iterator, Tuple, Optional
|
|
6
|
+
import mmap
|
|
7
|
+
import os
|
|
8
|
+
from Bio import SeqIO
|
|
9
|
+
from Bio.SeqRecord import SeqRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class StreamingFastaReader:
    """
    Memory-efficient streaming reader for large FASTA files.

    Records are parsed lazily with ``SeqIO.parse`` so the whole file is
    never held in memory at once; sequence counting scans a memory-mapped
    view of the file instead of parsing it.
    """

    def __init__(self, file_path: str, chunk_size: int = 1000):
        """
        Initialize streaming reader.

        Args:
            file_path: Path to FASTA file
            chunk_size: Number of sequences to yield at once

        Raises:
            OSError: If the size of ``file_path`` cannot be read (raised by
                ``os.path.getsize``, e.g. when the file does not exist).
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        # Size in bytes at construction time; not refreshed afterwards.
        self.file_size = os.path.getsize(file_path)

    def stream_sequences(self) -> Iterator[SeqRecord]:
        """
        Stream sequences one at a time.

        The file is opened anew on every call, so each call starts again
        from the first record.
        """
        with open(self.file_path, 'r') as handle:
            yield from SeqIO.parse(handle, "fasta")

    def stream_chunks(self) -> Iterator[list]:
        """
        Stream sequences in chunks for batch processing.

        Yields lists of records; a list is emitted as soon as it holds at
        least ``chunk_size`` records, and any leftover records are emitted
        as a final, shorter list.
        """
        chunk = []
        for record in self.stream_sequences():
            chunk.append(record)
            if len(chunk) >= self.chunk_size:
                yield chunk
                chunk = []

        # Yield remaining sequences
        if chunk:
            yield chunk

    def get_sequence_count(self) -> int:
        """
        Count sequences without loading entire file.

        Counts the lines that start with ``>``. Returns 0 for an empty
        file.
        """
        count = 0
        with open(self.file_path, 'rb') as f:
            # mmap refuses to map a zero-length file (ValueError); an empty
            # file contains no header lines, so the answer is simply 0.
            if os.fstat(f.fileno()).st_size == 0:
                return 0
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
                for line in iter(mmapped_file.readline, b""):
                    if line.startswith(b'>'):
                        count += 1
        return count

    def get_sequence_at_position(self, position: int) -> Optional[SeqRecord]:
        """
        Get a specific sequence by position without loading entire file.

        Args:
            position: Zero-based index of the record in file order.

        Returns:
            The record at ``position``, or ``None`` when no record has that
            index (including negative positions).
        """
        for i, record in enumerate(self.stream_sequences()):
            if i == position:
                return record
        return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class MemoryEfficientAlignmentProcessor:
    """
    Process large alignments with minimal memory footprint.
    """

    @staticmethod
    def calculate_column_stats_streaming(file_path: str) -> dict:
        """
        Calculate column statistics without loading entire alignment.

        The number of columns is taken from the length of the first record.
        Characters are upper-cased before they are compared.

        Args:
            file_path: Path to the FASTA alignment file.

        Returns:
            Dictionary with column statistics, one entry per column in each
            list:
                ``variable_sites``: True when the column holds more than one
                    distinct character.
                ``gap_counts``: number of ``-``, ``?``, ``X`` or ``N``
                    characters in the column.
                ``conservation``: ``1 - distinct characters / sequences``.
            All three lists are empty when the file contains no records.
        """
        reader = StreamingFastaReader(file_path)

        # First pass: get dimensions
        num_seqs = 0
        seq_length = None

        for record in reader.stream_sequences():
            if seq_length is None:
                seq_length = len(record.seq)
            num_seqs += 1

        # Initialize column stats
        column_stats = {
            'variable_sites': [],
            'gap_counts': [],
            'conservation': []
        }

        # No records at all: there are no columns to describe (and
        # ``range(None)`` below would raise TypeError).
        if seq_length is None:
            return column_stats

        gap_chars = frozenset(('-', '?', 'X', 'N'))

        # NOTE: the file is re-parsed once per column, so only one column's
        # distinct characters are held at a time, at the cost of reading the
        # file ``seq_length`` times.
        for col_idx in range(seq_length):
            unique_chars = set()
            gap_count = 0

            for record in reader.stream_sequences():
                # A record shorter than the first one fails on this index.
                char = str(record.seq[col_idx]).upper()
                unique_chars.add(char)
                if char in gap_chars:
                    gap_count += 1

            # Calculate statistics; every pass visits the same ``num_seqs``
            # records that were counted in the first pass.
            is_variable = len(unique_chars) > 1
            conservation_score = 1 - (len(unique_chars) / num_seqs)

            column_stats['variable_sites'].append(is_variable)
            column_stats['gap_counts'].append(gap_count)
            column_stats['conservation'].append(conservation_score)

        return column_stats

    @staticmethod
    def process_large_alignment_in_batches(
        file_path: str,
        processing_func,
        batch_size: int = 100
    ):
        """
        Process large alignment in batches to avoid memory issues.

        Args:
            file_path: Path to alignment file
            processing_func: Function to apply to each batch; it is called
                with one list of records at a time
            batch_size: Number of sequences per batch

        Returns:
            List with the return value of ``processing_func`` for each
            batch, in file order
        """
        reader = StreamingFastaReader(file_path, chunk_size=batch_size)
        return [processing_func(batch) for batch in reader.stream_chunks()]
|