PyMSAStats 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
msa_stats_calculator.py
ADDED
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
# ABOUTME: Core implementation of MSA summary statistics calculator.
|
|
2
|
+
# ABOUTME: Computes 27 metrics matching the original C++ msastats behavior.
|
|
3
|
+
"""MSA Statistics Calculator.
|
|
4
|
+
|
|
5
|
+
A Python implementation of the MsaStatsCalculator for computing MSA summary statistics.
|
|
6
|
+
"""
|
|
7
|
+
import enum
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Dict, Tuple, Union
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MsaStatsError(Exception):
    """Base exception for MSA statistics errors.

    ``MsaStatsCalculator.from_fasta`` raises this when a FASTA file cannot
    be opened/read or when no sequences are found in it.
    """
|
|
15
|
+
|
|
16
|
+
class StatType(enum.Enum):
    """Defines the types of summary statistics that can be calculated.

    Member declaration order is significant: iterating over ``StatType``
    yields the statistics in the order used for the statistics vector.
    """

    # Overall alignment / sequence lengths and total gap counts.
    AVG_GAP_SIZE = enum.auto()
    MSA_LEN = enum.auto()
    LONGEST_UNALIGNED_SEQ = enum.auto()
    SHORTEST_UNALIGNED_SEQ = enum.auto()
    TOT_NUM_GAPS = enum.auto()

    # Gap counts bucketed by gap length.
    NUM_GAPS_LEN_ONE = enum.auto()
    NUM_GAPS_LEN_TWO = enum.auto()
    NUM_GAPS_LEN_THREE = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR = enum.auto()

    # Unique gap intervals.
    AVG_UNIQUE_GAP_SIZE = enum.auto()
    TOT_NUM_UNIQUE_GAPS = enum.auto()

    # Unique gap intervals bucketed by length and by how many sequences share them.
    NUM_GAPS_LEN_ONE_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_ONE_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE = enum.auto()
    NUM_GAPS_LEN_TWO_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_TWO_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE = enum.auto()
    NUM_GAPS_LEN_THREE_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_THREE_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE = enum.auto()

    # Alignment columns bucketed by the number of gap characters they contain.
    MSA_POSITION_WITH_0_GAPS = enum.auto()
    MSA_POSITION_WITH_1_GAPS = enum.auto()
    MSA_POSITION_WITH_2_GAPS = enum.auto()
    MSA_POSITION_WITH_N_MINUS_1_GAPS = enum.auto()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class MsaStatsCalculator:
|
|
48
|
+
"""
|
|
49
|
+
Calculates a variety of summary statistics for a given Multiple Sequence Alignment (MSA).
|
|
50
|
+
The algorithms and data structures are designed to mirror the original C++ implementation.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
def __init__(self, msa_sequences: List[str]):
|
|
54
|
+
"""Initializes the calculator with an MSA provided as a list of strings.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
msa_sequences: A list of strings, where each string is an aligned sequence.
|
|
58
|
+
|
|
59
|
+
Raises:
|
|
60
|
+
ValueError: If input is empty, contains non-strings, has empty sequences,
|
|
61
|
+
or sequences have mismatched lengths.
|
|
62
|
+
"""
|
|
63
|
+
if not msa_sequences:
|
|
64
|
+
raise ValueError("Input must be a non-empty list of sequences.")
|
|
65
|
+
|
|
66
|
+
for i, seq in enumerate(msa_sequences):
|
|
67
|
+
if not isinstance(seq, str):
|
|
68
|
+
raise ValueError(
|
|
69
|
+
f"Sequence at index {i} is not a string (got {type(seq).__name__})."
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
self._original_aligned_seqs: Tuple[str, ...] = tuple(msa_sequences)
|
|
73
|
+
self._number_of_sequences: int = len(self._original_aligned_seqs)
|
|
74
|
+
|
|
75
|
+
if not self._original_aligned_seqs[0]:
|
|
76
|
+
raise ValueError("Sequences cannot be empty.")
|
|
77
|
+
|
|
78
|
+
self._msa_length = len(self._original_aligned_seqs[0])
|
|
79
|
+
for i, seq in enumerate(self._original_aligned_seqs[1:], start=1):
|
|
80
|
+
if len(seq) != self._msa_length:
|
|
81
|
+
raise ValueError(
|
|
82
|
+
f"Sequence length mismatch: sequence 0 has length {self._msa_length}, "
|
|
83
|
+
f"but sequence {i} has length {len(seq)}."
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
self._initialize_all_variables()
|
|
87
|
+
|
|
88
|
+
@classmethod
|
|
89
|
+
def from_fasta(cls, fasta_path: Union[str, Path]) -> 'MsaStatsCalculator':
|
|
90
|
+
"""Creates an MsaStatsCalculator instance from a FASTA file.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
fasta_path: The path to the FASTA file.
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
An instance of MsaStatsCalculator.
|
|
97
|
+
|
|
98
|
+
Raises:
|
|
99
|
+
MsaStatsError: If the file cannot be read or contains no sequences.
|
|
100
|
+
ValueError: If the parsed sequences are invalid (empty, mismatched lengths).
|
|
101
|
+
"""
|
|
102
|
+
fasta_path = Path(fasta_path)
|
|
103
|
+
try:
|
|
104
|
+
with open(fasta_path, 'r') as f:
|
|
105
|
+
sequences = []
|
|
106
|
+
current_seq = ""
|
|
107
|
+
for line in f:
|
|
108
|
+
line = line.strip()
|
|
109
|
+
if line.startswith('>'):
|
|
110
|
+
if current_seq:
|
|
111
|
+
sequences.append(current_seq)
|
|
112
|
+
current_seq = ""
|
|
113
|
+
else:
|
|
114
|
+
current_seq += line
|
|
115
|
+
if current_seq:
|
|
116
|
+
sequences.append(current_seq)
|
|
117
|
+
except FileNotFoundError as e:
|
|
118
|
+
raise MsaStatsError(f"FASTA file not found: {fasta_path}") from e
|
|
119
|
+
except PermissionError as e:
|
|
120
|
+
raise MsaStatsError(f"Permission denied reading FASTA file: {fasta_path}") from e
|
|
121
|
+
except IsADirectoryError as e:
|
|
122
|
+
raise MsaStatsError(f"Path is a directory, not a file: {fasta_path}") from e
|
|
123
|
+
except OSError as e:
|
|
124
|
+
raise MsaStatsError(f"Failed to read FASTA file '{fasta_path}': {e}") from e
|
|
125
|
+
|
|
126
|
+
if not sequences:
|
|
127
|
+
raise MsaStatsError(f"No sequences found in FASTA file: {fasta_path}")
|
|
128
|
+
|
|
129
|
+
return cls(sequences)
|
|
130
|
+
|
|
131
|
+
def _initialize_all_variables(self) -> None:
|
|
132
|
+
"""Resets all internal statistics and working data structures."""
|
|
133
|
+
self._aligned_seqs: List[str] = list(self._original_aligned_seqs)
|
|
134
|
+
|
|
135
|
+
# Core data structures
|
|
136
|
+
self._unique_indel_map: Dict[Tuple[int, int], List[int]] = {}
|
|
137
|
+
self._indel_counter: List[int] = []
|
|
138
|
+
|
|
139
|
+
# Final statistics attributes
|
|
140
|
+
self._ave_indel_length: float = 0.0
|
|
141
|
+
self._total_number_of_indels: int = 0
|
|
142
|
+
self._longest_seq_length: int = 0
|
|
143
|
+
self._shortest_seq_length: int = 0
|
|
144
|
+
|
|
145
|
+
self._number_of_indels_of_length_one: int = 0
|
|
146
|
+
self._number_of_indels_of_length_two: int = 0
|
|
147
|
+
self._number_of_indels_of_length_three: int = 0
|
|
148
|
+
self._number_of_indels_of_length_at_least_four: int = 0
|
|
149
|
+
|
|
150
|
+
self._number_of_indels_of_length_one_in_one_position: int = 0
|
|
151
|
+
self._number_of_indels_of_length_one_in_two_positions: int = 0
|
|
152
|
+
self._number_of_indels_of_length_one_in_n_minus_1_positions: int = 0
|
|
153
|
+
|
|
154
|
+
self._number_of_indels_of_length_two_in_one_position: int = 0
|
|
155
|
+
self._number_of_indels_of_length_two_in_two_positions: int = 0
|
|
156
|
+
self._number_of_indels_of_length_two_in_n_minus_1_positions: int = 0
|
|
157
|
+
|
|
158
|
+
self._number_of_indels_of_length_three_in_one_position: int = 0
|
|
159
|
+
self._number_of_indels_of_length_three_in_two_positions: int = 0
|
|
160
|
+
self._number_of_indels_of_length_three_in_n_minus_1_positions: int = 0
|
|
161
|
+
|
|
162
|
+
self._number_of_indels_of_length_at_least_four_in_one_position: int = 0
|
|
163
|
+
self._number_of_indels_of_length_at_least_four_in_two_positions: int = 0
|
|
164
|
+
self._number_of_indels_of_length_at_least_four_in_n_minus_1_positions: int = 0
|
|
165
|
+
|
|
166
|
+
self._number_of_msa_position_with_0_gaps: int = 0
|
|
167
|
+
self._number_of_msa_position_with_1_gaps: int = 0
|
|
168
|
+
self._number_of_msa_position_with_2_gaps: int = 0
|
|
169
|
+
self._number_of_msa_position_with_n_minus_1_gaps: int = 0
|
|
170
|
+
|
|
171
|
+
# Unique indels summary statistics
|
|
172
|
+
self._ave_unique_indel_length: float = 0.0
|
|
173
|
+
self._total_number_of_unique_indels: int = 0
|
|
174
|
+
|
|
175
|
+
def _trim_msa_from_all_indel_position_and_get_summary_statistics_from_indel_counter(self) -> None:
|
|
176
|
+
"""
|
|
177
|
+
Counts gaps per column, calculates column-based stats, and trims all-gap columns.
|
|
178
|
+
This method modifies self._aligned_seqs.
|
|
179
|
+
"""
|
|
180
|
+
if not self._original_aligned_seqs:
|
|
181
|
+
return
|
|
182
|
+
|
|
183
|
+
self._indel_counter = [0] * self._msa_length
|
|
184
|
+
for seq in self._original_aligned_seqs:
|
|
185
|
+
for i, char in enumerate(seq):
|
|
186
|
+
if char == '-':
|
|
187
|
+
self._indel_counter[i] += 1
|
|
188
|
+
|
|
189
|
+
for count in self._indel_counter:
|
|
190
|
+
if count == 0:
|
|
191
|
+
self._number_of_msa_position_with_0_gaps += 1
|
|
192
|
+
elif count == 1:
|
|
193
|
+
self._number_of_msa_position_with_1_gaps += 1
|
|
194
|
+
elif count == 2:
|
|
195
|
+
self._number_of_msa_position_with_2_gaps += 1
|
|
196
|
+
elif count == self._number_of_sequences - 1:
|
|
197
|
+
self._number_of_msa_position_with_n_minus_1_gaps += 1
|
|
198
|
+
|
|
199
|
+
# Identify columns to keep
|
|
200
|
+
cols_to_keep = [i for i, count in enumerate(self._indel_counter) if count < self._number_of_sequences]
|
|
201
|
+
|
|
202
|
+
# Create new sequences with only the columns to keep
|
|
203
|
+
self._aligned_seqs = [''.join(seq[i] for i in cols_to_keep) for seq in self._original_aligned_seqs]
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _fill_unique_gaps_map(self) -> None:
|
|
207
|
+
"""
|
|
208
|
+
Scans the aligned sequences to identify and count unique indel events,
|
|
209
|
+
populating the _unique_indel_map.
|
|
210
|
+
"""
|
|
211
|
+
self._unique_indel_map.clear()
|
|
212
|
+
if not self._aligned_seqs:
|
|
213
|
+
return
|
|
214
|
+
|
|
215
|
+
msa_length = len(self._aligned_seqs[0])
|
|
216
|
+
|
|
217
|
+
for seq in self._aligned_seqs:
|
|
218
|
+
in_indel = False
|
|
219
|
+
start_index = -1
|
|
220
|
+
|
|
221
|
+
for i, char in enumerate(seq):
|
|
222
|
+
if char == '-' and not in_indel:
|
|
223
|
+
in_indel = True
|
|
224
|
+
start_index = i
|
|
225
|
+
elif char != '-' and in_indel:
|
|
226
|
+
# End of the indel, record it
|
|
227
|
+
end_index = i - 1
|
|
228
|
+
key = (start_index, end_index)
|
|
229
|
+
length = end_index - start_index + 1
|
|
230
|
+
|
|
231
|
+
if key not in self._unique_indel_map:
|
|
232
|
+
self._unique_indel_map[key] = [length, 0]
|
|
233
|
+
self._unique_indel_map[key][1] += 1
|
|
234
|
+
|
|
235
|
+
in_indel = False
|
|
236
|
+
start_index = -1
|
|
237
|
+
|
|
238
|
+
# Handle indel that goes to the end of the sequence
|
|
239
|
+
if in_indel:
|
|
240
|
+
end_index = msa_length - 1
|
|
241
|
+
key = (start_index, end_index)
|
|
242
|
+
length = end_index - start_index + 1
|
|
243
|
+
if key not in self._unique_indel_map:
|
|
244
|
+
self._unique_indel_map[key] = [length, 0]
|
|
245
|
+
self._unique_indel_map[key][1] += 1
|
|
246
|
+
|
|
247
|
+
def _set_values_of_indel_summ_stats(self) -> None:
|
|
248
|
+
"""
|
|
249
|
+
Calculates indel-related summary statistics by processing the _unique_indel_map.
|
|
250
|
+
"""
|
|
251
|
+
self._fill_unique_gaps_map()
|
|
252
|
+
|
|
253
|
+
total_gap_chars = 0
|
|
254
|
+
total_unique_gap_chars = 0
|
|
255
|
+
|
|
256
|
+
for (length, count) in self._unique_indel_map.values():
|
|
257
|
+
self._total_number_of_indels += count
|
|
258
|
+
self._total_number_of_unique_indels += 1
|
|
259
|
+
total_gap_chars += length * count
|
|
260
|
+
total_unique_gap_chars += length
|
|
261
|
+
|
|
262
|
+
if length == 1:
|
|
263
|
+
self._number_of_indels_of_length_one += count
|
|
264
|
+
if count == 1:
|
|
265
|
+
self._number_of_indels_of_length_one_in_one_position += 1
|
|
266
|
+
if count == 2:
|
|
267
|
+
self._number_of_indels_of_length_one_in_two_positions += 1
|
|
268
|
+
if count == self._number_of_sequences - 1:
|
|
269
|
+
self._number_of_indels_of_length_one_in_n_minus_1_positions += 1
|
|
270
|
+
elif length == 2:
|
|
271
|
+
self._number_of_indels_of_length_two += count
|
|
272
|
+
if count == 1:
|
|
273
|
+
self._number_of_indels_of_length_two_in_one_position += 1
|
|
274
|
+
if count == 2:
|
|
275
|
+
self._number_of_indels_of_length_two_in_two_positions += 1
|
|
276
|
+
if count == self._number_of_sequences - 1:
|
|
277
|
+
self._number_of_indels_of_length_two_in_n_minus_1_positions += 1
|
|
278
|
+
elif length == 3:
|
|
279
|
+
self._number_of_indels_of_length_three += count
|
|
280
|
+
if count == 1:
|
|
281
|
+
self._number_of_indels_of_length_three_in_one_position += 1
|
|
282
|
+
if count == 2:
|
|
283
|
+
self._number_of_indels_of_length_three_in_two_positions += 1
|
|
284
|
+
if count == self._number_of_sequences - 1:
|
|
285
|
+
self._number_of_indels_of_length_three_in_n_minus_1_positions += 1
|
|
286
|
+
else: # length >= 4
|
|
287
|
+
self._number_of_indels_of_length_at_least_four += count
|
|
288
|
+
if count == 1:
|
|
289
|
+
self._number_of_indels_of_length_at_least_four_in_one_position += 1
|
|
290
|
+
if count == 2:
|
|
291
|
+
self._number_of_indels_of_length_at_least_four_in_two_positions += 1
|
|
292
|
+
if count == self._number_of_sequences - 1:
|
|
293
|
+
self._number_of_indels_of_length_at_least_four_in_n_minus_1_positions += 1
|
|
294
|
+
|
|
295
|
+
if self._total_number_of_indels > 0:
|
|
296
|
+
self._ave_indel_length = total_gap_chars / self._total_number_of_indels
|
|
297
|
+
if self._total_number_of_unique_indels > 0:
|
|
298
|
+
self._ave_unique_indel_length = total_unique_gap_chars / self._total_number_of_unique_indels
|
|
299
|
+
|
|
300
|
+
def _set_longest_and_shortest_sequence_lengths(self) -> None:
|
|
301
|
+
"""Calculates the longest and shortest ungapped sequence lengths."""
|
|
302
|
+
if not self._original_aligned_seqs:
|
|
303
|
+
self._longest_seq_length = 0
|
|
304
|
+
self._shortest_seq_length = 0
|
|
305
|
+
return
|
|
306
|
+
|
|
307
|
+
seq_lengths = [len(s.replace('-', '')) for s in self._original_aligned_seqs]
|
|
308
|
+
self._longest_seq_length = max(seq_lengths)
|
|
309
|
+
self._shortest_seq_length = min(seq_lengths)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def recompute_stats(self) -> None:
|
|
313
|
+
"""
|
|
314
|
+
The main public method to run the full statistical analysis.
|
|
315
|
+
This orchestrates the calls to the internal calculation methods in the correct order.
|
|
316
|
+
"""
|
|
317
|
+
self._initialize_all_variables()
|
|
318
|
+
self._trim_msa_from_all_indel_position_and_get_summary_statistics_from_indel_counter()
|
|
319
|
+
self._set_values_of_indel_summ_stats()
|
|
320
|
+
self._set_longest_and_shortest_sequence_lengths()
|
|
321
|
+
|
|
322
|
+
def get_stat_by_type(self, stat_type: StatType) -> float:
|
|
323
|
+
"""
|
|
324
|
+
Returns the value of a specific statistic.
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
stat_type: The enum member representing the statistic to retrieve.
|
|
328
|
+
|
|
329
|
+
Returns:
|
|
330
|
+
The calculated value of the statistic as a float.
|
|
331
|
+
"""
|
|
332
|
+
stat_map = {
|
|
333
|
+
StatType.AVG_GAP_SIZE: self.average_indel_size,
|
|
334
|
+
StatType.MSA_LEN: self.msa_length,
|
|
335
|
+
StatType.LONGEST_UNALIGNED_SEQ: self.msa_longest_seq_length,
|
|
336
|
+
StatType.SHORTEST_UNALIGNED_SEQ: self.msa_shortest_seq_length,
|
|
337
|
+
StatType.TOT_NUM_GAPS: self.total_number_of_indels,
|
|
338
|
+
StatType.NUM_GAPS_LEN_ONE: self.number_of_indels_of_length_one,
|
|
339
|
+
StatType.NUM_GAPS_LEN_TWO: self.number_of_indels_of_length_two,
|
|
340
|
+
StatType.NUM_GAPS_LEN_THREE: self.number_of_indels_of_length_three,
|
|
341
|
+
StatType.NUM_GAPS_LEN_AT_LEAST_FOUR: self.number_of_indels_of_length_at_least_four,
|
|
342
|
+
StatType.AVG_UNIQUE_GAP_SIZE: self.average_unique_indel_size,
|
|
343
|
+
StatType.TOT_NUM_UNIQUE_GAPS: self.total_number_of_unique_indels,
|
|
344
|
+
StatType.NUM_GAPS_LEN_ONE_IN_ONE_SEQ: self.number_of_indels_of_length_one_in_one_position,
|
|
345
|
+
StatType.NUM_GAPS_LEN_ONE_IN_TWO_SEQS: self.number_of_indels_of_length_one_in_two_positions,
|
|
346
|
+
StatType.NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE: self.number_of_indels_of_length_one_in_n_minus_1_positions,
|
|
347
|
+
StatType.NUM_GAPS_LEN_TWO_IN_ONE_SEQ: self.number_of_indels_of_length_two_in_one_position,
|
|
348
|
+
StatType.NUM_GAPS_LEN_TWO_IN_TWO_SEQS: self.number_of_indels_of_length_two_in_two_positions,
|
|
349
|
+
StatType.NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE: self.number_of_indels_of_length_two_in_n_minus_1_positions,
|
|
350
|
+
StatType.NUM_GAPS_LEN_THREE_IN_ONE_SEQ: self.number_of_indels_of_length_three_in_one_position,
|
|
351
|
+
StatType.NUM_GAPS_LEN_THREE_IN_TWO_SEQS: self.number_of_indels_of_length_three_in_two_positions,
|
|
352
|
+
StatType.NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE: self.number_of_indels_of_length_three_in_n_minus_1_positions,
|
|
353
|
+
StatType.NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ: self.number_of_indels_of_length_at_least_four_in_one_position,
|
|
354
|
+
StatType.NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS: self.number_of_indels_of_length_at_least_four_in_two_positions,
|
|
355
|
+
StatType.NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE: self.number_of_indels_of_length_at_least_four_in_n_minus_1_positions,
|
|
356
|
+
StatType.MSA_POSITION_WITH_0_GAPS: self.number_of_msa_position_with_0_gaps,
|
|
357
|
+
StatType.MSA_POSITION_WITH_1_GAPS: self.number_of_msa_position_with_1_gaps,
|
|
358
|
+
StatType.MSA_POSITION_WITH_2_GAPS: self.number_of_msa_position_with_2_gaps,
|
|
359
|
+
StatType.MSA_POSITION_WITH_N_MINUS_1_GAPS: self.number_of_msa_position_with_n_minus_1_gaps,
|
|
360
|
+
}
|
|
361
|
+
return float(stat_map.get(stat_type, -1.0))
|
|
362
|
+
|
|
363
|
+
def get_stat_vec(self) -> List[float]:
|
|
364
|
+
"""
|
|
365
|
+
Returns a list of all summary statistics in a predefined order, matching
|
|
366
|
+
the C++ implementation's `getStatVec` method.
|
|
367
|
+
"""
|
|
368
|
+
return [self.get_stat_by_type(stat) for stat in StatType]
|
|
369
|
+
|
|
370
|
+
def __str__(self) -> str:
|
|
371
|
+
"""Returns a string representation of the original aligned MSA."""
|
|
372
|
+
return "\n".join(self._original_aligned_seqs)
|
|
373
|
+
|
|
374
|
+
# Public properties for accessing statistics
|
|
375
|
+
@property
|
|
376
|
+
def msa_length(self) -> int:
|
|
377
|
+
return self._msa_length
|
|
378
|
+
|
|
379
|
+
@property
|
|
380
|
+
def number_of_sequences(self) -> int:
|
|
381
|
+
return self._number_of_sequences
|
|
382
|
+
|
|
383
|
+
@property
|
|
384
|
+
def total_number_of_indels(self) -> int:
|
|
385
|
+
return self._total_number_of_indels
|
|
386
|
+
|
|
387
|
+
@property
|
|
388
|
+
def total_number_of_unique_indels(self) -> int:
|
|
389
|
+
return self._total_number_of_unique_indels
|
|
390
|
+
|
|
391
|
+
@property
|
|
392
|
+
def number_of_indels_of_length_one(self) -> int:
|
|
393
|
+
return self._number_of_indels_of_length_one
|
|
394
|
+
|
|
395
|
+
@property
|
|
396
|
+
def number_of_indels_of_length_two(self) -> int:
|
|
397
|
+
return self._number_of_indels_of_length_two
|
|
398
|
+
|
|
399
|
+
@property
|
|
400
|
+
def number_of_indels_of_length_three(self) -> int:
|
|
401
|
+
return self._number_of_indels_of_length_three
|
|
402
|
+
|
|
403
|
+
@property
|
|
404
|
+
def number_of_indels_of_length_at_least_four(self) -> int:
|
|
405
|
+
return self._number_of_indels_of_length_at_least_four
|
|
406
|
+
|
|
407
|
+
@property
|
|
408
|
+
def average_indel_size(self) -> float:
|
|
409
|
+
return self._ave_indel_length
|
|
410
|
+
|
|
411
|
+
@property
|
|
412
|
+
def average_unique_indel_size(self) -> float:
|
|
413
|
+
return self._ave_unique_indel_length
|
|
414
|
+
|
|
415
|
+
@property
|
|
416
|
+
def msa_longest_seq_length(self) -> int:
|
|
417
|
+
return self._longest_seq_length
|
|
418
|
+
|
|
419
|
+
@property
|
|
420
|
+
def msa_shortest_seq_length(self) -> int:
|
|
421
|
+
return self._shortest_seq_length
|
|
422
|
+
|
|
423
|
+
@property
|
|
424
|
+
def number_of_indels_of_length_one_in_one_position(self) -> int:
|
|
425
|
+
return self._number_of_indels_of_length_one_in_one_position
|
|
426
|
+
|
|
427
|
+
@property
|
|
428
|
+
def number_of_indels_of_length_one_in_two_positions(self) -> int:
|
|
429
|
+
return self._number_of_indels_of_length_one_in_two_positions
|
|
430
|
+
|
|
431
|
+
@property
|
|
432
|
+
def number_of_indels_of_length_one_in_n_minus_1_positions(self) -> int:
|
|
433
|
+
return self._number_of_indels_of_length_one_in_n_minus_1_positions
|
|
434
|
+
|
|
435
|
+
@property
|
|
436
|
+
def number_of_indels_of_length_two_in_one_position(self) -> int:
|
|
437
|
+
return self._number_of_indels_of_length_two_in_one_position
|
|
438
|
+
|
|
439
|
+
@property
|
|
440
|
+
def number_of_indels_of_length_two_in_two_positions(self) -> int:
|
|
441
|
+
return self._number_of_indels_of_length_two_in_two_positions
|
|
442
|
+
|
|
443
|
+
@property
|
|
444
|
+
def number_of_indels_of_length_two_in_n_minus_1_positions(self) -> int:
|
|
445
|
+
return self._number_of_indels_of_length_two_in_n_minus_1_positions
|
|
446
|
+
|
|
447
|
+
@property
|
|
448
|
+
def number_of_indels_of_length_three_in_one_position(self) -> int:
|
|
449
|
+
return self._number_of_indels_of_length_three_in_one_position
|
|
450
|
+
|
|
451
|
+
@property
|
|
452
|
+
def number_of_indels_of_length_three_in_two_positions(self) -> int:
|
|
453
|
+
return self._number_of_indels_of_length_three_in_two_positions
|
|
454
|
+
|
|
455
|
+
@property
|
|
456
|
+
def number_of_indels_of_length_three_in_n_minus_1_positions(self) -> int:
|
|
457
|
+
return self._number_of_indels_of_length_three_in_n_minus_1_positions
|
|
458
|
+
|
|
459
|
+
@property
|
|
460
|
+
def number_of_indels_of_length_at_least_four_in_one_position(self) -> int:
|
|
461
|
+
return self._number_of_indels_of_length_at_least_four_in_one_position
|
|
462
|
+
|
|
463
|
+
@property
|
|
464
|
+
def number_of_indels_of_length_at_least_four_in_two_positions(self) -> int:
|
|
465
|
+
return self._number_of_indels_of_length_at_least_four_in_two_positions
|
|
466
|
+
|
|
467
|
+
@property
|
|
468
|
+
def number_of_indels_of_length_at_least_four_in_n_minus_1_positions(self) -> int:
|
|
469
|
+
return self._number_of_indels_of_length_at_least_four_in_n_minus_1_positions
|
|
470
|
+
|
|
471
|
+
@property
|
|
472
|
+
def number_of_msa_position_with_0_gaps(self) -> int:
|
|
473
|
+
return self._number_of_msa_position_with_0_gaps
|
|
474
|
+
|
|
475
|
+
@property
|
|
476
|
+
def number_of_msa_position_with_1_gaps(self) -> int:
|
|
477
|
+
return self._number_of_msa_position_with_1_gaps
|
|
478
|
+
|
|
479
|
+
@property
|
|
480
|
+
def number_of_msa_position_with_2_gaps(self) -> int:
|
|
481
|
+
return self._number_of_msa_position_with_2_gaps
|
|
482
|
+
|
|
483
|
+
@property
|
|
484
|
+
def number_of_msa_position_with_n_minus_1_gaps(self) -> int:
|
|
485
|
+
return self._number_of_msa_position_with_n_minus_1_gaps
|
msastats.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# ABOUTME: Drop-in API module for msastats (import as `msastats`).
|
|
2
|
+
# ABOUTME: Provides calculate_msa_stats, calculate_fasta_stats, and stats_names.
|
|
3
|
+
"""Pure-Python replacement for the `msastats` extension module.
|
|
4
|
+
|
|
5
|
+
This module implements the minimal public API used by MSACompare:
|
|
6
|
+
- calculate_msa_stats(msa: list[str]) -> list[float]
|
|
7
|
+
- calculate_fasta_stats(path: str) -> list[float]
|
|
8
|
+
- stats_names() -> list[str]
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import List, Sequence, Union
|
|
16
|
+
|
|
17
|
+
from msa_stats_calculator import MsaStatsCalculator, MsaStatsError, StatType
|
|
18
|
+
|
|
19
|
+
# Public API of the drop-in module: the re-exported calculator types first,
# then the module-level convenience functions.
__all__ = [
    "MsaStatsCalculator",
    "MsaStatsError",
    "StatType",
    "calculate_fasta_stats",
    "calculate_msa_stats",
    "stats_names",
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Alias for the filesystem path arguments accepted by this module:
# a plain string or any os.PathLike yielding a string path.
PathLike = Union[str, os.PathLike[str]]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def calculate_msa_stats(msa: Sequence[str]) -> List[float]:
    """Return the summary-statistics vector for an in-memory MSA.

    Args:
        msa: The aligned sequences (e.g. a list or tuple of strings); it is
            copied into a list before being handed to the calculator.

    Returns:
        The statistics as floats, in the order given by ``stats_names()``.
    """
    stats_calculator = MsaStatsCalculator(list(msa))
    stats_calculator.recompute_stats()
    return stats_calculator.get_stat_vec()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def calculate_fasta_stats(fasta_path: PathLike) -> List[float]:
    """Return the summary-statistics vector for an aligned FASTA file.

    Args:
        fasta_path: Path to a FASTA file containing an aligned MSA.

    Returns:
        The statistics as floats, in the order given by ``stats_names()``.
    """
    alignment_file = Path(fasta_path)
    stats_calculator = MsaStatsCalculator.from_fasta(alignment_file)
    stats_calculator.recompute_stats()
    return stats_calculator.get_stat_vec()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def stats_names() -> List[str]:
    """Return the statistic names, ordered like the `calculate_*_stats` vectors."""
    names: List[str] = []
    for member in StatType:
        names.append(member.name)
    return names
|
|
49
|
+
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: PyMSAStats
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pure-Python MSA summary statistics
|
|
5
|
+
Author: Naiel J
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/naielj/PyMSAStats
|
|
8
|
+
Project-URL: Repository, https://github.com/naielj/PyMSAStats
|
|
9
|
+
Project-URL: Issues, https://github.com/naielj/PyMSAStats/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# PyMSAStats (pure-Python `msastats`)
|
|
24
|
+
|
|
25
|
+
A pure-Python implementation of the `msastats` API used by MSACompare for computing
|
|
26
|
+
Multiple Sequence Alignment (MSA) summary statistics.
|
|
27
|
+
|
|
28
|
+
The distribution is published as `PyMSAStats` (`pip install pymsastats`) but is imported as `msastats`:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import msastats
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
You can use either the high-level `msastats` functions (drop-in API), or the
|
|
37
|
+
lower-level `MsaStatsCalculator`.
|
|
38
|
+
|
|
39
|
+
### Installation
|
|
40
|
+
|
|
41
|
+
Install from PyPI:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install pymsastats
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Or with `uv`:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv add pymsastats
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
For development (editable install from source):
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
git clone https://github.com/naielj/PyMSAStats.git
|
|
57
|
+
cd PyMSAStats
|
|
58
|
+
uv sync
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Drop-in API (Recommended)
|
|
62
|
+
|
|
63
|
+
From a list of aligned sequences:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import msastats
|
|
67
|
+
|
|
68
|
+
stats = msastats.calculate_msa_stats(["AA-A", "AA-A", "A--A"])
|
|
69
|
+
names = msastats.stats_names()
|
|
70
|
+
|
|
71
|
+
as_dict = dict(zip(names, stats))
|
|
72
|
+
print(as_dict["AVG_GAP_SIZE"])
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
From an aligned FASTA file:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
import msastats
|
|
79
|
+
|
|
80
|
+
stats = msastats.calculate_fasta_stats("path/to/alignment.fasta")
|
|
81
|
+
print(stats)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Calculator API
|
|
85
|
+
|
|
86
|
+
From a list of aligned sequences:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from msastats import MsaStatsCalculator, StatType
|
|
90
|
+
|
|
91
|
+
# 1. Initialize with a list of aligned sequences
|
|
92
|
+
sequences = [
|
|
93
|
+
"AC--GT",
|
|
94
|
+
"ACGTGT",
|
|
95
|
+
"AC-TGT",
|
|
96
|
+
]
|
|
97
|
+
calculator = MsaStatsCalculator(sequences)
|
|
98
|
+
|
|
99
|
+
# 2. Compute the statistics
|
|
100
|
+
calculator.recompute_stats()
|
|
101
|
+
|
|
102
|
+
# 3. Access the statistics
|
|
103
|
+
print(f"MSA Length: {calculator.msa_length}")
|
|
104
|
+
print(f"Number of Sequences: {calculator.number_of_sequences}")
|
|
105
|
+
print(f"Total Gaps: {calculator.total_number_of_indels}")
|
|
106
|
+
|
|
107
|
+
# Or access stats by type
|
|
108
|
+
avg_gap_size = calculator.get_stat_by_type(StatType.AVG_GAP_SIZE)
|
|
109
|
+
print(f"Average Gap Size: {avg_gap_size:.2f}")
|
|
110
|
+
|
|
111
|
+
# Get all stats as a vector
|
|
112
|
+
stats_vector = calculator.get_stat_vec()
|
|
113
|
+
print(f"Stats Vector: {stats_vector}")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
From an aligned FASTA file:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from pathlib import Path
|
|
120
|
+
from msastats import MsaStatsCalculator, StatType
|
|
121
|
+
|
|
122
|
+
# 1. Create a dummy FASTA file
|
|
123
|
+
fasta_content = """>seq1
|
|
124
|
+
AC--GT--
|
|
125
|
+
>seq2
|
|
126
|
+
ACGTGT--
|
|
127
|
+
>seq3
|
|
128
|
+
AC-TGTAC
|
|
129
|
+
"""
|
|
130
|
+
fasta_path = Path("dummy.fasta")
|
|
131
|
+
fasta_path.write_text(fasta_content)
|
|
132
|
+
|
|
133
|
+
# 2. Initialize from the FASTA file
|
|
134
|
+
calculator = MsaStatsCalculator.from_fasta(fasta_path)
|
|
135
|
+
|
|
136
|
+
# 3. Compute the statistics
|
|
137
|
+
calculator.recompute_stats()
|
|
138
|
+
|
|
139
|
+
# 4. Access the statistics
|
|
140
|
+
print(f"MSA Length: {calculator.msa_length}")
|
|
141
|
+
print(f"Longest Sequence: {calculator.msa_longest_seq_length}")
|
|
142
|
+
print(f"Shortest Sequence: {calculator.msa_shortest_seq_length}")
|
|
143
|
+
print(f"Total Number of Gaps: {calculator.total_number_of_indels}")
|
|
144
|
+
|
|
145
|
+
# Clean up the dummy file
|
|
146
|
+
fasta_path.unlink()
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Statistics reference (27 metrics)
|
|
150
|
+
|
|
151
|
+
The 27 summary statistics implemented here are defined in:
|
|
152
|
+
|
|
153
|
+
> Wygoda E, Loewenthal G, Moshe A, Alburquerque M, Mayrose I, Pupko T. Statistical framework to determine indel-length distribution. *Bioinformatics*. 2024;40(2):btae043. <https://doi.org/10.1093/bioinformatics/btae043>
|
|
154
|
+
|
|
155
|
+
```bibtex
|
|
156
|
+
@article{wygoda2024indel,
|
|
157
|
+
author = {Wygoda, Elya and Loewenthal, Gil and Moshe, Asher and Alburquerque, Michael and Mayrose, Itay and Pupko, Tal},
|
|
158
|
+
title = {Statistical framework to determine indel-length distribution},
|
|
159
|
+
journal = {Bioinformatics},
|
|
160
|
+
volume = {40},
|
|
161
|
+
number = {2},
|
|
162
|
+
pages = {btae043},
|
|
163
|
+
year = {2024},
|
|
164
|
+
doi = {10.1093/bioinformatics/btae043}
|
|
165
|
+
}
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
For an illustrated reference with a worked example showing all 27 metric values on a single MSA, see
|
|
169
|
+
[docs/summary_statistics_reference.md](docs/summary_statistics_reference.md).
|
|
170
|
+
|
|
171
|
+
### Terminology
|
|
172
|
+
|
|
173
|
+
- **Gap character:** the implementation treats `-` as a gap.
|
|
174
|
+
- **All-gap column:** a column where *all* sequences have `-` at that position.
|
|
175
|
+
- **Gap run / indel (what “gap” means in most metrics):** a maximal contiguous run of `-` in a single sequence.
|
|
176
|
+
- **All-gap trimming (important):** before detecting gap runs, the algorithm removes all-gap columns. This matches the
|
|
177
|
+
original C++ implementation and prevents all-gap columns from creating or lengthening gap runs.
|
|
178
|
+
- **Unique gap interval:** gap runs are grouped by their `(start, end)` coordinates *in the trimmed alignment*. If
|
|
179
|
+
multiple sequences have a gap run with the same `(start, end)`, that is **one** unique gap interval with:
|
|
180
|
+
- `length = end - start + 1`
|
|
181
|
+
- `count = number of sequences that have that exact interval`
|
|
182
|
+
|
|
183
|
+
### Returned order
|
|
184
|
+
|
|
185
|
+
`calculate_msa_stats` / `calculate_fasta_stats` return a list of 27 floats in the same order as
|
|
186
|
+
`msastats.stats_names()` (and the `StatType` enum).
|
|
187
|
+
|
|
188
|
+
### Metric definitions
|
|
189
|
+
|
|
190
|
+
#### Alignment and sequence lengths
|
|
191
|
+
|
|
192
|
+
- `MSA_LEN` (MSACompare: `LINE_LENGTH`): alignment length in columns (includes all-gap columns).
|
|
193
|
+
- `LONGEST_UNALIGNED_SEQ` (MSACompare: `LONGEST_UNALIGNED_SEQ_LENGTH`): max ungapped sequence length across sequences
|
|
194
|
+
(`len(seq.replace('-', ''))`).
|
|
195
|
+
- `SHORTEST_UNALIGNED_SEQ` (MSACompare: `SHORTEST_UNALIGNED_SEQ_LENGTH`): min ungapped sequence length across sequences.
|
|
196
|
+
|
|
197
|
+
#### Gap-run totals (after all-gap trimming)
|
|
198
|
+
|
|
199
|
+
Let the set of unique gap intervals be `U`, and for each `u ∈ U`, let `u.length` be its length and `u.count` be how
|
|
200
|
+
many sequences contain that interval.
|
|
201
|
+
|
|
202
|
+
- `TOT_NUM_GAPS` (MSACompare: `TOTAL_GAPS`): total number of gap runs across sequences:
|
|
203
|
+
`Σ_u u.count`
|
|
204
|
+
- `AVG_GAP_SIZE` (MSACompare: `AVG_LENGTH_OF_GAPS`): mean gap-run length across all sequences:
|
|
205
|
+
`(Σ_u u.length · u.count) / TOT_NUM_GAPS` (0 if `TOT_NUM_GAPS == 0`)
|
|
206
|
+
- `NUM_GAPS_LEN_ONE` (MSACompare: `GAPS_OF_LENGTH_ONE`): `Σ_{u.length==1} u.count`
|
|
207
|
+
- `NUM_GAPS_LEN_TWO` (MSACompare: `GAPS_OF_LENGTH_TWO`): `Σ_{u.length==2} u.count`
|
|
208
|
+
- `NUM_GAPS_LEN_THREE` (MSACompare: `GAPS_OF_LENGTH_THREE`): `Σ_{u.length==3} u.count`
|
|
209
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR` (MSACompare: `GAPS_LARGER_THAN_THREE`): `Σ_{u.length>=4} u.count`
|
|
210
|
+
|
|
211
|
+
#### Unique gap intervals (after all-gap trimming)
|
|
212
|
+
|
|
213
|
+
- `TOT_NUM_UNIQUE_GAPS` (MSACompare: `TOTAL_UNIQUE_GAPS`): number of unique gap intervals: `|U|`
|
|
214
|
+
- `AVG_UNIQUE_GAP_SIZE` (MSACompare: `AVG_SIZE_OF_UNIQUE_GAPS`): mean unique-gap length:
|
|
215
|
+
`(Σ_u u.length) / TOT_NUM_UNIQUE_GAPS` (0 if `TOT_NUM_UNIQUE_GAPS == 0`)
|
|
216
|
+
|
|
217
|
+
#### Unique gap intervals shared by k sequences (after all-gap trimming)
|
|
218
|
+
|
|
219
|
+
These count **unique** gap intervals (not total occurrences). For a given interval length bucket, they count how many
|
|
220
|
+
intervals have `u.count == k`, where `k` is 1, 2, or `N-1` and `N` is the number of sequences in the alignment.
|
|
221
|
+
|
|
222
|
+
Length 1:
|
|
223
|
+
- `NUM_GAPS_LEN_ONE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_ONE_ONE_SEQ`): `|{u: u.length==1 and u.count==1}|`
|
|
224
|
+
- `NUM_GAPS_LEN_ONE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_ONE_TWO_SEQ`): `|{u: u.length==1 and u.count==2}|`
|
|
225
|
+
- `NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_ONE_EXCEPT_ONE`):
|
|
226
|
+
`|{u: u.length==1 and u.count==N-1}|`
|
|
227
|
+
|
|
228
|
+
Length 2:
|
|
229
|
+
- `NUM_GAPS_LEN_TWO_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_TWO_ONE_SEQ`): `|{u: u.length==2 and u.count==1}|`
|
|
230
|
+
- `NUM_GAPS_LEN_TWO_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_TWO_TWO_SEQ`): `|{u: u.length==2 and u.count==2}|`
|
|
231
|
+
- `NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_TWO_EXCEPT_ONE`):
|
|
232
|
+
`|{u: u.length==2 and u.count==N-1}|`
|
|
233
|
+
|
|
234
|
+
Length 3:
|
|
235
|
+
- `NUM_GAPS_LEN_THREE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_THREE_ONE_SEQ`): `|{u: u.length==3 and u.count==1}|`
|
|
236
|
+
- `NUM_GAPS_LEN_THREE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_THREE_TWO_SEQ`): `|{u: u.length==3 and u.count==2}|`
|
|
237
|
+
- `NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_THREE_EXCEPT_ONE`):
|
|
238
|
+
`|{u: u.length==3 and u.count==N-1}|`
|
|
239
|
+
|
|
240
|
+
Length ≥ 4:
|
|
241
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ` (MSACompare: `GAPS_LARGER_THAN_THREE_ONE_SEQ`):
|
|
242
|
+
`|{u: u.length>=4 and u.count==1}|`
|
|
243
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS` (MSACompare: `GAPS_LARGER_THAN_THREE_TWO_SEQ`):
|
|
244
|
+
`|{u: u.length>=4 and u.count==2}|`
|
|
245
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LARGER_THAN_THREE_EXCEPT_ONE`):
|
|
246
|
+
`|{u: u.length>=4 and u.count==N-1}|`
|
|
247
|
+
|
|
248
|
+
Important edge-case note: to match the C++ reference, the "`== 1` / `== 2` / `== N-1`" checks are independent.
|
|
249
|
+
So for small `N`, some categories overlap:
|
|
250
|
+
- If `N == 2`, then `N-1 == 1`, so "`IN_ONE_SEQ`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
|
|
251
|
+
- If `N == 3`, then `N-1 == 2`, so "`IN_TWO_SEQS`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
|
|
252
|
+
|
|
253
|
+
#### Column-wise gap counts (computed on the original alignment, before all-gap trimming)
|
|
254
|
+
|
|
255
|
+
These count alignment columns based on how many sequences have a `-` in that column:
|
|
256
|
+
|
|
257
|
+
- `MSA_POSITION_WITH_0_GAPS` (MSACompare: `NO_GAP_COLUMNS`): number of columns with exactly 0 gaps
|
|
258
|
+
- `MSA_POSITION_WITH_1_GAPS` (MSACompare: `ONE_GAP_COLUMNS`): number of columns with exactly 1 gap
|
|
259
|
+
- `MSA_POSITION_WITH_2_GAPS` (MSACompare: `TWO_GAP_COLUMNS`): number of columns with exactly 2 gaps
|
|
260
|
+
- `MSA_POSITION_WITH_N_MINUS_1_GAPS` (MSACompare: `ONE_GAP_EXCEPT_ONE_COLUMN`): number of columns with exactly `N-1` gaps
|
|
261
|
+
|
|
262
|
+
Notes:
|
|
263
|
+
- Columns with `N` gaps (all-gap columns) are **not** counted in any of these buckets.
|
|
264
|
+
- For `N == 3` (where `N-1 == 2`), the `N-1` bucket is always 0 because columns with 2 gaps are already counted in
|
|
265
|
+
`MSA_POSITION_WITH_2_GAPS` (this matches the C++ reference behavior).
|
|
266
|
+
|
|
267
|
+
### Examples
|
|
268
|
+
|
|
269
|
+
Helper to get a readable dict:
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
import msastats
|
|
273
|
+
|
|
274
|
+
def stats_dict(msa):
|
|
275
|
+
return dict(zip(msastats.stats_names(), msastats.calculate_msa_stats(msa)))
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
#### Example 1: One gap interval shared by 2 of 3 sequences
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
msa = ["A--A", "A--A", "AAAA"] # N=3, L=4
|
|
282
|
+
stats = stats_dict(msa)
|
|
283
|
+
|
|
284
|
+
assert stats["MSA_LEN"] == 4.0
|
|
285
|
+
assert stats["LONGEST_UNALIGNED_SEQ"] == 4.0
|
|
286
|
+
assert stats["SHORTEST_UNALIGNED_SEQ"] == 2.0
|
|
287
|
+
|
|
288
|
+
# One unique gap interval of length 2, appearing in 2 sequences:
|
|
289
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
|
|
290
|
+
assert stats["TOT_NUM_GAPS"] == 2.0
|
|
291
|
+
assert stats["AVG_GAP_SIZE"] == 2.0
|
|
292
|
+
assert stats["NUM_GAPS_LEN_TWO"] == 2.0
|
|
293
|
+
|
|
294
|
+
# Column-wise gap counts (original alignment):
|
|
295
|
+
assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
|
|
296
|
+
assert stats["MSA_POSITION_WITH_2_GAPS"] == 2.0
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
#### Example 2: All-gap columns are ignored for gap-run detection
|
|
300
|
+
|
|
301
|
+
```python
|
|
302
|
+
msa = ["A-A", "A-A", "A-A"] # middle column is all gaps
|
|
303
|
+
stats = stats_dict(msa)
|
|
304
|
+
|
|
305
|
+
assert stats["MSA_LEN"] == 3.0
|
|
306
|
+
|
|
307
|
+
# The all-gap column is removed before gap runs are detected, so there are no gaps:
|
|
308
|
+
assert stats["TOT_NUM_GAPS"] == 0.0
|
|
309
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 0.0
|
|
310
|
+
assert stats["AVG_GAP_SIZE"] == 0.0
|
|
311
|
+
|
|
312
|
+
# Column-wise counts use the original alignment: both gap-free columns are counted; the all-gap column falls in no bucket:
|
|
313
|
+
assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
#### Example 3: “total gap runs” vs “unique gap intervals”
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
msa = ["A-A", "A-A", "AAA", "AAA"] # one gap interval shared by 2 sequences
|
|
320
|
+
stats = stats_dict(msa)
|
|
321
|
+
|
|
322
|
+
# Total occurrences (one per sequence that has it):
|
|
323
|
+
assert stats["NUM_GAPS_LEN_ONE"] == 2.0
|
|
324
|
+
|
|
325
|
+
# Unique intervals are counted once:
|
|
326
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
|
|
327
|
+
assert stats["NUM_GAPS_LEN_ONE_IN_TWO_SEQS"] == 1.0
|
|
328
|
+
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
msa_stats_calculator.py,sha256=vh4ykrP8pK1fF-czYG04S5XezIBbLIofhIkZpCmBcgM,20912
|
|
2
|
+
msastats.py,sha256=4mteH4JbIPgxSkBkiGIE1VhS6lsv44SF6DwZ68wC9vA,1488
|
|
3
|
+
pymsastats-0.1.0.dist-info/licenses/LICENSE,sha256=huhwDNhNzr24NkyPX5pH_F4BiS_cYVrZwyXB_ZynjqE,1071
|
|
4
|
+
pymsastats-0.1.0.dist-info/METADATA,sha256=sT7hWXuBoctX715iU1PZYFRqNarbD9sH0DtU1skXT1g,11526
|
|
5
|
+
pymsastats-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
6
|
+
pymsastats-0.1.0.dist-info/top_level.txt,sha256=Per3ZNEbyffbfTUtyfzXEWzt6eT2PBORZ2Mp8QrLoEQ,30
|
|
7
|
+
pymsastats-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Naiel Jabareen
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|