PyMSAStats 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,485 @@
1
+ # ABOUTME: Core implementation of MSA summary statistics calculator.
2
+ # ABOUTME: Computes 27 metrics matching the original C++ msastats behavior.
3
+ """MSA Statistics Calculator.
4
+
5
+ A Python implementation of the MsaStatsCalculator for computing MSA summary statistics.
6
+ """
7
+ import enum
8
+ from pathlib import Path
9
+ from typing import List, Dict, Tuple, Union
10
+
11
+
12
class MsaStatsError(Exception):
    """Root of the exception hierarchy raised by the MSA statistics package.

    Raised, for example, when a FASTA file cannot be read or holds no sequences.
    """
15
+
16
@enum.unique
class StatType(enum.Enum):
    """Identifiers for every summary statistic the calculator can report.

    Declaration order is significant: iterating the enum yields the order in
    which statistics are returned as a vector.
    """

    # --- Gap-run totals and alignment / sequence lengths ---
    AVG_GAP_SIZE = enum.auto()
    MSA_LEN = enum.auto()
    LONGEST_UNALIGNED_SEQ = enum.auto()
    SHORTEST_UNALIGNED_SEQ = enum.auto()
    TOT_NUM_GAPS = enum.auto()
    NUM_GAPS_LEN_ONE = enum.auto()
    NUM_GAPS_LEN_TWO = enum.auto()
    NUM_GAPS_LEN_THREE = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR = enum.auto()

    # --- Unique gap intervals ---
    AVG_UNIQUE_GAP_SIZE = enum.auto()
    TOT_NUM_UNIQUE_GAPS = enum.auto()

    # --- Unique gap intervals of length 1, by number of sequences sharing them ---
    NUM_GAPS_LEN_ONE_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_ONE_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE = enum.auto()

    # --- Length 2 ---
    NUM_GAPS_LEN_TWO_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_TWO_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE = enum.auto()

    # --- Length 3 ---
    NUM_GAPS_LEN_THREE_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_THREE_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE = enum.auto()

    # --- Length >= 4 ---
    NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS = enum.auto()
    NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE = enum.auto()

    # --- Column-wise gap counts ---
    MSA_POSITION_WITH_0_GAPS = enum.auto()
    MSA_POSITION_WITH_1_GAPS = enum.auto()
    MSA_POSITION_WITH_2_GAPS = enum.auto()
    MSA_POSITION_WITH_N_MINUS_1_GAPS = enum.auto()
45
+
46
+
47
class MsaStatsCalculator:
    """
    Calculates summary statistics for a Multiple Sequence Alignment (MSA).

    The algorithms and data structures are designed to mirror the original
    C++ implementation. Typical use::

        calculator = MsaStatsCalculator(["AC--GT", "ACGTGT", "AC-TGT"])
        calculator.recompute_stats()
        calculator.total_number_of_indels

    Apart from ``msa_length`` and ``number_of_sequences`` (set by the
    constructor), every statistic reads as 0 until ``recompute_stats()`` has
    been called.
    """

    # StatType member name -> name of the property exposing that statistic.
    # Keyed by member *name* so the table is built once per class instead of
    # once per lookup, and so it does not need the enum at class-definition time.
    _STAT_PROPERTY_BY_NAME: Dict[str, str] = {
        "AVG_GAP_SIZE": "average_indel_size",
        "MSA_LEN": "msa_length",
        "LONGEST_UNALIGNED_SEQ": "msa_longest_seq_length",
        "SHORTEST_UNALIGNED_SEQ": "msa_shortest_seq_length",
        "TOT_NUM_GAPS": "total_number_of_indels",
        "NUM_GAPS_LEN_ONE": "number_of_indels_of_length_one",
        "NUM_GAPS_LEN_TWO": "number_of_indels_of_length_two",
        "NUM_GAPS_LEN_THREE": "number_of_indels_of_length_three",
        "NUM_GAPS_LEN_AT_LEAST_FOUR": "number_of_indels_of_length_at_least_four",
        "AVG_UNIQUE_GAP_SIZE": "average_unique_indel_size",
        "TOT_NUM_UNIQUE_GAPS": "total_number_of_unique_indels",
        "NUM_GAPS_LEN_ONE_IN_ONE_SEQ": "number_of_indels_of_length_one_in_one_position",
        "NUM_GAPS_LEN_ONE_IN_TWO_SEQS": "number_of_indels_of_length_one_in_two_positions",
        "NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE": "number_of_indels_of_length_one_in_n_minus_1_positions",
        "NUM_GAPS_LEN_TWO_IN_ONE_SEQ": "number_of_indels_of_length_two_in_one_position",
        "NUM_GAPS_LEN_TWO_IN_TWO_SEQS": "number_of_indels_of_length_two_in_two_positions",
        "NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE": "number_of_indels_of_length_two_in_n_minus_1_positions",
        "NUM_GAPS_LEN_THREE_IN_ONE_SEQ": "number_of_indels_of_length_three_in_one_position",
        "NUM_GAPS_LEN_THREE_IN_TWO_SEQS": "number_of_indels_of_length_three_in_two_positions",
        "NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE": "number_of_indels_of_length_three_in_n_minus_1_positions",
        "NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ": "number_of_indels_of_length_at_least_four_in_one_position",
        "NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS": "number_of_indels_of_length_at_least_four_in_two_positions",
        "NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE": "number_of_indels_of_length_at_least_four_in_n_minus_1_positions",
        "MSA_POSITION_WITH_0_GAPS": "number_of_msa_position_with_0_gaps",
        "MSA_POSITION_WITH_1_GAPS": "number_of_msa_position_with_1_gaps",
        "MSA_POSITION_WITH_2_GAPS": "number_of_msa_position_with_2_gaps",
        "MSA_POSITION_WITH_N_MINUS_1_GAPS": "number_of_msa_position_with_n_minus_1_gaps",
    }

    def __init__(self, msa_sequences: List[str]):
        """Initializes the calculator with an MSA provided as a list of strings.

        Args:
            msa_sequences: Aligned sequences, one string per sequence. Any
                iterable of strings is accepted; it is consumed exactly once.

        Raises:
            ValueError: If input is empty, contains non-strings, has empty sequences,
                or sequences have mismatched lengths.
        """
        # Materialise first: the input is validated in several passes, and a
        # one-shot iterator would otherwise be exhausted by the first pass.
        try:
            sequences: Tuple[str, ...] = tuple(msa_sequences)
        except TypeError:
            raise ValueError("Input must be a non-empty list of sequences.") from None
        if not sequences:
            raise ValueError("Input must be a non-empty list of sequences.")

        for i, seq in enumerate(sequences):
            if not isinstance(seq, str):
                raise ValueError(
                    f"Sequence at index {i} is not a string (got {type(seq).__name__})."
                )

        self._original_aligned_seqs: Tuple[str, ...] = sequences
        self._number_of_sequences: int = len(sequences)

        # Only the first sequence needs an explicit emptiness check; any other
        # empty sequence is caught by the length comparison below.
        if not sequences[0]:
            raise ValueError("Sequences cannot be empty.")

        self._msa_length: int = len(sequences[0])
        for i, seq in enumerate(sequences[1:], start=1):
            if len(seq) != self._msa_length:
                raise ValueError(
                    f"Sequence length mismatch: sequence 0 has length {self._msa_length}, "
                    f"but sequence {i} has length {len(seq)}."
                )

        self._initialize_all_variables()

    @classmethod
    def from_fasta(cls, fasta_path: Union[str, Path]) -> 'MsaStatsCalculator':
        """Creates an MsaStatsCalculator instance from a FASTA file.

        Lines starting with ``>`` delimit records; all other lines are stripped
        of surrounding whitespace and concatenated into the current sequence.
        A header that is followed by no sequence data contributes no sequence.

        Args:
            fasta_path: The path to the FASTA file.

        Returns:
            An instance of MsaStatsCalculator.

        Raises:
            MsaStatsError: If the file cannot be read or contains no sequences.
            ValueError: If the parsed sequences are invalid (empty, mismatched lengths).
        """
        fasta_path = Path(fasta_path)
        sequences: List[str] = []
        chunks: List[str] = []  # pieces of the record currently being read
        try:
            with open(fasta_path, 'r') as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if line.startswith('>'):
                        if chunks:
                            sequences.append("".join(chunks))
                            chunks = []
                    elif line:
                        chunks.append(line)
        except FileNotFoundError as e:
            raise MsaStatsError(f"FASTA file not found: {fasta_path}") from e
        except PermissionError as e:
            raise MsaStatsError(f"Permission denied reading FASTA file: {fasta_path}") from e
        except IsADirectoryError as e:
            raise MsaStatsError(f"Path is a directory, not a file: {fasta_path}") from e
        except OSError as e:
            raise MsaStatsError(f"Failed to read FASTA file '{fasta_path}': {e}") from e

        # Flush the final record (the file does not end with a header line).
        if chunks:
            sequences.append("".join(chunks))

        if not sequences:
            raise MsaStatsError(f"No sequences found in FASTA file: {fasta_path}")

        return cls(sequences)

    def _initialize_all_variables(self) -> None:
        """Resets all internal statistics and working data structures."""
        # Working copy of the alignment; all-gap columns are removed from it later.
        self._aligned_seqs: List[str] = list(self._original_aligned_seqs)

        # Core data structures
        # (start, end) column interval -> [interval length, number of sequences having it]
        self._unique_indel_map: Dict[Tuple[int, int], List[int]] = {}
        # Number of '-' characters in each column of the original alignment
        self._indel_counter: List[int] = []

        # Final statistics attributes
        self._ave_indel_length: float = 0.0
        self._total_number_of_indels: int = 0
        self._longest_seq_length: int = 0
        self._shortest_seq_length: int = 0

        self._number_of_indels_of_length_one: int = 0
        self._number_of_indels_of_length_two: int = 0
        self._number_of_indels_of_length_three: int = 0
        self._number_of_indels_of_length_at_least_four: int = 0

        self._number_of_indels_of_length_one_in_one_position: int = 0
        self._number_of_indels_of_length_one_in_two_positions: int = 0
        self._number_of_indels_of_length_one_in_n_minus_1_positions: int = 0

        self._number_of_indels_of_length_two_in_one_position: int = 0
        self._number_of_indels_of_length_two_in_two_positions: int = 0
        self._number_of_indels_of_length_two_in_n_minus_1_positions: int = 0

        self._number_of_indels_of_length_three_in_one_position: int = 0
        self._number_of_indels_of_length_three_in_two_positions: int = 0
        self._number_of_indels_of_length_three_in_n_minus_1_positions: int = 0

        self._number_of_indels_of_length_at_least_four_in_one_position: int = 0
        self._number_of_indels_of_length_at_least_four_in_two_positions: int = 0
        self._number_of_indels_of_length_at_least_four_in_n_minus_1_positions: int = 0

        self._number_of_msa_position_with_0_gaps: int = 0
        self._number_of_msa_position_with_1_gaps: int = 0
        self._number_of_msa_position_with_2_gaps: int = 0
        self._number_of_msa_position_with_n_minus_1_gaps: int = 0

        # Unique indels summary statistics
        self._ave_unique_indel_length: float = 0.0
        self._total_number_of_unique_indels: int = 0

    def _trim_msa_from_all_indel_position_and_get_summary_statistics_from_indel_counter(self) -> None:
        """
        Counts gaps per column, calculates column-based stats, and trims all-gap columns.

        The column statistics are computed on the original alignment. The
        trimmed alignment is stored in self._aligned_seqs.
        """
        if not self._original_aligned_seqs:
            return

        sequence_count = self._number_of_sequences

        # zip(*rows) walks the alignment column by column.
        self._indel_counter = [column.count('-') for column in zip(*self._original_aligned_seqs)]

        for count in self._indel_counter:
            # Deliberately an if/elif chain: a column is counted in at most one
            # bucket, so the N-1 bucket only receives columns when N-1 > 2.
            if count == 0:
                self._number_of_msa_position_with_0_gaps += 1
            elif count == 1:
                self._number_of_msa_position_with_1_gaps += 1
            elif count == 2:
                self._number_of_msa_position_with_2_gaps += 1
            elif count == sequence_count - 1:
                self._number_of_msa_position_with_n_minus_1_gaps += 1

        # Keep every column in which at least one sequence has a residue.
        cols_to_keep = [i for i, count in enumerate(self._indel_counter) if count < sequence_count]

        if len(cols_to_keep) == self._msa_length:
            # Nothing to trim; avoid rebuilding every sequence character by character.
            self._aligned_seqs = list(self._original_aligned_seqs)
        else:
            self._aligned_seqs = [
                ''.join(seq[i] for i in cols_to_keep) for seq in self._original_aligned_seqs
            ]

    def _record_indel(self, start_index: int, end_index: int) -> None:
        """Registers one occurrence of the gap run spanning [start_index, end_index]."""
        entry = self._unique_indel_map.setdefault(
            (start_index, end_index), [end_index - start_index + 1, 0]
        )
        entry[1] += 1

    def _fill_unique_gaps_map(self) -> None:
        """
        Scans the (trimmed) aligned sequences for maximal runs of '-' and
        populates _unique_indel_map, keyed by each run's (start, end) columns.
        """
        self._unique_indel_map.clear()
        if not self._aligned_seqs:
            return

        for seq in self._aligned_seqs:
            run_start = -1  # -1 means "not currently inside a gap run"

            for i, char in enumerate(seq):
                if char == '-':
                    if run_start < 0:
                        run_start = i
                elif run_start >= 0:
                    # First residue after a gap run closes it.
                    self._record_indel(run_start, i - 1)
                    run_start = -1

            # A gap run that extends to the end of the sequence.
            if run_start >= 0:
                self._record_indel(run_start, len(seq) - 1)

    def _set_values_of_indel_summ_stats(self) -> None:
        """
        Calculates indel-related summary statistics by processing the _unique_indel_map.

        Results are added onto the current attribute values, which
        recompute_stats() resets to zero beforehand.
        """
        self._fill_unique_gaps_map()

        total_gap_chars = 0
        total_unique_gap_chars = 0
        all_but_one = self._number_of_sequences - 1

        # One slot per length bucket: index 0 -> length 1, 1 -> 2, 2 -> 3, 3 -> >= 4.
        occurrences = [0, 0, 0, 0]
        shared_by_one = [0, 0, 0, 0]
        shared_by_two = [0, 0, 0, 0]
        shared_by_all_but_one = [0, 0, 0, 0]

        for length, count in self._unique_indel_map.values():
            self._total_number_of_indels += count
            self._total_number_of_unique_indels += 1
            total_gap_chars += length * count
            total_unique_gap_chars += length

            bucket = min(length, 4) - 1
            occurrences[bucket] += count
            # Independent checks (not elif): for small N the categories
            # overlap, e.g. with N == 3 an interval shared by two sequences
            # is also shared by "all except one".
            if count == 1:
                shared_by_one[bucket] += 1
            if count == 2:
                shared_by_two[bucket] += 1
            if count == all_but_one:
                shared_by_all_but_one[bucket] += 1

        self._number_of_indels_of_length_one += occurrences[0]
        self._number_of_indels_of_length_two += occurrences[1]
        self._number_of_indels_of_length_three += occurrences[2]
        self._number_of_indels_of_length_at_least_four += occurrences[3]

        self._number_of_indels_of_length_one_in_one_position += shared_by_one[0]
        self._number_of_indels_of_length_two_in_one_position += shared_by_one[1]
        self._number_of_indels_of_length_three_in_one_position += shared_by_one[2]
        self._number_of_indels_of_length_at_least_four_in_one_position += shared_by_one[3]

        self._number_of_indels_of_length_one_in_two_positions += shared_by_two[0]
        self._number_of_indels_of_length_two_in_two_positions += shared_by_two[1]
        self._number_of_indels_of_length_three_in_two_positions += shared_by_two[2]
        self._number_of_indels_of_length_at_least_four_in_two_positions += shared_by_two[3]

        self._number_of_indels_of_length_one_in_n_minus_1_positions += shared_by_all_but_one[0]
        self._number_of_indels_of_length_two_in_n_minus_1_positions += shared_by_all_but_one[1]
        self._number_of_indels_of_length_three_in_n_minus_1_positions += shared_by_all_but_one[2]
        self._number_of_indels_of_length_at_least_four_in_n_minus_1_positions += shared_by_all_but_one[3]

        # Averages stay at 0.0 when there are no gap runs.
        if self._total_number_of_indels > 0:
            self._ave_indel_length = total_gap_chars / self._total_number_of_indels
        if self._total_number_of_unique_indels > 0:
            self._ave_unique_indel_length = total_unique_gap_chars / self._total_number_of_unique_indels

    def _set_longest_and_shortest_sequence_lengths(self) -> None:
        """Calculates the longest and shortest ungapped sequence lengths."""
        if not self._original_aligned_seqs:
            self._longest_seq_length = 0
            self._shortest_seq_length = 0
            return

        seq_lengths = [len(s) - s.count('-') for s in self._original_aligned_seqs]
        self._longest_seq_length = max(seq_lengths)
        self._shortest_seq_length = min(seq_lengths)

    def recompute_stats(self) -> None:
        """
        The main public method to run the full statistical analysis.

        Resets all statistics, then runs the internal calculation methods in
        the required order: column counting and trimming must precede gap-run
        detection, which works on the trimmed alignment.
        """
        self._initialize_all_variables()
        self._trim_msa_from_all_indel_position_and_get_summary_statistics_from_indel_counter()
        self._set_values_of_indel_summ_stats()
        self._set_longest_and_shortest_sequence_lengths()

    def get_stat_by_type(self, stat_type: "StatType") -> float:
        """
        Returns the value of a specific statistic.

        Args:
            stat_type: The enum member representing the statistic to retrieve.

        Returns:
            The calculated value of the statistic as a float, or -1.0 if
            stat_type is not a known StatType member.
        """
        if not isinstance(stat_type, StatType):
            return -1.0
        property_name = self._STAT_PROPERTY_BY_NAME.get(stat_type.name)
        if property_name is None:
            return -1.0
        return float(getattr(self, property_name))

    def get_stat_vec(self) -> List[float]:
        """
        Returns a list of all summary statistics in a predefined order, matching
        the C++ implementation's `getStatVec` method.

        The order is the declaration order of StatType.
        """
        return [self.get_stat_by_type(stat) for stat in StatType]

    def __str__(self) -> str:
        """Returns a string representation of the original aligned MSA."""
        return "\n".join(self._original_aligned_seqs)

    # Public properties for accessing statistics
    @property
    def msa_length(self) -> int:
        """Number of columns in the original alignment (all-gap columns included)."""
        return self._msa_length

    @property
    def number_of_sequences(self) -> int:
        """Number of sequences in the alignment."""
        return self._number_of_sequences

    @property
    def total_number_of_indels(self) -> int:
        """Total number of gap runs across all sequences."""
        return self._total_number_of_indels

    @property
    def total_number_of_unique_indels(self) -> int:
        """Number of distinct (start, end) gap intervals."""
        return self._total_number_of_unique_indels

    @property
    def number_of_indels_of_length_one(self) -> int:
        """Number of gap runs of length 1."""
        return self._number_of_indels_of_length_one

    @property
    def number_of_indels_of_length_two(self) -> int:
        """Number of gap runs of length 2."""
        return self._number_of_indels_of_length_two

    @property
    def number_of_indels_of_length_three(self) -> int:
        """Number of gap runs of length 3."""
        return self._number_of_indels_of_length_three

    @property
    def number_of_indels_of_length_at_least_four(self) -> int:
        """Number of gap runs of length 4 or more."""
        return self._number_of_indels_of_length_at_least_four

    @property
    def average_indel_size(self) -> float:
        """Mean gap-run length over all occurrences (0.0 if there are none)."""
        return self._ave_indel_length

    @property
    def average_unique_indel_size(self) -> float:
        """Mean length over distinct gap intervals (0.0 if there are none)."""
        return self._ave_unique_indel_length

    @property
    def msa_longest_seq_length(self) -> int:
        """Longest sequence length after removing gap characters."""
        return self._longest_seq_length

    @property
    def msa_shortest_seq_length(self) -> int:
        """Shortest sequence length after removing gap characters."""
        return self._shortest_seq_length

    @property
    def number_of_indels_of_length_one_in_one_position(self) -> int:
        """Distinct length-1 gap intervals found in exactly one sequence."""
        return self._number_of_indels_of_length_one_in_one_position

    @property
    def number_of_indels_of_length_one_in_two_positions(self) -> int:
        """Distinct length-1 gap intervals found in exactly two sequences."""
        return self._number_of_indels_of_length_one_in_two_positions

    @property
    def number_of_indels_of_length_one_in_n_minus_1_positions(self) -> int:
        """Distinct length-1 gap intervals found in exactly N-1 sequences."""
        return self._number_of_indels_of_length_one_in_n_minus_1_positions

    @property
    def number_of_indels_of_length_two_in_one_position(self) -> int:
        """Distinct length-2 gap intervals found in exactly one sequence."""
        return self._number_of_indels_of_length_two_in_one_position

    @property
    def number_of_indels_of_length_two_in_two_positions(self) -> int:
        """Distinct length-2 gap intervals found in exactly two sequences."""
        return self._number_of_indels_of_length_two_in_two_positions

    @property
    def number_of_indels_of_length_two_in_n_minus_1_positions(self) -> int:
        """Distinct length-2 gap intervals found in exactly N-1 sequences."""
        return self._number_of_indels_of_length_two_in_n_minus_1_positions

    @property
    def number_of_indels_of_length_three_in_one_position(self) -> int:
        """Distinct length-3 gap intervals found in exactly one sequence."""
        return self._number_of_indels_of_length_three_in_one_position

    @property
    def number_of_indels_of_length_three_in_two_positions(self) -> int:
        """Distinct length-3 gap intervals found in exactly two sequences."""
        return self._number_of_indels_of_length_three_in_two_positions

    @property
    def number_of_indels_of_length_three_in_n_minus_1_positions(self) -> int:
        """Distinct length-3 gap intervals found in exactly N-1 sequences."""
        return self._number_of_indels_of_length_three_in_n_minus_1_positions

    @property
    def number_of_indels_of_length_at_least_four_in_one_position(self) -> int:
        """Distinct gap intervals of length >= 4 found in exactly one sequence."""
        return self._number_of_indels_of_length_at_least_four_in_one_position

    @property
    def number_of_indels_of_length_at_least_four_in_two_positions(self) -> int:
        """Distinct gap intervals of length >= 4 found in exactly two sequences."""
        return self._number_of_indels_of_length_at_least_four_in_two_positions

    @property
    def number_of_indels_of_length_at_least_four_in_n_minus_1_positions(self) -> int:
        """Distinct gap intervals of length >= 4 found in exactly N-1 sequences."""
        return self._number_of_indels_of_length_at_least_four_in_n_minus_1_positions

    @property
    def number_of_msa_position_with_0_gaps(self) -> int:
        """Number of original-alignment columns containing no gaps."""
        return self._number_of_msa_position_with_0_gaps

    @property
    def number_of_msa_position_with_1_gaps(self) -> int:
        """Number of original-alignment columns containing exactly one gap."""
        return self._number_of_msa_position_with_1_gaps

    @property
    def number_of_msa_position_with_2_gaps(self) -> int:
        """Number of original-alignment columns containing exactly two gaps."""
        return self._number_of_msa_position_with_2_gaps

    @property
    def number_of_msa_position_with_n_minus_1_gaps(self) -> int:
        """Number of original-alignment columns with exactly N-1 gaps (only when N-1 > 2)."""
        return self._number_of_msa_position_with_n_minus_1_gaps
msastats.py ADDED
@@ -0,0 +1,49 @@
1
+ # ABOUTME: Drop-in API module for msastats (import as `msastats`).
2
+ # ABOUTME: Provides calculate_msa_stats, calculate_fasta_stats, and stats_names.
3
+ """Pure-Python replacement for the `msastats` extension module.
4
+
5
+ This module implements the minimal public API used by MSACompare:
6
+ - calculate_msa_stats(msa: list[str]) -> list[float]
7
+ - calculate_fasta_stats(path: str) -> list[float]
8
+ - stats_names() -> list[str]
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ from pathlib import Path
15
+ from typing import List, Sequence, Union
16
+
17
+ from msa_stats_calculator import MsaStatsCalculator, MsaStatsError, StatType
18
+
19
+ __all__ = [
20
+ "MsaStatsCalculator",
21
+ "MsaStatsError",
22
+ "StatType",
23
+ "calculate_fasta_stats",
24
+ "calculate_msa_stats",
25
+ "stats_names",
26
+ ]
27
+
28
+
29
+ PathLike = Union[str, os.PathLike[str]]
30
+
31
+
32
def calculate_msa_stats(msa: Sequence[str]) -> List[float]:
    """Return the full statistics vector for an in-memory alignment.

    ``msa`` is a list/tuple of aligned strings; the result follows the order
    of ``stats_names()``.
    """
    aligned_rows = list(msa)
    stats = MsaStatsCalculator(aligned_rows)
    stats.recompute_stats()
    return stats.get_stat_vec()
37
+
38
+
39
def calculate_fasta_stats(fasta_path: PathLike) -> List[float]:
    """Return the full statistics vector for an aligned MSA stored in a FASTA file.

    The result follows the order of ``stats_names()``.
    """
    source = Path(fasta_path)
    stats = MsaStatsCalculator.from_fasta(source)
    stats.recompute_stats()
    return stats.get_stat_vec()
44
+
45
+
46
def stats_names() -> List[str]:
    """List the statistic names, ordered exactly like the `calculate_*_stats` results."""
    return [member.name for member in StatType]
49
+
@@ -0,0 +1,328 @@
1
+ Metadata-Version: 2.4
2
+ Name: PyMSAStats
3
+ Version: 0.1.0
4
+ Summary: Pure-Python MSA summary statistics
5
+ Author: Naiel J
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/naielj/PyMSAStats
8
+ Project-URL: Repository, https://github.com/naielj/PyMSAStats
9
+ Project-URL: Issues, https://github.com/naielj/PyMSAStats/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Dynamic: license-file
22
+
23
+ # PyMSAStats (pure-Python `msastats`)
24
+
25
+ A pure-Python implementation of the `msastats` API used by MSACompare for computing
26
+ Multiple Sequence Alignment (MSA) summary statistics.
27
+
28
+ This package installs and imports as `msastats`:
29
+
30
+ ```python
31
+ import msastats
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ You can use either the high-level `msastats` functions (drop-in API), or the
37
+ lower-level `MsaStatsCalculator`.
38
+
39
+ ### Installation
40
+
41
+ Install from PyPI:
42
+
43
+ ```bash
44
+ pip install pymsastats
45
+ ```
46
+
47
+ Or with `uv`:
48
+
49
+ ```bash
50
+ uv add pymsastats
51
+ ```
52
+
53
+ For development (editable install from source):
54
+
55
+ ```bash
56
+ git clone https://github.com/naielj/PyMSAStats.git
57
+ cd PyMSAStats
58
+ uv sync
59
+ ```
60
+
61
+ ### Drop-in API (Recommended)
62
+
63
+ From a list of aligned sequences:
64
+
65
+ ```python
66
+ import msastats
67
+
68
+ stats = msastats.calculate_msa_stats(["AA-A", "AA-A", "A--A"])
69
+ names = msastats.stats_names()
70
+
71
+ as_dict = dict(zip(names, stats))
72
+ print(as_dict["AVG_GAP_SIZE"])
73
+ ```
74
+
75
+ From an aligned FASTA file:
76
+
77
+ ```python
78
+ import msastats
79
+
80
+ stats = msastats.calculate_fasta_stats("path/to/alignment.fasta")
81
+ print(stats)
82
+ ```
83
+
84
+ ### Calculator API
85
+
86
+ From a list of aligned sequences:
87
+
88
+ ```python
89
+ from msastats import MsaStatsCalculator, StatType
90
+
91
+ # 1. Initialize with a list of aligned sequences
92
+ sequences = [
93
+ "AC--GT",
94
+ "ACGTGT",
95
+ "AC-TGT",
96
+ ]
97
+ calculator = MsaStatsCalculator(sequences)
98
+
99
+ # 2. Compute the statistics (everything except msa_length and number_of_sequences reads 0 until this is called)
100
+ calculator.recompute_stats()
101
+
102
+ # 3. Access the statistics
103
+ print(f"MSA Length: {calculator.msa_length}")
104
+ print(f"Number of Sequences: {calculator.number_of_sequences}")
105
+ print(f"Total Gaps: {calculator.total_number_of_indels}")
106
+
107
+ # Or access stats by type
108
+ avg_gap_size = calculator.get_stat_by_type(StatType.AVG_GAP_SIZE)
109
+ print(f"Average Gap Size: {avg_gap_size:.2f}")
110
+
111
+ # Get all stats as a vector
112
+ stats_vector = calculator.get_stat_vec()
113
+ print(f"Stats Vector: {stats_vector}")
114
+ ```
115
+
116
+ From an aligned FASTA file:
117
+
118
+ ```python
119
+ from pathlib import Path
120
+ from msastats import MsaStatsCalculator, StatType
121
+
122
+ # 1. Create a dummy FASTA file
123
+ fasta_content = """>seq1
124
+ AC--GT--
125
+ >seq2
126
+ ACGTGT--
127
+ >seq3
128
+ AC-TGTAC
129
+ """
130
+ fasta_path = Path("dummy.fasta")
131
+ fasta_path.write_text(fasta_content)
132
+
133
+ # 2. Initialize from the FASTA file
134
+ calculator = MsaStatsCalculator.from_fasta(fasta_path)
135
+
136
+ # 3. Compute the statistics
137
+ calculator.recompute_stats()
138
+
139
+ # 4. Access the statistics
140
+ print(f"MSA Length: {calculator.msa_length}")
141
+ print(f"Longest Sequence: {calculator.msa_longest_seq_length}")
142
+ print(f"Shortest Sequence: {calculator.msa_shortest_seq_length}")
143
+ print(f"Total Number of Gaps: {calculator.total_number_of_indels}")
144
+
145
+ # Clean up the dummy file
146
+ fasta_path.unlink()
147
+ ```
148
+
149
+ ## Statistics reference (27 metrics)
150
+
151
+ The 27 summary statistics implemented here are defined in:
152
+
153
+ > Wygoda E, Loewenthal G, Moshe A, Alburquerque M, Mayrose I, Pupko T. Statistical framework to determine indel-length distribution. *Bioinformatics*. 2024;40(2):btae043. <https://doi.org/10.1093/bioinformatics/btae043>
154
+
155
+ ```bibtex
156
+ @article{wygoda2024indel,
157
+ author = {Wygoda, Elya and Loewenthal, Gil and Moshe, Asher and Alburquerque, Michael and Mayrose, Itay and Pupko, Tal},
158
+ title = {Statistical framework to determine indel-length distribution},
159
+ journal = {Bioinformatics},
160
+ volume = {40},
161
+ number = {2},
162
+ pages = {btae043},
163
+ year = {2024},
164
+ doi = {10.1093/bioinformatics/btae043}
165
+ }
166
+ ```
167
+
168
+ For an illustrated reference with a worked example showing all 27 metric values on a single MSA, see
169
+ [docs/summary_statistics_reference.md](docs/summary_statistics_reference.md).
170
+
171
+ ### Terminology
172
+
173
+ - **Gap character:** the implementation treats `-` as a gap.
174
+ - **All-gap column:** a column where *all* sequences have `-` at that position.
175
+ - **Gap run / indel (what “gap” means in most metrics):** a maximal contiguous run of `-` in a single sequence.
176
+ - **All-gap trimming (important):** before detecting gap runs, the algorithm removes all-gap columns. This matches the
177
+ original C++ implementation and prevents all-gap columns from splitting/creating gap runs.
178
+ - **Unique gap interval:** gap runs are grouped by their `(start, end)` coordinates *in the trimmed alignment*. If
179
+ multiple sequences have a gap run with the same `(start, end)`, that is **one** unique gap interval with:
180
+ - `length = end - start + 1`
181
+ - `count = number of sequences that have that exact interval`
182
+
183
+ ### Returned order
184
+
185
+ `calculate_msa_stats` / `calculate_fasta_stats` return a list of 27 floats in the same order as
186
+ `msastats.stats_names()` (and the `StatType` enum).
187
+
188
+ ### Metric definitions
189
+
190
+ #### Alignment and sequence lengths
191
+
192
+ - `MSA_LEN` (MSACompare: `LINE_LENGTH`): alignment length in columns (includes all-gap columns).
193
+ - `LONGEST_UNALIGNED_SEQ` (MSACompare: `LONGEST_UNALIGNED_SEQ_LENGTH`): max ungapped sequence length across sequences
194
+ (`len(seq.replace('-', ''))`).
195
+ - `SHORTEST_UNALIGNED_SEQ` (MSACompare: `SHORTEST_UNALIGNED_SEQ_LENGTH`): min ungapped sequence length across sequences.
196
+
197
+ #### Gap-run totals (after all-gap trimming)
198
+
199
+ Let the set of unique gap intervals be `U`, and for each `u ∈ U`, let `u.length` be its length and `u.count` be how
200
+ many sequences contain that interval.
201
+
202
+ - `TOT_NUM_GAPS` (MSACompare: `TOTAL_GAPS`): total number of gap runs across sequences:
203
+ `Σ_u u.count`
204
+ - `AVG_GAP_SIZE` (MSACompare: `AVG_LENGTH_OF_GAPS`): mean gap-run length across all sequences:
205
+ `(Σ_u u.length · u.count) / TOT_NUM_GAPS` (0 if `TOT_NUM_GAPS == 0`)
206
+ - `NUM_GAPS_LEN_ONE` (MSACompare: `GAPS_OF_LENGTH_ONE`): `Σ_{u.length==1} u.count`
207
+ - `NUM_GAPS_LEN_TWO` (MSACompare: `GAPS_OF_LENGTH_TWO`): `Σ_{u.length==2} u.count`
208
+ - `NUM_GAPS_LEN_THREE` (MSACompare: `GAPS_OF_LENGTH_THREE`): `Σ_{u.length==3} u.count`
209
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR` (MSACompare: `GAPS_LARGER_THAN_THREE`): `Σ_{u.length>=4} u.count`
210
+
211
+ #### Unique gap intervals (after all-gap trimming)
212
+
213
+ - `TOT_NUM_UNIQUE_GAPS` (MSACompare: `TOTAL_UNIQUE_GAPS`): number of unique gap intervals: `|U|`
214
+ - `AVG_UNIQUE_GAP_SIZE` (MSACompare: `AVG_SIZE_OF_UNIQUE_GAPS`): mean unique-gap length:
215
+ `(Σ_u u.length) / TOT_NUM_UNIQUE_GAPS` (0 if `TOT_NUM_UNIQUE_GAPS == 0`)
216
+
217
+ #### Unique gap intervals shared by k sequences (after all-gap trimming)
218
+
219
+ These count **unique** gap intervals (not total occurrences). For a given interval length bucket, they count how many
220
+ intervals have `u.count == k`.
221
+
222
+ Length 1:
223
+ - `NUM_GAPS_LEN_ONE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_ONE_ONE_SEQ`): `|{u: u.length==1 and u.count==1}|`
224
+ - `NUM_GAPS_LEN_ONE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_ONE_TWO_SEQ`): `|{u: u.length==1 and u.count==2}|`
225
+ - `NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_ONE_EXCEPT_ONE`):
226
+ `|{u: u.length==1 and u.count==N-1}|`
227
+
228
+ Length 2:
229
+ - `NUM_GAPS_LEN_TWO_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_TWO_ONE_SEQ`): `|{u: u.length==2 and u.count==1}|`
230
+ - `NUM_GAPS_LEN_TWO_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_TWO_TWO_SEQ`): `|{u: u.length==2 and u.count==2}|`
231
+ - `NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_TWO_EXCEPT_ONE`):
232
+ `|{u: u.length==2 and u.count==N-1}|`
233
+
234
+ Length 3:
235
+ - `NUM_GAPS_LEN_THREE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_THREE_ONE_SEQ`): `|{u: u.length==3 and u.count==1}|`
236
+ - `NUM_GAPS_LEN_THREE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_THREE_TWO_SEQ`): `|{u: u.length==3 and u.count==2}|`
237
+ - `NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_THREE_EXCEPT_ONE`):
238
+ `|{u: u.length==3 and u.count==N-1}|`
239
+
240
+ Length ≥ 4:
241
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ` (MSACompare: `GAPS_LARGER_THAN_THREE_ONE_SEQ`):
242
+ `|{u: u.length>=4 and u.count==1}|`
243
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS` (MSACompare: `GAPS_LARGER_THAN_THREE_TWO_SEQ`):
244
+ `|{u: u.length>=4 and u.count==2}|`
245
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LARGER_THAN_THREE_EXCEPT_ONE`):
246
+ `|{u: u.length>=4 and u.count==N-1}|`
247
+
248
+ Important edge-case note: to match the C++ reference, the "`== 1` / `== 2` / `== N-1`" checks are independent.
249
+ So for small `N`, some categories overlap:
250
+ - If `N == 2`, then `N-1 == 1`, so "`IN_ONE_SEQ`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
251
+ - If `N == 3`, then `N-1 == 2`, so "`IN_TWO_SEQS`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
252
+
253
+ #### Column-wise gap counts (computed on the original alignment, before all-gap trimming)
254
+
255
+ These count alignment columns based on how many sequences have a `-` in that column:
256
+
257
+ - `MSA_POSITION_WITH_0_GAPS` (MSACompare: `NO_GAP_COLUMNS`): number of columns with exactly 0 gaps
258
+ - `MSA_POSITION_WITH_1_GAPS` (MSACompare: `ONE_GAP_COLUMNS`): number of columns with exactly 1 gap
259
+ - `MSA_POSITION_WITH_2_GAPS` (MSACompare: `TWO_GAP_COLUMNS`): number of columns with exactly 2 gaps
260
+ - `MSA_POSITION_WITH_N_MINUS_1_GAPS` (MSACompare: `ONE_GAP_EXCEPT_ONE_COLUMN`): number of columns with exactly `N-1` gaps
261
+
262
+ Notes:
263
+ - Columns with `N` gaps (all-gap columns) are **not** counted in any of these buckets.
264
+ - For `N <= 3`, the `N-1` bucket is always 0 because a column with `N-1` gaps (that is, 0, 1, or 2 gaps) is already
265
+ counted in the matching `MSA_POSITION_WITH_0_GAPS` / `_1_GAPS` / `_2_GAPS` bucket (this matches the C++ reference behavior).
266
+
267
+ ### Examples
268
+
269
+ Helper to get a readable dict:
270
+
271
+ ```python
272
+ import msastats
273
+
274
+ def stats_dict(msa):
275
+ return dict(zip(msastats.stats_names(), msastats.calculate_msa_stats(msa)))
276
+ ```
277
+
278
+ #### Example 1: One gap interval shared by 2 of 3 sequences
279
+
280
+ ```python
281
+ msa = ["A--A", "A--A", "AAAA"] # N=3, L=4
282
+ stats = stats_dict(msa)
283
+
284
+ assert stats["MSA_LEN"] == 4.0
285
+ assert stats["LONGEST_UNALIGNED_SEQ"] == 4.0
286
+ assert stats["SHORTEST_UNALIGNED_SEQ"] == 2.0
287
+
288
+ # One unique gap interval of length 2, appearing in 2 sequences:
289
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
290
+ assert stats["TOT_NUM_GAPS"] == 2.0
291
+ assert stats["AVG_GAP_SIZE"] == 2.0
292
+ assert stats["NUM_GAPS_LEN_TWO"] == 2.0
293
+
294
+ # Column-wise gap counts (original alignment):
295
+ assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
296
+ assert stats["MSA_POSITION_WITH_2_GAPS"] == 2.0
297
+ ```
298
+
299
+ #### Example 2: All-gap columns are ignored for gap-run detection
300
+
301
+ ```python
302
+ msa = ["A-A", "A-A", "A-A"] # middle column is all gaps
303
+ stats = stats_dict(msa)
304
+
305
+ assert stats["MSA_LEN"] == 3.0
306
+
307
+ # The all-gap column is removed before gap runs are detected, so there are no gaps:
308
+ assert stats["TOT_NUM_GAPS"] == 0.0
309
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 0.0
310
+ assert stats["AVG_GAP_SIZE"] == 0.0
311
+
312
+ # But column-wise counts still “see” the original alignment columns:
313
+ assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
314
+ ```
315
+
316
+ #### Example 3: “total gap runs” vs “unique gap intervals”
317
+
318
+ ```python
319
+ msa = ["A-A", "A-A", "AAA", "AAA"] # one gap interval shared by 2 sequences
320
+ stats = stats_dict(msa)
321
+
322
+ # Total occurrences (one per sequence that has it):
323
+ assert stats["NUM_GAPS_LEN_ONE"] == 2.0
324
+
325
+ # Unique intervals are counted once:
326
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
327
+ assert stats["NUM_GAPS_LEN_ONE_IN_TWO_SEQS"] == 1.0
328
+ ```
@@ -0,0 +1,7 @@
1
+ msa_stats_calculator.py,sha256=vh4ykrP8pK1fF-czYG04S5XezIBbLIofhIkZpCmBcgM,20912
2
+ msastats.py,sha256=4mteH4JbIPgxSkBkiGIE1VhS6lsv44SF6DwZ68wC9vA,1488
3
+ pymsastats-0.1.0.dist-info/licenses/LICENSE,sha256=huhwDNhNzr24NkyPX5pH_F4BiS_cYVrZwyXB_ZynjqE,1071
4
+ pymsastats-0.1.0.dist-info/METADATA,sha256=sT7hWXuBoctX715iU1PZYFRqNarbD9sH0DtU1skXT1g,11526
5
+ pymsastats-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
6
+ pymsastats-0.1.0.dist-info/top_level.txt,sha256=Per3ZNEbyffbfTUtyfzXEWzt6eT2PBORZ2Mp8QrLoEQ,30
7
+ pymsastats-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Naiel Jabareen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ msa_stats_calculator
2
+ msastats