PyMSAStats 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymsastats-0.1.0/LICENSE +21 -0
- pymsastats-0.1.0/PKG-INFO +328 -0
- pymsastats-0.1.0/README.md +306 -0
- pymsastats-0.1.0/pyproject.toml +45 -0
- pymsastats-0.1.0/setup.cfg +4 -0
- pymsastats-0.1.0/src/PyMSAStats.egg-info/PKG-INFO +328 -0
- pymsastats-0.1.0/src/PyMSAStats.egg-info/SOURCES.txt +13 -0
- pymsastats-0.1.0/src/PyMSAStats.egg-info/dependency_links.txt +1 -0
- pymsastats-0.1.0/src/PyMSAStats.egg-info/top_level.txt +2 -0
- pymsastats-0.1.0/src/msa_stats_calculator.py +485 -0
- pymsastats-0.1.0/src/msastats.py +49 -0
- pymsastats-0.1.0/tests/test_additional_msas.py +286 -0
- pymsastats-0.1.0/tests/test_msa_stats_calculator.py +163 -0
- pymsastats-0.1.0/tests/test_msastats_api.py +30 -0
- pymsastats-0.1.0/tests/test_summary_statistics_reference.py +67 -0
pymsastats-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Naiel Jabareen
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: PyMSAStats
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pure-Python MSA summary statistics
|
|
5
|
+
Author: Naiel J
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/naielj/PyMSAStats
|
|
8
|
+
Project-URL: Repository, https://github.com/naielj/PyMSAStats
|
|
9
|
+
Project-URL: Issues, https://github.com/naielj/PyMSAStats/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# PyMSAStats (pure-Python `msastats`)
|
|
24
|
+
|
|
25
|
+
A pure-Python implementation of the `msastats` API used by MSACompare for computing
|
|
26
|
+
Multiple Sequence Alignment (MSA) summary statistics.
|
|
27
|
+
|
|
28
|
+
This package is distributed on PyPI as `pymsastats` and is imported as `msastats`:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import msastats
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
You can use either the high-level `msastats` functions (drop-in API), or the
|
|
37
|
+
lower-level `MsaStatsCalculator`.
|
|
38
|
+
|
|
39
|
+
### Installation
|
|
40
|
+
|
|
41
|
+
Install from PyPI:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install pymsastats
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Or with `uv`:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv add pymsastats
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
For development (editable install from source):
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
git clone https://github.com/naielj/PyMSAStats.git
|
|
57
|
+
cd PyMSAStats
|
|
58
|
+
uv sync
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Drop-in API (Recommended)
|
|
62
|
+
|
|
63
|
+
From a list of aligned sequences:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import msastats
|
|
67
|
+
|
|
68
|
+
stats = msastats.calculate_msa_stats(["AA-A", "AA-A", "A--A"])
|
|
69
|
+
names = msastats.stats_names()
|
|
70
|
+
|
|
71
|
+
as_dict = dict(zip(names, stats))
|
|
72
|
+
print(as_dict["AVG_GAP_SIZE"])
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
From an aligned FASTA file:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
import msastats
|
|
79
|
+
|
|
80
|
+
stats = msastats.calculate_fasta_stats("path/to/alignment.fasta")
|
|
81
|
+
print(stats)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Calculator API
|
|
85
|
+
|
|
86
|
+
From a list of aligned sequences:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from msastats import MsaStatsCalculator, StatType
|
|
90
|
+
|
|
91
|
+
# 1. Initialize with a list of aligned sequences
|
|
92
|
+
sequences = [
|
|
93
|
+
"AC--GT",
|
|
94
|
+
"ACGTGT",
|
|
95
|
+
"AC-TGT",
|
|
96
|
+
]
|
|
97
|
+
calculator = MsaStatsCalculator(sequences)
|
|
98
|
+
|
|
99
|
+
# 2. Compute the statistics
|
|
100
|
+
calculator.recompute_stats()
|
|
101
|
+
|
|
102
|
+
# 3. Access the statistics
|
|
103
|
+
print(f"MSA Length: {calculator.msa_length}")
|
|
104
|
+
print(f"Number of Sequences: {calculator.number_of_sequences}")
|
|
105
|
+
print(f"Total Gaps: {calculator.total_number_of_indels}")
|
|
106
|
+
|
|
107
|
+
# Or access stats by type
|
|
108
|
+
avg_gap_size = calculator.get_stat_by_type(StatType.AVG_GAP_SIZE)
|
|
109
|
+
print(f"Average Gap Size: {avg_gap_size:.2f}")
|
|
110
|
+
|
|
111
|
+
# Get all stats as a vector
|
|
112
|
+
stats_vector = calculator.get_stat_vec()
|
|
113
|
+
print(f"Stats Vector: {stats_vector}")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
From an aligned FASTA file:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from pathlib import Path
|
|
120
|
+
from msastats import MsaStatsCalculator, StatType
|
|
121
|
+
|
|
122
|
+
# 1. Create a dummy FASTA file
|
|
123
|
+
fasta_content = """>seq1
|
|
124
|
+
AC--GT--
|
|
125
|
+
>seq2
|
|
126
|
+
ACGTGT--
|
|
127
|
+
>seq3
|
|
128
|
+
AC-TGTAC
|
|
129
|
+
"""
|
|
130
|
+
fasta_path = Path("dummy.fasta")
|
|
131
|
+
fasta_path.write_text(fasta_content)
|
|
132
|
+
|
|
133
|
+
# 2. Initialize from the FASTA file
|
|
134
|
+
calculator = MsaStatsCalculator.from_fasta(fasta_path)
|
|
135
|
+
|
|
136
|
+
# 3. Compute the statistics
|
|
137
|
+
calculator.recompute_stats()
|
|
138
|
+
|
|
139
|
+
# 4. Access the statistics
|
|
140
|
+
print(f"MSA Length: {calculator.msa_length}")
|
|
141
|
+
print(f"Longest Sequence: {calculator.msa_longest_seq_length}")
|
|
142
|
+
print(f"Shortest Sequence: {calculator.msa_shortest_seq_length}")
|
|
143
|
+
print(f"Total Number of Gaps: {calculator.total_number_of_indels}")
|
|
144
|
+
|
|
145
|
+
# Clean up the dummy file
|
|
146
|
+
fasta_path.unlink()
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Statistics reference (27 metrics)
|
|
150
|
+
|
|
151
|
+
The 27 summary statistics implemented here are defined in:
|
|
152
|
+
|
|
153
|
+
> Wygoda E, Loewenthal G, Moshe A, Alburquerque M, Mayrose I, Pupko T. Statistical framework to determine indel-length distribution. *Bioinformatics*. 2024;40(2):btae043. <https://doi.org/10.1093/bioinformatics/btae043>
|
|
154
|
+
|
|
155
|
+
```bibtex
|
|
156
|
+
@article{wygoda2024indel,
|
|
157
|
+
author = {Wygoda, Elya and Loewenthal, Gil and Moshe, Asher and Alburquerque, Michael and Mayrose, Itay and Pupko, Tal},
|
|
158
|
+
title = {Statistical framework to determine indel-length distribution},
|
|
159
|
+
journal = {Bioinformatics},
|
|
160
|
+
volume = {40},
|
|
161
|
+
number = {2},
|
|
162
|
+
pages = {btae043},
|
|
163
|
+
year = {2024},
|
|
164
|
+
doi = {10.1093/bioinformatics/btae043}
|
|
165
|
+
}
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
For an illustrated reference with a worked example showing all 27 metric values on a single MSA, see
|
|
169
|
+
[docs/summary_statistics_reference.md](https://github.com/naielj/PyMSAStats/blob/HEAD/docs/summary_statistics_reference.md).
|
|
170
|
+
|
|
171
|
+
### Terminology
|
|
172
|
+
|
|
173
|
+
- **Gap character:** the implementation treats `-` as a gap.
|
|
174
|
+
- **All-gap column:** a column where *all* sequences have `-` at that position.
|
|
175
|
+
- **Gap run / indel (what “gap” means in most metrics):** a maximal contiguous run of `-` in a single sequence.
|
|
176
|
+
- **All-gap trimming (important):** before detecting gap runs, the algorithm removes all-gap columns. This matches the
|
|
177
|
+
original C++ implementation and prevents all-gap columns from splitting/creating gap runs.
|
|
178
|
+
- **Unique gap interval:** gap runs are grouped by their `(start, end)` coordinates *in the trimmed alignment*. If
|
|
179
|
+
multiple sequences have a gap run with the same `(start, end)`, that is **one** unique gap interval with:
|
|
180
|
+
- `length = end - start + 1`
|
|
181
|
+
- `count = number of sequences that have that exact interval`
|
|
182
|
+
|
|
183
|
+
### Returned order
|
|
184
|
+
|
|
185
|
+
`calculate_msa_stats` / `calculate_fasta_stats` return a list of 27 floats in the same order as
|
|
186
|
+
`msastats.stats_names()` (and the `StatType` enum).
|
|
187
|
+
|
|
188
|
+
### Metric definitions
|
|
189
|
+
|
|
190
|
+
#### Alignment and sequence lengths
|
|
191
|
+
|
|
192
|
+
- `MSA_LEN` (MSACompare: `LINE_LENGTH`): alignment length in columns (includes all-gap columns).
|
|
193
|
+
- `LONGEST_UNALIGNED_SEQ` (MSACompare: `LONGEST_UNALIGNED_SEQ_LENGTH`): max ungapped sequence length across sequences
|
|
194
|
+
(`len(seq.replace('-', ''))`).
|
|
195
|
+
- `SHORTEST_UNALIGNED_SEQ` (MSACompare: `SHORTEST_UNALIGNED_SEQ_LENGTH`): min ungapped sequence length across sequences.
|
|
196
|
+
|
|
197
|
+
#### Gap-run totals (after all-gap trimming)
|
|
198
|
+
|
|
199
|
+
Let the set of unique gap intervals be `U`, and for each `u ∈ U`, let `u.length` be its length and `u.count` be how
|
|
200
|
+
many sequences contain that interval.
|
|
201
|
+
|
|
202
|
+
- `TOT_NUM_GAPS` (MSACompare: `TOTAL_GAPS`): total number of gap runs across sequences:
|
|
203
|
+
`Σ_u u.count`
|
|
204
|
+
- `AVG_GAP_SIZE` (MSACompare: `AVG_LENGTH_OF_GAPS`): mean gap-run length across all sequences:
|
|
205
|
+
`(Σ_u u.length · u.count) / TOT_NUM_GAPS` (0 if `TOT_NUM_GAPS == 0`)
|
|
206
|
+
- `NUM_GAPS_LEN_ONE` (MSACompare: `GAPS_OF_LENGTH_ONE`): `Σ_{u.length==1} u.count`
|
|
207
|
+
- `NUM_GAPS_LEN_TWO` (MSACompare: `GAPS_OF_LENGTH_TWO`): `Σ_{u.length==2} u.count`
|
|
208
|
+
- `NUM_GAPS_LEN_THREE` (MSACompare: `GAPS_OF_LENGTH_THREE`): `Σ_{u.length==3} u.count`
|
|
209
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR` (MSACompare: `GAPS_LARGER_THAN_THREE`): `Σ_{u.length>=4} u.count`
|
|
210
|
+
|
|
211
|
+
#### Unique gap intervals (after all-gap trimming)
|
|
212
|
+
|
|
213
|
+
- `TOT_NUM_UNIQUE_GAPS` (MSACompare: `TOTAL_UNIQUE_GAPS`): number of unique gap intervals: `|U|`
|
|
214
|
+
- `AVG_UNIQUE_GAP_SIZE` (MSACompare: `AVG_SIZE_OF_UNIQUE_GAPS`): mean unique-gap length:
|
|
215
|
+
`(Σ_u u.length) / TOT_NUM_UNIQUE_GAPS` (0 if `TOT_NUM_UNIQUE_GAPS == 0`)
|
|
216
|
+
|
|
217
|
+
#### Unique gap intervals shared by k sequences (after all-gap trimming)
|
|
218
|
+
|
|
219
|
+
These count **unique** gap intervals (not total occurrences). For a given interval length bucket, they count how many
|
|
220
|
+
intervals have `u.count == k`.
|
|
221
|
+
|
|
222
|
+
Length 1:
|
|
223
|
+
- `NUM_GAPS_LEN_ONE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_ONE_ONE_SEQ`): `|{u: u.length==1 and u.count==1}|`
|
|
224
|
+
- `NUM_GAPS_LEN_ONE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_ONE_TWO_SEQ`): `|{u: u.length==1 and u.count==2}|`
|
|
225
|
+
- `NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_ONE_EXCEPT_ONE`):
|
|
226
|
+
`|{u: u.length==1 and u.count==N-1}|`
|
|
227
|
+
|
|
228
|
+
Length 2:
|
|
229
|
+
- `NUM_GAPS_LEN_TWO_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_TWO_ONE_SEQ`): `|{u: u.length==2 and u.count==1}|`
|
|
230
|
+
- `NUM_GAPS_LEN_TWO_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_TWO_TWO_SEQ`): `|{u: u.length==2 and u.count==2}|`
|
|
231
|
+
- `NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_TWO_EXCEPT_ONE`):
|
|
232
|
+
`|{u: u.length==2 and u.count==N-1}|`
|
|
233
|
+
|
|
234
|
+
Length 3:
|
|
235
|
+
- `NUM_GAPS_LEN_THREE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_THREE_ONE_SEQ`): `|{u: u.length==3 and u.count==1}|`
|
|
236
|
+
- `NUM_GAPS_LEN_THREE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_THREE_TWO_SEQ`): `|{u: u.length==3 and u.count==2}|`
|
|
237
|
+
- `NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_THREE_EXCEPT_ONE`):
|
|
238
|
+
`|{u: u.length==3 and u.count==N-1}|`
|
|
239
|
+
|
|
240
|
+
Length ≥ 4:
|
|
241
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ` (MSACompare: `GAPS_LARGER_THAN_THREE_ONE_SEQ`):
|
|
242
|
+
`|{u: u.length>=4 and u.count==1}|`
|
|
243
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS` (MSACompare: `GAPS_LARGER_THAN_THREE_TWO_SEQ`):
|
|
244
|
+
`|{u: u.length>=4 and u.count==2}|`
|
|
245
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LARGER_THAN_THREE_EXCEPT_ONE`):
|
|
246
|
+
`|{u: u.length>=4 and u.count==N-1}|`
|
|
247
|
+
|
|
248
|
+
Important edge-case note: to match the C++ reference, the "`== 1` / `== 2` / `== N-1`" checks are independent.
|
|
249
|
+
So for small `N`, some categories overlap:
|
|
250
|
+
- If `N == 2`, then `N-1 == 1`, so "`IN_ONE_SEQ`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
|
|
251
|
+
- If `N == 3`, then `N-1 == 2`, so "`IN_TWO_SEQS`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
|
|
252
|
+
|
|
253
|
+
#### Column-wise gap counts (computed on the original alignment, before all-gap trimming)
|
|
254
|
+
|
|
255
|
+
These count alignment columns based on how many sequences have a `-` in that column:
|
|
256
|
+
|
|
257
|
+
- `MSA_POSITION_WITH_0_GAPS` (MSACompare: `NO_GAP_COLUMNS`): number of columns with exactly 0 gaps
|
|
258
|
+
- `MSA_POSITION_WITH_1_GAPS` (MSACompare: `ONE_GAP_COLUMNS`): number of columns with exactly 1 gap
|
|
259
|
+
- `MSA_POSITION_WITH_2_GAPS` (MSACompare: `TWO_GAP_COLUMNS`): number of columns with exactly 2 gaps
|
|
260
|
+
- `MSA_POSITION_WITH_N_MINUS_1_GAPS` (MSACompare: `ONE_GAP_EXCEPT_ONE_COLUMN`): number of columns with exactly `N-1` gaps (for small `N`, see the notes below)
|
|
261
|
+
|
|
262
|
+
Notes:
|
|
263
|
+
- Columns with `N` gaps (all-gap columns) are **not** counted in any of these buckets.
|
|
264
|
+
- For `N == 3`, the `N-1` bucket is effectively 0 because columns with 2 gaps are already counted in
|
|
265
|
+
`MSA_POSITION_WITH_2_GAPS` (this matches the C++ reference behavior).
|
|
266
|
+
|
|
267
|
+
### Examples
|
|
268
|
+
|
|
269
|
+
Helper to get a readable dict:
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
import msastats
|
|
273
|
+
|
|
274
|
+
def stats_dict(msa):
|
|
275
|
+
return dict(zip(msastats.stats_names(), msastats.calculate_msa_stats(msa)))
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
#### Example 1: One gap interval shared by 2 of 3 sequences
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
msa = ["A--A", "A--A", "AAAA"] # N=3, L=4
|
|
282
|
+
stats = stats_dict(msa)
|
|
283
|
+
|
|
284
|
+
assert stats["MSA_LEN"] == 4.0
|
|
285
|
+
assert stats["LONGEST_UNALIGNED_SEQ"] == 4.0
|
|
286
|
+
assert stats["SHORTEST_UNALIGNED_SEQ"] == 2.0
|
|
287
|
+
|
|
288
|
+
# One unique gap interval of length 2, appearing in 2 sequences:
|
|
289
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
|
|
290
|
+
assert stats["TOT_NUM_GAPS"] == 2.0
|
|
291
|
+
assert stats["AVG_GAP_SIZE"] == 2.0
|
|
292
|
+
assert stats["NUM_GAPS_LEN_TWO"] == 2.0
|
|
293
|
+
|
|
294
|
+
# Column-wise gap counts (original alignment):
|
|
295
|
+
assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
|
|
296
|
+
assert stats["MSA_POSITION_WITH_2_GAPS"] == 2.0
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
#### Example 2: All-gap columns are ignored for gap-run detection
|
|
300
|
+
|
|
301
|
+
```python
|
|
302
|
+
msa = ["A-A", "A-A", "A-A"] # middle column is all gaps
|
|
303
|
+
stats = stats_dict(msa)
|
|
304
|
+
|
|
305
|
+
assert stats["MSA_LEN"] == 3.0
|
|
306
|
+
|
|
307
|
+
# The all-gap column is removed before gap runs are detected, so there are no gaps:
|
|
308
|
+
assert stats["TOT_NUM_GAPS"] == 0.0
|
|
309
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 0.0
|
|
310
|
+
assert stats["AVG_GAP_SIZE"] == 0.0
|
|
311
|
+
|
|
312
|
+
# Column-wise counts use the original alignment; the all-gap column fits no bucket, so only the two gap-free columns are counted:
|
|
313
|
+
assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
#### Example 3: “total gap runs” vs “unique gap intervals”
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
msa = ["A-A", "A-A", "AAA", "AAA"] # one gap interval shared by 2 sequences
|
|
320
|
+
stats = stats_dict(msa)
|
|
321
|
+
|
|
322
|
+
# Total occurrences (one per sequence that has it):
|
|
323
|
+
assert stats["NUM_GAPS_LEN_ONE"] == 2.0
|
|
324
|
+
|
|
325
|
+
# Unique intervals are counted once:
|
|
326
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
|
|
327
|
+
assert stats["NUM_GAPS_LEN_ONE_IN_TWO_SEQS"] == 1.0
|
|
328
|
+
```
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# PyMSAStats (pure-Python `msastats`)
|
|
2
|
+
|
|
3
|
+
A pure-Python implementation of the `msastats` API used by MSACompare for computing
|
|
4
|
+
Multiple Sequence Alignment (MSA) summary statistics.
|
|
5
|
+
|
|
6
|
+
This package is distributed on PyPI as `pymsastats` and is imported as `msastats`:
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
import msastats
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Usage
|
|
13
|
+
|
|
14
|
+
You can use either the high-level `msastats` functions (drop-in API), or the
|
|
15
|
+
lower-level `MsaStatsCalculator`.
|
|
16
|
+
|
|
17
|
+
### Installation
|
|
18
|
+
|
|
19
|
+
Install from PyPI:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install pymsastats
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or with `uv`:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv add pymsastats
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
For development (editable install from source):
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/naielj/PyMSAStats.git
|
|
35
|
+
cd PyMSAStats
|
|
36
|
+
uv sync
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Drop-in API (Recommended)
|
|
40
|
+
|
|
41
|
+
From a list of aligned sequences:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import msastats
|
|
45
|
+
|
|
46
|
+
stats = msastats.calculate_msa_stats(["AA-A", "AA-A", "A--A"])
|
|
47
|
+
names = msastats.stats_names()
|
|
48
|
+
|
|
49
|
+
as_dict = dict(zip(names, stats))
|
|
50
|
+
print(as_dict["AVG_GAP_SIZE"])
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
From an aligned FASTA file:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import msastats
|
|
57
|
+
|
|
58
|
+
stats = msastats.calculate_fasta_stats("path/to/alignment.fasta")
|
|
59
|
+
print(stats)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Calculator API
|
|
63
|
+
|
|
64
|
+
From a list of aligned sequences:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from msastats import MsaStatsCalculator, StatType
|
|
68
|
+
|
|
69
|
+
# 1. Initialize with a list of aligned sequences
|
|
70
|
+
sequences = [
|
|
71
|
+
"AC--GT",
|
|
72
|
+
"ACGTGT",
|
|
73
|
+
"AC-TGT",
|
|
74
|
+
]
|
|
75
|
+
calculator = MsaStatsCalculator(sequences)
|
|
76
|
+
|
|
77
|
+
# 2. Compute the statistics
|
|
78
|
+
calculator.recompute_stats()
|
|
79
|
+
|
|
80
|
+
# 3. Access the statistics
|
|
81
|
+
print(f"MSA Length: {calculator.msa_length}")
|
|
82
|
+
print(f"Number of Sequences: {calculator.number_of_sequences}")
|
|
83
|
+
print(f"Total Gaps: {calculator.total_number_of_indels}")
|
|
84
|
+
|
|
85
|
+
# Or access stats by type
|
|
86
|
+
avg_gap_size = calculator.get_stat_by_type(StatType.AVG_GAP_SIZE)
|
|
87
|
+
print(f"Average Gap Size: {avg_gap_size:.2f}")
|
|
88
|
+
|
|
89
|
+
# Get all stats as a vector
|
|
90
|
+
stats_vector = calculator.get_stat_vec()
|
|
91
|
+
print(f"Stats Vector: {stats_vector}")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
From an aligned FASTA file:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from pathlib import Path
|
|
98
|
+
from msastats import MsaStatsCalculator, StatType
|
|
99
|
+
|
|
100
|
+
# 1. Create a dummy FASTA file
|
|
101
|
+
fasta_content = """>seq1
|
|
102
|
+
AC--GT--
|
|
103
|
+
>seq2
|
|
104
|
+
ACGTGT--
|
|
105
|
+
>seq3
|
|
106
|
+
AC-TGTAC
|
|
107
|
+
"""
|
|
108
|
+
fasta_path = Path("dummy.fasta")
|
|
109
|
+
fasta_path.write_text(fasta_content)
|
|
110
|
+
|
|
111
|
+
# 2. Initialize from the FASTA file
|
|
112
|
+
calculator = MsaStatsCalculator.from_fasta(fasta_path)
|
|
113
|
+
|
|
114
|
+
# 3. Compute the statistics
|
|
115
|
+
calculator.recompute_stats()
|
|
116
|
+
|
|
117
|
+
# 4. Access the statistics
|
|
118
|
+
print(f"MSA Length: {calculator.msa_length}")
|
|
119
|
+
print(f"Longest Sequence: {calculator.msa_longest_seq_length}")
|
|
120
|
+
print(f"Shortest Sequence: {calculator.msa_shortest_seq_length}")
|
|
121
|
+
print(f"Total Number of Gaps: {calculator.total_number_of_indels}")
|
|
122
|
+
|
|
123
|
+
# Clean up the dummy file
|
|
124
|
+
fasta_path.unlink()
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Statistics reference (27 metrics)
|
|
128
|
+
|
|
129
|
+
The 27 summary statistics implemented here are defined in:
|
|
130
|
+
|
|
131
|
+
> Wygoda E, Loewenthal G, Moshe A, Alburquerque M, Mayrose I, Pupko T. Statistical framework to determine indel-length distribution. *Bioinformatics*. 2024;40(2):btae043. <https://doi.org/10.1093/bioinformatics/btae043>
|
|
132
|
+
|
|
133
|
+
```bibtex
|
|
134
|
+
@article{wygoda2024indel,
|
|
135
|
+
author = {Wygoda, Elya and Loewenthal, Gil and Moshe, Asher and Alburquerque, Michael and Mayrose, Itay and Pupko, Tal},
|
|
136
|
+
title = {Statistical framework to determine indel-length distribution},
|
|
137
|
+
journal = {Bioinformatics},
|
|
138
|
+
volume = {40},
|
|
139
|
+
number = {2},
|
|
140
|
+
pages = {btae043},
|
|
141
|
+
year = {2024},
|
|
142
|
+
doi = {10.1093/bioinformatics/btae043}
|
|
143
|
+
}
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
For an illustrated reference with a worked example showing all 27 metric values on a single MSA, see
|
|
147
|
+
[docs/summary_statistics_reference.md](https://github.com/naielj/PyMSAStats/blob/HEAD/docs/summary_statistics_reference.md).
|
|
148
|
+
|
|
149
|
+
### Terminology
|
|
150
|
+
|
|
151
|
+
- **Gap character:** the implementation treats `-` as a gap.
|
|
152
|
+
- **All-gap column:** a column where *all* sequences have `-` at that position.
|
|
153
|
+
- **Gap run / indel (what “gap” means in most metrics):** a maximal contiguous run of `-` in a single sequence.
|
|
154
|
+
- **All-gap trimming (important):** before detecting gap runs, the algorithm removes all-gap columns. This matches the
|
|
155
|
+
original C++ implementation and prevents all-gap columns from splitting/creating gap runs.
|
|
156
|
+
- **Unique gap interval:** gap runs are grouped by their `(start, end)` coordinates *in the trimmed alignment*. If
|
|
157
|
+
multiple sequences have a gap run with the same `(start, end)`, that is **one** unique gap interval with:
|
|
158
|
+
- `length = end - start + 1`
|
|
159
|
+
- `count = number of sequences that have that exact interval`
|
|
160
|
+
|
|
161
|
+
### Returned order
|
|
162
|
+
|
|
163
|
+
`calculate_msa_stats` / `calculate_fasta_stats` return a list of 27 floats in the same order as
|
|
164
|
+
`msastats.stats_names()` (and the `StatType` enum).
|
|
165
|
+
|
|
166
|
+
### Metric definitions
|
|
167
|
+
|
|
168
|
+
#### Alignment and sequence lengths
|
|
169
|
+
|
|
170
|
+
- `MSA_LEN` (MSACompare: `LINE_LENGTH`): alignment length in columns (includes all-gap columns).
|
|
171
|
+
- `LONGEST_UNALIGNED_SEQ` (MSACompare: `LONGEST_UNALIGNED_SEQ_LENGTH`): max ungapped sequence length across sequences
|
|
172
|
+
(`len(seq.replace('-', ''))`).
|
|
173
|
+
- `SHORTEST_UNALIGNED_SEQ` (MSACompare: `SHORTEST_UNALIGNED_SEQ_LENGTH`): min ungapped sequence length across sequences.
|
|
174
|
+
|
|
175
|
+
#### Gap-run totals (after all-gap trimming)
|
|
176
|
+
|
|
177
|
+
Let the set of unique gap intervals be `U`, and for each `u ∈ U`, let `u.length` be its length and `u.count` be how
|
|
178
|
+
many sequences contain that interval.
|
|
179
|
+
|
|
180
|
+
- `TOT_NUM_GAPS` (MSACompare: `TOTAL_GAPS`): total number of gap runs across sequences:
|
|
181
|
+
`Σ_u u.count`
|
|
182
|
+
- `AVG_GAP_SIZE` (MSACompare: `AVG_LENGTH_OF_GAPS`): mean gap-run length across all sequences:
|
|
183
|
+
`(Σ_u u.length · u.count) / TOT_NUM_GAPS` (0 if `TOT_NUM_GAPS == 0`)
|
|
184
|
+
- `NUM_GAPS_LEN_ONE` (MSACompare: `GAPS_OF_LENGTH_ONE`): `Σ_{u.length==1} u.count`
|
|
185
|
+
- `NUM_GAPS_LEN_TWO` (MSACompare: `GAPS_OF_LENGTH_TWO`): `Σ_{u.length==2} u.count`
|
|
186
|
+
- `NUM_GAPS_LEN_THREE` (MSACompare: `GAPS_OF_LENGTH_THREE`): `Σ_{u.length==3} u.count`
|
|
187
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR` (MSACompare: `GAPS_LARGER_THAN_THREE`): `Σ_{u.length>=4} u.count`
|
|
188
|
+
|
|
189
|
+
#### Unique gap intervals (after all-gap trimming)
|
|
190
|
+
|
|
191
|
+
- `TOT_NUM_UNIQUE_GAPS` (MSACompare: `TOTAL_UNIQUE_GAPS`): number of unique gap intervals: `|U|`
|
|
192
|
+
- `AVG_UNIQUE_GAP_SIZE` (MSACompare: `AVG_SIZE_OF_UNIQUE_GAPS`): mean unique-gap length:
|
|
193
|
+
`(Σ_u u.length) / TOT_NUM_UNIQUE_GAPS` (0 if `TOT_NUM_UNIQUE_GAPS == 0`)
|
|
194
|
+
|
|
195
|
+
#### Unique gap intervals shared by k sequences (after all-gap trimming)
|
|
196
|
+
|
|
197
|
+
These count **unique** gap intervals (not total occurrences). For a given interval length bucket, they count how many
|
|
198
|
+
intervals have `u.count == k`.
|
|
199
|
+
|
|
200
|
+
Length 1:
|
|
201
|
+
- `NUM_GAPS_LEN_ONE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_ONE_ONE_SEQ`): `|{u: u.length==1 and u.count==1}|`
|
|
202
|
+
- `NUM_GAPS_LEN_ONE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_ONE_TWO_SEQ`): `|{u: u.length==1 and u.count==2}|`
|
|
203
|
+
- `NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_ONE_EXCEPT_ONE`):
|
|
204
|
+
`|{u: u.length==1 and u.count==N-1}|`
|
|
205
|
+
|
|
206
|
+
Length 2:
|
|
207
|
+
- `NUM_GAPS_LEN_TWO_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_TWO_ONE_SEQ`): `|{u: u.length==2 and u.count==1}|`
|
|
208
|
+
- `NUM_GAPS_LEN_TWO_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_TWO_TWO_SEQ`): `|{u: u.length==2 and u.count==2}|`
|
|
209
|
+
- `NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_TWO_EXCEPT_ONE`):
|
|
210
|
+
`|{u: u.length==2 and u.count==N-1}|`
|
|
211
|
+
|
|
212
|
+
Length 3:
|
|
213
|
+
- `NUM_GAPS_LEN_THREE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_THREE_ONE_SEQ`): `|{u: u.length==3 and u.count==1}|`
|
|
214
|
+
- `NUM_GAPS_LEN_THREE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_THREE_TWO_SEQ`): `|{u: u.length==3 and u.count==2}|`
|
|
215
|
+
- `NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_THREE_EXCEPT_ONE`):
|
|
216
|
+
`|{u: u.length==3 and u.count==N-1}|`
|
|
217
|
+
|
|
218
|
+
Length ≥ 4:
|
|
219
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ` (MSACompare: `GAPS_LARGER_THAN_THREE_ONE_SEQ`):
|
|
220
|
+
`|{u: u.length>=4 and u.count==1}|`
|
|
221
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS` (MSACompare: `GAPS_LARGER_THAN_THREE_TWO_SEQ`):
|
|
222
|
+
`|{u: u.length>=4 and u.count==2}|`
|
|
223
|
+
- `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LARGER_THAN_THREE_EXCEPT_ONE`):
|
|
224
|
+
`|{u: u.length>=4 and u.count==N-1}|`
|
|
225
|
+
|
|
226
|
+
Important edge-case note: to match the C++ reference, the "`== 1` / `== 2` / `== N-1`" checks are independent.
|
|
227
|
+
So for small `N`, some categories overlap:
|
|
228
|
+
- If `N == 2`, then `N-1 == 1`, so "`IN_ONE_SEQ`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
|
|
229
|
+
- If `N == 3`, then `N-1 == 2`, so "`IN_TWO_SEQS`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
|
|
230
|
+
|
|
231
|
+
#### Column-wise gap counts (computed on the original alignment, before all-gap trimming)
|
|
232
|
+
|
|
233
|
+
These count alignment columns based on how many sequences have a `-` in that column:
|
|
234
|
+
|
|
235
|
+
- `MSA_POSITION_WITH_0_GAPS` (MSACompare: `NO_GAP_COLUMNS`): number of columns with exactly 0 gaps
|
|
236
|
+
- `MSA_POSITION_WITH_1_GAPS` (MSACompare: `ONE_GAP_COLUMNS`): number of columns with exactly 1 gap
|
|
237
|
+
- `MSA_POSITION_WITH_2_GAPS` (MSACompare: `TWO_GAP_COLUMNS`): number of columns with exactly 2 gaps
|
|
238
|
+
- `MSA_POSITION_WITH_N_MINUS_1_GAPS` (MSACompare: `ONE_GAP_EXCEPT_ONE_COLUMN`): number of columns with exactly `N-1` gaps (for small `N`, see the notes below)
|
|
239
|
+
|
|
240
|
+
Notes:
|
|
241
|
+
- Columns with `N` gaps (all-gap columns) are **not** counted in any of these buckets.
|
|
242
|
+
- For `N == 3`, the `N-1` bucket is effectively 0 because columns with 2 gaps are already counted in
|
|
243
|
+
`MSA_POSITION_WITH_2_GAPS` (this matches the C++ reference behavior).
|
|
244
|
+
|
|
245
|
+
### Examples
|
|
246
|
+
|
|
247
|
+
Helper to get a readable dict:
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
import msastats
|
|
251
|
+
|
|
252
|
+
def stats_dict(msa):
|
|
253
|
+
return dict(zip(msastats.stats_names(), msastats.calculate_msa_stats(msa)))
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
#### Example 1: One gap interval shared by 2 of 3 sequences
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
msa = ["A--A", "A--A", "AAAA"] # N=3, L=4
|
|
260
|
+
stats = stats_dict(msa)
|
|
261
|
+
|
|
262
|
+
assert stats["MSA_LEN"] == 4.0
|
|
263
|
+
assert stats["LONGEST_UNALIGNED_SEQ"] == 4.0
|
|
264
|
+
assert stats["SHORTEST_UNALIGNED_SEQ"] == 2.0
|
|
265
|
+
|
|
266
|
+
# One unique gap interval of length 2, appearing in 2 sequences:
|
|
267
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
|
|
268
|
+
assert stats["TOT_NUM_GAPS"] == 2.0
|
|
269
|
+
assert stats["AVG_GAP_SIZE"] == 2.0
|
|
270
|
+
assert stats["NUM_GAPS_LEN_TWO"] == 2.0
|
|
271
|
+
|
|
272
|
+
# Column-wise gap counts (original alignment):
|
|
273
|
+
assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
|
|
274
|
+
assert stats["MSA_POSITION_WITH_2_GAPS"] == 2.0
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
#### Example 2: All-gap columns are ignored for gap-run detection
|
|
278
|
+
|
|
279
|
+
```python
|
|
280
|
+
msa = ["A-A", "A-A", "A-A"] # middle column is all gaps
|
|
281
|
+
stats = stats_dict(msa)
|
|
282
|
+
|
|
283
|
+
assert stats["MSA_LEN"] == 3.0
|
|
284
|
+
|
|
285
|
+
# The all-gap column is removed before gap runs are detected, so there are no gaps:
|
|
286
|
+
assert stats["TOT_NUM_GAPS"] == 0.0
|
|
287
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 0.0
|
|
288
|
+
assert stats["AVG_GAP_SIZE"] == 0.0
|
|
289
|
+
|
|
290
|
+
# Column-wise counts use the original alignment; the all-gap column fits no bucket, so only the two gap-free columns are counted:
|
|
291
|
+
assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
#### Example 3: “total gap runs” vs “unique gap intervals”
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
msa = ["A-A", "A-A", "AAA", "AAA"] # one gap interval shared by 2 sequences
|
|
298
|
+
stats = stats_dict(msa)
|
|
299
|
+
|
|
300
|
+
# Total occurrences (one per sequence that has it):
|
|
301
|
+
assert stats["NUM_GAPS_LEN_ONE"] == 2.0
|
|
302
|
+
|
|
303
|
+
# Unique intervals are counted once:
|
|
304
|
+
assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
|
|
305
|
+
assert stats["NUM_GAPS_LEN_ONE_IN_TWO_SEQS"] == 1.0
|
|
306
|
+
```
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "PyMSAStats"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Pure-Python MSA summary statistics"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [{ name = "Naiel J" }]
|
|
12
|
+
requires-python = ">=3.9"
|
|
13
|
+
dependencies = []
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"Programming Language :: Python :: 3.9",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/naielj/PyMSAStats"
|
|
27
|
+
Repository = "https://github.com/naielj/PyMSAStats"
|
|
28
|
+
Issues = "https://github.com/naielj/PyMSAStats/issues"
|
|
29
|
+
|
|
30
|
+
[dependency-groups]
|
|
31
|
+
dev = [
|
|
32
|
+
"matplotlib>=3.7",
|
|
33
|
+
"msasim>=25.12.1",
|
|
34
|
+
"msastats>=25.11.1",
|
|
35
|
+
"pytest>=8.4.1",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools]
|
|
39
|
+
package-dir = {"" = "src"}
|
|
40
|
+
py-modules = ["msastats", "msa_stats_calculator"]
|
|
41
|
+
|
|
42
|
+
[tool.pytest.ini_options]
|
|
43
|
+
pythonpath = [
|
|
44
|
+
"src"
|
|
45
|
+
]
|