PyMSAStats 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Naiel Jabareen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,328 @@
1
+ Metadata-Version: 2.4
2
+ Name: PyMSAStats
3
+ Version: 0.1.0
4
+ Summary: Pure-Python MSA summary statistics
5
+ Author: Naiel J
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/naielj/PyMSAStats
8
+ Project-URL: Repository, https://github.com/naielj/PyMSAStats
9
+ Project-URL: Issues, https://github.com/naielj/PyMSAStats/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Dynamic: license-file
22
+
23
+ # PyMSAStats (pure-Python `msastats`)
24
+
25
+ A pure-Python implementation of the `msastats` API used by MSACompare for computing
26
+ Multiple Sequence Alignment (MSA) summary statistics.
27
+
28
+ This package installs and imports as `msastats`:
29
+
30
+ ```python
31
+ import msastats
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ You can use either the high-level `msastats` functions (drop-in API), or the
37
+ lower-level `MsaStatsCalculator`.
38
+
39
+ ### Installation
40
+
41
+ Install from PyPI:
42
+
43
+ ```bash
44
+ pip install pymsastats
45
+ ```
46
+
47
+ Or with `uv`:
48
+
49
+ ```bash
50
+ uv add pymsastats
51
+ ```
52
+
53
+ For development (editable install from source):
54
+
55
+ ```bash
56
+ git clone https://github.com/naielj/PyMSAStats.git
57
+ cd PyMSAStats
58
+ uv sync
59
+ ```
60
+
61
+ ### Drop-in API (Recommended)
62
+
63
+ From a list of aligned sequences:
64
+
65
+ ```python
66
+ import msastats
67
+
68
+ stats = msastats.calculate_msa_stats(["AA-A", "AA-A", "A--A"])
69
+ names = msastats.stats_names()
70
+
71
+ as_dict = dict(zip(names, stats))
72
+ print(as_dict["AVG_GAP_SIZE"])
73
+ ```
74
+
75
+ From an aligned FASTA file:
76
+
77
+ ```python
78
+ import msastats
79
+
80
+ stats = msastats.calculate_fasta_stats("path/to/alignment.fasta")
81
+ print(stats)
82
+ ```
83
+
84
+ ### Calculator API
85
+
86
+ From a list of aligned sequences:
87
+
88
+ ```python
89
+ from msastats import MsaStatsCalculator, StatType
90
+
91
+ # 1. Initialize with a list of aligned sequences
92
+ sequences = [
93
+ "AC--GT",
94
+ "ACGTGT",
95
+ "AC-TGT",
96
+ ]
97
+ calculator = MsaStatsCalculator(sequences)
98
+
99
+ # 2. Compute the statistics
100
+ calculator.recompute_stats()
101
+
102
+ # 3. Access the statistics
103
+ print(f"MSA Length: {calculator.msa_length}")
104
+ print(f"Number of Sequences: {calculator.number_of_sequences}")
105
+ print(f"Total Gaps: {calculator.total_number_of_indels}")
106
+
107
+ # Or access stats by type
108
+ avg_gap_size = calculator.get_stat_by_type(StatType.AVG_GAP_SIZE)
109
+ print(f"Average Gap Size: {avg_gap_size:.2f}")
110
+
111
+ # Get all stats as a vector
112
+ stats_vector = calculator.get_stat_vec()
113
+ print(f"Stats Vector: {stats_vector}")
114
+ ```
115
+
116
+ From an aligned FASTA file:
117
+
118
+ ```python
119
+ from pathlib import Path
120
+ from msastats import MsaStatsCalculator, StatType
121
+
122
+ # 1. Create a dummy FASTA file
123
+ fasta_content = """>seq1
124
+ AC--GT--
125
+ >seq2
126
+ ACGTGT--
127
+ >seq3
128
+ AC-TGTAC
129
+ """
130
+ fasta_path = Path("dummy.fasta")
131
+ fasta_path.write_text(fasta_content)
132
+
133
+ # 2. Initialize from the FASTA file
134
+ calculator = MsaStatsCalculator.from_fasta(fasta_path)
135
+
136
+ # 3. Compute the statistics
137
+ calculator.recompute_stats()
138
+
139
+ # 4. Access the statistics
140
+ print(f"MSA Length: {calculator.msa_length}")
141
+ print(f"Longest Sequence: {calculator.msa_longest_seq_length}")
142
+ print(f"Shortest Sequence: {calculator.msa_shortest_seq_length}")
143
+ print(f"Total Number of Gaps: {calculator.total_number_of_indels}")
144
+
145
+ # Clean up the dummy file
146
+ fasta_path.unlink()
147
+ ```
148
+
149
+ ## Statistics reference (27 metrics)
150
+
151
+ The 27 summary statistics implemented here are defined in:
152
+
153
+ > Wygoda E, Loewenthal G, Moshe A, Alburquerque M, Mayrose I, Pupko T. Statistical framework to determine indel-length distribution. *Bioinformatics*. 2024;40(2):btae043. <https://doi.org/10.1093/bioinformatics/btae043>
154
+
155
+ ```bibtex
156
+ @article{wygoda2024indel,
157
+ author = {Wygoda, Elya and Loewenthal, Gil and Moshe, Asher and Alburquerque, Michael and Mayrose, Itay and Pupko, Tal},
158
+ title = {Statistical framework to determine indel-length distribution},
159
+ journal = {Bioinformatics},
160
+ volume = {40},
161
+ number = {2},
162
+ pages = {btae043},
163
+ year = {2024},
164
+ doi = {10.1093/bioinformatics/btae043}
165
+ }
166
+ ```
167
+
168
+ For an illustrated reference with a worked example showing all 27 metric values on a single MSA, see
169
+ [docs/summary_statistics_reference.md](https://github.com/naielj/PyMSAStats/blob/HEAD/docs/summary_statistics_reference.md).
170
+
171
+ ### Terminology
172
+
173
+ - **Gap character:** the implementation treats `-` as a gap.
174
+ - **All-gap column:** a column where *all* sequences have `-` at that position.
175
+ - **Gap run / indel (what “gap” means in most metrics):** a maximal contiguous run of `-` in a single sequence.
176
+ - **All-gap trimming (important):** before detecting gap runs, the algorithm removes all-gap columns. This matches the
177
+ original C++ implementation and prevents all-gap columns from splitting/creating gap runs.
178
+ - **Unique gap interval:** gap runs are grouped by their `(start, end)` coordinates *in the trimmed alignment*. If
179
+ multiple sequences have a gap run with the same `(start, end)`, that is **one** unique gap interval with:
180
+ - `length = end - start + 1`
181
+ - `count = number of sequences that have that exact interval`
182
+
183
+ ### Returned order
184
+
185
+ `calculate_msa_stats` / `calculate_fasta_stats` return a list of 27 floats in the same order as
186
+ `msastats.stats_names()` (and the `StatType` enum).
187
+
188
+ ### Metric definitions
189
+
190
+ #### Alignment and sequence lengths
191
+
192
+ - `MSA_LEN` (MSACompare: `LINE_LENGTH`): alignment length in columns (includes all-gap columns).
193
+ - `LONGEST_UNALIGNED_SEQ` (MSACompare: `LONGEST_UNALIGNED_SEQ_LENGTH`): max ungapped sequence length across sequences
194
+ (`len(seq.replace('-', ''))`).
195
+ - `SHORTEST_UNALIGNED_SEQ` (MSACompare: `SHORTEST_UNALIGNED_SEQ_LENGTH`): min ungapped sequence length across sequences.
196
+
197
+ #### Gap-run totals (after all-gap trimming)
198
+
199
+ Let the set of unique gap intervals be `U`, and for each `u ∈ U`, let `u.length` be its length and `u.count` be how
200
+ many sequences contain that interval.
201
+
202
+ - `TOT_NUM_GAPS` (MSACompare: `TOTAL_GAPS`): total number of gap runs across sequences:
203
+ `Σ_u u.count`
204
+ - `AVG_GAP_SIZE` (MSACompare: `AVG_LENGTH_OF_GAPS`): mean gap-run length across all sequences:
205
+ `(Σ_u u.length · u.count) / TOT_NUM_GAPS` (0 if `TOT_NUM_GAPS == 0`)
206
+ - `NUM_GAPS_LEN_ONE` (MSACompare: `GAPS_OF_LENGTH_ONE`): `Σ_{u.length==1} u.count`
207
+ - `NUM_GAPS_LEN_TWO` (MSACompare: `GAPS_OF_LENGTH_TWO`): `Σ_{u.length==2} u.count`
208
+ - `NUM_GAPS_LEN_THREE` (MSACompare: `GAPS_OF_LENGTH_THREE`): `Σ_{u.length==3} u.count`
209
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR` (MSACompare: `GAPS_LARGER_THAN_THREE`): `Σ_{u.length>=4} u.count`
210
+
211
+ #### Unique gap intervals (after all-gap trimming)
212
+
213
+ - `TOT_NUM_UNIQUE_GAPS` (MSACompare: `TOTAL_UNIQUE_GAPS`): number of unique gap intervals: `|U|`
214
+ - `AVG_UNIQUE_GAP_SIZE` (MSACompare: `AVG_SIZE_OF_UNIQUE_GAPS`): mean unique-gap length:
215
+ `(Σ_u u.length) / TOT_NUM_UNIQUE_GAPS` (0 if `TOT_NUM_UNIQUE_GAPS == 0`)
216
+
217
+ #### Unique gap intervals shared by k sequences (after all-gap trimming)
218
+
219
+ These count **unique** gap intervals (not total occurrences). For a given interval length bucket, they count how many
220
+ intervals have `u.count == k`.
221
+
222
+ Length 1:
223
+ - `NUM_GAPS_LEN_ONE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_ONE_ONE_SEQ`): `|{u: u.length==1 and u.count==1}|`
224
+ - `NUM_GAPS_LEN_ONE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_ONE_TWO_SEQ`): `|{u: u.length==1 and u.count==2}|`
225
+ - `NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_ONE_EXCEPT_ONE`):
226
+ `|{u: u.length==1 and u.count==N-1}|`
227
+
228
+ Length 2:
229
+ - `NUM_GAPS_LEN_TWO_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_TWO_ONE_SEQ`): `|{u: u.length==2 and u.count==1}|`
230
+ - `NUM_GAPS_LEN_TWO_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_TWO_TWO_SEQ`): `|{u: u.length==2 and u.count==2}|`
231
+ - `NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_TWO_EXCEPT_ONE`):
232
+ `|{u: u.length==2 and u.count==N-1}|`
233
+
234
+ Length 3:
235
+ - `NUM_GAPS_LEN_THREE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_THREE_ONE_SEQ`): `|{u: u.length==3 and u.count==1}|`
236
+ - `NUM_GAPS_LEN_THREE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_THREE_TWO_SEQ`): `|{u: u.length==3 and u.count==2}|`
237
+ - `NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_THREE_EXCEPT_ONE`):
238
+ `|{u: u.length==3 and u.count==N-1}|`
239
+
240
+ Length ≥ 4:
241
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ` (MSACompare: `GAPS_LARGER_THAN_THREE_ONE_SEQ`):
242
+ `|{u: u.length>=4 and u.count==1}|`
243
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS` (MSACompare: `GAPS_LARGER_THAN_THREE_TWO_SEQ`):
244
+ `|{u: u.length>=4 and u.count==2}|`
245
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LARGER_THAN_THREE_EXCEPT_ONE`):
246
+ `|{u: u.length>=4 and u.count==N-1}|`
247
+
248
+ Important edge-case note: to match the C++ reference, the "`== 1` / `== 2` / `== N-1`" checks are independent.
249
+ So for small `N`, some categories overlap:
250
+ - If `N == 2`, then `N-1 == 1`, so "`IN_ONE_SEQ`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
251
+ - If `N == 3`, then `N-1 == 2`, so "`IN_TWO_SEQS`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
252
+
253
+ #### Column-wise gap counts (computed on the original alignment, before all-gap trimming)
254
+
255
+ These count alignment columns based on how many sequences have a `-` in that column:
256
+
257
+ - `MSA_POSITION_WITH_0_GAPS` (MSACompare: `NO_GAP_COLUMNS`): number of columns with exactly 0 gaps
258
+ - `MSA_POSITION_WITH_1_GAPS` (MSACompare: `ONE_GAP_COLUMNS`): number of columns with exactly 1 gap
259
+ - `MSA_POSITION_WITH_2_GAPS` (MSACompare: `TWO_GAP_COLUMNS`): number of columns with exactly 2 gaps
260
+ - `MSA_POSITION_WITH_N_MINUS_1_GAPS` (MSACompare: `ONE_GAP_EXCEPT_ONE_COLUMN`): number of columns with exactly `N-1` gaps
261
+
262
+ Notes:
263
+ - Columns with `N` gaps (all-gap columns) are **not** counted in any of these buckets.
264
+ - For `N == 3`, the `N-1` bucket is effectively 0 because columns with 2 gaps are already counted in
265
+ `MSA_POSITION_WITH_2_GAPS` (this matches the C++ reference behavior).
266
+
267
+ ### Examples
268
+
269
+ Helper to get a readable dict:
270
+
271
+ ```python
272
+ import msastats
273
+
274
+ def stats_dict(msa):
275
+ return dict(zip(msastats.stats_names(), msastats.calculate_msa_stats(msa)))
276
+ ```
277
+
278
+ #### Example 1: One gap interval shared by 2 of 3 sequences
279
+
280
+ ```python
281
+ msa = ["A--A", "A--A", "AAAA"] # N=3, L=4
282
+ stats = stats_dict(msa)
283
+
284
+ assert stats["MSA_LEN"] == 4.0
285
+ assert stats["LONGEST_UNALIGNED_SEQ"] == 4.0
286
+ assert stats["SHORTEST_UNALIGNED_SEQ"] == 2.0
287
+
288
+ # One unique gap interval of length 2, appearing in 2 sequences:
289
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
290
+ assert stats["TOT_NUM_GAPS"] == 2.0
291
+ assert stats["AVG_GAP_SIZE"] == 2.0
292
+ assert stats["NUM_GAPS_LEN_TWO"] == 2.0
293
+
294
+ # Column-wise gap counts (original alignment):
295
+ assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
296
+ assert stats["MSA_POSITION_WITH_2_GAPS"] == 2.0
297
+ ```
298
+
299
+ #### Example 2: All-gap columns are ignored for gap-run detection
300
+
301
+ ```python
302
+ msa = ["A-A", "A-A", "A-A"] # middle column is all gaps
303
+ stats = stats_dict(msa)
304
+
305
+ assert stats["MSA_LEN"] == 3.0
306
+
307
+ # The all-gap column is removed before gap runs are detected, so there are no gaps:
308
+ assert stats["TOT_NUM_GAPS"] == 0.0
309
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 0.0
310
+ assert stats["AVG_GAP_SIZE"] == 0.0
311
+
312
+ # But column-wise counts still “see” the original alignment columns:
313
+ assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
314
+ ```
315
+
316
+ #### Example 3: “total gap runs” vs “unique gap intervals”
317
+
318
+ ```python
319
+ msa = ["A-A", "A-A", "AAA", "AAA"] # one gap interval shared by 2 sequences
320
+ stats = stats_dict(msa)
321
+
322
+ # Total occurrences (one per sequence that has it):
323
+ assert stats["NUM_GAPS_LEN_ONE"] == 2.0
324
+
325
+ # Unique intervals are counted once:
326
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
327
+ assert stats["NUM_GAPS_LEN_ONE_IN_TWO_SEQS"] == 1.0
328
+ ```
@@ -0,0 +1,306 @@
1
+ # PyMSAStats (pure-Python `msastats`)
2
+
3
+ A pure-Python implementation of the `msastats` API used by MSACompare for computing
4
+ Multiple Sequence Alignment (MSA) summary statistics.
5
+
6
+ This package installs and imports as `msastats`:
7
+
8
+ ```python
9
+ import msastats
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ You can use either the high-level `msastats` functions (drop-in API), or the
15
+ lower-level `MsaStatsCalculator`.
16
+
17
+ ### Installation
18
+
19
+ Install from PyPI:
20
+
21
+ ```bash
22
+ pip install pymsastats
23
+ ```
24
+
25
+ Or with `uv`:
26
+
27
+ ```bash
28
+ uv add pymsastats
29
+ ```
30
+
31
+ For development (editable install from source):
32
+
33
+ ```bash
34
+ git clone https://github.com/naielj/PyMSAStats.git
35
+ cd PyMSAStats
36
+ uv sync
37
+ ```
38
+
39
+ ### Drop-in API (Recommended)
40
+
41
+ From a list of aligned sequences:
42
+
43
+ ```python
44
+ import msastats
45
+
46
+ stats = msastats.calculate_msa_stats(["AA-A", "AA-A", "A--A"])
47
+ names = msastats.stats_names()
48
+
49
+ as_dict = dict(zip(names, stats))
50
+ print(as_dict["AVG_GAP_SIZE"])
51
+ ```
52
+
53
+ From an aligned FASTA file:
54
+
55
+ ```python
56
+ import msastats
57
+
58
+ stats = msastats.calculate_fasta_stats("path/to/alignment.fasta")
59
+ print(stats)
60
+ ```
61
+
62
+ ### Calculator API
63
+
64
+ From a list of aligned sequences:
65
+
66
+ ```python
67
+ from msastats import MsaStatsCalculator, StatType
68
+
69
+ # 1. Initialize with a list of aligned sequences
70
+ sequences = [
71
+ "AC--GT",
72
+ "ACGTGT",
73
+ "AC-TGT",
74
+ ]
75
+ calculator = MsaStatsCalculator(sequences)
76
+
77
+ # 2. Compute the statistics
78
+ calculator.recompute_stats()
79
+
80
+ # 3. Access the statistics
81
+ print(f"MSA Length: {calculator.msa_length}")
82
+ print(f"Number of Sequences: {calculator.number_of_sequences}")
83
+ print(f"Total Gaps: {calculator.total_number_of_indels}")
84
+
85
+ # Or access stats by type
86
+ avg_gap_size = calculator.get_stat_by_type(StatType.AVG_GAP_SIZE)
87
+ print(f"Average Gap Size: {avg_gap_size:.2f}")
88
+
89
+ # Get all stats as a vector
90
+ stats_vector = calculator.get_stat_vec()
91
+ print(f"Stats Vector: {stats_vector}")
92
+ ```
93
+
94
+ From an aligned FASTA file:
95
+
96
+ ```python
97
+ from pathlib import Path
98
+ from msastats import MsaStatsCalculator, StatType
99
+
100
+ # 1. Create a dummy FASTA file
101
+ fasta_content = """>seq1
102
+ AC--GT--
103
+ >seq2
104
+ ACGTGT--
105
+ >seq3
106
+ AC-TGTAC
107
+ """
108
+ fasta_path = Path("dummy.fasta")
109
+ fasta_path.write_text(fasta_content)
110
+
111
+ # 2. Initialize from the FASTA file
112
+ calculator = MsaStatsCalculator.from_fasta(fasta_path)
113
+
114
+ # 3. Compute the statistics
115
+ calculator.recompute_stats()
116
+
117
+ # 4. Access the statistics
118
+ print(f"MSA Length: {calculator.msa_length}")
119
+ print(f"Longest Sequence: {calculator.msa_longest_seq_length}")
120
+ print(f"Shortest Sequence: {calculator.msa_shortest_seq_length}")
121
+ print(f"Total Number of Gaps: {calculator.total_number_of_indels}")
122
+
123
+ # Clean up the dummy file
124
+ fasta_path.unlink()
125
+ ```
126
+
127
+ ## Statistics reference (27 metrics)
128
+
129
+ The 27 summary statistics implemented here are defined in:
130
+
131
+ > Wygoda E, Loewenthal G, Moshe A, Alburquerque M, Mayrose I, Pupko T. Statistical framework to determine indel-length distribution. *Bioinformatics*. 2024;40(2):btae043. <https://doi.org/10.1093/bioinformatics/btae043>
132
+
133
+ ```bibtex
134
+ @article{wygoda2024indel,
135
+ author = {Wygoda, Elya and Loewenthal, Gil and Moshe, Asher and Alburquerque, Michael and Mayrose, Itay and Pupko, Tal},
136
+ title = {Statistical framework to determine indel-length distribution},
137
+ journal = {Bioinformatics},
138
+ volume = {40},
139
+ number = {2},
140
+ pages = {btae043},
141
+ year = {2024},
142
+ doi = {10.1093/bioinformatics/btae043}
143
+ }
144
+ ```
145
+
146
+ For an illustrated reference with a worked example showing all 27 metric values on a single MSA, see
147
+ [docs/summary_statistics_reference.md](https://github.com/naielj/PyMSAStats/blob/HEAD/docs/summary_statistics_reference.md).
148
+
149
+ ### Terminology
150
+
151
+ - **Gap character:** the implementation treats `-` as a gap.
152
+ - **All-gap column:** a column where *all* sequences have `-` at that position.
153
+ - **Gap run / indel (what “gap” means in most metrics):** a maximal contiguous run of `-` in a single sequence.
154
+ - **All-gap trimming (important):** before detecting gap runs, the algorithm removes all-gap columns. This matches the
155
+ original C++ implementation and prevents all-gap columns from splitting/creating gap runs.
156
+ - **Unique gap interval:** gap runs are grouped by their `(start, end)` coordinates *in the trimmed alignment*. If
157
+ multiple sequences have a gap run with the same `(start, end)`, that is **one** unique gap interval with:
158
+ - `length = end - start + 1`
159
+ - `count = number of sequences that have that exact interval`
160
+
161
+ ### Returned order
162
+
163
+ `calculate_msa_stats` / `calculate_fasta_stats` return a list of 27 floats in the same order as
164
+ `msastats.stats_names()` (and the `StatType` enum).
165
+
166
+ ### Metric definitions
167
+
168
+ #### Alignment and sequence lengths
169
+
170
+ - `MSA_LEN` (MSACompare: `LINE_LENGTH`): alignment length in columns (includes all-gap columns).
171
+ - `LONGEST_UNALIGNED_SEQ` (MSACompare: `LONGEST_UNALIGNED_SEQ_LENGTH`): max ungapped sequence length across sequences
172
+ (`len(seq.replace('-', ''))`).
173
+ - `SHORTEST_UNALIGNED_SEQ` (MSACompare: `SHORTEST_UNALIGNED_SEQ_LENGTH`): min ungapped sequence length across sequences.
174
+
175
+ #### Gap-run totals (after all-gap trimming)
176
+
177
+ Let the set of unique gap intervals be `U`, and for each `u ∈ U`, let `u.length` be its length and `u.count` be how
178
+ many sequences contain that interval.
179
+
180
+ - `TOT_NUM_GAPS` (MSACompare: `TOTAL_GAPS`): total number of gap runs across sequences:
181
+ `Σ_u u.count`
182
+ - `AVG_GAP_SIZE` (MSACompare: `AVG_LENGTH_OF_GAPS`): mean gap-run length across all sequences:
183
+ `(Σ_u u.length · u.count) / TOT_NUM_GAPS` (0 if `TOT_NUM_GAPS == 0`)
184
+ - `NUM_GAPS_LEN_ONE` (MSACompare: `GAPS_OF_LENGTH_ONE`): `Σ_{u.length==1} u.count`
185
+ - `NUM_GAPS_LEN_TWO` (MSACompare: `GAPS_OF_LENGTH_TWO`): `Σ_{u.length==2} u.count`
186
+ - `NUM_GAPS_LEN_THREE` (MSACompare: `GAPS_OF_LENGTH_THREE`): `Σ_{u.length==3} u.count`
187
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR` (MSACompare: `GAPS_LARGER_THAN_THREE`): `Σ_{u.length>=4} u.count`
188
+
189
+ #### Unique gap intervals (after all-gap trimming)
190
+
191
+ - `TOT_NUM_UNIQUE_GAPS` (MSACompare: `TOTAL_UNIQUE_GAPS`): number of unique gap intervals: `|U|`
192
+ - `AVG_UNIQUE_GAP_SIZE` (MSACompare: `AVG_SIZE_OF_UNIQUE_GAPS`): mean unique-gap length:
193
+ `(Σ_u u.length) / TOT_NUM_UNIQUE_GAPS` (0 if `TOT_NUM_UNIQUE_GAPS == 0`)
194
+
195
+ #### Unique gap intervals shared by k sequences (after all-gap trimming)
196
+
197
+ These count **unique** gap intervals (not total occurrences). For a given interval length bucket, they count how many
198
+ intervals have `u.count == k`.
199
+
200
+ Length 1:
201
+ - `NUM_GAPS_LEN_ONE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_ONE_ONE_SEQ`): `|{u: u.length==1 and u.count==1}|`
202
+ - `NUM_GAPS_LEN_ONE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_ONE_TWO_SEQ`): `|{u: u.length==1 and u.count==2}|`
203
+ - `NUM_GAPS_LEN_ONE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_ONE_EXCEPT_ONE`):
204
+ `|{u: u.length==1 and u.count==N-1}|`
205
+
206
+ Length 2:
207
+ - `NUM_GAPS_LEN_TWO_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_TWO_ONE_SEQ`): `|{u: u.length==2 and u.count==1}|`
208
+ - `NUM_GAPS_LEN_TWO_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_TWO_TWO_SEQ`): `|{u: u.length==2 and u.count==2}|`
209
+ - `NUM_GAPS_LEN_TWO_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_TWO_EXCEPT_ONE`):
210
+ `|{u: u.length==2 and u.count==N-1}|`
211
+
212
+ Length 3:
213
+ - `NUM_GAPS_LEN_THREE_IN_ONE_SEQ` (MSACompare: `GAPS_LENGTH_THREE_ONE_SEQ`): `|{u: u.length==3 and u.count==1}|`
214
+ - `NUM_GAPS_LEN_THREE_IN_TWO_SEQS` (MSACompare: `GAPS_LENGTH_THREE_TWO_SEQ`): `|{u: u.length==3 and u.count==2}|`
215
+ - `NUM_GAPS_LEN_THREE_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LENGTH_THREE_EXCEPT_ONE`):
216
+ `|{u: u.length==3 and u.count==N-1}|`
217
+
218
+ Length ≥ 4:
219
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ONE_SEQ` (MSACompare: `GAPS_LARGER_THAN_THREE_ONE_SEQ`):
220
+ `|{u: u.length>=4 and u.count==1}|`
221
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_TWO_SEQS` (MSACompare: `GAPS_LARGER_THAN_THREE_TWO_SEQ`):
222
+ `|{u: u.length>=4 and u.count==2}|`
223
+ - `NUM_GAPS_LEN_AT_LEAST_FOUR_IN_ALL_EXCEPT_ONE` (MSACompare: `GAPS_LARGER_THAN_THREE_EXCEPT_ONE`):
224
+ `|{u: u.length>=4 and u.count==N-1}|`
225
+
226
+ Important edge-case note: to match the C++ reference, the "`== 1` / `== 2` / `== N-1`" checks are independent.
227
+ So for small `N`, some categories overlap:
228
+ - If `N == 2`, then `N-1 == 1`, so "`IN_ONE_SEQ`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
229
+ - If `N == 3`, then `N-1 == 2`, so "`IN_TWO_SEQS`" and "`IN_ALL_EXCEPT_ONE`" count the same intervals.
230
+
231
+ #### Column-wise gap counts (computed on the original alignment, before all-gap trimming)
232
+
233
+ These count alignment columns based on how many sequences have a `-` in that column:
234
+
235
+ - `MSA_POSITION_WITH_0_GAPS` (MSACompare: `NO_GAP_COLUMNS`): number of columns with exactly 0 gaps
236
+ - `MSA_POSITION_WITH_1_GAPS` (MSACompare: `ONE_GAP_COLUMNS`): number of columns with exactly 1 gap
237
+ - `MSA_POSITION_WITH_2_GAPS` (MSACompare: `TWO_GAP_COLUMNS`): number of columns with exactly 2 gaps
238
+ - `MSA_POSITION_WITH_N_MINUS_1_GAPS` (MSACompare: `ONE_GAP_EXCEPT_ONE_COLUMN`): number of columns with exactly `N-1` gaps
239
+
240
+ Notes:
241
+ - Columns with `N` gaps (all-gap columns) are **not** counted in any of these buckets.
242
+ - For `N == 3`, the `N-1` bucket is effectively 0 because columns with 2 gaps are already counted in
243
+ `MSA_POSITION_WITH_2_GAPS` (this matches the C++ reference behavior).
244
+
245
+ ### Examples
246
+
247
+ Helper to get a readable dict:
248
+
249
+ ```python
250
+ import msastats
251
+
252
+ def stats_dict(msa):
253
+ return dict(zip(msastats.stats_names(), msastats.calculate_msa_stats(msa)))
254
+ ```
255
+
256
+ #### Example 1: One gap interval shared by 2 of 3 sequences
257
+
258
+ ```python
259
+ msa = ["A--A", "A--A", "AAAA"] # N=3, L=4
260
+ stats = stats_dict(msa)
261
+
262
+ assert stats["MSA_LEN"] == 4.0
263
+ assert stats["LONGEST_UNALIGNED_SEQ"] == 4.0
264
+ assert stats["SHORTEST_UNALIGNED_SEQ"] == 2.0
265
+
266
+ # One unique gap interval of length 2, appearing in 2 sequences:
267
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
268
+ assert stats["TOT_NUM_GAPS"] == 2.0
269
+ assert stats["AVG_GAP_SIZE"] == 2.0
270
+ assert stats["NUM_GAPS_LEN_TWO"] == 2.0
271
+
272
+ # Column-wise gap counts (original alignment):
273
+ assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
274
+ assert stats["MSA_POSITION_WITH_2_GAPS"] == 2.0
275
+ ```
276
+
277
+ #### Example 2: All-gap columns are ignored for gap-run detection
278
+
279
+ ```python
280
+ msa = ["A-A", "A-A", "A-A"] # middle column is all gaps
281
+ stats = stats_dict(msa)
282
+
283
+ assert stats["MSA_LEN"] == 3.0
284
+
285
+ # The all-gap column is removed before gap runs are detected, so there are no gaps:
286
+ assert stats["TOT_NUM_GAPS"] == 0.0
287
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 0.0
288
+ assert stats["AVG_GAP_SIZE"] == 0.0
289
+
290
+ # But column-wise counts still “see” the original alignment columns:
291
+ assert stats["MSA_POSITION_WITH_0_GAPS"] == 2.0
292
+ ```
293
+
294
+ #### Example 3: “total gap runs” vs “unique gap intervals”
295
+
296
+ ```python
297
+ msa = ["A-A", "A-A", "AAA", "AAA"] # one gap interval shared by 2 sequences
298
+ stats = stats_dict(msa)
299
+
300
+ # Total occurrences (one per sequence that has it):
301
+ assert stats["NUM_GAPS_LEN_ONE"] == 2.0
302
+
303
+ # Unique intervals are counted once:
304
+ assert stats["TOT_NUM_UNIQUE_GAPS"] == 1.0
305
+ assert stats["NUM_GAPS_LEN_ONE_IN_TWO_SEQS"] == 1.0
306
+ ```
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0.3", "wheel"]  # the string-form `license = "MIT"` in [project] (PEP 639) is rejected by older setuptools
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "PyMSAStats"  # distribution name; the importable modules are listed under [tool.setuptools]
7
+ version = "0.1.0"
8
+ description = "Pure-Python MSA summary statistics"
9
+ readme = "README.md"
10
+ license = "MIT"  # SPDX license expression (PEP 639 string form)
11
+ authors = [{ name = "Naiel J" }]
12
+ requires-python = ">=3.9"
13
+ dependencies = []  # no runtime dependencies
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Science/Research",
17
+ "Programming Language :: Python :: 3.9",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/naielj/PyMSAStats"
27
+ Repository = "https://github.com/naielj/PyMSAStats"
28
+ Issues = "https://github.com/naielj/PyMSAStats/issues"
29
+
30
+ [dependency-groups]  # PEP 735 groups: development-only requirements
31
+ dev = [
32
+ "matplotlib>=3.7",
33
+ "msasim>=25.12.1",
34
+ "msastats>=25.11.1",  # NOTE(review): shares the import name of this project's own `msastats` module; presumably the reference implementation for comparison tests, confirm
35
+ "pytest>=8.4.1",
36
+ ]
37
+
38
+ [tool.setuptools]
39
+ package-dir = {"" = "src"}  # top-level modules are looked up under src/
40
+ py-modules = ["msastats", "msa_stats_calculator"]  # shipped as two single-file modules
41
+
42
+ [tool.pytest.ini_options]
43
+ pythonpath = [  # directories pytest inserts into sys.path before collecting tests
44
+ "src"
45
+ ]