zehramsa 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zehramsa-1.0.0/PKG-INFO +5 -0
- zehramsa-1.0.0/pyproject.toml +13 -0
- zehramsa-1.0.0/setup.cfg +4 -0
- zehramsa-1.0.0/src/zehramsa/__init__.py +19 -0
- zehramsa-1.0.0/src/zehramsa/align.py +74 -0
- zehramsa-1.0.0/src/zehramsa/center_star.py +131 -0
- zehramsa-1.0.0/src/zehramsa/needleman_wunsch.py +84 -0
- zehramsa-1.0.0/src/zehramsa/result.py +64 -0
- zehramsa-1.0.0/src/zehramsa/scoring.py +20 -0
- zehramsa-1.0.0/src/zehramsa.egg-info/PKG-INFO +5 -0
- zehramsa-1.0.0/src/zehramsa.egg-info/SOURCES.txt +11 -0
- zehramsa-1.0.0/src/zehramsa.egg-info/dependency_links.txt +1 -0
- zehramsa-1.0.0/src/zehramsa.egg-info/top_level.txt +1 -0
zehramsa-1.0.0/pyproject.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "zehramsa"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Multiple Sequence Alignment via Dynamic Programming"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
dependencies = []
|
|
11
|
+
|
|
12
|
+
[tool.setuptools.packages.find]
|
|
13
|
+
where = ["src"]
|
zehramsa-1.0.0/src/zehramsa/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Package metadata.
__version__ = "1.0.0"
__author__ = "Zehra"

# Re-export the public entry points so callers can import them straight
# from the package instead of reaching into the submodules.
from .align import align
from .result import MSAResult, PairwiseResult
from .scoring import SimpleScoring, DEFAULT_SCORING
from .needleman_wunsch import needleman_wunsch, needleman_wunsch_score
from .center_star import center_star_align

# Names exported by ``from zehramsa import *``.
__all__ = [
    # high-level multiple alignment
    "align",
    # result containers
    "MSAResult",
    "PairwiseResult",
    # scoring
    "SimpleScoring",
    "DEFAULT_SCORING",
    # building blocks
    "needleman_wunsch",
    "needleman_wunsch_score",
    "center_star_align",
]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .needleman_wunsch import needleman_wunsch
|
|
4
|
+
from .center_star import center_star_align
|
|
5
|
+
from .result import MSAResult, PairwiseResult
|
|
6
|
+
from .scoring import DEFAULT_SCORING
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def align(
    sequences: list[str],
    *,
    scoring=None,
    verbose: bool = False,
) -> MSAResult:
    """Align several sequences and return the result as an ``MSAResult``.

    The input strings are upper-cased and named ``Seq_0``, ``Seq_1``, ... .
    Every pair is aligned with ``needleman_wunsch``; the pairwise scores and
    alignments are handed to ``center_star_align`` to build the multiple
    alignment, and a sum-of-pairs score is computed over its columns.

    Args:
        sequences: The sequences to align; at least two strings.
        scoring: Object providing ``gap`` and ``score(a, b)``. ``None``
            selects ``DEFAULT_SCORING``.
        verbose: When true, progress lines are printed to stdout.

    Returns:
        An ``MSAResult`` holding the aligned sequences keyed by generated
        name, the sum-of-pairs score, the center sequence name and all
        pairwise results.

    Raises:
        ValueError: If fewer than two sequences are given.
        TypeError: If any element of ``sequences`` is not a ``str``.
    """
    # --- input validation -------------------------------------------------
    if not sequences:
        raise ValueError("sequences list is empty — provide at least 2 sequences.")
    if len(sequences) < 2:
        raise ValueError("At least 2 sequences are required for alignment.")
    for idx, item in enumerate(sequences):
        if not isinstance(item, str):
            raise TypeError(
                f"sequences[{idx}] must be a string, got {type(item).__name__!r}"
            )

    scheme = DEFAULT_SCORING if scoring is None else scoring

    count = len(sequences)
    names = [f'Seq_{idx}' for idx in range(count)]
    seqs = [item.upper() for item in sequences]

    if verbose:
        print(f"[zehramsa] Aligning {count} sequences: {names}")

    # Index pairs (i < j), reused for the pairwise pass and the SP score.
    index_pairs = [(i, j) for i in range(count) for j in range(i + 1, count)]

    # --- all-against-all pairwise alignment --------------------------------
    pw_results: list[PairwiseResult] = []
    score_matrix: list[list[float]] = [[0.0] * count for _ in range(count)]
    for i, j in index_pairs:
        pw = needleman_wunsch(
            seqs[i], seqs[j], scheme,
            seq1_name=names[i], seq2_name=names[j],
        )
        pw_results.append(pw)
        # The matrix is symmetric; the diagonal stays at 0.0.
        score_matrix[i][j] = score_matrix[j][i] = pw.score
        if verbose:
            print(f" NW {names[i]} vs {names[j]}: score={pw.score:.2f}")

    # --- merge around the center sequence ----------------------------------
    sequences_aligned, center_name = center_star_align(
        seqs, names, scheme,
        score_matrix=score_matrix,
        pairwise_results=pw_results,
    )

    if verbose:
        print(f" Center sequence: {center_name}")

    # --- sum-of-pairs score ------------------------------------------------
    rows = list(sequences_aligned.values())
    width = len(rows[0]) if rows else 0
    sp_score = 0.0
    for i, j in index_pairs:
        upper, lower = rows[i], rows[j]
        # Every column of every pair goes through the scoring object,
        # including columns where both rows hold '-'.
        # NOTE(review): confirm that charging gap/gap columns is intended.
        for col in range(width):
            sp_score += scheme.score(upper[col], lower[col])

    if verbose:
        print(f" SP-score: {sp_score:.2f}")

    return MSAResult(
        sequences=sequences_aligned,
        score=sp_score,
        center_sequence=center_name,
        pairwise_results=pw_results,
    )
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .needleman_wunsch import needleman_wunsch
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def center_star_align(
    seqs: list[str],
    names: list[str],
    scoring,
    *,
    score_matrix: list[list[float]] | None = None,
    pairwise_results=None,
) -> tuple[dict[str, str], str]:
    """Merge pairwise alignments against a center sequence into one alignment.

    The center is the sequence whose row in ``score_matrix`` has the largest
    sum (first one on ties). Each other sequence contributes its pairwise
    alignment with the center; those alignments are merged column by column
    so that every output row has the same length.

    Args:
        seqs: The sequences to align.
        names: One name per sequence, used as keys of the returned dict and
            to look entries up in ``pairwise_results``.
        scoring: Scoring object forwarded to ``needleman_wunsch`` whenever a
            pairwise alignment has to be computed here.
        score_matrix: Optional symmetric matrix of pairwise scores. When
            omitted, all pairs are aligned with ``needleman_wunsch``.
        pairwise_results: Optional iterable of objects exposing
            ``seq1_name``, ``seq2_name``, ``aligned_seq1`` and
            ``aligned_seq2``. When given, the alignments of the center with
            the other sequences are taken from it (matched by name, in
            either orientation, first match wins).

    Returns:
        ``(aligned, center_name)`` where ``aligned`` maps each name to its
        gapped row, with the center sequence first.

    Raises:
        ValueError: If ``seqs`` is empty, if ``pairwise_results`` lacks an
            alignment between the center and some sequence, or if the
            pairwise alignments disagree about the center sequence.
    """
    n = len(seqs)
    if n == 0:
        # Fail with a clear message instead of max() on an empty range.
        raise ValueError("seqs is empty; provide at least one sequence.")
    if n == 1:
        # Nothing to merge: the lone sequence is its own center.
        return {names[0]: seqs[0]}, names[0]

    # Alignments computed here, keyed by (lower index, higher index).
    cached_pw: dict[tuple[int, int], object] = {}

    if score_matrix is None:
        score_matrix = [[0.0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1, n):
                pw = needleman_wunsch(
                    seqs[i], seqs[j], scoring,
                    seq1_name=names[i], seq2_name=names[j],
                )
                score_matrix[i][j] = pw.score
                score_matrix[j][i] = pw.score
                cached_pw[(i, j)] = pw

    center_idx = max(range(n), key=lambda i: sum(score_matrix[i]))
    center_name = names[center_idx]
    other_indices = [k for k in range(n) if k != center_idx]

    # pairwise_alignments[k] = (gapped center row, gapped row of sequence k)
    pairwise_alignments: dict[int, tuple[str, str]] = {}

    if pairwise_results is not None:
        # Index the supplied results once, in both orientations, instead of
        # rescanning the whole list for every sequence. setdefault keeps the
        # first entry in list order, like a front-to-back linear search.
        by_names: dict[tuple[str, str], tuple[str, str]] = {}
        for pw in pairwise_results:
            by_names.setdefault(
                (pw.seq1_name, pw.seq2_name), (pw.aligned_seq1, pw.aligned_seq2)
            )
            by_names.setdefault(
                (pw.seq2_name, pw.seq1_name), (pw.aligned_seq2, pw.aligned_seq1)
            )
        for k in other_indices:
            found = by_names.get((center_name, names[k]))
            if found is None:
                raise ValueError(
                    f"No pairwise alignment found for '{names[k]}' against center '{center_name}'. "
                    "This is likely a name mismatch in pairwise_results."
                )
            pairwise_alignments[k] = found
    else:
        for k in other_indices:
            key = (min(center_idx, k), max(center_idx, k))
            pw = cached_pw.get(key)
            if pw is None:
                pw = needleman_wunsch(
                    seqs[center_idx], seqs[k], scoring,
                    seq1_name=center_name, seq2_name=names[k],
                )
                pairwise_alignments[k] = (pw.aligned_seq1, pw.aligned_seq2)
            elif center_idx == key[0]:
                # Cached with the center as the first sequence.
                pairwise_alignments[k] = (pw.aligned_seq1, pw.aligned_seq2)
            else:
                # Cached with the center as the second sequence: swap rows.
                pairwise_alignments[k] = (pw.aligned_seq2, pw.aligned_seq1)

    # Per sequence: list of (center char, other char) alignment columns.
    col_lists: dict[int, list[tuple[str, str]]] = {
        k: list(zip(center_row, other_row))
        for k, (center_row, other_row) in pairwise_alignments.items()
    }
    lengths = {k: len(col_lists[k]) for k in other_indices}
    ptrs: dict[int, int] = {k: 0 for k in other_indices}

    out_center: list[str] = []
    out_others: dict[int, list[str]] = {k: [] for k in other_indices}

    def leading_center_gaps(k: int) -> int:
        """Count consecutive columns at ptrs[k] whose center char is '-'."""
        p = ptrs[k]
        while p < lengths[k] and col_lists[k][p][0] == '-':
            p += 1
        return p - ptrs[k]

    while True:
        # Step 1: emit insertions relative to the center. The center gets
        # gaps for the longest pending insertion; each sequence emits its
        # own inserted characters first and is then padded with gaps.
        ins_counts = {k: leading_center_gaps(k) for k in other_indices}
        max_ins = max(ins_counts.values(), default=0)
        if max_ins:
            out_center.extend('-' * max_ins)
            for k in other_indices:
                start = ptrs[k]
                stop = start + ins_counts[k]
                out_others[k].extend(oc for _, oc in col_lists[k][start:stop])
                out_others[k].extend('-' * (max_ins - ins_counts[k]))
                ptrs[k] = stop

        pending = [k for k in other_indices if ptrs[k] < lengths[k]]
        if not pending:
            break

        # Step 2: consume one real center character from every alignment.
        center_chars = {col_lists[k][ptrs[k]][0] for k in pending}
        if len(center_chars) != 1:
            # Raised explicitly (not asserted) so the check survives -O:
            # pairwise_results comes from the caller and may be inconsistent.
            raise ValueError(
                "Inconsistent pairwise alignments: expected exactly one "
                f"center character, got {sorted(center_chars)!r}"
            )
        (c_char,) = center_chars
        out_center.append(c_char)
        for k in other_indices:
            if ptrs[k] < lengths[k]:
                out_others[k].append(col_lists[k][ptrs[k]][1])
                ptrs[k] += 1
            else:
                # This alignment is already exhausted; pad with a gap.
                out_others[k].append('-')

    result: dict[str, str] = {center_name: ''.join(out_center)}
    for k in other_indices:
        result[names[k]] = ''.join(out_others[k])
    return result, center_name
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from .result import PairwiseResult
|
|
3
|
+
from .scoring import DEFAULT_SCORING
|
|
4
|
+
|
|
5
|
+
# Traceback codes stored in the direction matrix.
_DIAG = 0  # came from (i-1, j-1): pair seq1[i-1] with seq2[j-1]
_UP = 1    # came from (i-1, j):   seq1[i-1] against a gap
_LEFT = 2  # came from (i, j-1):   a gap against seq2[j-1]


def needleman_wunsch(
    seq1: str,
    seq2: str,
    scoring=None,
    *,
    seq1_name: str = 'Seq1',
    seq2_name: str = 'Seq2',
) -> PairwiseResult:
    """Globally align two sequences with a linear gap penalty.

    Args:
        seq1: First sequence.
        seq2: Second sequence.
        scoring: Object providing ``gap`` (added per gap column) and
            ``score(a, b)`` for a pair of characters. ``None`` selects
            ``DEFAULT_SCORING``.
        seq1_name: Name stored for the first sequence in the result.
        seq2_name: Name stored for the second sequence in the result.

    Returns:
        A ``PairwiseResult`` with both gapped rows and the value of the
        bottom-right cell of the score table.
    """
    scheme = DEFAULT_SCORING if scoring is None else scoring
    gap = scheme.gap
    rows, cols = len(seq1), len(seq2)

    # table[i][j]: best score aligning seq1[:i] with seq2[:j];
    # moves[i][j]: the step that produced it.
    table: list[list[float]] = [[0.0] * (cols + 1) for _ in range(rows + 1)]
    moves: list[list[int]] = [[_DIAG] * (cols + 1) for _ in range(rows + 1)]

    # Borders: a prefix aligned against nothing is all gaps.
    for i in range(1, rows + 1):
        table[i][0], moves[i][0] = i * gap, _UP
    for j in range(1, cols + 1):
        table[0][j], moves[0][j] = j * gap, _LEFT

    for i, a in enumerate(seq1, start=1):
        above, current, current_moves = table[i - 1], table[i], moves[i]
        for j, b in enumerate(seq2, start=1):
            # Start from the diagonal and switch only on a strictly better
            # score, so ties prefer diagonal, then up, then left.
            best, step = above[j - 1] + scheme.score(a, b), _DIAG
            via_up = above[j] + gap
            if via_up > best:
                best, step = via_up, _UP
            via_left = current[j - 1] + gap
            if via_left > best:
                best, step = via_left, _LEFT
            current[j] = best
            current_moves[j] = step

    # Walk back from the bottom-right corner, collecting columns in reverse.
    trail: list[tuple[str, str]] = []
    i, j = rows, cols
    while i > 0 or j > 0:
        step = moves[i][j]
        if step == _DIAG:
            i -= 1
            j -= 1
            trail.append((seq1[i], seq2[j]))
        elif step == _UP:
            i -= 1
            trail.append((seq1[i], '-'))
        else:
            j -= 1
            trail.append(('-', seq2[j]))
    trail.reverse()

    return PairwiseResult(
        seq1_name=seq1_name,
        seq2_name=seq2_name,
        aligned_seq1=''.join(top for top, _ in trail),
        aligned_seq2=''.join(bottom for _, bottom in trail),
        score=table[rows][cols],
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def needleman_wunsch_score(seq1: str, seq2: str, scoring=None) -> float:
    """Return the optimal global alignment score of two sequences.

    Uses the same recurrence as ``needleman_wunsch`` (linear gap penalty
    ``scoring.gap``, pair scores from ``scoring.score``) but keeps only two
    rows of the score table and builds no traceback, so memory is
    proportional to ``len(seq2)`` rather than ``len(seq1) * len(seq2)``.

    Args:
        seq1: First sequence.
        seq2: Second sequence.
        scoring: Object providing ``gap`` and ``score(a, b)``. ``None``
            selects ``DEFAULT_SCORING``.

    Returns:
        The score of the best global alignment.
    """
    if scoring is None:
        scoring = DEFAULT_SCORING

    gap = scoring.gap
    # Row 0: the empty prefix of seq1 against each prefix of seq2.
    previous = [0.0] + [j * gap for j in range(1, len(seq2) + 1)]
    for i, a in enumerate(seq1, start=1):
        current = [i * gap]
        for j, b in enumerate(seq2, start=1):
            # max() keeps the first maximal argument: diagonal, up, left.
            current.append(
                max(
                    previous[j - 1] + scoring.score(a, b),
                    previous[j] + gap,
                    current[j - 1] + gap,
                )
            )
        previous = current
    return previous[-1]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class PairwiseResult:
    """Two gapped rows of a pairwise alignment together with its score."""

    seq1_name: str     # name of the first sequence
    seq2_name: str     # name of the second sequence
    aligned_seq1: str  # first row of the alignment
    aligned_seq2: str  # second row of the alignment
    score: float       # alignment score

    @property
    def alignment_length(self) -> int:
        """Number of columns, measured on the first row."""
        return len(self.aligned_seq1)

    @property
    def identity(self) -> float:
        """Fraction of columns holding two equal non-gap characters.

        Gap columns count in the denominator. An alignment with no columns
        has identity 0.0.
        """
        total = self.alignment_length
        if total == 0:
            return 0.0
        same = 0
        for top, bottom in zip(self.aligned_seq1, self.aligned_seq2):
            if top != '-' and top == bottom:
                same += 1
        return same / total

    def __str__(self) -> str:
        first_row = f"{self.seq1_name}: {self.aligned_seq1}\n"
        second_row = f"{self.seq2_name}: {self.aligned_seq2}\n"
        summary = f"Score: {self.score} Identity: {self.identity:.1%}"
        return first_row + second_row + summary
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
|
|
36
|
+
class MSAResult:
|
|
37
|
+
sequences: dict[str, str]
|
|
38
|
+
score: float
|
|
39
|
+
center_sequence: str
|
|
40
|
+
pairwise_results: list[PairwiseResult]
|
|
41
|
+
column_count: int = field(init=False)
|
|
42
|
+
|
|
43
|
+
def __post_init__(self) -> None:
|
|
44
|
+
lengths = {len(s) for s in self.sequences.values()}
|
|
45
|
+
if len(lengths) == 1:
|
|
46
|
+
self.column_count = next(iter(lengths))
|
|
47
|
+
elif lengths:
|
|
48
|
+
self.column_count = max(lengths)
|
|
49
|
+
else:
|
|
50
|
+
self.column_count = 0
|
|
51
|
+
|
|
52
|
+
@property
|
|
53
|
+
def names(self) -> list[str]:
|
|
54
|
+
return list(self.sequences.keys())
|
|
55
|
+
|
|
56
|
+
@property
|
|
57
|
+
def aligned_sequences(self) -> list[str]:
|
|
58
|
+
return list(self.sequences.values())
|
|
59
|
+
|
|
60
|
+
def __str__(self) -> str:
|
|
61
|
+
lines = ["=== MSA Result ===", f"Score: {self.score}"]
|
|
62
|
+
for name, seq in self.sequences.items():
|
|
63
|
+
lines.append(f"{name}: {seq}")
|
|
64
|
+
return "\n".join(lines)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SimpleScoring:
    """Match / mismatch / gap scoring for single characters."""

    def __init__(self, match: float = 1.0, mismatch: float = -1.0, gap: float = -2.0) -> None:
        self.match, self.mismatch, self.gap = match, mismatch, gap

    def score(self, a: str, b: str) -> float:
        """Score one alignment column.

        Returns ``gap`` when either character is ``'-'``, ``match`` when
        the two characters are equal, and ``mismatch`` otherwise.

        Raises:
            ValueError: If ``a`` or ``b`` is not exactly one character long.
        """
        if not (len(a) == 1 and len(b) == 1):
            raise ValueError(
                f"score() expects single characters, got {a!r} and {b!r}"
            )
        involves_gap = a == '-' or b == '-'
        if involves_gap:
            return self.gap
        if a == b:
            return self.match
        return self.mismatch


# Shared instance with the default parameters, used when no scoring is given.
DEFAULT_SCORING = SimpleScoring()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
src/zehramsa/__init__.py
|
|
3
|
+
src/zehramsa/align.py
|
|
4
|
+
src/zehramsa/center_star.py
|
|
5
|
+
src/zehramsa/needleman_wunsch.py
|
|
6
|
+
src/zehramsa/result.py
|
|
7
|
+
src/zehramsa/scoring.py
|
|
8
|
+
src/zehramsa.egg-info/PKG-INFO
|
|
9
|
+
src/zehramsa.egg-info/SOURCES.txt
|
|
10
|
+
src/zehramsa.egg-info/dependency_links.txt
|
|
11
|
+
src/zehramsa.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
zehramsa
|