ataserinyelMSA 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ataserinyelmsa-0.1.0/LICENSE +21 -0
- ataserinyelmsa-0.1.0/PKG-INFO +70 -0
- ataserinyelmsa-0.1.0/README.md +55 -0
- ataserinyelmsa-0.1.0/pyproject.toml +28 -0
- ataserinyelmsa-0.1.0/setup.cfg +4 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA/alignment.py +155 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA/fasta.py +35 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA/scoring.py +24 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA/tree.py +60 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA.egg-info/PKG-INFO +70 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA.egg-info/SOURCES.txt +19 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA.egg-info/dependency_links.txt +1 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA.egg-info/requires.txt +1 -0
- ataserinyelmsa-0.1.0/src/ataserinyelMSA.egg-info/top_level.txt +1 -0
- ataserinyelmsa-0.1.0/test/test_alignment.py +3 -0
- ataserinyelmsa-0.1.0/test/test_distance.py +8 -0
- ataserinyelmsa-0.1.0/test/test_fasta.py +3 -0
- ataserinyelmsa-0.1.0/test/test_pipline.py +33 -0
- ataserinyelmsa-0.1.0/test/test_progressive_alignment.py +11 -0
- ataserinyelmsa-0.1.0/test/test_scoring.py +6 -0
- ataserinyelmsa-0.1.0/test/test_tree.py +8 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ata Serinyel
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: ataserinyelMSA
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A simple MAFFT-based Multiple Sequence Alignment (MSA) library
|
|
5
|
+
Author-email: ataserinyel <clasher.mp2@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ataserinyel/ataserinyelMSA
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy>=1.21.0
|
|
15
|
+
|
|
16
|
+
# ataserinyelMSA
|
|
17
|
+
|
|
18
|
+
A simple MAFFT-inspired Multiple Sequence Alignment (MSA) tool written in Python.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install ataserinyelMSA
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python main.py input.fasta output.fasta
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Example
|
|
33
|
+
|
|
34
|
+
Input (`input.fasta`):
|
|
35
|
+
```
|
|
36
|
+
>seq1
|
|
37
|
+
GATTACA
|
|
38
|
+
>seq2
|
|
39
|
+
GCATGCU
|
|
40
|
+
>seq3
|
|
41
|
+
AGCTAGC
|
|
42
|
+
```
|
|
43
|
+
Output (`output.fasta`):
|
|
44
|
+
```
|
|
45
|
+
>seq1
|
|
46
|
+
-GAT-TACA
|
|
47
|
+
>seq2
|
|
48
|
+
-GC-ATGCU
|
|
49
|
+
>seq3
|
|
50
|
+
AGCTA-GC-
|
|
51
|
+
```
|
|
52
|
+
## Algorithm
|
|
53
|
+
|
|
54
|
+
This tool implements a simplified version of the MAFFT FFT-NS-1 algorithm:
|
|
55
|
+
|
|
56
|
+
1. **FASTA Parsing** - Read and write FASTA format files
|
|
57
|
+
2. **Pairwise Alignment** - Needleman-Wunsch global alignment algorithm
|
|
58
|
+
3. **Distance Matrix** - Compute pairwise distances between sequences
|
|
59
|
+
4. **Guide Tree** - UPGMA clustering algorithm
|
|
60
|
+
5. **Progressive Alignment** - Align sequences following the guide tree order
|
|
61
|
+
|
|
62
|
+
## Differences from original MAFFT
|
|
63
|
+
|
|
64
|
+
- Uses Needleman-Wunsch instead of FFT for similarity calculation
|
|
65
|
+
- Simple +1/-1 scoring matrix instead of advanced substitution matrices
|
|
66
|
+
- Suitable for small datasets
|
|
67
|
+
|
|
68
|
+
## Author
|
|
69
|
+
|
|
70
|
+
Ata Serinyel
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# ataserinyelMSA
|
|
2
|
+
|
|
3
|
+
A simple MAFFT-inspired Multiple Sequence Alignment (MSA) tool written in Python.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install ataserinyelMSA
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
python main.py input.fasta output.fasta
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Example
|
|
18
|
+
|
|
19
|
+
Input (`input.fasta`):
|
|
20
|
+
```
|
|
21
|
+
>seq1
|
|
22
|
+
GATTACA
|
|
23
|
+
>seq2
|
|
24
|
+
GCATGCU
|
|
25
|
+
>seq3
|
|
26
|
+
AGCTAGC
|
|
27
|
+
```
|
|
28
|
+
Output (`output.fasta`):
|
|
29
|
+
```
|
|
30
|
+
>seq1
|
|
31
|
+
-GAT-TACA
|
|
32
|
+
>seq2
|
|
33
|
+
-GC-ATGCU
|
|
34
|
+
>seq3
|
|
35
|
+
AGCTA-GC-
|
|
36
|
+
```
|
|
37
|
+
## Algorithm
|
|
38
|
+
|
|
39
|
+
This tool implements a simplified version of the MAFFT FFT-NS-1 algorithm:
|
|
40
|
+
|
|
41
|
+
1. **FASTA Parsing** - Read and write FASTA format files
|
|
42
|
+
2. **Pairwise Alignment** - Needleman-Wunsch global alignment algorithm
|
|
43
|
+
3. **Distance Matrix** - Compute pairwise distances between sequences
|
|
44
|
+
4. **Guide Tree** - UPGMA clustering algorithm
|
|
45
|
+
5. **Progressive Alignment** - Align sequences following the guide tree order
|
|
46
|
+
|
|
47
|
+
## Differences from original MAFFT
|
|
48
|
+
|
|
49
|
+
- Uses Needleman-Wunsch instead of FFT for similarity calculation
|
|
50
|
+
- Simple +1/-1 scoring matrix instead of advanced substitution matrices
|
|
51
|
+
- Suitable for small datasets
|
|
52
|
+
|
|
53
|
+
## Author
|
|
54
|
+
|
|
55
|
+
Ata Serinyel
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0,<77.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ataserinyelMSA"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="ataserinyel", email="clasher.mp2@gmail.com" }
|
|
10
|
+
]
|
|
11
|
+
description = "A simple MAFFT-based Multiple Sequence Alignment (MSA) library"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
license = {text = "MIT"}
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"numpy>=1.21.0",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/ataserinyel/ataserinyelMSA"
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Needleman-Wunsch algorithm
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from src.ataserinyelMSA.scoring import get_score, GAP_PENALTY
|
|
5
|
+
|
|
6
|
+
def needleman_wunsch(seq1, seq2):
    '''
    Globally align two sequences with the Needleman-Wunsch algorithm.

    Substitution scores come from get_score and every gap costs GAP_PENALTY.

    Returns a tuple (aligned_seq1, aligned_seq2) of equal-length strings in
    which '-' marks a gap.
    '''
    rows, cols = len(seq1), len(seq2)

    # Score matrix; the first row and column hold cumulative gap penalties.
    score = np.zeros([rows + 1, cols + 1])
    for r in range(1, rows + 1):
        score[r, 0] = score[r - 1, 0] + GAP_PENALTY
    for c in range(1, cols + 1):
        score[0, c] = score[0, c - 1] + GAP_PENALTY

    # Fill: each cell is the best of the diagonal, upper and left moves.
    for r in range(1, rows + 1):
        for c in range(1, cols + 1):
            score[r, c] = max(
                score[r - 1, c - 1] + get_score(seq1[r - 1], seq2[c - 1]),
                score[r - 1, c] + GAP_PENALTY,
                score[r, c - 1] + GAP_PENALTY,
            )

    # Traceback from the bottom-right corner. Columns are collected in
    # reverse order and flipped once at the end.
    top, bottom = [], []
    r, c = rows, cols
    while r > 0 or c > 0:
        from_diagonal = (
            r > 0
            and c > 0
            and score[r, c] == score[r - 1, c - 1] + get_score(seq1[r - 1], seq2[c - 1])
        )
        if from_diagonal:
            top.append(seq1[r - 1])
            bottom.append(seq2[c - 1])
            r -= 1
            c -= 1
        elif r > 0 and score[r, c] == score[r - 1, c] + GAP_PENALTY:
            # Came from above: seq1 residue against a gap in seq2.
            top.append(seq1[r - 1])
            bottom.append('-')
            r -= 1
        else:
            # Came from the left: gap in seq1 against a seq2 residue.
            top.append('-')
            bottom.append(seq2[c - 1])
            c -= 1

    return ''.join(reversed(top)), ''.join(reversed(bottom))
|
|
52
|
+
|
|
53
|
+
def compute_distance(seq1, seq2):
    '''
    Compute the distance between two sequences.

    The sequences are aligned with needleman_wunsch and the fraction of
    alignment columns holding identical characters is taken as the
    similarity. The distance is 1 - similarity, so a higher similarity
    means a lower distance.

    Returns a float in [0, 1]. Two empty sequences yield 0.0.
    '''
    aligned_seq1, aligned_seq2 = needleman_wunsch(seq1, seq2)

    total = len(aligned_seq1)
    if total == 0:
        # Both inputs are empty: nothing differs, and dividing by the
        # alignment length would raise ZeroDivisionError.
        return 0.0

    matches = sum(1 for a, b in zip(aligned_seq1, aligned_seq2) if a == b)
    similarity = matches / total  # similarity ratio between 0 and 1

    return 1 - similarity
|
|
73
|
+
|
|
74
|
+
def distance_matrix(sequences):
    '''
    Compute the pairwise distance for every pair of sequences.

    Input: a mapping such as {'seq1': 'GATTACA', 'seq2': 'GCATGCU', ...}
    Returns: (matrix, names), where matrix is a symmetric n x n NumPy array
    with a zero diagonal and names lists the keys in row/column order.
    '''
    names = [*sequences.keys()]
    seqs = [*sequences.values()]
    count = len(seqs)

    matrix = np.zeros((count, count))

    # Only the upper triangle is computed; each value is mirrored because
    # the distance is symmetric, and the diagonal stays at 0.
    for row, left in enumerate(seqs):
        for col in range(row + 1, count):
            matrix[row][col] = matrix[col][row] = compute_distance(left, seqs[col])

    return matrix, names
|
|
94
|
+
|
|
95
|
+
def get_consensus(aligned_seqs):
    """
    Build a consensus sequence from a group of aligned sequences.

    For every column the most frequent non-gap character is chosen. Ties
    are broken deterministically in favour of the character that appears
    first (top to bottom) in the column. A column consisting only of gaps
    yields '-'.

    The column count is taken from the first sequence. An empty group
    yields an empty string.
    """
    if not aligned_seqs:
        return ''

    consensus = []
    for i in range(len(aligned_seqs[0])):
        # Count non-gap characters; dict insertion order records which
        # character was seen first, which makes tie-breaking stable.
        counts = {}
        for seq in aligned_seqs:
            char = seq[i]
            if char != '-':
                counts[char] = counts.get(char, 0) + 1

        if counts:
            # max() returns the first maximal key in iteration order.
            consensus.append(max(counts, key=counts.get))
        else:
            consensus.append('-')  # the whole column is gaps

    return ''.join(consensus)
|
|
109
|
+
|
|
110
|
+
def progressive_alignment(sequences, merge_order):
    '''
    Align the sequences following the merge order of the guide tree.

    Input:
        sequences   -> {'seq1': 'GATTACA', ...}
        merge_order -> [('seq2', 'seq3'), ('seq1', '(seq2, seq3)')]
                       A merged group is referred to by the name
                       '(<name1>, <name2>)' built from the pair that
                       created it.
    Returns: a dict mapping every original name to its aligned sequence,
    in the same order as the input mapping.

    Raises ValueError if merge_order does not join every sequence into the
    final group.
    '''
    if not sequences:
        return {}

    def _apply_gaps(seqs, aligned_consensus, original_consensus):
        # Re-express every member of a group on the new column layout.
        # A '-' in aligned_consensus is a NEW gap column unless it matches
        # a '-' that was already in the group's consensus at the current
        # position; in that case it is an existing column and is copied.
        columns = []
        idx = 0
        for char in aligned_consensus:
            is_existing = (
                char != '-'
                or (idx < len(original_consensus) and original_consensus[idx] == '-')
            )
            if is_existing:
                columns.append(idx)
                idx += 1
            else:
                columns.append(None)
        return [
            ''.join('-' if col is None else seq[col] for col in columns)
            for seq in seqs
        ]

    # groups: group name -> aligned sequences of its members.
    # members: group name -> original names, in the same order as groups.
    groups = {name: [seq] for name, seq in sequences.items()}
    members = {name: [name] for name in sequences}
    merged_name = None

    for name1, name2 in merge_order:
        # Align the two groups through their consensus sequences.
        consensus1 = get_consensus(groups[name1])
        consensus2 = get_consensus(groups[name2])
        aligned1, aligned2 = needleman_wunsch(consensus1, consensus2)

        # Propagate the gap positions to every member of both groups.
        new_group1 = _apply_gaps(groups[name1], aligned1, consensus1)
        new_group2 = _apply_gaps(groups[name2], aligned2, consensus2)

        merged_name = f'({name1}, {name2})'
        groups[merged_name] = new_group1 + new_group2
        members[merged_name] = members[name1] + members[name2]

    # Without any merge the last input sequence is the "final group".
    final_name = merged_name if merged_name is not None else next(reversed(groups))

    # The final group is ordered by merge history, not by input order, so
    # sequences are matched to names through the tracked membership list.
    aligned_by_name = dict(zip(members[final_name], groups[final_name]))
    missing = [name for name in sequences if name not in aligned_by_name]
    if missing:
        raise ValueError(
            f'merge_order does not join all sequences; missing: {missing}'
        )

    return {name: aligned_by_name[name] for name in sequences}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Helper functions for reading and writing FASTA files.
|
|
2
|
+
|
|
3
|
+
def read_fasta(filename):
    '''
    Read a FASTA file.

    Returns a dict mapping each header (the text after '>') to its
    sequence, for example:
    {'seq1': 'ATAATAGC', 'seq2': 'GATACACG'}

    Sequences split over several lines are concatenated and blank lines are
    ignored. If a header occurs more than once, the last record wins.

    Raises ValueError if sequence data appears before the first header.
    '''
    chunks = {}
    current_name = None

    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()  # drop the trailing newline and padding

            if not line:
                # Blank lines carry no data and may precede the first header.
                continue

            if line.startswith('>'):
                current_name = line[1:]  # skip the '>' character
                chunks[current_name] = []
            elif current_name is None:
                raise ValueError(
                    f'{filename}: sequence data found before the first ">" header'
                )
            else:
                # A sequence may be wrapped over several lines.
                chunks[current_name].append(line)

    return {name: ''.join(parts) for name, parts in chunks.items()}
|
|
25
|
+
|
|
26
|
+
def write_fasta(sequences, filename):
    '''
    Write the sequences of a dict to a file in FASTA format.

    Each entry becomes a '>name' header line followed by one sequence line.
    After alignment the sequences may contain gap characters; they are
    written unchanged.
    '''
    with open(filename, 'w') as out:
        out.writelines(
            f'>{name}\n{seq}\n' for name, seq in sequences.items()
        )
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Scoring matrix used for sequence alignment.
|
|
2
|
+
# Nucleotide scoring matrix for DNA and RNA sequences.
|
|
3
|
+
|
|
4
|
+
MATCH = 1
MISMATCH = -1
GAP_PENALTY = -1

# Identical nucleotides score MATCH, different ones score MISMATCH.
# The table covers every ordered pair over the DNA/RNA alphabet below.
_NUCLEOTIDES = 'ATGCU'

NUCLEOTIDE_MATRIX = {
    (first, second): MATCH if first == second else MISMATCH
    for first in _NUCLEOTIDES
    for second in _NUCLEOTIDES
}
|
|
17
|
+
|
|
18
|
+
def get_score(a, b):
    '''
    Return the substitution score for a pair of characters.

    The pair (a, b) is looked up in NUCLEOTIDE_MATRIX. Pairs that are not
    in the matrix score 0.
    '''
    pair = (a, b)
    if pair in NUCLEOTIDE_MATRIX:
        return NUCLEOTIDE_MATRIX[pair]
    return 0
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Builds a guide tree with the UPGMA algorithm.
|
|
2
|
+
# Input: distance matrix and list of names
|
|
3
|
+
# Output: list describing the order in which the sequences are merged
|
|
4
|
+
# e.g. [(i, j), (k, l), ...]
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
def upgma(matrix, names):
    '''
    Return the merge order produced by UPGMA clustering.

    Input:
        matrix -> square, symmetric distance matrix (list of lists or
                  NumPy array); it is not modified.
        names  -> labels of the rows/columns of matrix.
    Returns: a list of merged pairs, for example
        [('seq2', 'seq3'), ('seq1', '(seq2, seq3)'), ...]
    A merged cluster is named '(<left>, <right>)'.

    At each step the closest pair of clusters is merged; the first such
    pair in row-major order wins ties. The distance from the merged
    cluster to any other cluster is the average of the two former
    distances weighted by cluster size, so every original sequence
    contributes equally (an unweighted mean would be WPGMA).
    '''
    # Work on copies so the caller's data is left untouched.
    dist = [[float(value) for value in row] for row in matrix]
    labels = list(names)
    sizes = [1] * len(labels)  # number of original sequences per cluster
    merge_order = []

    while len(labels) > 1:
        n = len(labels)

        # Find the closest pair of clusters.
        min_dist = float('inf')
        min_i, min_j = 0, 1
        for i in range(n):
            for j in range(i + 1, n):
                if dist[i][j] < min_dist:
                    min_dist = dist[i][j]
                    min_i, min_j = i, j

        merge_order.append((labels[min_i], labels[min_j]))
        new_name = f'({labels[min_i]}, {labels[min_j]})'

        size_i, size_j = sizes[min_i], sizes[min_j]
        merged_size = size_i + size_j

        # Clusters that survive this step, in their current order.
        keep = [k for k in range(n) if k != min_i and k != min_j]

        # Size-weighted average distance from the new cluster to the rest.
        new_distances = [
            (size_i * dist[min_i][k] + size_j * dist[min_j][k]) / merged_size
            for k in keep
        ]

        # Drop the two merged rows/columns and append the new cluster last.
        dist = [
            [dist[row][col] for col in keep] + [new_distances[pos]]
            for pos, row in enumerate(keep)
        ]
        dist.append(new_distances + [0.0])
        labels = [labels[k] for k in keep] + [new_name]
        sizes = [sizes[k] for k in keep] + [merged_size]

    return merge_order
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: ataserinyelMSA
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A simple MAFFT-based Multiple Sequence Alignment (MSA) library
|
|
5
|
+
Author-email: ataserinyel <clasher.mp2@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ataserinyel/ataserinyelMSA
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: numpy>=1.21.0
|
|
15
|
+
|
|
16
|
+
# ataserinyelMSA
|
|
17
|
+
|
|
18
|
+
A simple MAFFT-inspired Multiple Sequence Alignment (MSA) tool written in Python.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install ataserinyelMSA
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python main.py input.fasta output.fasta
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Example
|
|
33
|
+
|
|
34
|
+
Input (`input.fasta`):
|
|
35
|
+
```
|
|
36
|
+
>seq1
|
|
37
|
+
GATTACA
|
|
38
|
+
>seq2
|
|
39
|
+
GCATGCU
|
|
40
|
+
>seq3
|
|
41
|
+
AGCTAGC
|
|
42
|
+
```
|
|
43
|
+
Output (`output.fasta`):
|
|
44
|
+
```
|
|
45
|
+
>seq1
|
|
46
|
+
-GAT-TACA
|
|
47
|
+
>seq2
|
|
48
|
+
-GC-ATGCU
|
|
49
|
+
>seq3
|
|
50
|
+
AGCTA-GC-
|
|
51
|
+
```
|
|
52
|
+
## Algorithm
|
|
53
|
+
|
|
54
|
+
This tool implements a simplified version of the MAFFT FFT-NS-1 algorithm:
|
|
55
|
+
|
|
56
|
+
1. **FASTA Parsing** - Read and write FASTA format files
|
|
57
|
+
2. **Pairwise Alignment** - Needleman-Wunsch global alignment algorithm
|
|
58
|
+
3. **Distance Matrix** - Compute pairwise distances between sequences
|
|
59
|
+
4. **Guide Tree** - UPGMA clustering algorithm
|
|
60
|
+
5. **Progressive Alignment** - Align sequences following the guide tree order
|
|
61
|
+
|
|
62
|
+
## Differences from original MAFFT
|
|
63
|
+
|
|
64
|
+
- Uses Needleman-Wunsch instead of FFT for similarity calculation
|
|
65
|
+
- Simple +1/-1 scoring matrix instead of advanced substitution matrices
|
|
66
|
+
- Suitable for small datasets
|
|
67
|
+
|
|
68
|
+
## Author
|
|
69
|
+
|
|
70
|
+
Ata Serinyel
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/ataserinyelMSA/alignment.py
|
|
5
|
+
src/ataserinyelMSA/fasta.py
|
|
6
|
+
src/ataserinyelMSA/scoring.py
|
|
7
|
+
src/ataserinyelMSA/tree.py
|
|
8
|
+
src/ataserinyelMSA.egg-info/PKG-INFO
|
|
9
|
+
src/ataserinyelMSA.egg-info/SOURCES.txt
|
|
10
|
+
src/ataserinyelMSA.egg-info/dependency_links.txt
|
|
11
|
+
src/ataserinyelMSA.egg-info/requires.txt
|
|
12
|
+
src/ataserinyelMSA.egg-info/top_level.txt
|
|
13
|
+
test/test_alignment.py
|
|
14
|
+
test/test_distance.py
|
|
15
|
+
test/test_fasta.py
|
|
16
|
+
test/test_pipline.py
|
|
17
|
+
test/test_progressive_alignment.py
|
|
18
|
+
test/test_scoring.py
|
|
19
|
+
test/test_tree.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
numpy>=1.21.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ataserinyelMSA
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
|
|
2
|
+
from src.ataserinyelMSA.fasta import read_fasta
|
|
3
|
+
from src.ataserinyelMSA.alignment import distance_matrix
|
|
4
|
+
# Load the sample sequences and print their pairwise distance matrix,
# one row per line, with every value rounded to two decimals.
seqs = read_fasta('test/test.fasta')
matrix, names = distance_matrix(seqs)
print(names)
for row in matrix:
    rounded = [round(float(value), 2) for value in row]
    print(rounded)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
|
|
2
|
+
from src.ataserinyelMSA.fasta import read_fasta, write_fasta
|
|
3
|
+
from src.ataserinyelMSA.alignment import distance_matrix, progressive_alignment
|
|
4
|
+
from src.ataserinyelMSA.tree import upgma
|
|
5
|
+
|
|
6
|
+
# 1. Read the FASTA input
seqs = read_fasta('test/test.fasta')
print('1. Diziler okundu:')
for label, residues in seqs.items():
    print(f' {label}: {residues}')

# 2. Pairwise distance matrix
mat, names = distance_matrix(seqs)
print('\n2. Distance Matrix:')
print(' ', names)
for label, row in zip(names, mat):
    rounded = [round(float(value), 2) for value in row]
    print(f' {label}: {rounded}')

# 3. UPGMA guide tree (merge order)
order = upgma(mat, names)
print('\n3. Birleştirme Sirasi:')
for left, right in order:
    print(f' {left} + {right}')

# 4. Progressive alignment along the guide tree
result = progressive_alignment(seqs, order)
print('\n4. Hizalanmis Diziler:')
for label, residues in result.items():
    print(f' {label}: {residues}')

# 5. Write the aligned sequences to a file
write_fasta(result, 'test/output.fasta')
print('\n5. Sonuc test/output.fasta dosyasina yazildi.')
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from src.ataserinyelMSA.fasta import read_fasta
|
|
2
|
+
from src.ataserinyelMSA.alignment import distance_matrix, progressive_alignment
|
|
3
|
+
from src.ataserinyelMSA.tree import upgma
|
|
4
|
+
|
|
5
|
+
# Run the full pipeline on the sample file and print each aligned sequence.
seqs = read_fasta('test/test.fasta')
mat, names = distance_matrix(seqs)
order = upgma(mat, names)
result = progressive_alignment(seqs, order)

for label in result:
    print(f'{label}: {result[label]}')
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from src.ataserinyelMSA.fasta import read_fasta
|
|
2
|
+
from src.ataserinyelMSA.alignment import distance_matrix
|
|
3
|
+
from src.ataserinyelMSA.tree import upgma
|
|
4
|
+
|
|
5
|
+
# Build the distance matrix for the sample file and print the UPGMA
# merge order derived from it.
seqs = read_fasta('test/test.fasta')
distances = distance_matrix(seqs)
mat, names = distances
order = upgma(mat, names)
print(order)
|