FastSTR 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faststr/Segmented_Global_Alignment.py +541 -0
- faststr/__init__.py +2 -0
- faststr/get_subread_trs.py +28 -0
- faststr/main.py +338 -0
- faststr/make_consensus_TRs.py +367 -0
- faststr/scan_subread.py +326 -0
- faststr/trs_align.py +704 -0
- faststr/utils.py +428 -0
- faststr-1.0.0.dist-info/METADATA +220 -0
- faststr-1.0.0.dist-info/RECORD +13 -0
- faststr-1.0.0.dist-info/WHEEL +5 -0
- faststr-1.0.0.dist-info/entry_points.txt +2 -0
- faststr-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import re
|
|
3
|
+
from collections import deque
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# 寻找最佳解释:原则是如果有整motif长度,首选整motif长度,如果有多个,选取错误率最低的;如果没有整motif长度,则选取他周边是整motif长度,且编辑距
|
|
10
|
+
# 离最低的,如果也有多个,则选取错误率最低的
|
|
11
|
+
# RPTA
|
|
12
|
+
def realtime_path_tracking_alignment(standard_seq, seq, mat, mis, gap, logo=0):
    """Globally align ``seq`` to ``standard_seq`` and record one traceback per cell (RPTA).

    Three tables of shape ``(len(seq) + 1) x (len(standard_seq) + 1)`` are filled:

    * ``dp[i][j]`` - best score for aligning ``seq[:i]`` with ``standard_seq[:j]``
      (linear gap penalty, both borders charged ``gap`` per base).
    * ``upper_left[i][j]`` - set with the number of diagonal steps (matches plus
      mismatches) of every co-optimal path reaching the cell.
    * ``priority_path[i][j]`` - ``-1`` for the origin, otherwise the tuple
      ``(prev_i, prev_j, kind)`` where ``kind`` is ``1`` for a match, ``0`` for a
      mismatch and ``-1`` for an indel.

    When several moves tie on score, the move whose resulting path has the
    smallest diagonal count wins; remaining ties prefer the diagonal, then the
    move from the left ``(i, j - 1)``, then the move from above ``(i - 1, j)``.

    Args:
        standard_seq: reference sequence (columns of the tables).
        seq: query sequence (rows of the tables).
        mat: score added for a matching pair.
        mis: score added for a mismatching pair.
        gap: score added for every inserted or deleted base.
        logo: ``1`` returns the full tables; any other value returns only the
            last row of ``dp`` and ``upper_left`` plus the full ``priority_path``.

    Returns:
        ``(dp, upper_left, priority_path)`` as described by ``logo``.
    """
    rows = len(seq) + 1
    cols = len(standard_seq) + 1
    unreachable = float('inf')

    dp = [[0] * cols for _ in range(rows)]
    upper_left = [[set() for _ in range(cols)] for _ in range(rows)]
    priority_path = [[-1] * cols for _ in range(rows)]

    # Borders: pure gap runs with zero diagonal steps, traced back along the edge.
    for i in range(rows):
        upper_left[i][0].add(0)
        if i:
            dp[i][0] = i * gap
            priority_path[i][0] = (i - 1, 0, -1)
    for j in range(cols):
        upper_left[0][j].add(0)
        if j:
            dp[0][j] = j * gap
            priority_path[0][j] = (0, j - 1, -1)

    for i in range(1, rows):
        for j in range(1, cols):
            is_match = standard_seq[j - 1] == seq[i - 1]
            delete_cost = dp[i][j - 1] + gap
            insert_cost = dp[i - 1][j] + gap
            replace_cost = dp[i - 1][j - 1] + (mat if is_match else mis)
            best = max(insert_cost, delete_cost, replace_cost)
            dp[i][j] = best

            # Bit mask of the moves that reach the best score:
            # 1 = diagonal, 2 = from the left (delete), 4 = from above (insert).
            chose_path = 0
            if best == delete_cost:
                upper_left[i][j] = upper_left[i][j - 1]
                chose_path += 2
            if best == insert_cost:
                if chose_path == 0:
                    upper_left[i][j] = upper_left[i - 1][j]
                else:
                    upper_left[i][j] = upper_left[i][j - 1] | upper_left[i - 1][j]
                chose_path += 4
            if best == replace_cost:
                shifted = {x + 1 for x in upper_left[i - 1][j - 1]}
                if chose_path == 0:
                    upper_left[i][j] = shifted
                else:
                    upper_left[i][j] = upper_left[i][j] | shifted
                chose_path += 1

            if chose_path == 1:
                move = 1
            elif chose_path == 2:
                move = 2
            elif chose_path == 4:
                move = 4
            else:
                # Score tie: compare the smallest diagonal count each tied move
                # would yield.  A move that is not part of the tie is given an
                # infinite count so it can never be selected.
                via_diag = min(upper_left[i - 1][j - 1]) + 1 if chose_path != 6 else unreachable
                via_left = min(upper_left[i][j - 1]) if chose_path != 5 else unreachable
                via_up = min(upper_left[i - 1][j]) if chose_path != 3 else unreachable
                if via_diag <= via_left and via_diag <= via_up:
                    move = 1
                elif via_left <= via_up:
                    move = 2
                else:
                    move = 4

            if move == 1:
                priority_path[i][j] = (i - 1, j - 1, 1 if is_match else 0)
            elif move == 2:
                priority_path[i][j] = (i, j - 1, -1)
            else:
                priority_path[i][j] = (i - 1, j, -1)

    if logo == 1:
        return dp, upper_left, priority_path
    return dp[len(seq)], upper_left[len(seq)], priority_path
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# MCLA
|
|
108
|
+
def motif_constrained_local_align(standard_seq, seq, mat, mis, gap, n):
    """Locally align ``seq`` to ``standard_seq``, anchored at its first column (MCLA).

    Scores are floored at zero.  A diagonal step is only scored from column 1
    or from a cell that already carries a traceback entry, so every recorded
    path begins at the first base of ``standard_seq``.  Cells whose three
    candidate scores are all negative keep ``dp == 0`` and no traceback.

    Tables (``(len(seq) + 1) x (len(standard_seq) + 1)``):

    * ``dp[i][j]`` - best non-negative score ending at the cell.
    * ``upper_left[i][j]`` - set of ``(diagonal_steps, start_i, start_j)``
      tuples for the co-optimal paths; cells never reached keep the initial
      ``{(0, -1, -1)}``.  ``start_i``/``start_j`` are the cell of the first
      diagonal step taken from a cell without a traceback.
    * ``priority_path[i][j]`` - ``-1`` when the cell has no path, otherwise
      ``(prev_i, prev_j, kind)`` with ``kind`` ``1`` match, ``0`` mismatch,
      ``-1`` indel.

    Score ties are resolved like in the global variant: the move leading to
    the smallest diagonal count wins, then diagonal, then left, then up.

    Args:
        standard_seq: reference sequence (columns).
        seq: query sequence (rows).
        mat: match score.
        mis: mismatch score.
        gap: score of one inserted or deleted base.
        n: only last-row columns with ``j % n == 0`` compete for ``max_score``.

    Returns:
        ``(dp[-1], upper_left[-1], priority_path, max_score)`` where
        ``max_score`` is the best last-row score found at such a column
        (``0`` when there is none above zero).
    """
    rows = len(seq) + 1
    cols = len(standard_seq) + 1
    last_row = len(seq)
    unreachable = float('inf')

    dp = [[0] * cols for _ in range(rows)]
    upper_left = [[{(0, -1, -1)} for _ in range(cols)] for _ in range(rows)]
    priority_path = [[-1] * cols for _ in range(rows)]
    max_score = 0

    for i in range(1, rows):
        for j in range(1, cols):
            diag_has_path = priority_path[i - 1][j - 1] != -1
            # Beyond the first column a cell is only reachable through a
            # neighbour that already has a path; this keeps alignments
            # anchored at the first base of standard_seq.
            if (j > 1 and not diag_has_path
                    and priority_path[i - 1][j] == -1
                    and priority_path[i][j - 1] == -1):
                continue

            is_match = seq[i - 1] == standard_seq[j - 1]
            match = mis
            if j == 1 or diag_has_path:
                match = dp[i - 1][j - 1] + (mat if is_match else mis)
            delete = dp[i][j - 1] + gap
            insert = dp[i - 1][j] + gap
            dp[i][j] = max(0, match, delete, insert)
            if match < 0 and delete < 0 and insert < 0:
                continue

            # Bit mask of the moves that reach the best score:
            # 1 = diagonal, 2 = from the left (delete), 4 = from above (insert).
            chose_path = 0
            if dp[i][j] == delete:
                upper_left[i][j] = upper_left[i][j - 1]
                chose_path += 2
            if dp[i][j] == insert:
                if chose_path == 0:
                    upper_left[i][j] = upper_left[i - 1][j]
                else:
                    upper_left[i][j] = upper_left[i][j - 1] | upper_left[i - 1][j]
                chose_path += 4
            if dp[i][j] == match:
                if diag_has_path:
                    updated_set = {(x[0] + 1, x[1], x[2]) for x in upper_left[i - 1][j - 1]}
                else:
                    # First diagonal step of a new path: remember where it starts.
                    updated_set = {(x[0] + 1, i, j) for x in upper_left[i - 1][j - 1]}
                if chose_path == 0:
                    upper_left[i][j] = updated_set
                else:
                    upper_left[i][j] = upper_left[i][j] | updated_set
                chose_path += 1

            if chose_path == 1:
                move = 1
            elif chose_path == 2:
                move = 2
            elif chose_path == 4:
                move = 4
            else:
                # Score tie: compare the smallest diagonal count each tied move
                # would yield; moves outside the tie get an infinite count.
                # The diagonal candidate is taken from the diagonal neighbour
                # (i - 1, j - 1); the previous three-way-tie code read the
                # upper neighbour here, which made the diagonal unreachable.
                if chose_path != 6:
                    via_diag = min(t[0] for t in upper_left[i - 1][j - 1]) + 1
                else:
                    via_diag = unreachable
                if chose_path != 5:
                    via_left = min(t[0] for t in upper_left[i][j - 1])
                else:
                    via_left = unreachable
                if chose_path != 3:
                    via_up = min(t[0] for t in upper_left[i - 1][j])
                else:
                    via_up = unreachable
                if via_diag <= via_left and via_diag <= via_up:
                    move = 1
                elif via_left <= via_up:
                    move = 2
                else:
                    move = 4

            if move == 1:
                priority_path[i][j] = (i - 1, j - 1, 1 if is_match else 0)
            elif move == 2:
                priority_path[i][j] = (i, j - 1, -1)
            else:
                priority_path[i][j] = (i - 1, j, -1)

            if i == last_row and j % n == 0 and dp[i][j] > max_score:
                max_score = dp[i][j]

    return dp[len(seq)], upper_left[len(seq)], priority_path, max_score
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def format_alignment(align_path, seq, motif, stab_l):
    """Walk a traceback table backwards and render the alignment as three strings.

    The walk starts at ``align_path[len(seq)][stab_l]`` and follows the stored
    ``(prev_i, prev_j, kind)`` entries until it meets a ``-1`` cell.  Query
    characters are read from the end of ``seq``; target characters are read
    from the end of ``motif`` and wrap around cyclically, i.e. the target is
    rendered as if column ``stab_l`` were the last base of a motif copy.

    Args:
        align_path: traceback table (``-1`` or ``(prev_i, prev_j, kind)`` per
            cell, ``kind`` being ``1`` match, ``0`` mismatch, anything else indel).
        seq: aligned query sequence.
        motif: repeat unit used to spell the target row.
        stab_l: column of the traceback table at which the walk starts.

    Returns:
        ``[target, alignment, query, stab_l, columns, consumed]`` where
        ``alignment`` uses ``|`` for matches, ``.`` for mismatches and ``-``
        for indels, ``columns`` is the number of alignment columns and
        ``consumed`` the number of ``seq`` characters covered by the walk.
    """
    target = deque()
    query = deque()
    alignment = deque()

    motif_size = len(motif)
    i, j = len(seq), stab_l
    seq_pos = len(seq) - 1
    motif_pos = motif_size - 1
    length = 0

    while align_path[i][j] != -1:
        prev_i, prev_j, kind = align_path[i][j]
        length += 1

        if kind == 0:
            symbol = '.'
        elif kind == 1:
            symbol = '|'
        else:
            symbol = '-'

        # A diagonal step consumes one base on both sides.  An indel that stays
        # in the same column consumes only a query base (insertion); otherwise
        # it consumes only a target base (deletion).
        is_diagonal = symbol != '-'
        if is_diagonal or prev_j == j:
            query.appendleft(seq[seq_pos])
            seq_pos -= 1
        else:
            query.appendleft('-')

        if is_diagonal or prev_j != j:
            target.appendleft(motif[motif_pos])
            motif_pos = (motif_pos - 1) % motif_size  # wrap to the motif end
        else:
            target.appendleft('-')

        alignment.appendleft(symbol)
        i, j = prev_i, prev_j

    return [''.join(target), ''.join(alignment), ''.join(query), stab_l, length, len(seq) - 1 - seq_pos]
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _indel_stats(target_len, query_len, diagonals, score, mat, mis, ope):
    """Derive ``(indels, inserts, deletes, mismatches)`` of one alignment end point."""
    indel_num = target_len + query_len - 2 * diagonals
    # score = matches * mat + mismatches * mis + indels * ope, with
    # matches + mismatches = diagonals; solve for the mismatch count.
    mismatch_num = (score - indel_num * ope - diagonals * mat) / (mis - mat)
    insert_num = query_len - diagonals
    return indel_num, insert_num, indel_num - insert_num, mismatch_num


def _select_fewest_indels(candidates, stats_for, indel_cap):
    """Pick the first candidate with the fewest indels below ``indel_cap``.

    Returns ``(target_length, inserts, deletes, mismatches)``.  When no
    candidate is below the cap the first candidate is kept and its real
    statistics are reported, so the counts always describe the alignment that
    is formatted afterwards.
    """
    chosen = candidates[0]
    chosen_stats = None
    fewest = indel_cap
    for c_l in candidates:
        stats = stats_for(c_l)
        if stats[0] < fewest:
            fewest = stats[0]
            chosen, chosen_stats = c_l, stats
    if chosen_stats is None:
        chosen_stats = stats_for(chosen)
    return chosen, chosen_stats[1], chosen_stats[2], chosen_stats[3]


def global_motif_multiple_align(seq, motif, logo, indel, mat, mis, ope, ext, marks):
    """Align one sequence segment against tandem copies of ``motif``.

    The reference is built from enough motif copies to cover
    ``len(seq) * (1 + indel + 0.005)`` bases.  Among all end columns with the
    best score, the one with the fewest indels is selected.

    Args:
        seq: segment to align.
        motif: repeat unit.
        logo: segment position. ``1`` = leading segment (local alignment that
            must end on a whole motif copy), ``2`` = trailing segment (global
            alignment, any end column, possibly leaving a query suffix
            unaligned), anything else = interior segment (global alignment
            ending on a motif boundary).
        indel: tolerated indel rate, only used to size the reference.
        mat: match score.
        mis: mismatch score.
        ope: per-base gap score handed to the aligners.
        ext: unused; kept for interface compatibility.
        marks: probe length; ``marks % len(motif)`` is the motif phase at which
            the reference of interior and trailing segments starts.

    Returns:
        For ``logo`` 1 or 2: ``(insert_num, delete_num, mismatch,
        format_align, unaligned)`` where ``unaligned`` is the number of query
        bases left out at the start (``logo == 1``) or end (``logo == 2``), and
        ``format_align`` is ``''`` when nothing could be aligned.
        Otherwise: ``(insert_num, delete_num, mismatch, format_align)``.
        ``format_align`` is the list produced by ``format_alignment``;
        ``mismatch`` comes from a division and may be a float.
    """
    real_seq_length = len(seq)
    standard_seq_length = math.ceil(real_seq_length * (1 + indel + 0.005))
    n = len(motif)
    indel_cap = real_seq_length + 1

    # Leading segment: whole motif copies, alignment may skip a query prefix.
    if logo == 1:
        standard_seq = motif + motif * (standard_seq_length // n)
        align_score, diagonal_num, align_path, max_score = motif_constrained_local_align(
            standard_seq, seq, mat, mis, ope, n)
        candidate_length = [tr_l for tr_l, a_s in enumerate(align_score)
                            if tr_l % n == 0 and a_s == max_score]

        def head_stats(c_l):
            min_up_left = min(diagonal_num[c_l], key=lambda x: x[0])
            # min_up_left[1] is the query row where the local path starts.
            covered = real_seq_length - min_up_left[1] + 1
            return _indel_stats(c_l, covered, min_up_left[0], align_score[c_l], mat, mis, ope)

        stab_l, insert_num, delete_num, mismatch = _select_fewest_indels(
            candidate_length, head_stats, indel_cap)
        if max_score == 0 and align_path[len(seq)][stab_l] == -1:
            return 0, 0, 0, '', len(seq)
        format_align = format_alignment(align_path, seq, motif, stab_l)
        left = real_seq_length - format_align[5]
        return insert_num, delete_num, mismatch, format_align, left

    standard_seq = motif[marks % n:] + motif * (standard_seq_length // n)

    # Trailing segment: no whole-motif constraint, take the best scoring cell
    # of the lowest possible row (i.e. covering as much of the query as possible).
    if logo == 2:
        align_score, diagonal_num, align_path = realtime_path_tracking_alignment(
            standard_seq, seq, mat, mis, ope, 1)
        max_score = np.max(np.array(align_score))
        if max_score < 0:
            return 0, 0, 0, '', len(seq)
        candidate_length = []
        row = 0
        for ro in range(len(seq), 0, -1):
            candidate_length = [tr_l for tr_l, a_s in enumerate(align_score[ro]) if a_s == max_score]
            if candidate_length:
                row = ro
                break
        if not candidate_length:
            return 0, 0, 0, '', len(seq)

        def tail_stats(c_l):
            return _indel_stats(c_l, row, min(diagonal_num[row][c_l]), align_score[row][c_l],
                                mat, mis, ope)

        stab_l, insert_num, delete_num, mismatch = _select_fewest_indels(
            candidate_length, tail_stats, indel_cap)
        format_align = format_alignment(align_path, seq[:row], motif, stab_l)
        right = real_seq_length - row
        return insert_num, delete_num, mismatch, format_align, right

    # Interior segment: the alignment has to end on a motif boundary.
    align_score, diagonal_num, align_path = realtime_path_tracking_alignment(
        standard_seq, seq, mat, mis, ope)
    phase = marks % n
    boundary_columns = [tr_l for tr_l in range(len(align_score)) if (tr_l - phase) % n == 0]
    top_score = max(align_score)
    candidate_length = [tr_l for tr_l in boundary_columns if align_score[tr_l] == top_score]
    if not candidate_length:
        # The overall best score is off-boundary: fall back to the best score
        # found on a boundary column.
        best_on_boundary = max(align_score[tr_l] for tr_l in boundary_columns)
        candidate_length = [tr_l for tr_l in boundary_columns
                            if align_score[tr_l] == best_on_boundary]

    def body_stats(c_l):
        return _indel_stats(c_l, real_seq_length, min(diagonal_num[c_l]), align_score[c_l],
                            mat, mis, ope)

    stab_l, insert_num, delete_num, mismatch = _select_fewest_indels(
        candidate_length, body_stats, indel_cap)
    format_align = format_alignment(align_path, seq, motif, stab_l)
    return insert_num, delete_num, mismatch, format_align
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def segmented_global_align_algorithm(sequence, motif, indel, mat, mis, ope, ext, beta=0.045):
    """Align a candidate tandem repeat to its motif, segment by segment.

    Exact occurrences of a probe (the motif, or 2.5 / 2 copies of it for
    motifs of length 2 / 3) split ``sequence`` into segments.  Isolated probes
    are copied verbatim into the alignment, the stretches between them are
    aligned with ``global_motif_multiple_align``, and runs of mutually
    overlapping probes ("chaos" runs) are aligned as one block.

    The candidate is rejected when the sequence or the motif is empty, when
    probes cover less than ``beta`` of the sequence, when fewer than two
    probes are found (no spacing can be measured), or when the most frequent
    distance between consecutive probes differs from the motif length.

    Args:
        sequence: candidate repeat region (converted with ``str``).
        motif: repeat unit.
        indel: tolerated indel rate, forwarded to the segment aligner.
        mat: match score.
        mis: mismatch score; also forwarded as the gap score of the segment
            aligner.
        ope: gap score used in the returned total score only.
            NOTE(review): segments are aligned with ``mis`` as gap score
            while the total uses ``ope`` - confirm this is intended.
        ext: unused; kept for interface compatibility.
        beta: minimum fraction of the sequence that must be covered by probes.

    Returns:
        ``(insert, delete, mismatch, align, left, right, score, copies)`` or a
        tuple of eight ``None`` when the candidate is rejected.  ``align`` is
        a three-line text block (target / match line / query), ``left`` and
        ``right`` are the numbers of unaligned bases at both ends and
        ``copies`` is the number of target bases divided by the motif length.
    """
    rejected = (None, None, None, None, None, None, None, None)
    sequence = str(sequence)
    if not sequence or not motif:
        return rejected

    n = len(motif)
    marks = n
    if n == 2:
        marks = 5
    elif n == 3:
        marks = 6
    partial = marks % n  # bases of the incomplete motif copy closing the probe
    probe = str(marks // n * motif + motif[:partial])
    # Look-ahead so that overlapping probe occurrences are all reported.
    pattern = re.compile('(?={})'.format(re.escape(str(probe))))
    com_match = [match.start() for match in pattern.finditer(sequence)]

    if len(com_match) * len(probe) / len(sequence) < beta:
        return rejected

    distances = [com_match[i + 1] - com_match[i] for i in range(len(com_match) - 1)]
    if not distances:
        # A single probe gives no spacing; idxmax() on an empty Series raises.
        return rejected
    most_common_dis = pd.Series(distances).value_counts().idxmax()
    if most_common_dis != n:
        return rejected

    insert_total = 0
    delete_total = 0
    mismatch_total = 0
    target_parts = []
    align_parts = []
    query_parts = []
    target_length = 0
    align_length = 0
    pre_end = 0            # end of the last processed probe
    chaos_seq_start = -1   # start of the current run of overlapping probes
    left = 0
    probe_bars = '|' * marks

    def emit(target, bars, query, target_cols, align_cols):
        """Append one alignment piece and update the running column counts."""
        nonlocal target_length, align_length
        target_parts.append(target)
        align_parts.append(bars)
        query_parts.append(query)
        target_length += target_cols
        align_length += align_cols

    def emit_alignment(format_align):
        emit(format_align[0], format_align[1], format_align[2], format_align[3], format_align[4])

    def emit_probe_if_isolated(index):
        """Copy the probe verbatim unless the next probe overlaps it."""
        if index + 1 == len(com_match) or com_match[index + 1] >= pre_end:
            emit(probe, probe_bars, probe, marks, marks)

    for index, cm in enumerate(com_match):
        # Probe directly follows the previous one, no chaos run open.
        if cm - pre_end == 0 and chaos_seq_start < 0:
            pre_end = pre_end + marks
            if cm > 0 and partial > 0:
                # Back-to-back probes skip the rest of the motif copy that the
                # previous probe left unfinished: record it as a deletion.
                missing = n - partial
                delete_total += missing
                emit(motif[partial:], '-' * missing, '-' * missing, missing, missing)
            emit_probe_if_isolated(index)
            continue

        # Gap between the previous probe and this one, no chaos run open.
        if cm - pre_end > 0 and chaos_seq_start < 0:
            if pre_end == 0:
                # The leading segment is handled separately.
                insert_num, delete_num, mismatch_num, format_align, left = global_motif_multiple_align(
                    sequence[pre_end:cm], motif, 1, indel, mat, mis, mis, mis, marks)
            else:
                insert_num, delete_num, mismatch_num, format_align = global_motif_multiple_align(
                    sequence[pre_end:cm], motif, 0, indel, mat, mis, mis, mis, marks)
            insert_total += insert_num
            delete_total += delete_num
            mismatch_total += mismatch_num
            pre_end = cm + marks
            if format_align != '':
                emit_alignment(format_align)
            emit_probe_if_isolated(index)
            continue

        # This probe overlaps the previous one: a chaos run starts there.
        if chaos_seq_start == -1:
            chaos_seq_start = pre_end - marks
            pre_end = cm + marks
            continue

        # Still inside the chaos run.
        if chaos_seq_start >= 0 and cm - pre_end < 0:
            pre_end = cm + marks
            continue

        # End of the chaos run.  For motifs of length 2 and 3 the overlapping
        # probes form one long exact repeat, so the run itself is copied
        # verbatim and only the stretch after it is aligned.
        if n in [2, 3]:
            run = sequence[chaos_seq_start:pre_end]
            emit(run, '|' * len(run), run, len(run), len(run))
            chaos_seq_start = pre_end
        insert_num, delete_num, mismatch_num, format_align = global_motif_multiple_align(
            sequence[chaos_seq_start:cm], motif, 0, indel, mat, mis, mis, mis, marks)
        insert_total += insert_num
        delete_total += delete_num
        mismatch_total += mismatch_num
        chaos_seq_start = -1
        pre_end = cm + marks
        emit_alignment(format_align)
        # The probe is copied now unless it opens the next chaos run, in which
        # case it is aligned together with that run.
        emit_probe_if_isolated(index)

    # Remaining sequence: either the tail after the last probe or an
    # unfinished chaos run.
    right = 0
    tail_start = None
    if chaos_seq_start == -1 and pre_end < len(sequence):
        tail_start = pre_end
    elif chaos_seq_start >= 0:
        tail_start = chaos_seq_start
    if tail_start is not None:
        insert_num, delete_num, mismatch_num, format_align, right = global_motif_multiple_align(
            sequence[tail_start:], motif, 2, indel, mat, mis, mis, mis, marks)
        if format_align != '':
            insert_total += insert_num
            delete_total += delete_num
            mismatch_total += mismatch_num
            emit_alignment(format_align)

    score = ((insert_total + delete_total) * ope + mismatch_total * mis
             + ((len(sequence) - right) - mismatch_total - insert_total) * mat)
    target_text = ''.join(target_parts)
    align_text = ''.join(align_parts)
    query_text = ''.join(query_parts)
    align = 'target 0 ' + target_text + ' ' + str(
        target_length) + '\n' + ' 0 ' + align_text + ' ' + str(
        align_length) + '\n' + 'query 0 ' + query_text + ' ' + str(len(sequence) - left - right)
    copies = (len(target_text) - target_text.count('-')) / len(motif)
    return insert_total, delete_total, mismatch_total, align, left, right, score, copies
|
|
536
|
+
|
|
537
|
+
# tr = 'TAAAATAAACAAAATAAAATAAATTAAACAATTAAATTAAATAAAATAAATTAAA'
|
|
538
|
+
# motif = 'TAAAA'
|
|
539
|
+
# Insert, Delete, Mismatch, align, _, right, _, copy = segmented_global_align_algorithm(tr, motif, 0.15, 2, -5, -7, -3)
|
|
540
|
+
# print(align)
|
|
541
|
+
# print(copy)
|
faststr/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from . import utils
|
|
2
|
+
from . import scan_subread
|
|
3
|
+
from . import trs_align
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_subread_trs(sub_read, m, r, g, e, p_indel, p_match, score, beta):
    """Detect the tandem repeats of one sub-read.

    Pipeline: collect anchors, cluster them into fuzzy repeat intervals,
    filter those into candidate repeats, then align the candidates and keep
    the ones that qualify.

    Args:
        sub_read: the sub-read to scan.
        m, r, g, e: forwarded unchanged to ``trs_align.trs_align_algorithm``
            (presumably match / mismatch / gap-open / gap-extend scores -
            confirm against that function).
        p_indel: forwarded to anchor clustering and to the aligner.
        p_match: forwarded to anchor clustering and to the aligner.
        score: forwarded to the aligner.
        beta: forwarded to the aligner.

    Returns:
        The qualified repeats of all groups, sorted by element ``[1]`` of each
        record (presumably the start position - confirm against the aligner).
    """
    # Anchors and their marks, grouped by the keys of anchor_dict
    # (the tables below are pre-seeded with keys 1..8).
    anchor_dict, anchor_mark_dict = utils.get_anchors(sub_read)

    # Fuzzy repeat intervals per group.
    fuzzy_repeats_dict = {i: [] for i in range(1, 9)}
    for n, n_anchors in anchor_dict.items():
        fuzzy_repeats_dict[n] = scan_subread.cluster_anchors(list(n_anchors), anchor_mark_dict[n], p_indel, p_match)
    del anchor_dict  # release the anchors before the next stage allocates

    # Motif and approximate alignment interval of every fuzzy repeat.
    candidate_trs_dict = {i: [] for i in range(1, 9)}
    for n, n_fuzzy_repeats in fuzzy_repeats_dict.items():
        candidate_trs_dict[n] = scan_subread.filter_chaotic_repeats(n_fuzzy_repeats, sub_read, n)
    del fuzzy_repeats_dict

    # Align the candidates of every group and collect the qualified repeats.
    qualified_trs_list = []
    for n_candidate_trs in candidate_trs_dict.values():
        qualified_trs_list.extend(
            trs_align.trs_align_algorithm(n_candidate_trs, sub_read, p_indel, p_match, m, r, g, e,
                                          score, beta))
    del candidate_trs_dict
    return sorted(qualified_trs_list, key=lambda x: x[1])
|