FastSTR 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faststr/Segmented_Global_Alignment.py +541 -0
- faststr/__init__.py +2 -0
- faststr/get_subread_trs.py +28 -0
- faststr/main.py +338 -0
- faststr/make_consensus_TRs.py +367 -0
- faststr/scan_subread.py +326 -0
- faststr/trs_align.py +704 -0
- faststr/utils.py +428 -0
- faststr-1.0.0.dist-info/METADATA +220 -0
- faststr-1.0.0.dist-info/RECORD +13 -0
- faststr-1.0.0.dist-info/WHEEL +5 -0
- faststr-1.0.0.dist-info/entry_points.txt +2 -0
- faststr-1.0.0.dist-info/top_level.txt +1 -0
faststr/trs_align.py
ADDED
|
@@ -0,0 +1,704 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from operator import itemgetter
|
|
3
|
+
import numpy as np
|
|
4
|
+
import regex
|
|
5
|
+
from Bio.Align import PairwiseAligner
|
|
6
|
+
|
|
7
|
+
from . import utils
|
|
8
|
+
from . import Segmented_Global_Alignment
|
|
9
|
+
from . import scan_subread
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def can_merge_to_onetr(seq1, seq2, motif1, motif2, end, start, interrupted_seq):
    """Decide whether two neighbouring candidate repeats can be merged.

    Args:
        seq1: Sequence of the repeat (or repeat group) on the left.
        seq2: Sequence of the candidate repeat on the right.
        motif1: Motif of the left repeat.
        motif2: Motif of the right repeat.
        end: Last (inclusive) position of the left repeat.
        start: First position of the right repeat.
        interrupted_seq: Sequence lying between the two repeats. It is not
            used by the decision; the parameter is kept so existing callers
            keep working.

    Returns:
        The motif to use for the merged repeat, or ``''`` when the two
        repeats must stay separate.
    """
    # Both merge routes below require the repeats to be at most 100 bases
    # apart and not overlapping, so test that first and skip the (possibly
    # expensive) motif comparison when it cannot matter.
    if not 0 <= start - end - 1 <= 100:
        return ''

    joint_motif = judge_motif_sim(motif1, motif2, seq1, seq2)
    if joint_motif != '':
        return joint_motif

    if len(motif1) < 7:
        # Fuzzy match: motif1 occurs in some rotation of motif2 with at most
        # one substitution.
        pattern = f"({motif1}){{s<=1}}"
        if any(regex.finditer(pattern, str(motif2 + motif2))):
            return can_standardize_motif(seq1, seq2, motif1)
    return ''
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _trs_fit_motif(sub_read, start, end, motif, p_indel, mat, mis, gap, ext, beta):
    """Align ``sub_read[start:end + 1]`` against a perfect repeat of ``motif``.

    Regions longer than 6000 bases are delegated to the segmented global
    aligner; shorter ones are locally aligned with Biopython.

    Returns:
        ``(length, indel_rio, match_rio, score, align, copy, left, right)``
        where ``left``/``right`` are the numbers of unaligned bases at either
        end of the region, or ``None`` when no alignment could be obtained.
    """
    span = end - start + 1
    region = sub_read[start:end + 1]
    if span > 6000:
        insert, delete, mismatch, align, left, right, score, copy = \
            Segmented_Global_Alignment.segmented_global_align_algorithm(
                region, motif, p_indel, mat, mis, gap, ext, beta)
        if insert is None:
            return None
        length = span - right - left
    else:
        # The synthetic repeat is slightly longer than the region so that
        # deletions in the read still find a partner.
        target_len = round(span * (1 + p_indel + 0.001))
        target = target_len // len(motif) * motif + motif[:target_len % len(motif)]

        aligner = PairwiseAligner()
        aligner.mode = 'local'
        aligner.match_score = mat
        aligner.mismatch_score = mis
        aligner.open_gap_score = gap
        aligner.extend_gap_score = ext
        try:
            alignments = aligner.align(target, region)
            if not alignments:
                return None
            best = alignments[0]
            counts = best.counts()
            query_start = best.coordinates[1][0]
            query_end = best.coordinates[1][-1]
            # Alignment columns that consume no read base are deletions;
            # the remaining gap columns are insertions.
            delete = best.length - query_end + query_start
            insert = counts[0] - delete
            mismatch = counts[2]
            length = best.length
            score = best.score
            align = best.format()
            # Copy number = aligned span on the synthetic repeat / motif size.
            copy = (best.coordinates[0][-1] - best.coordinates[0][0]) / len(motif)
            left = query_start
            right = span - query_end
        except OverflowError:
            # NOTE(review): presumably raised by Biopython when the number of
            # optimal alignments is too large to count -- confirm.
            return None

    indel_rio = (insert + delete) / (length + delete)
    match_rio = (length - mismatch - insert) / (length - insert)
    return length, indel_rio, match_rio, score, align, copy, left, right


def _trs_group_candidates(n_candidate_trs, sub_read, p_indel, p_match, mat, mis, gap, ext,
                          align_score, beta):
    """Walk the start-sorted candidates and group those that can be merged.

    Returns:
        ``(after_merge_trs, qualified_trs)``: regions that still need to be
        scored, and repeats already accepted by
        ``calculate_character_distance`` while resolving merged groups.
    """
    after_merge_trs = []
    qualified_trs = []
    maybe_merge_list = [n_candidate_trs[0]]
    consensus_motif = ''
    total = len(n_candidate_trs)

    def merge_motif(candidate):
        # Motif shared by the current group and ``candidate`` ('' if none).
        group_end = maybe_merge_list[-1][2]
        return can_merge_to_onetr(
            sub_read[maybe_merge_list[0][1]:group_end + 1],
            sub_read[candidate[1]:candidate[2] + 1],
            maybe_merge_list[-1][0], candidate[0],
            group_end, candidate[1],
            sub_read[group_end + 1:max(group_end + 1, candidate[1])])

    def flush_group():
        # Hand the current group over: a single candidate goes on unchanged,
        # a merged group is resolved by calculate_character_distance.
        if len(maybe_merge_list) == 1:
            after_merge_trs.append(maybe_merge_list[0])
        else:
            final_merge_trs, early_qualified_trs = calculate_character_distance(
                maybe_merge_list, sub_read, consensus_motif, p_indel, p_match,
                mat, mis, gap, ext, align_score, beta)
            after_merge_trs.extend(final_merge_trs)
            qualified_trs.extend(early_qualified_trs)

    for index, c_t in enumerate(n_candidate_trs[1:]):
        if maybe_merge_list[-1] == c_t:
            continue
        merged = merge_motif(c_t)
        if merged != '':
            maybe_merge_list.append(c_t)
            consensus_motif = merged
            continue

        if index + 1 == total - 1:
            # c_t is the last candidate: close the group and start a new one.
            flush_group()
            maybe_merge_list = [c_t]
            consensus_motif = ''
            break

        i = 2
        while index + i < total:
            later = n_candidate_trs[index + i]
            if maybe_merge_list[-1][2] < c_t[1]:
                # Right-side overlap: c_t starts after the group ends.
                if later[1] > c_t[2]:
                    # None of the candidates overlapping c_t could be merged
                    # with the group.
                    flush_group()
                    maybe_merge_list = [c_t]
                    consensus_motif = ''
                    break
                merged = merge_motif(later)
                if merged != '':
                    # A candidate overlapping c_t merges with the group.
                    maybe_merge_list.append(later)
                    consensus_motif = merged
                    break
                # Merge failed but we are still inside the overlap.
                i += 1
            else:
                # Left-side overlap: c_t overlaps the group itself.
                if later[1] < c_t[2]:
                    # Still inside the overlap, keep walking.
                    i += 1
                    continue
                merged = merge_motif(later)
                if merged != '':
                    # First candidate past the overlap merges with the group.
                    maybe_merge_list.append(later)
                    consensus_motif = merged
                    break
                # First candidate past the overlap does not merge either.
                flush_group()
                maybe_merge_list = [c_t]
                consensus_motif = ''
                break

    flush_group()
    return after_merge_trs, qualified_trs


def trs_align_algorithm(n_candidate_trs, sub_read, p_indel, p_match, mat, mis, gap, ext, align_score, beta):
    """Merge candidate tandem repeats and keep those passing alignment checks.

    Args:
        n_candidate_trs: List of candidates; element ``[0]`` is the motif and
            ``[1]``/``[2]`` are the inclusive start/end positions in
            ``sub_read``. The list is sorted in place by start position.
        sub_read: The read sequence the positions refer to.
        p_indel: Maximum accepted indel ratio.
        p_match: Minimum accepted match ratio.
        mat, mis, gap, ext: Match, mismatch, gap-open and gap-extend scores.
        align_score: Minimum accepted alignment score.
        beta: Passed through to the segmented global aligner.

    Returns:
        List of tuples ``(motif, start, end, indel_rio, match_rio, score,
        align, copy)`` for every accepted repeat.

    Note:
        The copy number of repeats accepted through an alternative motif is
        now computed from the aligned span (end - start) like every other
        path; it previously used the end coordinate only.
    """
    if not n_candidate_trs:
        return []
    n_candidate_trs.sort(key=itemgetter(1))
    after_merge_trs, qualified_trs = _trs_group_candidates(
        n_candidate_trs, sub_read, p_indel, p_match, mat, mis, gap, ext, align_score, beta)

    # after_merge_trs doubles as a work queue: long unaligned flanks found
    # below are appended to it and scored later in this same loop.
    for pos, a_m_t in enumerate(after_merge_trs):
        motif, start, end = a_m_t[0], a_m_t[1], a_m_t[2]
        if pos > 0:
            previous = after_merge_trs[pos - 1]
            # Skip a region nested in the previous one with an equivalent
            # (rotated) motif.
            if start >= previous[1] and end <= previous[2] and previous[0] in motif + motif:
                continue

        fit = _trs_fit_motif(sub_read, start, end, motif, p_indel, mat, mis, gap, ext, beta)
        if fit is None:
            continue
        length, indel_rio, match_rio, score, align, copy, left, right = fit

        if length >= 25 and indel_rio <= p_indel and match_rio >= p_match and score >= align_score:
            qualified_trs.append(
                (motif, start + left, end - right, indel_rio, match_rio, score, align, copy))

            # Unaligned flanks of 100+ bases may hide another repeat.
            if 100 <= left:
                left_candidate = scan_subread.filter_chaotic_repeats(
                    [(start, start + left, start, start + left, 1)], sub_read, len(motif))
                if left_candidate != []:
                    after_merge_trs.append(left_candidate[0])

            if right >= 100:
                right_candidate = scan_subread.filter_chaotic_repeats(
                    [(end - right + 1, end, end - right + 1, end, 1)], sub_read, len(motif))
                if right_candidate != []:
                    after_merge_trs.append(right_candidate[0])
        elif length >= 25 and try_change_motif(score, indel_rio, match_rio, align_score, p_indel, p_match):
            # Near miss: retry with the motifs proposed by the tri-gram model.
            motif_p = utils.tri_gram_model(sub_read[start:end + 1], len(motif))
            if motif_p is None:
                continue
            for m_p in motif_p:
                if motif in m_p + m_p:
                    # Same motif up to rotation, already tried.
                    continue
                alt = _trs_fit_motif(sub_read, start, end, m_p, p_indel, mat, mis, gap, ext, beta)
                if alt is None:
                    continue
                length_p, indel_rio_p, match_rio_p, score_p, align_p, copy_p, left_p, right_p = alt
                if length_p >= 25 and indel_rio_p <= p_indel and match_rio_p >= p_match \
                        and score_p >= align_score:
                    qualified_trs.append((m_p, start + left_p, end - right_p, indel_rio_p,
                                          match_rio_p, score_p, align_p, copy_p))
    return qualified_trs
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def can_shorten_distance(seq, motif):
    """Check whether some 20-base window of ``seq`` is densely covered by ``motif``.

    Positions covered by the marks returned from ``utils.get_motif_marks``
    count fully. For motif lengths 2-3 (one substitution allowed) and lengths
    other than 1-7 (three substitutions allowed), positions covered only by a
    fuzzy match count as ``(len(motif) - k) / len(motif)``.

    Coverage is accumulated in integer units of ``1 / len(motif)``. The
    sliding-window sum is therefore exact; accumulating the fractions as
    floats and flooring them could lose a unit to rounding drift and wrongly
    miss the threshold.

    Args:
        seq: Sequence to scan.
        motif: Motif whose coverage is measured.

    Returns:
        True when the best window reaches the per-motif-length threshold,
        False otherwise (also for an empty motif).

    Raises:
        IndexError: if ``len(motif)`` exceeds 8, the size of the threshold
            table.
    """
    max_windows = [16, 16, 16, 16, 15, 15, 14, 14]
    motif_len = len(motif)
    if motif_len == 0:
        # Nothing can be covered by an empty motif.
        return False

    # One entry per base, in units of 1/motif_len (motif_len == fully covered).
    visit = [0] * len(seq)
    for mark in utils.get_motif_marks(seq, motif):
        visit[mark:mark + motif_len] = [motif_len] * motif_len

    if motif_len in [1, 4, 5, 6, 7]:
        allowed_subs = 0
    elif motif_len in [2, 3]:
        allowed_subs = 1
    else:
        allowed_subs = 3

    if allowed_subs:
        pattern = f"({motif}){{s<={allowed_subs}}}"
        partial = motif_len - allowed_subs
        for match in regex.finditer(pattern, str(seq)):
            first = match.start()
            for pos in range(first, first + motif_len):
                # Exact marks keep full weight; uncovered bases get the
                # partial weight of a fuzzy hit.
                if visit[pos] == 0:
                    visit[pos] = partial

    window = sum(visit[:20])
    max_ones_count = window // motif_len
    for i in range(1, len(visit) - 19):
        # Slide the 20-base window one position to the right.
        window += visit[i + 19] - visit[i - 1]
        covered = window // motif_len
        if covered > max_ones_count:
            max_ones_count = covered

    return max_ones_count >= max_windows[motif_len - 1]
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def can_standardize_motif(seq1, seq2, motif1):
    """Try to find one motif that describes both ``seq1`` and ``seq2``.

    The tri-gram model proposes a consensus motif for the concatenated
    sequences. The sequence to verify is ``seq2`` when the consensus is a
    rotation of ``motif1``, otherwise ``seq1``. It must be densely covered
    by the consensus and contain a well-matching 20-base window near both
    its start and its end.

    Args:
        seq1: Sequence of the left repeat.
        seq2: Sequence of the right repeat.
        motif1: Motif of the left repeat; only its length and rotations are
            used.

    Returns:
        The consensus motif, or ``''`` when the sequences cannot be
        described by a common motif.
    """
    motifs = utils.tri_gram_model(seq1 + seq2, len(motif1))
    if motifs is None:
        return ''
    consensus = motifs[0]

    # Minimum probe alignment scores, indexed by motif length - 1.
    start_probe = [24, 24, 24, 21, 24, 24, 24, 22]
    end_probe = [24, 24, 24, 22, 23, 23, 23, 22]

    to_align_seq = seq2 if consensus in motif1 + motif1 else seq1
    if not can_shorten_distance(to_align_seq, consensus):
        return ''

    # 25-base perfect repeat used as the probe.
    probe = 25 // len(consensus) * consensus + consensus[:25 % len(consensus)]
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -5
    seq_len = len(to_align_seq)

    # Scan 20-base windows from the left in steps of 5.
    start_index = -1
    for i in range(seq_len // 5 - 3):
        probe_align = aligner.align(probe, to_align_seq[i * 5:i * 5 + 20])
        if probe_align and probe_align[0].score >= start_probe[len(consensus) - 1]:
            start_index = max(i * 5 - 5, 0)
            break
    if start_index == -1:
        return ''

    # Scan 20-base windows from the right in steps of 5.
    end_index = -1
    for j in range((seq_len - start_index - 1) // 5 - 3):
        window_end = seq_len - j * 5
        probe_align = aligner.align(probe, to_align_seq[window_end - 20:window_end])
        if probe_align and probe_align[0].score >= end_probe[len(consensus) - 1]:
            end_index = min(window_end + 4, seq_len - 1)
            break
    if end_index == -1:
        return ''

    # The region bracketed by the two hits must span at least 25 bases.
    if end_index - start_index < 24:
        return ''
    return consensus
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def judge_motif_sim(m1, m2, s1, s2):
    """Return a motif shared by two repeats, or ``''`` if they are dissimilar.

    Args:
        m1: Motif of the first repeat.
        m2: Motif of the second repeat.
        s1: Sequence of the first repeat.
        s2: Sequence of the second repeat.

    Returns:
        ``m1`` when it is shorter than 7 bases and occurs exactly in the
        doubled ``m2`` (i.e. in some rotation of it). Otherwise, when ``m1``
        matches the doubled ``m2`` with at most one substitution, the first
        motif proposed by the tri-gram model for ``s1 + s2``. ``''`` in all
        other cases.
    """
    doubled = m2 + m2
    if len(m1) < 7 and m1 in doubled:
        return m1

    pattern = f"({m1}){{s<=1}}"
    if not any(regex.finditer(pattern, str(doubled))):
        return ''

    motifs = utils.tri_gram_model(s1 + s2, len(m1))
    if motifs is None:
        return ''
    return motifs[0]
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def try_change_motif(score, indel, match, align_score, p_indel, p_match):
    """Tell whether a failed alignment is worth retrying with another motif.

    A retry is refused when exactly one criterion fails by a clear margin
    while the other two are satisfied:

    * the score is more than 5 below ``align_score``,
    * the indel ratio exceeds ``p_indel`` by more than 0.025,
    * the match ratio is more than 0.025 below ``p_match``.

    Returns:
        False in the three cases above, True otherwise.
    """
    score_ok = score >= align_score
    indel_ok = indel <= p_indel
    match_ok = match >= p_match

    score_far_off = align_score - score > 5 and indel_ok and match_ok
    indel_far_off = score_ok and indel - p_indel > 0.025 and match_ok
    match_far_off = score_ok and indel_ok and p_match - match > 0.025

    return not (score_far_off or indel_far_off or match_far_off)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _ccd_local_align(target, query, motif_len, mat, mis, gap, ext):
    """Locally align ``query`` against ``target`` and summarise the best hit.

    Returns:
        ``(insert, delete, mismatch, length, score, align, copy, query_start,
        query_end)`` or ``None`` when no alignment is available.
    """
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = mat
    aligner.mismatch_score = mis
    aligner.open_gap_score = gap
    aligner.extend_gap_score = ext
    try:
        alignments = aligner.align(target, query)
        if not alignments:
            return None
        best = alignments[0]
        counts = best.counts()
        query_start = best.coordinates[1][0]
        query_end = best.coordinates[1][-1]
        # Alignment columns that consume no query base are deletions; the
        # remaining gap columns are insertions.
        delete = best.length - query_end + query_start
        insert = counts[0] - delete
        mismatch = counts[2]
        # Copy number = aligned span on the synthetic repeat / motif size.
        copy = (best.coordinates[0][-1] - best.coordinates[0][0]) / motif_len
        return (insert, delete, mismatch, best.length, best.score, best.format(),
                copy, query_start, query_end)
    except OverflowError:
        # NOTE(review): presumably raised by Biopython when the number of
        # optimal alignments is too large to count -- confirm.
        return None


def _ccd_fit_alternative_motif(sub_read, start, end, motif, p_indel, mat, mis, gap, ext, beta):
    """Align ``sub_read[start:end + 1]`` against a perfect repeat of ``motif``.

    Returns:
        ``(length, indel_rio, match_rio, score, align, copy, left, right)``
        or ``None`` when no alignment could be obtained.
    """
    span = end - start + 1
    region = sub_read[start:end + 1]
    if span > 6000:
        insert, delete, mismatch, align, left, right, score, copy = \
            Segmented_Global_Alignment.segmented_global_align_algorithm(
                region, motif, p_indel, mat, mis, gap, ext, beta)
        if insert is None:
            return None
        length = span - right - left
    else:
        target_len = round(span * (1 + p_indel + 0.001))
        target = target_len // len(motif) * motif + motif[:target_len % len(motif)]
        aligned = _ccd_local_align(target, region, len(motif), mat, mis, gap, ext)
        if aligned is None:
            return None
        insert, delete, mismatch, length, score, align, copy, left, query_end = aligned
        right = span - query_end
    indel_rio = (insert + delete) / (length + delete)
    match_rio = (length - mismatch - insert) / (length - insert)
    return length, indel_rio, match_rio, score, align, copy, left, right


def _ccd_qualify_or_retry(m_m_p, sub_read, fit, early_qualified_trs, p_indel, p_match,
                          mat, mis, gap, ext, align_score, beta):
    """Accept a region on its own motif, or retry it with alternative motifs.

    Accepted repeats are appended to ``early_qualified_trs``.

    Returns:
        False only when a retry was attempted and the tri-gram model
        returned no motifs; True otherwise.
    """
    length, indel_rio, match_rio, score, align, copy, left, right = fit
    motif, start, end = m_m_p[0], m_m_p[1], m_m_p[2]

    if length >= 25 and indel_rio <= p_indel and match_rio >= p_match and score >= align_score:
        early_qualified_trs.append(
            (motif, start + left, end - right, indel_rio, match_rio, score, align, copy))
        return True

    if length >= 25 and try_change_motif(score, indel_rio, match_rio, align_score, p_indel, p_match):
        motif_p = utils.tri_gram_model(sub_read[start:end + 1], len(motif))
        if motif_p is None:
            return False
        for m_p in motif_p:
            if motif in m_p + m_p:
                # Same motif up to rotation, already tried.
                continue
            alt = _ccd_fit_alternative_motif(sub_read, start, end, m_p, p_indel,
                                             mat, mis, gap, ext, beta)
            if alt is None:
                continue
            length_p, indel_rio_p, match_rio_p, score_p, align_p, copy_p, left_p, right_p = alt
            if length_p >= 25 and indel_rio_p <= p_indel and match_rio_p >= p_match \
                    and score_p >= align_score:
                early_qualified_trs.append(
                    (m_p, start + left_p, end - right_p, indel_rio_p, match_rio_p,
                     score_p, align_p, copy_p))
    return True


def calculate_character_distance(maybe_merge_list_partial, sub_read, consensus_motif, p_indel, p_match, mat, mis, gap,
                                 ext, align_score, beta):
    """Resolve a group of mergeable candidates into merged and stand-alone repeats.

    Each candidate is aligned against the consensus motif. Consecutive
    candidates whose unaligned flanks (right flank of the previous one plus
    left flank of the current one) total at most 70 bases are chained into
    one merged region. The others are accepted or retried individually.

    Args:
        maybe_merge_list_partial: Candidates of one group; element ``[0]`` is
            the motif, ``[1]``/``[2]`` the inclusive start/end in ``sub_read``.
        sub_read: The read sequence the positions refer to.
        consensus_motif: Motif agreed on for the group.
        p_indel: Maximum accepted indel ratio.
        p_match: Minimum accepted match ratio.
        mat, mis, gap, ext: Match, mismatch, gap-open and gap-extend scores.
        align_score: Minimum accepted alignment score.
        beta: Passed through to the segmented global aligner.

    Returns:
        ``(final_merge_trs, early_qualified_trs)``: merged regions as
        ``(consensus_motif, start, end)`` that still need scoring, and
        repeats already accepted as ``(motif, start, end, indel_rio,
        match_rio, score, align, copy)``.

    Note:
        The copy number of repeats accepted through an alternative motif is
        now computed from the aligned span (end - start) like the primary
        path; it previously used the end coordinate only.
    """
    final_merge_trs = []
    final_merge_list = []
    early_qualified_trs = []
    last = 0  # right flank of the previously processed candidate
    for inde, m_m_p in enumerate(maybe_merge_list_partial):
        start, end = m_m_p[1], m_m_p[2]
        if end - start + 1 > 6000:
            # NOTE(review): this branch aligns against the candidate's own
            # motif while the short branch uses consensus_motif -- confirm
            # that the difference is intended.
            insert, delete, mismatch, align, left, right, score, copy = \
                Segmented_Global_Alignment.segmented_global_align_algorithm(
                    sub_read[start:end + 1], m_m_p[0], p_indel, mat, mis, gap, ext, beta)
            if insert is None:
                continue
            length = end - start + 1 - right - left
        else:
            seq = sub_read[start:end + 1]
            target_len = round(len(seq) * (1 + p_indel + 0.005))
            tr = target_len // len(consensus_motif) * consensus_motif \
                + consensus_motif[:target_len % len(consensus_motif)]
            aligned = _ccd_local_align(tr, seq, len(consensus_motif), mat, mis, gap, ext)
            if aligned is None:
                continue
            insert, delete, mismatch, length, score, align, copy, left, query_end = aligned
            right = len(seq) - query_end

        indel_rio = (insert + delete) / (length + delete)
        match_rio = (length - mismatch - insert) / (length - insert)

        if inde != 0 and left + last <= 70:
            # Close enough to the previous candidate: extend (or start) a chain.
            if final_merge_list == []:
                previous = maybe_merge_list_partial[inde - 1]
                final_merge_list.append(previous)
                # The previous candidate is now part of a chain; drop its
                # stand-alone record if it has one.
                if early_qualified_trs and previous[1] <= early_qualified_trs[-1][1] and \
                        previous[2] >= early_qualified_trs[-1][2]:
                    del early_qualified_trs[-1]
            final_merge_list.append(m_m_p)
        else:
            if final_merge_list:
                # The chain ends before this candidate.
                final_merge_trs.append((consensus_motif, final_merge_list[0][1], final_merge_list[-1][2]))
                final_merge_list = []
            fit = (length, indel_rio, match_rio, score, align, copy, left, right)
            if not _ccd_qualify_or_retry(m_m_p, sub_read, fit, early_qualified_trs, p_indel,
                                         p_match, mat, mis, gap, ext, align_score, beta):
                # NOTE(review): as in the original control flow, `last` keeps
                # its earlier value when the tri-gram model yields nothing.
                continue
        last = right

    if final_merge_list != []:
        final_merge_trs.append((consensus_motif, final_merge_list[0][1], final_merge_list[-1][2]))
    return final_merge_trs, early_qualified_trs
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
def simple_local_comparison_algorithm(seq, motif, p_indel, mat, mis, gap):
    """Locally align ``seq`` against a perfect repeat of ``motif``.

    A plain dynamic-programming local alignment with a linear gap penalty.
    Ties are resolved in favour of substitution, then deletion, then
    insertion.

    Args:
        seq: Query sequence.
        motif: Repeat unit; the target is ``motif`` repeated to roughly
            ``len(seq) * (1 + p_indel + 0.005)`` bases.
        p_indel: Indel ratio used to size the target.
        mat: Score of a match.
        mis: Score of a mismatch.
        gap: Score of a gap position.

    Returns:
        ``(insert, delete, mismatch, align, left, right, max_score)`` where
        ``align`` is a printable three-line-per-block alignment and
        ``left``/``right`` are the numbers of unaligned bases at either end
        of ``seq``. When no position scores above zero, an empty alignment
        ``(0, 0, 0, '', 0, len(seq), 0)`` is returned.

    Raises:
        ZeroDivisionError: if ``motif`` is empty.
    """
    target_len = round(len(seq) * (1 + p_indel + 0.005))
    tr = target_len // len(motif) * motif + motif[:target_len % len(motif)]
    rows, cols = len(seq) + 1, len(tr) + 1
    score_matrix = np.zeros((rows, cols))
    # Direction each cell was reached from:
    # 0 = unset, 1 = diagonal, 2 = left (deletion), 3 = up (insertion).
    direction = np.zeros((rows, cols), dtype=np.int8)

    max_score = 0
    # (0, 0) makes the traceback a no-op when nothing scores above zero.
    max_pos = (0, 0)

    for i in range(1, rows):
        for j in range(1, cols):
            match = score_matrix[i - 1, j - 1] + (mat if seq[i - 1] == tr[j - 1] else mis)
            delete = score_matrix[i, j - 1] + gap
            insert = score_matrix[i - 1, j] + gap
            cell = max(0, match, delete, insert)
            score_matrix[i, j] = cell

            # Remember the first cell reaching the highest score.
            if cell > max_score:
                max_score = score_matrix[i, j]
                max_pos = (i, j)

            # Prefer substitution; between gaps prefer deletion.
            if cell == match:
                direction[i, j] = 1
            elif cell == delete:
                direction[i, j] = 2
            elif cell == insert:
                direction[i, j] = 3

    i, j = max_pos
    original_i, original_j = i, j
    align_mark = []
    target = []
    query = []
    delete = 0
    insert = 0
    mismatch = 0
    while score_matrix[i, j] != 0:
        step = direction[i, j]
        if step == 1:
            i -= 1
            j -= 1
            if seq[i] == tr[j]:
                align_mark.append('|')
            else:
                align_mark.append('.')
                mismatch += 1
            target.append(tr[j])
            query.append(seq[i])
        elif step == 3:
            i -= 1
            align_mark.append('-')
            query.append(seq[i])
            target.append('-')
            insert += 1
        else:
            j -= 1
            align_mark.append('-')
            query.append('-')
            target.append(tr[j])
            delete += 1

    # The traceback collected the columns back to front.
    align_mark = ''.join(reversed(align_mark))
    target = ''.join(reversed(target))
    query = ''.join(reversed(query))

    # Render the alignment in blocks of 60 columns.
    align = []
    target_start = j
    query_start = i
    align_start = 0
    n_blocks = math.ceil(len(align_mark) / 60)
    for k in range(n_blocks):
        stop = min(len(align_mark), align_start + 60)
        target_chunk = target[align_start:stop]
        query_chunk = query[align_start:stop]
        mark_chunk = align_mark[align_start:stop]

        target_local = 'target' + (13 - len(str(target_start))) * ' ' + str(target_start) + ' ' + target_chunk
        query_local = 'query' + (14 - len(str(query_start))) * ' ' + str(query_start) + ' ' + query_chunk
        align_local = (19 - len(str(align_start))) * ' ' + str(align_start) + ' ' + mark_chunk

        if k + 1 < n_blocks:
            target_local += '\n'
            query_local += '\n\n'
            align_local += '\n'
            # Gap symbols do not advance the sequence coordinates.
            target_start += 60 - target_chunk.count('-')
            query_start += 60 - query_chunk.count('-')
            align_start += 60
        else:
            # The last block also carries the end coordinates.
            target_local += (' ' + str(original_j) + '\n')
            align_local += (' ' + str(len(align_mark)) + '\n')
            query_local += (' ' + str(original_i) + '\n' + '\n')
        align.append(target_local)
        align.append(align_local)
        align.append(query_local)
    align = ''.join(align)

    left = i
    right = len(seq) - original_i
    return insert, delete, mismatch, align, left, right, max_score
|