FastSTR 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faststr/trs_align.py ADDED
@@ -0,0 +1,704 @@
1
+ import math
2
+ from operator import itemgetter
3
+ import numpy as np
4
+ import regex
5
+ from Bio.Align import PairwiseAligner
6
+
7
+ from . import utils
8
+ from . import Segmented_Global_Alignment
9
+ from . import scan_subread
10
+
11
+
12
def can_merge_to_onetr(seq1, seq2, motif1, motif2, end, start, interrupted_seq):
    """Decide whether two neighbouring repeat candidates can be merged.

    Args:
        seq1: Sequence covered by the first (left) candidate.
        seq2: Sequence covered by the second (right) candidate.
        motif1: Motif of the first candidate.
        motif2: Motif of the second candidate.
        end: End coordinate of the first candidate.
        start: Start coordinate of the second candidate.
        interrupted_seq: Sequence between both candidates. It is accepted for
            interface compatibility but not read by this function.

    Returns:
        The motif to use for the merged repeat, or '' when the candidates
        must not be merged.
    """
    # The motif comparison always runs first, exactly as in the original flow.
    shared_motif = judge_motif_sim(motif1, motif2, seq1, seq2)

    # Candidates may only be merged when they are separated by 0..100 bases.
    if not (0 <= start - end - 1 <= 100):
        return ''
    if shared_motif != '':
        return shared_motif

    # Fallback for short motifs: accept motif1 if it occurs in the doubled
    # motif2 with at most one substitution, then re-derive a common motif.
    if len(motif1) >= 7:
        return ''
    pattern = f"({motif1}){{s<=1}}"
    if any(regex.finditer(pattern, str(motif2 + motif2))):
        return can_standardize_motif(seq1, seq2, motif1)
    return ''
22
+
23
+
24
def _tra_passes_thresholds(length, indel_rio, match_rio, score, p_indel, p_match, align_score):
    """Return True when an alignment satisfies every acceptance threshold."""
    return length >= 25 and indel_rio <= p_indel and match_rio >= p_match and score >= align_score


def _tra_align_span(sub_read, start, end, motif, p_indel, mat, mis, gap, ext, beta):
    """Align sub_read[start:end + 1] against a tandem repetition of motif.

    Spans longer than 6000 bases are delegated to the segmented global
    aligner; shorter ones use a Biopython local alignment.

    Returns:
        (left, right, length, score, align, copy, indel_rio, match_rio), or
        None when no usable alignment could be produced.
    """
    span = end - start + 1
    region = sub_read[start:end + 1]
    if span > 6000:
        (insert, delete, mismatch, align, left, right, score,
         copy) = Segmented_Global_Alignment.segmented_global_align_algorithm(
            region, motif, p_indel, mat, mis, gap, ext, beta)
        if insert is None:
            return None
        length = span - right - left
    else:
        # Build a perfect repeat slightly longer than the span so that
        # insertions in the read still find a partner in the target.
        target_len = round(span * (1 + p_indel + 0.001))
        target = target_len // len(motif) * motif + motif[:target_len % len(motif)]

        aligner = PairwiseAligner()
        aligner.mode = 'local'
        aligner.match_score = mat
        aligner.mismatch_score = mis
        aligner.open_gap_score = gap
        aligner.extend_gap_score = ext
        try:
            alignments = aligner.align(target, region)
            if not alignments:
                return None
            best = alignments[0]
            coordinates = best.coordinates
            counts = best.counts()
            length = best.length
            # Columns of the alignment that consume no base of the read.
            delete = length - coordinates[1][-1] + coordinates[1][0]
            insert = counts[0] - delete
            mismatch = counts[2]
            score = best.score
            align = best.format()
            # Copy number is the aligned stretch of the target, not its end
            # coordinate; this is applied uniformly to every motif.
            copy = (coordinates[0][-1] - coordinates[0][0]) / len(motif)
            left = coordinates[1][0]
            right = span - coordinates[1][-1]
        except OverflowError:
            # Raised by the aligner when the number of alignments overflows.
            return None

    indel_rio = (insert + delete) / (length + delete)
    match_rio = (length - mismatch - insert) / (length - insert)
    return left, right, length, score, align, copy, indel_rio, match_rio


def _tra_rescue_with_alternative_motifs(sub_read, start, end, motif, p_indel, p_match, mat, mis, gap, ext,
                                        align_score, beta):
    """Re-align a span with the motifs proposed by the tri-gram model.

    Returns:
        A list of qualified repeat tuples (possibly empty).
    """
    alternatives = utils.tri_gram_model(sub_read[start:end + 1], len(motif))
    if alternatives is None:
        return []
    rescued = []
    for alt_motif in alternatives:
        # Skip rotations of the motif that already failed.
        if motif in alt_motif + alt_motif:
            continue
        stats = _tra_align_span(sub_read, start, end, alt_motif, p_indel, mat, mis, gap, ext, beta)
        if stats is None:
            continue
        left, right, length, score, align, copy, indel_rio, match_rio = stats
        if _tra_passes_thresholds(length, indel_rio, match_rio, score, p_indel, p_match, align_score):
            rescued.append((alt_motif, start + left, end - right, indel_rio, match_rio, score, align, copy))
    return rescued


def trs_align_algorithm(n_candidate_trs, sub_read, p_indel, p_match, mat, mis, gap, ext, align_score, beta):
    """Merge candidate tandem repeats and keep those that align well.

    Each candidate is indexed as (motif, start, end, ...) with an inclusive
    end coordinate into sub_read. Candidates are sorted in place by start,
    neighbouring candidates with compatible motifs are grouped and merged,
    and every resulting span is aligned against a perfect repetition of its
    motif.

    Args:
        n_candidate_trs: List of candidates; sorted in place by start.
        sub_read: Sequence the candidate coordinates refer to.
        p_indel: Maximum accepted indel ratio.
        p_match: Minimum accepted match ratio.
        mat: Match score.
        mis: Mismatch score.
        gap: Gap-open score.
        ext: Gap-extension score.
        align_score: Minimum accepted alignment score.
        beta: Passed through to the segmented global aligner.

    Returns:
        List of tuples (motif, start, end, indel_rio, match_rio, score,
        align, copy) for every accepted repeat.
    """
    if not n_candidate_trs:
        return []

    qualified_trs = []
    after_merge_trs = []
    n_candidate_trs.sort(key=itemgetter(1))
    total = len(n_candidate_trs)

    def flush_group(group, group_motif):
        """Emit a finished merge group into the result lists."""
        if len(group) == 1:
            after_merge_trs.append(group[0])
            return
        final_merge_trs, early_qualified_trs = calculate_character_distance(
            group, sub_read, group_motif, p_indel, p_match, mat, mis, gap, ext, align_score, beta)
        after_merge_trs.extend(final_merge_trs)
        qualified_trs.extend(early_qualified_trs)

    def merged_motif(group, candidate):
        """Return the motif shared by group and candidate, or ''."""
        group_end = group[-1][2]
        return can_merge_to_onetr(
            sub_read[group[0][1]:group_end + 1],
            sub_read[candidate[1]:candidate[2] + 1],
            group[-1][0], candidate[0],
            group_end, candidate[1],
            sub_read[group_end + 1:max(group_end + 1, candidate[1])])

    maybe_merge_list = [n_candidate_trs[0]]
    # Invariant: consensus_motif is '' whenever the group has one member.
    consensus_motif = ''
    for index, c_t in enumerate(n_candidate_trs[1:]):
        if maybe_merge_list[-1] == c_t:
            continue
        consensus_motif_test = merged_motif(maybe_merge_list, c_t)
        if consensus_motif_test != '':
            maybe_merge_list.append(c_t)
            consensus_motif = consensus_motif_test
            continue

        if index + 2 == total:
            # c_t is the last candidate: close the group and start a new one.
            flush_group(maybe_merge_list, consensus_motif)
            maybe_merge_list = [c_t]
            consensus_motif = ''
            break

        # c_t could not be merged; look at the candidates that overlap it.
        i = 2
        while index + i < total:
            following = n_candidate_trs[index + i]
            if maybe_merge_list[-1][2] < c_t[1]:
                # Right overlap: c_t starts after the current group ends.
                if following[1] > c_t[2]:
                    # None of the overlapping candidates merged forward.
                    flush_group(maybe_merge_list, consensus_motif)
                    maybe_merge_list = [c_t]
                    consensus_motif = ''
                    break
                consensus_motif_test = merged_motif(maybe_merge_list, following)
                if consensus_motif_test != '':
                    # Forward merge with an overlapping candidate succeeded.
                    maybe_merge_list.append(following)
                    consensus_motif = consensus_motif_test
                    break
                # Forward merge failed but we are still inside the overlap.
                i += 1
            else:
                # Left overlap: c_t starts inside the current group.
                if following[1] < c_t[2]:
                    # Still inside the overlap, keep scanning.
                    i += 1
                    continue
                consensus_motif_test = merged_motif(maybe_merge_list, following)
                if consensus_motif_test != '':
                    # Just left the overlap and the backward merge succeeded.
                    maybe_merge_list.append(following)
                    consensus_motif = consensus_motif_test
                    break
                # Just left the overlap and the backward merge failed.
                flush_group(maybe_merge_list, consensus_motif)
                maybe_merge_list = [c_t]
                consensus_motif = ''
                break

    flush_group(maybe_merge_list, consensus_motif)

    # after_merge_trs is used as a work list: unaligned flanks found below
    # are appended to it and picked up by this same loop.
    for pos, a_m_t in enumerate(after_merge_trs):
        motif, start, end = a_m_t[0], a_m_t[1], a_m_t[2]
        if pos > 0:
            previous = after_merge_trs[pos - 1]
            # Skip spans contained in the previous one with a rotated motif.
            if start >= previous[1] and end <= previous[2] and previous[0] in motif + motif:
                continue

        stats = _tra_align_span(sub_read, start, end, motif, p_indel, mat, mis, gap, ext, beta)
        if stats is None:
            continue
        left, right, length, score, align, copy, indel_rio, match_rio = stats

        if _tra_passes_thresholds(length, indel_rio, match_rio, score, p_indel, p_match, align_score):
            qualified_trs.append((motif, start + left, end - right, indel_rio, match_rio, score, align, copy))

            # Long unaligned flanks may hide another repeat; re-queue them.
            if left >= 100:
                left_candidate = scan_subread.filter_chaotic_repeats(
                    [(start, start + left, start, start + left, 1)], sub_read, len(motif))
                if left_candidate:
                    after_merge_trs.append(left_candidate[0])
            if right >= 100:
                right_candidate = scan_subread.filter_chaotic_repeats(
                    [(end - right + 1, end, end - right + 1, end, 1)], sub_read, len(motif))
                if right_candidate:
                    after_merge_trs.append(right_candidate[0])
        elif length >= 25 and try_change_motif(score, indel_rio, match_rio, align_score, p_indel, p_match):
            qualified_trs.extend(_tra_rescue_with_alternative_motifs(
                sub_read, start, end, motif, p_indel, p_match, mat, mis, gap, ext, align_score, beta))
    return qualified_trs
254
+
255
+
256
def can_shorten_distance(seq, motif):
    """Check whether some 20-base window of seq is densely covered by motif.

    Positions covered by the marks returned by utils.get_motif_marks count
    as 1. For motif lengths other than 1, 4, 5, 6 and 7, positions covered
    only by a fuzzy match (one substitution for lengths 2 and 3, three
    substitutions otherwise) receive a fractional weight.

    Args:
        seq: Sequence to scan.
        motif: Repeat unit.

    Returns:
        True when the best window reaches the length-specific threshold,
        False otherwise.
    """
    unit = len(motif)
    coverage = [0] * len(seq)
    for hit in utils.get_motif_marks(seq, motif):
        coverage[hit:hit + unit] = [1] * unit

    if unit not in (1, 4, 5, 6, 7):
        if unit in (2, 3):
            pattern = f"({motif}){{s<=1}}"
            tolerated = 1
        else:
            pattern = f"({motif}){{s<=3}}"
            tolerated = 3
        fuzzy_starts = [found.start() for found in regex.finditer(pattern, str(seq))]
        for begin in fuzzy_starts:
            for pos in range(begin, begin + unit):
                current = coverage[pos]
                # Positions already at 1 stay at 1; others are raised
                # towards the fraction of the motif that must match.
                coverage[pos] = current + (1 - current) * (unit - tolerated) / unit

    # Slide a 20-wide window and remember the best (floored) coverage.
    running = sum(coverage[:20])
    best = math.floor(running)
    for right_edge in range(20, len(coverage)):
        running += coverage[right_edge] - coverage[right_edge - 20]
        best = max(best, math.floor(running))

    # Minimum coverage per motif length (index = length - 1).
    required = [16, 16, 16, 16, 15, 15, 14, 14]
    return best >= required[unit - 1]
309
+
310
+
311
def can_standardize_motif(seq1, seq2, motif1):
    """Try to find one motif that describes both seq1 and seq2.

    The top motif of the tri-gram model over seq1 + seq2 is probed against
    whichever sequence it was not derived from: seq2 when the motif is a
    rotation of motif1, seq1 otherwise.

    Args:
        seq1: Sequence of the first candidate.
        seq2: Sequence of the second candidate.
        motif1: Motif of the first candidate.

    Returns:
        The common motif, or '' when the probed sequence does not support it.
    """
    ranked = utils.tri_gram_model(seq1 + seq2, len(motif1))
    if ranked is None:
        return ''
    top_motif = ranked[0]
    unit = len(top_motif)

    # Minimum probe scores per motif length (index = length - 1).
    start_probe = [24, 24, 24, 21, 24, 24, 24, 22]
    end_probe = [24, 24, 24, 22, 23, 23, 23, 22]

    to_align_seq = seq2 if top_motif in motif1 + motif1 else seq1
    if not can_shorten_distance(to_align_seq, top_motif):
        return ''

    # A 25-base perfect repeat used as the alignment probe.
    probe = 25 // unit * top_motif + top_motif[:25 % unit]
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -5

    total = len(to_align_seq)

    # Scan 20-base windows from the left in steps of 5.
    start_index = -1
    for offset in range(0, (total // 5 - 3) * 5, 5):
        hits = aligner.align(probe, to_align_seq[offset:offset + 20])
        if hits and hits[0].score >= start_probe[unit - 1]:
            start_index = max(offset - 5, 0)
            break
    if start_index == -1:
        return ''

    # Scan 20-base windows from the right in steps of 5.
    end_index = -1
    for back in range(0, ((total - start_index - 1) // 5 - 3) * 5, 5):
        window_end = total - back
        hits = aligner.align(probe, to_align_seq[window_end - 20:window_end])
        if hits and hits[0].score >= end_probe[unit - 1]:
            end_index = min(window_end + 4, total - 1)
            break

    if end_index == -1 or end_index - start_index < 24:
        return ''
    return top_motif
354
+
355
+
356
def judge_motif_sim(m1, m2, s1, s2):
    """Return a motif shared by two candidates, or '' if they differ.

    Args:
        m1: Motif of the first candidate.
        m2: Motif of the second candidate.
        s1: Sequence of the first candidate.
        s2: Sequence of the second candidate.

    Returns:
        m1 when it is shorter than 7 and occurs in the doubled m2; otherwise
        the top tri-gram-model motif of s1 + s2 when m1 occurs in the doubled
        m2 with at most one substitution; otherwise ''.
    """
    doubled = m2 + m2
    # Searching the doubled motif makes the test rotation-insensitive.
    if len(m1) < 7 and m1 in doubled:
        return m1

    pattern = f"({m1}){{s<=1}}"
    if not any(regex.finditer(pattern, str(doubled))):
        return ''

    ranked = utils.tri_gram_model(s1 + s2, len(m1))
    return '' if ranked is None else ranked[0]
369
+
370
+
371
def try_change_motif(score, indel, match, align_score, p_indel, p_match):
    """Decide whether a failed alignment is worth retrying with another motif.

    A retry is refused when exactly one criterion fails by a clear margin
    while the other two are satisfied: the score by more than 5, or the indel
    or match ratio by more than 0.025.

    Args:
        score: Alignment score obtained.
        indel: Indel ratio obtained.
        match: Match ratio obtained.
        align_score: Minimum accepted score.
        p_indel: Maximum accepted indel ratio.
        p_match: Minimum accepted match ratio.

    Returns:
        False when a retry is refused, True otherwise.
    """
    score_ok = score >= align_score
    indel_ok = indel <= p_indel
    match_ok = match >= p_match

    score_far_off = align_score - score > 5 and indel_ok and match_ok
    indel_far_off = score_ok and indel - p_indel > 0.025 and match_ok
    match_far_off = score_ok and indel_ok and p_match - match > 0.025
    return not (score_far_off or indel_far_off or match_far_off)
379
+
380
+
381
def _ccd_passes_thresholds(length, indel_rio, match_rio, score, p_indel, p_match, align_score):
    """Return True when an alignment satisfies every acceptance threshold."""
    return length >= 25 and indel_rio <= p_indel and match_rio >= p_match and score >= align_score


def _ccd_local_align(region, motif, query_len, slack, p_indel, mat, mis, gap, ext):
    """Locally align region against a tandem repetition of motif.

    Args:
        region: Sequence to align.
        motif: Repeat unit used to build the target.
        query_len: Length used to size the target and to compute right.
        slack: Extra fraction added to the target length.

    Returns:
        (insert, delete, mismatch, align, left, right, score, copy, length),
        or None when the aligner yields nothing or overflows.
    """
    target_len = round(query_len * (1 + p_indel + slack))
    target = target_len // len(motif) * motif + motif[:target_len % len(motif)]

    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = mat
    aligner.mismatch_score = mis
    aligner.open_gap_score = gap
    aligner.extend_gap_score = ext
    try:
        alignments = aligner.align(target, region)
        if not alignments:
            return None
        best = alignments[0]
        coordinates = best.coordinates
        counts = best.counts()
        length = best.length
        # Columns of the alignment that consume no base of the read.
        delete = length - coordinates[1][-1] + coordinates[1][0]
        insert = counts[0] - delete
        mismatch = counts[2]
        score = best.score
        align = best.format()
        # Copy number is the aligned stretch of the target, not its end
        # coordinate; this is applied uniformly to every motif.
        copy = (coordinates[0][-1] - coordinates[0][0]) / len(motif)
        left = coordinates[1][0]
        right = query_len - coordinates[1][-1]
    except OverflowError:
        # Raised by the aligner when the number of alignments overflows.
        return None
    return insert, delete, mismatch, align, left, right, score, copy, length


def _ccd_rescue_with_alternative_motifs(sub_read, start, end, motif, p_indel, p_match, mat, mis, gap, ext,
                                        align_score, beta):
    """Re-align a span with the motifs proposed by the tri-gram model.

    Returns:
        A list of qualified repeat tuples (possibly empty), or None when the
        tri-gram model returned None.
    """
    span = end - start + 1
    region = sub_read[start:end + 1]
    alternatives = utils.tri_gram_model(region, len(motif))
    if alternatives is None:
        return None

    rescued = []
    for alt_motif in alternatives:
        # Skip rotations of the motif that already failed.
        if motif in alt_motif + alt_motif:
            continue
        if span > 6000:
            (insert, delete, mismatch, align, left, right, score,
             copy) = Segmented_Global_Alignment.segmented_global_align_algorithm(
                region, alt_motif, p_indel, mat, mis, gap, ext, beta)
            if insert is None:
                continue
            length = span - right - left
        else:
            stats = _ccd_local_align(region, alt_motif, span, 0.001, p_indel, mat, mis, gap, ext)
            if stats is None:
                continue
            insert, delete, mismatch, align, left, right, score, copy, length = stats

        indel_rio = (insert + delete) / (length + delete)
        match_rio = (length - mismatch - insert) / (length - insert)
        if _ccd_passes_thresholds(length, indel_rio, match_rio, score, p_indel, p_match, align_score):
            rescued.append((alt_motif, start + left, end - right, indel_rio, match_rio, score, align, copy))
    return rescued


def calculate_character_distance(maybe_merge_list_partial, sub_read, consensus_motif, p_indel, p_match, mat, mis, gap,
                                 ext, align_score, beta):
    """Split a group of merge candidates into merged runs and lone repeats.

    Every member, indexed as (motif, start, end, ...) with an inclusive end,
    is aligned (against consensus_motif for spans up to 6000 bases, with the
    segmented aligner and the member's own motif beyond that). A member joins
    the current run when its unaligned left flank plus the previous member's
    unaligned right flank is at most 70 bases; otherwise any open run is
    closed and the member is evaluated on its own.

    Args:
        maybe_merge_list_partial: Members of the merge group, in order.
        sub_read: Sequence the coordinates refer to.
        consensus_motif: Motif agreed on for the group.
        p_indel: Maximum accepted indel ratio.
        p_match: Minimum accepted match ratio.
        mat: Match score.
        mis: Mismatch score.
        gap: Gap-open score.
        ext: Gap-extension score.
        align_score: Minimum accepted alignment score.
        beta: Passed through to the segmented global aligner.

    Returns:
        (final_merge_trs, early_qualified_trs): merged runs as
        (consensus_motif, start, end) tuples, and individually accepted
        repeats as (motif, start, end, indel_rio, match_rio, score, align,
        copy) tuples.
    """
    final_merge_trs = []
    final_merge_list = []
    early_qualified_trs = []
    last = 0  # unaligned right flank of the previously processed member
    for inde, m_m_p in enumerate(maybe_merge_list_partial):
        motif, start, end = m_m_p[0], m_m_p[1], m_m_p[2]
        span = end - start + 1
        seq = sub_read[start:end + 1]
        if span > 6000:
            (insert, delete, mismatch, align, left, right, score,
             copy) = Segmented_Global_Alignment.segmented_global_align_algorithm(
                seq, motif, p_indel, mat, mis, gap, ext, beta)
            if insert is None:
                continue
            length = span - right - left
        else:
            stats = _ccd_local_align(seq, consensus_motif, len(seq), 0.005, p_indel, mat, mis, gap, ext)
            if stats is None:
                continue
            insert, delete, mismatch, align, left, right, score, copy, length = stats

        indel_rio = (insert + delete) / (length + delete)
        match_rio = (length - mismatch - insert) / (length - insert)

        if inde != 0 and left + last <= 70:
            # Close enough to the previous member: extend (or open) a run.
            if not final_merge_list:
                previous = maybe_merge_list_partial[inde - 1]
                final_merge_list.append(previous)
                # The previous member is now part of a run, so drop its
                # stand-alone record if that was the last one accepted.
                if early_qualified_trs and previous[1] <= early_qualified_trs[-1][1] and \
                        previous[2] >= early_qualified_trs[-1][2]:
                    del early_qualified_trs[-1]
            final_merge_list.append(m_m_p)
        else:
            if final_merge_list:
                # The run ends before this member.
                final_merge_trs.append((consensus_motif, final_merge_list[0][1], final_merge_list[-1][2]))
                final_merge_list = []
            if _ccd_passes_thresholds(length, indel_rio, match_rio, score, p_indel, p_match, align_score):
                early_qualified_trs.append(
                    (motif, start + left, end - right, indel_rio, match_rio, score, align, copy))
            elif length >= 25 and try_change_motif(score, indel_rio, match_rio, align_score, p_indel, p_match):
                rescued = _ccd_rescue_with_alternative_motifs(
                    sub_read, start, end, motif, p_indel, p_match, mat, mis, gap, ext, align_score, beta)
                if rescued is None:
                    # NOTE(review): kept from the original control flow; this
                    # skips the `last = right` update below. Confirm intended.
                    continue
                early_qualified_trs.extend(rescued)
        last = right

    if final_merge_list:
        final_merge_trs.append((consensus_motif, final_merge_list[0][1], final_merge_list[-1][2]))
    return final_merge_trs, early_qualified_trs
611
+
612
+
613
def simple_local_comparison_algorithm(seq, motif, p_indel, mat, mis, gap):
    """Locally align seq against a tandem repetition of motif.

    The score and traceback matrices are filled with a linear gap penalty.
    On ties the traceback prefers a diagonal step, then a gap in the query,
    then a gap in the target.

    Args:
        seq: Query sequence.
        motif: Repeat unit; the target is motif repeated to roughly
            len(seq) * (1 + p_indel + 0.005) bases.
        p_indel: Indel ratio used to size the target.
        mat: Match score.
        mis: Mismatch score.
        gap: Gap score applied to every gap column.

    Returns:
        (insert, delete, mismatch, align, left, right, max_score) where
        align is a printable three-row alignment in 60-column chunks and
        left/right are the numbers of unaligned bases at either end of seq.
        When no cell scores above zero the result is
        (0, 0, 0, '', 0, len(seq), 0).

    Raises:
        ZeroDivisionError: If motif is empty.
    """
    target_len = round(len(seq) * (1 + p_indel + 0.005))
    tr = target_len // len(motif) * motif + motif[:target_len % len(motif)]
    rows, cols = len(seq) + 1, len(tr) + 1
    score_matrix = np.zeros((rows, cols))
    traceback_matrix = np.zeros((rows, cols), dtype=object)

    max_score = 0
    max_pos = None

    for i in range(1, rows):
        for j in range(1, cols):
            match = score_matrix[i - 1, j - 1] + (mat if seq[i - 1] == tr[j - 1] else mis)
            delete = score_matrix[i, j - 1] + gap
            insert = score_matrix[i - 1, j] + gap
            score_matrix[i, j] = max(0, match, delete, insert)
            cell = score_matrix[i, j]

            # Remember the first cell that reaches the best score.
            if cell > max_score:
                max_score = cell
                max_pos = (i, j)

            # Prefer substitution; between the two gap kinds prefer delete.
            if cell == match:
                traceback_matrix[i, j] = (i - 1, j - 1)
            elif cell == delete:
                traceback_matrix[i, j] = (i, j - 1)
            elif cell == insert:
                traceback_matrix[i, j] = (i - 1, j)

    if max_pos is None:
        # Nothing scored above zero (empty input or no matching base):
        # report an empty alignment instead of failing on the unpacking.
        return 0, 0, 0, '', 0, len(seq), max_score

    i, j = max_pos
    original_i, original_j = i, j
    align_mark = []
    target = []
    query = []
    delete = 0
    insert = 0
    mismatch = 0
    while score_matrix[i, j] != 0:
        ii, jj = traceback_matrix[i, j]
        if ii + 1 == i and jj + 1 == j:
            if seq[ii] == tr[jj]:
                align_mark.append('|')
            else:
                align_mark.append('.')
                mismatch += 1
            target.append(tr[jj])
            query.append(seq[ii])
        elif ii + 1 == i and jj == j:
            # Base present in the query only.
            align_mark.append('-')
            query.append(seq[ii])
            target.append('-')
            insert += 1
        elif ii == i and jj + 1 == j:
            # Base present in the target only.
            align_mark.append('-')
            query.append('-')
            target.append(tr[jj])
            delete += 1
        i = ii
        j = jj

    # The traceback walks backwards, so reverse everything once.
    align_mark = ''.join(reversed(align_mark))
    target = ''.join(reversed(target))
    query = ''.join(reversed(query))

    columns = len(align_mark)
    chunk_count = math.ceil(columns / 60)
    align = []
    target_start = j
    query_start = i
    align_start = 0
    for k in range(chunk_count):
        stop = min(columns, align_start + 60)
        target_slice = target[align_start:stop]
        query_slice = query[align_start:stop]
        target_local = 'target' + str(target_start).rjust(13) + ' ' + target_slice
        query_local = 'query' + str(query_start).rjust(14) + ' ' + query_slice
        align_local = str(align_start).rjust(19) + ' ' + align_mark[align_start:stop]
        if k + 1 < chunk_count:
            target_local += '\n'
            query_local += '\n\n'
            align_local += '\n'
            # Gap characters do not advance the sequence coordinate.
            target_start += 60 - target_slice.count('-')
            query_start += 60 - query_slice.count('-')
            align_start += 60
        else:
            # The last chunk is closed with the end coordinates.
            target_local += ' ' + str(original_j) + '\n'
            align_local += ' ' + str(columns) + '\n'
            query_local += ' ' + str(original_i) + '\n' + '\n'
        align.append(target_local)
        align.append(align_local)
        align.append(query_local)
    align = ''.join(align)

    left = i
    right = len(seq) - original_i
    return insert, delete, mismatch, align, left, right, max_score