FastSTR 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,541 @@
1
+ import math
2
+ import re
3
+ from collections import deque
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+
8
+
9
+ # 寻找最佳解释:原则是如果有整motif长度,首选整motif长度,如果有多个,选取错误率最低的;如果没有整motif长度,则选取他周边是整motif长度,且编辑距
10
+ # 离最低的,如果也有多个,则选取错误率最低的
11
+ # RPTA
12
def realtime_path_tracking_alignment(standard_seq, seq, mat, mis, gap, logo=0):
    """Globally align ``seq`` to ``standard_seq`` while tracking paths (RPTA).

    A score matrix is filled with rows following ``seq`` and columns
    following ``standard_seq``. Next to every score the function keeps the
    set of diagonal-step counts over all co-optimal paths reaching the cell
    and one preferred predecessor for traceback.

    Args:
        standard_seq: Reference string laid out along the columns.
        seq: Query string laid out along the rows.
        mat: Score added by a diagonal step over equal characters.
        mis: Score added by a diagonal step over different characters.
        gap: Score added by every horizontal or vertical step.
        logo: When 1 the complete matrices are returned; otherwise only the
            last row of the score and count matrices.

    Returns:
        ``(dp, upper_left, priority_path)`` when ``logo == 1``, otherwise
        ``(dp[len(seq)], upper_left[len(seq)], priority_path)``. Each entry
        of ``priority_path`` is ``-1`` (origin) or ``(prev_row, prev_col,
        tag)`` with tag 1 = match, 0 = mismatch, -1 = indel.
    """
    n_rows = len(seq) + 1
    n_cols = len(standard_seq) + 1
    dp = [[0] * n_cols for _ in range(n_rows)]
    upper_left = [[set() for _ in range(n_cols)] for _ in range(n_rows)]
    priority_path = [[-1] * n_cols for _ in range(n_rows)]

    # Borders are pure gap runs with zero diagonal steps; the origin keeps -1.
    for r in range(n_rows):
        upper_left[r][0].add(0)
        if r:
            dp[r][0] = r * gap
            priority_path[r][0] = (r - 1, 0, -1)
    for c in range(n_cols):
        upper_left[0][c].add(0)
        if c:
            dp[0][c] = c * gap
            priority_path[0][c] = (0, c - 1, -1)

    for r in range(1, n_rows):
        for c in range(1, n_cols):
            same = standard_seq[c - 1] == seq[r - 1]
            left_set = upper_left[r][c - 1]
            up_set = upper_left[r - 1][c]
            diag_set = upper_left[r - 1][c - 1]

            from_left = dp[r][c - 1] + gap
            from_up = dp[r - 1][c] + gap
            from_diag = dp[r - 1][c - 1] + (mat if same else mis)
            best = max(from_up, from_left, from_diag)
            dp[r][c] = best

            # Three-bit code of the optimal moves: 2 = left, 4 = up,
            # 1 = diagonal. The count set is the union over those moves.
            code = 0
            if best == from_left:
                upper_left[r][c] = left_set
                code += 2
            if best == from_up:
                upper_left[r][c] = up_set if code == 0 else left_set | up_set
                code += 4
            if best == from_diag:
                bumped = {count + 1 for count in diag_set}
                upper_left[r][c] = bumped if code == 0 else upper_left[r][c] | bumped
                code += 1

            diag_step = (r - 1, c - 1, 1 if same else 0)
            left_step = (r, c - 1, -1)
            up_step = (r - 1, c, -1)

            # Among tied moves the predecessor with the smallest diagonal
            # count wins; the diagonal is preferred on equal counts, then left.
            if code == 1:
                step = diag_step
            elif code == 2:
                step = left_step
            elif code == 4:
                step = up_step
            elif code == 3:
                step = diag_step if min(diag_set) + 1 <= min(left_set) else left_step
            elif code == 5:
                step = diag_step if min(diag_set) + 1 <= min(up_set) else up_step
            elif code == 6:
                step = left_step if min(left_set) <= min(up_set) else up_step
            else:
                via_diag = min(diag_set) + 1
                via_left = min(left_set)
                lowest = min(via_left, min(up_set), via_diag)
                if lowest == via_diag:
                    step = diag_step
                elif lowest == via_left:
                    step = left_step
                else:
                    step = up_step
            priority_path[r][c] = step

    if logo == 1:
        return dp, upper_left, priority_path
    return dp[len(seq)], upper_left[len(seq)], priority_path
105
+
106
+
107
+ # MCLA
108
def motif_constrained_local_align(standard_seq, seq, mat, mis, gap, n):
    """Locally align ``seq`` to ``standard_seq`` with a constrained start (MCLA).

    Scores are floored at zero, so an alignment may begin at any row of
    ``seq``. A diagonal step is only scored normally in the first column or
    when the diagonal neighbour has itself been reached, which forces every
    alignment to enter ``standard_seq`` at its first character.

    Args:
        standard_seq: Reference string laid out along the columns.
        seq: Query string laid out along the rows.
        mat: Score added by a diagonal step over equal characters.
        mis: Score added by a diagonal step over different characters.
        gap: Score added by every horizontal or vertical step.
        n: Column period; only last-row columns that are multiples of ``n``
            compete for ``max_score``.

    Returns:
        ``(dp[len(seq)], upper_left[len(seq)], priority_path, max_score)``.
        ``upper_left`` cells are sets of ``(diagonal_count, start_row,
        start_col)`` tuples (``(0, -1, -1)`` for untouched cells).
        ``priority_path`` cells are ``-1`` when unreached, else
        ``(prev_row, prev_col, tag)`` with tag 1 = match, 0 = mismatch,
        -1 = indel. ``max_score`` is the largest last-row score found at a
        column that is a multiple of ``n`` (0 if none is positive).
    """
    row_count = len(seq) + 1
    col_count = len(standard_seq) + 1
    dp = [[0] * col_count for _ in range(row_count)]
    upper_left = [[{(0, -1, -1)} for _ in range(col_count)] for _ in range(row_count)]
    priority_path = [[-1] * col_count for _ in range(row_count)]
    max_score = 0
    last_row = len(seq)

    for i in range(1, row_count):
        for j in range(1, col_count):
            diag_unreached = priority_path[i - 1][j - 1] == -1
            # Past the first column a cell is only reachable through a
            # neighbour that has already been reached.
            if (j > 1 and diag_unreached and priority_path[i - 1][j] == -1
                    and priority_path[i][j - 1] == -1):
                continue

            same = seq[i - 1] == standard_seq[j - 1]
            match = mis
            if j == 1 or not diag_unreached:
                match = dp[i - 1][j - 1] + (mat if same else mis)
            delete = dp[i][j - 1] + gap
            insert = dp[i - 1][j] + gap
            dp[i][j] = max(0, match, delete, insert)
            if match < 0 and delete < 0 and insert < 0:
                # Score was floored to zero: the cell stays unreached.
                continue

            left_set = upper_left[i][j - 1]
            up_set = upper_left[i - 1][j]
            diag_set = upper_left[i - 1][j - 1]

            # Three-bit code of the optimal moves: 2 = left (delete),
            # 4 = up (insert), 1 = diagonal.
            chose_path = 0
            if dp[i][j] == delete:
                upper_left[i][j] = left_set
                chose_path += 2
            if dp[i][j] == insert:
                upper_left[i][j] = up_set if chose_path == 0 else left_set | up_set
                chose_path += 4
            if dp[i][j] == match:
                if diag_unreached:
                    # A fresh alignment starts here: remember its start cell.
                    updated_set = {(x[0] + 1, i, j) for x in diag_set}
                else:
                    updated_set = {(x[0] + 1, x[1], x[2]) for x in diag_set}
                if chose_path == 0:
                    upper_left[i][j] = updated_set
                else:
                    upper_left[i][j] = upper_left[i][j] | updated_set
                chose_path += 1

            diag_step = (i - 1, j - 1, 1 if same else 0)
            left_step = (i, j - 1, -1)
            up_step = (i - 1, j, -1)

            if chose_path == 1:
                step = diag_step
            elif chose_path == 2:
                step = left_step
            elif chose_path == 4:
                step = up_step
            elif chose_path == 3:
                if min(t[0] for t in diag_set) + 1 <= min(t[0] for t in left_set):
                    step = diag_step
                else:
                    step = left_step
            elif chose_path == 5:
                if min(t[0] for t in diag_set) + 1 <= min(t[0] for t in up_set):
                    step = diag_step
                else:
                    step = up_step
            elif chose_path == 6:
                if min(t[0] for t in left_set) <= min(t[0] for t in up_set):
                    step = left_step
                else:
                    step = up_step
            else:
                via_diag = min(t[0] for t in diag_set) + 1
                via_left = min(t[0] for t in left_set)
                via_up = min(t[0] for t in up_set)
                min_ul = min(via_left, via_up, via_diag)
                # The diagonal test must use the diagonal neighbour's count.
                # Comparing against the up neighbour's count + 1 can never
                # succeed, so the diagonal was never picked on a 3-way tie.
                if min_ul == via_diag:
                    step = diag_step
                elif min_ul == via_left:
                    step = left_step
                else:
                    step = up_step
            priority_path[i][j] = step

            if i == last_row and j % n == 0 and dp[i][j] > max_score:
                max_score = dp[i][j]

    return dp[len(seq)], upper_left[len(seq)], priority_path, max_score
191
+
192
+
193
def format_alignment(align_path, seq, motif, stab_l):
    """Trace an alignment path backwards and render it as three strings.

    The walk starts at cell ``(len(seq), stab_l)`` of ``align_path`` and
    follows the stored predecessors until a ``-1`` entry is reached.

    Args:
        align_path: Matrix whose cells are ``-1`` or ``(prev_row, prev_col,
            tag)`` with tag 0 = mismatch, 1 = match, anything else = indel.
        seq: Query string; its characters are consumed from the end.
        motif: Repeat unit; target characters are read from it cyclically,
            starting at its last character.
        stab_l: Column at which the traceback starts.

    Returns:
        ``[target, markers, query, stab_l, steps, consumed]`` where
        ``markers`` uses ``'|'`` for match, ``'.'`` for mismatch and ``'-'``
        for indel, ``steps`` is the number of path cells visited and
        ``consumed`` is the number of ``seq`` characters written.
    """
    # NOTE(review): target letters come from `motif`, not from the reference
    # that was actually aligned, so this presumably expects `stab_l` to sit
    # on a motif boundary -- confirm against the callers.
    target_chars = deque()
    marker_chars = deque()
    query_chars = deque()
    motif_size = len(motif)
    seq_pos = len(seq) - 1
    motif_pos = motif_size - 1
    row, col = len(seq), stab_l
    steps = 0

    while align_path[row][col] != -1:
        step = align_path[row][col]
        steps += 1
        tag = step[2]

        takes_query = True
        takes_motif = True
        if tag == 0:
            marker = '.'
        elif tag == 1:
            marker = '|'
        else:
            marker = '-'
            if step[1] == col:
                # Same column: a query character faces a gap in the target.
                takes_motif = False
            else:
                takes_query = False

        if takes_query:
            query_chars.appendleft(seq[seq_pos])
            seq_pos -= 1
        else:
            query_chars.appendleft('-')

        if takes_motif:
            target_chars.appendleft(motif[motif_pos])
            motif_pos -= 1
            if motif_pos == -1:
                motif_pos += motif_size
        else:
            target_chars.appendleft('-')

        marker_chars.appendleft(marker)
        row, col = step[0], step[1]

    return [
        ''.join(target_chars),
        ''.join(marker_chars),
        ''.join(query_chars),
        stab_l,
        steps,
        len(seq) - 1 - seq_pos,
    ]
247
+
248
+
249
def _pick_lowest_indel(candidates, ceiling, mat, mis, ope, describe):
    """Return the stats of the first candidate column with the fewest indels.

    ``describe(c_l)`` must return ``(rows, diagonals, score)`` for candidate
    column ``c_l``: the number of query rows covered, the diagonal-step
    count and the alignment score. Only candidates whose indel count is
    strictly below ``ceiling`` are accepted; when none qualifies the first
    candidate is kept with ``mismatch == ceiling`` and zero indels.

    Returns ``(insert_num, delete_num, mismatch, stab_l)``.
    """
    fewest = ceiling
    mismatch = ceiling
    insert_num = 0
    delete_num = 0
    stab_l = candidates[0]
    for c_l in candidates:
        rows, diagonals, score = describe(c_l)
        indel_num = c_l + rows - 2 * diagonals
        # Solve score = indels*ope + matches*mat + mismatches*mis for the
        # mismatch count, with matches = diagonals - mismatches.
        mismatch_num = (score - indel_num * ope - diagonals * mat) / (mis - mat)
        if indel_num < fewest:
            fewest = indel_num
            mismatch = mismatch_num
            insert_num = rows - diagonals
            delete_num = indel_num - insert_num
            stab_l = c_l
    return insert_num, delete_num, mismatch, stab_l


def global_motif_multiple_align(seq, motif, logo, indel, mat, mis, ope, ext, marks):
    """Align one segment against tandem copies of ``motif``.

    A reference of repeated motif copies, slightly longer than ``seq``, is
    built and aligned; among the best-scoring end columns the one with the
    fewest indels is reported.

    Args:
        seq: Segment to align.
        motif: Repeat unit.
        logo: 1 for the leading segment (local alignment whose reference
            length must be a whole number of motifs), 2 for the trailing
            segment (any end cell with the best score), anything else for
            an inner segment (end column on a motif boundary).
        indel: Fraction by which the reference may exceed ``len(seq)``.
        mat: Match score.
        mis: Mismatch score.
        ope: Score applied to every gap step.
        ext: Unused.
        marks: Length of the exact probe preceding the segment; its
            remainder modulo ``len(motif)`` fixes the reference phase.

    Returns:
        For ``logo`` 1 or 2 a 5-tuple ``(insert_num, delete_num, mismatch,
        format_align, unaligned)`` where ``format_align`` is the list built
        by ``format_alignment`` (or ``''`` with zero counts and
        ``unaligned == len(seq)`` when nothing aligns). For any other
        ``logo`` the 4-tuple ``(insert_num, delete_num, mismatch,
        format_align)``.
    """
    # NOTE(review): when no candidate has fewer than len(seq) + 1 indels the
    # sentinel len(seq) + 1 is returned as the mismatch count; this mirrors
    # the long-standing behaviour -- confirm it is intended.
    real_seq_length = len(seq)
    n = len(motif)
    standard_seq_length = math.ceil(real_seq_length * (1 + indel + 0.005))
    copies = motif * (standard_seq_length // n)
    ceiling = real_seq_length + 1

    # Leading segment.
    if logo == 1:
        standard_seq = motif + copies
        align_score, diagonal_num, align_path, max_score = motif_constrained_local_align(
            standard_seq, seq, mat, mis, ope, n)
        candidate_length = [tr_l for tr_l, a_s in enumerate(align_score)
                            if tr_l % n == 0 and a_s == max_score]

        def describe_leading(c_l):
            fewest_diag = min(diagonal_num[c_l], key=lambda x: x[0])
            # Rows covered run from the recorded start row to the last row.
            return real_seq_length - fewest_diag[1] + 1, fewest_diag[0], align_score[c_l]

        insert_num, delete_num, mismatch, stab_l = _pick_lowest_indel(
            candidate_length, ceiling, mat, mis, ope, describe_leading)
        if max_score == 0 and align_path[len(seq)][stab_l] == -1:
            return 0, 0, 0, '', len(seq)
        format_align = format_alignment(align_path, seq, motif, stab_l)
        left = real_seq_length - format_align[5]
        return insert_num, delete_num, mismatch, format_align, left

    standard_seq = motif[marks % n:] + copies

    # Trailing segment: no whole-motif constraint, take the best end cell.
    if logo == 2:
        align_score, diagonal_num, align_path = realtime_path_tracking_alignment(
            standard_seq, seq, mat, mis, ope, 1)
        max_score = np.max(np.array(align_score))
        if max_score < 0:
            return 0, 0, 0, '', len(seq)
        candidate_length = []
        row = 0
        # Search upwards from the last row for the first one holding the best score.
        for ro in range(len(seq), 0, -1):
            candidate_length = [tr_l for tr_l, a_s in enumerate(align_score[ro])
                                if a_s == max_score]
            if candidate_length:
                row = ro
                break
        if not candidate_length:
            return 0, 0, 0, '', len(seq)
        insert_num, delete_num, mismatch, stab_l = _pick_lowest_indel(
            candidate_length, ceiling, mat, mis, ope,
            lambda c_l: (row, min(diagonal_num[row][c_l]), align_score[row][c_l]))
        format_align = format_alignment(align_path, seq[:row], motif, stab_l)
        right = real_seq_length - row
        return insert_num, delete_num, mismatch, format_align, right

    # Inner segment: the end column must sit on a motif boundary.
    align_score, diagonal_num, align_path = realtime_path_tracking_alignment(
        standard_seq, seq, mat, mis, ope)
    offset = marks % n
    boundary_columns = [tr_l for tr_l in range(len(align_score))
                        if (tr_l - offset) % n == 0]
    top_score = max(align_score)
    candidate_length = [tr_l for tr_l in boundary_columns
                        if align_score[tr_l] == top_score]
    if not candidate_length and boundary_columns:
        # The overall best score is off-boundary: fall back to the best
        # score found on a motif boundary.
        best_on_boundary = max(align_score[tr_l] for tr_l in boundary_columns)
        candidate_length = [tr_l for tr_l in boundary_columns
                            if align_score[tr_l] == best_on_boundary]
    insert_num, delete_num, mismatch, stab_l = _pick_lowest_indel(
        candidate_length, ceiling, mat, mis, ope,
        lambda c_l: (real_seq_length, min(diagonal_num[c_l]), align_score[c_l]))
    format_align = format_alignment(align_path, seq, motif, stab_l)
    return insert_num, delete_num, mismatch, format_align
358
+
359
+
360
def segmented_global_align_algorithm(sequence, motif, indel, mat, mis, ope, ext, beta=0.045):
    """Align a candidate repeat region to ``motif`` segment by segment.

    Exact occurrences of a probe (whole motif copies, 5 bases for
    dinucleotide and 6 for trinucleotide motifs) are located first. The
    stretches between consecutive probes, before the first and after the
    last one are aligned with ``global_motif_multiple_align`` and the pieces
    are concatenated. Overlapping probe hits that are out of phase are
    grouped into a "chaotic" stretch that is aligned as one segment.

    Args:
        sequence: Region to align (converted with ``str``).
        motif: Repeat unit.
        indel: Allowed length slack, forwarded to the segment aligner.
        mat: Match score.
        mis: Mismatch score; also forwarded as the gap score of every
            segment alignment.
        ope: Gap score used only for the final ``score``.
        ext: Unused.
        beta: Minimum fraction of ``sequence`` that probe hits must cover.

    Returns:
        ``(insert, delete, mismatch, align_text, left, right, score,
        copies)``, or a tuple of eight ``None`` when the region is rejected:
        empty sequence, probe coverage below ``beta``, fewer than two probe
        hits, or a most common probe spacing different from ``len(motif)``.
    """
    rejected = (None,) * 8
    sequence = str(sequence)
    if not sequence:
        # Nothing to align; the coverage ratio below would divide by zero.
        return rejected

    n = len(motif)
    if n == 2:
        marks = 5
    elif n == 3:
        marks = 6
    else:
        marks = n
    remainder = marks % n
    probe = str(marks // n * motif + motif[:remainder])
    # Zero-width lookahead so that overlapping probe hits are all reported.
    pattern = re.compile('(?={})'.format(re.escape(str(probe))))
    com_match = [match.start() for match in pattern.finditer(sequence)]

    if len(com_match) * len(probe) / len(sequence) < beta:
        return rejected
    if len(com_match) < 2:
        # A single hit gives no spacing to check; idxmax() on the empty
        # Series would raise, so treat it as lacking periodic support.
        return rejected

    distances = [com_match[i + 1] - com_match[i] for i in range(len(com_match) - 1)]
    most_common_dis = pd.Series(distances).value_counts().idxmax()
    if most_common_dis != n:
        return rejected

    insert_total = 0
    delete_total = 0
    mismatch_total = 0
    target_parts = []
    align_parts = []
    query_parts = []
    target_length = 0
    align_length = 0

    def emit(target_piece, align_piece, query_piece, target_len, align_len):
        """Append one rendered piece and account for its lengths."""
        nonlocal target_length, align_length
        target_parts.append(target_piece)
        align_parts.append(align_piece)
        query_parts.append(query_piece)
        target_length += target_len
        align_length += align_len

    def emit_probe():
        emit(probe, '|' * marks, probe, marks, marks)

    def emit_alignment(fmt):
        emit(fmt[0], fmt[1], fmt[2], fmt[3], fmt[4])

    def probe_is_free(index, end):
        """True when the next probe hit does not overlap the one ending at ``end``."""
        return index + 1 == len(com_match) or com_match[index + 1] >= end

    pre_end = 0
    chaos_seq_start = -1
    left = 0
    for index, cm in enumerate(com_match):
        # Probe directly follows the previous one: no chaotic stretch.
        if cm - pre_end == 0 and chaos_seq_start < 0:
            pre_end = pre_end + marks
            if cm > 0 and remainder > 0:
                # The probe ends mid-motif, so the rest of that motif copy
                # is reported as deleted before the next probe.
                filler = n - remainder
                delete_total += filler
                emit(motif[remainder:], '-' * filler, '-' * filler, filler, filler)
            if probe_is_free(index, pre_end):
                emit_probe()
            continue

        if cm - pre_end > 0 and chaos_seq_start < 0:
            # The leading segment is handled separately.
            if pre_end == 0:
                insert_num, delete_num, mismatch_num, format_align, left = global_motif_multiple_align(
                    sequence[pre_end:cm], motif, 1, indel, mat, mis, mis, mis, marks)
                insert_total += insert_num
                delete_total += delete_num
                mismatch_total += mismatch_num
                pre_end = cm + marks
                if format_align != '':
                    emit_alignment(format_align)
                if probe_is_free(index, pre_end):
                    emit_probe()
                continue
            insert_num, delete_num, mismatch_num, format_align = global_motif_multiple_align(
                sequence[pre_end:cm], motif, 0, indel, mat, mis, mis, mis, marks)
            insert_total += insert_num
            delete_total += delete_num
            mismatch_total += mismatch_num
            pre_end = cm + marks
            emit_alignment(format_align)
            if probe_is_free(index, pre_end):
                emit_probe()
            continue

        # Start of a chaotic stretch: this probe overlaps the previous one.
        if chaos_seq_start == -1:
            chaos_seq_start = pre_end - marks
            pre_end = cm + marks
            continue
        # Still inside the chaotic stretch.
        if chaos_seq_start >= 0 and cm - pre_end < 0:
            pre_end = cm + marks
            continue

        # End of the chaotic stretch. For motif lengths 2 and 3 the stretch
        # is one long run of overlapping probes, so it is copied verbatim
        # as matched instead of being realigned.
        if n in [2, 3]:
            chunk = sequence[chaos_seq_start:pre_end]
            emit(chunk, '|' * len(chunk), chunk, len(chunk), len(chunk))
            chaos_seq_start = pre_end
        insert_num, delete_num, mismatch_num, format_align = global_motif_multiple_align(
            sequence[chaos_seq_start:cm], motif, 0, indel, mat, mis, mis, mis, marks)
        insert_total += insert_num
        delete_total += delete_num
        mismatch_total += mismatch_num
        chaos_seq_start = -1
        pre_end = cm + marks
        emit_alignment(format_align)
        # Emit this probe now unless it opens the next chaotic stretch, in
        # which case it is covered by that stretch's alignment.
        if probe_is_free(index, pre_end):
            emit_probe()

    # Remaining tail: either plain trailing sequence or an open chaotic stretch.
    right = 0
    tail = None
    if chaos_seq_start == -1 and pre_end < len(sequence):
        tail = sequence[pre_end:]
    elif chaos_seq_start >= 0:
        tail = sequence[chaos_seq_start:]
    if tail is not None:
        insert_num, delete_num, mismatch_num, format_align, right = global_motif_multiple_align(
            tail, motif, 2, indel, mat, mis, mis, mis, marks)
        if format_align != '':
            insert_total += insert_num
            delete_total += delete_num
            mismatch_total += mismatch_num
            emit_alignment(format_align)

    score = ((insert_total + delete_total) * ope + mismatch_total * mis
             + ((len(sequence) - right) - mismatch_total - insert_total) * mat)
    target_text = ''.join(target_parts)
    align_text = ''.join(align_parts)
    query_text = ''.join(query_parts)
    align = 'target 0 ' + target_text + ' ' + str(
        target_length) + '\n' + ' 0 ' + align_text + ' ' + str(
        align_length) + '\n' + 'query 0 ' + query_text + ' ' + str(len(sequence) - left - right)
    copies = (len(target_text) - target_text.count('-')) / len(motif)
    return insert_total, delete_total, mismatch_total, align, left, right, score, copies
536
+
537
+ # tr = 'TAAAATAAACAAAATAAAATAAATTAAACAATTAAATTAAATAAAATAAATTAAA'
538
+ # motif = 'TAAAA'
539
+ # Insert, Delete, Mismatch, align, _, right, _, copy = segmented_global_align_algorithm(tr, motif, 0.15, 2, -5, -7, -3)
540
+ # print(align)
541
+ # print(copy)
faststr/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # faststr/__init__.py -- NOTE(review): the wheel this file ships in is named FastSTR 1.0.0 while the string below says 0.1.0; confirm which version is authoritative.
2
+ __version__ = "0.1.0"
@@ -0,0 +1,28 @@
1
+ from . import utils
2
+ from . import scan_subread
3
+ from . import trs_align
4
+
5
+
6
+
7
def get_subread_trs(sub_read, m, r, g, e, p_indel, p_match, score, beta):
    """Run the tandem-repeat detection pipeline on one sub-read.

    Stages, each keyed by the same dictionary keys (slots 1-8 are always
    present; presumably these are motif lengths -- confirm against
    ``utils.get_anchors``):

    1. ``utils.get_anchors`` extracts anchors and anchor marks.
    2. ``scan_subread.cluster_anchors`` groups anchors into fuzzy repeat
       intervals.
    3. ``scan_subread.filter_chaotic_repeats`` turns those intervals into
       candidate repeats.
    4. ``trs_align.trs_align_algorithm`` merges, aligns and filters the
       candidates.

    Args:
        sub_read: The sub-read to scan.
        m, r, g, e: Forwarded unchanged to ``trs_align.trs_align_algorithm``.
        p_indel, p_match: Forwarded to the clustering and alignment stages.
        score, beta: Forwarded to ``trs_align.trs_align_algorithm``.

    Returns:
        All qualified repeats of the sub-read as one list, sorted by the
        element at index 1 of each record.
    """
    key_range = range(1, 9)

    # Stage 1: anchors.
    anchor_dict, anchor_mark_dict = utils.get_anchors(sub_read)

    # Stage 2: fuzzy repeat intervals per key.
    fuzzy_repeats_dict = {key: [] for key in key_range}
    for key, anchors in anchor_dict.items():
        fuzzy_repeats_dict[key] = scan_subread.cluster_anchors(
            list(anchors), anchor_mark_dict[key], p_indel, p_match)
    del anchor_dict

    # Stage 3: motif and approximate alignment interval of each fuzzy repeat.
    candidate_trs_dict = {key: [] for key in key_range}
    for key, fuzzy_repeats in fuzzy_repeats_dict.items():
        candidate_trs_dict[key] = scan_subread.filter_chaotic_repeats(
            fuzzy_repeats, sub_read, key)
    del fuzzy_repeats_dict

    # Stage 4: merge interrupted repeats, align the candidates and keep the
    # qualified ones; together they form the final set for this sub-read.
    qualified_trs_list = []
    for candidates in candidate_trs_dict.values():
        qualified_trs_list += trs_align.trs_align_algorithm(
            candidates, sub_read, p_indel, p_match, m, r, g, e, score, beta)
    del candidate_trs_dict

    qualified_trs_list.sort(key=lambda record: record[1])
    return qualified_trs_list