FastSTR 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,367 @@
1
+ import heapq
2
+ import regex
3
+
4
+ from . import trs_align
5
+ from . import utils
6
+
7
+
8
def judge_overlap_type(motif1, motif2, rear_overlap_pro, front_overlap_pro, start, end, overlap_length):
    """Classify how a rear TR and a front TR from adjacent sub-reads overlap.

    The two motifs must have the same length and ``motif1`` must equal some
    rotation of ``motif2`` up to at most one substituted character; otherwise
    the pair is rejected.

    Args:
        motif1: Motif of the TR from the previous sub-read.
        motif2: Motif of the TR from the next sub-read.
        rear_overlap_pro: Share of the rear TR covered by the common span.
        front_overlap_pro: Share of the front TR covered by the common span.
        start: Extent compared against ``overlap_length`` for the rear TR.
        end: Extent compared against ``overlap_length`` for the front TR.
        overlap_length: Length of the overlap shared by the two sub-reads.

    Returns:
        -1 when the TRs do not match, 0 for a fully overlapping match,
        1 for a left-overlap match, 2 for a right-overlap match and
        3 for an embedded-overlap match.
    """
    if len(motif1) != len(motif2):
        return -1

    query = str(motif1)
    doubled = str(motif2 + motif2)
    width = len(query)

    # Every rotation of motif2 is a width-long window of motif2 + motif2, so a
    # plain Hamming comparison per window replaces the fuzzy pattern
    # "(motif1){s<=1}".  Unlike the pattern, it treats the motif as literal
    # text, so characters such as "." or "+" cannot alter the match.
    is_similar = any(
        sum(a != b for a, b in zip(query, doubled[offset:offset + width])) <= 1
        for offset in range(width + 1)
    )
    if not is_similar:
        return -1

    starts_inside = start <= overlap_length
    ends_inside = end <= overlap_length

    # A full-overlap match additionally requires an exact rotation.
    if (starts_inside and ends_inside and query in doubled
            and rear_overlap_pro >= 0.9 and front_overlap_pro >= 0.9):
        return 0
    if not starts_inside and ends_inside and front_overlap_pro >= 0.8:
        return 1
    if starts_inside and not ends_inside and rear_overlap_pro >= 0.8:
        return 2
    if not starts_inside and not ends_inside:
        return 3
    return -1
24
+
25
+
26
def motif_sim(m1, m2):
    """Rate the similarity of two motifs, treating rotations as equivalent.

    Both arguments are converted with ``str`` before comparison.

    Args:
        m1: First motif.
        m2: Second motif.

    Returns:
        0 when ``m1`` is an exact rotation of ``m2``, 1 when it differs from
        some rotation of ``m2`` by exactly one substituted character, and -1
        otherwise (including when the lengths differ).
    """
    m1 = str(m1)
    m2 = str(m2)
    if len(m1) != len(m2):
        return -1

    doubled = m2 + m2
    if m1 in doubled:
        return 0

    # Each rotation of m2 is a window of m2 + m2.  Comparing characters
    # directly is equivalent to the fuzzy pattern "(m1){s<=1}" for plain
    # motifs, but never interprets motif characters as pattern syntax.
    width = len(m1)
    for offset in range(width):
        window = doubled[offset:offset + width]
        if sum(a != b for a, b in zip(m1, window)) <= 1:
            return 1
    return -1
38
+
39
+
40
def construct_consensus(rear_overlap, front_overlap, read_length, overlap_length, all_seq, global_index,
                        cross_sub_reads_TRs_list, p_indel, p_match, mat, mis, gap, ext, align_score, beta):
    """Merge the TRs found in the overlap of two consecutive sub-reads.

    ``rear_overlap`` holds TR records of the previous sub-read and
    ``front_overlap`` those of the next one.  Records are indexed 0..7 with
    the motif at [0], the start at [1] and the end at [2]; starts and ends are
    local to their sub-read.  Rear coordinates are made global by adding
    ``global_index``; front coordinates by adding
    ``global_index + read_length - overlap_length``.

    Both input lists and ``cross_sub_reads_TRs_list`` are modified in place:
    consumed records are deleted from the two overlap lists, and TRs that span
    sub-reads are appended to (or have their end extended in)
    ``cross_sub_reads_TRs_list`` as ``[motif, start, end]`` lists.

    ``p_indel`` through ``beta`` are only forwarded to
    ``trs_align.trs_align_algorithm`` when an embedded overlap is re-aligned.

    Returns:
        A tuple ``(consensus_trs, if_chage_motif)`` where ``consensus_trs`` is
        the list of merged TR tuples sorted by start and ``if_chage_motif`` is
        the most recent ``motif_sim`` result (or -1).

    NOTE(review): the nesting of this function was reconstructed from a
    flattened listing; confirm the placement of the ``break`` in the rear-TR
    search and of the two ``else`` branches against the upstream file.
    """
    consensus_trs = []
    j = 0
    # Offset that converts coordinates of the next sub-read to global ones.
    global_index_p = global_index + read_length - overlap_length
    if_chage_motif = -1
    while rear_overlap and j < len(front_overlap) or rear_overlap == [] and j < len(front_overlap):
        # The next sub_read has a TR that covers it entirely; it may be the
        # start or the middle of a TR that spans sub_reads.
        if (front_overlap[j][2] - overlap_length) / (read_length - overlap_length) > 0.98:
            if_start = -1
            # Find the overlapping TR of the previous sub_read to fix the start position.
            # The rear TRs are visited from the largest end downwards; mapping
            # `index` back with len - 1 - index presumably relies on
            # rear_overlap being ordered by ascending end — TODO confirm.
            for index, r_o in enumerate(sorted(rear_overlap, key=lambda x: x[2], reverse=True)):
                if_chage_motif = motif_sim(front_overlap[j][0], r_o[0])
                if if_chage_motif >= 0:
                    tr_overlap_length = r_o[2] - max(front_overlap[j][1] + read_length - overlap_length, r_o[1]) + 1
                    rear_overlap_pro = tr_overlap_length / (overlap_length - front_overlap[j][1])
                    if rear_overlap_pro >= 0.8:
                        if_start = len(rear_overlap) - 1 - index
                        break
            # If one was found, this TR must be the starting segment.
            if if_start > -1:
                cross_sub_reads_TRs_list.append(
                    [rear_overlap[if_start][0], rear_overlap[if_start][1] + global_index,
                     front_overlap[j][2] + global_index_p])
                del rear_overlap[if_start]
            # If none was found, the TR is either a starting or a middle segment,
            # depending on whether the cross-sub_read list already holds an
            # overlapping preceding TR.
            elif if_start == -1:
                # If this TR covers the whole sub_read, overlaps the preceding TR
                # and has a similar motif, it is a middle segment.
                if (front_overlap[j][2] - front_overlap[j][
                    1] + 1) / read_length >= 0.96 and cross_sub_reads_TRs_list and front_overlap[j][
                    1] + global_index_p < cross_sub_reads_TRs_list[-1][-1]:
                    if_chage_motif = motif_sim(front_overlap[j][0], cross_sub_reads_TRs_list[-1][0])
                    if if_chage_motif >= 0:
                        cross_sub_reads_TRs_list[-1][-1] = front_overlap[j][2] + global_index_p
                # Otherwise it is a starting segment (it need not cover the whole sub_read).
                else:
                    cross_sub_reads_TRs_list.append([front_overlap[j][0], front_overlap[j][1] + global_index_p,
                                                     front_overlap[j][2] + global_index_p])
            del front_overlap[j]
            j = 0
            continue
        # Reached the end of a TR that spans sub_reads.
        if cross_sub_reads_TRs_list:
            if_chage_motif = motif_sim(front_overlap[j][0], cross_sub_reads_TRs_list[-1][0])
            if front_overlap[j][1] + global_index_p < cross_sub_reads_TRs_list[-1][2] and front_overlap[j][
                2] + global_index_p >= cross_sub_reads_TRs_list[-1][2] and if_chage_motif >= 0:
                cross_sub_reads_TRs_list[-1][2] = front_overlap[j][2] + global_index_p
                del front_overlap[j]
                j = 0
                continue
            else:
                if_chage_motif = -1
        if rear_overlap == []:
            break
        # Length of the span shared by the first rear TR and the current front
        # TR, with the front TR shifted into the rear sub-read's coordinates.
        tr_overlap_length = min(rear_overlap[0][2], front_overlap[j][2] + read_length - overlap_length) - max(
            rear_overlap[0][1], front_overlap[j][1] + read_length - overlap_length) + 1
        rear_overlap_pro = tr_overlap_length / (rear_overlap[0][2] - rear_overlap[0][1] + 1)
        front_overlap_pro = tr_overlap_length / (front_overlap[j][2] - front_overlap[j][1] + 1)
        # Copies of both TRs expressed in global coordinates.
        rear_tr = (rear_overlap[0][0], rear_overlap[0][1] + global_index, rear_overlap[0][2] + global_index,
                   rear_overlap[0][3], rear_overlap[0][4], rear_overlap[0][5], rear_overlap[0][6], rear_overlap[0][7])
        front_tr = (front_overlap[j][0], front_overlap[j][1] + global_index_p, front_overlap[j][2] + global_index_p,
                    front_overlap[j][3], front_overlap[j][4], front_overlap[j][5], front_overlap[j][6],
                    front_overlap[j][7])
        overlap_type = judge_overlap_type(rear_overlap[0][0], front_overlap[j][0], rear_overlap_pro, front_overlap_pro,
                                          read_length - rear_overlap[0][1], front_overlap[j][2] + 1, overlap_length)
        # No overlap (also taken when the current front TR is the last one).
        if front_overlap[j][1] + read_length - overlap_length >= rear_overlap[0][2] or j + 1 == len(front_overlap):
            consensus_trs.append(rear_tr)
            del rear_overlap[0]
            j = 0
            continue
        # Overlapping but not matching: try the next front TR.
        if overlap_type == -1:
            j += 1
            continue
        # Fully overlapping match: keep whichever TR starts first.
        if overlap_type == 0:
            if rear_tr[1] <= front_tr[1]:
                consensus_trs.append(rear_tr)
            else:
                consensus_trs.append(front_tr)
            del rear_overlap[0]
            del front_overlap[j]
            j = 0
            continue
        # Left-overlap match: keep the rear TR.
        if overlap_type == 1:
            consensus_trs.append(rear_tr)
            del rear_overlap[0]
            del front_overlap[j]
            j = 0
            continue
        # Right-overlap match: keep the front TR.
        if overlap_type == 2:
            consensus_trs.append(front_tr)
            del rear_overlap[0]
            del front_overlap[j]
            j = 0
            continue
        # Embedded-overlap match: rescan the combined span and re-align it.
        if overlap_type == 3:
            to_scan_sequence = utils.get_realscan_sequence(
                all_seq[rear_overlap[0][1]:front_overlap[j][2] + 1 + read_length - overlap_length])
            motifs = utils.tri_gram_model(to_scan_sequence, len(rear_overlap[0][0]))
            # Without candidate motifs both TRs are kept unchanged.
            if motifs == []:
                consensus_trs.append(rear_tr)
                consensus_trs.append(front_tr)
                del rear_overlap[0]
                del front_overlap[j]
                j = 0
                continue
            maybe_consensus = []
            for mot in motifs:
                maybe_consensus.extend(trs_align.trs_align_algorithm(
                    [(mot, rear_overlap[0][1], front_overlap[j][2] + 1 + read_length - overlap_length)], all_seq,
                    p_indel, p_match, mat, mis, gap, ext, align_score, beta))
            # Without any alignment result both TRs are kept unchanged as well.
            if maybe_consensus == []:
                consensus_trs.append(rear_tr)
                consensus_trs.append(front_tr)
                del rear_overlap[0]
                del front_overlap[j]
                j = 0
                continue
            for m_c in maybe_consensus:
                consensus_trs.append(
                    (m_c[0], m_c[1] + global_index, m_c[2] + global_index, m_c[3], m_c[4], m_c[5], m_c[6], m_c[7]))
            del rear_overlap[0]
            del front_overlap[j]
            j = 0
            continue

    # TRs are left in the previous rear overlap area: merge them all in.
    if rear_overlap:
        for reminder_tr in rear_overlap:
            rear_tr = (reminder_tr[0], reminder_tr[1] + global_index, reminder_tr[2] + global_index, reminder_tr[3],
                       reminder_tr[4], reminder_tr[5], reminder_tr[6], reminder_tr[7])
            consensus_trs.append(rear_tr)
    # Otherwise emit whatever is left of the front TRs.
    elif front_overlap:
        for reminder_tr in front_overlap:
            front_tr = (
                reminder_tr[0], reminder_tr[1] + global_index_p, reminder_tr[2] + global_index_p, reminder_tr[3],
                reminder_tr[4], reminder_tr[5], reminder_tr[6], reminder_tr[7])
            consensus_trs.append(front_tr)

    return sorted(consensus_trs, key=lambda x: x[1]), if_chage_motif
185
+
186
+
187
def make_two_subreads_consensus(up_read, mid_read, all_seq, subread_index, read_length, overlap_length, start_index,
                                cross_sub_reads_TRs_list, p_indel, p_match, mat, mis, gap, ext, align_score, beta):
    """Combine the TRs of one sub-read with those at the front of the next.

    TR records are indexed 0..7 with the start at [1] and the end at [2], both
    local to their sub-read.  Records that are emitted are shifted by
    ``(read_length - overlap_length) * subread_index + start_index``.

    Returns:
        A tuple ``(trs, remaining_mid_read, if_chage_motif)``: the emitted TRs
        sorted by start, the part of ``mid_read`` that still has to be
        processed, and the flag returned by ``construct_consensus`` (-1 when
        that function was not called).
    """
    stride = read_length - overlap_length
    global_index = stride * subread_index + start_index

    def to_global(tr):
        # Copy of the record with start and end moved to global coordinates.
        return (tr[0], tr[1] + global_index, tr[2] + global_index, tr[3], tr[4], tr[5], tr[6], tr[7])

    def by_start(trs):
        return sorted(trs, key=lambda tr: tr[1])

    sequence_fits_one_read = len(all_seq) <= read_length
    has_up_trs = len(up_read) > 0

    # A sequence no longer than one read has nothing to merge with.
    if sequence_fits_one_read:
        if not has_up_trs:
            return [], [], -1
        return by_start([to_global(tr) for tr in up_read]), [], -1

    settled = []
    pending = []
    if has_up_trs:
        for tr in up_read:
            # TRs ending fewer than 24 positions past the stride are emitted
            # directly; the others are handed to the merge step.
            if tr[2] - stride < 24:
                settled.append(to_global(tr))
            else:
                pending.append(tr)

    # Index of the first TR of the next sub-read whose start lies within 24
    # positions of the end of the overlap (or beyond it).
    split_at = len(mid_read)
    for position, tr in enumerate(mid_read):
        if overlap_length - tr[1] <= 24:
            split_at = position
            break

    if split_at == 0:
        # No front TR takes part in the merge: emit the pending ones as they are.
        settled.extend(to_global(tr) for tr in pending)
        return by_start(settled), mid_read, -1

    merged, if_chage_motif = construct_consensus(pending, mid_read[:split_at], read_length,
                                                 overlap_length, all_seq, global_index,
                                                 cross_sub_reads_TRs_list, p_indel, p_match, mat, mis, gap,
                                                 ext, align_score, beta)
    settled.extend(merged)
    return by_start(settled), mid_read[split_at:], if_chage_motif
222
+
223
+
224
def calculate_cross_subread_tr(cross_tr_tuple, cross_tr_seq, p_indel, p_match, mat, mis, gap, ext, align_score, beta):
    """Re-align a TR that spans sub-reads against its own sequence.

    ``cross_tr_tuple`` is ``(motif, start, end)``.  The alignment is run with
    the interval moved to begin at 0, and every returned record (indexed 0..7)
    is moved back by adding ``cross_tr_tuple[1]`` to its start and end.

    Returns:
        The list of shifted records, or ``[]`` when the aligner returns none.
    """
    offset = cross_tr_tuple[1]
    local_interval = (cross_tr_tuple[0], cross_tr_tuple[1] - cross_tr_tuple[1], cross_tr_tuple[2] - cross_tr_tuple[1])
    aligned = trs_align.trs_align_algorithm([local_interval], cross_tr_seq, p_indel, p_match, mat, mis, gap, ext,
                                            align_score, beta)
    if aligned == []:
        return []

    return [(hit[0], hit[1] + offset, hit[2] + offset, hit[3], hit[4], hit[5], hit[6], hit[7])
            for hit in aligned]
236
+
237
+
238
def _tr_profit(tr, length=1):
    """Return 2 * length * (1 + tr[4]) * (1 - tr[3]) / (2 - tr[3])."""
    return 2 * length * (1 + tr[4]) * (1 - tr[3]) / (2 - tr[3])


def _repeat_region_mark(region, scores, region_end):
    """Describe a repeat region by its (at most three) best-scoring TRs."""
    ranked = heapq.nlargest(3, zip(scores, region))
    if len(ranked) == 1:
        return (1, region[0][1], region_end, region[0][0], scores[0])
    mark = [len(ranked), region[0][1], region_end]
    for score, tr in ranked:
        mark.extend((tr[0], score))
    return tuple(mark)


def handling_compatibility(to_final_trs_list, p_match, p_indel):
    """Drop incompatible overlapping TRs and group the rest into repeat regions.

    Each TR is a sequence with the motif at [0], the start at [1] and the end
    at [2].  Fields [3] and [4] enter the profit formula in the positions that
    ``p_indel`` and ``p_match`` take in the threshold, so they are presumably
    the per-TR indel and match rates (confirm against the producer).

    Two passes are interleaved while walking the list in order:

    * every TR is compared with the following TRs that start before it ends;
      of two TRs whose motifs are rotations of each other one is discarded,
      and when one TR dominates the combined span (share >= 0.8) while the
      other is small (share <= 1 / threshold profit) each TR whose profit
      does not pay for the combined span is discarded;
    * each retained TR either extends the current repeat region, if every
      member still has a non-negative gain over the enlarged span, or closes
      it and opens a new one.

    Args:
        to_final_trs_list: TRs in the order they should be processed.
        p_match: Match rate used for the threshold profit.
        p_indel: Indel rate used for the threshold profit.

    Returns:
        A tuple ``(final_trs_list, repeat_region_mark)``.  Each mark is
        ``(k, region_start, region_end, motif_1, score_1, ..., motif_k,
        score_k)`` with ``k <= 3``.  For fewer than two TRs the input list
        itself is returned as the first element.

    NOTE(review): nesting was reconstructed from a flattened listing; in
    particular the ``break`` after ``gain1 < 0`` is assumed to belong to that
    condition.
    """
    total = len(to_final_trs_list)
    if total < 1:
        return to_final_trs_list, []
    if total < 2:
        only_tr = to_final_trs_list[0]
        return to_final_trs_list, [(1, only_tr[1], only_tr[2], only_tr[0], _tr_profit(only_tr) - 1)]

    P_th_profit = 2 * (1 + p_match) * (1 - p_indel) / (2 - p_indel)
    region_minprop = 1 / P_th_profit
    final_trs_list = []
    can_retain = [1] * total
    cur_repeat_region = []
    cur_repeat_region_score = []
    repeat_region_mark = []
    max_repeat_region_end = 0

    for index, t_f in enumerate(to_final_trs_list):
        if can_retain[index] == 0:
            continue
        is_last = index + 1 == total
        tf_length = t_f[2] - t_f[1] + 1

        # Compare with the following TRs that start before this one ends
        # (an empty range for the last TR).
        for pos in range(index + 1, total):
            other = to_final_trs_list[pos]
            if other[1] >= t_f[2]:
                break
            other_length = other[2] - other[1] + 1
            region_length = max(t_f[2], other[2]) - min(t_f[1], other[1]) + 1
            tr_length_pro1 = tf_length / region_length
            tr_length_pro2 = other_length / region_length
            if len(other[0]) == len(t_f[0]) and other[0] in t_f[0] + t_f[0]:
                # NOTE(review): this discards the TR with the larger share of
                # the combined span; confirm that keeping the shorter
                # duplicate is intended.
                if tr_length_pro1 < tr_length_pro2:
                    can_retain[pos] = 0
                else:
                    can_retain[index] = 0
                continue
            if (tr_length_pro1 >= 0.8 and tr_length_pro2 <= region_minprop
                    or tr_length_pro1 <= region_minprop and tr_length_pro2 >= 0.8):
                gain1 = _tr_profit(t_f, tf_length) - region_length
                gain2 = _tr_profit(other, other_length) - region_length
                if gain2 < 0:
                    can_retain[pos] = 0
                if gain1 < 0:
                    can_retain[index] = 0
                    break

        if can_retain[index] == 0:
            continue
        final_trs_list.append(t_f)

        if not cur_repeat_region:
            cur_repeat_region = [t_f]
            cur_repeat_region_score = [_tr_profit(t_f) - 1]
            max_repeat_region_end = t_f[2]
            continue

        one_region_length = max(max_repeat_region_end, t_f[2]) - cur_repeat_region[0][1]
        # For the final TR only the existing members are re-checked; for any
        # other TR the newcomer has to pay for the enlarged span as well.
        members = cur_repeat_region if is_last else cur_repeat_region + [t_f]
        one_region_score = []
        for member in members:
            cache_score = _tr_profit(member, member[2] - member[1] + 1) - one_region_length
            if cache_score >= 0:
                one_region_score.append(cache_score / one_region_length)
                continue
            # A member no longer pays for the enlarged span: close the
            # current region and start a new one with this TR.
            repeat_region_mark.append(
                _repeat_region_mark(cur_repeat_region, cur_repeat_region_score, max_repeat_region_end))
            cur_repeat_region = [t_f]
            cur_repeat_region_score = [_tr_profit(t_f) - 1]
            max_repeat_region_end = t_f[2]
            break
        else:
            # Only reached when no member failed.  Comparing list lengths
            # after the loop instead could succeed by coincidence once the
            # region had just been reset, adding this TR a second time.
            if is_last:
                cache_score = _tr_profit(t_f, tf_length) - one_region_length
                one_region_score.append(cache_score / one_region_length)
            cur_repeat_region.append(t_f)
            cur_repeat_region_score = one_region_score
            max_repeat_region_end = max(max_repeat_region_end, t_f[2])

    # Emit the region that is still open.
    if cur_repeat_region:
        repeat_region_mark.append(
            _repeat_region_mark(cur_repeat_region, cur_repeat_region_score, max_repeat_region_end))

    return final_trs_list, repeat_region_mark