FastSTR 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,326 @@
1
+ import math
2
+ import regex
3
+ from sklearn.cluster import DBSCAN
4
+ import numpy as np
5
+ from Bio.Align import PairwiseAligner
6
+
7
+ from . import utils
8
+
9
def cluster_anchors(anchors, anchor_mark_list, indel, cover):
    """Cluster anchor hits with DBSCAN and extract candidate repeat regions.

    Parameters
    ----------
    anchors : list of (position, anchor_distance) pairs. The anchor distance
        is assumed to lie in 1..8 — it indexes the threshold tables below
        (TODO confirm with callers).
    anchor_mark_list : per-base 0/1 marks; a sliding 25-wide window over it
        gives the local anchor density.
    indel, cover : quality parameters; ``min(0.81, (1 - indel) * cover)`` is
        the density a cluster must reach to be treated as "definite".

    Returns
    -------
    list of tuples ``(cluster_first, cluster_last, region_start, region_end,
    flag)`` where ``flag`` is 1 for a definite repeat region and 0 for a
    fuzzy one.
    """
    if not anchors:
        return []
    fuzzy_repeating_region = []
    # Per-anchor-distance thresholds; index = anchor distance - 1.
    definition_rough_judgment = [19, 19, 19, 20, 18, 18, 17, 17]
    start_rough_judgment = [17, 18, 18, 18, 16, 16, 15, 15]
    end_rough_judgment = [17, 18, 18, 17, 15, 15, 14, 14]
    start_expand = [5, 5, 5, 5, 5, 5, 10, 10]
    end_expand = [5, 5, 5, 5, 5, 5, 10, 10]
    fuzzy_tr_structure_feature = [12, 12, 12, 12, 12, 12, 14, 12]
    fuzzy_tr_structure_threshold = [0, 8, 0, 0, 7.5, 0, 12.5, 0]
    fuzzy_tr_anchor_density = [0, 0.68, 0, 0, 0.64, 0, 0.68, 0]
    fuzzy_tr_anchor_boundary_density = [0, 14, 0, 0, 12, 0, 13, 0]
    anchor_dis = anchors[0][1] - 1
    dbscan_model = DBSCAN(eps=14, min_samples=14, n_jobs=1).fit(anchors)
    # Cluster labels; -1 marks DBSCAN noise points.
    labels = dbscan_model.labels_
    # Keep clusters that span >= 25 bases with coverage >= (1 - indel) * cover.
    unique_labels = np.unique(labels)
    anchors = np.array(anchors)
    for label in unique_labels:
        if label == -1:  # ignore noise points
            continue
        cluster_points = anchors[np.where(labels == label)[0]]
        cluster_first = min(item[0] for item in cluster_points)
        cluster_last = max(item[0] for item in cluster_points)
        region_len = cluster_last - cluster_first + 1
        anchor_density = len(cluster_points) / region_len
        # Hoisted: the original re-sliced anchor_mark_list on every loop
        # iteration (an O(n) copy each time), making the window scans
        # quadratic. One slice per cluster is equivalent and linear.
        window = anchor_mark_list[cluster_first:cluster_last + 1]
        if region_len >= 25 and anchor_density >= min(0.81, (1 - indel) * cover):
            # Dense cluster: locate start/end with a sliding 25-wide window.
            start_index = -1
            end_index = -1
            found = 0
            current_density = sum(window[:25])
            for i in range(1, len(window) - 25 + 1):
                # Slide the window one position to the right.
                current_density = current_density - window[i - 1] + window[i + 25 - 1]
                if start_index == -1 and current_density >= start_rough_judgment[anchor_dis]:
                    start_index = max(i - start_expand[anchor_dis], 0)
                if current_density >= definition_rough_judgment[anchor_dis]:
                    found = 1
                    break
            if found == 0:
                continue
            # Scan backwards from the tail to find the region end.
            current_density = sum(window[-25:])
            if current_density >= end_rough_judgment[anchor_dis]:
                end_index = len(window) - 1
            else:
                for i in range(len(window) - 2, start_index + 23, -1):
                    current_density = current_density - window[i + 1] + window[i - 25 + 1]
                    if current_density >= end_rough_judgment[anchor_dis]:
                        end_index = min(i + end_expand[anchor_dis], len(window) - 1)
                        break
            fuzzy_repeating_region.append(
                (cluster_first, cluster_last, cluster_first + start_index, cluster_first + end_index, 1))
        elif region_len >= 25:
            # Sparse cluster: one forward pass that also samples the density
            # every 5 positions to test structural uniformity.
            current_density = sum(window[:25])
            start_index = -1
            end_index = -1
            fuzzy_tr_start_index = -1
            fuzzy_tr_end_index = -1
            found = 0
            if current_density >= start_rough_judgment[anchor_dis]:
                start_index = 0
            if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                fuzzy_tr_start_index = 0
            if current_density >= definition_rough_judgment[anchor_dis]:
                found = 1
            if current_density >= end_rough_judgment[anchor_dis]:
                end_index = min(29, len(window) - 1)
            if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                fuzzy_tr_end_index = len(window) - 1
            Q_support_domain_mod5 = [current_density]
            for i in range(1, len(window) - 25 + 1):
                current_density = current_density - window[i - 1] + window[i + 25 - 1]
                if start_index == -1 and current_density >= start_rough_judgment[anchor_dis]:
                    start_index = max(i - start_expand[anchor_dis], 0)
                if fuzzy_tr_start_index == -1 and current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                    fuzzy_tr_start_index = i
                if found == 0 and current_density >= definition_rough_judgment[anchor_dis]:
                    found = 1
                if current_density >= end_rough_judgment[anchor_dis]:
                    end_index = min(i + 24 + end_expand[anchor_dis], len(window) - 1)
                if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                    fuzzy_tr_end_index = i + 24
                if i % 5 == 0:
                    Q_support_domain_mod5.append(current_density)
            # Variance of sampled densities around the expected structural
            # feature decides definite (flag 1) vs fuzzy (flag 0).
            Q_variance = np.mean((np.array(Q_support_domain_mod5) - fuzzy_tr_structure_feature[anchor_dis]) ** 2)
            if Q_variance >= fuzzy_tr_structure_threshold[anchor_dis]:
                if found == 1:
                    fuzzy_repeating_region.append(
                        (cluster_first, cluster_last, cluster_first + start_index, cluster_first + end_index, 1))
            else:
                # Filtering of interference intervals: keep only clusters
                # whose overall anchor density is high enough.
                if anchor_density >= fuzzy_tr_anchor_density[anchor_dis]:
                    fuzzy_repeating_region.append(
                        (cluster_first, cluster_last, cluster_first + fuzzy_tr_start_index,
                         cluster_first + fuzzy_tr_end_index, 0))

    return fuzzy_repeating_region
144
+
145
+
146
def _max_exact_window(visit):
    """Return the maximum sum over any 20-wide sliding window of *visit*."""
    window_ones = sum(visit[:20])
    best = window_ones
    for i in range(1, len(visit) - 19):
        # Slide the 20-wide window one position to the right.
        window_ones += visit[i + 19] - visit[i - 1]
        if window_ones > best:
            best = window_ones
    return best


def _max_fuzzy_window(to_scan_sequence, mot, visit, subs):
    """Max floored 20-wide window score after crediting fuzzy motif matches.

    Each occurrence of *mot* with at most *subs* substitutions raises the
    still-unmarked fraction of the positions it covers by
    ``(len(mot) - subs) / len(mot)``.  The window sum is floored before the
    max is taken, matching the original per-branch code.
    """
    pattern = f"({mot}){{s<={subs}}}"
    matches = regex.finditer(pattern, str(to_scan_sequence))
    positions = [match.start() for match in matches]
    for m_p in positions:
        for i in range(len(mot)):
            # Same operator order as the original so the floats agree exactly.
            visit[m_p + i] = visit[m_p + i] + (1 - visit[m_p + i]) * (len(mot) - subs) / len(mot)
    windows = sum(visit[:20])
    best = math.floor(windows)
    for i in range(1, len(visit) - 19):
        windows += visit[i + 19] - visit[i - 1]
        score = math.floor(windows)
        if score > best:
            best = score
    return best


def filter_chaotic_repeats(fuzzy_repeats, sub_read, n):
    """Filter fuzzy repeat regions down to candidate tandem repeats.

    Parameters
    ----------
    fuzzy_repeats : tuples ``(first, last, start, end, flag)`` as produced by
        ``cluster_anchors`` (flag 1 = definite region, 0 = fuzzy region).
    sub_read : the read sequence the regions index into.
    n : motif/anchor length; selects the motif-detection strategy.

    Returns
    -------
    list of ``(motif, start, end)`` candidate tandem repeats on *sub_read*.
    """
    candidate_trs = []
    # Per-motif-length thresholds; index = len(motif) - 1.
    max_windows = [18, 18, 17.5, 17, 18, 18, 17, 17]
    start_probe = [24, 23, 23, 20, 22, 23, 23, 22]
    end_probe = [24, 23, 22, 20, 22, 23, 22, 21]
    consistent_density = [0, 1.8, 0, 0, 5, 0, 4.8, 0]
    perfect_count = [0, 25, 0, 0, 15, 0, 10, 0]
    to_check_large_probe_trs = []
    for f_r in fuzzy_repeats:
        to_scan_sequence = sub_read[f_r[2]:f_r[3] + 1]
        if n < 5:
            motifs = utils.tri_gram_model(to_scan_sequence, n)
        else:
            motifs = utils.high_motif_detection(to_scan_sequence, n)
        if motifs is None:
            continue
        if f_r[4] == 1:
            # Definite region: small-probe coarse filter — drop sequences
            # that clearly lack tandem-repeat structure.
            for mot in motifs:
                motif_mark_indexes = utils.get_motif_marks(to_scan_sequence, mot)
                visit = [0] * len(to_scan_sequence)
                for m_p in motif_mark_indexes:
                    visit[m_p:m_p + n] = [1] * n
                if len(mot) in [1, 4, 6]:
                    max_ones_count = _max_exact_window(visit)
                elif len(mot) in [2, 3]:
                    max_ones_count = _max_fuzzy_window(to_scan_sequence, mot, visit, 1)
                elif len(mot) == 5:
                    max_ones_count = _max_fuzzy_window(to_scan_sequence, mot, visit, 2)
                else:
                    max_ones_count = _max_fuzzy_window(to_scan_sequence, mot, visit, 3)
                # Quantize down to the nearest 0.5 before comparing.
                if max_ones_count // 0.5 * 0.5 < max_windows[len(mot) - 1]:
                    continue
                to_check_large_probe_trs.append((f_r[0], f_r[1], mot))
        else:
            for mot in motifs:
                # Map motif marks onto the sequence.  NOTE(review): only
                # n in {2, 5, 7} is handled here; other values would reuse a
                # stale motif_mark_indexes or raise NameError — confirm that
                # callers never pass other n on the fuzzy path.
                if n == 2:
                    pattern = r"|".join(['.' + mot[0] + mot[1], mot[0] + mot[1] + '.'])
                    matches = regex.finditer(pattern, to_scan_sequence)
                    motif_mark_indexes = [match.start() for match in matches]
                elif n == 5:
                    four_mers = set()
                    for i in range(len(mot)):
                        four_mers.add((mot + mot)[i:i + 4])
                    _, motif_mark_indexes = utils.find_most_frequent_substring_given(to_scan_sequence, four_mers, 4)
                elif n == 7:
                    matches = regex.finditer(f"({mot}){{s<=2}}", to_scan_sequence)
                    motif_mark_indexes = [match.start() for match in matches]

                # Gaps between consecutive marks (0 when marks overlap/abut).
                motif_mark_gap = []
                for ind, m in enumerate(motif_mark_indexes[1:]):
                    motif_mark_gap.append(max(0, m - motif_mark_indexes[ind] - n))

                # Walk the gap list for runs whose mean gap stays under the
                # consistency threshold; each run becomes a TR interval.
                fluctuations_sum = 0
                fuzzy_tr_start = -1
                should_start = 0
                for g_index, g in enumerate(motif_mark_gap[:-perfect_count[len(mot) - 1]]):
                    if g_index < should_start:
                        continue
                    if g >= 1 and fuzzy_tr_start == -1:
                        continue
                    if g < 1 and fuzzy_tr_start == -1:
                        fluctuations_sum = sum(motif_mark_gap[g_index:g_index + perfect_count[len(mot) - 1]])
                        if fluctuations_sum / perfect_count[len(mot) - 1] <= consistent_density[len(mot) - 1]:
                            fuzzy_tr_start = g_index
                    elif fuzzy_tr_start != -1:
                        # Slide the gap window one step.
                        fluctuations_sum = fluctuations_sum - max(motif_mark_gap[g_index - 1], 0) + max(
                            motif_mark_gap[g_index + perfect_count[len(mot) - 1] - 1], 0)
                        if fluctuations_sum / perfect_count[len(mot) - 1] <= consistent_density[len(mot) - 1]:
                            continue
                        # Run broke: emit the interval.  NOTE(review): the
                        # offset added here is f_r[3] (region end), not
                        # f_r[2] (region start) — looks inconsistent with
                        # the slice above; confirm intended.
                        to_check_large_probe_trs.append((motif_mark_indexes[fuzzy_tr_start] + f_r[3],
                                                         motif_mark_indexes[
                                                             g_index + perfect_count[len(mot) - 1] - 2] + f_r[3],
                                                         mot))
                        fuzzy_tr_start = -1
                        fluctuations_sum = 0
                        should_start = g_index + perfect_count[len(mot) - 1] - 1
                if fuzzy_tr_start != -1:
                    to_check_large_probe_trs.append(
                        (motif_mark_indexes[fuzzy_tr_start] + f_r[3], motif_mark_indexes[-1] + f_r[3], mot))

    # Large-probe fine filter: align a 25-base probe built from the motif to
    # locate approximate start/end boundaries.
    # Hoisted: the aligner and its scoring parameters are loop-invariant;
    # the original rebuilt the PairwiseAligner for every candidate.
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -5
    for t_o_t in to_check_large_probe_trs:
        probe = 25 // len(t_o_t[2]) * t_o_t[2] + t_o_t[2][:25 % len(t_o_t[2])]
        to_scan_sequence = sub_read[t_o_t[0]:t_o_t[1] + 1]
        start_index = -1
        end_index = -1
        # Scan forward in 5-base steps for the first window the probe hits.
        for i in range(len(to_scan_sequence) // 5 - 3):
            probe_align = aligner.align(probe, to_scan_sequence[i * 5:i * 5 + 20])
            if not probe_align:
                continue
            if probe_align[0].score >= start_probe[len(t_o_t[2]) - 1]:
                start_index = max(i * 5 - 5, 0)
                break
        if start_index == -1:
            continue
        # Scan backward from the tail for the last window the probe hits.
        for j in range((len(to_scan_sequence) - start_index - 1) // 5 - 3):
            probe_align = aligner.align(probe, to_scan_sequence[
                len(to_scan_sequence) - j * 5 - 20:len(to_scan_sequence) - j * 5])
            if not probe_align:
                continue
            if probe_align[0].score >= end_probe[len(t_o_t[2]) - 1]:
                end_index = min(len(to_scan_sequence) - j * 5 + 4, len(to_scan_sequence) - 1)
                break
        if end_index == -1:
            continue
        if end_index - start_index < 24:
            continue
        # Record the approximate TR boundary.
        candidate_trs.append((str(t_o_t[2]), t_o_t[0] + start_index, t_o_t[0] + end_index))

    return candidate_trs