FastSTR 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faststr/Segmented_Global_Alignment.py +541 -0
- faststr/__init__.py +2 -0
- faststr/get_subread_trs.py +28 -0
- faststr/main.py +338 -0
- faststr/make_consensus_TRs.py +367 -0
- faststr/scan_subread.py +326 -0
- faststr/trs_align.py +704 -0
- faststr/utils.py +428 -0
- faststr-1.0.0.dist-info/METADATA +220 -0
- faststr-1.0.0.dist-info/RECORD +13 -0
- faststr-1.0.0.dist-info/WHEEL +5 -0
- faststr-1.0.0.dist-info/entry_points.txt +2 -0
- faststr-1.0.0.dist-info/top_level.txt +1 -0
faststr/scan_subread.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import regex
|
|
3
|
+
from sklearn.cluster import DBSCAN
|
|
4
|
+
import numpy as np
|
|
5
|
+
from Bio.Align import PairwiseAligner
|
|
6
|
+
|
|
7
|
+
from . import utils
|
|
8
|
+
|
|
9
|
+
def cluster_anchors(anchors, anchor_mark_list, indel, cover):
    """Cluster anchor hits with DBSCAN and extract fuzzy repeating regions.

    Parameters
    ----------
    anchors : list of [position, distance] pairs; ``anchors[0][1]`` selects
        the threshold tables below (presumably the anchor spacing — TODO
        confirm against caller).
    anchor_mark_list : per-base 0/1 anchor marks for the scanned read.
    indel, cover : numeric parameters combined into the minimum required
        anchor density ``min(0.81, (1 - indel) * cover)``.

    Returns
    -------
    list of tuples ``(cluster_first, cluster_last, region_start, region_end,
    flag)`` where ``flag == 1`` marks a well-supported region and
    ``flag == 0`` a fuzzy-TR fallback region.
    """
    if not anchors:
        return []
    fuzzy_repeating_region = []
    # Threshold tables indexed by anchor distance (0-7).  Entries of 0 belong
    # to distances for which the fuzzy-TR fallback branch is never accepted.
    difinition_rough_judgment = [19, 19, 19, 20, 18, 18, 17, 17]
    start_rought_judgment = [17, 18, 18, 18, 16, 16, 15, 15]
    end_rought_judgment = [17, 18, 18, 17, 15, 15, 14, 14]
    start_expand = [5, 5, 5, 5, 5, 5, 10, 10]
    end_expand = [5, 5, 5, 5, 5, 5, 10, 10]
    fuzzy_tr_structure_feature = [12, 12, 12, 12, 12, 12, 14, 12]
    fuzzy_tr_structure_threshold = [0, 8, 0, 0, 7.5, 0, 12.5, 0]
    fuzzy_tr_anchor_density = [0, 0.68, 0, 0, 0.64, 0, 0.68, 0]
    fuzzy_tr_anchor_boundary_density = [0, 14, 0, 0, 12, 0, 13, 0]
    anchor_dis = anchors[0][1] - 1
    dbscan_model = DBSCAN(eps=14, min_samples=14, n_jobs=1).fit(anchors)
    # Cluster labels assigned by DBSCAN (-1 marks noise points).
    labels = dbscan_model.labels_
    # Keep clusters spanning >= 25 bases whose coverage reaches
    # (1 - indel) * cover (capped at 0.81).
    unique_labels = np.unique(labels)
    anchors = np.array(anchors)
    for label in unique_labels:
        if label == -1:  # ignore noise points
            continue
        cluster_indices = np.where(labels == label)[0]
        cluster_points = anchors[cluster_indices]
        cluster_first = min(item[0] for item in cluster_points)
        cluster_last = max(item[0] for item in cluster_points)
        l = cluster_last - cluster_first + 1
        anchor_density = len(cluster_points) / l
        # Hoisted once per cluster: the original re-sliced
        # anchor_mark_list[cluster_first:cluster_last + 1] on every window
        # step, making each sliding-window scan quadratic.
        region = anchor_mark_list[cluster_first:cluster_last + 1]
        if l >= 25 and anchor_density >= min(0.81, (1 - indel) * cover):
            start_index = -1
            end_index = -1
            logo = 0
            # Density of the first 25-wide window.
            current_density = sum(region[:25])
            for i in range(1, len(region) - 25 + 1):
                # Slide the 25-wide window one step to the right.
                current_density = current_density - region[i - 1] + region[i + 25 - 1]
                if start_index == -1:
                    if current_density >= start_rought_judgment[anchor_dis]:
                        start_index = max(i - start_expand[anchor_dis], 0)
                if current_density >= difinition_rough_judgment[anchor_dis]:
                    logo = 1
                    break
            if logo == 0:
                continue
            # Locate the right boundary, scanning backwards from the tail.
            current_density = sum(region[-25:])
            if current_density >= end_rought_judgment[anchor_dis]:
                end_index = len(region) - 1
            else:
                for i in range(len(region) - 2, start_index + 23, -1):
                    current_density = current_density - region[i + 1] + region[i - 25 + 1]
                    if current_density >= end_rought_judgment[anchor_dis]:
                        end_index = min(i + end_expand[anchor_dis], len(region) - 1)
                        break
            fuzzy_repeating_region.append(
                (cluster_first, cluster_last, cluster_first + start_index, cluster_first + end_index, 1))
        elif l >= 25:
            # Lower-coverage cluster: additionally require an unstable window
            # profile (variance around the expected density) before accepting
            # it outright; otherwise fall back to the fuzzy-TR boundaries.
            current_density = sum(region[:25])
            start_index = -1
            end_index = -1
            fuzzy_tr_start_index = -1
            fuzzy_tr_end_index = -1
            logo = 0
            # Evaluate the very first window against every threshold.
            if current_density >= start_rought_judgment[anchor_dis]:
                start_index = 0
            if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                fuzzy_tr_start_index = 0
            if current_density >= difinition_rough_judgment[anchor_dis]:
                logo = 1
            if current_density >= end_rought_judgment[anchor_dis]:
                end_index = min(29, len(region) - 1)
            if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                fuzzy_tr_end_index = len(region) - 1
            Q_support_domain_mod5 = [current_density]
            for i in range(1, len(region) - 25 + 1):
                current_density = current_density - region[i - 1] + region[i + 25 - 1]
                if start_index == -1:
                    if current_density >= start_rought_judgment[anchor_dis]:
                        start_index = max(i - start_expand[anchor_dis], 0)
                if fuzzy_tr_start_index == -1:
                    if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                        fuzzy_tr_start_index = i
                if logo == 0 and current_density >= difinition_rough_judgment[anchor_dis]:
                    logo = 1
                if current_density >= end_rought_judgment[anchor_dis]:
                    end_index = min(i + 24 + end_expand[anchor_dis], len(region) - 1)
                if current_density >= fuzzy_tr_anchor_boundary_density[anchor_dis]:
                    fuzzy_tr_end_index = i + 24
                # Sample every 5th window for the structure-variance test.
                if i % 5 == 0:
                    Q_support_domain_mod5.append(current_density)
            # Mean squared deviation of sampled window densities from the
            # expected structural density for this anchor distance.
            Q_variance = np.mean((np.array(Q_support_domain_mod5) - fuzzy_tr_structure_feature[anchor_dis]) ** 2)
            if Q_variance >= fuzzy_tr_structure_threshold[anchor_dis]:
                if logo == 1:
                    fuzzy_repeating_region.append(
                        (cluster_first, cluster_last, cluster_first + start_index, cluster_first + end_index, 1))
            else:
                # Filtering of interfering intervals: accept a fuzzy-TR region
                # only when the cluster's anchor density clears its threshold.
                if anchor_density >= fuzzy_tr_anchor_density[anchor_dis]:
                    fuzzy_repeating_region.append(
                        (cluster_first, cluster_last, cluster_first + fuzzy_tr_start_index,
                         cluster_first + fuzzy_tr_end_index, 0))

    return fuzzy_repeating_region
|
146
|
+
def _apply_fuzzy_marks(visit, sequence, mot, max_subs):
    """Add fractional coverage to ``visit`` for fuzzy matches of ``mot``.

    Each position covered by a match with up to ``max_subs`` substitutions
    moves toward 1 by a weight of ``(len(mot) - max_subs) / len(mot)``.
    Mutates ``visit`` in place.
    """
    pattern = f"({mot}){{s<={max_subs}}}"
    weight = (len(mot) - max_subs) / len(mot)
    for match in regex.finditer(pattern, str(sequence)):
        m_p = match.start()
        for i in range(len(mot)):
            visit[m_p + i] = visit[m_p + i] + (1 - visit[m_p + i]) * weight


def _max_window_ones(visit):
    """Return the maximum floored coverage over any 20-wide window of ``visit``."""
    windows = sum(visit[:20])
    max_ones_count = math.floor(windows)
    for i in range(1, len(visit) - 19):
        # Slide the 20-wide window one position to the right.
        windows += visit[i + 19] - visit[i - 1]
        window_ones_count = math.floor(windows)
        if window_ones_count > max_ones_count:
            max_ones_count = window_ones_count
    return max_ones_count


def filter_chaotic_repeats(fuzzy_repeats, sub_read, n):
    """Filter fuzzy repeat regions down to candidate tandem repeats.

    Parameters
    ----------
    fuzzy_repeats : tuples ``(first, last, start, end, flag)`` as produced by
        ``cluster_anchors``; ``flag == 1`` regions take the small-probe
        coarse filter, others take the gap-consistency scan.
    sub_read : the sequence the regions index into.
    n : motif-detection size passed to the ``utils`` helpers.

    Returns
    -------
    list of ``(motif, approx_start, approx_end)`` candidate TRs.
    """
    candidate_trs = []
    # Per-motif-length thresholds (index = len(motif) - 1), empirically tuned.
    max_windows = [18, 18, 17.5, 17, 18, 18, 17, 17]
    start_probe = [24, 23, 23, 20, 22, 23, 23, 22]
    end_probe = [24, 23, 22, 20, 22, 23, 22, 21]
    consistent_density = [0, 1.8, 0, 0, 5, 0, 4.8, 0]
    perfect_count = [0, 25, 0, 0, 15, 0, 10, 0]
    to_check_large_probe_trs = []
    for f_r in fuzzy_repeats:
        to_scan_sequence = sub_read[f_r[2]:f_r[3] + 1]
        if n < 5:
            motifs = utils.tri_gram_model(to_scan_sequence, n)
        else:
            motifs = utils.high_motif_detection(to_scan_sequence, n)
        if motifs is None:
            continue
        if f_r[4] == 1:
            for mot in motifs:
                # Small-probe coarse filter: drop sequences whose best 20-wide
                # window never reaches the expected tandem-repeat coverage.
                motif_mark_indexes = utils.get_motif_marks(to_scan_sequence, mot)
                visit = [0] * len(to_scan_sequence)
                for m_p in motif_mark_indexes:
                    visit[m_p:m_p + n] = [1] * n
                if len(mot) in [1, 4, 6]:
                    # Exact marks only for these motif lengths.
                    max_ones_count = _max_window_ones(visit)
                else:
                    # Tolerate 1/2/3 substitutions depending on motif length.
                    if len(mot) in [2, 3]:
                        allowed_subs = 1
                    elif len(mot) == 5:
                        allowed_subs = 2
                    else:
                        allowed_subs = 3
                    _apply_fuzzy_marks(visit, to_scan_sequence, mot, allowed_subs)
                    max_ones_count = _max_window_ones(visit)

                # Round down to the nearest 0.5 before comparing.
                if max_ones_count // 0.5 * 0.5 < max_windows[len(mot) - 1]:
                    continue
                else:
                    to_check_large_probe_trs.append((f_r[0], f_r[1], mot))
        else:
            for mot in motifs:
                # Map motif marks onto the sequence; how marks are found
                # depends on n.
                # NOTE(review): n is expected to be 2, 5 or 7 on this path —
                # any other value would leave motif_mark_indexes unbound.
                # Confirm against the fuzzy-TR thresholds in cluster_anchors.
                if n == 2:
                    pattern = r"|".join(['.' + mot[0] + mot[1], mot[0] + mot[1] + '.'])
                    matches = regex.finditer(pattern, to_scan_sequence)
                    motif_mark_indexes = [match.start() for match in matches]
                elif n == 5:
                    four_mers = set()
                    for i in range(len(mot)):
                        four_mers.add((mot + mot)[i:i + 4])
                    _, motif_mark_indexes = utils.find_most_frequent_substring_given(to_scan_sequence, four_mers, 4)
                elif n == 7:
                    matches = regex.finditer(f"({mot}){{s<=2}}", to_scan_sequence)
                    motif_mark_indexes = [match.start() for match in matches]

                # Gaps between consecutive marks (0 when marks overlap/abut).
                motif_mark_gap = []
                for ind, m in enumerate(motif_mark_indexes[1:]):
                    motif_mark_gap.append(max(0, m - motif_mark_indexes[ind] - n))

                # Scan the gap series for runs of perfect_count consecutive
                # marks whose mean gap stays within consistent_density.
                fluctuations_sum = 0
                fuzzy_tr_start = -1
                should_start = 0
                for g_index, g in enumerate(motif_mark_gap[:-perfect_count[len(mot) - 1]]):
                    if g_index < should_start:
                        continue
                    if g >= 1 and fuzzy_tr_start == -1:
                        continue
                    if g < 1 and fuzzy_tr_start == -1:
                        fluctuations_sum = sum(motif_mark_gap[g_index:g_index + perfect_count[len(mot) - 1]])
                        if fluctuations_sum / perfect_count[len(mot) - 1] <= consistent_density[len(mot) - 1]:
                            fuzzy_tr_start = g_index
                    elif fuzzy_tr_start != -1:
                        # Slide the gap window forward by one mark.
                        fluctuations_sum = fluctuations_sum - max(motif_mark_gap[g_index - 1], 0) + max(
                            motif_mark_gap[g_index + perfect_count[len(mot) - 1] - 1], 0)
                        if fluctuations_sum / perfect_count[len(mot) - 1] <= consistent_density[len(mot) - 1]:
                            continue
                        else:
                            to_check_large_probe_trs.append((motif_mark_indexes[fuzzy_tr_start] + f_r[3],
                                                             motif_mark_indexes[
                                                                 g_index + perfect_count[len(mot) - 1] - 2] + f_r[3],
                                                             mot))
                            fuzzy_tr_start = -1
                            fluctuations_sum = 0
                            should_start = g_index + perfect_count[len(mot) - 1] - 1
                if fuzzy_tr_start != -1:
                    to_check_large_probe_trs.append(
                        (motif_mark_indexes[fuzzy_tr_start] + f_r[3], motif_mark_indexes[-1] + f_r[3], mot))

    # Large-probe fine filter: locally align a ~25 bp probe built from the
    # motif against 20 bp windows to pin down approximate TR boundaries.
    # Aligner configuration is invariant, so build it once (the original
    # re-created it for every candidate).
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -5
    for t_o_t in to_check_large_probe_trs:
        # Tile the motif up to 25 bases to form the probe.
        probe = 25 // len(t_o_t[2]) * t_o_t[2] + t_o_t[2][:25 % len(t_o_t[2])]
        to_scan_sequence = sub_read[t_o_t[0]:t_o_t[1] + 1]
        start_index = -1
        end_index = -1
        # Scan forward in 5-base steps for the first well-scoring window.
        for i in range(len(to_scan_sequence) // 5 - 3):
            probe_align = aligner.align(probe, to_scan_sequence[i * 5:i * 5 + 20])
            if not probe_align:
                continue
            if probe_align[0].score >= start_probe[len(t_o_t[2]) - 1]:
                start_index = max(i * 5 - 5, 0)
                break
        if start_index == -1:
            continue
        # Scan backward from the tail for the last well-scoring window.
        for j in range((len(to_scan_sequence) - start_index - 1) // 5 - 3):
            probe_align = aligner.align(probe, to_scan_sequence[
                                               len(to_scan_sequence) - j * 5 - 20:len(to_scan_sequence) - j * 5])
            if not probe_align:
                continue
            if probe_align[0].score >= end_probe[len(t_o_t[2]) - 1]:
                end_index = min(len(to_scan_sequence) - j * 5 + 4, len(to_scan_sequence) - 1)
                break
        if end_index == -1:
            continue
        if end_index - start_index < 24:
            continue
        # Approximate TR boundaries in sub_read coordinates.
        candidate_trs.append((str(t_o_t[2]), t_o_t[0] + start_index, t_o_t[0] + end_index))

    return candidate_trs