FastSTR 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faststr/Segmented_Global_Alignment.py +541 -0
- faststr/__init__.py +2 -0
- faststr/get_subread_trs.py +28 -0
- faststr/main.py +338 -0
- faststr/make_consensus_TRs.py +367 -0
- faststr/scan_subread.py +326 -0
- faststr/trs_align.py +704 -0
- faststr/utils.py +428 -0
- faststr-1.0.0.dist-info/METADATA +220 -0
- faststr-1.0.0.dist-info/RECORD +13 -0
- faststr-1.0.0.dist-info/WHEEL +5 -0
- faststr-1.0.0.dist-info/entry_points.txt +2 -0
- faststr-1.0.0.dist-info/top_level.txt +1 -0
faststr/utils.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import numpy as np
|
|
3
|
+
import regex
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Extract the base sequence of each chromosome between start_index and end_index.
def read_fasta(path, start_index=1, end_index=0):
    """Read FASTA records from *path*, upper-casing and trimming each sequence.

    Coordinates are 1-based; end_index == 0 means "to the end of the record".
    Returns the list of (modified in place) SeqRecord objects.
    """
    records = []
    with open(path, "r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            upper = rec.seq.upper()
            if end_index == 0:
                rec.seq = upper[start_index - 1:]
            else:
                rec.seq = upper[start_index - 1:end_index]
            records.append(rec)
    return records
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def make_sub_reads(gene_sequence, read_length, overlap_length):
    """Cut a record's sequence into overlapping windows of *read_length* bases.

    Consecutive windows share *overlap_length* bases; a sequence no longer than
    read_length is returned whole as a single-element list.
    """
    total = len(gene_sequence)
    if total <= read_length:
        return [gene_sequence.seq]
    # Sliding-window extraction of sub-reads.
    step = read_length - overlap_length
    upper_bound = (total // step + 1) * step
    windows = []
    for begin in range(0, upper_bound, step):
        if begin >= total:
            break
        stop = min(begin + read_length, total)
        windows.append(gene_sequence.seq[begin:stop])
    return windows
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_anchors(sub_read):
    """Collect anchors: pairs of equal bases at most 8 positions apart.

    Returns (anchor_dict, anchor_mark_dict) where, for each distance d in 1..8,
    anchor_dict[d] is the set of (position, d) endpoints of equal-base pairs and
    anchor_mark_dict[d] flags (with 1) the left endpoint of every such pair.
    """
    anchor_dict = {d: set() for d in range(1, 9)}
    anchor_mark_dict = {d: [0] * len(sub_read) for d in range(1, 9)}
    # Positions of each canonical base, in scan order.
    base_positions = {'A': [], 'G': [], 'C': [], 'T': []}
    for pos, base in enumerate(sub_read):
        if base in base_positions:
            base_positions[base].append(pos)
    for positions in base_positions.values():
        left = 0
        for j, pj in enumerate(positions):
            # Advance the window so that pj - positions[left] <= 8.
            while pj - positions[left] > 8:
                left += 1
            for k in range(left, j):
                dist = pj - positions[k]
                anchor_dict[dist].add((positions[k], dist))
                anchor_dict[dist].add((pj, dist))
                anchor_mark_dict[dist][positions[k]] = 1
    return anchor_dict, anchor_mark_dict
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def if_motif_is_tr(motif):
    """Return True if *motif* is itself a tandem repeat of a shorter unit.

    Uses the classic periodicity test: a string is a repetition of a shorter
    unit iff it occurs inside its own doubling with the first and last
    characters removed.
    """
    if not motif:
        return False
    return motif in (motif * 2)[1:-1]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def tri_gram_model(sequence, n):
    """Choose representative motif(s) of length *n* for *sequence*.

    Seeds candidate motifs from the most frequent bigrams, then scores
    candidates with a trigram (2nd-order Markov) log-likelihood and keeps the
    best-scoring motifs, collapsing rotations of one another.
    Returns a list of motifs, or None when no reliable motif exists.
    """
    bases = ('A', 'G', 'C', 'T')
    base_set = set(bases)
    # Bigram/trigram counts and bigram start positions.
    tri_count = {}
    bi_count = {a + b: 0 for a in bases for b in bases}
    bi_pos = {a + b: [] for a in bases for b in bases}
    last = len(sequence) - 3
    for i in range(len(sequence) - 2):
        window = sequence[i:i + 3]
        if not set(window).issubset(base_set):
            continue
        tri_count[window] = tri_count.get(window, 0) + 1
        bi_pos[window[:2]].append(i)
        bi_count[window[:2]] += 1
        if i == last:
            # Count the trailing bigram that no trigram window starts at.
            bi_pos[window[1:]].append(i + 1)
            bi_count[window[1:]] += 1

    # Seeds: bigrams ranked by frequency (stable tie-break on dict order).
    sorted_seed = sorted(bi_count, key=bi_count.get, reverse=True)
    if bi_count[sorted_seed[0]] < 2:
        return None
    # Motif length 1: most frequent base = first char of the dominant bigram.
    if n == 1:
        return [sorted_seed[0][0]]
    # Motif length 2: first heterodimer seed with a positive count.
    if n == 2:
        for seed in sorted_seed:
            if seed[0] == seed[1] and bi_count[seed] >= len(sequence) / 2:
                return None
            if seed[0] != seed[1] and bi_count[seed] > 0:
                return [seed]
        return None

    # Motif length 3..8: harvest candidates anchored at the top three seeds.
    motifs = set()
    k = 0
    while True:
        seed_positions = bi_pos[sorted_seed[k]]
        pos_set = set(seed_positions)
        for pos in seed_positions:
            if pos + n not in pos_set:
                continue
            candidate = sequence[pos:pos + n]
            if if_motif_is_tr(candidate):
                continue
            if not set(candidate).issubset(base_set):
                continue
            motifs.add(candidate)
        if motifs and k == 2:
            break
        k += 1
        if k > 15:
            return None

    # Conditional trigram probabilities P(c3 | c1 c2) and per-motif scores.
    tri_gram = {t: c / bi_count[t[:2]] for t, c in tri_count.items()
                if bi_count[t[:2]] != 0}
    motifs = list(motifs)
    p_rationality = [1 + sum(math.log(tri_gram[m[b:b + 3]]) for b in range(n - 2))
                     for m in motifs]
    best = max(p_rationality)
    alternative_motif = [str(m) for m, v in zip(motifs, p_rationality) if v == best]
    # Keep the first best motif plus any that is not a rotation of a kept one.
    final_motif = [alternative_motif[0]]
    for cand in alternative_motif[1:]:
        if not any(cand in kept + kept for kept in final_motif):
            final_motif.append(cand)
    return final_motif
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def get_realscan_sequence(sequence):
    """Trim noisy flanking bases before scanning; short sequences pass through."""
    total = len(sequence)
    if total < 40:
        # Too short to afford any trimming.
        return sequence
    if total < 800:
        # Drop ~5% of the bases, split between both ends (extra base to the tail).
        noisy = round(total * 0.05)
        head = noisy // 2
        return sequence[head:total - (noisy - head)]
    # Long sequences: fixed 20-base trim on each side.
    return sequence[20:total - 20]
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_motif_marks(sequence, motif):
    """Return start positions of fuzzy matches of *motif* in *sequence*.

    The number of substitutions allowed by the `regex` fuzzy pattern grows
    with motif length: 0 for <4, 1 for 4-5, 2 for >=6.
    """
    if len(motif) < 4:
        allowed_subs = 0
    elif len(motif) < 6:
        allowed_subs = 1
    else:
        allowed_subs = 2
    pattern = f"({motif}){{s<={allowed_subs}}}"
    return [m.start() for m in regex.finditer(pattern, str(sequence))]
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _pick_non_rotated(candidates):
    """Keep the first candidate plus any later one not a rotation of a kept motif."""
    kept = [candidates[0]]
    for cand in candidates[1:]:
        if not any(cand in prev + prev for prev in kept):
            kept.append(cand)
    return kept


def high_motif_detection(sequence, n):
    """Detect representative motif(s) of length *n* in a high-complexity region.

    Like tri_gram_model, but additionally builds a 4x4 base-transition
    (de Bruijn) count matrix, keeps every candidate whose trigram score is
    within a length-dependent tolerance of the best, and ranks those by how
    many of their base transitions the matrix can supply at the estimated
    copy number (see estimated_copy_number / calculate_base_frequencye).
    Returns a list of motifs, or None when no reliable motif exists.
    """
    bases = ('A', 'G', 'C', 'T')
    base_set = set(bases)
    tri_count = {}
    bi_count = {a + b: 0 for a in bases for b in bases}
    bi_pos = {a + b: [] for a in bases for b in bases}
    base_de_bruijn = np.zeros((4, 4))
    base_mark = {'A': 0, 'G': 1, 'C': 2, 'T': 3}
    last = len(sequence) - 3
    for i in range(len(sequence) - 2):
        # Transition (i-1 -> i); the two trailing transitions are added below.
        if i > 0 and set(sequence[i - 1:i + 1]).issubset(base_set):
            base_de_bruijn[base_mark[sequence[i - 1]]][base_mark[sequence[i]]] += 1
        window = sequence[i:i + 3]
        if not set(window).issubset(base_set):
            continue
        tri_count[window] = tri_count.get(window, 0) + 1
        bi_pos[window[:2]].append(i)
        bi_count[window[:2]] += 1
        if i == last:
            # Trailing bigram and the last two transitions.
            bi_pos[window[1:]].append(i + 1)
            bi_count[window[1:]] += 1
            base_de_bruijn[base_mark[window[0]]][base_mark[window[1]]] += 1
            base_de_bruijn[base_mark[window[1]]][base_mark[window[2]]] += 1

    # Seeds: bigrams ranked by frequency (stable tie-break on dict order).
    sorted_seed = sorted(bi_count, key=bi_count.get, reverse=True)
    if bi_count[sorted_seed[0]] < 2:
        return None

    # Harvest candidate motifs anchored at the top three seeds.
    motifs = set()
    k = 0
    while True:
        seed_positions = bi_pos[sorted_seed[k]]
        pos_set = set(seed_positions)
        for pos in seed_positions:
            if pos + n not in pos_set:
                continue
            candidate = sequence[pos:pos + n]
            if if_motif_is_tr(candidate):
                continue
            if not set(candidate).issubset(base_set):
                continue
            motifs.add(candidate)
        if motifs and k == 2:
            break
        k += 1
        if k > 15:
            return None

    # Trigram score per motif; keep all within a length-dependent tolerance.
    tri_gram = {t: c / bi_count[t[:2]] for t, c in tri_count.items()
                if bi_count[t[:2]] != 0}
    motifs = list(motifs)
    p_rationality = [1 + sum(math.log(tri_gram[m[b:b + 3]]) for b in range(n - 2))
                     for m in motifs]
    best_score = max(p_rationality)
    if n == 5:
        tolerance = 0.3
    elif n == 6:
        tolerance = 0.4
    elif n == 7:
        tolerance = 0.8
    else:
        tolerance = 1
    motifs_mar = [m for m, score in zip(motifs, p_rationality)
                  if best_score - score <= tolerance]

    # Rank surviving motifs by how well the transition counts support them.
    copy = estimated_copy_number(sequence, n, base_de_bruijn)
    frequencies = [calculate_base_frequencye(m, copy, base_de_bruijn, base_mark)
                   for m in motifs_mar]
    max_frequency = max(frequencies)
    alternatives = [str(m) for m, f in zip(motifs_mar, frequencies)
                    if f == max_frequency]
    final_motif = _pick_non_rotated(alternatives)
    if len(final_motif) > 3:
        # Too many survivors: re-rank with the naive copy estimate.
        copy = len(sequence) // n
        frequencies = [calculate_base_frequencye(m, copy, base_de_bruijn, base_mark)
                       for m in motifs_mar]
        max_frequency = max(frequencies)
        alternatives = [str(m) for m, f in zip(motifs_mar, frequencies)
                        if f == max_frequency]
        final_motif = _pick_non_rotated(alternatives)
    return final_motif
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def estimated_copy_number(sequence, n, base_de_bruijn):
    """Estimate the copy number of an n-length motif from base-transition counts.

    Blends the naive estimate len(sequence) // n with a GCD-like reduction of
    the dominant counts in `base_de_bruijn` (a 4x4 transition-count matrix).
    NOTE(review): appears to assume 2 <= n <= 8 — for n == 1 the loop below
    never binds `small` (NameError), and for n > 8 only 8 counts are kept;
    confirm against callers.
    """
    copy = len(sequence) // n
    # Flatten the 4x4 matrix so all 16 transition counts can be ranked.
    flattened = base_de_bruijn.flatten()
    # Top 8 transition counts, in descending order.
    largest_eight = np.sort(flattened)[-8:][::-1]
    large = largest_eight[0]
    for i in range(1, n):
        if i == n - 1 or largest_eight[i] < copy:
            # No clear second tier: fall back to the mean of the top-n counts.
            small = math.ceil(sum(largest_eight[:n]) / n)
            break
        if large - largest_eight[i] >= copy:
            # First count dropping at least `copy` below the maximum.
            small = largest_eight[i]
            break

    # Euclidean-style reduction of (large - small, small) while the candidate
    # divisor still exceeds the naive copy estimate.
    common_divisor = large - small
    max_common_divisor = min(common_divisor, small)
    while max_common_divisor >= copy:
        if small != max_common_divisor:
            common_divisor = max(max_common_divisor, small) - min(small, max_common_divisor)
            small = min(small, max_common_divisor)
            max_common_divisor = min(common_divisor, small)
        else:
            if max_common_divisor % copy == 0:
                max_common_divisor = copy
                break
            # Shrink toward `copy` by dividing by the nearest integer ratio.
            max_common_divisor = math.ceil(max_common_divisor / round(max_common_divisor / copy))
            break
    # Average the reduced divisor with the naive estimate.
    return math.ceil(max_common_divisor * 0.5 + copy * 0.5)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def calculate_base_frequencye(motif, copy, base_de_bruijn, base_mark):
    """Score *motif* by consuming its base transitions from the count matrix.

    For each adjacent base pair in the motif: add `copy` when the matrix holds
    at least `copy` occurrences of that transition (and deduct them); otherwise
    add what remains only if it reaches 60% of `copy`, then exhaust the cell.
    The caller's matrix is never mutated.
    """
    counts = base_de_bruijn.copy()
    score = 0
    for prev, cur in zip(motif, motif[1:]):
        row, col = base_mark[prev], base_mark[cur]
        available = counts[row][col]
        if available >= copy:
            score += copy
            counts[row][col] -= copy
        else:
            if available >= 0.6 * copy:
                score += available
            if available > 0:
                counts[row][col] = 0
    return score
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def get_copy_vector(motif, n, copy, base_de_bruijn, base_mark):
    """Mean squared deviation of per-transition copy estimates from *copy*.

    Each transition present in the motif yields an estimate
    observed_count / motif_count; the variance of these estimates around the
    expected copy number measures how well the motif explains the matrix.
    """
    motif_counts = np.zeros((4, 4))
    for prev, cur in zip(motif[:n - 1], motif[1:n]):
        motif_counts[base_mark[prev]][base_mark[cur]] += 1

    estimates = [
        base_de_bruijn[r][c] / motif_counts[r][c]
        for r in range(4)
        for c in range(4)
        if motif_counts[r][c] > 0
    ]
    return np.mean((np.array(estimates) - copy) ** 2)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def find_most_frequent_substring_given(long_seq, four_mer_marker, k):
    """Find the most frequent non-overlapping k-mer from a given candidate set.

    Slides a length-k window over *long_seq*, counting only substrings that
    appear in *four_mer_marker* and only occurrences starting at least k bases
    after the previous counted occurrence (i.e. non-overlapping).

    Returns (best_substring, positions). When no candidate matches at all,
    returns ("", []) instead of raising KeyError (bug fix: the original
    crashed on positions_map[""] for a match-free sequence).
    """
    freq_map = {}       # substring -> non-overlapping occurrence count
    positions_map = {}  # substring -> list of counted start positions
    max_count = 0
    most_frequent_string = ""

    # Set for O(1) membership tests on candidates.
    candidates = set(four_mer_marker)

    for i in range(len(long_seq) - k + 1):
        substring = long_seq[i:i + k]
        if substring not in candidates:
            continue
        if substring not in freq_map:
            freq_map[substring] = 1
            positions_map[substring] = [i]
        elif i - positions_map[substring][-1] >= k:
            # Count only occurrences that do not overlap the previous one.
            freq_map[substring] += 1
            positions_map[substring].append(i)
        if freq_map[substring] > max_count:
            max_count = freq_map[substring]
            most_frequent_string = substring

    # .get guards the no-match case (most_frequent_string == "").
    return most_frequent_string, positions_map.get(most_frequent_string, [])
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: FastSTR
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: FastSTR: A high-performance tool for short tandem repeat (STR) detection and analysis from genome assemblies.
|
|
5
|
+
Author-email: Wen Long <wenlong2024@mail.nwpu.edu.cn>
|
|
6
|
+
Maintainer-email: Wen Long <wenlong2024@mail.nwpu.edu.cn>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/XL-BioGroup/FastSTR
|
|
9
|
+
Project-URL: Repository, https://github.com/XL-BioGroup/FastSTR
|
|
10
|
+
Project-URL: Issues, https://github.com/XL-BioGroup/FastSTR/issues
|
|
11
|
+
Keywords: tandem repeat,STR,genome,bioinformatics,sequence alignment
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: biopython==1.83
|
|
20
|
+
Requires-Dist: numpy==1.26.0
|
|
21
|
+
Requires-Dist: pandas==2.2.2
|
|
22
|
+
Requires-Dist: regex==2024.9.11
|
|
23
|
+
Requires-Dist: scikit-learn==1.5.2
|
|
24
|
+
|
|
25
|
+
# 🧬 FastSTR
|
|
26
|
+
|
|
27
|
+
**FastSTR** — Ultra-fast and accurate identification of Short Tandem Repeats (STRs) from long-read DNA sequences. Developed for genome-wide STR detection, consensus construction, and comparative STR analysis.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 📘 Table of Contents
|
|
32
|
+
1. [Overview](#overview)
|
|
33
|
+
2. [Installation](#installation)
|
|
34
|
+
3. [Quick Start](#quick-start)
|
|
35
|
+
4. [Command Line Options](#command-line-options)
|
|
36
|
+
5. [Input & Output](#input--output)
|
|
37
|
+
6. [Usage](#usage)
|
|
38
|
+
7. [Performance](#performance)
|
|
39
|
+
8. [Citation](#citation)
|
|
40
|
+
9. [License](#license)
|
|
41
|
+
10. [Changelog](#changelog)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## 🌍 Overview
|
|
47
|
+
|
|
48
|
+
**FastSTR** is a novel and efficient tool for de novo detection of short tandem repeats (STRs) in genomic sequences. It combines fast motif recognition with accurate sequence alignment to achieve both high precision and completeness in STR identification. FastSTR is optimized for large-scale genomic datasets and enables rapid detection of repetitive elements without relying on predefined motif libraries or fixed repeat-length thresholds.
|
|
49
|
+
|
|
50
|
+
Compared to classical tools like **TRF**, **T-reks**, and **TRASH**, FastSTR achieves:
|
|
51
|
+
|
|
52
|
+
- ⚡ **High-speed parallel processing** — Processes genomic fragments in parallel, achieving **up to 10× faster runtime**.
|
|
53
|
+
- 🧠 **Context-aware motif recognition** — Uses an **N-gram + Markov** model to identify representative motifs without predefined motif libraries.
|
|
54
|
+
- 🧩 **Segmented global alignment** — Efficiently handles **ultra-long or complex STRs** while maintaining base-level precision.
|
|
55
|
+
- 🔍 **Smart interval merging** — Applies an **interval-gain decision** strategy to accurately resolve overlapping STRs.
|
|
56
|
+
- 🧬 **Enhanced detection in complex regions** — Identifies **confounding or nested repeat regions** (e.g., centromeric satellites) with a novel **density-based concentration test**.
|
|
57
|
+
- 💾 **Lightweight & scalable** — Requires few dependencies, easy to install and run, and supports **multiple operating systems**.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## ⚙️ Installation
|
|
62
|
+
|
|
63
|
+
### Option 1: Install via `pip`
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install faststr
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Option 2: Install via `conda`
|
|
70
|
+
*(coming soon)*
|
|
71
|
+
```bash
|
|
72
|
+
conda install -c bioconda faststr
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Option 3: Local installation (development)
|
|
76
|
+
```bash
|
|
77
|
+
git clone https://github.com/XL-BioGroup/FastSTR.git
|
|
78
|
+
cd FastSTR
|
|
79
|
+
pip install -e .
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## 🚀 Quick Start
|
|
85
|
+
|
|
86
|
+
### Basic Command
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
faststr [--strict | --normal | --loose] [--default] genome.fa
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Example
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
faststr --strict --default genome.fa
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
This runs FastSTR in **strict mode** using the **default model** to identify STRs in the `genome.fa` file.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## ⚡ Command Line Options
|
|
103
|
+
|
|
104
|
+
| Argument | Type | Default | Description |
|
|
105
|
+
|-----------|------|----------|-------------|
|
|
106
|
+
| `match` | int | 2 | Match score |
|
|
107
|
+
| `mismatch` | int | 5 | Mismatch score |
|
|
108
|
+
| `gap_open` | int | 7 | Gap opening penalty |
|
|
109
|
+
| `gap_extend` | int | 3 | Gap extension penalty |
|
|
110
|
+
| `p_indel` | int | 15 | Indel percentage threshold |
|
|
111
|
+
| `p_match` | int | 80 | Match percentage threshold |
|
|
112
|
+
| `score` | int | 50 | Alignment score threshold |
|
|
113
|
+
| `quality_control` | bool | False | Enable read-level quality control |
|
|
114
|
+
| `DNA_file` | str | — | Path to DNA FASTA input |
|
|
115
|
+
| `-f` | str | — | Output directory |
|
|
116
|
+
| `-s` | int | 1 | Start index |
|
|
117
|
+
| `-e` | int | 0 | End index |
|
|
118
|
+
| `-l` | int | 15000 | Sub-read length |
|
|
119
|
+
| `-o` | int | 1000 | Overlap length |
|
|
120
|
+
| `-p` | int | 1 | Number of CPU cores |
|
|
121
|
+
| `-b` | float | 0.045 | Motif coverage threshold |
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## 🧠 Alignment Modes
|
|
126
|
+
|
|
127
|
+
| Mode | Description |
|
|
128
|
+
|------|--------------|
|
|
129
|
+
| `--strict` | High precision, recommended for curated assemblies |
|
|
130
|
+
| `--normal` | Balanced mode, suitable for most datasets |
|
|
131
|
+
| `--loose` | High sensitivity, tolerant of mismatches |
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## 🧬 Model Presets
|
|
136
|
+
|
|
137
|
+
| Preset | Description |
|
|
138
|
+
|---------|-------------|
|
|
139
|
+
| `--default` | Standard scoring model |
|
|
140
|
+
| *(future)* `--sensitive` | Optimized for noisy long reads |
|
|
141
|
+
| *(future)* `--speed` | Optimized for large-scale detection |
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 📥 Input & Output
|
|
146
|
+
|
|
147
|
+
### Input
|
|
148
|
+
- DNA sequences in **FASTA** format
|
|
149
|
+
|
|
150
|
+
### Output
|
|
151
|
+
| File Pattern | Description |
|
|
152
|
+
|----------------|-------------|
|
|
153
|
+
| `*detail.dat` | Contains all STR positions and motifs, quality statistics for each STR, and STR counts per chromosome. |
|
|
154
|
+
| `*align.dat` | Detailed alignment of all STRs against reference STRs, including mismatches and indels. |
|
|
155
|
+
| `*.csv` | Merged STR intervals with representative motifs and summary statistics for each interval. |
|
|
156
|
+
| `*.log` | Processing logs. |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## 🧪 Usage
|
|
161
|
+
|
|
162
|
+
### 1️⃣ Identify STRs in a genome
|
|
163
|
+
```bash
|
|
164
|
+
faststr --normal --default human_genome.fa
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### 2️⃣ Use multiple cores
|
|
168
|
+
```bash
|
|
169
|
+
faststr --strict --default genome.fa -p 8
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## 📈 Performance
|
|
175
|
+
|
|
176
|
+
| Dataset | Genome Size | Tool | Runtime | Recall | Precision |
|
|
177
|
+
|--------------------|------------|--------|-----------|--------|-----------|
|
|
178
|
+
| Human (T2T) | 2.94 G | TRF | 18 h 31 min | - | - |
|
|
179
|
+
| | | FastSTR| 1 h 13 min | 0.950 | 0.994 |
|
|
180
|
+
| Mouse (GRCm39) | 2.57 G | TRF | 1 h 41 min | - | - |
|
|
181
|
+
| | | FastSTR| 38 min | 0.966 | 0.997 |
|
|
182
|
+
| Zebrafish (GRCz11) | 1.58 G | TRF | 2 h 51 min | - | - |
|
|
183
|
+
| | | FastSTR| 25 min | 0.945 | 0.998 |
|
|
184
|
+
|
|
185
|
+
*Note: TRF is used as the ground-truth. FastSTR runs based on 72 CPUs.*
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## 📚 Citation
|
|
191
|
+
|
|
192
|
+
If you use **FastSTR** in your research, please cite:
|
|
193
|
+
|
|
194
|
+
> Xingyu Liao *et al.*,
|
|
195
|
+
> **Efficient Identification of Short Tandem Repeats via Context-Aware Motif Discovery and Ultra-Fast Sequence Alignment**,
|
|
196
|
+
> *Nat. Methods*, 2025.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
## 📄 License
|
|
202
|
+
|
|
203
|
+
This project is licensed under the **MIT License**.
|
|
204
|
+
See [LICENSE](LICENSE) for more details.
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## 🧾 Changelog
|
|
209
|
+
|
|
210
|
+
### v1.0.0 (2025)
|
|
211
|
+
- Initial release of FastSTR
|
|
212
|
+
- Supports three alignment modes and one default model
|
|
213
|
+
- Implemented parallel computation
|
|
214
|
+
- Added `.csv`, `.dat`, `.log` outputs
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
faststr/Segmented_Global_Alignment.py,sha256=iqPhqCNyL2N52sZwCZr5gSzdUTgmpQ_vunszW-4yGfM,26165
|
|
2
|
+
faststr/__init__.py,sha256=-brodmEr8X1HxTfVQ7g5BXYbHDRWV52VGpxfoAlXLAI,46
|
|
3
|
+
faststr/get_subread_trs.py,sha256=B3Yzk9_gxNq1oVDLnx2O4zoSMe3pjD7Ax_tfDQOIe9k,1325
|
|
4
|
+
faststr/main.py,sha256=zkIH0faI4zj9P3CBkoWK0UpPV35GQhGyYZCFBMafhDg,16841
|
|
5
|
+
faststr/make_consensus_TRs.py,sha256=GtixIOCCv_CtmPw0XrzVUHXRAKE7mZD7OpAh5ZUIOvg,20274
|
|
6
|
+
faststr/scan_subread.py,sha256=v7yI3h1SKPnus2EIh2HGP17L40ZvTzq62VBUQioN41s,18030
|
|
7
|
+
faststr/trs_align.py,sha256=wJoOqGqeAd5tyeblB5qbonMF6pmvG2QVI-ckM4MSFz8,37750
|
|
8
|
+
faststr/utils.py,sha256=KZO7qUcRnD-CWZUuoCHoIGECYw9gdx_nbNF8kol0vkQ,16018
|
|
9
|
+
faststr-1.0.0.dist-info/METADATA,sha256=qKCq0XkVMVSMu70jVBD5nvBgf03D8R0BbgLdAZ8h9Cc,7093
|
|
10
|
+
faststr-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
faststr-1.0.0.dist-info/entry_points.txt,sha256=U2IgRO55zTgGvY5CxxQd05GEK7OdgUptGzXOWtsWrH4,46
|
|
12
|
+
faststr-1.0.0.dist-info/top_level.txt,sha256=289H-XYZbLnN6h63Kc0OZw0tSZ-RMaSovsW-MP8kkk4,8
|
|
13
|
+
faststr-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
faststr
|