FastSTR 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faststr/Segmented_Global_Alignment.py +541 -0
- faststr/__init__.py +2 -0
- faststr/get_subread_trs.py +28 -0
- faststr/main.py +338 -0
- faststr/make_consensus_TRs.py +367 -0
- faststr/scan_subread.py +326 -0
- faststr/trs_align.py +704 -0
- faststr/utils.py +428 -0
- faststr-1.0.0.dist-info/METADATA +220 -0
- faststr-1.0.0.dist-info/RECORD +13 -0
- faststr-1.0.0.dist-info/WHEEL +5 -0
- faststr-1.0.0.dist-info/entry_points.txt +2 -0
- faststr-1.0.0.dist-info/top_level.txt +1 -0
faststr/main.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import multiprocessing
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import csv
|
|
7
|
+
|
|
8
|
+
from . import utils
|
|
9
|
+
from . import get_subread_trs
|
|
10
|
+
from . import make_consensus_TRs
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def parse_args():
    """Build the FastSTR command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace carrying the mode flags (``strict``/``normal``/
        ``loose``), the model flag (``default``), the positional ``sequence``
        path and the tuning options described by each argument's help text.
    """
    ap = argparse.ArgumentParser(
        description="FastSTR: A tool for identifying tandem repeats (STRs) from DNA sequences."
    )

    # Scoring mode: exactly one of the three flags must be given.
    modes = ap.add_mutually_exclusive_group(required=True)
    for flag, text in (
        ("--strict", "Strict mode (match=2, mismatch=3, gap_open=5, gap_extend=1)"),
        ("--normal", "Normal mode (match=2, mismatch=5, gap_open=7, gap_extend=3)"),
        ("--loose", "Loose mode (match=2, mismatch=7, gap_open=7, gap_extend=7)"),
    ):
        modes.add_argument(flag, action="store_true", help=text)

    # Model selection: only one model exists for now, but it is still required.
    models = ap.add_mutually_exclusive_group(required=True)
    models.add_argument("--default", action="store_true", help="Use the default STR identification model")

    # Positional input: the FASTA file to scan.
    ap.add_argument("sequence", type=str, help="Path to the DNA sequence FASTA file")

    # Optional tuning knobs.
    ap.add_argument("-f", "--out_dir", type=str, default=".", help="Output directory (default: current directory)")
    ap.add_argument("-s", "--start", type=int, default=1, help="Start index (default: 1)")
    ap.add_argument("-e", "--end", type=int, default=0, help="End index (0 means full length)")
    ap.add_argument("-l", "--read_length", type=int, default=15000, help="Sub-read length (default: 15000)")
    ap.add_argument("-o", "--overlap", type=int, default=1000, help="Overlap length (default: 1000)")
    ap.add_argument("-p", "--processes", type=int, default=1, help="Number of CPU cores to use (default: 1)")
    ap.add_argument("-b", "--beta", type=float, default=0.045, help="Motif coverage threshold for alignment (default: 0.045)")

    return ap.parse_args()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def Fast_TR(m, r, g, e, p_indel, p_match, score, input_path, out_path='', start_index=1,
            end_index=0, read_length=15000, overlap_length=1000, process=1, beta=0.045):
    """Run the full FastSTR pipeline on a FASTA file and write the result files.

    Pipeline: read sequences -> split each into overlapping sub-reads ->
    detect TRs per sub-read in parallel -> build consensus TRs across
    sub-read boundaries -> merge and resolve overlapping calls -> write a
    summary CSV, a detailed report (.dat), an alignment file (.dat) and a log.

    Args:
        m: match score (positive).
        r: mismatch penalty, supplied positive; negated below for alignment.
        g: gap-open penalty, supplied positive; negated below.
        e: gap-extend penalty, supplied positive; negated below.
        p_indel: indel percentage threshold on a 0-100 scale; converted to a
            fraction below.
        p_match: match percentage threshold on a 0-100 scale; converted to a
            fraction below.
        score: alignment score threshold forwarded to the detection helpers.
        input_path: path to the input FASTA file.
        out_path: output directory; '' or the literal string 'None' falls back
            to this module's own directory.
        start_index: start coordinate passed to the FASTA reader (default 1).
        end_index: end coordinate; 0 means read to full length.
        read_length: length of each sub-read slice.
        overlap_length: overlap between consecutive sub-reads.
        process: number of worker processes for the multiprocessing pools.
        beta: motif coverage threshold for alignment (see CLI help).
    """
    # Preprocessing: derive the output file names from the input name and the
    # parameter set, so different runs do not clobber each other.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    gene_name = os.path.splitext(os.path.basename(input_path))[0]
    parameter_name = str(m) + '_' + str(r) + '_' + str(g) + '_' + str(e) + '_' + str(p_indel) + '_' + str(
        p_match) + '_' + str(score)
    out_file_name = gene_name + '.' + parameter_name
    # No output directory given (or the literal string 'None' arrived):
    # write all outputs next to this module instead.
    if out_path == '' or out_path == 'None':
        logfile_path = os.path.join(current_dir, out_file_name + '.log')
        detected_tr_detailed_path = os.path.join(current_dir, out_file_name + '_detailed.dat')
        detected_tr_aligned_path = os.path.join(current_dir, out_file_name + '_aligned.dat')
        detected_tr_summary_path = os.path.join(current_dir, out_file_name + '_summary.csv')
    else:
        logfile_path = os.path.join(out_path, out_file_name + '.log')
        detected_tr_detailed_path = os.path.join(out_path, out_file_name + '_detailed.dat')
        detected_tr_aligned_path = os.path.join(out_path, out_file_name + '_aligned.dat')
        detected_tr_summary_path = os.path.join(out_path, out_file_name + '_summary.csv')
    logging.basicConfig(filename=logfile_path, level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', filemode='w')
    # Penalties are supplied as positive magnitudes; the alignment code
    # expects them as negative scores.
    r = -r
    g = -g
    e = -e
    # Percentages (0-100) -> fractions (0-1).
    p_indel = p_indel / 100
    p_match = p_match / 100
    logging.info("Program started")

    # Read the gene sequence(s) from the FASTA file.
    print('Start reading gene sequence')
    start_time_part = time.time()
    sequences_list = utils.read_fasta(input_path, start_index, end_index)
    end_time_part = time.time()
    run_time_part = end_time_part - start_time_part
    logging.info(f"read gene sequence running time: {run_time_part} seconds")

    # Split each sequence (chromosome) into overlapping sub-reads.
    # num_subreads holds the *cumulative* sub-read count after each sequence,
    # used later to slice the flat sub_reads_list per sequence.
    print('Start dividing gene sequences')
    num_subreads = []
    sub_reads_list = []
    sequences_name_list = []
    start_time_part = time.time()
    for seqs in sequences_list:
        sequences_name_list.append(seqs.description)
        sub_reads_list.extend(utils.make_sub_reads(seqs, read_length, overlap_length))
        num_subreads.append(len(sub_reads_list))
    end_time_part = time.time()
    run_time_part = end_time_part - start_time_part
    logging.info(f"divide gene sequence running time: {run_time_part} seconds")
    del sequences_list

    # Detect TRs on every sub-read in parallel.
    print('Start detecting TRs in parallel')
    start_time_part = time.time()
    params = []
    for sub_read in sub_reads_list:
        params.append((sub_read, m, r, g, e, p_indel, p_match, score, beta))
    pool = multiprocessing.Pool(process)
    Reads_TRs = pool.starmap(get_subread_trs.get_subread_trs, params)
    pool.close()
    pool.join()
    end_time_part = time.time()
    run_time_part = end_time_part - start_time_part
    logging.info(f"detect TRs in parallel running time: {run_time_part} seconds")
    del params

    # Merge TRs found on adjacent sub-reads and build consensus TRs.
    # TR records appear to be indexable as [0]=motif, [1]=start, [2]=end
    # (based on the arithmetic below) — layout defined in make_consensus_TRs.
    print('Start constructing consensus TRs')
    start_time_part = time.time()
    before_handling_compatibility_TRs_dict = {}
    cross_sub_reads_TRs_list = []
    cro_sub_read_params = []
    num_cro_sub_read = [0]
    time_it = []  # NOTE(review): unused — kept for byte-compatibility
    for index, s_n in enumerate(sequences_name_list):
        before_handling_compatibility_TRs_dict[s_n] = []
        cross_sub_reads_TRs_list.clear()
        last_seq_subreads = 0 if index == 0 else num_subreads[index - 1]
        # Walk this sequence's sub-reads pairwise (current + next) so TRs
        # spanning the overlap region can be reconciled.
        for pos, _ in enumerate(Reads_TRs[last_seq_subreads:num_subreads[index]]):
            sub_read = Reads_TRs[last_seq_subreads + pos]
            if pos + 1 < num_subreads[index] - last_seq_subreads:
                # Stitch the non-overlapping part of the current sub-read to
                # the whole next sub-read for boundary consensus building.
                sequence = sub_reads_list[last_seq_subreads + pos][:read_length - overlap_length] + sub_reads_list[
                    last_seq_subreads + pos + 1]
                next_sub_read = Reads_TRs[last_seq_subreads + pos + 1]
            else:
                # Last sub-read of this sequence: no right neighbour.
                sequence = sub_reads_list[num_subreads[index] - 1]
                next_sub_read = []
            up_trs, mid_trs, if_change_motif = make_consensus_TRs.make_two_subreads_consensus(sub_read, next_sub_read,
                                                                                              sequence, pos,
                                                                                              read_length,
                                                                                              overlap_length,
                                                                                              start_index,
                                                                                              cross_sub_reads_TRs_list,
                                                                                              p_indel, p_match, m, r, g,
                                                                                              e, score, beta)
            before_handling_compatibility_TRs_dict[s_n].extend(up_trs)
            if pos + 1 < num_subreads[index] - last_seq_subreads:
                # Feed the reconciled TR set back in for the next iteration.
                Reads_TRs[last_seq_subreads + pos + 1] = mid_trs

            if if_change_motif == 1:
                # The newest cross-boundary TR needs its motif re-derived from
                # the full spanned sequence: rebuild that sequence from the
                # sub-read pieces it crosses, then re-estimate the motif.
                cross_tr_seq = []
                # (sic: "picee" is a typo for "piece", kept for byte-compatibility)
                start_piece = (cross_sub_reads_TRs_list[-1][1] - start_index) // (read_length - overlap_length)
                end_picee = (cross_sub_reads_TRs_list[-1][2] - start_index) // (read_length - overlap_length)
                cross_tr_seq.append(str(
                    sub_reads_list[last_seq_subreads + start_piece][
                    cross_sub_reads_TRs_list[-1][1] - start_index - ((read_length - overlap_length)) * start_piece:]))
                for i in range(start_piece + 1, end_picee):
                    cross_tr_seq.append(str(sub_reads_list[last_seq_subreads + i]))
                cross_tr_seq.append(str(
                    sub_reads_list[last_seq_subreads + end_picee][
                    :cross_sub_reads_TRs_list[-1][2] - start_index - ((read_length - overlap_length)) * end_picee + 1]))
                cross_tr_seq = ''.join(cross_tr_seq)
                consensus_motif = utils.tri_gram_model(cross_tr_seq, len(cross_sub_reads_TRs_list[-1][0]))
                cross_sub_reads_TRs_list[-1][0] = consensus_motif[0]

        if cross_sub_reads_TRs_list == []:
            # No cross-boundary TRs for this sequence; record the cumulative
            # count so per-sequence slicing of Cross_TRs stays aligned.
            num_cro_sub_read.append(len(cro_sub_read_params))
            continue
        # Build one worker task per cross-boundary TR: reconstruct the exact
        # spanned sequence from the sub-read pieces it covers.
        for c_s_r in cross_sub_reads_TRs_list:
            cross_tr_seq = []
            start_piece = (c_s_r[1] - start_index) // (read_length - overlap_length)
            end_picee = (c_s_r[2] - start_index) // (read_length - overlap_length)
            cross_tr_seq.append(str(
                sub_reads_list[last_seq_subreads + start_piece][
                c_s_r[1] - start_index - ((read_length - overlap_length)) * start_piece:]))
            for i in range(start_piece + 1, end_picee):
                cross_tr_seq.append(str(sub_reads_list[last_seq_subreads + i]))
            cross_tr_seq.append(str(
                sub_reads_list[last_seq_subreads + end_picee][
                :c_s_r[2] - start_index - ((read_length - overlap_length)) * end_picee + 1]))
            cross_tr_seq = ''.join(cross_tr_seq)
            cro_sub_read_params.append((c_s_r, cross_tr_seq, p_indel, p_match, m, r, g, e, score, beta))
        num_cro_sub_read.append(len(cro_sub_read_params))

    # Free the large intermediates before the next parallel phase.
    del Reads_TRs
    del sub_reads_list
    del cross_sub_reads_TRs_list

    # Score the cross-boundary TRs in parallel (if any exist).
    Cross_TRs = []
    if len(cro_sub_read_params) > 0:
        pool = multiprocessing.Pool(process)
        Cross_TRs = pool.starmap(make_consensus_TRs.calculate_cross_subread_tr, cro_sub_read_params)
        pool.close()
        pool.join()

    del cro_sub_read_params
    Final_TRs_dict = {}
    Final_TRs_Region_dict = {}
    for index, s_n in enumerate(sequences_name_list):
        i = 0
        j = 0
        Final_TRs_dict[s_n] = []
        Final_TRs_Region_dict[s_n] = []
        # Flatten this sequence's slice of cross-boundary results.
        merged_crosubtrs = []
        for sublist in Cross_TRs[num_cro_sub_read[index]:num_cro_sub_read[index + 1]]:
            merged_crosubtrs.extend(sublist)
        # Two-pointer merge of within-sub-read TRs and cross-boundary TRs,
        # ordered by start coordinate ([1]).
        if len(before_handling_compatibility_TRs_dict[s_n]) > 0 and len(Cross_TRs) > 0:
            while i < len(before_handling_compatibility_TRs_dict[s_n]) and j < len(merged_crosubtrs):
                if before_handling_compatibility_TRs_dict[s_n][i][1] <= merged_crosubtrs[j][1]:
                    Final_TRs_dict[s_n].append(before_handling_compatibility_TRs_dict[s_n][i])
                    i += 1
                else:
                    Final_TRs_dict[s_n].append(merged_crosubtrs[j])
                    j += 1
        # Append whichever side still has leftovers.
        if before_handling_compatibility_TRs_dict[s_n] and i < len(before_handling_compatibility_TRs_dict[s_n]):
            Final_TRs_dict[s_n].extend(before_handling_compatibility_TRs_dict[s_n][i:])
        if Cross_TRs and j < len(merged_crosubtrs):
            Final_TRs_dict[s_n].extend(merged_crosubtrs[j:])

        # Resolve overlapping/incompatible TR calls for this sequence.
        Final_TRs_dict[s_n], Final_TRs_Region_dict[s_n] = make_consensus_TRs.handling_compatibility(Final_TRs_dict[s_n],
                                                                                                    p_match, p_indel)

    end_time_part = time.time()
    run_time_part = end_time_part - start_time_part
    logging.info(f"merge sub_reads and construct consensus TRs running time: {run_time_part} seconds")
    del before_handling_compatibility_TRs_dict

    # Write the detection results.
    print('Start saving the detected TRs')
    start_time_part = time.time()
    # Write the summary CSV: one row per TR region, with up to three candidate
    # motifs and their gain scores depending on the region's motif count
    # (f_t_r[0] is that count).
    '''
    写入摘要信息(dat)
    '''
    with open(detected_tr_summary_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write header row (constructed by hand; "STR Gain Score" repeats once
        # per motif column on purpose).
        writer.writerow(
            ["Seq Name", "Start", "End", "Primary Motif", "STR Gain Score", "Second Motif", "STR Gain Score",
             "Third Motif", "STR Gain Score"])
        for seq_name, final_trs_region in Final_TRs_Region_dict.items():
            for f_t_r in final_trs_region:
                if f_t_r[0] == 1:
                    writer.writerow([seq_name, f_t_r[1], f_t_r[2], f_t_r[3], f_t_r[4]])
                elif f_t_r[0] == 2:
                    writer.writerow([seq_name, f_t_r[1], f_t_r[2], f_t_r[3], f_t_r[4], f_t_r[5], f_t_r[6]])
                else:
                    writer.writerow(
                        [seq_name, f_t_r[1], f_t_r[2], f_t_r[3], f_t_r[4], f_t_r[5], f_t_r[6], f_t_r[7], f_t_r[8]])

    # Write the detailed report and the alignment file.
    '''
    写入检测报告(dat)
    '''
    with open(detected_tr_detailed_path, 'w') as file1, open(detected_tr_aligned_path, 'w') as file2:
        # Write the report legend/template header.
        file1.write(
            'The report on FastSTR detected of STRs provides a detailed list of the distribution, quality, and structure of all STRs, with the following content template:\n\n\n')
        file1.write('**********************************************************************\n')
        file1.write('Gene sequence name\n')
        file1.write('----------------------------------------\n')
        file1.write(
            'start\t\tend\t\tregion length\t\tmotif length\t\tcopy number\t\tmotif\t\tindel percentage\t\tmatch percentage\t\talign score\t\talign uuid\n')
        file1.write('----------------------------------------\n')
        file1.write("The total number of detected STRs is: X\n")
        file1.write('**********************************************************************\n\n\n')
        # Write each TR record, one block per sequence.
        for seq_name, final_trs in Final_TRs_dict.items():
            file1.write('**********************************************************************\n')
            file1.write(seq_name + '\n')
            file1.write('----------------------------------------\n')
            for f_t in final_trs:
                file1.write(
                    f"{str(f_t[1]):<12}\t{str(f_t[2]):<12}\t{str(f_t[2] - f_t[1]):<10}\t{str(len(f_t[0])):<5}"
                    f"\t{f'{f_t[7]:.2f}':<10}\t{f_t[0]:<12}\t{f'{f_t[3]:.4f}':<10}\t{f'{f_t[4]:.4f}':<10}"
                    f"\t{str(f_t[5]):<10}\t"
                )
                # The '%'-joined uuid line links the detailed and aligned files.
                s_n = seq_name.replace(' ', '')
                file1.write('%'.join([s_n[:min(len(s_n), 20)], str(f_t[1]), str(f_t[2]), f_t[0]]) + '\n')
                file2.write('%'.join([s_n[:min(len(s_n), 20)], str(f_t[1]), str(f_t[2]), f_t[0]]) + '\n')
                file2.write(f_t[6] + '\n\n')
            # Write the total TR count for this sequence.
            if len(final_trs) > 0:
                file1.write('----------------------------------------\n')
                file1.write("The total number of detected STRs is: " + str(len(final_trs)) + '\n')
            file1.write('**********************************************************************\n\n\n')
    end_time_part = time.time()
    run_time_part = end_time_part - start_time_part

    logging.info(f"save the detected TRs running time: {run_time_part} seconds")
    logging.info("Program completed")
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def main():
    """CLI entry point: resolve scoring/model presets from the flags and run Fast_TR."""
    args = parse_args()

    # Alignment-scoring presets, keyed by mode flag name:
    # (match, mismatch, gap_open, gap_extend).
    presets = {
        "strict": (2, 3, 5, 1),
        "normal": (2, 5, 7, 3),
        "loose": (2, 7, 7, 7),
    }
    mode_name = next((name for name in presets if getattr(args, name)), None)
    if mode_name is None:
        # Unreachable in practice: argparse enforces the required group.
        raise ValueError("You must specify one of --strict, --normal, or --loose")
    match, mismatch, gap_open, gap_extend = presets[mode_name]

    # Model selection: only the default model is available today.
    if not args.default:
        # Unreachable in practice for the same reason as above.
        raise ValueError("You must specify one of --default")
    p_indel, p_match, score, quality_control = 0.15, 0.80, 50, False  # quality_control is currently unused

    # Progress banner.
    print(f"Running FastSTR in {mode_name.upper()} mode with DEFAULT model")
    print(f"Input sequence: {args.sequence}")
    print(f"Output directory: {args.out_dir}")
    print(f"Using {args.processes} CPU core(s)")

    # Fast_TR expects p_indel / p_match on a 0-100 percentage scale.
    Fast_TR(
        match,
        mismatch,
        gap_open,
        gap_extend,
        p_indel * 100,
        p_match * 100,
        score,
        args.sequence,
        args.out_dir,
        args.start,
        args.end,
        args.read_length,
        args.overlap,
        args.processes,
        args.beta,
    )
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
if __name__ == "__main__":
    # freeze_support() is a no-op on normal runs; it is required so that
    # multiprocessing child processes bootstrap correctly in frozen
    # (e.g. PyInstaller) Windows executables.
    multiprocessing.freeze_support()
    main()
|