FastSTR 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faststr/utils.py ADDED
@@ -0,0 +1,428 @@
1
+ import math
2
+ import numpy as np
3
+ import regex
4
+ from Bio import SeqIO
5
+
6
+
7
# Extract each chromosome's base sequence from start_index to end_index.
def read_fasta(path, start_index=1, end_index=0):
    """Read a FASTA file and return its records, upper-cased and sliced.

    Indices are 1-based; ``end_index == 0`` means "up to the end of the
    record". Each record's ``seq`` is replaced in place by the sliced,
    upper-cased sequence.
    """
    records = []
    with open(path, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # None as the slice stop is equivalent to slicing to the end.
            stop = None if end_index == 0 else end_index
            record.seq = record.seq.upper()[start_index - 1:stop]
            records.append(record)
    return records
19
+
20
+
21
def make_sub_reads(gene_sequence, read_length, overlap_length):
    """Split a record's sequence into overlapping sub-reads.

    ``gene_sequence`` is an object with ``len()`` and a sliceable ``.seq``
    (e.g. a Biopython SeqRecord). Consecutive windows advance by
    ``read_length - overlap_length`` bases; the final window may be shorter
    than ``read_length``. A sequence no longer than ``read_length`` is
    returned whole as a single-element list.
    """
    if len(gene_sequence) <= read_length:
        return [gene_sequence.seq]
    step = read_length - overlap_length
    # The original computed an inflated range stop and broke out manually;
    # range(0, len, step) yields exactly the same window starts.
    sub_reads = []
    for start in range(0, len(gene_sequence), step):
        end = min(start + read_length, len(gene_sequence))
        sub_reads.append(gene_sequence.seq[start:end])
    return sub_reads
35
+
36
+
37
def get_anchors(sub_read):
    """Collect anchor pairs of identical bases at distances 1..8.

    Returns two dicts keyed by distance d in 1..8:
    - anchor_dict[d]: set of (position, d) tuples for both ends of every
      identical-base pair separated by exactly d.
    - anchor_mark_dict[d]: per-position 0/1 list, 1 at the left end of
      every such pair.
    """
    anchor_dict = {dist: set() for dist in range(1, 9)}
    anchor_mark_dict = {dist: [0] * len(sub_read) for dist in range(1, 9)}

    # Bucket the positions of each of the four canonical bases.
    positions = {'A': [], 'G': [], 'C': [], 'T': []}
    for idx, base in enumerate(sub_read):
        if base in positions:
            positions[base].append(idx)

    # For each base, slide a window so that only pairs within distance 8
    # are enumerated.
    for pos_list in positions.values():
        left = 0
        for right in range(len(pos_list)):
            while pos_list[right] - pos_list[left] > 8:
                left += 1
            for mid in range(left, right):
                gap = pos_list[right] - pos_list[mid]
                anchor_dict[gap].add((pos_list[mid], gap))
                anchor_dict[gap].add((pos_list[right], gap))
                anchor_mark_dict[gap][pos_list[mid]] = 1

    return anchor_dict, anchor_mark_dict
60
+
61
+
62
def if_motif_is_tr(motif):
    """Return True if *motif* is itself a tandem repeat of a shorter unit.

    Uses the classic periodicity test: a string is a repetition of a
    shorter substring iff it occurs inside (motif + motif) with the first
    and last characters removed.
    """
    if len(motif) == 0:
        return False
    trimmed_double = (motif + motif)[1:-1]
    return trimmed_double.find(motif) != -1
69
+
70
+
71
def tri_gram_model(sequence, n):
    """Propose candidate repeat motifs of length ``n`` found in ``sequence``.

    Builds bigram/trigram statistics over the sequence, seeds candidate
    motifs from the most frequent bigrams, then ranks candidates of length
    > 2 by a trigram (Markov) log score, keeping only the top-scoring,
    rotation-distinct motifs.

    Returns a list of motif strings, or None when no convincing motif exists.
    """
    # Collect trigram counts, bigram counts and bigram start positions.
    tri_count = {}
    bi_count = {'AA': 0, 'AG': 0, 'AC': 0, 'AT': 0, 'GA': 0, 'GG': 0, 'GC': 0, 'GT': 0, 'CA': 0, 'CG': 0, 'CC': 0,
                'CT': 0, 'TA': 0, 'TG': 0, 'TC': 0, 'TT': 0}
    bi_pos = {'AA': [], 'AG': [], 'AC': [], 'AT': [], 'GA': [], 'GG': [], 'GC': [], 'GT': [], 'CA': [], 'CG': [],
              'CC': [], 'CT': [], 'TA': [], 'TG': [], 'TC': [], 'TT': []}
    for i in range(len(sequence) - 2):
        # Skip windows containing any non-AGCT character (e.g. 'N').
        if not set(sequence[i:i + 3]).issubset({'A', 'G', 'T', 'C'}):
            continue
        if sequence[i:i + 3] not in tri_count:
            tri_count[sequence[i:i + 3]] = 1
        else:
            tri_count[sequence[i:i + 3]] += 1
        bi_pos[sequence[i:i + 2]].append(i)
        bi_count[sequence[i:i + 2]] += 1
        # The last trigram window also contributes its trailing bigram.
        if i == len(sequence) - 3:
            bi_pos[sequence[i + 1:i + 3]].append(i + 1)
            bi_count[sequence[i + 1:i + 3]] += 1

    # Derive candidate motifs from the highest-frequency bigram seeds.
    sorted_seed = sorted(bi_count, key=lambda x: bi_count[x], reverse=True)
    if bi_count[sorted_seed[0]] < 2:
        return None
    # Motif length 1: take the first base of the most frequent bigram.
    if n == 1:
        return [sorted_seed[0][0]]
    # Motif length 2: first frequent bigram with two distinct bases;
    # bail out if a homopolymer bigram dominates half the sequence.
    if n == 2:
        i = 0
        while i < len(sorted_seed):
            if sorted_seed[i][0] == sorted_seed[i][1] and bi_count[sorted_seed[i]] >= len(sequence) / 2:
                return None
            if sorted_seed[i][0] != sorted_seed[i][1] and bi_count[sorted_seed[i]] > 0:
                return [sorted_seed[i]]
            i += 1
        return None
    # Motif length between 3 and 6: grow candidates from the top bigram
    # seeds (up to 16 seeds; give up after that).
    motifs = set()
    k = 0
    while True:
        max_seed = sorted_seed[k]
        bipos_set = set(bi_pos[max_seed])

        for pos in bi_pos[max_seed]:
            # A seed repeated exactly n apart suggests a motif of length n.
            if pos + n in bipos_set:
                # Reject motifs that are themselves tandem repeats of a
                # shorter unit.
                if if_motif_is_tr(sequence[pos:pos + n]):
                    continue
                if not set(sequence[pos:pos + n]).issubset({'A', 'G', 'T', 'C'}):
                    continue
                motifs.add(sequence[pos:pos + n])
        if motifs and k == 2:
            break
        k += 1
        if k > 15:
            return None

    # Score each candidate motif's plausibility with trigram conditional
    # frequencies; keep the best-scoring ones.
    tri_gram = {}
    for tri, _ in tri_count.items():
        if bi_count[tri[:2]] == 0:
            continue
        tri_gram[tri] = tri_count[tri] / bi_count[tri[:2]]
    p_rationality = []
    # NOTE: scores are accumulated while iterating the `motifs` set, and
    # indices are matched to list(motifs) below — this relies on the set's
    # iteration order being stable within a single run (it is, in CPython).
    for mot in motifs:
        p_r = 1
        for b in range(n - 2):
            p_r += math.log(tri_gram[mot[b:b + 3]])
        p_rationality.append(p_r)
    sorted_prationalty = sorted(p_rationality)
    max_prationalty = sorted_prationalty[-1]
    motifs = list(motifs)
    alternative_motif = [str(motifs[index]) for index, value in enumerate(p_rationality) if value == max_prationalty]
    # Deduplicate rotations: drop any motif that is a cyclic rotation of an
    # already-kept one (a rotation of f_m always appears inside f_m + f_m).
    final_motif = [alternative_motif[0]]
    if len(alternative_motif) == 1:
        return final_motif
    for a_m in alternative_motif[1:]:
        logo = 0
        for f_m in final_motif:
            if a_m in f_m + f_m:
                logo = 1
        if logo == 0:
            final_motif.append(a_m)
    return final_motif
158
+
159
+
160
def get_realscan_sequence(sequence):
    """Trim presumed-noisy flanking bases before rescanning.

    Sequences shorter than 40 are returned unchanged. Sequences of length
    800 or more lose a fixed 20 bases from each end; anything in between
    loses 5% of its length, split (floor at the front) between the two ends.
    """
    length = len(sequence)
    if length < 40:
        return sequence
    if length >= 800:
        return sequence[20:length - 20]
    trim_total = round(length * 0.05)
    head = trim_total // 2
    tail = trim_total - head
    return sequence[head:length - tail]
170
+
171
+
172
def get_motif_marks(sequence, motif):
    """Return start positions of fuzzy occurrences of *motif* in *sequence*.

    Uses the third-party ``regex`` module's fuzzy-matching syntax
    ``{s<=N}``: motifs shorter than 4 bases must match exactly, motifs of
    4-5 bases tolerate one substitution, longer motifs tolerate two.
    """
    if len(motif) < 4:
        allowed_subs = 0
    elif len(motif) < 6:
        allowed_subs = 1
    else:
        allowed_subs = 2
    pattern = f"({motif}){{s<={allowed_subs}}}"
    return [hit.start() for hit in regex.finditer(pattern, str(sequence))]
182
+
183
+
184
def high_motif_detection(sequence, n):
    """Detect candidate repeat motifs of length ``n`` (intended for n >= 5).

    Like ``tri_gram_model`` this seeds motifs from frequent bigrams and
    scores them with a trigram model, but it additionally builds a 4x4
    base-transition (de Bruijn) count matrix and uses an estimated copy
    number plus per-edge transition support to pick the final motif(s).

    Returns a list of motif strings, or None when no convincing motif exists.
    """
    # Collect trigram counts, bigram counts/positions, and the 4x4
    # base-transition count matrix.
    tri_count = {}
    bi_count = {'AA': 0, 'AG': 0, 'AC': 0, 'AT': 0, 'GA': 0, 'GG': 0, 'GC': 0, 'GT': 0, 'CA': 0, 'CG': 0, 'CC': 0,
                'CT': 0, 'TA': 0, 'TG': 0, 'TC': 0, 'TT': 0}
    bi_pos = {'AA': [], 'AG': [], 'AC': [], 'AT': [], 'GA': [], 'GG': [], 'GC': [], 'GT': [], 'CA': [], 'CG': [],
              'CC': [], 'CT': [], 'TA': [], 'TG': [], 'TC': [], 'TT': []}
    base_de_bruijn = np.zeros((4, 4))
    base_mark = {'A': 0, 'G': 1, 'C': 2, 'T': 3}
    base_annotation = {0: 'A', 1: 'G', 2: 'C', 3: 'T'}
    for i in range(len(sequence) - 2):
        # Count the transition ending at position i (lags one window behind).
        if i > 0 and set(sequence[i - 1:i + 1]).issubset({'A', 'G', 'T', 'C'}):
            base_de_bruijn[base_mark[sequence[i - 1]]][base_mark[sequence[i]]] += 1
        # Skip windows containing any non-AGCT character (e.g. 'N').
        if not set(sequence[i:i + 3]).issubset({'A', 'G', 'T', 'C'}):
            continue
        if sequence[i:i + 3] not in tri_count:
            tri_count[sequence[i:i + 3]] = 1
        else:
            tri_count[sequence[i:i + 3]] += 1
        bi_pos[sequence[i:i + 2]].append(i)
        bi_count[sequence[i:i + 2]] += 1
        # The last trigram window also contributes its trailing bigram and
        # the two final transitions.
        if i == len(sequence) - 3:
            bi_pos[sequence[i + 1:i + 3]].append(i + 1)
            bi_count[sequence[i + 1:i + 3]] += 1
            base_de_bruijn[base_mark[sequence[i]]][base_mark[sequence[i + 1]]] += 1
            base_de_bruijn[base_mark[sequence[i + 1]]][base_mark[sequence[i + 2]]] += 1

    # Derive candidate motifs from the highest-frequency bigram seeds.
    sorted_seed = sorted(bi_count, key=lambda x: bi_count[x], reverse=True)
    if bi_count[sorted_seed[0]] < 2:
        return None

    # Grow candidates from the top bigram seeds (up to 16 seeds; give up
    # after that).
    motifs = set()
    k = 0
    while True:
        max_seed = sorted_seed[k]
        bipos_set = set(bi_pos[max_seed])

        for pos in bi_pos[max_seed]:
            # A seed repeated exactly n apart suggests a motif of length n.
            if pos + n in bipos_set:
                # Reject motifs that are themselves tandem repeats of a
                # shorter unit.
                if if_motif_is_tr(sequence[pos:pos + n]):
                    continue
                if not set(sequence[pos:pos + n]).issubset({'A', 'G', 'T', 'C'}):
                    continue
                motifs.add(sequence[pos:pos + n])
        if motifs and k == 2:
            break
        k += 1
        if k > 15:
            return None

    # Score candidate motifs with trigram conditional frequencies; keep
    # those whose score is within `arfa` of the best.
    tri_gram = {}
    for tri, _ in tri_count.items():
        if bi_count[tri[:2]] == 0:
            continue
        tri_gram[tri] = tri_count[tri] / bi_count[tri[:2]]
    p_rationality = []
    # NOTE: scores are accumulated while iterating the `motifs` set, and
    # indices are matched to list(motifs) below — this relies on the set's
    # iteration order being stable within a single run (it is, in CPython).
    for mot in motifs:
        p_r = 1
        for b in range(n - 2):
            p_r += math.log(tri_gram[mot[b:b + 3]])
        p_rationality.append(p_r)
    sorted_prationalty = sorted(p_rationality)
    max_prationalty = sorted_prationalty[-1]
    motifs_mar = []
    motifs = list(motifs)
    # Score tolerance widens with motif length (longer motifs accumulate
    # more log terms, so a larger gap from the best is still acceptable).
    arfa = 0
    if n == 5:
        arfa = 0.3
    elif n == 6:
        arfa = 0.4
    elif n == 7:
        arfa = 0.8
    else:
        arfa = 1
    for inde, mot in enumerate(motifs):
        if max_prationalty - p_rationality[inde] <= arfa:
            motifs_mar.append(mot)

    # Rank the surviving motifs by how strongly the transition matrix
    # supports them at the estimated copy number.
    copy = estimated_copy_number(sequence, n, base_de_bruijn)
    Frequency = []
    for mot in motifs_mar:
        Frequency.append(calculate_base_frequencye(mot, copy, base_de_bruijn, base_mark))

    max_frequency = max(Frequency)
    alternative_motif = []
    for index, value in enumerate(Frequency):
        if value == max_frequency:
            alternative_motif.append(str(motifs_mar[index]))

    # Deduplicate rotations: drop any motif that is a cyclic rotation of an
    # already-kept one (a rotation of f_m always appears inside f_m + f_m).
    final_motif = [alternative_motif[0]]
    if len(alternative_motif) == 1:
        return final_motif
    for a_m in alternative_motif[1:]:
        logo = 0
        for f_m in final_motif:
            if a_m in f_m + f_m:
                logo = 1
        if logo == 0:
            final_motif.append(a_m)
    # Too many survivors suggests the estimated copy number was off; retry
    # the same selection with the naive estimate len(sequence) // n.
    if len(final_motif) > 3:
        copy = len(sequence) // n
        Frequency = []
        for mot in motifs_mar:
            Frequency.append(calculate_base_frequencye(mot, copy, base_de_bruijn, base_mark))

        max_frequency = max(Frequency)
        alternative_motif = []
        for index, value in enumerate(Frequency):
            if value == max_frequency:
                alternative_motif.append(str(motifs_mar[index]))

        final_motif = [alternative_motif[0]]
        if len(alternative_motif) == 1:
            return final_motif
        for a_m in alternative_motif[1:]:
            logo = 0
            for f_m in final_motif:
                if a_m in f_m + f_m:
                    logo = 1
            if logo == 0:
                final_motif.append(a_m)

    return final_motif
334
+
335
+
336
def estimated_copy_number(sequence, n, base_de_bruijn):
    """Estimate the repeat copy number for a motif of length ``n``.

    Blends the naive estimate ``len(sequence) // n`` with a GCD-like
    subtraction reduction of the largest base-transition counts in
    ``base_de_bruijn`` (a 4x4 count matrix), averaging the two.

    NOTE(review): assumes 2 <= n <= 8 — for n < 2 the loop below never
    binds `small` (NameError), and `largest_eight[:n]` only holds the top
    eight counts. Callers appear to use n in the 3..7 range; confirm.
    """
    copy = len(sequence) // n
    # Flatten the 4x4 transition matrix to one dimension.
    flattened = base_de_bruijn.flatten()
    # Take the eight largest transition counts, in descending order.
    largest_eight = np.sort(flattened)[-8:][::-1]
    large = largest_eight[0]
    # Pick `small`: either the first count that drops at least `copy` below
    # the largest, or (if counts stay clustered / fall below `copy`) the
    # mean of the top n counts.
    for i in range(1, n):
        if i == n - 1 or largest_eight[i] < copy:
            small = math.ceil(sum(largest_eight[:n]) / n)
            break
        if large - largest_eight[i] >= copy:
            small = largest_eight[i]
            break

    # Euclid-style subtraction to find a common-divisor-like value of the
    # two representative counts, stopping once it falls below `copy` or can
    # be snapped to a multiple of `copy`.
    common_divisor = large - small
    max_common_divisor = min(common_divisor, small)
    while max_common_divisor >= copy:
        if small != max_common_divisor:
            common_divisor = max(max_common_divisor, small) - min(small, max_common_divisor)
            small = min(small, max_common_divisor)
            max_common_divisor = min(common_divisor, small)
        else:
            if max_common_divisor % copy == 0:
                max_common_divisor = copy
                break
            # Divide down to the nearest whole fraction of `copy`; the loop
            # guard guarantees round(...) >= 1 here, so no division by zero.
            max_common_divisor = math.ceil(max_common_divisor / round(max_common_divisor / copy))
            break
    # Final estimate: average of the reduced divisor and the naive estimate.
    return math.ceil(max_common_divisor * 0.5 + copy * 0.5)
365
+
366
+
367
def calculate_base_frequencye(motif, copy, base_de_bruijn, base_mark):
    """Score how well the base-transition counts support *motif*.

    For each adjacent base pair in *motif*, consume up to ``copy`` counts
    from the (copied) transition matrix. A pair with fewer than ``copy``
    counts still contributes its full count when it has at least 60% of
    ``copy``; otherwise it contributes nothing. Either way the edge is
    exhausted so repeated pairs cannot double-count. The caller's matrix
    is never modified.
    """
    counts = base_de_bruijn.copy()
    score = 0
    for prev_base, curr_base in zip(motif, motif[1:]):
        row = base_mark[prev_base]
        col = base_mark[curr_base]
        available = counts[row][col]
        if available >= copy:
            score += copy
            counts[row][col] = available - copy
        else:
            if available >= 0.6 * copy:
                score += available
            # Exhaust whatever partial support remains for this edge.
            if available > 0:
                counts[row][col] = 0
    return score
383
+
384
+
385
def get_copy_vector(motif, n, copy, base_de_bruijn, base_mark):
    """Mean squared deviation between per-edge copy estimates and *copy*.

    Counts how often each base transition occurs inside *motif*, divides
    the observed transition counts by those per-motif multiplicities to get
    a copy-number estimate per edge, and returns the mean squared deviation
    of those estimates from ``copy``.
    """
    motif_edges = np.zeros((4, 4))
    for i in range(1, n):
        motif_edges[base_mark[motif[i - 1]]][base_mark[motif[i]]] += 1

    estimates = [
        base_de_bruijn[row][col] / motif_edges[row][col]
        for row in range(4)
        for col in range(4)
        if motif_edges[row][col] > 0
    ]
    return np.mean((np.array(estimates) - copy) ** 2)
397
+
398
+
399
def find_most_frequent_substring_given(long_seq, four_mer_marker, k):
    """Find the most frequent length-``k`` substring of ``long_seq`` that
    belongs to the candidate set ``four_mer_marker``.

    Occurrences of the same substring are only counted when they start at
    least ``k`` positions after the previously counted one (i.e. they do
    not overlap). Returns ``(best_substring, positions)``; when no
    candidate substring occurs at all, returns ``("", [])`` (the original
    code raised KeyError in that case).
    """
    freq_map = {}       # substring -> non-overlapping occurrence count
    positions_map = {}  # substring -> list of counted start positions
    max_count = 0
    most_frequent_string = ""

    # Set for O(1) membership tests.
    candidates = set(four_mer_marker)

    # Slide a window of length k over the sequence.
    for i in range(len(long_seq) - k + 1):
        substring = long_seq[i:i + k]
        if substring not in candidates:
            continue
        if substring not in freq_map:
            freq_map[substring] = 1
            positions_map[substring] = [i]
        elif i - positions_map[substring][-1] >= k:
            # Count only non-overlapping occurrences.
            freq_map[substring] += 1
            positions_map[substring].append(i)

        if freq_map[substring] > max_count:
            max_count = freq_map[substring]
            most_frequent_string = substring

    # Robustness fix: no candidate matched -> empty result, not a KeyError.
    if not most_frequent_string:
        return "", []
    return most_frequent_string, positions_map[most_frequent_string]
@@ -0,0 +1,220 @@
1
+ Metadata-Version: 2.4
2
+ Name: FastSTR
3
+ Version: 1.0.0
4
+ Summary: FastSTR: A high-performance tool for short tandem repeat (STR) detection and analysis from genome assemblies.
5
+ Author-email: Wen Long <wenlong2024@mail.nwpu.edu.cn>
6
+ Maintainer-email: Wen Long <wenlong2024@mail.nwpu.edu.cn>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/XL-BioGroup/FastSTR
9
+ Project-URL: Repository, https://github.com/XL-BioGroup/FastSTR
10
+ Project-URL: Issues, https://github.com/XL-BioGroup/FastSTR/issues
11
+ Keywords: tandem repeat,STR,genome,bioinformatics,sequence alignment
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Operating System :: OS Independent
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: biopython==1.83
20
+ Requires-Dist: numpy==1.26.0
21
+ Requires-Dist: pandas==2.2.2
22
+ Requires-Dist: regex==2024.9.11
23
+ Requires-Dist: scikit-learn==1.5.2
24
+
25
+ # 🧬 FastSTR
26
+
27
+ **FastSTR** — Ultra-fast and accurate identification of Short Tandem Repeats (STRs) from long-read DNA sequences. Developed for genome-wide STR detection, consensus construction, and comparative STR analysis.
28
+
29
+ ---
30
+
31
+ ## 📘 Table of Contents
32
+ 1. [Overview](#overview)
33
+ 2. [Installation](#installation)
34
+ 3. [Quick Start](#quick-start)
35
+ 4. [Command Line Options](#command-line-options)
36
+ 5. [Input & Output](#input--output)
37
+ 6. [Usage](#usage)
38
+ 7. [Performance](#performance)
39
+ 8. [Citation](#citation)
40
+ 9. [License](#license)
41
+ 10. [Changelog](#changelog)
42
+
43
+
44
+ ---
45
+
46
+ ## 🌍 Overview
47
+
48
+ **FastSTR** is a novel and efficient tool for de novo detection of short tandem repeats (STRs) in genomic sequences. It combines fast motif recognition with accurate sequence alignment to achieve both high precision and completeness in STR identification. FastSTR is optimized for large-scale genomic datasets and enables rapid detection of repetitive elements without relying on predefined motif libraries or fixed repeat-length thresholds.
49
+
50
+ Compared to classical tools like **TRF**, **T-reks**, and **TRASH**, FastSTR achieves:
51
+
52
+ - ⚡ **High-speed parallel processing** — Processes genomic fragments in parallel, achieving **up to 10× faster runtime**.
53
+ - 🧠 **Context-aware motif recognition** — Uses an **N-gram + Markov** model to identify representative motifs without predefined motif libraries.
54
+ - 🧩 **Segmented global alignment** — Efficiently handles **ultra-long or complex STRs** while maintaining base-level precision.
55
+ - 🔍 **Smart interval merging** — Applies an **interval-gain decision** strategy to accurately resolve overlapping STRs.
56
+ - 🧬 **Enhanced detection in complex regions** — Identifies **confounding or nested repeat regions** (e.g., centromeric satellites) with a novel **density-based concentration test**.
57
+ - 💾 **Lightweight & scalable** — Requires few dependencies, easy to install and run, and supports **multiple operating systems**.
58
+
59
+ ---
60
+
61
+ ## ⚙️ Installation
62
+
63
+ ### Option 1: Install via `pip`
64
+
65
+ ```bash
66
+ pip install faststr
67
+ ```
68
+
69
+ ### Option 2: Install via `conda`
70
+ *(coming soon)*
71
+ ```bash
72
+ conda install -c bioconda faststr
73
+ ```
74
+
75
+ ### Option 3: Local installation (development)
76
+ ```bash
77
+ git clone https://github.com/XL-BioGroup/FastSTR.git
78
+ cd FastSTR
79
+ pip install -e .
80
+ ```
81
+
82
+ ---
83
+
84
+ ## 🚀 Quick Start
85
+
86
+ ### Basic Command
87
+
88
+ ```bash
89
+ faststr [--strict | --normal | --loose] [--default] genome.fa
90
+ ```
91
+
92
+ ### Example
93
+
94
+ ```bash
95
+ faststr --strict --default genome.fa
96
+ ```
97
+
98
+ This runs FastSTR in **strict mode** using the **default model** to identify STRs in the `genome.fa` file.
99
+
100
+ ---
101
+
102
+ ## ⚡ Command Line Options
103
+
104
+ | Argument | Type | Default | Description |
105
+ |-----------|------|----------|-------------|
106
+ | `match` | int | 2 | Match score |
107
+ | `mismatch` | int | 5 | Mismatch score |
108
+ | `gap_open` | int | 7 | Gap opening penalty |
109
+ | `gap_extend` | int | 3 | Gap extension penalty |
110
+ | `p_indel` | int | 15 | Indel percentage threshold |
111
+ | `p_match` | int | 80 | Match percentage threshold |
112
+ | `score` | int | 50 | Alignment score threshold |
113
+ | `quality_control` | bool | False | Enable read-level quality control |
114
+ | `DNA_file` | str | — | Path to DNA FASTA input |
115
+ | `-f` | str | — | Output directory |
116
+ | `-s` | int | 1 | Start index |
117
+ | `-e` | int | 0 | End index |
118
+ | `-l` | int | 15000 | Sub-read length |
119
+ | `-o` | int | 1000 | Overlap length |
120
+ | `-p` | int | 1 | Number of CPU cores |
121
+ | `-b` | float | 0.045 | Motif coverage threshold |
122
+
123
+ ---
124
+
125
+ ## 🧠 Alignment Modes
126
+
127
+ | Mode | Description |
128
+ |------|--------------|
129
+ | `--strict` | High precision, recommended for curated assemblies |
130
+ | `--normal` | Balanced mode, suitable for most datasets |
131
+ | `--loose` | High sensitivity, tolerant of mismatches |
132
+
133
+ ---
134
+
135
+ ## 🧬 Model Presets
136
+
137
+ | Preset | Description |
138
+ |---------|-------------|
139
+ | `--default` | Standard scoring model |
140
+ | *(future)* `--sensitive` | Optimized for noisy long reads |
141
+ | *(future)* `--speed` | Optimized for large-scale detection |
142
+
143
+ ---
144
+
145
+ ## 📥 Input & Output
146
+
147
+ ### Input
148
+ - DNA sequences in **FASTA** format
149
+
150
+ ### Output
151
+ | File Pattern | Description |
152
+ |----------------|-------------|
153
+ | `*detail.dat` | Contains all STR positions and motifs, quality statistics for each STR, and STR counts per chromosome. |
154
+ | `*align.dat` | Detailed alignment of all STRs against reference STRs, including mismatches and indels. |
155
+ | `*.csv` | Merged STR intervals with representative motifs and summary statistics for each interval. |
156
+ | `*.log` | Processing logs. |
157
+
158
+ ---
159
+
160
+ ## 🧪 Usage
161
+
162
+ ### 1️⃣ Identify STRs in a genome
163
+ ```bash
164
+ faststr --normal --default human_genome.fa
165
+ ```
166
+
167
+ ### 2️⃣ Use multiple cores
168
+ ```bash
169
+ faststr --strict --default genome.fa -p 8
170
+ ```
171
+
172
+ ---
173
+
174
+ ## 📈 Performance
175
+
176
+ | Dataset | Genome Size | Tool | Runtime | Recall | Precision |
177
+ |--------------------|------------|--------|-----------|--------|-----------|
178
+ | Human (T2T) | 2.94 G | TRF | 18 h 31 min | - | - |
179
+ | | | FastSTR| 1 h 13 min | 0.950 | 0.994 |
180
+ | Mouse (GRCm39) | 2.57 G | TRF | 1 h 41 min | - | - |
181
+ | | | FastSTR| 38 min | 0.966 | 0.997 |
182
+ | Zebrafish (GRCz11) | 1.58 G | TRF | 2 h 51 min | - | - |
183
+ | | | FastSTR| 25 min | 0.945 | 0.998 |
184
+
185
+ *Note: TRF results are used as the ground truth. FastSTR was run with 72 CPU cores.*
186
+
187
+
188
+ ---
189
+
190
+ ## 📚 Citation
191
+
192
+ If you use **FastSTR** in your research, please cite:
193
+
194
+ > Xingyu Liao *et al.*,
195
+ > **Efficient Identification of Short Tandem Repeats via Context-Aware Motif Discovery and Ultra-Fast Sequence Alignment**,
196
+ > *Nat. Methods*, 2025.
197
+
198
+ ---
199
+
200
+
201
+ ## 📄 License
202
+
203
+ This project is licensed under the **MIT License**.
204
+ See [LICENSE](LICENSE) for more details.
205
+
206
+ ---
207
+
208
+ ## 🧾 Changelog
209
+
210
+ ### v1.0.0 (2025)
211
+ - Initial release of FastSTR
212
+ - Supports three alignment modes and one default model
213
+ - Implemented parallel computation
214
+ - Added `.csv`, `.dat`, `.log` outputs
215
+ ---
216
+
217
+
218
+
219
+
220
+
@@ -0,0 +1,13 @@
1
+ faststr/Segmented_Global_Alignment.py,sha256=iqPhqCNyL2N52sZwCZr5gSzdUTgmpQ_vunszW-4yGfM,26165
2
+ faststr/__init__.py,sha256=-brodmEr8X1HxTfVQ7g5BXYbHDRWV52VGpxfoAlXLAI,46
3
+ faststr/get_subread_trs.py,sha256=B3Yzk9_gxNq1oVDLnx2O4zoSMe3pjD7Ax_tfDQOIe9k,1325
4
+ faststr/main.py,sha256=zkIH0faI4zj9P3CBkoWK0UpPV35GQhGyYZCFBMafhDg,16841
5
+ faststr/make_consensus_TRs.py,sha256=GtixIOCCv_CtmPw0XrzVUHXRAKE7mZD7OpAh5ZUIOvg,20274
6
+ faststr/scan_subread.py,sha256=v7yI3h1SKPnus2EIh2HGP17L40ZvTzq62VBUQioN41s,18030
7
+ faststr/trs_align.py,sha256=wJoOqGqeAd5tyeblB5qbonMF6pmvG2QVI-ckM4MSFz8,37750
8
+ faststr/utils.py,sha256=KZO7qUcRnD-CWZUuoCHoIGECYw9gdx_nbNF8kol0vkQ,16018
9
+ faststr-1.0.0.dist-info/METADATA,sha256=qKCq0XkVMVSMu70jVBD5nvBgf03D8R0BbgLdAZ8h9Cc,7093
10
+ faststr-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ faststr-1.0.0.dist-info/entry_points.txt,sha256=U2IgRO55zTgGvY5CxxQd05GEK7OdgUptGzXOWtsWrH4,46
12
+ faststr-1.0.0.dist-info/top_level.txt,sha256=289H-XYZbLnN6h63Kc0OZw0tSZ-RMaSovsW-MP8kkk4,8
13
+ faststr-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ faststr = faststr.main:main
@@ -0,0 +1 @@
1
+ faststr