telomore 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telomore/__init__.py +5 -0
- telomore/_version.py +34 -0
- telomore/app.py +536 -0
- telomore/utils/__init__.py +1 -0
- telomore/utils/arg_parser.py +220 -0
- telomore/utils/classes_and_small_func.py +289 -0
- telomore/utils/cmd_tools.py +732 -0
- telomore/utils/fasta_tools.py +595 -0
- telomore/utils/map_tools.py +1333 -0
- telomore/utils/qc_reports.py +493 -0
- telomore-0.4.1.dist-info/METADATA +149 -0
- telomore-0.4.1.dist-info/RECORD +15 -0
- telomore-0.4.1.dist-info/WHEEL +4 -0
- telomore-0.4.1.dist-info/entry_points.txt +2 -0
- telomore-0.4.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1333 @@
|
|
|
1
|
+
"""Functions for handling read mappings and extracting terminal reads."""
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from Bio import SeqIO
|
|
9
|
+
from Bio.Seq import Seq
|
|
10
|
+
from Bio.SeqRecord import SeqRecord
|
|
11
|
+
import pysam
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _write_matching_fastq_records(
    fastq_in: Path, fastq_out: Path, read_ids: set
) -> None:
    """Copy records whose ID is in read_ids from a gzipped FASTQ to fastq_out."""
    with (
        gzip.open(fastq_in, 'rt') as gzip_handle,
        open(fastq_out, 'w') as outfile,
    ):
        selected = (
            record
            for record in SeqIO.parse(gzip_handle, 'fastq')
            if record.id in read_ids
        )
        # A single streaming write; FASTQ has no header/footer, so the output
        # is the same as writing the records one at a time.
        SeqIO.write(selected, outfile, 'fastq')


def sam_to_readpair(
    sam_in: Path, fastq_in1: Path, fastq_in2: Path, fastq_out1: Path, fastq_out2: Path
) -> None:
    """
    Extract complete read pairs from paired-end FASTQ files based on a SAM file.

    Every read name found in the SAM file is looked up in both the R1 and the
    R2 FASTQ file, so a pair is written out when either mate was aligned.

    Parameters
    ----------
    sam_in : Path
        Path to input SAM alignment file.
    fastq_in1 : Path
        Path to forward/R1 FASTQ file (gzip compressed).
    fastq_in2 : Path
        Path to reverse/R2 FASTQ file (gzip compressed).
    fastq_out1 : Path
        Path for output R1 FASTQ file (plain text).
    fastq_out2 : Path
        Path for output R2 FASTQ file (plain text).

    Returns
    -------
    None
        Writes extracted reads to fastq_out1 and fastq_out2.

    Notes
    -----
    - Read names are truncated at the first space before being collected.
    - Names are kept in a set for constant-time membership tests.
    - Matching is done on the FASTQ record ID as parsed by Biopython.
    - The alignment file is closed before the FASTQ files are scanned.
    """
    # First pass: collect the names of all reads present in the SAM file.
    with pysam.AlignmentFile(sam_in) as samfile:
        reads_to_grep = {
            read.query_name.split(' ')[0] for read in samfile.fetch(until_eof=True)
        }

    # Second pass: the same filter is applied to each mate file.
    _write_matching_fastq_records(fastq_in1, fastq_out1, reads_to_grep)
    _write_matching_fastq_records(fastq_in2, fastq_out2, reads_to_grep)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def sam_to_fastq(sam_in: Path, fastq_out: Path) -> None:
    r"""
    Convert a SAM alignment file to FASTQ, excluding unmapped reads.

    Parameters
    ----------
    sam_in : Path
        Path to input SAM alignment file.
    fastq_out : Path
        Despite the annotation, this must be an open, writable text file
        handle: the function only calls ``fastq_out.write``.

    Returns
    -------
    None
        Writes FASTQ records to the provided file handle.

    Notes
    -----
    - Reads flagged as unmapped are skipped.
    - Reads without a stored sequence (``query_sequence`` is None) are
      skipped; there is nothing to write for them and they would otherwise
      fail on ``len(None)`` or emit the literal text 'None'.
    - If quality scores are missing, a default 'I' (Q40) is used per base.
    - Record layout: @name\nseq\n+\nqual\n
    """
    with pysam.AlignmentFile(sam_in, 'r') as samfile:
        for read in samfile.fetch(until_eof=True):
            if read.is_unmapped:
                continue

            seq = read.query_sequence
            if seq is None:  # skip empty reads
                continue

            qual = read.qual
            if qual is None:
                qual = 'I' * len(seq)  # Assign a default high-quality score

            # Write the read in FASTQ format to the provided handle
            fastq_out.write(f'@{read.query_name}\n{seq}\n+\n{qual}\n')
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def mapped_bases(cigarstring: str) -> int:
    """
    Count the reference bases covered by an alignment, given its CIGAR string.

    Parameters
    ----------
    cigarstring : str
        CIGAR string from a SAM alignment (e.g., '100M5S', '50M2D50M').

    Returns
    -------
    int
        Sum of the lengths of all reference-consuming operations.

    Notes
    -----
    Counted operations (consume reference): M, D, N, X, =.
    Ignored operations: S, I, H, P.

    Text that does not form a ``<digits><op>`` pair is not matched by the
    parser and therefore contributes nothing; an empty string yields 0.
    """
    reference_consuming = 'MDNX='

    # Each match is a (length, operation) pair of strings, e.g. ('121', 'S').
    return sum(
        int(run_length)
        for run_length, op_code in re.findall(r'(\d+)([MIDNSHP=X])', cigarstring)
        if op_code in reference_consuming
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def cigar_maps_more_bases(cigar1: str, cigar2: str) -> 'bool | None':
    """
    Tell whether the first CIGAR string covers more reference bases.

    Parameters
    ----------
    cigar1 : str
        First CIGAR string to compare.
    cigar2 : str
        Second CIGAR string to compare.

    Returns
    -------
    bool or None
        True if cigar1 maps more reference bases than cigar2, False if it
        maps fewer, None if both map the same number.

    Notes
    -----
    Both strings are scored with ``mapped_bases``. A tie returns None, which
    is falsy, so a caller testing the result with ``if`` treats a tie the
    same way as "cigar1 is not better".
    """
    bases1 = mapped_bases(cigar1)
    bases2 = mapped_bases(cigar2)

    if bases1 == bases2:
        # Explicit tie result (previously an implicit fall-through).
        return None
    return bases1 > bases2
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _best_alignment_per_read(reads) -> dict:
    """Return {query_name: alignment}, keeping the alignment mapping most bases."""
    best = {}
    for read in reads:
        if read.query_sequence is None:  # skip empty reads
            continue

        prior = best.get(read.query_name)
        # A later alignment replaces the stored one only when it maps strictly
        # more reference bases; ties keep the alignment seen first.
        if prior is None or cigar_maps_more_bases(read.cigarstring, prior.cigarstring):
            best[read.query_name] = read
    return best


def get_terminal_reads(
    sorted_bam_file: Path, contig: Path, loutput_handle: Path, routput_handle: Path
) -> None:
    """
    Extract reads overlapping the first or last 20 bases of a contig.

    Parameters
    ----------
    sorted_bam_file : Path
        Path to the alignment file; region fetching is used on it.
    contig : Path
        Name/ID of the contig to extract terminal reads from.
    loutput_handle : Path
        Path for the output file receiving left-terminal reads.
    routput_handle : Path
        Path for the output file receiving right-terminal reads.

    Returns
    -------
    None
        Writes left-terminal reads to loutput_handle and right-terminal
        reads to routput_handle, both using the input file's header.

    Notes
    -----
    - Left region: positions 0-20; right region: (length - 20) to length.
      The right-region start is clamped to 0 so that contigs shorter than
      20 bases do not produce a negative coordinate.
    - If a read name occurs several times within one region, only the
      alignment covering the most reference bases is kept.
    - Reads with no stored sequence are skipped.
    - All files are managed by context managers so they are closed even if
      an error occurs part-way through.
    """
    window = 20

    with pysam.AlignmentFile(sorted_bam_file, 'r') as bam_in:
        seq_end = bam_in.get_reference_length(contig)

        # Each region is fetched and fully consumed before the next fetch.
        lterminal_reads = _best_alignment_per_read(
            bam_in.fetch(contig, start=0, stop=window)
        )
        rterminal_reads = _best_alignment_per_read(
            bam_in.fetch(contig, start=max(0, seq_end - window), stop=seq_end)
        )

        # Write the selected reads, re-using the input header as template.
        with pysam.AlignmentFile(loutput_handle, 'w', template=bam_in) as lterminal_file:
            for read in lterminal_reads.values():
                lterminal_file.write(read)

        with pysam.AlignmentFile(routput_handle, 'w', template=bam_in) as rterminal_file:
            for read in rterminal_reads.values():
                rterminal_file.write(read)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def get_left_soft(sam_file: Path, left_out: Path, offset: int = 0) -> None:
    r"""
    Extract reads whose 5' soft-clip extends beyond the reference start.

    Parameters
    ----------
    sam_file : Path
        Path to input SAM alignment file.
    left_out : Path
        Base path for output files; '.sam' and '.fastq' are appended.
        Both str and Path values are accepted.
    offset : int, default=0
        Additional bases to include beyond the soft-clipped region.

    Returns
    -------
    None
        Creates two output files:
        - {left_out}.sam: Full alignment records.
        - {left_out}.fastq: Soft-clipped sequences only.

    Notes
    -----
    Filtering logic:
    - The CIGAR string must start with a soft-clip: ^(\d+)S
    - The soft-clip length must be greater than reference_start.
    - Reads without a CIGAR string or without a stored sequence are skipped
      (they cannot be evaluated or sliced).

    FASTQ output contains:
    - Sequence: bases [0:clip_num+offset] of the read (never a negative end).
    - Quality: Phred scores converted to Sanger ASCII (Q+33); if the read
      carries no qualities, 'I' (Q40) is used for every base.
    """
    start_clip = re.compile(r'^(\d+)S')

    # f-strings build the output names so that Path arguments work as well
    # as str ones (Path + str raises TypeError).
    with (
        pysam.AlignmentFile(sam_file, 'r') as sam_in,
        pysam.AlignmentFile(f'{left_out}.sam', 'w', template=sam_in) as lclip,
        open(f'{left_out}.fastq', 'w') as lfastq,
    ):
        for read in sam_in:
            cigar = read.cigarstring
            sequence = read.query_sequence
            if cigar is None or sequence is None:
                continue

            lmatch = start_clip.match(cigar)
            if lmatch is None:
                continue

            clip_num = int(lmatch.group(1))  # digits are retrieved via .group
            if clip_num <= read.reference_start:
                continue  # clipped part does not reach past position 0

            lclip.write(read)  # write to sam-file

            # get info for fastq-file
            end = max(clip_num + offset, 0)
            seq = sequence[:end]
            qualities = read.query_qualities
            if qualities is None:
                sanger_qual = 'I' * len(seq)
            else:
                # phred qual converted to ASCII with 33 offset
                sanger_qual = ''.join(chr(q + 33) for q in qualities[:end])
            lfastq.write(f'@{read.query_name}\n{seq}\n+\n{sanger_qual}\n')
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def get_right_soft(
    sam_file: Path, contig: Path, right_out: Path, offset: int = 0
) -> None:
    r"""
    Extract reads whose 3' soft-clip extends beyond the reference end.

    Parameters
    ----------
    sam_file : Path
        Path to input SAM alignment file.
    contig : Path
        Name/ID of the contig whose length defines the reference end.
    right_out : Path
        Base path for output files; '.sam' and '.fastq' are appended.
        Both str and Path values are accepted.
    offset : int, default=0
        Additional bases to include beyond the soft-clipped region.

    Returns
    -------
    None
        Creates two output files:
        - {right_out}.sam: Full alignment records.
        - {right_out}.fastq: Soft-clipped sequences only.

    Notes
    -----
    Filtering logic:
    - The CIGAR string must end with a soft-clip: (\d+)S$
    - (clip_length + reference_end) must be greater than the contig length.
    - Reads without a CIGAR string or without a stored sequence are skipped
      (they cannot be evaluated or sliced).

    FASTQ output contains:
    - Sequence: last (clip_num+offset) bases of the read; empty when that
      count is not positive.
    - Quality: Phred scores converted to Sanger ASCII (Q+33); if the read
      carries no qualities, 'I' (Q40) is used for every base.
    """
    end_clip = re.compile(r'(\d+)S$')

    # f-strings build the output names so that Path arguments work as well
    # as str ones (Path + str raises TypeError).
    with (
        pysam.AlignmentFile(sam_file, 'r') as sam_in,
        pysam.AlignmentFile(f'{right_out}.sam', 'w', template=sam_in) as rclip,
        open(f'{right_out}.fastq', 'w') as rfastq,
    ):
        seq_end = sam_in.get_reference_length(contig)  # get length of reference

        for read in sam_in:
            cigar = read.cigarstring
            sequence = read.query_sequence
            if cigar is None or sequence is None:
                continue

            rmatch = end_clip.search(cigar)
            if rmatch is None:
                continue

            clip_num = int(rmatch.group(1))  # digits are retrieved via .group
            if clip_num + read.reference_end <= seq_end:
                continue  # clipped part does not reach past the contig end

            rclip.write(read)  # write to sam-file

            # get info for fastq-file
            tail = clip_num + offset
            qualities = read.query_qualities
            if tail > 0:
                seq = sequence[-tail:]
                tail_quals = None if qualities is None else qualities[-tail:]
            else:
                # [-0:] would select the whole read instead of nothing.
                seq = ''
                tail_quals = None if qualities is None else []

            if tail_quals is None:
                sanger_qual = 'I' * len(seq)
            else:
                # phred qual converted to ASCII with 33 offset
                sanger_qual = ''.join(chr(q + 33) for q in tail_quals)
            rfastq.write(f'@{read.query_name}\n{seq}\n+\n{sanger_qual}\n')
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def revcomp_reads(reads_in: str, reads_out: str) -> None:
    """
    Write the reverse complement of every read in a FASTQ file.

    Parameters
    ----------
    reads_in : str
        Path to input FASTQ file.
    reads_out : str
        Path for output reverse-complemented FASTQ file.

    Returns
    -------
    None
        Writes reverse-complemented reads to reads_out.

    Notes
    -----
    For each record:
    - the sequence is replaced by its reverse complement,
    - the Phred quality list is reversed to follow the new orientation,
    - the ID gets a 'rev_' prefix.

    Records are modified in place (no copy is made), so any other record
    fields are carried over unchanged.
    """

    def _flip(record):
        # Compute both replacements from the untouched record first.
        flipped_seq = record.seq.reverse_complement()
        flipped_quals = record.letter_annotations['phred_quality'][::-1]

        record.id = 'rev_' + str(record.id)
        # The new sequence has the same length, so it can be assigned while
        # the per-letter annotations are still attached.
        record.seq = flipped_seq
        record.letter_annotations['phred_quality'] = flipped_quals
        return record

    with open(reads_in, 'r') as source, open(reads_out, 'w') as sink:
        for original in SeqIO.parse(source, 'fastq'):
            SeqIO.write(_flip(original), sink, 'fastq')
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def revcomp(fasta_in: str, fasta_out: str) -> None:
    """
    Write the reverse complement of every sequence in a FASTA file.

    Parameters
    ----------
    fasta_in : str
        Path to input FASTA file.
    fasta_out : str
        Path for output reverse-complemented FASTA file.

    Returns
    -------
    None
        Writes reverse-complemented sequences to fasta_out.

    Notes
    -----
    For each record the sequence is replaced by its reverse complement and
    the ID gets a 'rev_' prefix. Records are modified in place, so any other
    record fields are carried over unchanged.
    """
    with open(fasta_in, 'r') as source, open(fasta_out, 'w') as sink:
        for entry in SeqIO.parse(source, 'fasta'):
            flipped_seq = entry.seq.reverse_complement()

            entry.id = 'rev_' + str(entry.id)
            entry.seq = flipped_seq

            # Write the reverse-complemented record to the output FASTA file
            SeqIO.write(entry, sink, 'fasta')
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def is_map_empty(file_path: str) -> bool:
    """
    Report whether a BAM file contains no reads at all.

    Parameters
    ----------
    file_path : str
        Path to BAM file to check.

    Returns
    -------
    bool
        True if the file yields no read, False if at least one read exists.

    Notes
    -----
    Only the first record is requested, so the rest of the file is not
    read.
    """
    nothing = object()  # sentinel returned when the file yields no record

    with pysam.AlignmentFile(file_path, 'rb') as alignment_file:
        first_read = next(alignment_file, nothing)
        return first_read is nothing
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def is_consensus_unmapped(file_path: str) -> bool:
    """
    Report whether every read in a BAM file is flagged as unmapped.

    Parameters
    ----------
    file_path : str
        Path to BAM file to check.

    Returns
    -------
    bool
        True if all reads are unmapped or the file holds no reads,
        False as soon as one mapped read is found.

    Notes
    -----
    All records are loaded into a list before being inspected, so this is
    intended for small files.
    """
    with pysam.AlignmentFile(file_path, 'rb') as alignment_file:
        records = list(alignment_file)  # get reads

        # all() is True for an empty list and stops at the first mapped read.
        return all(record.is_unmapped for record in records)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def is_consensus_empty(file_path: str) -> bool:
    """
    Report whether a BAM file holds a single unmapped, sequence-less read.

    Parameters
    ----------
    file_path : str
        Path to BAM file to check.

    Returns
    -------
    bool
        True only when all three conditions hold:
        1. the file contains exactly one read,
        2. that read is flagged as unmapped,
        3. that read has no sequence (empty/None or '*').
        False in every other case.

    Notes
    -----
    All records are loaded into a list before being counted.
    """
    with pysam.AlignmentFile(file_path, 'rb') as alignment_file:
        records = list(alignment_file)  # Load all reads into a list

        if len(records) != 1:
            return False  # zero or several reads

        only_read = records[0]
        if not only_read.is_unmapped:
            return False  # the single read is mapped

        lacks_sequence = not only_read.seq or only_read.seq == '*'
        return True if lacks_sequence else False
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
def stitch_telo(
    ref: str,
    left_map: str,
    right_map: str,
    outfile: str,
    logout: str,
    tmp_left: str,
    tmp_right: str,
) -> tuple[int, int]:
    """
    Extend reference sequence with consensus sequences from terminal alignments.

    Extracts soft-clipped portions of consensus sequences that extend beyond
    the reference ends, attaches them to the reference, and creates a log
    documenting the extension process. Handles cases where consensus is empty,
    unmapped, or doesn't extend beyond reference.

    Parameters
    ----------
    ref : str
        Path to reference FASTA file.
    left_map : str
        Path to BAM file with left consensus aligned to reference.
    right_map : str
        Path to BAM file with right consensus aligned to reference.
    outfile : str
        Path for output extended FASTA file.
    logout : str
        Path for output log file documenting extension.
    tmp_left : str
        Path for temporary left consensus FASTA file.
    tmp_right : str
        Path for temporary right consensus FASTA file.

    Returns
    -------
    tuple of (int, int)
        Length of left consensus and length of right consensus in bases.

    Notes
    -----
    Left consensus processing:
    - Keeps alignments with reference_start < 1000
    - Looks for a leading soft-clip (CIGAR starting with ``<n>S``)
    - Overhang = soft-clip length - reference_start, i.e. only the clipped
      bases that fall before reference position 0 are kept
    - Logs if consensus is empty, unmapped, or doesn't extend reference

    Right consensus processing:
    - Keeps alignments with reference_start > 1000
    - Looks for a trailing soft-clip (CIGAR ending with ``<n>S``)
    - Overhang = soft-clip length - (reference length - reference_end)
    - Logs if consensus is empty, unmapped, or doesn't extend reference

    Alignments without a CIGAR string are skipped.

    The output file contains: left_consensus + original_reference + right_consensus

    Empty SeqRecord objects are created when no extending sequence was found,
    allowing the workflow to continue without breaking on concatenation.

    Log file format includes:
    - Section header
    - Consensus lengths (fourth line, tab separated)
    - Messages explaining why a consensus was not used
    - Full consensus sequences
    """
    # ---- left side -------------------------------------------------------
    left_log_mes = ''
    # Stays empty whenever no usable left extension is found
    left_seqs = []
    if is_consensus_empty(left_map):
        left_log_mes = '#No consensus produced for left-side end. Likely, no reads extends the assembly. '
    elif is_consensus_unmapped(left_map):
        left_log_mes = f'#The consensus produced for the left-side does not map to left-side of {ref}'
    else:
        start_clip = r'^(\d+)S'  # CIGAR beginning with a soft-clip
        with pysam.AlignmentFile(left_map, 'r') as l_sam_in:
            for read in l_sam_in:
                # filter away mappings that do not start near the left end
                if not read.reference_start < 1000:
                    continue
                if read.cigarstring is None:  # no alignment information
                    continue
                lmatch = re.match(start_clip, read.cigarstring)
                if not lmatch:
                    continue
                clip_num = int(lmatch.group(1))

                # Only clipped bases lying before reference position 0 extend
                # the genome; the remainder of the clip overlaps the reference
                overhang = clip_num - read.reference_start
                if overhang <= 0:
                    left_log_mes = f'#The consensus produced for the left-side does not extend beyond the start of {ref}'
                    left_seqs = []
                else:
                    left_seqs.append(read.query_sequence[0:overhang])

    # ---- right side ------------------------------------------------------
    right_log_mes = ''
    right_seqs = []
    # Check if an empty right consensus was used to generate the map:
    if is_consensus_empty(right_map):
        right_log_mes = '#No consensus produced for right-side end. Likely, no reads extends the assembly.'
    elif is_consensus_unmapped(right_map):
        right_log_mes = f'#The consensus produced for the right-side does not map to the right-side of {ref}'
    else:
        end_clip = r'(\d+)S$'  # CIGAR ending with a soft-clip
        clipped_without_overhang = False
        with pysam.AlignmentFile(right_map, 'r') as r_sam_in:
            seq_end = r_sam_in.lengths[0]  # length of the (first) reference
            for read in r_sam_in:
                # filter away mappings at the left side
                if not read.reference_start > 1000:
                    continue
                if read.cigarstring is None:  # no alignment information
                    continue
                rmatch = re.search(end_clip, read.cigarstring)
                if not rmatch:
                    continue
                clip_num = int(rmatch.group(1))
                # Reference bases remaining after the aligned part; that many
                # clipped bases overlap the reference rather than extend it
                adj = seq_end - read.reference_end
                if clip_num + read.reference_end > seq_end:
                    right_seqs.append(read.query_sequence[-(clip_num - adj) :])
                else:
                    clipped_without_overhang = True
        if clipped_without_overhang and not right_seqs:
            right_log_mes = f'#The consensus produced for the right-side does not extend beyond the end of {ref}'

    # ---- stitch the pieces together --------------------------------------
    genome = SeqIO.read(ref, 'fasta')

    # An empty SeqRecord keeps the concatenation below valid when a side
    # has nothing to add
    if len(left_seqs) == 0:
        left_cons = SeqRecord(Seq(''))
        logging.info('Left consensus does not extend genome')
    else:
        left_cons = SeqRecord(Seq(left_seqs[0]), id='left_cons')
        logging.info(f'Left consensus is {len(left_cons)}')

    if len(right_seqs) == 0:
        right_cons = SeqRecord(Seq(''))
        logging.info('Right cons does not extend genome')
    else:
        right_cons = SeqRecord(Seq(right_seqs[0]), id='right_cons')

    logging.info(f'Right consensus is {len(right_cons)}')

    new_genome = left_cons + genome + right_cons
    new_genome.id = 'Reference_with_consensus_attached'
    new_genome.description = ''
    SeqIO.write(new_genome, outfile, 'fasta')
    SeqIO.write(left_cons, tmp_left, 'fasta')
    SeqIO.write(right_cons, tmp_right, 'fasta')

    # ---- log of consensus lengths ----------------------------------------
    # The lengths line is the fourth line of the file; the trimming functions
    # read it back by position, so the layout must not change.
    log_content = '\nleft_cons:{}\tright_consensus:{}'.format(
        len(left_cons), len(right_cons)
    )
    comment_mes = '\n' + '\n'.join([left_log_mes, right_log_mes])
    log_content = log_content + comment_mes
    with open(logout, 'w') as log:
        log.write(
            '=============================================================================='
        )
        log.write('\nINTIAL CONSENSUS')
        log.write(
            '\n=============================================================================='
        )
        log.write(log_content)
        log.write('\n>left_cons\n')
        log.write(str(left_cons.seq))
        log.write('\n>right_cons\n')
        log.write(str(right_cons.seq))

    return (len(left_cons), len(right_cons))
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
def get_support_info(
    bam_file: str, genome: str, position: int, qual_threshold: int = 1
) -> tuple[int, int]:
    """
    Calculate coverage and reference-matching bases at a specific position.

    Determines read support at a genomic position by counting total coverage
    and the number of bases matching the reference. Used to validate consensus
    sequence quality by assessing read support at each position.

    Parameters
    ----------
    bam_file : str
        Path to BAM alignment file.
    genome : str
        Path to reference FASTA file (must contain a single record).
    position : int
        Zero-based position to query.
    qual_threshold : int, default=1
        Minimum base quality score to include in counts.

    Returns
    -------
    tuple of (int, int)
        (coverage, matching_bases) where:
        - coverage: Total number of A/C/G/T bases counted at this position.
        - matching_bases: Number of those bases matching the reference.

    Notes
    -----
    Base counting:
    - Counts A, C, G, T bases separately at the position, on the first
      reference sequence of the BAM file
    - Passes qual_threshold on as the quality threshold of the count
    - Uses read_callback='nofilter' so that no reads are filtered out
      (e.g. secondary mappings are included)
    - Sums all four counts for total coverage

    Reference matching:
    - The reference base is compared case-insensitively
    - If the reference base is A, C, G or T: matching_bases is the count
      of that base
    - For 'N' or any other symbol (e.g. IUPAC ambiguity codes):
      matching_bases = 0

    The BAM file is opened and closed on every call.
    """
    fasta_file = SeqIO.read(genome, 'fasta')

    # The context manager guarantees the handle is released even if
    # count_coverage raises; this function is called once per position.
    with pysam.AlignmentFile(bam_file, 'rb') as bam_in:
        # Get reference name from BAM file
        reference_name = bam_in.get_reference_name(0)
        coverage_count = bam_in.count_coverage(
            reference_name,
            start=position,
            stop=position + 1,
            read_callback='nofilter',
            quality_threshold=qual_threshold,
        )

    # count_coverage returns four arrays in the order A, C, G, T; the
    # queried window is one base wide, so element 0 is the count.
    base_counts = {
        base: counts[0] for base, counts in zip('ACGT', coverage_count)
    }
    cov = sum(base_counts.values())

    # Anything that is not a plain A/C/G/T (N, ambiguity codes, gaps)
    # cannot be matched and therefore has zero supporting bases.
    ref_base = str(fasta_file.seq[position]).upper()
    matching_bases = base_counts.get(ref_base, 0)

    return (cov, matching_bases)
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def trim_by_map(
    untrimmed_assembly: str,
    sorted_bam_file: str,
    output_handle: str,
    cons_log: str,
    cov_thres: int = 5,
    ratio_thres: float = 0.7,
    qual_thres: int = 0,
) -> None:
    """
    Trim consensus extensions based on read support thresholds (Nanopore).

    Validates attached consensus sequences by trimming from the ends inward
    until finding positions with sufficient coverage and reference support.
    Removes unsupported consensus bases while retaining well-supported extensions.

    Parameters
    ----------
    untrimmed_assembly : str
        Path to FASTA file with untrimmed consensus attached.
    sorted_bam_file : str
        Path to sorted BAM of terminal reads aligned to untrimmed assembly.
    output_handle : str
        Path for output trimmed FASTA file.
    cons_log : str
        Path to existing log file (will be appended with trimming info).
    cov_thres : int, default=5
        Minimum coverage depth required to keep a position.
    ratio_thres : float, default=0.7
        Fraction of bases matching the reference that must be exceeded
        to keep a position.
    qual_thres : int, default=0
        Minimum base quality score to include in coverage calculation.

    Returns
    -------
    None
        Writes trimmed assembly to output_handle and appends to cons_log.

    Notes
    -----
    Trimming algorithm:
    1. Reads the consensus lengths from the fourth line of the log file
       (``left_cons:<n>\\tright_consensus:<n>``)
    2. Left end: scans positions 0 .. left_length - 1
       - Stops at first position meeting coverage and ratio thresholds
       - Trims all bases before this position
    3. Right end: scans from the last position inward over right_length
       positions
       - Stops at first position meeting coverage and ratio thresholds
       - Trims all bases after this position

    Validation criteria for a position:
    - Coverage >= cov_thres (and coverage > 0)
    - (matching_bases / coverage) > ratio_thres
    - Only bases with quality >= qual_thres are counted

    Outcomes logged for each end:
    - Both rejected: Returns original reference only
    - One rejected: Keeps validated consensus on one side only
    - Both validated: Keeps both trimmed consensus sequences

    The output sequence ID is the part of output_handle before its first
    '.', followed by '_with_no_consensus' when both ends were rejected and
    '_with_trimmed_consensus_attached' otherwise.
    """
    # load genome
    fasta = SeqIO.read(untrimmed_assembly, 'fasta')
    fasta_end = len(fasta.seq) - 1  # 0-based index of the last base

    # Fourth line of the log holds "left_cons:<n>\tright_consensus:<n>"
    with open(cons_log, 'r') as txt:
        length_line = txt.readlines()[3]
    length_fields = length_line.split('\t')
    left_len = int(length_fields[0].split(':')[1])
    right_len = int(length_fields[1].split(':')[1])

    # trim start/left-side: walk inward from the first base
    index_start = _first_supported_position(
        range(0, left_len),
        sorted_bam_file,
        untrimmed_assembly,
        cov_thres,
        ratio_thres,
        qual_thres,
    )
    # trim end/right-side: walk inward from the last base
    index_end = _first_supported_position(
        range(fasta_end, fasta_end - right_len, -1),
        sorted_bam_file,
        untrimmed_assembly,
        cov_thres,
        ratio_thres,
        qual_thres,
    )

    left_rejected = index_start is None
    right_rejected = index_end is None

    # A rejected side is cut back to the boundary of the original reference.
    slice_start = left_len if left_rejected else index_start
    # Slice stops are exclusive while fasta_end / index_end are the 0-based
    # indices of the last base to keep, hence the + 1.
    last_kept = (fasta_end - right_len) if right_rejected else index_end
    trimmed_fasta = fasta[slice_start : last_kept + 1]

    if left_rejected:
        left_message = 'Left consensus rejected'
    else:
        left_message = 'Left consensus trimmed with {}'.format(index_start)

    if not right_rejected:
        right_message = 'Right consensus trimmed with {}'.format(
            fasta_end - index_end
        )
    elif left_rejected:
        right_message = 'Right consensus rejected'
    else:
        # Historic wording of this one case, kept as-is so the log text
        # does not change for existing consumers
        right_message = 'Right rejected'
    log_message = '\n' + left_message + '\n' + right_message + '\n'

    if left_rejected and right_rejected:
        id_suffix = '_with_no_consensus'
    else:
        id_suffix = '_with_trimmed_consensus_attached'
    trimmed_fasta.id = output_handle.split('.')[0] + id_suffix
    trimmed_fasta.description = ''

    with open(cons_log, 'a') as log:
        log.write(
            '\n=============================================================================='
        )
        log.write('\nCONSENSUS TRIMMING')
        log.write(
            '\n=============================================================================='
        )
        log.write(
            f'\nRule: Trimmed until Q_score>= {qual_thres}, cov>= {cov_thres} and supporting ratio>= {ratio_thres}'
        )
        log.write(log_message)
    SeqIO.write(trimmed_fasta, output_handle, 'fasta')


def _first_supported_position(
    positions, bam_file, assembly, cov_thres, ratio_thres, qual_thres
):
    """Return the first position in *positions* with enough read support, or None."""
    for pos in positions:
        try:
            cov, match = get_support_info(bam_file, assembly, pos, qual_thres)
        except TypeError:  # if no reads are mapped
            continue
        # cov > 0 protects the division when cov_thres is 0 or negative
        if cov > 0 and cov >= cov_thres and (match / cov) > ratio_thres:
            return pos
    return None
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
def trim_by_map_illumina(
    untrimmed_assembly: str,
    sorted_bam_file: str,
    output_handle: str,
    cons_log: str,
    cov_thres: int = 1,
    ratio_thres: float = 0.7,
    qual_thres: int = 30,
) -> None:
    """
    Trim consensus extensions based on read support thresholds (Illumina).

    Runs exactly the same trimming procedure as ``trim_by_map``; only the
    default thresholds differ (coverage 1 instead of 5, base quality 30
    instead of 0). The work is delegated to ``trim_by_map`` so the two
    entry points cannot drift apart.

    Parameters
    ----------
    untrimmed_assembly : str
        Path to FASTA file with untrimmed consensus attached.
    sorted_bam_file : str
        Path to sorted BAM of terminal reads aligned to untrimmed assembly.
    output_handle : str
        Path for output trimmed FASTA file.
    cons_log : str
        Path to existing log file (will be appended with trimming info).
    cov_thres : int, default=1
        Minimum coverage depth required to keep a position.
    ratio_thres : float, default=0.7
        Fraction of bases matching the reference that must be exceeded
        to keep a position.
    qual_thres : int, default=30
        Minimum base quality score (Q30) to include in coverage calculation.

    Returns
    -------
    None
        Writes trimmed assembly to output_handle and appends to cons_log.

    Notes
    -----
    Trimming algorithm (performed by ``trim_by_map``):
    1. Reads the consensus lengths from the fourth line of the log file
    2. Left end: scans positions 0 .. left_length - 1 and stops at the
       first position meeting the coverage and ratio thresholds; all
       bases before it are trimmed
    3. Right end: scans from the last position inward over right_length
       positions and stops at the first position meeting the thresholds;
       all bases after it are trimmed

    Validation criteria for a position:
    - Coverage >= cov_thres
    - (matching_bases / coverage) > ratio_thres
    - Only bases with quality >= qual_thres are counted

    Outcomes logged for each end:
    - Both rejected: Returns original reference only
    - One rejected: Keeps validated consensus on one side only
    - Both validated: Keeps both trimmed consensus sequences
    """
    trim_by_map(
        untrimmed_assembly=untrimmed_assembly,
        sorted_bam_file=sorted_bam_file,
        output_handle=output_handle,
        cons_log=cons_log,
        cov_thres=cov_thres,
        ratio_thres=ratio_thres,
        qual_thres=qual_thres,
    )
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def generate_support_log(genome: str, qc_bam_file: str, output_handle: str) -> None:
    """
    Generate position-by-position coverage and support statistics for QC.

    Creates a detailed log showing coverage and reference-matching bases at
    every position in the genome. Used for quality control visualization and
    analysis of read support across the extended assembly.

    Parameters
    ----------
    genome : str
        Path to reference genome FASTA file.
    qc_bam_file : str
        Path to BAM file with QC reads aligned to genome.
    output_handle : str
        Path for output log file with coverage statistics (opened in
        append mode).

    Returns
    -------
    None
        Writes position, coverage, and matching bases to output_handle.

    Notes
    -----
    For each position from 0 to (genome_length - 1), inclusive:
    - Calculates coverage (total bases)
    - Calculates matching bases (bases matching reference)
    - Prints position, coverage, matching_bases to stdout
    - Appends one tab-separated line ``position<TAB>coverage<TAB>matching``
      to output_handle
    - Skips positions where get_support_info raises TypeError

    Calls get_support_info with qual_threshold=1.
    """
    fasta = SeqIO.read(genome, 'fasta')
    genome_length = len(fasta.seq)

    # Generate log of coverage at all positions
    with open(output_handle, 'a') as log:
        for pos in range(genome_length):
            # Keep the try body minimal so that only a failed lookup is
            # skipped; a failing write must not be silently ignored.
            try:
                cov, match = get_support_info(
                    bam_file=qc_bam_file, genome=genome, position=pos, qual_threshold=1
                )
            except TypeError:  # if no reads are mapped
                continue

            print(pos, cov, match)
            # file.write() takes a single string argument
            log.write(f'{pos}\t{cov}\t{match}\n')
|