telomore 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1333 @@
1
+ """Functions for handling read mappings and extracting terminal reads."""
2
+
3
+ import gzip
4
+ import logging
5
+ from pathlib import Path
6
+ import re
7
+
8
+ from Bio import SeqIO
9
+ from Bio.Seq import Seq
10
+ from Bio.SeqRecord import SeqRecord
11
+ import pysam
12
+
13
+
14
def _extract_fastq_records(
    fastq_gz: Path, fastq_out: Path, wanted_ids: set[str]
) -> None:
    """
    Copy selected records from a gzipped FASTQ file to a plain FASTQ file.

    Parameters
    ----------
    fastq_gz : Path
        Path to the gzip-compressed input FASTQ file.
    fastq_out : Path
        Path of the uncompressed FASTQ file to create.
    wanted_ids : set of str
        Record IDs to keep; a record is written when ``record.id`` is in
        this set.
    """
    with gzip.open(fastq_gz, 'rt') as gzip_handle, open(fastq_out, 'w') as outfile:
        matching = (
            record
            for record in SeqIO.parse(gzip_handle, 'fastq')
            if record.id in wanted_ids
        )
        SeqIO.write(matching, outfile, 'fastq')


def sam_to_readpair(
    sam_in: Path, fastq_in1: Path, fastq_in2: Path, fastq_out1: Path, fastq_out2: Path
) -> None:
    """
    Extract complete read pairs from paired-end FASTQ files based on SAM alignment.

    Retrieves both reads (R1 and R2) of paired-end sequences if either read
    appears in the input SAM file, so that the two output files stay in sync.

    Parameters
    ----------
    sam_in : Path
        Path to input SAM alignment file.
    fastq_in1 : Path
        Path to forward/R1 FASTQ file (gzip compressed).
    fastq_in2 : Path
        Path to reverse/R2 FASTQ file (gzip compressed).
    fastq_out1 : Path
        Path for output R1 FASTQ file containing extracted read pairs.
    fastq_out2 : Path
        Path for output R2 FASTQ file containing extracted read pairs.

    Returns
    -------
    None
        Writes extracted read pairs to fastq_out1 and fastq_out2.

    Notes
    -----
    Processing details:
    - First pass: collects every query name in the SAM file into a set
      (set membership keeps the second pass cheap).
    - Query names containing a space are truncated at the first space.
    - Second pass: each gzipped FASTQ is scanned once and records whose ID
      is in the set are written out.
    - Output files are uncompressed FASTQ.
    """
    with pysam.AlignmentFile(sam_in) as samfile:
        # split(' ')[0] is a no-op for names without a space
        reads_to_grep = {
            read.query_name.split(' ')[0] for read in samfile.fetch(until_eof=True)
        }

    # The same ID set is applied to R1 and R2 so that pairs stay complete.
    _extract_fastq_records(fastq_in1, fastq_out1, reads_to_grep)
    _extract_fastq_records(fastq_in2, fastq_out2, reads_to_grep)
82
+
83
+
84
def sam_to_fastq(sam_in: Path, fastq_out: Path) -> None:
    r"""
    Convert SAM alignment file to FASTQ format, excluding unmapped reads.

    Extracts sequence and quality information from aligned reads in a SAM file
    and writes them in FASTQ format. Unmapped reads and reads without a stored
    sequence are filtered out.

    Parameters
    ----------
    sam_in : Path
        Path to input SAM alignment file.
    fastq_out : Path
        Despite the ``Path`` annotation, this must be an open, writable
        text-mode file handle: the function only calls ``fastq_out.write``.

    Returns
    -------
    None
        Writes FASTQ records to the provided file handle. The handle is not
        closed by this function.

    Notes
    -----
    Processing details:
    - Only mapped reads (not flagged as unmapped) are converted.
    - Reads whose ``query_sequence`` is None are skipped; there is nothing
      to write for them, and the default-quality fallback needs the
      sequence length.
    - If quality scores are missing, every base gets the quality
      character 'I'.
    - FASTQ record layout: @name\nseq\n+\nqual\n
    """
    with pysam.AlignmentFile(sam_in, 'r') as samfile:
        for read in samfile.fetch(until_eof=True):
            if read.is_unmapped:
                continue

            seq = read.query_sequence
            if seq is None:  # skip empty reads, as get_terminal_reads does
                continue

            qual = read.qual
            if qual is None:
                qual = 'I' * len(seq)  # Assign a default high-quality score

            # Write the read in FASTQ format to the provided handle
            fastq_out.write(f'@{read.query_name}\n{seq}\n+\n{qual}\n')
126
+
127
+
128
def mapped_bases(cigarstring: str) -> int:
    """
    Count the reference bases covered by an alignment, given its CIGAR string.

    Every ``<length><op>`` pair in the CIGAR string is inspected and the
    lengths of the operations that consume reference bases are added up.

    Parameters
    ----------
    cigarstring : str
        CIGAR string from a SAM alignment (e.g., '100M5S', '50M2D50M').

    Returns
    -------
    int
        Sum of the lengths of all M, D, N, X and = operations; 0 when the
        string contains none of them.

    Notes
    -----
    Counted (reference-consuming) operations: M, D, N, X, =.
    Ignored operations: S, I, H, P.
    """
    reference_ops = 'MDNX='

    # findall yields (length, op) string pairs such as ('121', 'S')
    return sum(
        int(count)
        for count, symbol in re.findall(r'(\d+)([MIDNSHP=X])', cigarstring)
        if symbol in reference_ops
    )
181
+
182
+
183
def cigar_maps_more_bases(cigar1: str, cigar2: str) -> bool | None:
    """
    Compare two CIGAR strings to determine which maps more reference bases.

    Used to select the better alignment when a read maps to multiple
    locations. The comparison uses the reference-consuming base count
    (M, D, N, X, =) computed by ``mapped_bases``.

    Parameters
    ----------
    cigar1 : str
        First CIGAR string to compare.
    cigar2 : str
        Second CIGAR string to compare.

    Returns
    -------
    bool or None
        True if cigar1 maps more bases than cigar2, False if cigar2 maps
        more bases, None if both map the same number of bases.

    Notes
    -----
    None is falsy, so callers that use the result directly in an ``if``
    treat a tie the same way as "cigar2 maps more".
    """
    bases1 = mapped_bases(cigar1)
    bases2 = mapped_bases(cigar2)

    if bases1 == bases2:
        # Tie: stated explicitly instead of falling off the end of the function
        return None
    return bases1 > bases2
225
+
226
+
227
def _best_alignment_per_read(reads) -> dict:
    """
    Keep, for each query name, the alignment that maps the most reference bases.

    Parameters
    ----------
    reads : iterable
        Alignment records exposing ``query_name``, ``query_sequence`` and
        ``cigarstring`` (as yielded by ``pysam.AlignmentFile.fetch``).

    Returns
    -------
    dict
        Maps query name to the retained alignment record. Records whose
        ``query_sequence`` is None are ignored. On a tie the record seen
        first is kept.
    """
    best = {}
    for read in reads:
        if read.query_sequence is None:  # skip empty reads
            continue

        prior_read = best.get(read.query_name)
        # First sighting, or this alignment maps more bases than the stored one
        if prior_read is None or cigar_maps_more_bases(
            read.cigarstring, prior_read.cigarstring
        ):
            best[read.query_name] = read
    return best


def get_terminal_reads(
    sorted_bam_file: Path, contig: Path, loutput_handle: Path, routput_handle: Path
) -> None:
    """
    Extract reads mapping to the terminal 20bp regions of a contig.

    Retrieves all reads that overlap the first or last 20 bases of a reference
    contig. For reads that appear more than once in a terminal region, only the
    alignment with the most mapped reference bases is kept.

    Parameters
    ----------
    sorted_bam_file : Path
        Path to sorted alignment file (region fetches are performed on it).
    contig : Path
        Name/ID of the contig to extract terminal reads from.
    loutput_handle : Path
        Path for the output alignment file containing left-terminal reads.
    routput_handle : Path
        Path for the output alignment file containing right-terminal reads.

    Returns
    -------
    None
        Writes left-terminal reads to loutput_handle and right-terminal reads
        to routput_handle.

    Notes
    -----
    Terminal region definition:
    - Left terminal: positions 0-20
    - Right terminal: (seq_end - 20) to seq_end, with the start clamped at 0
      so that contigs shorter than 20 bases do not produce a negative
      coordinate

    Output files are opened with mode 'w' and use the input file's header
    as their template. All files are opened through context managers, so
    they are closed even if an error occurs part-way through.
    """
    with pysam.AlignmentFile(sorted_bam_file, 'r') as bam_in:
        seq_end = bam_in.get_reference_length(contig)

        # Each region is consumed completely before the next fetch is started
        lterminal_reads = _best_alignment_per_read(
            bam_in.fetch(contig, start=0, stop=20)
        )
        rterminal_reads = _best_alignment_per_read(
            bam_in.fetch(contig, start=max(seq_end - 20, 0), stop=seq_end)
        )

        # Write the retained reads of each end to its own file
        for out_path, kept_reads in (
            (loutput_handle, lterminal_reads),
            (routput_handle, rterminal_reads),
        ):
            with pysam.AlignmentFile(out_path, 'w', template=bam_in) as out_file:
                for read in kept_reads.values():
                    out_file.write(read)
329
+
330
+
331
def get_left_soft(sam_file: Path, left_out: Path, offset: int = 0) -> None:
    r"""
    Extract reads with 5' soft-clipping that extends beyond reference start.

    Identifies reads whose leading soft-clip is longer than their reference
    start position. Writes the full alignments to a SAM file and the clipped
    part of each read to a FASTQ file.

    Parameters
    ----------
    sam_file : Path
        Path to input SAM alignment file.
    left_out : Path
        Base path for output files; '.sam' and '.fastq' are appended.
        Both ``str`` and ``Path`` values are accepted.
    offset : int, default=0
        Additional bases to include beyond the soft-clipped region.

    Returns
    -------
    None
        Creates two output files:
        - {left_out}.sam: Full alignment records.
        - {left_out}.fastq: Soft-clipped sequences only.

    Notes
    -----
    Filtering logic:
    - Records without a CIGAR string or without a stored sequence are skipped.
    - The CIGAR string must start with a soft-clip: ^(\d+)S
    - Only reads where soft-clip length > reference_start are kept.

    FASTQ output contains:
    - Sequence: bases [0:clip_num+offset] from the read
    - Quality: Phred scores converted to ASCII with a +33 offset; if the
      record carries no qualities, 'I' is used for every base (the same
      fallback as sam_to_fastq)

    All three files are managed by a single ``with`` statement and are closed
    even if an error occurs while processing.
    """
    start_clip = re.compile(r'^(\d+)S')

    with (
        pysam.AlignmentFile(sam_file, 'r') as sam_in,
        # f-string instead of '+' so that Path objects work as well as str
        pysam.AlignmentFile(f'{left_out}.sam', 'w', template=sam_in) as lclip,
        open(f'{left_out}.fastq', 'w') as lfastq,
    ):
        for read in sam_in:
            cigar = read.cigarstring
            if cigar is None or read.query_sequence is None:
                continue  # nothing to match against / nothing to extract

            lmatch = start_clip.match(cigar)
            if lmatch is None:
                continue

            clip_num = int(lmatch.group(1))  # digits are retrieved via .group
            if clip_num <= read.reference_start:
                continue  # clipped part does not reach past the reference start

            lclip.write(read)  # write to sam-file

            # get info for fastq-file
            keep = clip_num + offset
            seq = read.query_sequence[0:keep]
            quals = read.query_qualities
            if quals is None:
                sanger_qual = 'I' * len(seq)
            else:
                # phred qual converted to ASCII with 33 offset
                sanger_qual = ''.join(chr(q + 33) for q in quals[0:keep])
            lfastq.write(f'@{read.query_name}\n{seq}\n+\n{sanger_qual}\n')
394
+
395
+
396
def get_right_soft(
    sam_file: Path, contig: Path, right_out: Path, offset: int = 0
) -> None:
    r"""
    Extract reads with 3' soft-clipping that extends beyond reference end.

    Identifies reads whose trailing soft-clip, added to their reference end
    position, exceeds the length of ``contig``. Writes the full alignments to
    a SAM file and the clipped part of each read to a FASTQ file.

    Parameters
    ----------
    sam_file : Path
        Path to input SAM alignment file.
    contig : Path
        Name/ID of the contig whose length is used as the reference end.
    right_out : Path
        Base path for output files; '.sam' and '.fastq' are appended.
        Both ``str`` and ``Path`` values are accepted.
    offset : int, default=0
        Additional bases to include beyond the soft-clipped region.

    Returns
    -------
    None
        Creates two output files:
        - {right_out}.sam: Full alignment records.
        - {right_out}.fastq: Soft-clipped sequences only.

    Notes
    -----
    Filtering logic:
    - Records without a CIGAR string or without a stored sequence are skipped.
    - The CIGAR string must end with a soft-clip: (\d+)S$
    - Only reads where (clip_length + reference_end) > seq_end are kept.

    FASTQ output contains:
    - Sequence: the last (clip_num+offset) bases of the read. If that count
      is zero or negative, the sequence is empty; a plain ``seq[-0:]`` slice
      would have returned the whole read instead.
    - Quality: Phred scores converted to ASCII with a +33 offset; if the
      record carries no qualities, 'I' is used for every base (the same
      fallback as sam_to_fastq)

    All three files are managed by a single ``with`` statement and are closed
    even if an error occurs while processing.
    """
    end_clip = re.compile(r'(\d+)S$')

    with (
        pysam.AlignmentFile(sam_file, 'r') as sam_in,
        # f-string instead of '+' so that Path objects work as well as str
        pysam.AlignmentFile(f'{right_out}.sam', 'w', template=sam_in) as rclip,
        open(f'{right_out}.fastq', 'w') as rfastq,
    ):
        seq_end = sam_in.get_reference_length(contig)  # get length of reference

        for read in sam_in:
            cigar = read.cigarstring
            full_seq = read.query_sequence
            if cigar is None or full_seq is None:
                continue  # nothing to match against / nothing to extract

            rmatch = end_clip.search(cigar)
            if rmatch is None:
                continue

            clip_num = int(rmatch.group(1))  # digits are retrieved via .group
            if clip_num + read.reference_end <= seq_end:
                continue  # clipped part does not reach past the reference end

            rclip.write(read)  # write to sam-file

            # get info for fastq-file; an explicit start index avoids the
            # seq[-0:] == whole-sequence pitfall
            tail_start = max(len(full_seq) - (clip_num + offset), 0)
            seq = full_seq[tail_start:]
            quals = read.query_qualities
            if quals is None:
                sanger_qual = 'I' * len(seq)
            else:
                # phred qual converted to ASCII with 33 offset
                sanger_qual = ''.join(chr(q + 33) for q in quals[tail_start:])
            rfastq.write(f'@{read.query_name}\n{seq}\n+\n{sanger_qual}\n')
463
+
464
+
465
def revcomp_reads(reads_in: str, reads_out: str) -> None:
    """
    Write the reverse complement of every read in a FASTQ file.

    Each record's sequence is reverse-complemented, its Phred quality list
    is reversed to follow the new base order, and its ID is prefixed with
    'rev_'.

    Parameters
    ----------
    reads_in : str
        Path to input FASTQ file.
    reads_out : str
        Path for output reverse-complemented FASTQ file.

    Returns
    -------
    None
        Writes reverse-complemented reads to reads_out.

    Notes
    -----
    Records are modified in place as they are streamed from the parser;
    only the ID, the sequence and the 'phred_quality' annotation are
    touched.
    """

    def _flipped(records):
        # Yield each parsed record after turning it into its reverse complement
        for rec in records:
            new_quals = rec.letter_annotations['phred_quality'][::-1]
            new_seq = rec.seq.reverse_complement()

            rec.id = 'rev_' + str(rec.id)
            # Lengths are unchanged, so the sequence can be swapped while the
            # per-letter annotations are still attached
            rec.seq = new_seq
            rec.letter_annotations['phred_quality'] = new_quals
            yield rec

    with open(reads_in, 'r') as src, open(reads_out, 'w') as dst:
        SeqIO.write(_flipped(SeqIO.parse(src, 'fastq')), dst, 'fastq')
516
+
517
+
518
def revcomp(fasta_in: str, fasta_out: str) -> None:
    """
    Write the reverse complement of every sequence in a FASTA file.

    Each record's sequence is replaced by its reverse complement and its ID
    is prefixed with 'rev_'.

    Parameters
    ----------
    fasta_in : str
        Path to input FASTA file.
    fasta_out : str
        Path for output reverse-complemented FASTA file.

    Returns
    -------
    None
        Writes reverse-complemented sequences to fasta_out.

    Notes
    -----
    Records are modified in place as they are streamed from the parser;
    only the ID and the sequence are touched.
    """

    def _flipped(records):
        # Yield each parsed record after turning it into its reverse complement
        for rec in records:
            new_seq = rec.seq.reverse_complement()
            rec.id = 'rev_' + str(rec.id)
            rec.seq = new_seq
            yield rec

    with open(fasta_in, 'r') as src, open(fasta_out, 'w') as dst:
        SeqIO.write(_flipped(SeqIO.parse(src, 'fasta')), dst, 'fasta')
561
+
562
+
563
def is_map_empty(file_path: str) -> bool:
    """
    Report whether a BAM file holds no alignment records.

    Parameters
    ----------
    file_path : str
        Path to BAM file to check.

    Returns
    -------
    bool
        True if the file yields no record at all, False as soon as one
        record can be read.

    Notes
    -----
    Only the first record is requested from the file, so the rest of the
    file is never iterated.
    """
    with pysam.AlignmentFile(file_path, 'rb') as bam:
        # next() with a default swallows StopIteration for an exhausted file
        return next(bam, None) is None
599
+
600
+
601
def is_consensus_unmapped(file_path: str) -> bool:
    """
    Report whether every record in a BAM file is flagged as unmapped.

    Parameters
    ----------
    file_path : str
        Path to BAM file to check.

    Returns
    -------
    bool
        True if all records are unmapped or the file has no records,
        False if at least one record is mapped.

    Notes
    -----
    All records are read into memory before they are examined.
    """
    with pysam.AlignmentFile(file_path, 'rb') as bam:
        records = list(bam)

    # all() of an empty sequence is True, which covers the empty-file case
    return all(rec.is_unmapped for rec in records)
644
+
645
+
646
def is_consensus_empty(file_path: str) -> bool:
    """
    Report whether a BAM file looks like the mapping of an empty consensus.

    Parameters
    ----------
    file_path : str
        Path to BAM file to check.

    Returns
    -------
    bool
        True if the file contains exactly one record, that record is
        flagged as unmapped, and its sequence is missing/empty or '*'.
        False in every other case.

    Notes
    -----
    All records are read into memory before they are examined.
    """
    with pysam.AlignmentFile(file_path, 'rb') as bam:
        records = list(bam)

    # Anything other than a single record cannot be an empty consensus
    if len(records) != 1:
        return False

    only_record = records[0]
    if not only_record.is_unmapped:
        return False

    # The single unmapped record must also carry no sequence
    sequence = only_record.seq
    if not sequence or sequence == '*':
        return True
    return False
690
+
691
+
692
def stitch_telo(
    ref: str,
    left_map: str,
    right_map: str,
    outfile: str,
    logout: str,
    tmp_left: str,
    tmp_right: str,
) -> tuple[int, int]:
    """
    Extend reference sequence with consensus sequences from terminal alignments.

    Extracts the soft-clipped parts of the mapped consensus sequences that
    reach beyond the reference ends, attaches them to the reference, and
    writes a log describing the result. Handles cases where a consensus is
    empty, unmapped, or does not extend beyond the reference.

    Parameters
    ----------
    ref : str
        Path to reference FASTA file (must hold exactly one record).
    left_map : str
        Path to BAM file with left consensus aligned to reference.
    right_map : str
        Path to BAM file with right consensus aligned to reference.
    outfile : str
        Path for output extended FASTA file.
    logout : str
        Path for output log file documenting extension.
    tmp_left : str
        Path for the FASTA file holding the left consensus that was attached.
    tmp_right : str
        Path for the FASTA file holding the right consensus that was attached.

    Returns
    -------
    tuple of (int, int)
        Length of left consensus and length of right consensus in bases.

    Notes
    -----
    Left consensus processing:
    - Considers records with reference_start < 1000
    - Requires a leading soft-clip longer than reference_start; only the
      part of the clip that lies before position 0 is used
    - Sets a '#' log message if the consensus is empty, unmapped, or its
      soft-clip does not extend beyond the reference start

    Right consensus processing:
    - Considers records with reference_start > 1000
    - Requires a trailing soft-clip reaching past the length of the first
      reference in the BAM header; only the part past the end is used
    - Sets a '#' log message if the consensus is empty, unmapped, or its
      soft-clip does not extend beyond the reference end

    Records without a CIGAR string or stored sequence are ignored. If several
    overhangs are collected for one side, the first one is used.

    The output file contains: left_consensus + original_reference + right_consensus.
    An empty SeqRecord stands in for a side without a usable consensus so
    that the concatenation still works.

    The alignment files and the log file are opened through context managers
    and are closed even if an error occurs.
    """
    # ---- left side -------------------------------------------------------
    left_log_mes = ''
    left_seqs = []
    # Check if an empty left consensus was used to generate the map:
    if is_consensus_empty(left_map):
        left_log_mes = '#No consensus produced for left-side end. Likely, no reads extends the assembly. '
    elif is_consensus_unmapped(left_map):
        left_log_mes = f'#The consensus produced for the left-side does not map to left-side of {ref}'
    else:
        start_clip = r'^(\d+)S'
        with pysam.AlignmentFile(left_map, 'r') as l_sam_in:
            # filter away mapping at right side
            cons_at_left = [read for read in l_sam_in if read.reference_start < 1000]

            # Get the sequence extending beyond the genome
            for read in cons_at_left:
                # Unmapped records pass the position filter above but have
                # no CIGAR string to inspect
                if read.cigarstring is None or read.query_sequence is None:
                    continue

                lmatch = re.match(start_clip, read.cigarstring)
                if not lmatch:
                    continue
                clip_num = int(lmatch.group(1))  # digits are retrieved via .group

                # check if the clipped sequence extends beyond genome
                overhang = clip_num - read.reference_start
                if overhang <= 0:
                    left_log_mes = f'#The consensus produced for the left-side does not extend beyond the start of {ref}'
                    # NOTE(review): this also discards overhangs collected
                    # from earlier records - confirm that is intended
                    left_seqs = []
                else:
                    # Keep only the overhanging bases, in case more than
                    # the overhang was soft-clipped
                    left_seqs.append(read.query_sequence[0:overhang])

    # ---- right side ------------------------------------------------------
    right_log_mes = ''
    right_seqs = []
    # Check if an empty right consensus was used to generate the map:
    if is_consensus_empty(right_map):
        right_log_mes = '#No consensus produced for right-side end. Likely, no reads extends the assembly.'
    elif is_consensus_unmapped(right_map):
        right_log_mes = f'#The consensus produced for the right-side does not map to the right-side of {ref}'
    else:
        end_clip = r'(\d+)S$'  # CIGAR string ending in <num>S
        with pysam.AlignmentFile(right_map, 'r') as r_sam_in:
            seq_end = r_sam_in.lengths[0]  # get length of reference

            # filter away mapping at left side
            cons_at_right = [read for read in r_sam_in if read.reference_start > 1000]
            for read in cons_at_right:
                if read.cigarstring is None or read.query_sequence is None:
                    continue

                rmatch = re.search(end_clip, read.cigarstring)
                if not rmatch:
                    continue
                clip_num = int(rmatch.group(1))  # digits are retrieved via .group

                # Adjusting for potential difference between overhang and soft-clip
                adj = seq_end - read.reference_end
                if clip_num + read.reference_end > seq_end:
                    # clip_num > adj holds here, so the slice start is negative
                    right_seqs.append(read.query_sequence[-(clip_num - adj) :])
                else:
                    right_log_mes = f'#The consensus produced for the right-side does not extend beyond the end of {ref}'

    # ---- stitch consensus sequences onto the reference -------------------
    genome = SeqIO.read(ref, 'fasta')

    # check whether a consensus extends beyond the reference on each side
    if len(left_seqs) == 0:
        # empty SeqRecord so the concatenation below still works
        left_cons = SeqRecord(Seq(''))
        logging.info('Left consensus does not extend genome')
    else:
        left_cons = SeqRecord(Seq(left_seqs[0]), id='left_cons')
        logging.info(f'Left consensus is {len(left_cons)}')

    if len(right_seqs) == 0:
        # empty SeqRecord so the concatenation below still works
        right_cons = SeqRecord(Seq(''))
        logging.info('Right cons does not extend genome')
    else:
        right_cons = SeqRecord(Seq(right_seqs[0]), id='right_cons')

    # The right-side length is logged in both cases
    logging.info(f'Right consensus is {len(right_cons)}')

    new_genome = left_cons + genome + right_cons
    new_genome.id = 'Reference_with_consensus_attached'
    new_genome.description = ''
    SeqIO.write(new_genome, outfile, 'fasta')
    SeqIO.write(left_cons, tmp_left, 'fasta')
    SeqIO.write(right_cons, tmp_right, 'fasta')

    # ---- create log of consensus length ----------------------------------
    rule = '=============================================================================='
    comment_mes = '\n' + '\n'.join([left_log_mes, right_log_mes])
    log_content = (
        '\nleft_cons:{}\tright_consensus:{}'.format(len(left_cons), len(right_cons))
        + comment_mes
    )
    with open(logout, 'w') as log:
        log.write(rule)
        log.write('\nINTIAL CONSENSUS')  # header text kept as-is for log consumers
        log.write('\n' + rule)
        log.write(log_content)
        log.write('\n>left_cons\n')
        log.write(str(left_cons.seq))
        log.write('\n>right_cons\n')
        log.write(str(right_cons.seq))

    return (len(left_cons), len(right_cons))
868
+
869
+
870
def get_support_info(
    bam_file: str, genome: str, position: int, qual_threshold: int = 1
) -> tuple[int, int]:
    """
    Calculate coverage and reference-matching bases at a specific position.

    Counts the A, C, G and T bases that aligned reads place at ``position``
    and reports both the total and the number agreeing with the reference
    base found at the same index of the FASTA record.

    Parameters
    ----------
    bam_file : str
        Path to BAM alignment file.
    genome : str
        Path to reference FASTA file. Must contain exactly one record
        (it is loaded with ``SeqIO.read``).
    position : int
        Zero-based position to query.
    qual_threshold : int, default=1
        Minimum base quality score passed to pysam as ``quality_threshold``.

    Returns
    -------
    tuple of (int, int)
        (coverage, matching_bases) where:
        - coverage: Sum of the A, C, G and T counts at this position.
        - matching_bases: Count of the base equal to the reference base.
          0 when the reference base is anything other than A, C, G or T
          (for example 'N' or an IUPAC ambiguity code).

    Notes
    -----
    - Only the first reference sequence listed in the BAM header is queried.
    - ``read_callback='nofilter'`` is used, so pysam applies no read-level
      filtering (secondary mappings are therefore counted).
    - The reference base is compared case-insensitively.
    """
    fasta_record = SeqIO.read(genome, 'fasta')

    # The context manager closes the BAM handle even if pysam raises; this
    # function is called once per position, so leaked handles add up quickly.
    with pysam.AlignmentFile(bam_file, 'rb') as bam_in:
        # Get reference name from BAM file (first entry in the header)
        reference_name = bam_in.get_reference_name(0)
        coverage_count = bam_in.count_coverage(
            reference_name,
            start=position,
            stop=position + 1,
            read_callback='nofilter',
            quality_threshold=qual_threshold,
        )

    # count_coverage returns four arrays ordered A, C, G, T; the queried
    # window is a single base wide, so element 0 of each array is the count.
    base_counts = {
        'A': coverage_count[0][0],
        'C': coverage_count[1][0],
        'G': coverage_count[2][0],
        'T': coverage_count[3][0],
    }
    cov = sum(base_counts.values())

    # Any reference symbol outside ACGT cannot be matched by a counted base,
    # so it yields 0 instead of leaving the result undefined.
    ref_base = str(fasta_record.seq[position]).upper()
    matching_bases = base_counts.get(ref_base, 0)

    return (cov, matching_bases)
952
+
953
+
954
def trim_by_map(
    untrimmed_assembly: str,
    sorted_bam_file: str,
    output_handle: str,
    cons_log: str,
    cov_thres: int = 5,
    ratio_thres: float = 0.7,
    qual_thres: int = 0,
) -> None:
    """
    Trim consensus extensions based on read support thresholds (Nanopore).

    Scans each attached consensus from the outer end inward and keeps the
    sequence from the first position that has sufficient coverage and
    reference support. A consensus with no such position is removed entirely.

    Parameters
    ----------
    untrimmed_assembly : str
        Path to FASTA file with untrimmed consensus attached.
    sorted_bam_file : str
        Path to sorted BAM of terminal reads aligned to untrimmed assembly.
    output_handle : str
        Path for output trimmed FASTA file. The text before its first '.'
        is used as the prefix of the output sequence ID.
    cons_log : str
        Path to existing log file (will be appended with trimming info).
        Its fourth line must hold the consensus lengths in the form
        ``left_cons:<int>\\tright_consensus:<int>``.
    cov_thres : int, default=5
        Minimum coverage depth required to keep a position.
    ratio_thres : float, default=0.7
        Fraction of reads matching the reference that must be exceeded
        (strictly greater than) to keep a position.
    qual_thres : int, default=0
        Minimum base quality score to include in coverage calculation.

    Returns
    -------
    None
        Writes trimmed assembly to output_handle and appends to cons_log.

    Notes
    -----
    Trimming algorithm:
    1. Reads original consensus lengths from log file line 4
    2. Left end: Scans positions 0 .. left_length - 1 (ascending)
       - Stops at first position meeting coverage and ratio thresholds
       - Trims all bases before this position
    3. Right end: Scans the last right_length positions (descending)
       - Stops at first position meeting coverage and ratio thresholds
       - Trims all bases after this position

    Validation criteria:
    - Coverage >= cov_thres (and non-zero)
    - (matching_bases / coverage) > ratio_thres
    - Base quality >= qual_thres

    Outcomes logged for each end:
    - Both rejected: Returns original reference only
    - One rejected: Keeps validated consensus on one side only
    - Both validated: Keeps both trimmed consensus sequences
    """
    # load genome
    fasta = SeqIO.read(untrimmed_assembly, 'fasta')
    fasta_end = len(fasta.seq) - 1  # index of the last base (0-indexed)

    # Line 4 of the consensus log stores the attached consensus lengths.
    with open(cons_log, 'r') as log_in:
        length_line = log_in.readlines()[3]
    length_fields = length_line.split('\t')
    left_len = int(length_fields[0].split(':')[1])
    right_len = int(length_fields[1].split(':')[1])

    def _first_supported(positions: range) -> int | None:
        """Return the first position passing the thresholds, else None."""
        for pos in positions:
            try:
                cov, match = get_support_info(
                    sorted_bam_file, untrimmed_assembly, pos, qual_thres
                )
                # cov > 0 guards the division when cov_thres is 0 or lower.
                if cov > 0 and cov >= cov_thres and (match / cov) > ratio_thres:
                    return pos
            except TypeError:  # if no reads are mapped
                continue
        return None

    # trim start/left-side: walk inward from the first base
    index_start = _first_supported(range(0, left_len))
    # trim end/right: walk inward from the last base
    index_end = _first_supported(range(fasta_end, fasta_end - right_len, -1))

    left_rejected = index_start is None
    right_rejected = index_end is None

    # A rejected side falls back to the boundary of the original genome.
    # The "+ 1" is required because a slice stop is exclusive while both
    # index_end and (fasta_end - right_len) are inclusive 0-based indices.
    slice_start = left_len if left_rejected else index_start
    slice_stop = (fasta_end - right_len if right_rejected else index_end) + 1
    trimmed_fasta = fasta[slice_start:slice_stop]

    if left_rejected and right_rejected:
        log_message = '\nLeft consensus rejected\nRight consensus rejected\n'
        id_suffix = '_with_no_consensus'
    elif left_rejected:
        log_message = (
            '\nLeft consensus rejected\nRight consensus trimmed with {}\n'.format(
                fasta_end - index_end
            )
        )
        id_suffix = '_with_trimmed_consensus_attached'
    elif right_rejected:
        log_message = '\nLeft consensus trimmed with {}\nRight rejected\n'.format(
            index_start
        )
        id_suffix = '_with_trimmed_consensus_attached'
    else:
        log_message = '\nLeft consensus trimmed with {}\nRight consensus trimmed with {}\n'.format(
            index_start, (fasta_end - index_end)
        )
        id_suffix = '_with_trimmed_consensus_attached'

    trimmed_fasta.id = output_handle.split('.')[0] + id_suffix
    trimmed_fasta.description = ''

    # NOTE: the rule text says "supporting ratio>=" although the test above
    # is a strict ">"; the wording is kept so existing logs stay comparable.
    with open(cons_log, 'a') as log:
        log.write(
            '\n=============================================================================='
        )
        log.write('\nCONSENSUS TRIMMING')
        log.write(
            '\n=============================================================================='
        )
        log.write(
            f'\nRule: Trimmed until Q_score>= {qual_thres}, cov>= {cov_thres} and supporting ratio>= {ratio_thres}'
        )
        log.write(log_message)
    SeqIO.write(trimmed_fasta, output_handle, 'fasta')
1112
+
1113
+
1114
def trim_by_map_illumina(
    untrimmed_assembly: str,
    sorted_bam_file: str,
    output_handle: str,
    cons_log: str,
    cov_thres: int = 1,
    ratio_thres: float = 0.7,
    qual_thres: int = 30,
) -> None:
    """
    Trim consensus extensions based on read support thresholds (Illumina).

    Applies exactly the same trimming procedure as ``trim_by_map``; only the
    default thresholds differ (coverage 1x and base quality Q30 here).
    The work is delegated to ``trim_by_map`` so the two entry points cannot
    drift apart.

    Parameters
    ----------
    untrimmed_assembly : str
        Path to FASTA file with untrimmed consensus attached.
    sorted_bam_file : str
        Path to sorted BAM of terminal reads aligned to untrimmed assembly.
    output_handle : str
        Path for output trimmed FASTA file.
    cons_log : str
        Path to existing log file (will be appended with trimming info).
    cov_thres : int, default=1
        Minimum coverage depth required to keep a position.
    ratio_thres : float, default=0.7
        Fraction of reads matching the reference that must be exceeded
        to keep a position.
    qual_thres : int, default=30
        Minimum base quality score (Q30) to include in coverage calculation.

    Returns
    -------
    None
        Writes trimmed assembly to output_handle and appends to cons_log.

    Notes
    -----
    Trimming algorithm:
    1. Reads original consensus lengths from log file line 4
    2. Left end: scans inward from position 0 across the left consensus
       - Stops at first position meeting coverage and ratio thresholds
       - Trims all bases before this position
    3. Right end: scans inward from the last base across the right consensus
       - Stops at first position meeting coverage and ratio thresholds
       - Trims all bases after this position

    Validation criteria:
    - Coverage >= cov_thres
    - (matching_bases / coverage) > ratio_thres
    - Base quality >= qual_thres

    Outcomes logged for each end:
    - Both rejected: Returns original reference only
    - One rejected: Keeps validated consensus on one side only
    - Both validated: Keeps both trimmed consensus sequences
    """
    # Thresholds are forwarded by keyword so the Illumina defaults declared
    # in this signature, not the Nanopore ones, are what trim_by_map sees.
    trim_by_map(
        untrimmed_assembly,
        sorted_bam_file,
        output_handle,
        cons_log,
        cov_thres=cov_thres,
        ratio_thres=ratio_thres,
        qual_thres=qual_thres,
    )
1274
+
1275
+
1276
def generate_support_log(genome: str, qc_bam_file: str, output_handle: str) -> None:
    """
    Generate position-by-position coverage and support statistics for QC.

    For every position of the genome, looks up the coverage and the number
    of reference-matching bases and records them both on stdout and in the
    output log file.

    Parameters
    ----------
    genome : str
        Path to reference genome FASTA file (single record).
    qc_bam_file : str
        Path to BAM file with QC reads aligned to genome.
    output_handle : str
        Path for output log file with coverage statistics. Opened in append
        mode, so existing content is preserved.

    Returns
    -------
    None
        Writes one ``position coverage matching_bases`` line per position
        (space-separated) to output_handle and prints the same line.

    Notes
    -----
    For each position from 0 to (genome_length - 1), inclusive:
    - Calculates coverage (total bases)
    - Calculates matching bases (bases matching reference)
    - Prints and logs position, coverage, matching_bases
    - Skips positions where the lookup raises TypeError

    Support is queried with qual_threshold=1.
    """
    fasta = SeqIO.read(genome, 'fasta')
    genome_length = len(fasta.seq)

    # Generate log of coverage at all positions
    with open(output_handle, 'a') as log:
        # range(genome_length) covers the final base as well.
        for pos in range(genome_length):
            try:
                cov, match = get_support_info(
                    bam_file=qc_bam_file, genome=genome, position=pos, qual_threshold=1
                )
            except TypeError:  # if no reads are mapped
                continue

            # The write sits outside the try block so that a failure while
            # logging can never be mistaken for "no reads mapped" and dropped.
            line = f'{pos} {cov} {match}'
            print(line)
            log.write(line + '\n')