telomore 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,732 @@
1
+ """Functions for running CLI-tools to map reads and generate consensus."""
2
+
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+ import subprocess
7
+ import traceback
8
+
9
+ from Bio import SeqIO
10
+ from Bio.Seq import Seq
11
+ from Bio.SeqRecord import SeqRecord
12
+
13
+
14
def map_and_sort(reference: str, fastq: str, output: str, threads: int = 1) -> None:
    """
    Align Oxford Nanopore reads with minimap2 and emit a sorted, indexed BAM.

    The reads are aligned into an intermediate SAM file
    (``{output}.temp.sam``), which is then sorted into ``output`` with
    ``samtools sort`` and indexed with ``samtools index``. The intermediate
    SAM file is deleted afterwards, whether or not the pipeline succeeded.

    Parameters
    ----------
    reference : str
        Path to the reference genome file (.fasta, .fna, or .fa)
    fastq : str
        Path to the Oxford Nanopore FASTQ file (can be gzipped)
    output : str
        Path of the BAM file to create
    threads : int, default=1
        Thread count handed to minimap2 (``-t``) and samtools sort (``-@``)

    Returns
    -------
    None
        The result is the sorted BAM at ``output`` plus its index

    Raises
    ------
    TypeError
        If ``threads`` is not an integer
    FileNotFoundError
        If the reference or the FASTQ file does not exist
    subprocess.CalledProcessError
        If minimap2 or either samtools command exits with a non-zero status

    Notes
    -----
    minimap2 is invoked with:
    - `-a`: emit SAM
    - `--secondary-seq`: keep the sequence in secondary alignments
    - `-Y`: soft-clip supplementary alignments

    Soft clips and retained sequences are what make reads that run past the
    genome edge usable for telomere extension analysis.
    """
    # Validate arguments before any external tool is started
    if not isinstance(threads, int):
        raise TypeError('threads must be an integer')
    for label, candidate in (('Reference', reference), ('FASTQ', fastq)):
        if not os.path.isfile(candidate):
            raise FileNotFoundError(f'{label} file does not exist: {candidate}')

    sam_path = f'{output}.temp.sam'  # intermediate alignment file
    thread_arg = str(threads)

    logging.info(f'Starting Oxford Nanopore read mapping for {fastq}')
    logging.info(f'Reference: {reference}, Output: {output}, Threads: {threads}')

    # Each step: (message logged before running, command, prefix of the debug log)
    pipeline = (
        (
            'Mapping long reads with minimap2...',
            [
                'minimap2',
                '-a',
                reference,
                fastq,
                '-t',
                thread_arg,
                '-o',
                sam_path,
                '--secondary-seq',
                '-Y',
            ],
            'Minimap2 mapping completed',
        ),
        (
            'Sorting SAM file to BAM format...',
            ['samtools', 'sort', '-@', thread_arg, '-o', output, sam_path],
            'SAM sorting completed',
        ),
        (
            'Indexing BAM file...',
            ['samtools', 'index', output],
            'BAM indexing completed',
        ),
    )

    try:
        for announcement, command, done_label in pipeline:
            logging.info(announcement)
            finished = subprocess.run(
                command, capture_output=True, text=True, check=True
            )
            logging.debug(f'{done_label}: {finished.stderr}')

        logging.info(f'Oxford Nanopore mapping completed successfully: {output}')

    except subprocess.CalledProcessError as failure:
        # One of the external tools returned a non-zero exit status
        logging.error(f'Nanopore mapping failed at command: {" ".join(failure.cmd)}')
        logging.error(f'Error code: {failure.returncode}')
        logging.error(f'Stdout: {failure.stdout}')
        logging.error(f'Stderr: {failure.stderr}')
        raise

    except Exception as failure:
        # Anything else, e.g. the executable could not be started at all
        logging.error(f'Unexpected error during Nanopore mapping: {failure}')
        logging.error(traceback.format_exc())
        raise

    finally:
        # The intermediate SAM file is never part of the result
        logging.info('Cleaning up temporary files...')
        if os.path.exists(sam_path):
            try:
                os.remove(sam_path)
            except OSError as failure:
                logging.warning(
                    f'Could not remove temporary SAM file {sam_path}: {failure}'
                )
            else:
                logging.debug(f'Removed temporary SAM file: {sam_path}')
138
+
139
+
140
def map_and_sort_illumina(
    reference: str, read1: str, read2: str, output: str, threads: int = 1
) -> None:
    """
    Align Illumina paired-end reads with Bowtie2 and emit a sorted, indexed BAM.

    A Bowtie2 index is built next to the reference (prefix
    ``{reference}.bt.index``), the read pairs are aligned into an intermediate
    SAM file (``{output}.temp.sam``), and the SAM file is sorted and indexed
    with samtools. The intermediate SAM file and the six ``.bt2`` index files
    are removed afterwards, whether or not the pipeline succeeded.

    Parameters
    ----------
    reference : str
        Path to the reference genome file (.fasta, .fna, or .fa)
    read1 : str
        Path to the first paired-end FASTQ file (gzipped)
    read2 : str
        Path to the second paired-end FASTQ file (gzipped)
    output : str
        Path of the BAM file to create
    threads : int, default=1
        Thread count handed to bowtie2-build, bowtie2 and samtools sort

    Returns
    -------
    None
        The result is the sorted BAM at ``output`` plus its index

    Raises
    ------
    TypeError
        If ``threads`` is not an integer
    FileNotFoundError
        If the reference or either read file does not exist
    subprocess.CalledProcessError
        If any external command exits with a non-zero status

    Notes
    -----
    Bowtie2 is invoked with:
    - `-a`: report all alignments
    - `--local`: local mode, so read ends may be soft-clipped
    - `--sam-no-qname-trunc`: keep read names untruncated
    - `--no-mixed`: suppress unpaired alignments for paired reads
    - `--no-discordant`: suppress discordant alignments
    """
    # Validate arguments before any external tool is started
    if not isinstance(threads, int):
        raise TypeError('threads must be an integer')
    required_inputs = (('Reference', reference), ('Read1', read1), ('Read2', read2))
    for label, candidate in required_inputs:
        if not os.path.isfile(candidate):
            raise FileNotFoundError(f'{label} file does not exist: {candidate}')

    # Locations of everything that has to be deleted again at the end
    index_prefix = f'{Path(reference)}.bt.index'
    sam_path = f'{output}.temp.sam'
    thread_arg = str(threads)

    logging.info(f'Starting Illumina read mapping for {read1} and {read2}')
    logging.info(f'Reference: {reference}, Output: {output}, Threads: {threads}')

    # Each step: (message logged before running, command, prefix of the debug log)
    pipeline = (
        (
            'Building Bowtie2 index...',
            ['bowtie2-build', '-q', '--threads', thread_arg, reference, index_prefix],
            'Index build completed',
        ),
        (
            'Mapping reads with Bowtie2...',
            [
                'bowtie2',
                '-a',
                '-p',
                thread_arg,
                '--local',
                '--sam-no-qname-trunc',
                '-x',
                index_prefix,
                '-1',
                read1,
                '-2',
                read2,
                '-S',
                sam_path,
                '--quiet',
                '--no-mixed',
                '--no-discordant',
            ],
            'Read mapping completed',
        ),
        (
            'Sorting SAM file to BAM format...',
            ['samtools', 'sort', '-@', thread_arg, '-o', output, sam_path],
            'SAM sorting completed',
        ),
        (
            'Indexing BAM file...',
            ['samtools', 'index', output],
            'BAM indexing completed',
        ),
    )

    try:
        for announcement, command, done_label in pipeline:
            logging.info(announcement)
            finished = subprocess.run(
                command, capture_output=True, text=True, check=True
            )
            logging.debug(f'{done_label}: {finished.stderr}')

        logging.info(f'Illumina mapping completed successfully: {output}')

    except subprocess.CalledProcessError as failure:
        # One of the external tools returned a non-zero exit status
        logging.error(f'Illumina mapping failed at command: {" ".join(failure.cmd)}')
        logging.error(f'Error code: {failure.returncode}')
        logging.error(f'Stdout: {failure.stdout}')
        logging.error(f'Stderr: {failure.stderr}')
        raise

    except Exception as failure:
        # Anything else, e.g. the executable could not be started at all
        logging.error(f'Unexpected error during Illumina mapping: {failure}')
        logging.error(traceback.format_exc())
        raise

    finally:
        logging.info('Cleaning up temporary files...')

        # SAM file first, then the index files in a fixed order
        disposable = [(sam_path, 'temporary SAM file')]
        disposable.extend(
            (f'{index_prefix}{suffix}', 'index file')
            for suffix in (
                '.1.bt2',
                '.2.bt2',
                '.3.bt2',
                '.4.bt2',
                '.rev.1.bt2',
                '.rev.2.bt2',
            )
        )
        for path, label in disposable:
            if not os.path.exists(path):
                continue
            try:
                os.remove(path)
                logging.debug(f'Removed {label}: {path}')
            except OSError as failure:
                # Best effort: a leftover file must not mask the real outcome
                logging.warning(f'Could not remove {label} {path}: {failure}')
310
+
311
+
312
def map_and_sort_illumina_cons(
    reference: str, consensus_fasta: str, output: str, threads: int = 1
) -> None:
    """
    Align consensus sequences with Bowtie2 and emit a sorted, indexed BAM.

    A Bowtie2 index is built next to the reference (prefix
    ``{reference}.bt.cons.index``), the FASTA consensus sequences are aligned
    as unpaired input into an intermediate SAM file (``{output}.temp.sam``),
    and the SAM file is sorted and indexed with samtools. The intermediate SAM
    file and the six ``.bt2`` index files are removed afterwards, whether or
    not the pipeline succeeded.

    Parameters
    ----------
    reference : str
        Path to the reference genome file (.fasta, .fna, or .fa)
    consensus_fasta : str
        Path to the consensus sequences file in FASTA format
    output : str
        Path of the BAM file to create
    threads : int, default=1
        Thread count handed to bowtie2-build, bowtie2 and samtools sort

    Returns
    -------
    None
        The result is the sorted BAM at ``output`` plus its index

    Raises
    ------
    TypeError
        If ``threads`` is not an integer
    FileNotFoundError
        If the reference or the consensus file does not exist
    subprocess.CalledProcessError
        If any external command exits with a non-zero status

    Notes
    -----
    Bowtie2 is invoked with:
    - `-a`: report all alignments
    - `--local`: local mode, so sequence ends may be soft-clipped
    - `-f`: the input is FASTA rather than FASTQ
    - `--sam-no-qname-trunc`: keep sequence names untruncated

    The function is meant for consensus sequences, not raw sequencing reads.
    """
    # Validate arguments before any external tool is started
    if not isinstance(threads, int):
        raise TypeError('threads must be an integer')
    required_inputs = (('Reference', reference), ('Consensus FASTA', consensus_fasta))
    for label, candidate in required_inputs:
        if not os.path.isfile(candidate):
            raise FileNotFoundError(f'{label} file does not exist: {candidate}')

    # Locations of everything that has to be deleted again at the end
    index_prefix = f'{Path(reference)}.bt.cons.index'
    sam_path = f'{output}.temp.sam'
    thread_arg = str(threads)

    logging.info(f'Starting consensus sequence mapping for {consensus_fasta}')
    logging.info(f'Reference: {reference}, Output: {output}, Threads: {threads}')

    # Each step: (message logged before running, command, prefix of the debug log)
    pipeline = (
        (
            'Building Bowtie2 index for consensus mapping...',
            ['bowtie2-build', '-q', '--threads', thread_arg, reference, index_prefix],
            'Index build completed',
        ),
        (
            'Mapping consensus sequences with Bowtie2...',
            [
                'bowtie2',
                '-a',
                '-p',
                thread_arg,
                '--local',
                '--sam-no-qname-trunc',
                '-f',
                '-x',
                index_prefix,
                '-U',
                consensus_fasta,
                '-S',
                sam_path,
                '--quiet',
            ],
            'Consensus mapping completed',
        ),
        (
            'Sorting SAM file to BAM format...',
            ['samtools', 'sort', '-@', thread_arg, '-o', output, sam_path],
            'SAM sorting completed',
        ),
        (
            'Indexing BAM file...',
            ['samtools', 'index', output],
            'BAM indexing completed',
        ),
    )

    try:
        for announcement, command, done_label in pipeline:
            logging.info(announcement)
            finished = subprocess.run(
                command, capture_output=True, text=True, check=True
            )
            logging.debug(f'{done_label}: {finished.stderr}')

        logging.info(f'Consensus mapping completed successfully: {output}')

    except subprocess.CalledProcessError as failure:
        # One of the external tools returned a non-zero exit status
        logging.error(f'Consensus mapping failed at command: {" ".join(failure.cmd)}')
        logging.error(f'Error code: {failure.returncode}')
        logging.error(f'Stdout: {failure.stdout}')
        logging.error(f'Stderr: {failure.stderr}')
        raise

    except Exception as failure:
        # Anything else, e.g. the executable could not be started at all
        logging.error(f'Unexpected error during consensus mapping: {failure}')
        logging.error(traceback.format_exc())
        raise

    finally:
        logging.info('Cleaning up temporary files...')

        # SAM file first, then the index files in a fixed order
        disposable = [(sam_path, 'temporary SAM file')]
        disposable.extend(
            (f'{index_prefix}{suffix}', 'index file')
            for suffix in (
                '.1.bt2',
                '.2.bt2',
                '.3.bt2',
                '.4.bt2',
                '.rev.1.bt2',
                '.rev.2.bt2',
            )
        )
        for path, label in disposable:
            if not os.path.exists(path):
                continue
            try:
                os.remove(path)
                logging.debug(f'Removed {label}: {path}')
            except OSError as failure:
                # Best effort: a leftover file must not mask the real outcome
                logging.warning(f'Could not remove {label} {path}: {failure}')
481
+
482
+
483
def train_lastDB(fasta_name: str, reads: str, db_name: str, t: int = 1) -> None:
    """
    Train a LAST database using a reference genome and long reads.

    The reference is indexed with ``lastdb`` and alignment parameters are then
    trained with ``last-train``; the standard output of ``last-train`` is
    written to ``{db_name}.par``, which is the file lamassemble consumes.

    Parameters
    ----------
    fasta_name : str
        Path to the reference genome file in FASTA format
    reads : str
        Path to the long reads file (FASTQ format, can be gzipped)
    db_name : str
        Output path/prefix for the LAST database files
    t : int, default=1
        Number of threads, passed to both tools as ``-P<t>``

    Returns
    -------
    None
        The database files and ``{db_name}.par`` are written to disk

    Raises
    ------
    subprocess.CalledProcessError
        If lastdb or last-train exits with a non-zero status. The exception's
        ``stderr`` attribute holds the tool's error output.

    Notes
    -----
    This function performs two steps:
    1. Indexes the reference using lastdb with -uRY4
    2. Trains alignment parameters using last-train with -Qkeep

    The standard error of both tools is captured (as text) rather than passed
    through, so that it can be included in the error log on failure.
    """
    thread_flag = '-P' + str(t)

    # Step 1: index the reference
    try:
        subprocess.run(
            ['lastdb', thread_flag, '-uRY4', db_name, fasta_name],
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logging.error('train_lastDB failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise

    # Step 2: train the alignment parameters
    try:
        # The context manager closes the .par file even when last-train fails
        with open(db_name + '.par', 'w') as par_file:
            subprocess.run(
                ['last-train', thread_flag, '-Qkeep', db_name, reads],
                stdout=par_file,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
    except subprocess.CalledProcessError as e:
        logging.error('train_lastDB failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise
547
+
548
+
549
def generate_consensus_lamassemble(db_name: str, reads: str, output: str) -> None:
    """
    Generate consensus sequence using lamassemble for Oxford Nanopore reads.

    The number of reads in ``reads`` decides what is written to ``output``:

    - 0 reads: an empty FASTA record with id 'empty_consensus'
    - 1 read: that read, written as FASTA
    - 2+ reads: the standard output of ``lamassemble`` run with the trained
      parameters in ``{db_name}.par``

    For 2+ reads, ``lamassemble -a`` is additionally run and its output saved
    to ``{output}.aln``.

    Parameters
    ----------
    db_name : str
        Path prefix to the trained LAST database (without .par extension)
    reads : str
        Path to input reads in FASTQ format
    output : str
        Path for output consensus sequence in FASTA format

    Returns
    -------
    None
        Writes the consensus to ``output`` (and the alignment to
        ``{output}.aln`` when lamassemble is run)

    Raises
    ------
    subprocess.CalledProcessError
        If the consensus-generating lamassemble run exits with a non-zero
        status. A failure of the alignment run is only logged as a warning.

    Notes
    -----
    Output files are opened through context managers so they are closed even
    when lamassemble fails. The tool's standard error is captured so it can
    be written to the log.
    """
    # Count the reads, remembering the last one for the single-read case
    sequence_count = 0
    latest_record = None
    for record in SeqIO.parse(reads, 'fastq'):
        sequence_count += 1
        latest_record = record

    if sequence_count == 0:
        logging.info(
            'There are no reads to construct a consensus from. Emtpy consensus returned to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            empty_record = SeqRecord(Seq(''), id='empty_consensus')
            SeqIO.write(empty_record, seq, 'fasta')
        return

    if sequence_count == 1:
        logging.info(
            'There are only a single read to construct a consensus from. Returning read as consensus to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            SeqIO.write(latest_record, seq, 'fasta')
        return

    # Two or more reads: build a real consensus
    db = db_name + '.par'
    try:
        with open(str(output), 'w') as seq:
            result = subprocess.run(
                ['lamassemble', '--name=' + output, db, reads],
                stdout=seq,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
        logging.debug('lamassemble consensus completed: %s', result.stderr)
    except subprocess.CalledProcessError as e:
        logging.error('generate_consensus_lamassemble failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise

    # Save the alignment of the reads. This run stays best-effort (a failure
    # does not raise), but it is no longer ignored silently.
    with open(str(output) + '.aln', 'w') as aln:
        aln_result = subprocess.run(
            ['lamassemble', '-a', db, reads],
            stdout=aln,
            stderr=subprocess.PIPE,
            text=True,
        )
    if aln_result.returncode != 0:
        logging.warning(
            'lamassemble alignment run exited with code %s: %s',
            aln_result.returncode,
            aln_result.stderr,
        )
623
+
624
+
625
def generate_consensus_mafft(reads: str, output: str) -> None:
    """
    Generate consensus sequence using MAFFT and EMBOSS cons for Illumina reads.

    The number of reads in ``reads`` decides what is written to ``output``:

    - 0 reads: an empty FASTA record with id 'empty_consensus'
    - 1 read: that read, written as FASTA
    - 2+ reads: the reads are converted to FASTA (``{reads}.fasta``), aligned
      with MAFFT into ``{output}.aln``, and a consensus is called from that
      alignment with EMBOSS ``cons``

    Parameters
    ----------
    reads : str
        Path to input reads in FASTQ format
    output : str
        Path for output consensus sequence in FASTA format

    Returns
    -------
    None
        Writes the consensus to ``output`` (and the alignment to
        ``{output}.aln`` when MAFFT is run)

    Raises
    ------
    subprocess.CalledProcessError
        If MAFFT or EMBOSS cons exits with a non-zero status

    Notes
    -----
    EMBOSS cons is run with plurality=1, meaning only one read needs to
    support a position for it to be included in the consensus. This is
    dangerous for dissimilar sequences.

    The temporary FASTA file is removed in a ``finally`` clause, so it is
    cleaned up whether or not the conversion or MAFFT succeeded.
    """
    # Count the reads, remembering the last one for the single-read case
    sequence_count = 0
    latest_record = None
    for record in SeqIO.parse(reads, 'fastq'):
        sequence_count += 1
        latest_record = record

    if sequence_count == 0:
        logging.info(
            'There are no reads to construct a consensus from. Emtpy consensus returned to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            empty_record = SeqRecord(Seq(''), id='empty_consensus')
            SeqIO.write(empty_record, seq, 'fasta')
        return

    if sequence_count == 1:
        logging.info(
            'There are only a single read to construct a consensus from. Returning read as consensus to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            SeqIO.write(latest_record, seq, 'fasta')
        return

    # Two or more reads: align with MAFFT, then call the consensus
    fasta_reads = reads + '.fasta'
    mafft_output = output + '.aln'
    try:
        # MAFFT is given FASTA input, so convert the FASTQ reads first
        with open(reads, 'r') as input_handle, open(fasta_reads, 'w') as output_handle:
            SeqIO.convert(input_handle, 'fastq', output_handle, 'fasta')

        # The context manager closes the alignment file even when MAFFT fails
        with open(mafft_output, 'w') as mafft_file:
            subprocess.run(
                ['mafft', '--quiet', fasta_reads],
                stdout=mafft_file,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
    except subprocess.CalledProcessError as e:
        logging.error('generate_consensus_mafft failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise
    finally:
        # Remove the temporary FASTA file on success and on failure alike
        if os.path.exists(fasta_reads):
            try:
                os.remove(fasta_reads)
            except OSError as e:
                logging.warning(
                    'Could not remove temporary FASTA file %s: %s', fasta_reads, e
                )

    try:
        # Generate consensus using Emboss cons
        # Plurality 1 ensures that only one read needs to cover a position to
        # generate consensus. This is dangerous with dissimilar sequences
        subprocess.run(
            [
                'cons',
                '-name=' + output,
                '-plurality',
                str(1),
                '-sequence=' + mafft_output,
                '-outseq=' + output,
            ],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logging.error('generate_consensus_mafft failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise