telomore 0.4.1__py3-none-any.whl
- telomore/__init__.py +5 -0
- telomore/_version.py +34 -0
- telomore/app.py +536 -0
- telomore/utils/__init__.py +1 -0
- telomore/utils/arg_parser.py +220 -0
- telomore/utils/classes_and_small_func.py +289 -0
- telomore/utils/cmd_tools.py +732 -0
- telomore/utils/fasta_tools.py +595 -0
- telomore/utils/map_tools.py +1333 -0
- telomore/utils/qc_reports.py +493 -0
- telomore-0.4.1.dist-info/METADATA +149 -0
- telomore-0.4.1.dist-info/RECORD +15 -0
- telomore-0.4.1.dist-info/WHEEL +4 -0
- telomore-0.4.1.dist-info/entry_points.txt +2 -0
- telomore-0.4.1.dist-info/licenses/LICENSE +21 -0

telomore/utils/cmd_tools.py
@@ -0,0 +1,732 @@
"""Functions for running CLI-tools to map reads and generate consensus."""

import logging
import os
from pathlib import Path
import subprocess
import traceback

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def map_and_sort(reference: str, fastq: str, output: str, threads: int = 1) -> None:
    """
    Map Oxford Nanopore long reads to a reference genome using minimap2.

    This function performs the complete mapping workflow for long reads:
    maps reads using minimap2 with parameters optimized for telomere extension
    analysis, sorts the output to BAM format, and creates an index file.

    Parameters
    ----------
    reference : str
        Path to the reference genome file (.fasta, .fna, or .fa)
    fastq : str
        Path to the Oxford Nanopore FASTQ file (can be gzipped)
    output : str
        Path for the output BAM file
    threads : int, default=1
        Number of threads to use for mapping and sorting

    Returns
    -------
    None
        Creates a sorted, indexed BAM file at the specified output path

    Raises
    ------
    FileNotFoundError
        If reference or FASTQ files don't exist
    subprocess.CalledProcessError
        If any of the mapping steps fail
    TypeError
        If threads is not an integer

    Notes
    -----
    Uses minimap2 with these key parameters for long read mapping:
    - `-a`: Output in SAM format
    - `--secondary-seq`: Include sequence in secondary alignments
    - `-Y`: Use soft clipping for supplementary alignments

    These parameters are crucial for telomere extension analysis as they ensure:
    - Secondary alignments retain sequence information
    - Soft clips are preserved for reads extending beyond genome edges

    The function automatically sorts the output and creates a BAM index.
    """
    # Input validation
    if not isinstance(threads, int):
        raise TypeError('threads must be an integer')
    if not os.path.isfile(reference):
        raise FileNotFoundError(f'Reference file does not exist: {reference}')
    if not os.path.isfile(fastq):
        raise FileNotFoundError(f'FASTQ file does not exist: {fastq}')

    # Define path for temporary SAM file
    temp_sam = f'{output}.temp.sam'

    logging.info(f'Starting Oxford Nanopore read mapping for {fastq}')
    logging.info(f'Reference: {reference}, Output: {output}, Threads: {threads}')

    try:
        # Step 1: Map reads using minimap2
        logging.info('Mapping long reads with minimap2...')
        map_cmd = [
            'minimap2',
            '-a',  # Output in SAM format
            reference,  # Reference genome
            fastq,  # Input FASTQ file
            '-t',
            str(threads),  # Number of threads
            '-o',
            temp_sam,  # Output SAM file
            '--secondary-seq',  # Include sequence in secondary alignments
            '-Y',  # Use soft clipping for supplementary alignments
        ]

        result = subprocess.run(map_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'Minimap2 mapping completed: {result.stderr}')

        # Step 2: Sort SAM to BAM
        logging.info('Sorting SAM file to BAM format...')
        sort_cmd = [
            'samtools',
            'sort',
            '-@',
            str(threads),  # Number of threads
            '-o',
            output,  # Output file
            temp_sam,  # Input SAM file
        ]

        result = subprocess.run(sort_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'SAM sorting completed: {result.stderr}')

        # Step 3: Index the BAM file
        logging.info('Indexing BAM file...')
        index_cmd = ['samtools', 'index', output]

        result = subprocess.run(index_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'BAM indexing completed: {result.stderr}')

        logging.info(f'Oxford Nanopore mapping completed successfully: {output}')

    except subprocess.CalledProcessError as e:
        logging.error(f'Nanopore mapping failed at command: {" ".join(e.cmd)}')
        logging.error(f'Error code: {e.returncode}')
        logging.error(f'Stdout: {e.stdout}')
        logging.error(f'Stderr: {e.stderr}')
        raise

    except Exception as e:
        logging.error(f'Unexpected error during Nanopore mapping: {e}')
        logging.error(traceback.format_exc())
        raise

    finally:
        # Clean up temporary SAM file
        logging.info('Cleaning up temporary files...')
        if os.path.exists(temp_sam):
            try:
                os.remove(temp_sam)
                logging.debug(f'Removed temporary SAM file: {temp_sam}')
            except OSError as e:
                logging.warning(f'Could not remove temporary SAM file {temp_sam}: {e}')
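

# --- Editor's usage sketch; not part of the released cmd_tools.py module. ---
# Minimal example of calling map_and_sort(). The file names are hypothetical
# placeholders; minimap2 and samtools must be installed and on PATH.
def _demo_map_and_sort() -> None:
    logging.basicConfig(level=logging.INFO)
    map_and_sort(
        reference='assembly.fasta',      # hypothetical draft assembly
        fastq='ont_reads.fastq.gz',      # hypothetical Oxford Nanopore reads
        output='ont_vs_assembly.bam',    # sorted, indexed BAM written here
        threads=4,
    )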


def map_and_sort_illumina(
    reference: str, read1: str, read2: str, output: str, threads: int = 1
) -> None:
    """
    Map Illumina paired-end reads to a reference genome using Bowtie2.

    This function performs the complete mapping workflow: builds a Bowtie2 index,
    maps reads with parameters optimized for telomere extension analysis,
    sorts the output to BAM format, and creates an index file.

    Parameters
    ----------
    reference : str
        Path to the reference genome file (.fasta, .fna, or .fa)
    read1 : str
        Path to the first paired-end FASTQ file (gzipped)
    read2 : str
        Path to the second paired-end FASTQ file (gzipped)
    output : str
        Path for the output BAM file
    threads : int, default=1
        Number of threads to use for mapping and sorting

    Returns
    -------
    None
        Creates a sorted, indexed BAM file at the specified output path

    Raises
    ------
    FileNotFoundError
        If reference or read files don't exist
    subprocess.CalledProcessError
        If any of the mapping steps fail
    TypeError
        If threads is not an integer

    Notes
    -----
    Uses Bowtie2 with these key parameters:
    - `-a`: Report all alignments (including secondary/supplementary)
    - `--local`: Allow soft-clipping for reads extending beyond genome edges
    - `--no-mixed`: Suppress unpaired alignments for paired reads
    - `--no-discordant`: Suppress discordant alignments

    Temporary index files are automatically cleaned up after mapping.
    """
    # Input validation
    if not isinstance(threads, int):
        raise TypeError('threads must be an integer')
    if not os.path.isfile(reference):
        raise FileNotFoundError(f'Reference file does not exist: {reference}')
    if not os.path.isfile(read1):
        raise FileNotFoundError(f'Read1 file does not exist: {read1}')
    if not os.path.isfile(read2):
        raise FileNotFoundError(f'Read2 file does not exist: {read2}')

    # Define paths for temporary files
    ref_path = Path(reference)
    index_prefix = f'{ref_path}.bt.index'
    temp_sam = f'{output}.temp.sam'

    logging.info(f'Starting Illumina read mapping for {read1} and {read2}')
    logging.info(f'Reference: {reference}, Output: {output}, Threads: {threads}')

    try:
        # Step 1: Build Bowtie2 index
        logging.info('Building Bowtie2 index...')
        build_cmd = [
            'bowtie2-build',
            '-q',  # Quiet mode
            '--threads',
            str(threads),
            reference,
            index_prefix,
        ]

        result = subprocess.run(build_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'Index build completed: {result.stderr}')

        # Step 2: Map reads using Bowtie2
        logging.info('Mapping reads with Bowtie2...')
        map_cmd = [
            'bowtie2',
            '-a',  # Report all alignments
            '-p',
            str(threads),  # Number of threads
            '--local',  # Local alignment mode for soft-clipping
            '--sam-no-qname-trunc',  # Don't truncate QNAME
            '-x',
            index_prefix,  # Index prefix
            '-1',
            read1,  # First mate file
            '-2',
            read2,  # Second mate file
            '-S',
            temp_sam,  # Output SAM file
            '--quiet',  # Suppress verbose output
            '--no-mixed',  # Suppress unpaired alignments
            '--no-discordant',  # Suppress discordant alignments
        ]

        result = subprocess.run(map_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'Read mapping completed: {result.stderr}')

        # Step 3: Sort SAM to BAM
        logging.info('Sorting SAM file to BAM format...')
        sort_cmd = [
            'samtools',
            'sort',
            '-@',
            str(threads),  # Number of threads
            '-o',
            output,  # Output file
            temp_sam,  # Input SAM file
        ]

        result = subprocess.run(sort_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'SAM sorting completed: {result.stderr}')

        # Step 4: Index the BAM file
        logging.info('Indexing BAM file...')
        index_cmd = ['samtools', 'index', output]

        result = subprocess.run(index_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'BAM indexing completed: {result.stderr}')

        logging.info(f'Illumina mapping completed successfully: {output}')

    except subprocess.CalledProcessError as e:
        logging.error(f'Illumina mapping failed at command: {" ".join(e.cmd)}')
        logging.error(f'Error code: {e.returncode}')
        logging.error(f'Stdout: {e.stdout}')
        logging.error(f'Stderr: {e.stderr}')
        raise

    except Exception as e:
        logging.error(f'Unexpected error during Illumina mapping: {e}')
        logging.error(traceback.format_exc())
        raise

    finally:
        # Clean up temporary files
        logging.info('Cleaning up temporary files...')

        # Remove temporary SAM file
        if os.path.exists(temp_sam):
            try:
                os.remove(temp_sam)
                logging.debug(f'Removed temporary SAM file: {temp_sam}')
            except OSError as e:
                logging.warning(f'Could not remove temporary SAM file {temp_sam}: {e}')

        # Remove Bowtie2 index files
        index_extensions = [
            '.1.bt2',
            '.2.bt2',
            '.3.bt2',
            '.4.bt2',
            '.rev.1.bt2',
            '.rev.2.bt2',
        ]
        for ext in index_extensions:
            index_file = f'{index_prefix}{ext}'
            if os.path.exists(index_file):
                try:
                    os.remove(index_file)
                    logging.debug(f'Removed index file: {index_file}')
                except OSError as e:
                    logging.warning(f'Could not remove index file {index_file}: {e}')
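

# --- Editor's usage sketch; not part of the released cmd_tools.py module. ---
# Paired-end Illumina mapping with map_and_sort_illumina(). Paths are
# hypothetical; bowtie2, bowtie2-build and samtools must be on PATH.
def _demo_map_and_sort_illumina() -> None:
    map_and_sort_illumina(
        reference='assembly.fasta',
        read1='sample_R1.fastq.gz',
        read2='sample_R2.fastq.gz',
        output='illumina_vs_assembly.bam',
        threads=4,
    )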


def map_and_sort_illumina_cons(
    reference: str, consensus_fasta: str, output: str, threads: int = 1
) -> None:
    """
    Map consensus sequences to a reference genome using Bowtie2.

    This function performs the complete mapping workflow for consensus sequences:
    builds a Bowtie2 index, maps the consensus FASTA sequences with parameters
    optimized for telomere extension analysis, sorts the output to BAM format,
    and creates an index file.

    Parameters
    ----------
    reference : str
        Path to the reference genome file (.fasta, .fna, or .fa)
    consensus_fasta : str
        Path to the consensus sequences file in FASTA format
    output : str
        Path for the output BAM file
    threads : int, default=1
        Number of threads to use for mapping and sorting

    Returns
    -------
    None
        Creates a sorted, indexed BAM file at the specified output path

    Raises
    ------
    FileNotFoundError
        If reference or consensus files don't exist
    subprocess.CalledProcessError
        If any of the mapping steps fail
    TypeError
        If threads is not an integer

    Notes
    -----
    Uses Bowtie2 with these key parameters for consensus mapping:
    - `-a`: Report all alignments (including secondary/supplementary)
    - `--local`: Allow soft-clipping for sequences extending beyond genome edges
    - `-f`: Input sequences are in FASTA format (not FASTQ)
    - `--sam-no-qname-trunc`: Don't truncate sequence names

    This function is specifically designed for mapping consensus sequences
    generated from read assemblies, not raw sequencing reads.

    Temporary index files are automatically cleaned up after mapping.
    """
    # Input validation
    if not isinstance(threads, int):
        raise TypeError('threads must be an integer')
    if not os.path.isfile(reference):
        raise FileNotFoundError(f'Reference file does not exist: {reference}')
    if not os.path.isfile(consensus_fasta):
        raise FileNotFoundError(
            f'Consensus FASTA file does not exist: {consensus_fasta}'
        )

    # Define paths for temporary files
    ref_path = Path(reference)
    index_prefix = f'{ref_path}.bt.cons.index'
    temp_sam = f'{output}.temp.sam'

    logging.info(f'Starting consensus sequence mapping for {consensus_fasta}')
    logging.info(f'Reference: {reference}, Output: {output}, Threads: {threads}')

    try:
        # Step 1: Build Bowtie2 index
        logging.info('Building Bowtie2 index for consensus mapping...')
        build_cmd = [
            'bowtie2-build',
            '-q',  # Quiet mode
            '--threads',
            str(threads),
            reference,
            index_prefix,
        ]

        result = subprocess.run(build_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'Index build completed: {result.stderr}')

        # Step 2: Map consensus sequences using Bowtie2
        logging.info('Mapping consensus sequences with Bowtie2...')
        map_cmd = [
            'bowtie2',
            '-a',  # Report all alignments
            '-p',
            str(threads),  # Number of threads
            '--local',  # Local alignment mode for soft-clipping
            '--sam-no-qname-trunc',  # Don't truncate QNAME
            '-f',  # Input is FASTA format (not FASTQ)
            '-x',
            index_prefix,  # Index prefix
            '-U',
            consensus_fasta,  # Unpaired input file (consensus sequences)
            '-S',
            temp_sam,  # Output SAM file
            '--quiet',  # Suppress verbose output
        ]

        result = subprocess.run(map_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'Consensus mapping completed: {result.stderr}')

        # Step 3: Sort SAM to BAM
        logging.info('Sorting SAM file to BAM format...')
        sort_cmd = [
            'samtools',
            'sort',
            '-@',
            str(threads),  # Number of threads
            '-o',
            output,  # Output file
            temp_sam,  # Input SAM file
        ]

        result = subprocess.run(sort_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'SAM sorting completed: {result.stderr}')

        # Step 4: Index the BAM file
        logging.info('Indexing BAM file...')
        index_cmd = ['samtools', 'index', output]

        result = subprocess.run(index_cmd, capture_output=True, text=True, check=True)
        logging.debug(f'BAM indexing completed: {result.stderr}')

        logging.info(f'Consensus mapping completed successfully: {output}')

    except subprocess.CalledProcessError as e:
        logging.error(f'Consensus mapping failed at command: {" ".join(e.cmd)}')
        logging.error(f'Error code: {e.returncode}')
        logging.error(f'Stdout: {e.stdout}')
        logging.error(f'Stderr: {e.stderr}')
        raise

    except Exception as e:
        logging.error(f'Unexpected error during consensus mapping: {e}')
        logging.error(traceback.format_exc())
        raise

    finally:
        # Clean up temporary files
        logging.info('Cleaning up temporary files...')

        # Remove temporary SAM file
        if os.path.exists(temp_sam):
            try:
                os.remove(temp_sam)
                logging.debug(f'Removed temporary SAM file: {temp_sam}')
            except OSError as e:
                logging.warning(f'Could not remove temporary SAM file {temp_sam}: {e}')

        # Remove Bowtie2 index files
        index_extensions = [
            '.1.bt2',
            '.2.bt2',
            '.3.bt2',
            '.4.bt2',
            '.rev.1.bt2',
            '.rev.2.bt2',
        ]
        for ext in index_extensions:
            index_file = f'{index_prefix}{ext}'
            if os.path.exists(index_file):
                try:
                    os.remove(index_file)
                    logging.debug(f'Removed index file: {index_file}')
                except OSError as e:
                    logging.warning(f'Could not remove index file {index_file}: {e}')
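

# --- Editor's usage sketch; not part of the released cmd_tools.py module. ---
# Maps a consensus FASTA (for example one written by generate_consensus_mafft
# below) back onto the reference. Paths are hypothetical placeholders.
def _demo_map_and_sort_illumina_cons() -> None:
    map_and_sort_illumina_cons(
        reference='assembly.fasta',
        consensus_fasta='telomere_consensus.fasta',
        output='consensus_vs_assembly.bam',
        threads=2,
    )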


def train_lastDB(fasta_name: str, reads: str, db_name: str, t: int = 1) -> None:
    """
    Train a LAST database using a reference genome and long reads.

    This function creates and trains a LAST database for use with lamassemble
    consensus generation. The training process optimizes alignment parameters
    based on the actual sequences in the reference and read files.

    Parameters
    ----------
    fasta_name : str
        Path to the reference genome file in FASTA format
    reads : str
        Path to the long reads file (FASTQ format, can be gzipped)
    db_name : str
        Output path/prefix for the LAST database files
    t : int, default=1
        Number of threads to use for indexing and training

    Returns
    -------
    None
        Creates database files with extensions: .bck, .des, .par, .prj, .sds, .ssp, .suf, .tis

    Raises
    ------
    subprocess.CalledProcessError
        If lastdb or last-train commands fail

    Notes
    -----
    This function performs two steps:
    1. Indexes the reference using lastdb with -uRY4 (selects the RY4 seeding scheme)
    2. Trains alignment parameters using last-train with the -Qkeep option

    The trained parameters are written to {db_name}.par and used by lamassemble
    for accurate consensus generation from dissimilar sequences.
    """
    # index fasta file
    try:
        subprocess.run(
            ['lastdb', '-P' + str(t), '-uRY4', db_name, fasta_name], check=True
        )
    except subprocess.CalledProcessError as e:
        # If the command fails, capture the error and log the traceback
        logging.error('train_lastDB failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise
    # train last-db
    try:
        file = open(db_name + '.par', 'w')  # needed to write to file
        subprocess.run(
            ['last-train', '-P' + str(t), '-Qkeep', db_name, reads],
            stdout=file,
            check=True,
        )
        file.close()
    except subprocess.CalledProcessError as e:
        # If the command fails, capture the error and log the traceback
        logging.error('train_lastDB failed with error: %s', e)
        logging.error('Script stderr: %s', e.stderr)
        logging.error(traceback.format_exc())
        raise
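

# --- Editor's usage sketch; not part of the released cmd_tools.py module. ---
# Builds and trains a LAST database for lamassemble. The paths and the 'my_db'
# prefix are hypothetical; lastdb and last-train must be on PATH. The trained
# parameters end up in 'my_db.par'.
def _demo_train_lastDB() -> None:
    train_lastDB(
        fasta_name='assembly.fasta',
        reads='ont_reads.fastq.gz',
        db_name='my_db',
        t=4,
    )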


def generate_consensus_lamassemble(db_name: str, reads: str, output: str) -> None:
    """
    Generate consensus sequence using lamassemble for Oxford Nanopore reads.

    This function uses lamassemble with a trained LAST database to generate
    consensus sequences from dissimilar reads, which is ideal for Oxford Nanopore
    data with higher error rates. Handles edge cases of zero or single reads.

    Parameters
    ----------
    db_name : str
        Path prefix to the trained LAST database (without .par extension)
    reads : str
        Path to input reads in FASTQ format
    output : str
        Path for output consensus sequence in FASTA format

    Returns
    -------
    None
        Writes consensus sequence to output file and alignment to {output}.aln

    Notes
    -----
    Special handling for edge cases:
    - 0 reads: Writes empty consensus with id 'empty_consensus'
    - 1 read: Uses that read directly as consensus
    - 2+ reads: Runs lamassemble for proper consensus generation

    The function also saves the alignment of reads to the consensus in a
    separate .aln file for quality assessment.
    """
    # Check if only a single read is present before generating consensus.
    # If that is the case, write just that read to the consensus file.
    sequence_count = 0
    for record in SeqIO.parse(reads, 'fastq'):
        sequence_count += 1
        latest_record = record

    if sequence_count == 0:
        logging.info(
            'There are no reads to construct a consensus from. Empty consensus returned to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            empty_record = SeqRecord(Seq(''), id='empty_consensus')
            SeqIO.write(empty_record, seq, 'fasta')
    if sequence_count == 1:
        single_record = latest_record
        logging.info(
            'There is only a single read to construct a consensus from. Returning read as consensus to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            SeqIO.write(single_record, seq, 'fasta')
    elif sequence_count > 1:
        db = db_name + '.par'
        seq = open(str(output), 'w')
        try:
            subprocess.run(
                ['lamassemble', '--name=' + output, db, reads], stdout=seq, check=True
            )
            seq.close()

            aln = open(str(output) + '.aln', 'w')  # save alignment of reads
            subprocess.run(['lamassemble', '-a', db, reads], stdout=aln)
            aln.close()

        except subprocess.CalledProcessError as e:
            # If the command fails, capture the error and log the traceback
            logging.error('generate_consensus_lamassemble failed with error: %s', e)
            logging.error('Script stderr: %s', e.stderr)
            logging.error(traceback.format_exc())
            raise
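

# --- Editor's usage sketch; not part of the released cmd_tools.py module. ---
# Collapses a set of ONT reads into a consensus using a database trained as
# above. The 'my_db' prefix and file names are hypothetical; lamassemble must
# be on PATH. A read alignment is also written to 'ont_consensus.fasta.aln'.
def _demo_generate_consensus_lamassemble() -> None:
    generate_consensus_lamassemble(
        db_name='my_db',               # prefix of the trained LAST database
        reads='clipped_reads.fastq',   # hypothetical reads to collapse
        output='ont_consensus.fasta',  # consensus FASTA; alignment in .aln
    )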


def generate_consensus_mafft(reads: str, output: str) -> None:
    """
    Generate consensus sequence using MAFFT and EMBOSS cons for Illumina reads.

    This function generates consensus sequences from similar reads using multiple
    sequence alignment (MAFFT) followed by consensus calling (EMBOSS cons).
    This approach is suitable for Illumina reads which have lower error rates
    and are more similar to each other.

    Parameters
    ----------
    reads : str
        Path to input reads in FASTQ format
    output : str
        Path for output consensus sequence in FASTA format

    Returns
    -------
    None
        Writes consensus sequence to output file and alignment to {output}.aln

    Raises
    ------
    subprocess.CalledProcessError
        If MAFFT or EMBOSS cons commands fail

    Notes
    -----
    Special handling for edge cases:
    - 0 reads: Writes empty consensus with id 'empty_consensus'
    - 1 read: Uses that read directly as consensus
    - 2+ reads: Performs MAFFT alignment followed by cons consensus calling

    EMBOSS cons is run with plurality=1, meaning only one read needs to
    support a position for it to be included in the consensus. This is safe
    for similar Illumina reads but would be dangerous for dissimilar sequences.

    The temporary FASTA file created during conversion is automatically cleaned up.
    """
    # Check if only a single read is mapped and use that as the consensus if there are no others.
    sequence_count = 0
    for record in SeqIO.parse(reads, 'fastq'):
        sequence_count += 1
        latest_record = record

    if sequence_count == 0:
        logging.info(
            'There are no reads to construct a consensus from. Empty consensus returned to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            empty_record = SeqRecord(Seq(''), id='empty_consensus')
            SeqIO.write(empty_record, seq, 'fasta')

    if sequence_count == 1:
        single_record = latest_record
        logging.info(
            'There is only a single read to construct a consensus from. Returning read as consensus to %s',
            output,
        )
        with open(f'{output}', 'w') as seq:
            SeqIO.write(single_record, seq, 'fasta')

    elif sequence_count > 1:
        # Convert fastq to fasta
        fasta_reads = reads + '.fasta'
        with open(reads, 'r') as input_handle, open(fasta_reads, 'w') as output_handle:
            SeqIO.convert(input_handle, 'fastq', output_handle, 'fasta')
        # Run MAFFT
        mafft_output = output + '.aln'
        try:
            mafft_file = open(mafft_output, 'w')
            subprocess.run(
                ['mafft', '--quiet', fasta_reads], stdout=mafft_file, check=True
            )
            mafft_file.close()
            os.remove(fasta_reads)

        except subprocess.CalledProcessError as e:
            # If the command fails, capture the error and log the traceback
            logging.error('generate_consensus_mafft failed with error: %s', e)
            logging.error('Script stderr: %s', e.stderr)
            logging.error(traceback.format_exc())
            raise

        try:
            # Generate consensus using EMBOSS cons.
            # Plurality 1 ensures that only one read needs to cover a position to
            # call a consensus base. This is dangerous with dissimilar sequences.
            subprocess.run(
                [
                    'cons',
                    '-name=' + output,
                    '-plurality',
                    str(1),
                    '-sequence=' + mafft_output,
                    '-outseq=' + output,
                ],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            # If the command fails, capture the error and log the traceback
            logging.error('generate_consensus_mafft failed with error: %s', e)
            logging.error('Script stderr: %s', e.stderr)
            logging.error(traceback.format_exc())
            raise
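

# --- Editor's usage sketch; not part of the released cmd_tools.py module. ---
# Illumina-style consensus via MAFFT + EMBOSS cons, then remapping the result
# with map_and_sort_illumina_cons() above. Paths are hypothetical; mafft and
# the EMBOSS 'cons' tool must be on PATH.
def _demo_generate_consensus_mafft() -> None:
    generate_consensus_mafft(
        reads='clipped_illumina_reads.fastq',  # hypothetical reads to collapse
        output='illumina_consensus.fasta',     # consensus FASTA; alignment in .aln
    )
    map_and_sort_illumina_cons(
        'assembly.fasta', 'illumina_consensus.fasta', 'consensus_check.bam'
    )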