telomore 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
telomore/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Telomore: Telomeric sequence extension for Streptomycetes assemblies.
2
+
3
+ This package provides tools for identifying and extracting telomeric sequences
4
+ from Oxford Nanopore or Illumina sequencing reads to extend Streptomycetes assemblies.
5
+ """
telomore/_version.py ADDED
@@ -0,0 +1,34 @@
1
# file generated by setuptools-scm
# don't change, don't track in version control

# Public names exported by this module.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Always False at runtime, so the typing imports below are never executed.
TYPE_CHECKING = False
if not TYPE_CHECKING:
    # Runtime placeholders for the annotation aliases.
    VERSION_TUPLE = object
    COMMIT_ID = object
else:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]

__version__: str
version: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
__commit_id__: COMMIT_ID
commit_id: COMMIT_ID

# Version string and its tuple form; each value is published under two names.
__version__ = '0.4.1'
version = __version__
__version_tuple__ = (0, 4, 1)
version_tuple = __version_tuple__

# No commit id is recorded for this version.
__commit_id__ = None
commit_id = __commit_id__
telomore/app.py ADDED
@@ -0,0 +1,536 @@
1
+ """
2
+ Telomore main application module.
3
+
4
+ Script for finding and extracting telomeres from nanopore or illumina reads,
5
+ which have been excluded from a de novo assembly.
6
+ """
7
+
8
+ from argparse import Namespace
9
+ import logging
10
+ import os
11
+ import shutil
12
+ import traceback
13
+
14
+ from telomore._version import __version__
15
+ from telomore.utils.arg_parser import get_args, setup_logging
16
+ from telomore.utils.classes_and_small_func import Replicon
17
+ from telomore.utils.cmd_tools import (
18
+ generate_consensus_lamassemble,
19
+ generate_consensus_mafft,
20
+ map_and_sort,
21
+ map_and_sort_illumina,
22
+ map_and_sort_illumina_cons,
23
+ train_lastDB,
24
+ )
25
+ from telomore.utils.fasta_tools import (
26
+ build_extended_fasta,
27
+ extract_contig,
28
+ get_fasta_length,
29
+ get_linear_elements,
30
+ strip_fasta,
31
+ )
32
+ from telomore.utils.map_tools import (
33
+ get_left_soft,
34
+ get_right_soft,
35
+ get_terminal_reads,
36
+ revcomp,
37
+ revcomp_reads,
38
+ stitch_telo,
39
+ trim_by_map,
40
+ trim_by_map_illumina,
41
+ )
42
+ from telomore.utils.qc_reports import (
43
+ finalize_log,
44
+ qc_map,
45
+ qc_map_illumina,
46
+ )
47
+
48
+
49
+ def check_dependencies(required_tools: list[str] | None = None) -> None:
50
+ """
51
+ Check if required external dependencies are available in PATH.
52
+
53
+ Verifies that all bioinformatics tools required by Telomore are installed
54
+ and accessible. Logs the path to each found tool and exits with error if
55
+ any tools are missing.
56
+
57
+ Parameters
58
+ ----------
59
+ required_tools : list of str or None, optional
60
+ List of command-line tool names to check. If None, no tools are checked.
61
+ Common tools include: minimap2, samtools, lamassemble, mafft, bowtie2,
62
+ lastdb, lastal, cons.
63
+
64
+ Returns
65
+ -------
66
+ None
67
+ Logs tool locations or exits if dependencies are missing.
68
+
69
+ Raises
70
+ ------
71
+ SystemExit
72
+ If any required tools are not found in PATH (exits with code 1)
73
+
74
+ Notes
75
+ -----
76
+ For each tool, this function:
77
+ - Checks if the tool is available using shutil.which()
78
+ - Logs the full path if found
79
+ - Collects missing tools and reports them all at once before exiting
80
+
81
+ This ensures users know about all missing dependencies upfront rather than
82
+ discovering them one at a time during execution.
83
+ """
84
+ missing_tools = []
85
+ for tool in required_tools:
86
+ if shutil.which(tool) is None:
87
+ # Log missing tool
88
+ missing_tools.append(tool)
89
+ else:
90
+ # Log the path to the tool
91
+ logging.info(f'{tool}\t {shutil.which(tool)}')
92
+ if missing_tools:
93
+ # Log all missing tools and exit
94
+ logging.error(f'Missing required tools: {", ".join(missing_tools)}')
95
+ exit(1)
96
+
97
+
98
def entrypoint() -> None:
    """
    Entry point for the telomore command-line interface.

    Parses command-line arguments, sets up logging, and calls the main
    workflow.

    Returns
    -------
    None
        Executes the main workflow or exits with error code 1 on failure.

    Raises
    ------
    SystemExit
        With code 1 if an exception derived from ``Exception`` escapes the
        workflow. A ``SystemExit`` raised inside ``main`` is not intercepted
        and propagates unchanged.

    Notes
    -----
    Logging is configured (log file ``telomore.log``, ``quiet`` taken from the
    parsed arguments) before ``main`` is called, so the traceback of a failed
    run goes through the configured logging handlers.
    """
    args = get_args()
    setup_logging(log_file='telomore.log', quiet=args.quiet)
    try:
        main(args)
    except Exception:
        logging.error('An error occurred during the workflow:')
        logging.error(traceback.format_exc())
        # Raise SystemExit directly instead of calling the ``exit`` helper,
        # which is only injected by the ``site`` module for interactive use.
        raise SystemExit(1)
135
+
136
+
137
def main(args: Namespace) -> None:
    """
    Execute the main Telomore telomere extension workflow.

    Orchestrates the complete pipeline for extending linear contigs with
    telomeric sequences identified from unmapped reads. Processes either
    Oxford Nanopore or Illumina sequencing data based on the mode parameter.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments containing:
        - mode : str - Sequencing platform ('nanopore' or 'illumina')
        - reference : str - Path to reference genome FASTA
        - single : str - Nanopore FASTQ file (if mode='nanopore')
        - read1, read2 : str - Illumina paired FASTQ files (if mode='illumina')
        - threads : int - Number of threads for parallel operations
        - keep : bool - Whether to retain intermediate files
        - quiet : bool - Suppress console logging
        - coverage_threshold : int or None - Minimum coverage for consensus trimming
        - quality_threshold : int or None - Minimum base quality for consensus trimming

        ``coverage_threshold`` and ``quality_threshold`` are filled in on
        ``args`` with the platform defaults when they are None.

    Returns
    -------
    None
        Creates output directory with extended assemblies and QC files.

    Raises
    ------
    ValueError
        If ``args.mode`` is neither 'nanopore' nor 'illumina'
    SystemExit
        If output folder exists or no linear contigs are found (exit status
        0 in both cases), or if dependencies are missing (exit status 1)

    Notes
    -----
    Workflow steps:
    1. Check external tool dependencies
    2. Identify linear contigs from reference headers
    3. Map reads to reference genome
    4. Extract terminal extending reads for each linear contig
    5. Generate consensus sequences from extending reads
    6. Align and attach consensus to contig ends
    7. Trim consensus based on read support
    8. Generate QC BAM files for manual inspection
    9. Create final assembly combining extended and unmodified contigs
    10. Clean up intermediate files (unless --keep specified)

    Platform-specific defaults:
    - Nanopore: coverage_threshold=5, quality_threshold=10
    - Illumina: coverage_threshold=1, quality_threshold=30

    Output structure: {reference_basename}_{np|ill}_telomore/
    """
    logging.info(f'Running Telomore: {__version__}')

    check_dependencies(
        [
            'minimap2',
            'samtools',
            'lamassemble',
            'mafft',
            'bowtie2',
            'lastdb',
            'lastal',
            'cons',
        ]
    )

    # Output-folder suffix per supported mode. Validating here gives a clear
    # error instead of an unbound-variable failure further down.
    folder_suffixes = {'nanopore': '_np_telomore', 'illumina': '_ill_telomore'}
    if args.mode not in folder_suffixes:
        raise ValueError(
            f'Unsupported mode {args.mode!r}; expected "nanopore" or "illumina"'
        )
    nanopore = args.mode == 'nanopore'

    ref_name = os.path.splitext(os.path.basename(args.reference))[0]
    # Snapshot of the working directory, taken before the output folder is made
    folder_content = os.listdir()

    # Create output folder
    telo_folder = ref_name + folder_suffixes[args.mode]
    if os.path.isdir(telo_folder):
        logging.info('Output folder %s already exists.', telo_folder)
        raise SystemExit  # exit status stays 0
    os.mkdir(telo_folder)

    # Identify linear elements
    linear_elements = get_linear_elements(args.reference)
    if not linear_elements:
        logging.info('No tagged linear elements identified')
        raise SystemExit  # exit status stays 0
    logging.info('Identified the following tagged linear elements %s', linear_elements)

    # Create a list of replicon instances
    replicon_list = [Replicon(element, args.reference) for element in linear_elements]

    # 0: Map reads and extract terminally-extending sequence
    # -----------------------------------------------------------------
    logging.info('Mapping reads to assembly')

    map_out = ref_name + '_map.bam'

    # Use already existing map
    if map_out in folder_content:
        logging.info('Using already identified .bam-file %s', map_out)
    elif nanopore:
        map_and_sort(
            reference=args.reference,
            fastq=args.single,
            output=map_out,
            threads=args.threads,
        )
    else:
        map_and_sort_illumina(
            reference=args.reference,
            read1=args.read1,
            read2=args.read2,
            output=map_out,
            threads=args.threads,
        )

    for replicon in replicon_list:
        logging.info('\tContig %s', replicon.name)

        get_terminal_reads(
            sorted_bam_file=map_out,
            contig=replicon.name,
            loutput_handle=replicon.left_sam,
            routput_handle=replicon.right_sam,
        )
        get_left_soft(
            sam_file=replicon.left_sam, left_out=replicon.left_filt, offset=500
        )
        get_right_soft(
            sam_file=replicon.right_sam,
            contig=replicon.name,
            right_out=replicon.right_filt,
            offset=500,
        )

    # 1: Generate consensus
    # -----------------------------------------------------------------
    logging.info('Generating consensus')

    if nanopore:
        # The LAST database is trained on the entire reference with the same
        # arguments for every contig, so it is built once up front.
        db_out = ref_name + '.db'
        train_lastDB(args.reference, args.single, db_out, args.threads)

    for replicon in replicon_list:
        logging.info('\tContig %s', replicon.name)

        # GENERATE LEFT CONSENSUS
        # To maintain alignment anchor point, the reads are flipped
        # And the resulting consensus must then be flipped again
        revcomp_reads(reads_in=replicon.left_filt_fq, reads_out=replicon.revcomp_out)

        if nanopore:
            generate_consensus_lamassemble(
                db_name=db_out, reads=replicon.revcomp_out, output=replicon.l_cons_out
            )
        else:
            generate_consensus_mafft(
                reads=replicon.revcomp_out, output=replicon.l_cons_out
            )
        # flip consensus to match original orientation
        revcomp(fasta_in=replicon.l_cons_out, fasta_out=replicon.l_cons_final_out)

        # GENERATE RIGHT CONSENSUS
        # The right reads are already oriented with the anchor point
        # left-most and do therefore not need to be flipped
        if nanopore:
            generate_consensus_lamassemble(
                db_name=db_out,
                reads=replicon.right_filt_fq,
                output=replicon.r_cons_final_out,
            )
        else:
            generate_consensus_mafft(
                reads=replicon.right_filt_fq, output=replicon.r_cons_final_out
            )

    # 2: Extend assembly with consensus by mapping onto chromosome
    # -----------------------------------------------------------------
    logging.info('Extending assembly')

    for replicon in replicon_list:
        logging.info('\tContig %s', replicon.name)

        # Produce fasta file of just the contig to be extended
        extract_contig(
            fasta_in=replicon.org_fasta,
            contig_name=replicon.name,
            fasta_out=replicon.contig_fasta,
        )

        # Discard bases that provide alternative mapping sites
        # for the consensus to map to as Streptomyces have TIRs.
        # discard half the contig
        strip_size = int(
            get_fasta_length(
                fasta_file=replicon.contig_fasta, contig_name=replicon.name
            )
            / 2
        )
        strip_fasta(
            input_file=replicon.contig_fasta,
            output_file=replicon.trunc_left_fasta,
            x=strip_size,
            remove_from='end',
        )
        strip_fasta(
            input_file=replicon.contig_fasta,
            output_file=replicon.trunc_right_fasta,
            x=strip_size,
            remove_from='start',
        )

        if nanopore:
            # Map onto the reduced reference using minimap2
            map_and_sort(
                reference=replicon.trunc_left_fasta,
                fastq=replicon.l_cons_final_out,
                output=replicon.l_map_out,
                threads=args.threads,
            )
            map_and_sort(
                reference=replicon.trunc_right_fasta,
                fastq=replicon.r_cons_final_out,
                output=replicon.r_map_out,
                threads=args.threads,
            )
            cons_log_out = replicon.cons_log_np_out
        else:
            # Map onto reduced reference using bowtie2
            map_and_sort_illumina_cons(
                reference=replicon.trunc_left_fasta,
                consensus_fasta=replicon.l_cons_final_out,
                output=replicon.l_map_out,
                threads=args.threads,
            )
            map_and_sort_illumina_cons(
                reference=replicon.trunc_right_fasta,
                consensus_fasta=replicon.r_cons_final_out,
                output=replicon.r_map_out,
                threads=args.threads,
            )
            cons_log_out = replicon.cons_log_ill_out

        # Extend the assembly using the map
        stitch_telo(
            ref=replicon.contig_fasta,
            left_map=replicon.l_map_out,
            right_map=replicon.r_map_out,
            outfile=replicon.stitch_out,
            logout=cons_log_out,
            tmp_left=replicon.stitch_left_fasta,
            tmp_right=replicon.stitch_right_fasta,
        )

    # 3: Trim consensus using a map of terminal reads onto extended
    # assembly
    # -----------------------------------------------------------------
    logging.info('Trimming consensus based on read support')

    # Set default values for consensus trimming if the user did not.
    # The defaults depend only on the mode, so they are resolved once.
    default_coverage, default_quality = (5, 10) if nanopore else (1, 30)
    if args.coverage_threshold is None:
        args.coverage_threshold = default_coverage
    if args.quality_threshold is None:
        args.quality_threshold = default_quality

    for replicon in replicon_list:
        logging.info('\tContig %s', replicon.name)

        if nanopore:
            qc_map(
                extended_assembly=replicon.stitch_out,
                left=replicon.left_sam,
                right=replicon.right_sam,
                output_handle=replicon.trim_map,
                t=args.threads,
            )
            trim_by_map(
                untrimmed_assembly=replicon.stitch_out,
                sorted_bam_file=replicon.trim_map,
                output_handle=replicon.trim_out,
                cons_log=replicon.cons_log_np_out,
                cov_thres=args.coverage_threshold,
                ratio_thres=0.7,
                qual_thres=args.quality_threshold,
            )
        else:
            qc_map_illumina(
                extended_assembly=replicon.stitch_out,
                left_sam=replicon.left_sam,
                right_sam=replicon.right_sam,
                fastq_in1=args.read1,
                fastq_in2=args.read2,
                output_handle=replicon.trim_map,
                t=args.threads,
            )
            trim_by_map_illumina(
                untrimmed_assembly=replicon.stitch_out,
                sorted_bam_file=replicon.trim_map,
                output_handle=replicon.trim_out,
                cons_log=replicon.cons_log_ill_out,
                cov_thres=args.coverage_threshold,
                ratio_thres=0.7,
                qual_thres=args.quality_threshold,
            )

    # 4: Generate QC files
    # -----------------------------------------------------------------
    logging.info('Generating QC map and finalizing result-log')

    for replicon in replicon_list:
        logging.info('\tContig %s', replicon.name)

        if nanopore:
            cons_log_out = replicon.cons_log_np_out
            qc_map(
                extended_assembly=replicon.trim_out,
                left=replicon.left_sam,
                right=replicon.right_sam,
                output_handle=replicon.qc_out,
                t=args.threads,
            )
        else:
            cons_log_out = replicon.cons_log_ill_out
            qc_map_illumina(
                extended_assembly=replicon.trim_out,
                left_sam=replicon.left_sam,
                right_sam=replicon.right_sam,
                fastq_in1=args.read1,
                fastq_in2=args.read2,
                output_handle=replicon.qc_out,
                t=args.threads,
            )

        finalize_log(
            log=cons_log_out,
            right_fasta=replicon.stitch_right_fasta,
            left_fasta=replicon.stitch_left_fasta,
        )

    # 5: Clean-up
    # -----------------------------------------------------------------
    logging.info('Removing temporary files')

    finished_fasta = ref_name + '_telomore.fasta'
    build_extended_fasta(
        org_fasta=args.reference,
        linear_elements=linear_elements,
        replicon_list=replicon_list,
        output_handle=finished_fasta,
    )

    shutil.move(src=finished_fasta, dst=os.path.join(telo_folder, finished_fasta))

    for replicon in replicon_list:
        replicon.mv_files(telo_folder, args.mode)

    if args.keep is False:
        # rmv tmp files
        for replicon in replicon_list:
            replicon.cleanup_tmp_files()

        # rmv lastdb
        if nanopore:
            last_db_ext = ['.bck', '.des', '.par', '.prj', '.sds', '.ssp', '.suf', '.tis']
            for ext in last_db_ext:
                os.remove(f'{db_out}{ext}')

        # remove map
        # NOTE(review): this also deletes a BAM that existed before the run
        # and was reused above - confirm that this is intended.
        os.remove(map_out)  # map
        os.remove(f'{map_out}.bai')  # index

    logging.info('Output files moved to %s', telo_folder)
@@ -0,0 +1 @@
1
+ """Utilities for telomore."""