telomore 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,220 @@
1
+ """Argument parser for the telomore tool."""
2
+
3
+ import argparse
4
+ from argparse import Namespace
5
+ import logging
6
+ import sys
7
+
8
+ from telomore._version import __version__
9
+
10
+
11
# Text shown at the top of ``--help``. It is rendered with RawTextHelpFormatter,
# so the line breaks and indentation below appear verbatim in the help output.
_DESCRIPTION = """Telomore: A tool to recover potential telomeric sequences from Streptomyces genomes.

This tool processes sequencing data from Oxford Nanopore or Illumina platforms to extend assemblies and generate QC reports.

INPUT:
  - For Nanopore mode (--mode=nanopore): Provide a single gzipped FASTQ file using --single.
  - For Illumina mode (--mode=illumina): Provide two gzipped FASTQ files using --read1 and --read2.
  - A reference genome file in FASTA format is required for both modes (--reference).

OUTPUT:
  - Extended assembly written to basename.02.trimmed.fasta (basename is the name of the input file without the extension).
  - QC reports saved in a folder named basename_seqtype_QC.
  - Logs are written to telomore.log and basename.seqtype.cons.log.txt in basename_seqtype_QC.

OPTIONS:
  - Specify the number of threads to use with --threads (default: 1).
  - Use --keep to retain intermediate files (default: False).
  - Use --quiet to suppress console logging.

EXAMPLES:
  1. Nanopore mode:
     telomore --mode=nanopore --single reads.fastq.gz --reference genome.fasta -t 8

  2. Illumina mode:
     telomore --mode=illumina --read1 read1.fastq.gz --read2 read2.fastq.gz --reference genome.fasta -t 8
"""


def get_args(argv: 'list[str] | None' = None) -> Namespace:
    r"""
    Parse and validate command-line arguments for Telomore.

    Defines the command-line interface for Telomore, including all required
    and optional arguments, then applies the mode-specific checks that
    argparse cannot express declaratively.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse, *excluding* the program name. When omitted
        (the default) the arguments are taken from ``sys.argv[1:]``, which is
        the behaviour existing callers rely on.

    Returns
    -------
    argparse.Namespace
        Parsed arguments with attributes:
        - mode : str - Sequencing platform ('nanopore' or 'illumina')
        - reference : str - Path to reference genome FASTA
        - single : str or None - Nanopore FASTQ file (required if mode='nanopore')
        - read1 : str or None - Illumina R1 FASTQ (required if mode='illumina')
        - read2 : str or None - Illumina R2 FASTQ (required if mode='illumina')
        - threads : int - Number of threads, at least 1 (default: 1)
        - keep : bool - Retain intermediate files (default: False)
        - quiet : bool - Suppress console output (default: False)
        - coverage_threshold : int or None - Coverage cutoff for trimming
        - quality_threshold : int or None - Quality score cutoff for trimming

    Raises
    ------
    SystemExit
        With code 1 (after printing the help text to stderr) when no
        arguments are given at all. With argparse's usage-error status (2)
        when a required argument is missing, a value is invalid, or the
        mode-specific validation fails.

    Notes
    -----
    Mode-specific validation:
    - nanopore mode requires --single argument
    - illumina mode requires both --read1 and --read2 arguments

    The two threshold options default to ``None`` here. According to the help
    text the effective defaults are coverage=5/quality=10 for Nanopore and
    coverage=1/quality=30 for Illumina; they are resolved outside this
    function.

    Examples
    --------
    Nanopore mode:
        telomore --mode nanopore --single reads.fq.gz --reference genome.fa -t 8

    Illumina mode:
        telomore --mode illumina --read1 R1.fq.gz --read2 R2.fq.gz \
            --reference genome.fa -t 8 --coverage_threshold 2
    """
    parser = argparse.ArgumentParser(
        description=_DESCRIPTION,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version=f'telomore {__version__}',
        help='Show version number and exit',
    )

    parser.add_argument(
        '-m',
        '--mode',
        choices=['nanopore', 'illumina'],
        required=True,
        help=(
            'Choose which mode to run.\n'
            '--mode=nanopore takes a single read-file, specified using --single\n'
            '--mode=illumina takes two read-files specified using --read1 and --read2'
        ),
    )
    parser.add_argument(
        '--single',
        type=str,
        help='Path to a single gzipped nanopore fastq-file',
    )
    parser.add_argument(
        '--read1',
        type=str,
        help='Path to gzipped illumina read1 fastq-file',
    )
    parser.add_argument(
        '--read2',
        type=str,
        help='Path to gzipped illumina read2 fastq-file',
    )
    parser.add_argument(
        '-r',
        '--reference',
        type=str,
        required=True,
        help='Path to reference file (.fasta, .fna, or .fa)',
    )
    parser.add_argument(
        '-t', '--threads', type=int, default=1, help='Threads to use. Default is 1'
    )
    parser.add_argument(
        '-k',
        '--keep',
        action='store_true',
        help='Flag to keep intermediate files. Default is False',
    )
    parser.add_argument(
        '-q', '--quiet', action='store_true', help='Set logging to quiet.'
    )
    parser.add_argument(
        '--coverage_threshold',
        type=int,
        help='Coverage threshold for consensus trimming. Default is coverage=5 for nanopore and coverage=1 for Illumina.',
    )
    parser.add_argument(
        '--quality_threshold',
        type=int,
        help='Quality threshold for consensus trimming. Default is q_score=10 for nanopore and q_score=30 for illumina.',
    )

    cli_args = sys.argv[1:] if argv is None else list(argv)

    # A bare invocation gets the full help text instead of argparse's terse
    # "the following arguments are required" usage error.
    if not cli_args:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args(cli_args)

    # argparse only guarantees an int; zero or negative thread counts make no sense.
    if args.threads < 1:
        parser.error('--threads must be a positive integer')

    if args.mode == 'illumina':
        if not (args.read1 and args.read2):
            parser.error(
                'Illumina mode requires two FASTQ files, specified by --read1 and --read2'
            )
    elif args.mode == 'nanopore':
        if not args.single:
            parser.error(
                'Nanopore mode takes one collected FASTQ file, specified by --single'
            )
    return args
169
+
170
+
171
def setup_logging(log_file: str = 'telomore.log', quiet: bool = False) -> None:
    """
    Configure logging for Telomore with file and console output.

    Sets up the root logger to write to a log file and, unless quiet mode is
    enabled, to the console as well. Uses INFO level logging with timestamps.

    Parameters
    ----------
    log_file : str, default='telomore.log'
        Path to log file where all messages will be written
    quiet : bool, default=False
        If truthy, suppress console output (file logging still occurs)

    Returns
    -------
    None
        Configures the global logging system

    Notes
    -----
    Log format: '%(asctime)s - %(message)s'
    Log level: INFO (captures informational messages, warnings, and errors)

    When quiet is false, logs appear in both:
    - File (via FileHandler to log_file)
    - Console (via StreamHandler to stdout)

    When quiet is true, logs only appear in:
    - File (via FileHandler to log_file)

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so only the first call in a process takes effect. Handlers
    created by a later, ignored call are closed again so that the log file
    handle is not left open.
    """
    handlers_to_use = [logging.FileHandler(log_file)]  # Log file
    if not quiet:
        handlers_to_use.append(logging.StreamHandler(sys.stdout))  # Print to console

    logging.basicConfig(
        level=logging.INFO, format='%(asctime)s - %(message)s', handlers=handlers_to_use
    )

    # If basicConfig was a no-op (root logger already configured), the handlers
    # built above were never attached; release the resources they hold.
    attached = logging.getLogger().handlers
    for handler in handlers_to_use:
        if handler not in attached:
            handler.close()
217
+
218
+
219
# Running this module directly only exercises argument parsing; the parsed
# Namespace returned by get_args() is discarded.
if __name__ == '__main__':
    get_args()
@@ -0,0 +1,289 @@
1
+ """Class for handling files related to each replicon."""
2
+
3
+ import os
4
+ import shutil
5
+
6
+
7
+ # Class
8
+ # Replicon class
9
class Replicon:
    """
    Manage file paths and operations for individual replicon extension.

    A Replicon object tracks all input, intermediate, and output files
    associated with extending a single linear contig (replicon). It provides
    organized file management and cleanup methods for the Telomore workflow.

    Parameters
    ----------
    name : str
        Identifier for the replicon (typically contig name from FASTA)
    org_fasta : str
        Path to the original multi-FASTA file containing this replicon

    Attributes
    ----------
    name : str
        Replicon identifier
    org_fasta : str
        Original FASTA file path

    Mapping files:
    org_map : str
        BAM file of reads mapped to original contig
    org_map_index : str
        BAI index for org_map

    Terminal read files:
    left_sam, right_sam : str
        SAM files with left/right terminal read alignments
    left_filt, right_filt : str
        Base paths for filtered reads
    left_filt_sam, right_filt_sam : str
        Filtered SAM files
    left_filt_fq, right_filt_fq : str
        Filtered FASTQ files

    Consensus files:
    l_cons_out : str
        Left consensus (reverse complement, temporary)
    l_cons_final_out : str
        Left consensus (final orientation)
    l_cons_alignment : str
        Alignment file for left consensus
    revcomp_out : str
        Reverse complement of left filtered reads
    r_cons_final_out : str
        Right consensus (final)
    r_cons_alignment : str
        Alignment file for right consensus

    Extension files:
    contig_fasta : str
        Extracted single-contig FASTA
    cons_log_np_out : str
        Extension log for Nanopore mode
    cons_log_ill_out : str
        Extension log for Illumina mode
    trunc_left_fasta, trunc_right_fasta : str
        Truncated contigs to prevent alternative mappings
    l_map_out, r_map_out : str
        BAM files of consensus mapped to truncated contigs
    l_map_out_index, r_map_out_index : str
        BAI indices for consensus maps
    stitch_out : str
        Extended assembly before trimming
    stitch_left_fasta, stitch_right_fasta : str
        Extracted consensus sequences
    trim_map : str
        BAM of QC reads mapped to untrimmed assembly
    trim_map_index : str
        BAI index for trim_map
    trim_out : str
        Final trimmed extended assembly

    QC files:
    qc_out : str
        Final QC BAM file
    qc_out_index : str
        BAI index for QC BAM

    Notes
    -----
    All file paths are generated from the replicon name following a
    consistent naming convention. They are plain relative file names, so
    they resolve against the current working directory.

    The class provides methods for:
    - cleanup_tmp_files(): Remove intermediate processing files
    - mv_files(): Move final output files to designated directory

    Files are categorized as:
    - Temporary: Deleted after successful extension
    - Output: Moved to results directory for user
    """

    def __init__(self, name: str, org_fasta: str) -> None:
        self.name = name
        self.org_fasta = org_fasta

        # Map files
        self.org_map = f'{self.name}_map.bam'
        self.org_map_index = f'{self.name}_map.bam.bai'

        # Filtered files
        self.left_sam = f'{self.name}_left.sam'
        self.left_filt = f'{self.name}_left_filtered'
        self.left_filt_sam = f'{self.name}_left_filtered.sam'
        self.left_filt_fq = f'{self.name}_left_filtered.fastq'

        self.right_sam = f'{self.name}_right.sam'
        self.right_filt = f'{self.name}_right_filtered'
        self.right_filt_sam = f'{self.name}_right_filtered.sam'
        self.right_filt_fq = f'{self.name}_right_filtered.fastq'

        # Consensus files
        # left
        self.l_cons_out = f'rev_{self.name}_left_cons.fasta'
        self.l_cons_final_out = f'{self.name}_left_cons.fasta'
        self.l_cons_alignment = f'{self.l_cons_out}.aln'
        self.revcomp_out = f'rev_{self.left_filt_fq}'
        # right
        self.r_cons_final_out = f'{self.name}_right_cons.fasta'
        self.r_cons_alignment = f'{self.r_cons_final_out}.aln'

        # Extension files
        self.contig_fasta = f'{self.name}.fasta'

        self.cons_log_np_out = f'{self.name}_telomore_ext_np.log'
        self.cons_log_ill_out = f'{self.name}_telomore_ill_ext.log'

        # Truncated contig which discard alternative mapping points
        self.trunc_left_fasta = f'{self.name}_trunc_left.fa'
        self.trunc_right_fasta = f'{self.name}_trunc_right.fa'

        # Maps on trunc fasta
        self.l_map_out = f'{self.name}_left_map.bam'
        self.r_map_out = f'{self.name}_right_map.bam'
        self.l_map_out_index = f'{self.l_map_out}.bai'
        self.r_map_out_index = f'{self.r_map_out}.bai'

        # Extended assembly
        self.stitch_out = f'{self.name}_telomore_untrimmed.fasta'
        self.stitch_left_fasta = f'{self.name}_left.fasta'
        self.stitch_right_fasta = f'{self.name}_right.fasta'

        # Trim files
        self.trim_map = f'{self.name}_telomore_untrimmed.bam'
        self.trim_map_index = f'{self.trim_map}.bai'
        self.trim_out = f'{self.name}_telomore_extended.fasta'

        # QC_files
        self.qc_out = f'{self.name}_telomore_QC.bam'
        self.qc_out_index = f'{self.qc_out}.bai'

    def __repr__(self) -> str:
        """Return a debugging representation showing the two constructor arguments."""
        return (
            f'{type(self).__name__}(name={self.name!r}, org_fasta={self.org_fasta!r})'
        )

    def cleanup_tmp_files(self) -> None:
        """
        Remove temporary intermediate files after successful extension.

        Deletes all intermediate files that are not needed in the final output,
        including mapping files, filtered reads, consensus intermediates, and
        truncated references. The final extended assemblies, QC files and
        extension logs are not in the removal list.

        Returns
        -------
        None
            Removes files from the filesystem

        Notes
        -----
        Files removed include:
        - Original mapping: org_map, org_map_index
        - Terminal read SAMs: left_sam, right_sam
        - Filtered reads: left_filt_sam, left_filt_fq, right_filt_sam, right_filt_fq
        - Consensus intermediates: l_cons_out, l_cons_final_out, l_cons_alignment,
          revcomp_out, r_cons_final_out, r_cons_alignment
        - Extracted/truncated contigs: contig_fasta, trunc_left_fasta, trunc_right_fasta
        - Consensus mappings: l_map_out, r_map_out, and their indices
        - Stitching intermediates: stitch_left_fasta, stitch_right_fasta
        - Trimming map: trim_map, trim_map_index

        Files preserved (not deleted):
        - stitch_out: Untrimmed extended assembly
        - trim_out: Final trimmed extended assembly
        - qc_out, qc_out_index: QC alignment files
        - cons_log_np_out or cons_log_ill_out: Extension logs

        Missing files are silently skipped. A temporary path that resolves to
        the same location as ``org_fasta`` is never deleted, so the user's
        input survives even if the replicon name happens to match the input
        file's name (e.g. contig ``chr1`` inside ``chr1.fasta``).
        """
        tmp_files = [
            self.org_map,
            self.org_map_index,
            self.left_sam,
            self.left_filt_sam,
            self.left_filt_fq,
            self.right_sam,
            self.right_filt_sam,
            self.right_filt_fq,
            self.l_cons_out,
            self.l_cons_final_out,
            self.l_cons_alignment,
            self.revcomp_out,
            self.r_cons_final_out,
            self.r_cons_alignment,
            self.contig_fasta,
            self.trunc_left_fasta,
            self.trunc_right_fasta,
            self.l_map_out,
            self.l_map_out_index,
            self.r_map_out_index,
            self.r_map_out,
            self.stitch_left_fasta,
            self.stitch_right_fasta,
            self.trim_map,
            self.trim_map_index,
        ]
        protected_input = os.path.abspath(self.org_fasta)
        for path in tmp_files:
            if os.path.abspath(path) == protected_input:
                # Never delete the file the user handed in.
                continue
            # Attempt the removal directly rather than checking existence
            # first: this avoids the race between the check and the delete.
            try:
                os.remove(path)
            except FileNotFoundError:
                # The file was never produced (or is already gone) - nothing to do.
                pass

    def mv_files(self, folder: str, mode: str) -> None:
        """
        Move final output files to designated output directory.

        Relocates the essential output files (extended assemblies, QC BAM, and
        extension log) from the working directory to the specified output folder.
        The log file moved depends on the sequencing mode.

        Parameters
        ----------
        folder : str
            Path to destination directory for output files. It is created
            (including parents) if it does not exist yet.
        mode : str
            Sequencing technology mode: 'nanopore' or 'illumina'. Any other
            value moves the common files only and no extension log.

        Returns
        -------
        None
            Moves files to the destination folder

        Raises
        ------
        FileNotFoundError
            If any of the required output files don't exist (implicitly from shutil.move)

        Notes
        -----
        Files moved for all modes, in this order:
        - stitch_out: Untrimmed extended assembly
        - trim_out: Final trimmed extended assembly
        - qc_out: QC alignment BAM
        - qc_out_index: QC alignment BAM index

        Mode-specific files (moved last):
        - If mode='nanopore': moves cons_log_np_out
        - If mode='illumina': moves cons_log_ill_out

        Files retain their original names in the destination directory.
        Files are moved one at a time, so a failure part-way through leaves
        the earlier files already relocated.
        """
        # An empty folder string means "current directory"; makedirs('') would raise.
        if folder:
            os.makedirs(folder, exist_ok=True)

        keep_files = [self.stitch_out, self.trim_out, self.qc_out, self.qc_out_index]
        if mode == 'nanopore':
            keep_files.append(self.cons_log_np_out)
        elif mode == 'illumina':
            keep_files.append(self.cons_log_ill_out)

        for filename in keep_files:
            shutil.move(src=filename, dst=os.path.join(folder, filename))