telomore 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telomore/__init__.py +5 -0
- telomore/_version.py +34 -0
- telomore/app.py +536 -0
- telomore/utils/__init__.py +1 -0
- telomore/utils/arg_parser.py +220 -0
- telomore/utils/classes_and_small_func.py +289 -0
- telomore/utils/cmd_tools.py +732 -0
- telomore/utils/fasta_tools.py +595 -0
- telomore/utils/map_tools.py +1333 -0
- telomore/utils/qc_reports.py +493 -0
- telomore-0.4.1.dist-info/METADATA +149 -0
- telomore-0.4.1.dist-info/RECORD +15 -0
- telomore-0.4.1.dist-info/WHEEL +4 -0
- telomore-0.4.1.dist-info/entry_points.txt +2 -0
- telomore-0.4.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
"""Functions for generating useful QC metrics from the telomore script."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
|
|
7
|
+
from Bio import SeqIO
|
|
8
|
+
from Bio.Seq import Seq
|
|
9
|
+
from Bio.SeqRecord import SeqRecord
|
|
10
|
+
import pysam
|
|
11
|
+
|
|
12
|
+
from .cmd_tools import map_and_sort, map_and_sort_illumina
|
|
13
|
+
from .fasta_tools import (
|
|
14
|
+
cat_and_derep_fastq,
|
|
15
|
+
check_fastq_order,
|
|
16
|
+
dereplicate_fastq,
|
|
17
|
+
merge_fasta,
|
|
18
|
+
)
|
|
19
|
+
from .map_tools import sam_to_fastq, sam_to_readpair
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def qc_map(
    extended_assembly: str, left: str, right: str, output_handle: str, t: int = 1
) -> None:
    """
    Generate QC alignment of terminal reads against extended assembly (Nanopore).

    Collects terminal reads from left and right SAM files, converts them to
    FASTQ, deduplicates, and maps them back to the extended assembly. Used
    to validate the quality of the extension by visualizing read support.

    Parameters
    ----------
    extended_assembly : str
        Path to FASTA file of extended assembly with consensus attached
    left : str
        Path to SAM file containing left-terminal reads
    right : str
        Path to SAM file containing right-terminal reads
    output_handle : str
        Path for output sorted BAM file with QC alignments
    t : int, default=1
        Number of threads for mapping

    Returns
    -------
    None
        Writes QC alignment BAM to output_handle

    Notes
    -----
    QC mapping workflow:
    1. Creates temporary FASTQ file
    2. Converts left SAM reads to FASTQ (appends to temp file)
    3. Converts right SAM reads to FASTQ (appends to temp file)
    4. Deduplicates the combined FASTQ in place to remove redundant reads
    5. Maps deduplicated reads to extended assembly via map_and_sort
    6. Removes the temporary FASTQ file, even when an earlier step raised

    Deduplication is critical because the same read may appear in both
    left and right terminal sets if it maps near both ends.

    The temporary file is created with ``delete=False`` because it has to
    outlive the ``with`` block (it is re-opened by path afterwards), so the
    cleanup is done explicitly in a ``finally`` clause.
    """
    temp_fastq_path = None
    try:
        # The file has to be mode=w to create the correct type of object
        # for sam_to_fastq
        with tempfile.NamedTemporaryFile(
            suffix='.fastq', delete=False, mode='w'
        ) as temp_fastq:
            temp_fastq_path = temp_fastq.name
            sam_to_fastq(left, temp_fastq)
            sam_to_fastq(right, temp_fastq)
        # The handle is closed (and flushed) here, so the helpers below can
        # safely re-open the file by path.
        dereplicate_fastq(fastq_in=temp_fastq_path, fastq_out=temp_fastq_path)
        map_and_sort(extended_assembly, temp_fastq_path, output_handle, t)
    finally:
        # Do not leak the temp file when conversion, dereplication or
        # mapping fails part-way through.
        if temp_fastq_path is not None and os.path.exists(temp_fastq_path):
            os.remove(temp_fastq_path)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def qc_map_illumina(
    extended_assembly: str,
    left_sam: str,
    right_sam: str,
    fastq_in1: str,
    fastq_in2: str,
    output_handle: str,
    t: int = 1,
) -> None:
    """
    Map the read pairs behind the terminal alignments onto the extended assembly.

    For both the left and the right terminal SAM file, the complete read
    pairs are pulled from the original R1/R2 FASTQ files. The R1 files and
    the R2 files are then each concatenated and dereplicated, the resulting
    pair of files is checked for consistent ordering, and the pairs are
    mapped to the extended assembly.

    Parameters
    ----------
    extended_assembly : str
        Path to FASTA file of extended assembly with consensus attached
    left_sam : str
        Path to SAM file containing left-terminal read alignments
    right_sam : str
        Path to SAM file containing right-terminal read alignments
    fastq_in1 : str
        Path to original R1 FASTQ file (gzip compressed)
    fastq_in2 : str
        Path to original R2 FASTQ file (gzip compressed)
    output_handle : str
        Path for output sorted BAM file with QC alignments
    t : int, default=1
        Number of threads for mapping

    Returns
    -------
    None
        Writes QC alignment BAM to output_handle

    Raises
    ------
    Exception
        If the merged R1/R2 FASTQ files fail the ordering check

    Notes
    -----
    All intermediate FASTQ files live in a temporary directory that is
    removed when the function exits, whether it succeeds or raises.

    The ordering check runs before mapping so that the paired-end mapper
    wrapped by map_and_sort_illumina is never fed mismatched R1/R2 files.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Six intermediate files: R1/R2 for the left reads, the right reads
        # and the merged set, in that order.
        left_r1, left_r2, right_r1, right_r2, merged_r1, merged_r2 = (
            os.path.join(workdir, f'{prefix}_reads_{mate}.fastq')
            for prefix in ('terminal_left', 'terminal_right', 'all_terminal')
            for mate in (1, 2)
        )

        # Recover complete read pairs, left end first, then right end.
        extraction_jobs = (
            (left_sam, left_r1, left_r2),
            (right_sam, right_r1, right_r2),
        )
        for sam_path, out_r1, out_r2 in extraction_jobs:
            sam_to_readpair(
                sam_in=sam_path,
                fastq_in1=fastq_in1,
                fastq_in2=fastq_in2,
                fastq_out1=out_r1,
                fastq_out2=out_r2,
            )

        # Merge and dereplicate each mate file separately.
        cat_and_derep_fastq(fastq_in1=left_r1, fastq_in2=right_r1, fastq_out=merged_r1)
        cat_and_derep_fastq(fastq_in1=left_r2, fastq_in2=right_r2, fastq_out=merged_r2)

        # Refuse to map when the mates are out of sync.
        if not check_fastq_order(merged_r1, merged_r2):
            raise Exception('FASTQ files are not properly paired or ordered.')

        map_and_sort_illumina(
            reference=extended_assembly,
            read1=merged_r1,
            read2=merged_r2,
            output=output_handle,
            threads=t,
        )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def cons_genome_map(
    left_cons: str,
    right_cons: str,
    polished_genome: str,
    output_handle: str,
    t: int = 1,
) -> None:
    """
    Align both consensus sequences to the polished genome for QC.

    The left and right consensus FASTA files are merged into a single FASTA
    file, which is then mapped against the polished genome so that the
    placement and alignment quality of each consensus can be inspected.

    Parameters
    ----------
    left_cons : str
        Path to FASTA file containing left consensus sequence
    right_cons : str
        Path to FASTA file containing right consensus sequence
    polished_genome : str
        Path to FASTA file of polished/final genome
    output_handle : str
        Path for output sorted BAM file with consensus alignments
    t : int, default=1
        Number of threads for mapping

    Returns
    -------
    None
        Writes consensus alignment BAM to output_handle

    Notes
    -----
    The merged FASTA is written to 'all_cons.fasta' in the current working
    directory and is NOT removed afterwards.

    In the resulting alignment, a consensus that maps to several locations
    or aligns poorly (mismatches, soft-clipping) suggests it may not
    represent a true telomeric extension.
    """
    # Fixed file name in the current working directory; left in place on exit.
    merged_cons_path = 'all_cons.fasta'
    merge_fasta(left_cons, right_cons, merged_cons_path)
    map_and_sort(polished_genome, merged_cons_path, output_handle, t)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def cons_cons_map(
    left_cons: str, right_cons: str, output_handle: str, t: int = 1
) -> None:
    """
    Align the right consensus to the left consensus to detect similarity.

    The left consensus serves as the reference and the right consensus as
    the query. An alignment between the two may point to a circular
    chromosome or to telomeric repeats shared by both ends.

    Parameters
    ----------
    left_cons : str
        Path to FASTA file containing left consensus (used as reference)
    right_cons : str
        Path to FASTA file containing right consensus (used as query)
    output_handle : str
        Path for output sorted BAM file with cross-consensus alignments
    t : int, default=1
        Number of threads for mapping

    Returns
    -------
    None
        Writes cross-consensus alignment BAM to output_handle

    Notes
    -----
    Reading the result:
    - No alignment: the two ends are distinct (expected for a linear
      chromosome)
    - High-quality alignment: may indicate a circular chromosome whose ends
      should connect, repeat arrays present at both ends, or an artifact
      if the sequences are not expected to match
    """
    # Spell out the roles the two inputs play in the mapping call.
    reference, query = left_cons, right_cons
    map_and_sort(reference, query, output_handle, t)
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def cons_length(cons_file: str, output_handle: str, offset: int = 100) -> None:
    """
    Tabulate consensus lengths, with and without the offset, as a TSV file.

    Every record in the consensus FASTA contributes one row holding its
    identifier, its length minus ``offset``, and its full length.

    Parameters
    ----------
    cons_file : str
        Path to FASTA file containing consensus sequences
    output_handle : str
        Path for output TSV file with length statistics
    offset : int, default=100
        Number of bases of original reference included in consensus

    Returns
    -------
    None
        Writes TSV with columns: seq_id, end_cons, full_cons

    Notes
    -----
    Column meaning:
    - seq_id: identifier of the FASTA record
    - end_cons: full_cons - offset, i.e. the length beyond the offset region
    - full_cons: total record length

    Example: a 250 bp record with offset 100 yields end_cons=150 and
    full_cons=250. No clamping is applied, so a record shorter than
    ``offset`` produces a negative end_cons.
    """
    # Header row first, then one row per FASTA record.
    rows = [['seq_id', 'end_cons', 'full_cons']]
    rows.extend(
        [entry.id, len(entry) - offset, len(entry)]
        for entry in SeqIO.parse(cons_file, 'fasta')
    )

    with open(output_handle, 'w', newline='') as tsv_out:
        csv.writer(tsv_out, delimiter='\t').writerows(rows)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def map_to_depth(bam_file: str, output_handle: str) -> None:
    """
    Write per-position coverage depth for a BAM file.

    Runs ``samtools depth`` through pysam and stores the result in
    ``output_handle``.

    Parameters
    ----------
    bam_file : str
        Path to input BAM alignment file
    output_handle : str
        Path for output depth file

    Returns
    -------
    None
        Writes depth information to output_handle

    Notes
    -----
    The ``-aa`` flag makes samtools report absolutely all positions,
    including those with zero coverage, so the profile has no gaps.

    Output is tab-delimited with three columns: reference sequence name,
    1-based position, and coverage depth at that position.
    """
    # Equivalent to: samtools depth -aa <bam_file> -o <output_handle>
    depth_args = ['-aa', bam_file, '-o', output_handle]
    pysam.depth(*depth_args)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def finalize_log(log: str, right_fasta: str, left_fasta: str) -> None:
    """
    Finalize extension log by prepending final consensus lengths and sequences.

    Rewrites the log file with a summary section at the top showing the
    final consensus lengths after trimming and the trimmed consensus
    sequences, followed by the original log content and a closing separator.

    Parameters
    ----------
    log : str
        Path to extension log file to finalize (will be overwritten)
    right_fasta : str
        Path to FASTA file containing right consensus sequence
    left_fasta : str
        Path to FASTA file containing left consensus sequence

    Returns
    -------
    None
        Overwrites log file with finalized version including summary header

    Raises
    ------
    IndexError
        If the log has fewer lines or fields than the layout described below
    ValueError
        If a length or trim field is not an integer (other than the literal
        'rejected' for a trim field), or if a FASTA file does not hold
        exactly one record (raised by ``SeqIO.read``)

    Notes
    -----
    Expected layout of the incoming log (positional, not validated):
    - Line 4: two tab-separated ``name:length`` fields, left then right
    - Second-to-last line: ends with the number of bases trimmed from the
      left consensus, or 'rejected'
    - Last line: same for the right consensus

    Trimming direction:
    - Left consensus: the first ``trim_left`` bases are dropped
    - Right consensus: only the first ``right_len - trim_right`` bases are
      kept

    A rejected consensus is reported as 'rejected' in the length line and
    written as an empty sequence.

    The log is read completely and all values are parsed before the file is
    re-opened for writing, so a malformed log raises without being
    truncated. Both file accesses use context managers so the handles are
    closed even if an error occurs.
    """
    with open(log) as log_file:
        log_cont = log_file.readlines()

    # Org lengths of consensus added
    length_fields = log_cont[3].split('\t')
    left_len = int(length_fields[0].split(':')[1])
    right_len = int(length_fields[1].split(':')[1])

    # get the number of bases trimmed off
    trim_left = log_cont[-2].split(' ')[-1]
    trim_right = log_cont[-1].split(' ')[-1]
    left_seq = SeqIO.read(left_fasta, 'fasta')
    right_seq = SeqIO.read(right_fasta, 'fasta')

    if trim_left.rstrip() == 'rejected':
        new_left = 'rejected'
        left_seq = SeqRecord(Seq(''))
    else:
        new_left = left_len - int(trim_left)
        # trimmed bases are removed from the start of the left consensus
        left_seq = left_seq[int(trim_left) :]
    if trim_right.rstrip() == 'rejected':
        new_right = 'rejected'
        right_seq = SeqRecord(Seq(''))
    else:
        new_right = right_len - int(trim_right)
        # trimmed bases are removed from the end of the right consensus
        right_seq = right_seq[0:new_right]

    final_lengths = 'left_cons:{}\tright_consensus:{}'.format(new_left, new_right)

    # write to log file
    with open(log, 'w') as log_file:
        log_file.write(
            '=============================================================================='
        )
        log_file.write('\nFINAL GENOME EXTENSION')
        log_file.write(
            '\n==============================================================================\n'
        )
        log_file.write(final_lengths)
        log_file.write('\n>left_cons\n')
        log_file.write(str(left_seq.seq))
        log_file.write('\n>right_cons\n')
        log_file.write(str(right_seq.seq))
        log_file.write('\n')
        # original log content follows the summary unchanged
        log_file.writelines(log_cont)
        log_file.write(
            '==============================================================================\n'
        )
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: telomore
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: Identify and extract telomeric sequences from Oxford Nanopore or Illumina sequencing reads to extend Streptomycetes assemblies.
|
|
5
|
+
Project-URL: documentation, https://github.com/dalofa/telomore
|
|
6
|
+
Project-URL: homepage, https://github.com/dalofa/telomore
|
|
7
|
+
Project-URL: repository, https://github.com/dalofa/telomore
|
|
8
|
+
Author-email: David Faurdal <dalofa@biosustain.dtu.dk>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Requires-Dist: biopython
|
|
16
|
+
Requires-Dist: pysam
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: hatch; extra == 'dev'
|
|
19
|
+
Requires-Dist: isort; extra == 'dev'
|
|
20
|
+
Requires-Dist: numpydoc-validation; extra == 'dev'
|
|
21
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
22
|
+
Requires-Dist: pydocstyle; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# TELOMORE
|
|
27
|
+
|
|
28
|
+
Telomore is a tool for identifying and extracting telomeric sequences from
|
|
29
|
+
**Oxford Nanopore** or **Illumina** sequencing reads of *Streptomycetes spp.*
|
|
30
|
+
that have been excluded from a *de novo* assembly. It processes sequencing data
|
|
31
|
+
to extend assemblies, generate quality control (QC) maps, and produce finalized
|
|
32
|
+
assemblies with the telomere/recessed bases included.
|
|
33
|
+
|
|
34
|
+
## Before running Telomore
|
|
35
|
+
|
|
36
|
+
Telomore does not identify linear contigs but rather relies on the user to provide
|
|
37
|
+
that information in the header of the fasta-reference file.
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
telomore --mode <mode> --reference <reference.fasta> [options]
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Required Arguments
|
|
46
|
+
|
|
47
|
+
- `--mode` Specify the sequencing platform. Options: nanopore or illumina.
|
|
48
|
+
- `--reference` Path to the reference genome file in FASTA format.
|
|
49
|
+
|
|
50
|
+
Nanopore-Specific Arguments
|
|
51
|
+
|
|
52
|
+
- `--single` Path to a single gzipped FASTQ file containing Nanopore reads.
|
|
53
|
+
|
|
54
|
+
Illumina-Specific Arguments
|
|
55
|
+
|
|
56
|
+
- `--read1` Path to gzipped FASTQ file for Illumina read 1.
|
|
57
|
+
- `--read2` Path to gzipped FASTQ file for Illumina read 2.
|
|
58
|
+
|
|
59
|
+
Optional Arguments
|
|
60
|
+
|
|
61
|
+
- `--coverage_threshold` Set the threshold for coverage to stop trimming during
|
|
62
|
+
consensus trimming (Default is coverage=5 for ONT reads and coverage=1 for
|
|
63
|
+
Illumina reads).
|
|
64
|
+
- `--quality_threshold` Set the Q-score required to count a read position in the
|
|
65
|
+
coverage calculation during consensus trimming (Default is Q-score=10 for ONT
|
|
66
|
+
reads and Q-score=30 for Illumina reads).
|
|
67
|
+
- `--threads` Number of threads to use (default: 1).
|
|
68
|
+
- `--keep` Retain intermediate files (default: False).
|
|
69
|
+
- `--quiet` Suppress console logging.
|
|
70
|
+
|
|
71
|
+
## Process overview
|
|
72
|
+
|
|
73
|
+
The process is as follows:
|
|
74
|
+
|
|
75
|
+
1. **Map Reads:**
|
|
76
|
+
Reads are mapped against all contigs in a reference using either minimap2 or
|
|
77
|
+
Bowtie2.
|
|
78
|
+
2. **Extract Extending Reads**
|
|
79
|
+
Extending reads that are mapped to the ends of linear contigs are extracted.
|
|
80
|
+
3. **Build Consensus**
|
|
81
|
+
The terminal extending reads from each end are used to construct a consensus
|
|
82
|
+
using either lamassemble or mafft + EMBOSS cons.
|
|
83
|
+
4. **Align and Attach consensus**
|
|
84
|
+
The consensus for each end is aligned to the reference and used to extend it.
|
|
85
|
+
5. **Trim Extended Replicon**
|
|
86
|
+
In a final step, all terminally mapped reads are mapped to the new extended
|
|
87
|
+
reference and used to trim away spurious sequence, based on read-support.
|
|
88
|
+
|
|
89
|
+
## Outputs
|
|
90
|
+
|
|
91
|
+
At the end of a run Telomore produces the following outputs:
|
|
92
|
+
|
|
93
|
+
```Output
|
|
94
|
+
├── {fasta_basename}_{seqtype}_telomore
|
|
95
|
+
│ ├── {contig_name}_telomore_extended.fasta
|
|
96
|
+
│ ├── {contig_name}_telomore_ext_{seqtype}.log
|
|
97
|
+
│ ├── {contig_name}_telomore_QC.bam
|
|
98
|
+
│ ├── {contig_name}_telomore_QC.bam.bai
|
|
99
|
+
│ ├── {contig_name}_telomore_untrimmed.fasta
|
|
100
|
+
│ └── {fasta_basename}_telomore.fasta
|
|
101
|
+
└── telomore.log # log containing run information.
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
In the folder there are a number of files generated for each contig considered:
|
|
105
|
+
|
|
106
|
+
| File Name | Description |
|
|
107
|
+
|-----------|-------------|
|
|
108
|
+
| `{contig_name}_telomore_extended.fasta` | Original contig sequence + added terminal bases - trimmed bases |
|
|
109
|
+
| `{contig_name}_telomore_ext_{seqtype}.log` | Log containing information about bases added, trimmed off and final result. |
|
|
110
|
+
| `{contig_name}_telomore_QC.bam` | BAM file containing terminal reads mapped to `{contig_name}_telomore_extended.fasta`. Useful for manual inspection of the extension. |
|
|
111
|
+
| `{contig_name}_telomore_QC.bam.bai` | Index file for the corresponding BAM file. |
|
|
112
|
+
| `{contig_name}_telomore_untrimmed.fasta` | Original contig sequence + added terminal bases |
|
|
113
|
+
|
|
114
|
+
Additionally, there is a fasta-file collecting all tagged linear contigs as they
|
|
115
|
+
appear in `{contig_name}_telomore_extended.fasta` together with all non-linear
|
|
116
|
+
contigs in the order they appear in the original file.
|
|
117
|
+
|
|
118
|
+
Inspecting the `{contig_name}_telomore_QC.bam` file in IGV (Integrative Genomics Viewer)
|
|
119
|
+
can be informative in evaluating the extended contig.
|
|
120
|
+
|
|
121
|
+
## Dependencies (CLI-tools)
|
|
122
|
+
|
|
123
|
+
- Bowtie2
|
|
124
|
+
- Emboss tools (cons specifically)
|
|
125
|
+
- Lamassemble
|
|
126
|
+
- LAST-DB
|
|
127
|
+
- Mafft
|
|
128
|
+
- Minimap2, version 2.25 or higher
|
|
129
|
+
- Samtools
|
|
130
|
+
|
|
131
|
+
These can be installed using the conda recipe in this repo:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
conda env create -f environment.yml -y
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
This repo can then be downloaded using git clone, the conda environment activated
|
|
138
|
+
and the tool installed
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Activate telomore conda env
|
|
142
|
+
conda activate telomore
|
|
143
|
+
|
|
144
|
+
# Clone telomore repo
|
|
145
|
+
git clone https://github.com/dalofa/telomore && cd telomore
|
|
146
|
+
|
|
147
|
+
# Install package
|
|
148
|
+
pip install -e '.[dev]'
|
|
149
|
+
```
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
telomore/__init__.py,sha256=-yilfTa-JZQ_2VfYWowO54WFEDInH1h-OGWaw88McfI,244
|
|
2
|
+
telomore/_version.py,sha256=k7cu0JKra64gmMNU_UfA5sw2eNc_GRvf3QmesiYAy8g,704
|
|
3
|
+
telomore/app.py,sha256=A3JuFRZHQYxjiVqwofo8w56okmdVDHf1C4zA4klfCl4,17935
|
|
4
|
+
telomore/utils/__init__.py,sha256=6LPHIWv6ARRiIY6ys2_uLJmzmpZYeM8cTuoKWG9ukGU,30
|
|
5
|
+
telomore/utils/arg_parser.py,sha256=UH0sRL14YyuV4PCMzkeR329fD-x5xR6yHA-6UD7q_us,7349
|
|
6
|
+
telomore/utils/classes_and_small_func.py,sha256=Xf3ytB-CSlCYvdFfIVsL74k8jq8nNdZKqEtMy2bMSAw,10173
|
|
7
|
+
telomore/utils/cmd_tools.py,sha256=4EIZLhWZR1fvzIny22KjqRo8IPFEAGL1QHeANTu6ipQ,26949
|
|
8
|
+
telomore/utils/fasta_tools.py,sha256=dZc4lTlkZu6JZoBb9e6EuJ3bC4GnxijiUKV3_JbXcSY,19798
|
|
9
|
+
telomore/utils/map_tools.py,sha256=0cIlIyyjbBWAQT46SYwvQ6eKsF-0RLmyz8udVaJPFaE,47804
|
|
10
|
+
telomore/utils/qc_reports.py,sha256=Mcyn3S1As6Drd9TuTywRyTdA5qfb3msRozs_NhWFYAw,16849
|
|
11
|
+
telomore-0.4.1.dist-info/METADATA,sha256=HxgTEyLYFR9yWpJaARiDCjSojpKFLmOyuqKBnNl2Nzs,5427
|
|
12
|
+
telomore-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
13
|
+
telomore-0.4.1.dist-info/entry_points.txt,sha256=imwQdQxdlhqz5NeIIiqWRbR9jYh1cy6sIJpY-FTiGgA,53
|
|
14
|
+
telomore-0.4.1.dist-info/licenses/LICENSE,sha256=otCsiAo74jRQIibnrWLcyZ9qk-0c2pMP7Xl984uh-Cs,1088
|
|
15
|
+
telomore-0.4.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Technical University of Denmark
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|