telomore 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telomore/__init__.py +5 -0
- telomore/_version.py +34 -0
- telomore/app.py +536 -0
- telomore/utils/__init__.py +1 -0
- telomore/utils/arg_parser.py +220 -0
- telomore/utils/classes_and_small_func.py +289 -0
- telomore/utils/cmd_tools.py +732 -0
- telomore/utils/fasta_tools.py +595 -0
- telomore/utils/map_tools.py +1333 -0
- telomore/utils/qc_reports.py +493 -0
- telomore-0.4.1.dist-info/METADATA +149 -0
- telomore-0.4.1.dist-info/RECORD +15 -0
- telomore-0.4.1.dist-info/WHEEL +4 -0
- telomore-0.4.1.dist-info/entry_points.txt +2 -0
- telomore-0.4.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
"""Utilities for handling fasta files."""
|
|
2
|
+
|
|
3
|
+
from itertools import zip_longest
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from Bio import SeqIO
|
|
7
|
+
from Bio.SeqRecord import SeqRecord
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check_fastq_order(file1: str, file2: str) -> bool:
    """
    Report whether two FASTQ files list the same read IDs in the same order.

    Both files are parsed in lockstep and the read IDs found at each
    position are compared. The first discrepancy, either one file running
    out of reads before the other or two differing IDs, is printed and
    ends the check.

    Parameters
    ----------
    file1 : str
        Path to the first FASTQ file
    file2 : str
        Path to the second FASTQ file

    Returns
    -------
    bool
        True when both files hold the same number of reads and the reads at
        every position share the same ID, False otherwise

    Notes
    -----
    - Only ``record.id`` is compared; sequences and qualities are ignored
    - Read positions in the printed messages are 1-based
    - Diagnostics go through ``print``, not through ``logging``
    """
    reads1 = SeqIO.parse(file1, 'fastq')
    reads2 = SeqIO.parse(file2, 'fastq')

    # zip_longest pads the shorter file with None, so a length difference
    # surfaces as a None on one side instead of silently ending the loop.
    for position, pair in enumerate(zip_longest(reads1, reads2), start=1):
        first, second = pair

        if first is None or second is None:
            print(
                f'{file1} and {file2} are not the same length, diverging at read {position}'
            )
            return False

        if first.id != second.id:
            print(
                f'Mismatch at read {position} in files {file1} {file2}: {first.id} != {second.id}'
            )
            return False

    return True
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_linear_elements(fasta_file: str) -> list[str]:
    """
    Collect the IDs of all contigs whose header mentions 'linear'.

    Parameters
    ----------
    fasta_file : str
        Path to the FASTA file to scan

    Returns
    -------
    list of str
        ``record.id`` of every record whose ``record.description`` contains
        the substring 'linear', in file order. Empty when nothing matches.

    Notes
    -----
    The check is a plain, case-sensitive substring test, so headers such as
    ``>contig_name linear`` and ``>contig_name [linear] other text`` both
    qualify.
    """
    return [
        entry.id
        for entry in SeqIO.parse(fasta_file, 'fasta')
        if 'linear' in entry.description
    ]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def extract_contig(fasta_in: str, contig_name: str, fasta_out: str) -> None:
    """
    Write the first contig with a given ID to its own FASTA file.

    Parameters
    ----------
    fasta_in : str
        Path to the input (multi-)FASTA file
    contig_name : str
        ID to look for; compared to ``record.id`` with ``==``
    fasta_out : str
        Path of the single-record FASTA file to create

    Returns
    -------
    None
        The matching record is written to ``fasta_out``

    Notes
    -----
    - Scanning stops at the first match, so later records carrying the same
      ID can no longer overwrite the output
    - If no record matches, ``fasta_out`` is not opened and nothing is written
    - The record is written as parsed (ID, description and sequence)
    """
    for record in SeqIO.parse(fasta_in, 'fasta'):
        if record.id == contig_name:
            with open(fasta_out, 'w') as fasta_handle:
                SeqIO.write(sequences=record, handle=fasta_handle, format='fasta')
            # First hit wins: returning here keeps a duplicate ID further
            # down the file from replacing the contig already written.
            return
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_fasta_length(fasta_file: str, contig_name: str) -> int:
    """
    Look up the sequence length of one contig in a FASTA file.

    Parameters
    ----------
    fasta_file : str
        Path to the FASTA file
    contig_name : str
        ID of the contig; compared to ``record.id`` with ``==``

    Returns
    -------
    int
        ``len(record.seq)`` of the first record whose ID matches.
        None is returned when no record matches, despite the annotation.
    """
    matching_lengths = (
        len(entry.seq)
        for entry in SeqIO.parse(fasta_file, 'fasta')
        if entry.id == contig_name
    )
    # The None default reproduces "fell off the end of the loop" for a miss.
    return next(matching_lengths, None)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def dereplicate_fastq(fastq_in: str, fastq_out: str) -> None:
    """
    Copy a FASTQ file, keeping only the first record seen for each read ID.

    Parameters
    ----------
    fastq_in : str
        Path to the input FASTQ file
    fastq_out : str
        Path of the FASTQ file to write

    Returns
    -------
    None
        The retained records are written to ``fastq_out``

    Notes
    -----
    - Records are compared by ``record.id`` only; sequence and quality are
      not part of the key
    - The order of first occurrences is preserved
    - All retained records are held in memory, and the input is closed
      before the output is opened, so ``fastq_out`` may be the same path
      as ``fastq_in``
    """
    kept_ids = set()
    kept_records = []

    with open(fastq_in, 'r') as source:
        for entry in SeqIO.parse(source, 'fastq'):
            if entry.id in kept_ids:
                continue  # a read with this ID was already kept
            kept_ids.add(entry.id)
            kept_records.append(entry)

    with open(fastq_out, 'w') as sink:
        SeqIO.write(kept_records, sink, 'fastq')
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def cat_and_derep_fastq(fastq_in1: str, fastq_in2: str, fastq_out: str) -> None:
    """
    Concatenate two FASTQ files and drop reads with repeated IDs.

    Parameters
    ----------
    fastq_in1 : str
        Path to the first input FASTQ file
    fastq_in2 : str
        Path to the second input FASTQ file
    fastq_out : str
        Path of the FASTQ file to write

    Returns
    -------
    None
        The combined, dereplicated reads are written to ``fastq_out``

    Notes
    -----
    The output file is written twice: first with every record of
    ``fastq_in1`` followed by every record of ``fastq_in2``, then rewritten
    in place by ``dereplicate_fastq``, which keeps the first record per
    read ID.
    """
    with open(fastq_out, 'w') as combined:
        # Order matters: reads of the first file come before the second.
        for source_path in (fastq_in1, fastq_in2):
            with open(source_path, 'r') as source:
                for entry in SeqIO.parse(source, 'fastq'):
                    SeqIO.write(entry, combined, 'fastq')

    # The combined file is closed at this point, so it can safely serve as
    # both input and output of the dereplication step.
    dereplicate_fastq(fastq_in=fastq_out, fastq_out=fastq_out)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_chromosome(fasta: str, output_handle: str) -> None:
    """
    Write the longest contig of a FASTA file to a new FASTA file.

    A file with a single contig yields that contig. With several contigs
    the one with the longest sequence is selected; on a tie the contig that
    appears first in the file is kept.

    Parameters
    ----------
    fasta : str
        Path to the input FASTA file (one or more contigs)
    output_handle : str
        Path of the FASTA file that receives the selected contig

    Returns
    -------
    None
        The selected contig is written to ``output_handle`` and the choice
        is reported through ``logging.info``

    Raises
    ------
    ValueError
        If ``fasta`` contains no records

    Notes
    -----
    The input is parsed once and only the current best candidate is kept,
    so an empty input is reported explicitly instead of failing later on an
    unassigned variable.
    """
    chromosome = None
    n_contigs = 0

    for record in SeqIO.parse(fasta, format='fasta'):
        n_contigs += 1
        # Strict '>' keeps the earliest contig when lengths are equal.
        if chromosome is None or len(record.seq) > len(chromosome.seq):
            chromosome = record

    if chromosome is None:
        raise ValueError('No sequences found in {}'.format(fasta))

    SeqIO.write(chromosome, output_handle, format='fasta')

    if n_contigs == 1:
        message = 'A single contig: {} was found and will be used for mapping'.format(
            '>' + chromosome.id
        )
    else:
        message = 'The longest contig: {} has been saved as {} and will be used for mapping.'.format(
            '>' + chromosome.id, output_handle
        )
    logging.info(message)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def attach_seq(
    left: str, right: str, chromosome: str, output_name: str, offset: int = 0
) -> None:
    """
    Join a left sequence, a chromosome and a right sequence into one record.

    Parameters
    ----------
    left : str
        Path to the FASTA file holding the sequence placed before the chromosome
    right : str
        Path to the FASTA file holding the sequence placed after the chromosome
    chromosome : str
        Path to the FASTA file holding the chromosome
    output_name : str
        Path of the FASTA file to write
    offset : int, default=0
        Number of bases dropped from each end of the chromosome before the
        flanking sequences are attached

    Returns
    -------
    None
        The joined record is written to ``output_name``

    Notes
    -----
    - Each input is loaded with ``SeqIO.read``
    - With ``offset == 0`` the chromosome is used whole; otherwise
      ``chromosome[offset:-offset]`` is used
    - If ``offset`` is at least half the chromosome length an error is
      logged and the function returns without writing anything; no
      exception is raised
    - The output record ID is the part of ``output_name`` before its first
      '.' (``output_name.split('.')[0]``)
    """
    left_rec = SeqIO.read(left, 'fasta')
    right_rec = SeqIO.read(right, 'fasta')
    chrom_rec = SeqIO.read(chromosome, 'fasta')

    if offset != 0:
        if offset >= len(chrom_rec.seq) / 2:
            logging.error('Error: Offset is larger than 1/2 genome length.')
            return
        core = chrom_rec[offset:-offset]
    else:
        # Zero is special-cased because [0:-0] would be an empty slice.
        core = chrom_rec

    extended = left_rec + core + right_rec
    extended.id = output_name.split('.')[0]
    SeqIO.write(extended, output_name, 'fasta')
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# A function to merge fasta files
|
|
379
|
+
def merge_fasta(input_file1: str, input_file2: str, output_file: str) -> None:
    """
    Write the records of two FASTA files into one FASTA file.

    Parameters
    ----------
    input_file1 : str
        Path to the first input FASTA file
    input_file2 : str
        Path to the second input FASTA file
    output_file : str
        Path of the merged FASTA file to write

    Returns
    -------
    None
        The merged records are written to ``output_file``

    Notes
    -----
    - Every record of ``input_file1`` comes first, followed by every record
      of ``input_file2``; the order within each file is kept
    - Both inputs are fully loaded before the output is opened
    """
    combined = list(SeqIO.parse(input_file1, 'fasta'))
    combined.extend(SeqIO.parse(input_file2, 'fasta'))

    with open(output_file, 'w') as sink:
        SeqIO.write(combined, sink, 'fasta')
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def trim_to_cons(input_seq: str, num_base: int, output_handle: str) -> None:
    """
    Keep only the leading part of every sequence in a FASTA file.

    Parameters
    ----------
    input_seq : str
        Path to the input FASTA file
    num_base : int
        Length threshold; sequences shorter than this are skipped
    output_handle : str
        Path of the trimmed FASTA file to write

    Returns
    -------
    None
        The trimmed records are written to ``output_handle``

    Notes
    -----
    - Each kept record holds ``seq[0 : num_base + 1]``, i.e. up to
      ``num_base + 1`` bases
    - NOTE(review): if exactly ``num_base`` bases are intended, the slice end
      is off by one; confirm against the callers before changing it
    - Output IDs are the input ID prefixed with 'trimmed_' and the
      description is cleared
    - A sequence shorter than ``num_base`` is skipped and an error is logged
    - Nothing is written when no sequence qualifies
    """
    trimmed = []

    with open(input_seq) as source:
        for entry in SeqIO.parse(source, 'fasta'):
            if num_base > len(entry.seq):
                logging.error('Error: Index out of range')
                continue

            trimmed.append(
                SeqRecord(
                    seq=entry.seq[0 : num_base + 1],
                    id='trimmed_' + entry.id,
                    description='',
                )
            )

    if trimmed:
        SeqIO.write(trimmed, output_handle, 'fasta')
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def strip_fasta(
    input_file: str, output_file: str, x: int, remove_from: str = 'start'
) -> None:
    """
    Remove a number of bases from one end of every sequence in a FASTA file.

    Parameters
    ----------
    input_file : str
        Path to the input FASTA file
    output_file : str
        Path of the stripped FASTA file to write
    x : int
        Number of bases to remove from each sequence
    remove_from : str, default='start'
        'start' removes the first ``x`` bases, 'end' removes the last ``x``

    Returns
    -------
    None
        The stripped records are written to ``output_file``

    Raises
    ------
    AssertionError
        If ``x`` is not exactly of type ``int``
    ValueError
        If ``remove_from`` is neither 'start' nor 'end'

    Notes
    -----
    - Both arguments are validated before the input is read, so an invalid
      ``remove_from`` is reported even for an input without records
    - ``x == 0`` leaves the sequences unchanged for either end
    - Record IDs and descriptions are kept as parsed
    - ``x`` is not checked against the sequence lengths
    """
    # Raised explicitly (not via `assert`) so the check survives `python -O`;
    # the exception type and the exact-type test match the former assert.
    if type(x) is not int:
        raise AssertionError('x must be an int')
    if remove_from not in ('start', 'end'):
        raise ValueError("remove_from must be either 'start' or 'end'")

    records = []

    for record in SeqIO.parse(input_file, 'fasta'):
        if remove_from == 'start':
            record.seq = record.seq[x:]
        elif x:
            record.seq = record.seq[:-x]
        # remove_from == 'end' with x == 0: seq[:-0] is seq[:0], which would
        # empty the sequence, so the record is deliberately left untouched.
        records.append(record)

    SeqIO.write(records, output_file, 'fasta')
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def build_extended_fasta(
    org_fasta: str, linear_elements: list[str], replicon_list: list, output_handle: str
) -> None:
    """
    Rebuild a multi-FASTA with extended linear contigs in their original order.

    Parameters
    ----------
    org_fasta : str
        Path to the original input FASTA file
    linear_elements : list of str
        IDs of the contigs that should be replaced by an extended version
    replicon_list : list
        Objects exposing ``name`` (contig ID) and ``trim_out`` (path to a
        FASTA file with the extended sequence)
    output_handle : str
        Path of the FASTA file to write

    Returns
    -------
    None
        The reconstructed records are written to ``output_handle``

    Notes
    -----
    Records are processed in the order of ``org_fasta``:

    - A record whose ID is not in ``linear_elements`` is copied unchanged
    - A record whose ID is in ``linear_elements`` is replaced by the record
      read from ``trim_out`` of every replicon whose ``name`` equals the ID;
      the description of each replacement is set to '[linear]'
    - If no replicon matches such a record, a warning is logged and the
      original record is kept, so the contig is not lost from the output
    """
    seq_rec_list = []  # records to write, in original file order

    for record in SeqIO.parse(org_fasta, 'fasta'):
        if record.id not in linear_elements:
            seq_rec_list.append(record)
            continue

        matches = [replicon for replicon in replicon_list if replicon.name == record.id]

        if not matches:
            # Without this fallback the contig would silently vanish from
            # the final assembly.
            logging.warning(
                'No extended sequence found for linear contig %s; keeping the original sequence',
                record.id,
            )
            seq_rec_list.append(record)
            continue

        for replicon in matches:
            telomore_rec = SeqIO.read(replicon.trim_out, format='fasta')
            telomore_rec.description = '[linear]'
            seq_rec_list.append(telomore_rec)

    SeqIO.write(sequences=seq_rec_list, handle=output_handle, format='fasta')
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
if __name__ == '__main__':
    # Manual check of two FASTQ files, run as a script. The paths come from
    # the command line instead of a hard-coded temporary directory.
    import sys

    if len(sys.argv) != 3:
        sys.exit(f'usage: {sys.argv[0]} READS_1.fastq READS_2.fastq')

    # Exit status 0 when the files are in sync, 1 otherwise.
    sys.exit(0 if check_fastq_order(sys.argv[1], sys.argv[2]) else 1)
|