telomore 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,595 @@
1
+ """Utilities for handling fasta files."""
2
+
3
+ from itertools import zip_longest
4
+ import logging
5
+
6
+ from Bio import SeqIO
7
+ from Bio.SeqRecord import SeqRecord
8
+
9
+
10
def check_fastq_order(file1: str, file2: str) -> bool:
    """
    Check if two FASTQ files have the same length and read order.

    Walks both files in lockstep and compares the read ID at every
    position. The check stops at the first difference.

    Parameters
    ----------
    file1 : str
        Path to first FASTQ file
    file2 : str
        Path to second FASTQ file

    Returns
    -------
    bool
        True if both files contain the same number of reads and the read
        IDs match position by position, False otherwise

    Notes
    -----
    - A message describing the first difference (length or read ID) is
      printed to stdout before False is returned
    - Read positions in the messages are 1-based
    - Both files are opened with context managers so the handles are
      closed even when the function returns early
    """
    # Open the files explicitly instead of handing paths to SeqIO.parse, so
    # the handles are closed deterministically on the early-return paths.
    with open(file1) as handle1, open(file2) as handle2:
        reads1 = SeqIO.parse(handle1, 'fastq')
        reads2 = SeqIO.parse(handle2, 'fastq')

        # zip_longest pads the shorter file with None, which lets a length
        # difference be detected instead of silently stopping early.
        paired_reads = zip_longest(reads1, reads2)
        for position, (read1, read2) in enumerate(paired_reads, start=1):
            if read1 is None or read2 is None:
                print(
                    f'{file1} and {file2} are not the same length, diverging at read {position}'
                )
                return False
            if read1.id != read2.id:
                print(
                    f'Mismatch at read {position} in files {file1} {file2}: {read1.id} != {read2.id}'
                )
                return False
    return True
58
+
59
+
60
def get_linear_elements(fasta_file: str) -> list[str]:
    """
    Collect the IDs of all contigs whose header mentions 'linear'.

    Parameters
    ----------
    fasta_file : str
        Path to FASTA file where linear contigs are tagged

    Returns
    -------
    list of str
        record.id of every record whose description contains the substring
        'linear', in file order. Empty if no record matches.

    Notes
    -----
    The test is a plain substring match on the record description, so the
    keyword may appear anywhere in it, e.g.:
        >contig_name linear
        >contig_name [linear] some other description
    """
    entries = SeqIO.parse(fasta_file, 'fasta')
    return [entry.id for entry in entries if 'linear' in entry.description]
97
+
98
+
99
def extract_contig(fasta_in: str, contig_name: str, fasta_out: str) -> None:
    """
    Extract a single contig from a multi-FASTA file.

    Searches a FASTA file for the first record whose ID equals contig_name
    and writes that record to a new single-sequence FASTA file.

    Parameters
    ----------
    fasta_in : str
        Path to input multi-FASTA file
    contig_name : str
        Name of contig to extract (must match record.id exactly)
    fasta_out : str
        Path for output FASTA file containing only the extracted contig

    Returns
    -------
    None
        Writes extracted contig to fasta_out

    Raises
    ------
    ValueError
        If no record in fasta_in has an ID equal to contig_name

    Notes
    -----
    - Only the first contig matching contig_name is extracted; parsing
      stops as soon as it is found
    - If no match is found, fasta_out is not created or modified
    """
    with open(fasta_in) as fasta_handle:
        # next() stops at the first hit, so later records sharing the same
        # ID cannot overwrite the selection.
        contig = next(
            (
                record
                for record in SeqIO.parse(fasta_handle, 'fasta')
                if record.id == contig_name
            ),
            None,
        )

    if contig is None:
        # Fail before opening fasta_out so no empty file is left behind.
        raise ValueError(f'Contig {contig_name!r} not found in {fasta_in}')

    with open(fasta_out, 'w') as out_handle:
        SeqIO.write(sequences=contig, handle=out_handle, format='fasta')
131
+
132
+
133
def get_fasta_length(fasta_file: str, contig_name: str) -> int:
    """
    Look up the sequence length of one contig in a FASTA file.

    Parameters
    ----------
    fasta_file : str
        Path to FASTA file
    contig_name : str
        Name of contig whose length to retrieve (must match record.id exactly)

    Returns
    -------
    int
        Number of bases in the first record whose ID equals contig_name.
        None is returned when no record matches.
    """
    # Lazy generator: parsing stops at the first record with a matching ID.
    matching_lengths = (
        len(entry.seq)
        for entry in SeqIO.parse(fasta_file, 'fasta')
        if entry.id == contig_name
    )
    return next(matching_lengths, None)
162
+
163
+
164
def dereplicate_fastq(fastq_in: str, fastq_out: str) -> None:
    """
    Write a copy of a FASTQ file that keeps one read per read ID.

    Parameters
    ----------
    fastq_in : str
        Path to input FASTQ file (may contain duplicates)
    fastq_out : str
        Path for output deduplicated FASTQ file

    Returns
    -------
    None
        Writes deduplicated reads to fastq_out

    Notes
    -----
    - Reads are compared by read.id only, never by sequence
    - The first read seen for each ID is kept, in input order
    - All kept reads are held in memory and the input is closed before the
      output is opened
    """
    ids_seen = set()  # read IDs already kept
    kept_reads = []

    with open(fastq_in, 'r') as source:
        for read in SeqIO.parse(source, 'fastq'):
            # Key on the ID alone: the same read may have been extracted
            # more than once and must not be emitted twice.
            if read.id in ids_seen:
                continue
            ids_seen.add(read.id)
            kept_reads.append(read)

    with open(fastq_out, 'w') as sink:
        SeqIO.write(kept_reads, sink, 'fastq')
206
+
207
+
208
def cat_and_derep_fastq(fastq_in1: str, fastq_in2: str, fastq_out: str) -> None:
    """
    Concatenate two FASTQ files and drop reads with repeated IDs.

    Parameters
    ----------
    fastq_in1 : str
        Path to first input FASTQ file
    fastq_in2 : str
        Path to second input FASTQ file
    fastq_out : str
        Path for output deduplicated FASTQ file

    Returns
    -------
    None
        Writes concatenated and deduplicated reads to fastq_out

    Notes
    -----
    Works in two passes over fastq_out:
    1. Every read of fastq_in1, then every read of fastq_in2, is written
       to fastq_out
    2. dereplicate_fastq is run with fastq_out as both input and output,
       overwriting it with one read per read ID
    """
    # Pass 1: plain concatenation, first input before second.
    with open(fastq_out, 'w') as combined:
        for source_path in (fastq_in1, fastq_in2):
            with open(source_path, 'r') as source:
                for read in SeqIO.parse(source, 'fastq'):
                    SeqIO.write(read, combined, 'fastq')

    # Pass 2: rewrite the concatenated file in place without duplicates.
    dereplicate_fastq(fastq_in=fastq_out, fastq_out=fastq_out)
251
+
252
+
253
def get_chromosome(fasta: str, output_handle: str) -> None:
    """
    Extract the primary chromosome from a FASTA file.

    If the input contains a single contig, it is written to the output.
    If multiple contigs exist, the longest contig is selected and written,
    as it is assumed to be the main chromosome. The selection is logged.

    Parameters
    ----------
    fasta : str
        Path to input FASTA file (single or multi-contig)
    output_handle : str
        Path for output FASTA file containing the selected chromosome

    Returns
    -------
    None
        Writes the selected chromosome to output_handle and logs the selection

    Raises
    ------
    ValueError
        If the input FASTA file contains no records

    Notes
    -----
    - The file is parsed once; only the longest record seen so far is kept
    - When several contigs share the maximum length, the first one in the
      file is selected
    - The log message states whether the contig was the only one or was
      chosen as the longest
    """
    longest = None
    n_contigs = 0

    for record in SeqIO.parse(fasta, format='fasta'):
        n_contigs += 1
        # Strict '>' keeps the first of equally long contigs; the None check
        # makes sure zero-length contigs can still be selected.
        if longest is None or len(record.seq) > len(longest.seq):
            longest = record

    if longest is None:
        # An empty input previously fell through to an unbound variable.
        raise ValueError(f'No contigs found in {fasta}')

    SeqIO.write(longest, output_handle, format='fasta')

    if n_contigs == 1:
        message = 'A single contig: {} was found and will be used for mapping'.format(
            '>' + longest.id
        )
    else:
        message = 'The longest contig: {} has been saved as {} and will be used for mapping.'.format(
            '>' + longest.id, output_handle
        )
    logging.info(message)
313
+
314
+
315
def attach_seq(
    left: str, right: str, chromosome: str, output_name: str, offset: int = 0
) -> None:
    """
    Attach sequences to both ends of a chromosome sequence.

    Concatenates left + chromosome + right, optionally trimming `offset`
    bases from each end of the chromosome before attachment, and writes
    the result as a single FASTA record.

    Parameters
    ----------
    left : str
        Path to single-record FASTA file with the sequence to put in front
    right : str
        Path to single-record FASTA file with the sequence to append
    chromosome : str
        Path to single-record FASTA file containing the chromosome sequence
    output_name : str
        Path for output FASTA file with attached sequences
    offset : int, default=0
        Number of bases to trim from each end of chromosome before attachment

    Returns
    -------
    None
        Writes the extended sequence to output_name

    Notes
    -----
    - No exception is raised for an invalid offset. If offset is negative,
      or greater than or equal to half the chromosome length, an error is
      logged and the function returns without writing output_name.
    - If offset > 0, chromosome bases [offset:-offset] are retained.
      Example: for a 10kb chromosome with offset=100 the result is
      left + chromosome[100:9900] + right.
    - The output record ID is everything in output_name before its first
      '.' character.
    """
    if offset < 0:
        # A negative offset would turn [offset:-offset] into a meaningless
        # slice, so reject it the same way as an oversized offset.
        logging.error('Error: Offset must not be negative.')
        return

    left_seq = SeqIO.read(left, 'fasta')
    right_seq = SeqIO.read(right, 'fasta')
    chrom = SeqIO.read(chromosome, 'fasta')

    if offset == 0:
        # chrom[0:-0] is an empty slice, so skip trimming entirely.
        genome = chrom
    elif offset >= len(chrom.seq) / 2:
        logging.error('Error: Offset is larger than 1/2 genome length.')
        return
    else:
        genome = chrom[offset:-offset]

    att_genome = left_seq + genome + right_seq
    att_genome.id = output_name.split('.')[0]
    SeqIO.write(att_genome, output_name, 'fasta')
376
+
377
+
378
+ # A function to merge fasta files
379
def merge_fasta(input_file1: str, input_file2: str, output_file: str) -> None:
    """
    Write the records of two FASTA files into one multi-FASTA file.

    Parameters
    ----------
    input_file1 : str
        Path to first input FASTA file
    input_file2 : str
        Path to second input FASTA file
    output_file : str
        Path for output merged FASTA file

    Returns
    -------
    None
        Writes merged sequences to output_file

    Notes
    -----
    - Every record of both inputs is written
    - Order is all records of input_file1, then all records of input_file2
    - Both inputs are read completely before output_file is opened
    """
    # Read everything first; the nested loop keeps file1 records ahead of
    # file2 records.
    combined = [
        entry
        for source_path in (input_file1, input_file2)
        for entry in SeqIO.parse(source_path, 'fasta')
    ]

    with open(output_file, 'w') as merged_out:
        SeqIO.write(combined, merged_out, 'fasta')
419
+
420
+
421
def trim_to_cons(input_seq: str, num_base: int, output_handle: str) -> None:
    """
    Keep only the leading part of every sequence in a FASTA file.

    Parameters
    ----------
    input_seq : str
        Path to input FASTA file
    num_base : int
        Trimming position; the slice [0:num_base + 1] of each sequence is kept
    output_handle : str
        Path for output trimmed FASTA file

    Returns
    -------
    None
        Writes trimmed sequences to output_handle

    Notes
    -----
    - The retained slice is [0:num_base + 1], i.e. indices 0 through
      num_base inclusive where the sequence is long enough
    - Output IDs are the input IDs prefixed with 'trimmed_'
    - Output descriptions are empty
    - Sequences shorter than num_base are skipped and an error is logged
    - Nothing is written when no sequence passes the length check
    """
    trimmed = []

    with open(input_seq) as source:
        for entry in SeqIO.parse(source, 'fasta'):
            # Guard clause: sequences shorter than num_base are reported
            # and left out of the output.
            if len(entry.seq) < num_base:
                logging.error('Error: Index out of range')
                continue

            trimmed.append(
                SeqRecord(
                    seq=entry.seq[: num_base + 1],
                    id='trimmed_' + entry.id,
                    description='',
                )
            )

    if trimmed:
        SeqIO.write(trimmed, output_handle, 'fasta')
475
+
476
+
477
def strip_fasta(
    input_file: str, output_file: str, x: int, remove_from: str = 'start'
) -> None:
    """
    Remove a specified number of bases from sequence ends.

    Strips x bases from either the start or the end of every sequence in
    a FASTA file and writes the result to a new file.

    Parameters
    ----------
    input_file : str
        Path to input FASTA file
    output_file : str
        Path for output stripped FASTA file
    x : int
        Number of bases to remove from each sequence
    remove_from : str, default='start'
        Which end to remove bases from: 'start' or 'end'

    Returns
    -------
    None
        Writes stripped sequences to output_file

    Raises
    ------
    AssertionError
        If x is not exactly of type int
    ValueError
        If remove_from is not 'start' or 'end'. This is checked before the
        input is read, so it is raised even for an input without records.

    Notes
    -----
    - Record IDs and descriptions are left untouched
    - If remove_from='start': sequence[x:] is retained
    - If remove_from='end': sequence[:-x] is retained; with x == 0 the
      sequence is kept unchanged
    - There is no check that x is smaller than the sequence length
    """
    assert type(x) is int

    # Validate once, up front, instead of per record inside the loop.
    if remove_from not in ('start', 'end'):
        raise ValueError("remove_from must be either 'start' or 'end'")

    records = []

    for record in SeqIO.parse(input_file, 'fasta'):
        if remove_from == 'start':
            record.seq = record.seq[x:]
        elif x != 0:
            # seq[:-0] equals seq[:0] (empty), so only slice for non-zero x.
            record.seq = record.seq[:-x]
        records.append(record)

    SeqIO.write(records, output_file, 'fasta')
534
+
535
+
536
def build_extended_fasta(
    org_fasta: str, linear_elements: list[str], replicon_list: list, output_handle: str
) -> None:
    """
    Reconstruct multi-FASTA with extended linear contigs in original order.

    Walks the original FASTA in order. Each contig listed in
    linear_elements is replaced by the record read from the matching
    replicon's trim_out file; every other contig is copied unchanged.

    Parameters
    ----------
    org_fasta : str
        Path to original input FASTA file
    linear_elements : list of str
        Contig IDs that should be replaced by their extended version
    replicon_list : list
        Objects exposing `name` (compared against record.id) and `trim_out`
        (path to a single-record FASTA file with the extended sequence)
    output_handle : str
        Path for output FASTA file with extended contigs

    Returns
    -------
    None
        Writes reconstructed FASTA to output_handle

    Notes
    -----
    - Output order follows the record order of org_fasta
    - The description of each replaced record is set to '[linear]'
    - If several replicons share a name, the first one in replicon_list is
      used, so each contig is written exactly once
    - A contig listed in linear_elements without a matching replicon is
      kept in its original form and a warning is logged, so that no contig
      silently disappears from the output
    """
    linear_ids = set(linear_elements)  # constant-time membership per record

    # Index replicons by name; setdefault keeps the first entry per name.
    replicon_by_name = {}
    for replicon in replicon_list:
        replicon_by_name.setdefault(replicon.name, replicon)

    seq_rec_list = []  # records to write, in original order

    for record in SeqIO.parse(org_fasta, 'fasta'):
        if record.id not in linear_ids:
            seq_rec_list.append(record)
            continue

        replicon = replicon_by_name.get(record.id)
        if replicon is None:
            logging.warning(
                'No extended sequence found for linear contig %s; keeping the original.',
                record.id,
            )
            seq_rec_list.append(record)
            continue

        telomore_rec = SeqIO.read(replicon.trim_out, format='fasta')
        telomore_rec.description = '[linear]'
        seq_rec_list.append(telomore_rec)

    SeqIO.write(sequences=seq_rec_list, handle=output_handle, format='fasta')
589
+
590
+
591
if __name__ == '__main__':
    # Manual check against two hard-coded FASTQ paths in a temporary
    # directory; the boolean result is discarded, only the printed
    # message (if any) is visible.
    reads_1 = '/tmp/tmpnrb8ke64/all_terminal_reads_1.fastq'
    reads_2 = '/tmp/tmpnrb8ke64/all_terminal_reads_2.fastq'
    check_fastq_order(reads_1, reads_2)