align-trim 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
align_trim/__init__.py ADDED
File without changes
align_trim/main.py ADDED
@@ -0,0 +1,1065 @@
1
+ import argparse
2
+ import csv
3
+ import itertools
4
+ import sys
5
+ from collections import defaultdict
6
+ from copy import copy
7
+ from importlib.metadata import version
8
+ from pathlib import Path
9
+ from typing import Optional, Union
10
+
11
+ import numpy as np
12
+ import pysam
13
+ from primalbedtools.amplicons import Amplicon, create_amplicons
14
+ from primalbedtools.bedfiles import BedLine, merge_primers
15
+ from primalbedtools.scheme import Scheme
16
+
17
# Both tables are indexed by the numeric BAM CIGAR operation code:
#   0=M  1=I  2=D  3=N  4=S  5=H  6=P  7==  8=X
# Op 8 (X, sequence mismatch) consumes both query and reference; it must be
# present or indexing with it raises IndexError.

# consumesReference lookup for if a CIGAR operation consumes the reference sequence
consumesReference = [True, False, True, True, False, False, False, True, True]

# consumesQuery lookup for if a CIGAR operation consumes the query sequence
consumesQuery = [True, True, False, False, True, False, False, True, True]
22
+
23
+
24
def find_primer_with_lookup(lookup, pos, direction, chrom) -> Optional[BedLine]:
    """Pick a primer for ``pos`` from the amplicons stored in the lookup table.

    ``lookup[chrom]`` is indexed as ``[:, pos]``, giving one entry per row
    (either an amplicon object or None). Among the non-None entries the
    amplicon whose coverage edge is nearest to ``pos`` wins; ties go to the
    earliest row.

    Parameters
    ----------
    lookup : dict
        Mapping of chromosome name to a 2D array of amplicons/None
    pos : int
        Reference position to look up
    direction : string
        "+" compares against ``coverage_start`` and returns ``left[0]``;
        "-" compares against ``coverage_end`` and returns ``right[0]``
    chrom : string
        Key into ``lookup``

    Returns
    -------
    The selected primer, or None when no amplicon is stored at ``pos`` or
    ``direction`` is neither "+" nor "-".
    """
    # Drop the empty rows up front; what remains are the amplicons at this position
    candidates = [amp for amp in lookup[chrom][:, pos] if amp is not None]

    if direction == "+":
        nearest = min(
            candidates,
            key=lambda amp: abs(amp.coverage_start - pos),
            default=None,
        )
        return None if nearest is None else nearest.left[0]

    if direction == "-":
        nearest = min(
            candidates,
            key=lambda amp: abs(amp.coverage_end - pos),
            default=None,
        )
        return None if nearest is None else nearest.right[0]

    # Unknown direction: nothing is selected
    return None
48
+
49
+
50
def find_primer(primers: list[BedLine], pos, direction, chrom, threshold=35):
    """Find the primer whose start (or end) is nearest to a reference position.

    Parameters
    ----------
    primers : list
        Primer records exposing ``start``, ``end`` and ``chrom``
    pos : int
        The position in the reference sequence to start from
    direction : string
        "+" measures against each primer's ``start``; any other value
        measures against each primer's ``end``
    chrom : string
        Only primers whose ``chrom`` equals this value are considered
    threshold : int
        For "+", primers with ``start - threshold`` greater than ``pos`` are
        ignored; otherwise primers with ``end + threshold`` less than ``pos``
        are ignored

    Returns
    -------
    tuple[int, int, BedLine] | bool
        (absolute distance, signed offset from ``pos``, primer) for the
        nearest primer - the first one wins on ties - or False if no primer
        qualifies
    """
    forward = direction == "+"
    best = False

    for bl in primers:
        if forward:
            anchor = bl.start
            within_reach = pos >= (anchor - threshold)
        else:
            anchor = bl.end
            within_reach = pos <= (anchor + threshold)

        if not within_reach or chrom != bl.chrom:
            continue

        offset = anchor - pos
        # strict comparison keeps the earliest primer when distances tie
        if best is False or abs(offset) < best[0]:
            best = (abs(offset), offset, bl)

    return best
92
+
93
+
94
def trim(segment, primer_pos, end, verbose=False):
    """Soft mask an alignment to fit within primer start/end sites.

    CIGAR operations are consumed from the start (or end) of the alignment
    until the running reference position reaches ``primer_pos`` on an
    alignment-match operation (M, = or X). The consumed query bases are
    replaced by a single soft clip, with any overshoot given back as an M
    operation.

    The segment is only modified once the rebuilt CIGAR has been validated:
    the new position and the new CIGAR are written together, or not at all.
    If the CIGAR runs out before the primer position is reached, or the
    rebuilt CIGAR has a non-positive length at either end, the segment is
    left exactly as it was.

    Relies on the module-level ``consumesReference`` / ``consumesQuery``
    tables, indexed by CIGAR operation code.

    Parameters
    ----------
    segment : pysam.AlignedSegment
        The aligned segment to mask
    primer_pos : int
        The position in the reference to soft mask up to (equates to the start/end position of the primer in the reference)
    end : bool
        If True, the segment is being masked from the end (i.e. for the reverse primer)
    verbose : bool
        If True, will print soft masking info during trimming
    """

    def log(message):
        if verbose:
            print(f"{segment.query_name}: {message}", file=sys.stderr)

    # M, = and X all represent aligned bases, so any of them may end the chomp
    match_ops = (0, 7, 8)

    log(
        f"Trimming {'end' if end else 'start'} of read to primer position {primer_pos}"
    )

    # work on a copy so the segment is untouched until the result is known to be valid
    cigar = list(segment.cigartuples)

    # running reference position (depends on if start or end of the segment is being processed)
    pos = segment.reference_end if end else segment.pos

    # number of query bases removed so far
    eaten = 0
    while True:
        if not cigar:
            log(
                "Ran out of cigar during soft masking - completely masked read will be ignored"
            )
            # nothing sensible can be built; leave position and CIGAR unchanged
            return

        # chomp CIGAR operations from the start/end of the CIGAR
        flag, length = cigar.pop() if end else cigar.pop(0)
        log(f"Chomped a {flag}, {length}")

        if consumesReference[flag]:
            pos += -length if end else length
        if consumesQuery[flag]:
            eaten += length

        # stop once we've gone far enough to mask the primer
        reached = pos <= primer_pos if end else pos >= primer_pos
        if reached and flag in match_ops:
            break

    # the last chomped match may overshoot the primer; give the surplus back as M
    extra = abs(pos - primer_pos)
    log(f"extra {extra}")
    if extra:
        log(f"Inserted a 0, {extra}")
        if end:
            cigar.append((0, extra))
        else:
            cigar.insert(0, (0, extra))
        eaten -= extra

    new_pos = None
    if not end:
        # new leftmost mapping base, committed only after validation below
        new_pos = pos - extra
        log(f"New pos - {new_pos}")

        # if proposed softmask leads straight into a deletion, shuffle leftmost mapping base along and ignore the deletion
        if cigar and cigar[0][0] == 2:
            log(
                "softmask created a leading deletion in the CIGAR, shuffling the alignment"
            )
            while cigar and cigar[0][0] == 2:
                _, length = cigar.pop(0)
                new_pos += length

        # now add the leading softmask
        cigar.insert(0, (4, eaten))
    else:
        # softmask the right primer
        cigar.append((4, eaten))

    # reject CIGARs whose first or last operation has a non-positive length
    if cigar[0][1] <= 0 or cigar[-1][1] <= 0:
        log("invalid cigar operation created - possibly due to INDEL in primer")
        return

    # commit position and CIGAR together
    if new_pos is not None:
        segment.pos = new_pos
    segment.cigartuples = cigar
    return
218
+
219
+
220
def handle_segments(
    segment: Union[
        pysam.AlignedSegment, tuple[pysam.AlignedSegment, pysam.AlignedSegment]
    ],
    lookup: dict,
    args: argparse.Namespace,
    min_mapq: int,
    outfile_writer: pysam.AlignmentFile,
    amp_depths: dict,
    report_writer: csv.DictWriter = False,  # type: ignore
):
    """Handle the alignment segment(s) including filtering, soft masking, and reporting.

    A single segment and a pair go through the same steps; a single segment
    is simply treated as a one-element group.

    Args:
        segment (pysam.AlignedSegment | tuple): The alignment segment to process, can be a single segment or a tuple of paired segments
        lookup (dict): Primer lookup passed through to find_primer_with_lookup
        args (argparse.Namespace): The command line arguments; this function reads verbose, no_read_groups, report, allow_incorrect_pairs, no_trim_primers, require_full_length and normalise
        min_mapq (int): Segments with a mapping quality below this are skipped
        outfile_writer (pysam.AlignmentFile): Receives the segment(s) directly when args.normalise is falsy
        amp_depths (dict): chrom -> amplicon number -> depth array, incremented in place when args.normalise is falsy
        report_writer (csv.DictWriter): Only used when args.report is truthy

    Returns:
        tuple | bool: False if the segment/pair is skipped. Otherwise
        (amplicon number, False) when the segment(s) were already written to
        outfile_writer, or (amplicon number, segment) - with ``segment`` being
        the argument as passed in - when args.normalise is truthy and the
        caller decides whether to write.
    """
    paired = isinstance(segment, tuple)
    if paired:
        segment1, segment2 = segment
        if not segment1 or not segment2:
            present = segment1 if segment1 else segment2
            if args.verbose:
                print(
                    f"{present.query_name}: Pair skipped as at least one segment in pair does not exist",
                    file=sys.stderr,
                )
            return False
        segments = (segment1, segment2)
    else:
        segments = (segment,)

    # log lines about the read/pair as a whole use the first segment's name
    name = segments[0].query_name

    def skip(reason):
        # report (verbose only) why the read/pair is dropped, then signal the skip
        if args.verbose:
            print(f"{name}: {reason}", file=sys.stderr)
        return False

    def left_and_right():
        # Order the group by reference start. Re-evaluated after trimming
        # because trimming moves reference_start.
        first, last = segments[0], segments[-1]
        if paired and not first.reference_start < last.reference_start:
            return last, first
        return first, last

    # filter out unmapped, supplementary, low quality and coordinate-less segments
    if any(seg.is_unmapped for seg in segments):
        return skip("skipped as unmapped")
    if any(seg.is_supplementary for seg in segments):
        return skip("skipped as supplementary")
    if any(seg.mapping_quality < min_mapq for seg in segments):
        return skip("skipped as mapping quality below threshold")
    if any(seg.reference_end is None for seg in segments):
        return skip("skipped as no mapping data")

    # locate the primers: leftmost start for the forward primer, the other end for the reverse primer
    left, right = left_and_right()
    p1 = find_primer_with_lookup(
        lookup=lookup,
        pos=left.reference_start,
        direction="+",
        chrom=left.reference_name,
    )
    p2 = find_primer_with_lookup(
        lookup=lookup,
        pos=right.reference_end,
        direction="-",
        chrom=right.reference_name,
    )
    if not p1 or not p2:
        return skip("skipped as no primer found for segment")

    # check if primers are correctly paired and then assign read group
    correctly_paired = p1.amplicon_number == p2.amplicon_number
    if not args.no_read_groups:
        # a single segment is tagged from p1; in a pair the second segment is tagged from p2
        for seg, primer in zip(segments, (p1, p2)):
            seg.set_tag("RG", str(primer.pool) if correctly_paired else "unmatched")

    # get the amplicon number
    amplicon = p1.amplicon_number

    if args.report:
        # update the report with this alignment segment + primer details
        report_segment = segments[0]
        report_writer.writerow(
            {
                "chrom": report_segment.reference_name,
                "QueryName": report_segment.query_name,
                "ReferenceStart": report_segment.reference_start,
                "ReferenceEnd": report_segment.reference_end,
                "PrimerPair": f"{p1.primername}_{p2.primername}",
                "Primer1": p1.primername,
                "Primer1Start": p1.start,
                "Primer2": p2.primername,
                "Primer2Start": p2.start,
                "IsSecondary": report_segment.is_secondary,
                "IsSupplementary": report_segment.is_supplementary,
                "Start": p1.start,
                "End": p2.end,
                "CorrectlyPaired": correctly_paired,
            }
        )

    if not args.allow_incorrect_pairs and not correctly_paired:
        return skip("skipped as not correctly paired")

    # get the primer positions
    if not args.no_trim_primers:
        p1_position = p1.end
        p2_position = p2.start
    else:
        p1_position = p1.start
        p2_position = p2.end

    # the two code paths historically differed only in the capitalisation of this message
    problem = "Problem" if paired else "problem"

    for seg in segments:
        # softmask the alignment if left primer start/end inside alignment
        if seg.reference_start < p1_position:
            try:
                trim(seg, p1_position, False, args.verbose)
            except Exception as e:
                # best effort: a read that cannot be masked is dropped, not fatal
                print(
                    f"{seg.query_name}: {problem} soft masking left primer (error: {e}), skipping",
                    file=sys.stderr,
                )
                return False
            else:
                if args.verbose:
                    print(
                        f"{seg.query_name}: ref start {seg.reference_start} >= primer_position {p1_position}",
                        file=sys.stderr,
                    )

        # softmask the alignment if right primer start/end inside alignment
        if seg.reference_end > p2_position:  # type: ignore
            try:
                trim(seg, p2_position, True, args.verbose)
            except Exception as e:
                print(
                    f"{seg.query_name}: {problem} soft masking right primer (error: {e}), skipping",
                    file=sys.stderr,
                )
                return False
            else:
                if args.verbose:
                    print(
                        f"{seg.query_name}: ref_end {seg.reference_end} <= primer_position {p2_position}",
                        file=sys.stderr,
                    )

    # check the alignment still contains bases matching the reference
    if any("M" not in seg.cigarstring for seg in segments):  # type: ignore
        if paired:
            return skip(
                "Paired segment dropped as does not match reference post masking"
            )
        return skip("dropped as does not match reference post masking")

    # Check require-full-length
    if args.require_full_length:
        left, right = left_and_right()
        if left.reference_start > p1.end or right.reference_end < p2.start:  # type: ignore
            start_part = f"ref_start {left.reference_start} > p1.end {p1.end}"
            end_part = f"ref_end {right.reference_end} < p2.start {p2.start}"
            if left is segments[0]:
                detail = f"{start_part} or {end_part}"
            else:
                detail = f"{end_part} or {start_part}"
            return skip(f"{detail}, does not span a full amplicon, skipping")

    # When normalising, the caller decides whether the segment(s) are written
    if args.normalise:
        return (amplicon, segment)

    # Otherwise write straight away and add the coverage to the amplicon depth array
    for seg in segments:
        outfile_writer.write(seg)

    for seg in segments:
        # clamp both ends at 0: a negative end would be read as "from the
        # right" by the slice and inflate depth across the amplicon
        relative_start = max(seg.reference_start - p1.start, 0)
        relative_end = max(seg.reference_end - p1.start, 0)  # type: ignore
        amp_depths[segments[0].reference_name][amplicon][
            relative_start:relative_end
        ] += 1

    return (amplicon, False)
596
+
597
+
598
def read_pair_generator(bam, region_string=None):
    """
    Generate read pairs from an iterable of alignment records.

    Records are held in read_dict, keyed by query name, until a second record
    with the same name arrives; the two are then yielded as (read1, read2)
    and forgotten.

    Records that are not flagged as a proper pair are skipped. Secondary and
    supplementary records are skipped as well: they carry the same query name
    as the primary alignment, so letting them through would pair them with
    the wrong record and strand the real mate.

    ``region_string`` is accepted for interface compatibility but is not used;
    ``bam`` is iterated as given.
    """
    read_dict = defaultdict(lambda: [None, None])
    for read in bam:
        if not read.is_proper_pair or read.is_secondary or read.is_supplementary:
            continue
        qname = read.query_name
        if qname not in read_dict:
            # first record seen for this name: park it in its mate slot
            read_dict[qname][0 if read.is_read1 else 1] = read
            continue
        # second record: emit in (read1, read2) order and release the entry
        if read.is_read1:
            yield read, read_dict[qname][1]
        else:
            yield read_dict[qname][0], read
        del read_dict[qname]
619
+
620
+
621
def create_primer_lookup(ref_len_tuple, amplicons: "list[Amplicon]", padding=35):
    """
    Create a lookup table for efficient primer position queries across reference genomes.

    Each chromosome gets its own 2D lookup array where:
    - Rows hold amplicons whose padded spans do not overlap one another
    - Columns represent genomic positions
    - Values are Amplicon objects or None

    Each amplicon is written into exactly one row: the first row in which its
    padded span is completely free. A new row is appended only when every
    existing row is occupied somewhere in that span, which keeps the number
    of rows as small as this first-fit placement allows.

    Parameters
    ----------
    ref_len_tuple : list[tuple[str, int]]
        List of tuples containing (chromosome_name, chromosome_length) pairs
        from the reference genome
    amplicons : list[Amplicon]
        Amplicon objects; ``chrom``, ``amplicon_start`` and ``amplicon_end``
        are read from each
    padding : int, optional
        Number of bases to extend amplicon boundaries on both sides to allow
        for fuzzy matching of reads with barcodes/adapters (default: 35)

    Returns
    -------
    dict[str, np.ndarray]
        Dictionary mapping chromosome names to 2D object arrays of shape
        (N, chrom_len + 1). Array elements are either Amplicon objects or None.
    """
    lookups = {}
    for chrom, chromlen in ref_len_tuple:
        lookup_array = np.full((1, chromlen + 1), None, dtype=object)
        for amp in amplicons:
            if amp.chrom != chrom:
                continue

            # padded span of the amplicon, clipped to the chromosome
            span = slice(
                max(amp.amplicon_start - padding, 0),
                min(amp.amplicon_end + padding, chromlen),
            )

            # rows that already hold something inside the span (elementwise comparison)
            occupied = (lookup_array[:, span] != None).any(axis=1)  # noqa: E711
            free_rows = np.flatnonzero(~occupied)

            if free_rows.size:
                # first fit: use only the first free row
                lookup_array[free_rows[0], span] = amp
            else:
                # every row clashes: append a fresh row holding just this amplicon
                new_row = np.full((1, chromlen + 1), None, dtype=object)
                new_row[0, span] = amp
                lookup_array = np.vstack((lookup_array, new_row))

        lookups[chrom] = lookup_array
    return lookups
690
+
691
+
692
def _open_output(output, header):
    """Open the alignment output: SAM on stdout when ``output`` is unset or '-', otherwise a .bam/.sam file."""
    # ``output`` may be a Path, which never compares equal to the string "-"
    if not output or str(output) == "-":
        return pysam.AlignmentFile("-", "wh", header=header)

    path = str(output)
    if path.endswith(".bam"):
        mode = "wb"
    elif path.endswith(".sam"):
        mode = "wh"
    else:
        print(
            "Output file path must end with either .bam or .sam, exiting.",
            file=sys.stderr,
        )
        sys.exit(1)
    return pysam.AlignmentFile(path, mode, header=header)


def go(args):
    """Filter and soft mask an alignment file so that the alignment boundaries match the primer start and end sites.

    Based on the most likely primer position, based on the alignment coordinates.

    The input is treated as paired-end when its first record is flagged as
    paired. Each record (or pair) is passed to handle_segments; when
    ``args.normalise`` is non-zero a record is only written if adding it moves
    its amplicon's depth profile closer to the requested depth.

    Exits with status 1 on a negative ``normalise``, an empty input, or an
    output path that does not end in .bam or .sam. Any file opened here is
    closed on the way out, including on those early exits.
    """
    # guard for negative normalise
    if args.normalise is not None and args.normalise < 0:
        print("normalise must be >= 0, exiting.", file=sys.stderr)
        sys.exit(1)

    # tracked up front so the finally clause can close whatever was opened
    reportfh = infile = outfile = None
    report_writer = False  # handle_segments only touches it when args.report is set

    try:
        # prepare the report outfile
        if args.report:
            reportfh = open(args.report, "w")
            report_writer = csv.DictWriter(
                reportfh,
                fieldnames=[
                    "chrom",
                    "QueryName",
                    "ReferenceStart",
                    "ReferenceEnd",
                    "PrimerPair",
                    "Primer1",
                    "Primer1Start",
                    "Primer2",
                    "Primer2Start",
                    "IsSecondary",
                    "IsSupplementary",
                    "Start",
                    "End",
                    "CorrectlyPaired",
                ],
                delimiter="\t",
            )
            report_writer.writeheader()

        # open the primer scheme, merge the primers and build the amplicons
        scheme = Scheme.from_file(args.bedfile)
        scheme.bedlines = merge_primers(scheme.bedlines)

        amplicon_list = create_amplicons(scheme.bedlines)
        amplicons = {}
        for amplicon in amplicon_list:
            amplicon.length = amplicon.amplicon_end - amplicon.amplicon_start  # type: ignore
            amplicons.setdefault(amplicon.chrom, {})[amplicon.amplicon_number] = amplicon

        # read group IDs: one per pool plus a bucket for mismatched primer pairs
        pools_str = {str(bl.pool) for bl in scheme.bedlines}
        pools_str.add("unmatched")

        # open the input samfile (stdin when no path, or '-', is given)
        infile = pysam.AlignmentFile(args.samfile or "-", "rb")

        first_segment = next(infile, None)
        if not first_segment:
            print("No segments found in the input file, exiting.", file=sys.stderr)
            sys.exit(1)

        # check if the first segment is paired, then chain the saved first segment with the infile iterator so nothing is lost
        paired = first_segment.is_paired
        chained_iterator = itertools.chain([first_segment], infile)

        bam_header = infile.header.copy().to_dict()
        if not args.no_read_groups:
            # sorted because set order can be non deterministic
            bam_header["RG"] = [{"ID": pool} for pool in sorted(pools_str)]

        # the input header is not guaranteed to contain any @PG line
        bam_header.setdefault("PG", []).append(
            {
                "PN": "align_trim",
                "ID": "align_trim",
                "VN": version("align_trim"),
                "CL": " ".join(sys.argv),
            }
        )

        # prepare the alignment outfile
        outfile = _open_output(args.output, bam_header)

        # Initialise the amplicon depth dict
        amp_depths = {}
        for amp in amplicon_list:
            amp_depths.setdefault(amp.chrom, {}).setdefault(
                amp.amplicon_number,
                np.zeros(amp.length, dtype=int),  # type: ignore
            )

        # Create a lookup table for primer location
        ref_lengths = [(r, infile.get_reference_length(r)) for r in infile.references]
        primer_lookup = create_primer_lookup(
            ref_len_tuple=ref_lengths,
            amplicons=amplicon_list,
            padding=args.primer_match_threshold,
        )

        # Per-amplicon normalisation state: depth of the accepted reads and
        # its mean absolute distance from the target depth
        norm_state = {}
        if args.normalise:
            for amp in amplicon_list:
                norm_state[(amp.chrom, amp.amplicon_number)] = {
                    "depth": np.zeros(amp.length, dtype=int),  # type: ignore
                    "distance": float(args.normalise),
                }

        records = read_pair_generator(chained_iterator) if paired else chained_iterator

        for record in records:
            trimming_tuple = handle_segments(
                segment=record,  # type: ignore
                lookup=primer_lookup,
                args=args,
                min_mapq=args.min_mapq,
                outfile_writer=outfile,
                amp_depths=amp_depths,
                report_writer=report_writer,  # type: ignore
            )
            if not trimming_tuple:
                continue

            # unpack the trimming tuple since segment passed trimming
            amplicon, trimmed = trimming_tuple

            # If we aren't normalising the segments will have already been written to the outfile
            if not (args.normalise and trimmed):
                continue

            trimmed_segments = trimmed if paired else (trimmed,)
            chrom = trimmed_segments[0].reference_name  # type: ignore
            state = norm_state[(chrom, amplicon)]
            p_start = amplicons[chrom][amplicon].amplicon_start

            # depth profile as it would look if this read/pair were accepted
            test_depths = np.copy(state["depth"])
            for seg in trimmed_segments:  # type: ignore
                # clamp both ends at 0 so a negative end is not read as "from the right"
                relative_start = max(0, seg.reference_start - p_start)
                relative_end = max(0, seg.reference_end - p_start)
                test_depths[relative_start:relative_end] += 1

            # accept only if the profile moves closer to the target depth
            test_distance = np.mean(np.abs(test_depths - args.normalise))
            if test_distance < state["distance"]:
                state["depth"] = test_depths
                state["distance"] = test_distance
                for seg in trimmed_segments:  # type: ignore
                    outfile.write(seg)

        # mean depth per amplicon, from whichever depth arrays were maintained
        if args.normalise:
            mean_amp_depths = {k: np.mean(v["depth"]) for k, v in norm_state.items()}
        else:
            mean_amp_depths = {}
            for chrom, chrom_amps in amp_depths.items():
                for amplicon, depths in chrom_amps.items():
                    mean_amp_depths[(chrom, amplicon)] = np.mean(depths)

        # write mean amplicon depths to file
        if args.amp_depth_report:
            with open(args.amp_depth_report, "w") as amp_depth_report_fh:
                writer = csv.DictWriter(
                    amp_depth_report_fh,
                    fieldnames=["chrom", "amplicon", "mean_depth"],
                    delimiter="\t",
                )
                writer.writeheader()
                for (chrom, amplicon), depth in mean_amp_depths.items():
                    writer.writerow(
                        {"chrom": chrom, "amplicon": amplicon, "mean_depth": depth}
                    )
    finally:
        # close up the file handles
        for handle in (infile, outfile, reportfh):
            if handle is not None:
                handle.close()
974
+
975
+
976
def main():
    """Command line entry point: parse the arguments and hand them to go()."""
    cli = argparse.ArgumentParser(
        description="Trim alignments from an amplicon scheme. Bam (input) can be provided by --samfile or stdin"
    )

    # positional scheme file
    cli.add_argument(
        "bedfile",
        metavar="BEDFILE",
        type=Path,
        help="BED file containing the amplicon scheme",
    )

    # input alignments
    cli.add_argument(
        "--samfile",
        "-s",
        required=False,
        help="Sorted SAM/BAM file containing the aligned reads, if this is not provided (or '-') then 'align_trim' will read from stdin.",
    )

    # numeric tuning options
    cli.add_argument(
        "--normalise",
        "-n",
        default=0,
        type=int,
        help="Subsample to N coverage per amplicon. Use 0 for no normalisation. (default: %(default)s)",
    )
    cli.add_argument(
        "--min-mapq",
        "-m",
        default=20,
        type=int,
        help="Minimum mapping quality to keep an aligned read (default: %(default)s)",
    )
    cli.add_argument(
        "--primer-match-threshold",
        "-p",
        default=35,
        type=int,
        help="Add -p bases of padding to the outside (5' end of primer) of primer coordinates to allow fuzzy matching for reads with barcodes/adapters. (default: %(default)s)",
    )

    # optional TSV reports
    cli.add_argument("--report", "-r", type=Path, help="Output report TSV to filepath")
    cli.add_argument(
        "--amp-depth-report",
        "-a",
        type=Path,
        help="Output amplicon depth TSV to filepath",
    )

    # behaviour switches
    cli.add_argument(
        "--no-trim-primers",
        action="store_true",
        help="Do not trim primers from reads",
    )
    cli.add_argument(
        "--no-read-groups",
        action="store_true",
        dest="no_read_groups",
        help="Do not divide reads into groups in samfile output",
    )
    cli.add_argument(
        "--allow-incorrect-pairs",
        action="store_true",
        help="Allow reads to be assigned to amplicons even if the primers are not correctly paired, i.e. primer1 and primer2 are not from the same amplicon.",
    )
    cli.add_argument(
        "--require-full-length",
        action="store_true",
        help="Requires all reads to start and stop in a primer site, do not use this option if you are using rapid barcoding since the reads will not be full length.",
    )

    # output alignments
    cli.add_argument(
        "--output",
        "-o",
        metavar="OUTPUT",
        type=Path,
        default=None,
        help="Location to write the output samfile to, the output type will be determined by the file extension. If no <OUTPUT> or '-' provided, will write plaintext samfile to stdout",
    )

    cli.add_argument("--verbose", "-v", action="store_true", help="Debug mode")
    cli.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {version('align_trim')}",
        help="Show the version of align_trim",
    )

    go(cli.parse_args())
1062
+
1063
+
1064
# Run the command line interface when this module is executed directly
if __name__ == "__main__":
    main()
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: align_trim
3
+ Version: 1.1.0
4
+ Summary: Soft-clip primer sites for SAM/BAM files generated from amplicon sequencing runs
5
+ Project-URL: Repository, https://github.com/artic-network/align_trim.git
6
+ Project-URL: Issues, https://github.com/artic-network/align_trim/issues
7
+ Author-email: Nick Loman <n.j.loman@bham.ac.uk>, Sam Wilkinson <s.a.j.wilkinson@bham.ac.uk>, Chris Kent <c.g.kent@bham.ac.uk>
8
+ Maintainer-email: Sam Wilkinson <s.a.j.wilkinson@bham.ac.uk>, Chris Kent <c.g.kent@bham.ac.uk>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Requires-Python: >=3.9
12
+ Requires-Dist: numpy
13
+ Requires-Dist: primalbedtools>=0.10.1
14
+ Requires-Dist: pysam
15
+ Description-Content-Type: text/markdown
16
+
17
+ # align_trim
18
+
19
+ Standalone version of ARTIC's fieldbioinformatics align_trim.py
20
+
21
+ ## Installation
22
+
23
+ From conda
24
+ ```bash
25
+ conda install bioconda::align_trim
26
+ ```
27
+ From PyPI
28
+ ```bash
29
+ pip install align_trim
30
+ ```
31
+ From source
32
+ ```bash
33
+ git clone https://github.com/artic-network/align_trim.git
34
+ cd align_trim
35
+ uv sync
36
+ uv run align_trim --help
37
+ ```
38
+
39
+ ## Command Line Interface
40
+
41
+ ### Basic Usage
42
+
43
+ ```bash
44
+ align_trim [OPTIONS] BEDFILE
45
+ ```
46
+
47
+ The tool reads alignment data from either a SAM/BAM file or stdin and outputs trimmed alignments to stdout in SAM format by default.
48
+
49
+ ### Required Arguments
50
+
51
+ - `BEDFILE`: BED file containing the amplicon primer scheme in [v3](https://doi.org/10.5281/zenodo.16366659) format.
52
+
53
+ ### Optional Arguments
54
+
55
+ #### Input/Output Options
56
+
57
+ - `--samfile`, `-s` : Sorted SAM/BAM file containing the aligned reads. If this is not provided (or is '-'), 'align_trim' will read from stdin.
58
+ - `--output`, `-o` : Output file path. Format determined by extension (.sam/.bam). If not provided or '-', writes SAM to stdout
59
+
60
+ #### Processing Options
61
+
62
+ - `--normalise`, `-n` : Normalise to target depth N per amplicon using a greedy per-read algorithm. Each read is kept only if it brings the amplicon depth closer to the target. Use 0 for no normalisation (default: 0)
63
+ - `--min-mapq`, `-m` : Minimum mapping quality to keep an aligned read (default: 20)
64
+ - `--primer-match-threshold`, `-p` : Add this many bases of padding to the 5' end of primer coordinates to allow fuzzy matching for reads with barcodes/adapters (default: 35)
65
+
66
+ #### Primer and Read Handling
67
+
68
+ - `--no-trim-primers` : Do not trim primers from reads (by default, primers are trimmed)
69
+ - `--allow-incorrect-pairs` : Allow reads to be assigned to amplicons even if primers are not correctly paired
70
+ - `--require-full-length` : Require all reads to start and stop in primer sites (do not use with rapid barcoding)
71
+
72
+ #### Output and Reporting
73
+
74
+ - `--report`, `-r` : Output detailed report TSV to specified filepath
75
+ - `--amp-depth-report`, `-a` : Output mean depth for each amplicon as TSV to specified filepath
76
+ - `--no-read-groups` : Do not divide reads into pool-based read groups in SAM/BAM output
77
+
78
+ #### General Options
79
+
80
+ - `--verbose`, `-v` : Enable debug mode with detailed logging to stderr
81
+ - `--version` : Show version information
82
+ - `--help` : Show help message
83
+
84
+ ### Examples
85
+
86
+ #### Basic trimming with primer removal
87
+ ```bash
88
+ align_trim primers.bed --samfile input.bam --output trimmed.bam
89
+ ```
90
+
91
+ #### Normalise coverage and generate reports
92
+ ```bash
93
+ align_trim primers.bed --samfile input.bam --normalise 100 \
94
+ --report alignment_report.tsv --amp-depth-report depth_report.tsv \
95
+ --output normalized.bam
96
+ ```
97
+
98
+ #### Process from stdin with verbose output
99
+ ```bash
100
+ samtools view -h input.bam | align_trim primers.bed --verbose > trimmed.sam 2> verbose.out.txt
101
+ ```
102
+
103
+ #### Strict full-length read filtering
104
+ ```bash
105
+ align_trim primers.bed --samfile input.bam --require-full-length \
106
+ --min-mapq 30 --output filtered.bam
107
+ ```
108
+
109
+ #### Allow mismatched primer pairs with custom threshold
110
+ ```bash
111
+ align_trim primers.bed --samfile input.bam --allow-incorrect-pairs \
112
+ --primer-match-threshold 50 --output relaxed.bam
113
+ ```
114
+
115
+ ### Output Formats
116
+
117
+ The tool supports multiple output formats based on file extension:
118
+ - `.sam` - SAM format (text)
119
+ - `.bam` - BAM format (binary, compressed)
120
+ - No output path provided, or `-` - SAM format to stdout
121
+
122
+ ### Report Files
123
+
124
+ When using `--report`, a tab-separated file is generated with the following columns:
125
+ - `chrom`: Reference chromosome/contig
126
+ - `QueryName`: Read name
127
+ - `ReferenceStart`/`ReferenceEnd`: Alignment coordinates
128
+ - `PrimerPair`: Primer pair assignment
129
+ - `Primer1`/`Primer2`: Individual primer information
130
+ - `CorrectlyPaired`: Boolean indicating proper primer pairing
131
+ - Additional alignment metrics
132
+
133
+ The `--amp-depth-report` generates a summary of coverage depth per amplicon.
@@ -0,0 +1,7 @@
1
+ align_trim/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ align_trim/main.py,sha256=oatjvWloOPYLYff8CZ1Zq6lYhWLNCNgn2sg1uJDeqa0,39555
3
+ align_trim-1.1.0.dist-info/METADATA,sha256=A3o0cC-omcEqAxlIwrPEIJHY7jqSn1_iwtMryyk_YYU,4557
4
+ align_trim-1.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
5
+ align_trim-1.1.0.dist-info/entry_points.txt,sha256=ZUSdteDVarhMD6c3JcQQcpcsotuuzDZVPfiHdRATYN4,52
6
+ align_trim-1.1.0.dist-info/licenses/LICENSE,sha256=E-00BwKTV_FGdgp-4TbWXU1LRY8Mbxxj6Ab9a_SqctE,1099
7
+ align_trim-1.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ align_trim = align_trim.main:main
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2017-2018 Nick Loman & the ZiBRA Project & the ARTIC project
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.