align-trim 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
align_trim/__init__.py
ADDED
|
File without changes
|
align_trim/main.py
ADDED
|
@@ -0,0 +1,1065 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import csv
|
|
3
|
+
import itertools
|
|
4
|
+
import sys
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from copy import copy
|
|
7
|
+
from importlib.metadata import version
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pysam
|
|
13
|
+
from primalbedtools.amplicons import Amplicon, create_amplicons
|
|
14
|
+
from primalbedtools.bedfiles import BedLine, merge_primers
|
|
15
|
+
from primalbedtools.scheme import Scheme
|
|
16
|
+
|
|
17
|
+
# CIGAR lookup tables, indexed by the numeric CIGAR operation code.
# Codes 0-7 correspond to the operations M, I, D, N, S, H, P and "=".
# NOTE(review): both tables have exactly eight entries, so indexing them with
# operation code 8 ("X") raises IndexError - confirm that is intended.

# consumesReference: True when the operation advances along the reference sequence
consumesReference = [op in "MDN=" for op in "MIDNSHP="]

# consumesQuery: True when the operation advances along the query (read) sequence
consumesQuery = [op in "MIS=" for op in "MIDNSHP="]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def find_primer_with_lookup(lookup, pos, direction, chrom) -> Optional[BedLine]:
    """Return the primer of the amplicon whose coverage edge is closest to ``pos``.

    Parameters
    ----------
    lookup : dict
        Maps a chromosome name to a 2D array that is indexed as
        ``lookup[chrom][:, pos]``; each cell holds an amplicon object or None
    pos : int
        Reference position (column of the lookup array) to query
    direction : str
        "+" compares against ``coverage_start`` and returns ``left[0]``;
        "-" compares against ``coverage_end`` and returns ``right[0]``
    chrom : str
        Key selecting the lookup array

    Returns
    -------
    The selected primer, or None when no amplicon is stored at ``pos`` or when
    ``direction`` is neither "+" nor "-". When several amplicons are equally
    close, the one in the lowest row wins.
    """
    # One candidate per row of the lookup array (rows hold non-overlapping amplicons)
    candidates = lookup[chrom][:, pos]

    if direction == "+":
        edge_attr, primer_attr = "coverage_start", "left"
    elif direction == "-":
        edge_attr, primer_attr = "coverage_end", "right"
    else:
        return None

    best_primer = None
    best_gap = float("inf")
    for candidate in candidates:
        if candidate is None:
            continue
        gap = abs(getattr(candidate, edge_attr) - pos)
        # strict comparison keeps the first of several equally close amplicons
        if gap < best_gap:
            best_primer = getattr(candidate, primer_attr)[0]
            best_gap = gap
    return best_primer
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def find_primer(primers: list[BedLine], pos, direction, chrom, threshold=35):
    """Find the primer site nearest to a reference position by scanning all primers.

    Parameters
    ----------
    primers : list
        Primer records exposing ``start``, ``end`` and ``chrom`` attributes
    pos : int
        The position in the reference sequence to start from
    direction : string
        "+" measures against each primer's ``start``; any other value
        measures against each primer's ``end``
    chrom : str
        Only primers whose ``chrom`` equals this value are considered
    threshold : int
        Slack applied to the positional filter: for "+" a primer qualifies when
        ``pos >= start - threshold``, otherwise when ``pos <= end + threshold``

    Returns
    -------
    tuple[int, int, primer] | bool
        (absolute distance, signed offset of the primer edge relative to pos,
        primer) for the closest qualifying primer - the first one in list order
        on ties - or False if no primer qualifies
    """
    forward = direction == "+"
    candidates = []
    for primer in primers:
        if forward:
            anchor = primer.start
            reachable = pos >= (anchor - threshold)
        else:
            anchor = primer.end
            reachable = pos <= (anchor + threshold)

        if reachable and chrom == primer.chrom:
            offset = anchor - pos
            candidates.append((abs(offset), offset, primer))

    if not candidates:
        return False

    # min() returns the first minimal entry, so list order breaks ties
    return min(candidates, key=lambda entry: entry[0])
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def trim(segment, primer_pos, end, verbose=False):
    """Soft mask an alignment to fit within primer start/end sites.

    CIGAR operations are removed one at a time from the chosen end of the read
    until the reference position has reached ``primer_pos`` on a match (code 0)
    operation. Any overshoot is added back as a match, and the removed query
    bases are re-added as a single soft clip (code 4). The segment is modified
    in place; on the left side its ``reference_start`` is moved as well.

    Parameters
    ----------
    segment : pysam.AlignedSegment
        The aligned segment to mask
    primer_pos : int
        The position in the reference to soft mask up to (equates to the start/end position of the primer in the reference)
    end : bool
        If True, the segment is being masked from the end (i.e. for the reverse primer)
    verbose : bool
        If True, will print soft masking info during trimming

    Returns
    -------
    None
        If the resulting CIGAR would start or end with a non-positive length,
        the segment is left completely unchanged.

    Raises
    ------
    IndexError
        When the left-side CIGAR is exhausted before the primer position is
        reached, or when an operation code is outside the lookup tables.
    """
    if verbose:
        print(
            f"{segment.query_name}: Trimming {'end' if end else 'start'} of read to primer position {primer_pos}",
            file=sys.stderr,
        )
    # get a copy of the cigar tuples to work with
    cigar = copy(segment.cigartuples)

    # get the segment position in the reference (depends on if start or end of the segment is being processed)
    # (reference_start replaces the deprecated pysam alias `pos`)
    if not end:
        pos = segment.reference_start
    else:
        pos = segment.reference_end

    # process the CIGAR to determine how much softmasking is required
    eaten = 0
    while True:
        # chomp CIGAR operations from the start/end of the CIGAR
        try:
            if end:
                flag, length = cigar.pop()
            else:
                flag, length = cigar.pop(0)
            if verbose:
                print(
                    f"{segment.query_name}: Chomped a {flag}, {length}",
                    file=sys.stderr,
                )
        except IndexError:
            if verbose:
                print(
                    f"{segment.query_name}: Ran out of cigar during soft masking - completely masked read will be ignored",
                    file=sys.stderr,
                )
            break

        # if the CIGAR operation consumes the reference sequence, increment/decrement the position by the CIGAR operation length
        if consumesReference[flag]:
            if not end:
                pos += length
            else:
                pos -= length

        # if the CIGAR operation consumes the query sequence, increment the number of CIGAR operations eaten by the CIGAR operation length
        if consumesQuery[flag]:
            eaten += length

        # stop processing the CIGAR if we've gone far enough to mask the primer
        if not end and pos >= primer_pos and flag == 0:
            break
        if end and pos <= primer_pos and flag == 0:
            break

    # calculate how many extra matches are needed in the CIGAR
    extra = abs(pos - primer_pos)
    if verbose:
        print(f"{segment.query_name}: extra {extra}", file=sys.stderr)
    if extra:
        if verbose:
            print(
                f"{segment.query_name}: Inserted a 0, {extra}",
                file=sys.stderr,
            )
        if end:
            cigar.append((0, extra))
        else:
            cigar.insert(0, (0, extra))
        eaten -= extra

    # The new leftmost position is kept in a local and only written to the
    # segment once the new CIGAR has been validated, so a rejected trim cannot
    # leave the read with a shifted start but its old CIGAR.
    new_start = None

    # softmask the left primer
    if not end:
        # update the position of the leftmost mapping base
        new_start = pos - extra
        if verbose:
            print(
                f"{segment.query_name}: New pos - {new_start}",
                file=sys.stderr,
            )

        # if proposed softmask leads straight into a deletion, shuffle leftmost mapping base along and ignore the deletion
        if cigar[0][0] == 2:
            if verbose:
                print(
                    f"{segment.query_name}: softmask created a leading deletion in the CIGAR, shuffling the alignment",
                    file=sys.stderr,
                )
            while cigar[0][0] == 2:
                _, length = cigar.pop(0)
                new_start += length

        # now add the leading softmask
        cigar.insert(0, (4, eaten))

    # softmask the right primer
    else:
        cigar.append((4, eaten))

    # check the new CIGAR and replace the old one
    if cigar[0][1] <= 0 or cigar[-1][1] <= 0:
        if verbose:
            print(
                f"{segment.query_name}: invalid cigar operation created - possibly due to INDEL in primer",
                file=sys.stderr,
            )
        return

    # commit position and CIGAR together
    if new_start is not None:
        segment.reference_start = new_start
    segment.cigartuples = cigar
    return
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def handle_segments(
    segment: Union[
        pysam.AlignedSegment, tuple[pysam.AlignedSegment, pysam.AlignedSegment]
    ],
    lookup: dict,
    args: argparse.Namespace,
    min_mapq: int,
    outfile_writer: pysam.AlignmentFile,
    amp_depths: dict,
    report_writer: csv.DictWriter = False,  # type: ignore
):
    """Handle the alignment segment(s) including filtering, soft masking, and reporting.

    Args:
        segment (pysam.AlignedSegment | tuple): The alignment segment to process, can be a single segment or a tuple of paired segments
        lookup (dict): Per-chromosome primer lookup arrays, passed on to find_primer_with_lookup
        args (argparse.Namespace): The command line arguments; the attributes read here are verbose, no_read_groups, report, allow_incorrect_pairs, no_trim_primers, require_full_length and normalise
        min_mapq (int): Segments with a mapping quality below this value are skipped
        outfile_writer (pysam.AlignmentFile): Output file; segments are written to it directly when args.normalise is falsy
        amp_depths (dict): Depth arrays indexed as amp_depths[chrom][amplicon_number], incremented in place when args.normalise is falsy
        report_writer (csv.DictWriter): Writer receiving one row per processed read; only used when args.report is truthy

    Returns:
        tuple [int, pysam.AlignedSegment | tuple | bool] | bool: False if the segment is to be skipped; (amplicon number, False) if the segment(s) were already written to outfile_writer; otherwise (amplicon number, segment) where segment is the input as passed in (single segment or pair tuple)
    """
    paired = isinstance(segment, tuple)
    if paired:
        segment1, segment2 = segment
        if not segment1 or not segment2:
            segment = segment1 if segment1 else segment2
            if args.verbose:
                print(
                    f"{segment.query_name}: Pair skipped as at least one segment in pair does not exist",
                    file=sys.stderr,
                )
            return False

    # filter out unmapped and supplementary alignment segments
    if not paired:
        if segment.is_unmapped:
            if args.verbose:
                print(
                    f"{segment.query_name}: skipped as unmapped",
                    file=sys.stderr,
                )
            return False
    else:
        if segment1.is_unmapped or segment2.is_unmapped:
            if args.verbose:
                print(
                    f"{segment1.query_name}: skipped as unmapped",
                    file=sys.stderr,
                )
            return False

    if not paired:
        if segment.is_supplementary:
            if args.verbose:
                print(
                    f"{segment.query_name}: skipped as supplementary",
                    file=sys.stderr,
                )
            return False
    else:
        if segment1.is_supplementary or segment2.is_supplementary:
            if args.verbose:
                print(
                    f"{segment1.query_name}: skipped as supplementary",
                    file=sys.stderr,
                )
            return False

    # filter out segments below the mapping quality threshold
    if not paired:
        if segment.mapping_quality < min_mapq:
            if args.verbose:
                print(
                    f"{segment.query_name}: skipped as mapping quality below threshold",
                    file=sys.stderr,
                )
            return False
    else:
        if segment1.mapping_quality < min_mapq or segment2.mapping_quality < min_mapq:
            if args.verbose:
                print(
                    f"{segment1.query_name}: skipped as mapping quality below threshold",
                    file=sys.stderr,
                )
            return False

    # filter out segments without a reference end coordinate
    if not paired:
        if segment.reference_end is None:
            if args.verbose:
                print(
                    f"{segment.query_name}: skipped as no mapping data",
                    file=sys.stderr,
                )
            return False
    else:
        if segment1.reference_end is None or segment2.reference_end is None:
            if args.verbose:
                print(
                    f"{segment1.query_name}: skipped as no mapping data",
                    file=sys.stderr,
                )
            return False
    if not paired:
        # locate the nearest primers to this alignment segment
        p1 = find_primer_with_lookup(
            lookup=lookup,
            pos=segment.reference_start,
            direction="+",
            chrom=segment.reference_name,
        )

        p2 = find_primer_with_lookup(
            lookup=lookup,
            pos=segment.reference_end,
            direction="-",
            chrom=segment.reference_name,
        )
    else:
        # locate the nearest primers to this alignment segment pair
        if segment1.reference_start < segment2.reference_start:
            # if segment1 starts before segment2, then segment1 is the left segment relative to the reference
            p1 = find_primer_with_lookup(
                lookup=lookup,
                pos=segment1.reference_start,
                direction="+",
                chrom=segment1.reference_name,
            )
            p2 = find_primer_with_lookup(
                lookup=lookup,
                pos=segment2.reference_end,
                direction="-",
                chrom=segment2.reference_name,
            )
        else:
            # otherwise then segment2 is the left segment relative to the reference
            p1 = find_primer_with_lookup(
                lookup=lookup,
                pos=segment2.reference_start,
                direction="+",
                chrom=segment2.reference_name,
            )
            p2 = find_primer_with_lookup(
                lookup=lookup,
                pos=segment1.reference_end,
                direction="-",
                chrom=segment1.reference_name,
            )

    if not p1 or not p2:
        if paired:
            segment = segment1 if segment1 else segment2
        if args.verbose:
            print(
                f"{segment.query_name}: skipped as no primer found for segment",
                file=sys.stderr,
            )
        return False

    # check if primers are correctly paired and then assign read group
    correctly_paired = p1.amplicon_number == p2.amplicon_number

    if not paired:
        if not args.no_read_groups:
            if correctly_paired:
                segment.set_tag("RG", str(p1.pool))
            else:
                segment.set_tag("RG", "unmatched")
    else:
        if not args.no_read_groups:
            if correctly_paired:
                segment1.set_tag("RG", str(p1.pool))
                segment2.set_tag("RG", str(p2.pool))
            else:
                segment1.set_tag("RG", "unmatched")
                segment2.set_tag("RG", "unmatched")

    # get the amplicon number
    amplicon = p1.amplicon_number

    if args.report:
        # update the report with this alignment segment + primer details
        # (for a pair, the read-level columns describe segment1 only)
        report_segment = segment if not paired else segment1
        report = {
            "chrom": report_segment.reference_name,
            "QueryName": report_segment.query_name,
            "ReferenceStart": report_segment.reference_start,
            "ReferenceEnd": report_segment.reference_end,
            "PrimerPair": f"{p1.primername}_{p2.primername}",
            "Primer1": p1.primername,
            "Primer1Start": p1.start,
            "Primer2": p2.primername,
            "Primer2Start": p2.start,
            "IsSecondary": report_segment.is_secondary,
            "IsSupplementary": report_segment.is_supplementary,
            "Start": p1.start,
            "End": p2.end,
            "CorrectlyPaired": correctly_paired,
        }
        report_writer.writerow(report)

    if not args.allow_incorrect_pairs and not correctly_paired:
        segment = segment if not paired else segment1
        if args.verbose:
            print(
                f"{segment.query_name}: skipped as not correctly paired",
                file=sys.stderr,
            )
        return False

    # get the primer positions
    # (trimming primers masks up to the inner primer edges; otherwise only up to the outer edges)
    if not args.no_trim_primers:
        p1_position = p1.end
        p2_position = p2.start
    else:
        p1_position = p1.start
        p2_position = p2.end

    # softmask the alignment if left primer start/end inside alignment
    if not paired:
        if segment.reference_start < p1_position:
            try:
                trim(segment, p1_position, False, args.verbose)
                if args.verbose:
                    print(
                        f"{segment.query_name}: ref start {segment.reference_start} >= primer_position {p1_position}",
                        file=sys.stderr,
                    )
            except Exception as e:
                print(
                    f"{segment.query_name}: problem soft masking left primer (error: {e}), skipping",
                    file=sys.stderr,
                )
                return False

        # softmask the alignment if right primer start/end inside alignment
        if segment.reference_end > p2_position:  # type: ignore
            try:
                trim(segment, p2_position, True, args.verbose)
                if args.verbose:
                    # report the end coordinate here, matching the paired branch below
                    print(
                        f"{segment.query_name}: ref_end {segment.reference_end} >= primer_position {p2_position}",
                        file=sys.stderr,
                    )
            except Exception as e:
                print(
                    f"{segment.query_name}: problem soft masking right primer (error: {e}), skipping",
                    file=sys.stderr,
                )
                return False

        # check the the alignment still contains bases matching the reference
        if "M" not in segment.cigarstring:  # type: ignore
            if args.verbose:
                print(
                    f"{segment.query_name}: dropped as does not match reference post masking",
                    file=sys.stderr,
                )
            return False

        # Check require-full-length
        if args.require_full_length:
            if segment.reference_start > p1.end or segment.reference_end < p2.start:  # type: ignore
                if args.verbose:
                    print(
                        f"{segment.query_name}: ref_start {segment.reference_start} > p1.end {p1.end} or ref_end {segment.reference_end} < p2.start {p2.start}, does not span a full amplicon, skipping",
                        file=sys.stderr,
                    )
                return False

        # If not normalising, write the segment to the output file and add it to amplicon depth numpy array
        if not args.normalise:
            outfile_writer.write(segment)
            # depth coordinates are relative to the start of the left primer
            segment_amp_relative_start = segment.reference_start - p1.start
            segment_amp_relative_end = segment.reference_end - p1.start  # type: ignore
            if segment_amp_relative_start < 0:
                segment_amp_relative_start = 0

            amp_depths[segment.reference_name][amplicon][
                segment_amp_relative_start:segment_amp_relative_end
            ] += 1

            return (amplicon, False)

        return (amplicon, segment)

    else:
        for segment_of_pair in (segment1, segment2):
            if segment_of_pair.reference_start < p1_position:
                try:
                    trim(
                        segment=segment_of_pair,
                        primer_pos=p1_position,
                        end=False,
                        verbose=args.verbose,
                    )
                    if args.verbose:
                        print(
                            f"{segment_of_pair.query_name}: ref start {segment_of_pair.reference_start} >= primer_position {p1_position}",
                            file=sys.stderr,
                        )
                except Exception as e:
                    print(
                        f"{segment_of_pair.query_name}: Problem soft masking left primer (error: {e}), skipping",
                        file=sys.stderr,
                    )
                    return False

            if segment_of_pair.reference_end > p2_position:  # type: ignore
                try:
                    trim(
                        segment=segment_of_pair,
                        primer_pos=p2_position,
                        end=True,
                        verbose=args.verbose,
                    )
                    if args.verbose:
                        print(
                            f"{segment_of_pair.query_name}: ref_end {segment_of_pair.reference_end} >= primer_position {p2_position}",
                            file=sys.stderr,
                        )
                except Exception as e:
                    print(
                        f"{segment_of_pair.query_name}: Problem soft masking right primer (error: {e}), skipping",
                        file=sys.stderr,
                    )
                    return False

        # check the the alignment still contains bases matching the reference
        if "M" not in segment1.cigarstring or "M" not in segment2.cigarstring:  # type: ignore
            if args.verbose:
                print(
                    f"{segment1.query_name}: Paired segment dropped as does not match reference post masking",
                    file=sys.stderr,
                )
            return False

        if args.require_full_length:
            if segment1.reference_start < segment2.reference_start:
                if (
                    segment1.reference_start > p1.end  # type: ignore
                    or segment2.reference_end < p2.start  # type: ignore
                ):
                    if args.verbose:
                        print(
                            f"{segment1.query_name}: ref_start {segment1.reference_start} > p1.end {p1.end} or ref_end {segment2.reference_end} < p2.start {p2.start}, does not span a full amplicon, skipping",
                            file=sys.stderr,
                        )
                    return False
            else:
                if (
                    segment2.reference_start > p1.end
                    or segment1.reference_end < p2.start  # type: ignore
                ):
                    if args.verbose:
                        print(
                            f"{segment1.query_name}: ref_end {segment1.reference_end} < p2.start {p2.start} or ref_start {segment2.reference_start} > p1.end {p1.end}, does not span a full amplicon, skipping",
                            file=sys.stderr,
                        )
                    return False

        # If not normalising, write the segments to the output file and add them to amplicon depth numpy array
        if not args.normalise:
            outfile_writer.write(segment1)
            outfile_writer.write(segment2)
            for segment_in_pair in (segment1, segment2):
                # depth coordinates are relative to the start of the left primer
                segment_amp_relative_start = segment_in_pair.reference_start - p1.start
                segment_amp_relative_end = segment_in_pair.reference_end - p1.start  # type: ignore
                if segment_amp_relative_start < 0:
                    segment_amp_relative_start = 0
                amp_depths[segment1.reference_name][amplicon][
                    segment_amp_relative_start:segment_amp_relative_end
                ] += 1

            return (amplicon, False)

        return (amplicon, segment)
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def read_pair_generator(bam, region_string=None):
    """
    Yield (read1, read2) tuples for properly paired reads while iterating ``bam``.

    A read is parked under its query name until a second proper-pair read with
    the same name arrives; the two are then yielded with the read1 slot first.
    Reads not flagged as a proper pair are ignored, and a read whose partner
    never arrives is never yielded. If the second read fills the same slot as
    the parked one, the other slot of the yielded tuple is None.

    NOTE(review): ``region_string`` is accepted but never used in this function.
    """
    waiting = {}
    for read in bam:
        if not read.is_proper_pair:
            continue

        name = read.query_name
        parked = waiting.pop(name, None)

        if parked is None:
            # first sighting of this name: park the read in its own slot
            slots = [None, None]
            slots[0 if read.is_read1 else 1] = read
            waiting[name] = slots
            continue

        # second sighting: pair the incoming read with the parked slot of its mate
        if read.is_read1:
            yield read, parked[1]
        else:
            yield parked[0], read
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def create_primer_lookup(ref_len_tuple, amplicons: list[Amplicon], padding=35):
    """
    Create a lookup table for efficient primer position queries across reference genomes.

    Each chromosome gets its own 2D lookup array where:
    - Rows hold amplicons whose padded spans do not overlap within that row
    - Columns represent genomic positions
    - Values are Amplicon objects or None

    Each amplicon is written into exactly one row: the first existing row whose
    cells across the amplicon's padded span are all empty. A new row is appended
    only when every existing row collides. The row index is therefore not the
    amplicon's pool index.

    Parameters
    ----------
    ref_len_tuple : list[tuple[str, int]]
        List of tuples containing (chromosome_name, chromosome_length) pairs
        from the reference genome
    amplicons : list[Amplicon]
        Amplicon objects; ``chrom``, ``amplicon_start`` and ``amplicon_end`` are read
    padding : int, optional
        Number of bases to extend amplicon boundaries on both sides to allow
        for fuzzy matching of reads with barcodes/adapters (default: 35)

    Returns
    -------
    dict[str, np.ndarray]
        Dictionary mapping chromosome names to 2D object arrays of shape
        (N, chrom_len+1). Array elements are either Amplicon objects or None.
        A chromosome without amplicons gets a single all-None row.
    """
    lookups = {}
    for chrom, chromlen in ref_len_tuple:
        # object array of None cells; one row to start with
        lookup_array = np.empty_like(None, shape=(1, chromlen + 1))
        for amp in amplicons:
            if amp.chrom != chrom:
                continue

            # Padded span of this amplicon, clamped to the chromosome.
            # NOTE(review): the stop is clamped to chromlen, so the final
            # column (index chromlen) is never filled - confirm intended.
            span = slice(
                max(amp.amplicon_start - padding, 0),
                min(amp.amplicon_end + padding, chromlen),
            )

            for row in lookup_array:  # rows are views, so writes land in lookup_array
                window = row[span]
                # elementwise comparison against None on an object array
                if window[window != None].size == 0:  # noqa: E711
                    row[span] = amp
                    # stop at the first free row so the amplicon is stored once
                    break
            else:
                # every existing row collides: append a fresh row holding the amplicon
                new_row = np.empty_like(None, shape=(1, chromlen + 1))
                new_row[0, span] = amp
                lookup_array = np.vstack((lookup_array, new_row))

        lookups[chrom] = lookup_array
    return lookups
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
# Column order of the per-read TSV report written when --report is given.
_REPORT_HEADERS = [
    "chrom",
    "QueryName",
    "ReferenceStart",
    "ReferenceEnd",
    "PrimerPair",
    "Primer1",
    "Primer1Start",
    "Primer2",
    "Primer2Start",
    "IsSecondary",
    "IsSupplementary",
    "Start",
    "End",
    "CorrectlyPaired",
]


def _open_alignment_output(output, header):
    """Open the alignment output chosen by ``--output``.

    ``None``, an empty value or ``-`` selects plain-text SAM on stdout;
    otherwise the extension of the file name picks BAM (``.bam``) or SAM
    (``.sam``). Any other extension prints an error and exits with status 1.
    """
    # `output` is normally a pathlib.Path, so compare its string form: a Path
    # never compares equal to the literal "-".
    if not output or str(output) == "-":
        return pysam.AlignmentFile("-", "wh", header=header)

    name = Path(output).name
    if name.endswith(".bam"):
        return pysam.AlignmentFile(output, "wb", header=header)
    if name.endswith(".sam"):
        return pysam.AlignmentFile(output, "wh", header=header)

    print(
        "Output file path must end with either .bam or .sam, exiting.",
        file=sys.stderr,
    )
    sys.exit(1)


def _accept_for_normalisation(state, segments, amplicon_start, target):
    """Decide whether ``segments`` move an amplicon's depth closer to ``target``.

    ``state`` is the per-amplicon dict with a ``"depth"`` array and the current
    mean absolute ``"distance"`` from the target. The segments are added to a
    copy of the depth array; if the resulting distance is strictly smaller the
    state is updated in place and True is returned, otherwise the state is left
    untouched and False is returned.
    """
    candidate = np.copy(state["depth"])
    for seg in segments:
        # Clamp both ends at 0: a negative stop would otherwise be interpreted
        # as an offset from the end of the array.
        rel_start = max(0, seg.reference_start - amplicon_start)
        rel_end = max(0, seg.reference_end - amplicon_start)
        candidate[rel_start:rel_end] += 1

    distance = np.mean(np.abs(candidate - target))
    if distance < state["distance"]:
        state["depth"] = candidate
        state["distance"] = distance
        return True
    return False


def _write_amp_depth_report(path, mean_amp_depths):
    """Write ``{(chrom, amplicon): mean_depth}`` to ``path`` as a TSV file."""
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="") as amp_depth_report_fh:
        writer = csv.DictWriter(
            amp_depth_report_fh,
            fieldnames=["chrom", "amplicon", "mean_depth"],
            delimiter="\t",
        )
        writer.writeheader()
        for (chrom, amplicon), depth in mean_amp_depths.items():
            writer.writerow(
                {"chrom": chrom, "amplicon": amplicon, "mean_depth": depth}
            )


def go(args):
    """Filter and soft mask an alignment file so that the alignment boundaries match the primer start and end sites.

    Based on the most likely primer position, based on the alignment coordinates.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command line options. The attributes read here are ``bedfile``,
        ``samfile``, ``output``, ``report``, ``amp_depth_report``,
        ``normalise``, ``min_mapq``, ``primer_match_threshold`` and
        ``no_read_groups``; the whole namespace is also forwarded to
        ``handle_segments``.

    Notes
    -----
    * Exits with status 1 if ``normalise`` is negative, if the input contains
      no segments, or if the output path does not end in ``.bam`` / ``.sam``.
    * Whether the input is treated as paired is decided from the first
      segment only.
    * Without normalisation the segments are expected to be written by
      ``handle_segments`` itself (it receives the output writer); with
      normalisation a read (pair) is written here only if it brings the
      amplicon depth closer to the requested target.
    * The report, input and output handles are closed on every exit path.
    """
    # guard for negative normalise
    if args.normalise is not None and args.normalise < 0:
        print("normalise must be >= 0, exiting.", file=sys.stderr)
        sys.exit(1)

    reportfh = None
    infile = None
    outfile = None
    try:
        # prepare the report outfile
        report_writer = None
        if args.report:
            reportfh = open(args.report, "w", newline="")
            report_writer = csv.DictWriter(
                reportfh, fieldnames=_REPORT_HEADERS, delimiter="\t"
            )
            report_writer.writeheader()

        # open the primer scheme, merge the primers and build the amplicons
        scheme = Scheme.from_file(args.bedfile)
        scheme.bedlines = merge_primers(scheme.bedlines)

        amplicon_list = create_amplicons(scheme.bedlines)
        amplicons = {}
        for amplicon in amplicon_list:
            amplicon.length = amplicon.amplicon_end - amplicon.amplicon_start  # type: ignore
            amplicons.setdefault(amplicon.chrom, {})[amplicon.amplicon_number] = amplicon

        # one read group per pool, plus one for reads that match no pool
        pools_str = {str(bl.pool) for bl in scheme.bedlines}
        pools_str.add("unmatched")

        # open the input samfile
        if args.samfile and args.samfile != "-":
            infile = pysam.AlignmentFile(args.samfile, "rb")
        else:
            infile = pysam.AlignmentFile("-", "rb")

        first_segment = next(infile, None)
        if first_segment is None:
            print("No segments found in the input file, exiting.", file=sys.stderr)
            sys.exit(1)

        # check if the first segment is paired, then chain the saved first
        # segment with the infile iterator so nothing is lost
        paired = first_segment.is_paired
        chained_iterator = itertools.chain([first_segment], infile)

        bam_header = infile.header.copy().to_dict()
        if not args.no_read_groups:
            # set order can be non deterministic, so sort the pool names
            bam_header["RG"] = [{"ID": pool} for pool in sorted(pools_str)]

        # The input header may have no @PG section at all, in which case the
        # key is absent from the dict.
        bam_header.setdefault("PG", []).append(
            {
                "PN": "align_trim",
                "ID": "align_trim",
                "VN": version("align_trim"),
                "CL": " ".join(sys.argv),
            }
        )

        # prepare the alignment outfile
        outfile = _open_alignment_output(args.output, bam_header)

        # Initialise the amplicon depth dict
        amp_depths = {}
        for amp in amplicon_list:
            amp_depths.setdefault(amp.chrom, {}).setdefault(
                amp.amplicon_number,
                np.zeros(amp.length, dtype=int),  # type: ignore
            )

        # Create a lookup table for primer location
        ref_lengths = [(r, infile.get_reference_length(r)) for r in infile.references]
        primer_lookup = create_primer_lookup(
            ref_len_tuple=ref_lengths,
            amplicons=amplicon_list,
            padding=args.primer_match_threshold,
        )

        # Per-amplicon normalisation state: running depth array and current
        # mean absolute distance from the target depth
        norm_state = {}
        if args.normalise:
            for amp in amplicon_list:
                norm_state[(amp.chrom, amp.amplicon_number)] = {
                    "depth": np.zeros(amp.length, dtype=int),  # type: ignore
                    "distance": float(args.normalise),
                }

        # Arguments shared by every handle_segments call; the report writer is
        # only passed when a report was requested.
        handler_kwargs = {
            "lookup": primer_lookup,
            "args": args,
            "min_mapq": args.min_mapq,
            "outfile_writer": outfile,
            "amp_depths": amp_depths,
        }
        if report_writer is not None:
            handler_kwargs["report_writer"] = report_writer

        # Paired input is processed one read pair at a time, unpaired input
        # one segment at a time.
        if paired:
            records = read_pair_generator(chained_iterator)
        else:
            records = chained_iterator

        for record in records:
            trimming_tuple = handle_segments(segment=record, **handler_kwargs)
            if not trimming_tuple:
                continue

            # unpack the trimming tuple since segment passed trimming
            amplicon, trimmed = trimming_tuple

            # If we aren't normalising the segments will have already been
            # written to the outfile
            if not (args.normalise and trimmed):
                continue

            kept = (trimmed[0], trimmed[1]) if paired else (trimmed,)
            chrom = kept[0].reference_name  # type: ignore
            accepted = _accept_for_normalisation(
                norm_state[(chrom, amplicon)],
                kept,
                amplicons[chrom][amplicon].amplicon_start,
                args.normalise,
            )
            if accepted:
                for seg in kept:
                    outfile.write(seg)  # type: ignore

        # mean depth per amplicon: from the normalisation state if
        # normalising, otherwise from the depths tracked during trimming
        if args.normalise:
            mean_amp_depths = {k: np.mean(v["depth"]) for k, v in norm_state.items()}
        else:
            mean_amp_depths = {
                (chrom, amplicon): np.mean(depths)
                for chrom, chrom_amps in amp_depths.items()
                for amplicon, depths in chrom_amps.items()
            }

        # write mean amplicon depths to file
        if args.amp_depth_report:
            _write_amp_depth_report(args.amp_depth_report, mean_amp_depths)
    finally:
        # close up the file handles, including on early exits and errors
        for handle in (infile, outfile, reportfh):
            if handle is not None:
                handle.close()
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
def _build_parser():
    """Build the argument parser for the ``align_trim`` command line."""
    parser = argparse.ArgumentParser(
        description="Trim alignments from an amplicon scheme. Bam (input) can be provided by --samfile or stdin"
    )
    parser.add_argument(
        "bedfile",
        help="BED file containing the amplicon scheme",
        type=Path,
        metavar="BEDFILE",
    )
    parser.add_argument(
        "--samfile",
        "-s",
        help="Sorted SAM/BAM file containing the aligned reads, if this is not provided (or '-') then 'align_trim' will read from stdin.",
        required=False,
    )
    parser.add_argument(
        "--normalise",
        "-n",
        type=int,
        help="Subsample to N coverage per amplicon. Use 0 for no normalisation. (default: %(default)s)",
        default=0,
    )
    parser.add_argument(
        "--min-mapq",
        "-m",
        type=int,
        default=20,
        help="Minimum mapping quality to keep an aligned read (default: %(default)s)",
    )
    parser.add_argument(
        "--primer-match-threshold",
        "-p",
        type=int,
        default=35,
        help="Add -p bases of padding to the outside (5' end of primer) of primer coordinates to allow fuzzy matching for reads with barcodes/adapters. (default: %(default)s)",
    )
    parser.add_argument(
        "--report", "-r", type=Path, help="Output report TSV to filepath"
    )
    parser.add_argument(
        "--amp-depth-report",
        "-a",
        type=Path,
        help="Output amplicon depth TSV to filepath",
    )
    parser.add_argument(
        "--no-trim-primers",
        action="store_true",
        help="Do not trim primers from reads",
    )
    parser.add_argument(
        "--no-read-groups",
        dest="no_read_groups",
        help="Do not divide reads into groups in samfile output",
        action="store_true",
    )
    parser.add_argument(
        "--allow-incorrect-pairs",
        action="store_true",
        help="Allow reads to be assigned to amplicons even if the primers are not correctly paired, i.e. primer1 and primer2 are not from the same amplicon.",
    )
    parser.add_argument(
        "--require-full-length",
        action="store_true",
        help="Requires all reads to start and stop in a primer site, do not use this option if you are using rapid barcoding since the reads will not be full length.",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        default=None,
        metavar="OUTPUT",
        help="Location to write the output samfile to, the output type will be determined by the file extension. If no <OUTPUT> or '-' provided, will write plaintext samfile to stdout",
    )
    parser.add_argument("--verbose", "-v", action="store_true", help="Debug mode")
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {version('align_trim')}",
        help="Show the version of align_trim",
    )
    return parser


def main(argv=None):
    """Command line entry point: parse the options and run the trimming.

    Parameters
    ----------
    argv : list[str], optional
        Argument strings to parse instead of the process command line. When
        omitted (the default) ``argparse`` reads ``sys.argv[1:]``, so calling
        ``main()`` with no arguments behaves as before.
    """
    args = _build_parser().parse_args(argv)

    go(args)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: align_trim
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Soft-clip primer sites for SAM/BAM files generated from amplicon sequencing runs
|
|
5
|
+
Project-URL: Repository, https://github.com/artic-network/align_trim.git
|
|
6
|
+
Project-URL: Issues, https://github.com/artic-network/align_trim/issues
|
|
7
|
+
Author-email: Nick Loman <n.j.loman@bham.ac.uk>, Sam Wilkinson <s.a.j.wilkinson@bham.ac.uk>, Chris Kent <c.g.kent@bham.ac.uk>
|
|
8
|
+
Maintainer-email: Sam Wilkinson <s.a.j.wilkinson@bham.ac.uk>, Chris Kent <c.g.kent@bham.ac.uk>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: primalbedtools>=0.10.1
|
|
14
|
+
Requires-Dist: pysam
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# align_trim
|
|
18
|
+
|
|
19
|
+
Standalone version of ARTIC's fieldbioinformatics align_trim.py
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
From conda
|
|
24
|
+
```bash
|
|
25
|
+
conda install bioconda::align_trim
|
|
26
|
+
```
|
|
27
|
+
From PyPI
|
|
28
|
+
```bash
|
|
29
|
+
pip install align_trim
|
|
30
|
+
```
|
|
31
|
+
From source
|
|
32
|
+
```bash
|
|
33
|
+
git clone https://github.com/artic-network/align_trim.git
|
|
34
|
+
cd align_trim
|
|
35
|
+
uv sync
|
|
36
|
+
uv run align_trim --help
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Command Line Interface
|
|
40
|
+
|
|
41
|
+
### Basic Usage
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
align_trim [OPTIONS] BEDFILE
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
The tool reads alignment data from either a SAM/BAM file or stdin and outputs trimmed alignments to stdout in SAM format by default.
|
|
48
|
+
|
|
49
|
+
### Required Arguments
|
|
50
|
+
|
|
51
|
+
- `BEDFILE`: BED file containing the amplicon primer scheme in [v3](https://doi.org/10.5281/zenodo.16366659) format.
|
|
52
|
+
|
|
53
|
+
### Optional Arguments
|
|
54
|
+
|
|
55
|
+
#### Input/Output Options
|
|
56
|
+
|
|
57
|
+
- `--samfile`, `-s` : Sorted SAM/BAM file containing the aligned reads, if this is not provided (or '-') then 'align_trim' will read from stdin.
|
|
58
|
+
- `--output`, `-o` : Output file path. Format determined by extension (.sam/.bam). If not provided or '-', writes SAM to stdout
|
|
59
|
+
|
|
60
|
+
#### Processing Options
|
|
61
|
+
|
|
62
|
+
- `--normalise`, `-n` : Normalise to target depth N per amplicon using a greedy per-read algorithm. Each read is kept only if it brings the amplicon depth closer to the target. Use 0 for no normalisation (default: 0)
|
|
63
|
+
- `--min-mapq`, `-m` : Minimum mapping quality to keep an aligned read (default: 20)
|
|
64
|
+
- `--primer-match-threshold`, `-p` : Add this many bases of padding to the 5' end of primer coordinates to allow fuzzy matching for reads with barcodes/adapters (default: 35)
|
|
65
|
+
|
|
66
|
+
#### Primer and Read Handling
|
|
67
|
+
|
|
68
|
+
- `--no-trim-primers` : Do not trim primers from reads (by default, primers are trimmed)
|
|
69
|
+
- `--allow-incorrect-pairs` : Allow reads to be assigned to amplicons even if primers are not correctly paired
|
|
70
|
+
- `--require-full-length` : Require all reads to start and stop in primer sites (do not use with rapid barcoding)
|
|
71
|
+
|
|
72
|
+
#### Output and Reporting
|
|
73
|
+
|
|
74
|
+
- `--report`, `-r` : Output detailed report TSV to specified filepath
|
|
75
|
+
- `--amp-depth-report`, `-a` : Output mean depth for each amplicon as TSV to specified filepath
|
|
76
|
+
- `--no-read-groups` : Do not divide reads into pool-based read groups in SAM/BAM output
|
|
77
|
+
|
|
78
|
+
#### General Options
|
|
79
|
+
|
|
80
|
+
- `--verbose`, `-v` : Enable debug mode with detailed logging to stderr
|
|
81
|
+
- `--version` : Show version information
|
|
82
|
+
- `--help` : Show help message
|
|
83
|
+
|
|
84
|
+
### Examples
|
|
85
|
+
|
|
86
|
+
#### Basic trimming with primer removal
|
|
87
|
+
```bash
|
|
88
|
+
align_trim primers.bed --samfile input.bam --output trimmed.bam
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
#### Normalize coverage and generate reports
|
|
92
|
+
```bash
|
|
93
|
+
align_trim primers.bed --samfile input.bam --normalise 100 \
|
|
94
|
+
--report alignment_report.tsv --amp-depth-report depth_report.tsv \
|
|
95
|
+
--output normalized.bam
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
#### Process from stdin with verbose output
|
|
99
|
+
```bash
|
|
100
|
+
samtools view -h input.bam | align_trim primers.bed --verbose > trimmed.sam 2> verbose.out.txt
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
#### Strict full-length read filtering
|
|
104
|
+
```bash
|
|
105
|
+
align_trim primers.bed --samfile input.bam --require-full-length \
|
|
106
|
+
--min-mapq 30 --output filtered.bam
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
#### Allow mismatched primer pairs with custom threshold
|
|
110
|
+
```bash
|
|
111
|
+
align_trim primers.bed --samfile input.bam --allow-incorrect-pairs \
|
|
112
|
+
--primer-match-threshold 50 --output relaxed.bam
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Output Formats
|
|
116
|
+
|
|
117
|
+
The tool supports multiple output formats based on file extension:
|
|
118
|
+
- `.sam` - SAM format (text)
|
|
119
|
+
- `.bam` - BAM format (binary, compressed)
|
|
120
|
+
- No extension or `-` - SAM format to stdout
|
|
121
|
+
|
|
122
|
+
### Report Files
|
|
123
|
+
|
|
124
|
+
When using `--report`, a tab-separated file is generated with the following columns:
|
|
125
|
+
- `chrom`: Reference chromosome/contig
|
|
126
|
+
- `QueryName`: Read name
|
|
127
|
+
- `ReferenceStart`/`ReferenceEnd`: Alignment coordinates
|
|
128
|
+
- `PrimerPair`: Primer pair assignment
|
|
129
|
+
- `Primer1`/`Primer2`: Individual primer information
|
|
130
|
+
- `CorrectlyPaired`: Boolean indicating proper primer pairing
|
|
131
|
+
- Additional alignment metrics
|
|
132
|
+
|
|
133
|
+
The `--amp-depth-report` generates a summary of coverage depth per amplicon.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
align_trim/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
align_trim/main.py,sha256=oatjvWloOPYLYff8CZ1Zq6lYhWLNCNgn2sg1uJDeqa0,39555
|
|
3
|
+
align_trim-1.1.0.dist-info/METADATA,sha256=A3o0cC-omcEqAxlIwrPEIJHY7jqSn1_iwtMryyk_YYU,4557
|
|
4
|
+
align_trim-1.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
5
|
+
align_trim-1.1.0.dist-info/entry_points.txt,sha256=ZUSdteDVarhMD6c3JcQQcpcsotuuzDZVPfiHdRATYN4,52
|
|
6
|
+
align_trim-1.1.0.dist-info/licenses/LICENSE,sha256=E-00BwKTV_FGdgp-4TbWXU1LRY8Mbxxj6Ab9a_SqctE,1099
|
|
7
|
+
align_trim-1.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2017-2018 Nick Loman & the ZiBRA Project & the ARTIC project
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|