smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +32 -6
- smftools/cli/hmm_adata.py +232 -31
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +77 -73
- smftools/cli/preprocess_adata.py +178 -53
- smftools/cli/spatial_adata.py +149 -101
- smftools/cli_entry.py +12 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +38 -1
- smftools/config/experiment_config.py +53 -1
- smftools/constants.py +65 -0
- smftools/hmm/HMM.py +88 -0
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/converted_BAM_to_adata.py +584 -163
- smftools/informatics/h5ad_functions.py +115 -2
- smftools/informatics/modkit_extract_to_adata.py +1003 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +9 -0
- smftools/plotting/general_plotting.py +2411 -628
- smftools/plotting/hmm_plotting.py +85 -7
- smftools/preprocessing/__init__.py +1 -0
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +4 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +91 -8
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Un
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
from tqdm import tqdm
|
|
17
17
|
|
|
18
|
+
from smftools.constants import MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
|
|
18
19
|
from smftools.logging_utils import get_logger
|
|
19
20
|
from smftools.optional_imports import require
|
|
20
21
|
|
|
@@ -32,6 +33,20 @@ logger = get_logger(__name__)
|
|
|
32
33
|
|
|
33
34
|
_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
|
|
34
35
|
_EMPTY_RE = re.compile(r"^\s*$")
|
|
36
|
+
_BAM_FLAG_BITS: Tuple[Tuple[int, str], ...] = (
|
|
37
|
+
(0x1, "paired"),
|
|
38
|
+
(0x2, "proper_pair"),
|
|
39
|
+
(0x4, "unmapped"),
|
|
40
|
+
(0x8, "mate_unmapped"),
|
|
41
|
+
(0x10, "reverse"),
|
|
42
|
+
(0x20, "mate_reverse"),
|
|
43
|
+
(0x40, "read1"),
|
|
44
|
+
(0x80, "read2"),
|
|
45
|
+
(0x100, "secondary"),
|
|
46
|
+
(0x200, "qc_fail"),
|
|
47
|
+
(0x400, "duplicate"),
|
|
48
|
+
(0x800, "supplementary"),
|
|
49
|
+
)
|
|
35
50
|
|
|
36
51
|
|
|
37
52
|
def _require_pysam() -> "pysam_types":
|
|
@@ -260,6 +275,7 @@ def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int]
|
|
|
260
275
|
def align_and_sort_BAM(
|
|
261
276
|
fasta,
|
|
262
277
|
input,
|
|
278
|
+
output,
|
|
263
279
|
cfg,
|
|
264
280
|
):
|
|
265
281
|
"""
|
|
@@ -279,10 +295,9 @@ def align_and_sort_BAM(
|
|
|
279
295
|
input_suffix = input.suffix
|
|
280
296
|
input_as_fastq = input.with_name(input.stem + ".fastq")
|
|
281
297
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
|
|
298
|
+
aligned_BAM = output.parent / output.stem
|
|
285
299
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
300
|
+
|
|
286
301
|
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
287
302
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
288
303
|
|
|
@@ -1168,7 +1183,7 @@ def demux_and_index_BAM(
|
|
|
1168
1183
|
|
|
1169
1184
|
def extract_base_identities(
|
|
1170
1185
|
bam_file,
|
|
1171
|
-
|
|
1186
|
+
record,
|
|
1172
1187
|
positions,
|
|
1173
1188
|
max_reference_length,
|
|
1174
1189
|
sequence,
|
|
@@ -1179,7 +1194,7 @@ def extract_base_identities(
|
|
|
1179
1194
|
|
|
1180
1195
|
Parameters:
|
|
1181
1196
|
bam_file (str): Path to the BAM file.
|
|
1182
|
-
|
|
1197
|
+
record (str): Name of the reference record.
|
|
1183
1198
|
positions (list): Positions to extract (0-based).
|
|
1184
1199
|
max_reference_length (int): Maximum reference length for padding.
|
|
1185
1200
|
sequence (str): The sequence of the record fasta
|
|
@@ -1187,6 +1202,11 @@ def extract_base_identities(
|
|
|
1187
1202
|
Returns:
|
|
1188
1203
|
dict: Base identities from forward mapped reads.
|
|
1189
1204
|
dict: Base identities from reverse mapped reads.
|
|
1205
|
+
dict: Mismatch counts per read.
|
|
1206
|
+
dict: Mismatch trends per read.
|
|
1207
|
+
dict: Integer-encoded mismatch bases per read.
|
|
1208
|
+
dict: Base quality scores per read aligned to reference positions.
|
|
1209
|
+
dict: Read span masks per read (1 within span, 0 outside).
|
|
1190
1210
|
"""
|
|
1191
1211
|
logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
|
|
1192
1212
|
timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
|
|
@@ -1195,9 +1215,24 @@ def extract_base_identities(
|
|
|
1195
1215
|
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
1196
1216
|
rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
1197
1217
|
mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
|
|
1218
|
+
mismatch_base_identities = defaultdict(
|
|
1219
|
+
lambda: np.full(
|
|
1220
|
+
max_reference_length,
|
|
1221
|
+
MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"],
|
|
1222
|
+
dtype=np.int16,
|
|
1223
|
+
)
|
|
1224
|
+
)
|
|
1225
|
+
base_quality_scores = defaultdict(lambda: np.full(max_reference_length, -1, dtype=np.int16))
|
|
1226
|
+
read_span_masks = defaultdict(lambda: np.zeros(max_reference_length, dtype=np.int8))
|
|
1198
1227
|
|
|
1199
1228
|
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1200
1229
|
ref_seq = sequence.upper()
|
|
1230
|
+
sequence_length = len(sequence)
|
|
1231
|
+
|
|
1232
|
+
def _encode_mismatch_base(base: str) -> int:
    """Return the integer code for ``base``, using the code for "N" when it is unknown.

    The lookup is case-insensitive because the base is upper-cased first.
    """
    normalized = base.upper()
    unknown_code = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"]
    return MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT.get(normalized, unknown_code)
|
|
1201
1236
|
|
|
1202
1237
|
if backend_choice == "python":
|
|
1203
1238
|
logger.debug("Extracting base identities using python")
|
|
@@ -1205,26 +1240,45 @@ def extract_base_identities(
|
|
|
1205
1240
|
# print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
1206
1241
|
with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
|
|
1207
1242
|
total_reads = bam.mapped
|
|
1208
|
-
for read in bam.fetch(
|
|
1243
|
+
for read in bam.fetch(record):
|
|
1209
1244
|
if not read.is_mapped:
|
|
1210
1245
|
continue # Skip unmapped reads
|
|
1211
1246
|
|
|
1212
1247
|
read_name = read.query_name
|
|
1213
1248
|
query_sequence = read.query_sequence
|
|
1249
|
+
query_qualities = read.query_qualities or []
|
|
1214
1250
|
base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
|
|
1215
1251
|
|
|
1252
|
+
# Init arrays for each read in each dict
|
|
1253
|
+
mismatch_base_identities[read_name]
|
|
1254
|
+
base_quality_scores[read_name]
|
|
1255
|
+
read_span_masks[read_name]
|
|
1256
|
+
|
|
1257
|
+
if read.reference_start is not None and read.reference_end is not None:
|
|
1258
|
+
span_end = min(read.reference_end, max_reference_length)
|
|
1259
|
+
read_span_masks[read_name][read.reference_start : span_end] = 1
|
|
1260
|
+
|
|
1216
1261
|
# Use get_aligned_pairs directly with positions filtering
|
|
1217
1262
|
aligned_pairs = read.get_aligned_pairs(matches_only=True)
|
|
1218
1263
|
|
|
1219
1264
|
for read_position, reference_position in aligned_pairs:
|
|
1265
|
+
if reference_position is None or read_position is None:
|
|
1266
|
+
continue
|
|
1220
1267
|
read_base = query_sequence[read_position]
|
|
1221
1268
|
ref_base = ref_seq[reference_position]
|
|
1222
1269
|
if reference_position in positions:
|
|
1223
1270
|
base_dict[read_name][reference_position] = read_base
|
|
1271
|
+
if read_position < len(query_qualities):
|
|
1272
|
+
base_quality_scores[read_name][reference_position] = query_qualities[
|
|
1273
|
+
read_position
|
|
1274
|
+
]
|
|
1224
1275
|
|
|
1225
1276
|
# Track mismatches (excluding Ns)
|
|
1226
1277
|
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
1227
1278
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1279
|
+
mismatch_base_identities[read_name][reference_position] = (
|
|
1280
|
+
_encode_mismatch_base(read_base)
|
|
1281
|
+
)
|
|
1228
1282
|
else:
|
|
1229
1283
|
bam_path = Path(bam_file)
|
|
1230
1284
|
logger.debug("Extracting base identities using samtools")
|
|
@@ -1247,7 +1301,14 @@ def extract_base_identities(
|
|
|
1247
1301
|
elif op in {"H", "P"}:
|
|
1248
1302
|
continue
|
|
1249
1303
|
|
|
1250
|
-
|
|
1304
|
+
def _reference_span_from_cigar(cigar: str) -> int:
|
|
1305
|
+
span = 0
|
|
1306
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1307
|
+
if op in {"M", "D", "N", "=", "X"}:
|
|
1308
|
+
span += int(length_str)
|
|
1309
|
+
return span
|
|
1310
|
+
|
|
1311
|
+
cmd = ["samtools", "view", "-F", "4", str(bam_path), record]
|
|
1251
1312
|
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1252
1313
|
assert proc.stdout is not None
|
|
1253
1314
|
for line in proc.stdout:
|
|
@@ -1261,9 +1322,21 @@ def extract_base_identities(
|
|
|
1261
1322
|
pos = int(fields[3])
|
|
1262
1323
|
cigar = fields[5]
|
|
1263
1324
|
query_sequence = fields[9]
|
|
1325
|
+
qual_string = fields[10]
|
|
1264
1326
|
if cigar == "*" or query_sequence == "*":
|
|
1265
1327
|
continue
|
|
1266
1328
|
base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
|
|
1329
|
+
mismatch_base_identities[read_name]
|
|
1330
|
+
base_quality_scores[read_name]
|
|
1331
|
+
read_span_masks[read_name]
|
|
1332
|
+
qualities = (
|
|
1333
|
+
[ord(ch) - 33 for ch in qual_string] if qual_string and qual_string != "*" else []
|
|
1334
|
+
)
|
|
1335
|
+
ref_start = pos - 1
|
|
1336
|
+
ref_end = ref_start + _reference_span_from_cigar(cigar)
|
|
1337
|
+
span_end = min(ref_end, max_reference_length)
|
|
1338
|
+
if ref_start < max_reference_length:
|
|
1339
|
+
read_span_masks[read_name][ref_start:span_end] = 1
|
|
1267
1340
|
for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
|
|
1268
1341
|
if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
|
|
1269
1342
|
continue
|
|
@@ -1271,8 +1344,11 @@ def extract_base_identities(
|
|
|
1271
1344
|
ref_base = ref_seq[ref_pos]
|
|
1272
1345
|
if ref_pos in positions:
|
|
1273
1346
|
base_dict[read_name][ref_pos] = read_base
|
|
1347
|
+
if read_pos < len(qualities):
|
|
1348
|
+
base_quality_scores[read_name][ref_pos] = qualities[read_pos]
|
|
1274
1349
|
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
1275
1350
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1351
|
+
mismatch_base_identities[read_name][ref_pos] = _encode_mismatch_base(read_base)
|
|
1276
1352
|
rc = proc.wait()
|
|
1277
1353
|
if rc != 0:
|
|
1278
1354
|
stderr = proc.stderr.read() if proc.stderr else ""
|
|
@@ -1293,11 +1369,19 @@ def extract_base_identities(
|
|
|
1293
1369
|
else:
|
|
1294
1370
|
mismatch_trend_per_read[read_name] = "none"
|
|
1295
1371
|
|
|
1372
|
+
if sequence_length < max_reference_length:
|
|
1373
|
+
padding_value = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"]
|
|
1374
|
+
for mismatch_values in mismatch_base_identities.values():
|
|
1375
|
+
mismatch_values[sequence_length:] = padding_value
|
|
1376
|
+
|
|
1296
1377
|
return (
|
|
1297
1378
|
dict(fwd_base_identities),
|
|
1298
1379
|
dict(rev_base_identities),
|
|
1299
1380
|
dict(mismatch_counts_per_read),
|
|
1300
1381
|
mismatch_trend_per_read,
|
|
1382
|
+
dict(mismatch_base_identities),
|
|
1383
|
+
dict(base_quality_scores),
|
|
1384
|
+
dict(read_span_masks),
|
|
1301
1385
|
)
|
|
1302
1386
|
|
|
1303
1387
|
|
|
@@ -1312,7 +1396,7 @@ def extract_read_features_from_bam(
|
|
|
1312
1396
|
|
|
1313
1397
|
Returns:
|
|
1314
1398
|
Mapping of read name to [read_length, read_median_qscore, reference_length,
|
|
1315
|
-
mapped_length, mapping_quality].
|
|
1399
|
+
mapped_length, mapping_quality, reference_start, reference_end].
|
|
1316
1400
|
"""
|
|
1317
1401
|
logger.debug(
|
|
1318
1402
|
"Extracting read metrics from BAM using extract_read_features_from_bam: %s",
|
|
@@ -1336,12 +1420,16 @@ def extract_read_features_from_bam(
|
|
|
1336
1420
|
reference_length = reference_lengths.get(read.reference_name, float("nan"))
|
|
1337
1421
|
mapped_length = sum(end - start for start, end in read.get_blocks())
|
|
1338
1422
|
mapping_quality = float(read.mapping_quality)
|
|
1423
|
+
reference_start = float(read.reference_start)
|
|
1424
|
+
reference_end = float(read.reference_end)
|
|
1339
1425
|
read_metrics[read.query_name] = [
|
|
1340
1426
|
float(read.query_length),
|
|
1341
1427
|
median_read_quality,
|
|
1342
1428
|
float(reference_length),
|
|
1343
1429
|
float(mapped_length),
|
|
1344
1430
|
mapping_quality,
|
|
1431
|
+
reference_start,
|
|
1432
|
+
reference_end,
|
|
1345
1433
|
]
|
|
1346
1434
|
return read_metrics
|
|
1347
1435
|
|
|
@@ -1372,6 +1460,14 @@ def extract_read_features_from_bam(
|
|
|
1372
1460
|
mapped += length
|
|
1373
1461
|
return mapped
|
|
1374
1462
|
|
|
1463
|
+
def _reference_span_from_cigar(cigar: str) -> int:
|
|
1464
|
+
reference_span = 0
|
|
1465
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1466
|
+
length = int(length_str)
|
|
1467
|
+
if op in {"M", "D", "N", "=", "X"}:
|
|
1468
|
+
reference_span += length
|
|
1469
|
+
return reference_span
|
|
1470
|
+
|
|
1375
1471
|
header_cp = subprocess.run(
|
|
1376
1472
|
["samtools", "view", "-H", str(bam_path)],
|
|
1377
1473
|
stdout=subprocess.PIPE,
|
|
@@ -1402,6 +1498,7 @@ def extract_read_features_from_bam(
|
|
|
1402
1498
|
reference_name = fields[2]
|
|
1403
1499
|
mapping_quality = float(fields[4])
|
|
1404
1500
|
cigar = fields[5]
|
|
1501
|
+
reference_start = float(int(fields[3]) - 1)
|
|
1405
1502
|
sequence = fields[9]
|
|
1406
1503
|
quality = fields[10]
|
|
1407
1504
|
if sequence == "*":
|
|
@@ -1415,12 +1512,18 @@ def extract_read_features_from_bam(
|
|
|
1415
1512
|
median_read_quality = float(np.median(phreds))
|
|
1416
1513
|
reference_length = float(reference_lengths.get(reference_name, float("nan")))
|
|
1417
1514
|
mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
|
|
1515
|
+
if cigar != "*":
|
|
1516
|
+
reference_end = float(reference_start + _reference_span_from_cigar(cigar))
|
|
1517
|
+
else:
|
|
1518
|
+
reference_end = float("nan")
|
|
1418
1519
|
read_metrics[read_name] = [
|
|
1419
1520
|
read_length,
|
|
1420
1521
|
median_read_quality,
|
|
1421
1522
|
reference_length,
|
|
1422
1523
|
mapped_length,
|
|
1423
1524
|
mapping_quality,
|
|
1525
|
+
reference_start,
|
|
1526
|
+
reference_end,
|
|
1424
1527
|
]
|
|
1425
1528
|
|
|
1426
1529
|
rc = proc.wait()
|
|
@@ -1431,6 +1534,253 @@ def extract_read_features_from_bam(
|
|
|
1431
1534
|
return read_metrics
|
|
1432
1535
|
|
|
1433
1536
|
|
|
1537
|
+
# SAM "B" array subtype letter -> ``array.array`` typecode.
_SAM_ARRAY_TYPECODES = {
    "c": "b",
    "C": "B",
    "s": "h",
    "S": "H",
    "i": "i",
    "I": "I",
    "f": "f",
}


def _parse_sam_tag_value(tag_type: str, raw_value: str) -> object:
    """Convert the text of one SAM optional field into a typed Python value.

    Args:
        tag_type: The single-letter SAM type code (``A``, ``i``, ``f``, ``Z``, ``H``, ``B``).
        raw_value: The text that follows ``TAG:TYPE:`` in the SAM record.

    Returns:
        ``int`` for ``i``, ``float`` for ``f``, an ``array.array`` for ``B``, and the
        unchanged string for every other type. If the text cannot be converted the
        unchanged string is returned instead of raising.
    """
    from array import array

    try:
        if tag_type == "i":
            return int(raw_value)
        if tag_type == "f":
            return float(raw_value)
        if tag_type == "B":
            # Layout is "<subtype>,v1,v2,..."; an empty array is just "<subtype>".
            subtype, _, payload = raw_value.partition(",")
            convert = float if subtype == "f" else int
            items = [convert(item) for item in payload.split(",")] if payload else []
            return array(_SAM_ARRAY_TYPECODES[subtype], items)
    except (KeyError, ValueError, OverflowError):
        # Malformed field: keep the text rather than lose the value.
        return raw_value
    return raw_value


def extract_read_tags_from_bam(
    bam_file_path: str | Path,
    tag_names: Iterable[str] | None = None,
    include_flags: bool = True,
    include_cigar: bool = True,
    samtools_backend: str | None = "auto",
) -> Dict[str, Dict[str, object]]:
    """Extract per-read tag metadata from a BAM file.

    Args:
        bam_file_path: Path to the BAM file.
        tag_names: Iterable of BAM tag names to extract (e.g., ["NM", "MD", "MM", "ML"]).
            If None, only flags/cigar are populated. Each tag is looked up exactly as
            given first and then upper-cased, so both ``"nm"`` (meaning ``NM``) and a
            genuinely lowercase tag such as ``"mv"`` can be found.
        include_flags: Whether to include a list of flag names for each read.
        include_cigar: Whether to include the CIGAR string for each read.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        Mapping of read name to a dict of extracted values. The dict holds ``"CIGAR"``
        and ``"FLAGS"`` when requested, plus one entry per requested tag keyed by the
        upper-cased tag name; a tag that is absent from the record maps to ``None``.
        When a read name occurs on several records, the last record seen wins.

    Raises:
        RuntimeError: If the ``samtools view`` subprocess exits with a non-zero status.
    """
    backend_choice = _resolve_samtools_backend(samtools_backend)
    # (output key, name as requested). Output keys stay upper-cased so existing
    # callers keep finding e.g. tag_map["NM"] regardless of how the tag was spelled.
    requested_tags = [(tag.upper(), tag) for tag in tag_names] if tag_names else []
    read_tags: Dict[str, Dict[str, object]] = {}

    def _decode_flags(flag: int) -> list[str]:
        return [name for bit, name in _BAM_FLAG_BITS if flag & bit]

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        # NOTE(review): this branch keeps unmapped records, whereas the CLI branch
        # below filters them with "-F 4" — confirm which behaviour callers expect.
        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
            for read in bam_file.fetch(until_eof=True):
                if not read.query_name:
                    continue
                tag_map: Dict[str, object] = {}
                if include_cigar:
                    tag_map["CIGAR"] = read.cigarstring
                if include_flags:
                    tag_map["FLAGS"] = _decode_flags(read.flag)
                for output_key, requested in requested_tags:
                    value: object = None
                    # BAM tag names are case-sensitive: try the caller's spelling,
                    # then the upper-cased form (dict.fromkeys drops a duplicate).
                    for candidate in dict.fromkeys((requested, output_key)):
                        try:
                            value = read.get_tag(candidate)
                        except (KeyError, ValueError):
                            continue
                        break
                    tag_map[output_key] = value
                read_tags[read.query_name] = tag_map
        return read_tags

    cmd = ["samtools", "view", "-F", "4", str(bam_file_path)]
    # The context manager closes both pipes and reaps the child even if parsing fails.
    with subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            if not line.strip() or line.startswith("@"):
                continue
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 11:
                # Fewer than the 11 mandatory SAM columns: not an alignment record.
                continue
            read_name = fields[0]
            tag_map = {}
            if include_cigar:
                tag_map["CIGAR"] = fields[5]
            if include_flags:
                tag_map["FLAGS"] = _decode_flags(int(fields[1]))
            if requested_tags:
                exact_fields: Dict[str, Tuple[str, str]] = {}
                folded_fields: Dict[str, Tuple[str, str]] = {}
                for raw_tag in fields[11:]:
                    parts = raw_tag.split(":", 2)
                    if len(parts) != 3:
                        continue
                    tag_name, tag_type, raw_value = parts
                    exact_fields[tag_name] = (tag_type, raw_value)
                    folded_fields[tag_name.upper()] = (tag_type, raw_value)
                for output_key, requested in requested_tags:
                    hit = exact_fields.get(requested) or folded_fields.get(output_key)
                    # Only requested tags are converted, so large unrequested
                    # arrays are never parsed.
                    tag_map[output_key] = _parse_sam_tag_value(*hit) if hit else None
            read_tags[read_name] = tag_map
        rc = proc.wait()
        stderr = proc.stderr.read() if proc.stderr else ""
    if rc != 0:
        raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    return read_tags
|
|
1616
|
+
|
|
1617
|
+
|
|
1618
|
+
def find_secondary_supplementary_read_names(
|
|
1619
|
+
bam_file_path: str | Path,
|
|
1620
|
+
read_names: Iterable[str],
|
|
1621
|
+
samtools_backend: str | None = "auto",
|
|
1622
|
+
) -> tuple[set[str], set[str]]:
|
|
1623
|
+
"""Find read names with secondary or supplementary alignments in a BAM.
|
|
1624
|
+
|
|
1625
|
+
Args:
|
|
1626
|
+
bam_file_path: Path to the BAM file to scan.
|
|
1627
|
+
read_names: Iterable of read names to check.
|
|
1628
|
+
samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
|
|
1629
|
+
|
|
1630
|
+
Returns:
|
|
1631
|
+
Tuple of (secondary_read_names, supplementary_read_names).
|
|
1632
|
+
"""
|
|
1633
|
+
target_names = set(read_names)
|
|
1634
|
+
if not target_names:
|
|
1635
|
+
return set(), set()
|
|
1636
|
+
|
|
1637
|
+
secondary_reads: set[str] = set()
|
|
1638
|
+
supplementary_reads: set[str] = set()
|
|
1639
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1640
|
+
|
|
1641
|
+
if backend_choice == "python":
|
|
1642
|
+
pysam_mod = _require_pysam()
|
|
1643
|
+
with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
|
|
1644
|
+
for read in bam_file.fetch(until_eof=True):
|
|
1645
|
+
if not read.query_name or read.query_name not in target_names:
|
|
1646
|
+
continue
|
|
1647
|
+
if read.is_secondary:
|
|
1648
|
+
secondary_reads.add(read.query_name)
|
|
1649
|
+
if read.is_supplementary:
|
|
1650
|
+
supplementary_reads.add(read.query_name)
|
|
1651
|
+
else:
|
|
1652
|
+
|
|
1653
|
+
def _collect(flag: int) -> set[str]:
|
|
1654
|
+
cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
|
|
1655
|
+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1656
|
+
assert proc.stdout is not None
|
|
1657
|
+
hits: set[str] = set()
|
|
1658
|
+
for line in proc.stdout:
|
|
1659
|
+
if not line.strip() or line.startswith("@"):
|
|
1660
|
+
continue
|
|
1661
|
+
read_name = line.split("\t", 1)[0]
|
|
1662
|
+
if read_name in target_names:
|
|
1663
|
+
hits.add(read_name)
|
|
1664
|
+
rc = proc.wait()
|
|
1665
|
+
if rc != 0:
|
|
1666
|
+
stderr = proc.stderr.read() if proc.stderr else ""
|
|
1667
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
1668
|
+
return hits
|
|
1669
|
+
|
|
1670
|
+
secondary_reads = _collect(0x100)
|
|
1671
|
+
supplementary_reads = _collect(0x800)
|
|
1672
|
+
|
|
1673
|
+
return secondary_reads, supplementary_reads
|
|
1674
|
+
|
|
1675
|
+
|
|
1676
|
+
def extract_secondary_supplementary_alignment_spans(
|
|
1677
|
+
bam_file_path: str | Path,
|
|
1678
|
+
read_names: Iterable[str],
|
|
1679
|
+
samtools_backend: str | None = "auto",
|
|
1680
|
+
) -> tuple[
|
|
1681
|
+
dict[str, list[tuple[float, float, float]]], dict[str, list[tuple[float, float, float]]]
|
|
1682
|
+
]:
|
|
1683
|
+
"""Extract reference/read span data for secondary/supplementary alignments.
|
|
1684
|
+
|
|
1685
|
+
Args:
|
|
1686
|
+
bam_file_path: Path to the BAM file to scan.
|
|
1687
|
+
read_names: Iterable of read names to check.
|
|
1688
|
+
samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
|
|
1689
|
+
|
|
1690
|
+
Returns:
|
|
1691
|
+
Tuple of (secondary_spans, supplementary_spans) where each mapping contains
|
|
1692
|
+
read names mapped to lists of (reference_start, reference_end, read_span).
|
|
1693
|
+
"""
|
|
1694
|
+
target_names = set(read_names)
|
|
1695
|
+
if not target_names:
|
|
1696
|
+
return {}, {}
|
|
1697
|
+
|
|
1698
|
+
secondary_spans: dict[str, list[tuple[float, float, float]]] = {}
|
|
1699
|
+
supplementary_spans: dict[str, list[tuple[float, float, float]]] = {}
|
|
1700
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1701
|
+
|
|
1702
|
+
if backend_choice == "python":
|
|
1703
|
+
pysam_mod = _require_pysam()
|
|
1704
|
+
with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
|
|
1705
|
+
for read in bam_file.fetch(until_eof=True):
|
|
1706
|
+
if not read.query_name or read.query_name not in target_names:
|
|
1707
|
+
continue
|
|
1708
|
+
if not (read.is_secondary or read.is_supplementary):
|
|
1709
|
+
continue
|
|
1710
|
+
reference_start = (
|
|
1711
|
+
float(read.reference_start)
|
|
1712
|
+
if read.reference_start is not None
|
|
1713
|
+
else float("nan")
|
|
1714
|
+
)
|
|
1715
|
+
reference_end = (
|
|
1716
|
+
float(read.reference_end) if read.reference_end is not None else float("nan")
|
|
1717
|
+
)
|
|
1718
|
+
read_span = (
|
|
1719
|
+
float(read.query_alignment_length)
|
|
1720
|
+
if read.query_alignment_length is not None
|
|
1721
|
+
else float("nan")
|
|
1722
|
+
)
|
|
1723
|
+
if read.is_secondary:
|
|
1724
|
+
secondary_spans.setdefault(read.query_name, []).append(
|
|
1725
|
+
(reference_start, reference_end, read_span)
|
|
1726
|
+
)
|
|
1727
|
+
if read.is_supplementary:
|
|
1728
|
+
supplementary_spans.setdefault(read.query_name, []).append(
|
|
1729
|
+
(reference_start, reference_end, read_span)
|
|
1730
|
+
)
|
|
1731
|
+
return secondary_spans, supplementary_spans
|
|
1732
|
+
|
|
1733
|
+
def _mapped_length_from_cigar(cigar: str) -> int:
|
|
1734
|
+
mapped = 0
|
|
1735
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1736
|
+
length = int(length_str)
|
|
1737
|
+
if op in {"M", "=", "X"}:
|
|
1738
|
+
mapped += length
|
|
1739
|
+
return mapped
|
|
1740
|
+
|
|
1741
|
+
def _reference_span_from_cigar(cigar: str) -> int:
|
|
1742
|
+
reference_span = 0
|
|
1743
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1744
|
+
length = int(length_str)
|
|
1745
|
+
if op in {"M", "D", "N", "=", "X"}:
|
|
1746
|
+
reference_span += length
|
|
1747
|
+
return reference_span
|
|
1748
|
+
|
|
1749
|
+
def _collect(flag: int) -> dict[str, list[tuple[float, float, float]]]:
|
|
1750
|
+
cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
|
|
1751
|
+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1752
|
+
assert proc.stdout is not None
|
|
1753
|
+
spans: dict[str, list[tuple[float, float, float]]] = {}
|
|
1754
|
+
for line in proc.stdout:
|
|
1755
|
+
if not line.strip() or line.startswith("@"):
|
|
1756
|
+
continue
|
|
1757
|
+
fields = line.rstrip("\n").split("\t")
|
|
1758
|
+
if len(fields) < 11:
|
|
1759
|
+
continue
|
|
1760
|
+
read_name = fields[0]
|
|
1761
|
+
if read_name not in target_names:
|
|
1762
|
+
continue
|
|
1763
|
+
cigar = fields[5]
|
|
1764
|
+
reference_start = float(int(fields[3]) - 1)
|
|
1765
|
+
if cigar != "*":
|
|
1766
|
+
reference_end = float(reference_start + _reference_span_from_cigar(cigar))
|
|
1767
|
+
read_span = float(_mapped_length_from_cigar(cigar))
|
|
1768
|
+
else:
|
|
1769
|
+
reference_end = float("nan")
|
|
1770
|
+
read_span = float("nan")
|
|
1771
|
+
spans.setdefault(read_name, []).append((reference_start, reference_end, read_span))
|
|
1772
|
+
rc = proc.wait()
|
|
1773
|
+
if rc != 0:
|
|
1774
|
+
stderr = proc.stderr.read() if proc.stderr else ""
|
|
1775
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
1776
|
+
return spans
|
|
1777
|
+
|
|
1778
|
+
secondary_spans = _collect(0x100)
|
|
1779
|
+
supplementary_spans = _collect(0x800)
|
|
1780
|
+
|
|
1781
|
+
return secondary_spans, supplementary_spans
|
|
1782
|
+
|
|
1783
|
+
|
|
1434
1784
|
def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
|
|
1435
1785
|
"""
|
|
1436
1786
|
Takes a BAM and writes out a txt file containing read names from the BAM
|