smftools-0.3.0-py3-none-any.whl → smftools-0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +32 -6
  3. smftools/cli/hmm_adata.py +232 -31
  4. smftools/cli/latent_adata.py +318 -0
  5. smftools/cli/load_adata.py +77 -73
  6. smftools/cli/preprocess_adata.py +178 -53
  7. smftools/cli/spatial_adata.py +149 -101
  8. smftools/cli_entry.py +12 -0
  9. smftools/config/conversion.yaml +11 -1
  10. smftools/config/default.yaml +38 -1
  11. smftools/config/experiment_config.py +53 -1
  12. smftools/constants.py +65 -0
  13. smftools/hmm/HMM.py +88 -0
  14. smftools/informatics/__init__.py +6 -0
  15. smftools/informatics/bam_functions.py +358 -8
  16. smftools/informatics/converted_BAM_to_adata.py +584 -163
  17. smftools/informatics/h5ad_functions.py +115 -2
  18. smftools/informatics/modkit_extract_to_adata.py +1003 -425
  19. smftools/informatics/sequence_encoding.py +72 -0
  20. smftools/logging_utils.py +21 -2
  21. smftools/metadata.py +1 -1
  22. smftools/plotting/__init__.py +9 -0
  23. smftools/plotting/general_plotting.py +2411 -628
  24. smftools/plotting/hmm_plotting.py +85 -7
  25. smftools/preprocessing/__init__.py +1 -0
  26. smftools/preprocessing/append_base_context.py +17 -17
  27. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  28. smftools/preprocessing/calculate_consensus.py +1 -1
  29. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  30. smftools/readwrite.py +53 -17
  31. smftools/schema/anndata_schema_v1.yaml +15 -1
  32. smftools/tools/__init__.py +4 -0
  33. smftools/tools/calculate_leiden.py +57 -0
  34. smftools/tools/calculate_nmf.py +119 -0
  35. smftools/tools/calculate_umap.py +91 -8
  36. smftools/tools/rolling_nn_distance.py +235 -0
  37. smftools/tools/tensor_factorization.py +169 -0
  38. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
  39. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
  40. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  41. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  42. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Un
  import numpy as np
  from tqdm import tqdm

+ from smftools.constants import MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
  from smftools.logging_utils import get_logger
  from smftools.optional_imports import require

@@ -32,6 +33,20 @@ logger = get_logger(__name__)

  _PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
  _EMPTY_RE = re.compile(r"^\s*$")
+ _BAM_FLAG_BITS: Tuple[Tuple[int, str], ...] = (
+     (0x1, "paired"),
+     (0x2, "proper_pair"),
+     (0x4, "unmapped"),
+     (0x8, "mate_unmapped"),
+     (0x10, "reverse"),
+     (0x20, "mate_reverse"),
+     (0x40, "read1"),
+     (0x80, "read2"),
+     (0x100, "secondary"),
+     (0x200, "qc_fail"),
+     (0x400, "duplicate"),
+     (0x800, "supplementary"),
+ )


  def _require_pysam() -> "pysam_types":
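The _BAM_FLAG_BITS table added above mirrors the standard SAM FLAG bit definitions, so a combined flag value can be decoded into readable names by testing each bit. A minimal standalone sketch of that decoding (restating the same bit values; not part of the diff itself):

# Decode a SAM FLAG integer into the names used by the _BAM_FLAG_BITS table above.
BAM_FLAG_BITS = (
    (0x1, "paired"), (0x2, "proper_pair"), (0x4, "unmapped"), (0x8, "mate_unmapped"),
    (0x10, "reverse"), (0x20, "mate_reverse"), (0x40, "read1"), (0x80, "read2"),
    (0x100, "secondary"), (0x200, "qc_fail"), (0x400, "duplicate"), (0x800, "supplementary"),
)

def decode_flags(flag: int) -> list[str]:
    # Keep every name whose bit is set in the flag value.
    return [name for bit, name in BAM_FLAG_BITS if flag & bit]

print(decode_flags(0x900))  # ['secondary', 'supplementary'] because 0x900 == 0x100 | 0x800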
@@ -260,6 +275,7 @@ def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int]
  def align_and_sort_BAM(
      fasta,
      input,
+     output,
      cfg,
  ):
      """
@@ -279,10 +295,9 @@ def align_and_sort_BAM(
      input_suffix = input.suffix
      input_as_fastq = input.with_name(input.stem + ".fastq")

-     output_path_minus_suffix = cfg.output_directory / input.stem
-
-     aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
+     aligned_BAM = output.parent / output.stem
      aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
+
      aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
      aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)

@@ -1168,7 +1183,7 @@ def demux_and_index_BAM(

  def extract_base_identities(
      bam_file,
-     chromosome,
+     record,
      positions,
      max_reference_length,
      sequence,
@@ -1179,7 +1194,7 @@ def extract_base_identities(

      Parameters:
          bam_file (str): Path to the BAM file.
-         chromosome (str): Name of the reference chromosome.
+         record (str): Name of the reference record.
          positions (list): Positions to extract (0-based).
          max_reference_length (int): Maximum reference length for padding.
          sequence (str): The sequence of the record fasta
@@ -1187,6 +1202,11 @@ def extract_base_identities(
      Returns:
          dict: Base identities from forward mapped reads.
          dict: Base identities from reverse mapped reads.
+         dict: Mismatch counts per read.
+         dict: Mismatch trends per read.
+         dict: Integer-encoded mismatch bases per read.
+         dict: Base quality scores per read aligned to reference positions.
+         dict: Read span masks per read (1 within span, 0 outside).
      """
      logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
      timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
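With the five additional return values documented above, callers of extract_base_identities now unpack seven dictionaries instead of four. A hedged usage sketch (the import path, file name, and argument values are placeholders; any keyword arguments beyond those shown in the signature above are not part of this diff):

# Hypothetical call site; the function is assumed importable from this module.
from smftools.informatics.bam_functions import extract_base_identities

(
    fwd_bases,         # dict: forward-strand base identities per read
    rev_bases,         # dict: reverse-strand base identities per read
    mismatch_counts,   # dict: per-read counts of ref_base -> read_base mismatches
    mismatch_trends,   # dict: per-read mismatch trend label
    mismatch_encoded,  # dict: integer-encoded mismatch bases per read (new)
    base_qualities,    # dict: per-position base quality scores, -1 where absent (new)
    span_masks,        # dict: 1 inside the read's reference span, 0 outside (new)
) = extract_base_identities(
    "sample.bam",                  # placeholder BAM path
    "record_1",                    # placeholder reference record name
    positions=set(range(1000)),
    max_reference_length=1000,
    sequence="A" * 1000,
)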
@@ -1195,9 +1215,24 @@ def extract_base_identities(
      fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
      rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
      mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
+     mismatch_base_identities = defaultdict(
+         lambda: np.full(
+             max_reference_length,
+             MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"],
+             dtype=np.int16,
+         )
+     )
+     base_quality_scores = defaultdict(lambda: np.full(max_reference_length, -1, dtype=np.int16))
+     read_span_masks = defaultdict(lambda: np.zeros(max_reference_length, dtype=np.int8))

      backend_choice = _resolve_samtools_backend(samtools_backend)
      ref_seq = sequence.upper()
+     sequence_length = len(sequence)
+
+     def _encode_mismatch_base(base: str) -> int:
+         return MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT.get(
+             base.upper(), MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"]
+         )

      if backend_choice == "python":
          logger.debug("Extracting base identities using python")
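_encode_mismatch_base falls back to the integer code for "N" whenever a read base is missing from the lookup table. The actual MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT values live in smftools/constants.py and are not shown in this hunk, so the sketch below uses a hypothetical mapping purely to illustrate that fallback:

# Hypothetical stand-in for MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT (real values may differ).
BASE_TO_INT = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4, "PAD": 5}

def encode_mismatch_base(base: str) -> int:
    # Unknown or ambiguous bases collapse onto the "N" code, mirroring _encode_mismatch_base.
    return BASE_TO_INT.get(base.upper(), BASE_TO_INT["N"])

assert encode_mismatch_base("g") == BASE_TO_INT["G"]  # lookup is case-insensitive
assert encode_mismatch_base("R") == BASE_TO_INT["N"]  # IUPAC ambiguity codes map to "N"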
@@ -1205,26 +1240,45 @@ def extract_base_identities(
          # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
          with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
              total_reads = bam.mapped
-             for read in bam.fetch(chromosome):
+             for read in bam.fetch(record):
                  if not read.is_mapped:
                      continue # Skip unmapped reads

                  read_name = read.query_name
                  query_sequence = read.query_sequence
+                 query_qualities = read.query_qualities or []
                  base_dict = rev_base_identities if read.is_reverse else fwd_base_identities

+                 # Init arrays for each read in each dict
+                 mismatch_base_identities[read_name]
+                 base_quality_scores[read_name]
+                 read_span_masks[read_name]
+
+                 if read.reference_start is not None and read.reference_end is not None:
+                     span_end = min(read.reference_end, max_reference_length)
+                     read_span_masks[read_name][read.reference_start : span_end] = 1
+
                  # Use get_aligned_pairs directly with positions filtering
                  aligned_pairs = read.get_aligned_pairs(matches_only=True)

                  for read_position, reference_position in aligned_pairs:
+                     if reference_position is None or read_position is None:
+                         continue
                      read_base = query_sequence[read_position]
                      ref_base = ref_seq[reference_position]
                      if reference_position in positions:
                          base_dict[read_name][reference_position] = read_base
+                         if read_position < len(query_qualities):
+                             base_quality_scores[read_name][reference_position] = query_qualities[
+                                 read_position
+                             ]

                      # Track mismatches (excluding Ns)
                      if read_base != ref_base and read_base != "N" and ref_base != "N":
                          mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+                         mismatch_base_identities[read_name][reference_position] = (
+                             _encode_mismatch_base(read_base)
+                         )
      else:
          bam_path = Path(bam_file)
          logger.debug("Extracting base identities using samtools")
@@ -1247,7 +1301,14 @@ def extract_base_identities(
                  elif op in {"H", "P"}:
                      continue

-         cmd = ["samtools", "view", "-F", "4", str(bam_path), chromosome]
+         def _reference_span_from_cigar(cigar: str) -> int:
+             span = 0
+             for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+                 if op in {"M", "D", "N", "=", "X"}:
+                     span += int(length_str)
+             return span
+
+         cmd = ["samtools", "view", "-F", "4", str(bam_path), record]
          proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
          assert proc.stdout is not None
          for line in proc.stdout:
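The new _reference_span_from_cigar helper sums only the CIGAR operations that consume reference bases (M, D, N, =, X), so insertions and clipping do not extend the span. A small worked example of the same rule:

import re

def reference_span_from_cigar(cigar: str) -> int:
    # Only reference-consuming operations contribute to the span.
    span = 0
    for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
        if op in {"M", "D", "N", "=", "X"}:
            span += int(length_str)
    return span

# 10M (10) + 2I (0) + 5M (5) + 3D (3) + 20M (20) + 4S (0) = 38 reference bases
assert reference_span_from_cigar("10M2I5M3D20M4S") == 38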
@@ -1261,9 +1322,21 @@ def extract_base_identities(
              pos = int(fields[3])
              cigar = fields[5]
              query_sequence = fields[9]
+             qual_string = fields[10]
              if cigar == "*" or query_sequence == "*":
                  continue
              base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
+             mismatch_base_identities[read_name]
+             base_quality_scores[read_name]
+             read_span_masks[read_name]
+             qualities = (
+                 [ord(ch) - 33 for ch in qual_string] if qual_string and qual_string != "*" else []
+             )
+             ref_start = pos - 1
+             ref_end = ref_start + _reference_span_from_cigar(cigar)
+             span_end = min(ref_end, max_reference_length)
+             if ref_start < max_reference_length:
+                 read_span_masks[read_name][ref_start:span_end] = 1
              for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
                  if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
                      continue
@@ -1271,8 +1344,11 @@ def extract_base_identities(
                  ref_base = ref_seq[ref_pos]
                  if ref_pos in positions:
                      base_dict[read_name][ref_pos] = read_base
+                     if read_pos < len(qualities):
+                         base_quality_scores[read_name][ref_pos] = qualities[read_pos]
                  if read_base != ref_base and read_base != "N" and ref_base != "N":
                      mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+                     mismatch_base_identities[read_name][ref_pos] = _encode_mismatch_base(read_base)
          rc = proc.wait()
          if rc != 0:
              stderr = proc.stderr.read() if proc.stderr else ""
@@ -1293,11 +1369,19 @@ def extract_base_identities(
          else:
              mismatch_trend_per_read[read_name] = "none"

+     if sequence_length < max_reference_length:
+         padding_value = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"]
+         for mismatch_values in mismatch_base_identities.values():
+             mismatch_values[sequence_length:] = padding_value
+
      return (
          dict(fwd_base_identities),
          dict(rev_base_identities),
          dict(mismatch_counts_per_read),
          mismatch_trend_per_read,
+         dict(mismatch_base_identities),
+         dict(base_quality_scores),
+         dict(read_span_masks),
      )


@@ -1312,7 +1396,7 @@ def extract_read_features_from_bam(

      Returns:
          Mapping of read name to [read_length, read_median_qscore, reference_length,
-             mapped_length, mapping_quality].
+             mapped_length, mapping_quality, reference_start, reference_end].
      """
      logger.debug(
          "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
@@ -1336,12 +1420,16 @@ def extract_read_features_from_bam(
                  reference_length = reference_lengths.get(read.reference_name, float("nan"))
                  mapped_length = sum(end - start for start, end in read.get_blocks())
                  mapping_quality = float(read.mapping_quality)
+                 reference_start = float(read.reference_start)
+                 reference_end = float(read.reference_end)
                  read_metrics[read.query_name] = [
                      float(read.query_length),
                      median_read_quality,
                      float(reference_length),
                      float(mapped_length),
                      mapping_quality,
+                     reference_start,
+                     reference_end,
                  ]
          return read_metrics

@@ -1372,6 +1460,14 @@ def extract_read_features_from_bam(
                  mapped += length
          return mapped

+     def _reference_span_from_cigar(cigar: str) -> int:
+         reference_span = 0
+         for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+             length = int(length_str)
+             if op in {"M", "D", "N", "=", "X"}:
+                 reference_span += length
+         return reference_span
+
      header_cp = subprocess.run(
          ["samtools", "view", "-H", str(bam_path)],
          stdout=subprocess.PIPE,
@@ -1402,6 +1498,7 @@ def extract_read_features_from_bam(
          reference_name = fields[2]
          mapping_quality = float(fields[4])
          cigar = fields[5]
+         reference_start = float(int(fields[3]) - 1)
          sequence = fields[9]
          quality = fields[10]
          if sequence == "*":
@@ -1415,12 +1512,18 @@ def extract_read_features_from_bam(
          median_read_quality = float(np.median(phreds))
          reference_length = float(reference_lengths.get(reference_name, float("nan")))
          mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
+         if cigar != "*":
+             reference_end = float(reference_start + _reference_span_from_cigar(cigar))
+         else:
+             reference_end = float("nan")
          read_metrics[read_name] = [
              read_length,
              median_read_quality,
              reference_length,
              mapped_length,
              mapping_quality,
+             reference_start,
+             reference_end,
          ]

      rc = proc.wait()
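With reference_start and reference_end appended, each entry in read_metrics is now a seven-element list in the order documented in the Returns section above. A hedged sketch of tabulating that mapping (pandas usage, the toy read name, and the values are illustrative, not part of this diff):

import pandas as pd

# Toy stand-in for the mapping returned by extract_read_features_from_bam.
read_metrics = {"read_0001": [812.0, 17.0, 4000.0, 790.0, 60.0, 120.0, 910.0]}
columns = [
    "read_length",
    "read_median_qscore",
    "reference_length",
    "mapped_length",
    "mapping_quality",
    "reference_start",
    "reference_end",
]
metrics_df = pd.DataFrame.from_dict(read_metrics, orient="index", columns=columns)
print(metrics_df)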
@@ -1431,6 +1534,253 @@ def extract_read_features_from_bam(
      return read_metrics


+ def extract_read_tags_from_bam(
+     bam_file_path: str | Path,
+     tag_names: Iterable[str] | None = None,
+     include_flags: bool = True,
+     include_cigar: bool = True,
+     samtools_backend: str | None = "auto",
+ ) -> Dict[str, Dict[str, object]]:
+     """Extract per-read tag metadata from a BAM file.
+
+     Args:
+         bam_file_path: Path to the BAM file.
+         tag_names: Iterable of BAM tag names to extract (e.g., ["NM", "MD", "MM", "ML"]).
+             If None, only flags/cigar are populated.
+         include_flags: Whether to include a list of flag names for each read.
+         include_cigar: Whether to include the CIGAR string for each read.
+         samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
+     Returns:
+         Mapping of read name to a dict of extracted tag values.
+     """
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+     tag_names_list = [tag.upper() for tag in tag_names] if tag_names else []
+     read_tags: Dict[str, Dict[str, object]] = {}
+
+     def _decode_flags(flag: int) -> list[str]:
+         return [name for bit, name in _BAM_FLAG_BITS if flag & bit]
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+             for read in bam_file.fetch(until_eof=True):
+                 if not read.query_name:
+                     continue
+                 tag_map: Dict[str, object] = {}
+                 if include_cigar:
+                     tag_map["CIGAR"] = read.cigarstring
+                 if include_flags:
+                     tag_map["FLAGS"] = _decode_flags(read.flag)
+                 for tag in tag_names_list:
+                     try:
+                         tag_map[tag] = read.get_tag(tag)
+                     except Exception:
+                         tag_map[tag] = None
+                 read_tags[read.query_name] = tag_map
+     else:
+         cmd = ["samtools", "view", "-F", "4", str(bam_file_path)]
+         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+         assert proc.stdout is not None
+         for line in proc.stdout:
+             if not line.strip() or line.startswith("@"):
+                 continue
+             fields = line.rstrip("\n").split("\t")
+             if len(fields) < 11:
+                 continue
+             read_name = fields[0]
+             flag = int(fields[1])
+             cigar = fields[5]
+             tag_map: Dict[str, object] = {}
+             if include_cigar:
+                 tag_map["CIGAR"] = cigar
+             if include_flags:
+                 tag_map["FLAGS"] = _decode_flags(flag)
+             if tag_names_list:
+                 raw_tags = fields[11:]
+                 parsed_tags: Dict[str, str] = {}
+                 for raw_tag in raw_tags:
+                     parts = raw_tag.split(":", 2)
+                     if len(parts) == 3:
+                         tag_name, _tag_type, value = parts
+                         parsed_tags[tag_name.upper()] = value
+                 for tag in tag_names_list:
+                     tag_map[tag] = parsed_tags.get(tag)
+             read_tags[read_name] = tag_map
+         rc = proc.wait()
+         if rc != 0:
+             stderr = proc.stderr.read() if proc.stderr else ""
+             raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+
+     return read_tags
+
+
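A hedged usage sketch for the new extract_read_tags_from_bam helper (the BAM path is a placeholder; NM and MD are standard SAM tags). One backend difference visible in the code above: the pysam path returns typed tag values via read.get_tag, while the samtools CLI fallback returns the raw string from the tag column.

# Placeholder path; the function is defined in this module (smftools/informatics/bam_functions.py).
tags = extract_read_tags_from_bam(
    "aligned_sorted.bam",
    tag_names=["NM", "MD"],
    include_flags=True,
    include_cigar=True,
)
for read_name, tag_map in list(tags.items())[:3]:
    print(read_name, tag_map["CIGAR"], tag_map["FLAGS"], tag_map["NM"], tag_map["MD"])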
+ def find_secondary_supplementary_read_names(
+     bam_file_path: str | Path,
+     read_names: Iterable[str],
+     samtools_backend: str | None = "auto",
+ ) -> tuple[set[str], set[str]]:
+     """Find read names with secondary or supplementary alignments in a BAM.
+
+     Args:
+         bam_file_path: Path to the BAM file to scan.
+         read_names: Iterable of read names to check.
+         samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
+     Returns:
+         Tuple of (secondary_read_names, supplementary_read_names).
+     """
+     target_names = set(read_names)
+     if not target_names:
+         return set(), set()
+
+     secondary_reads: set[str] = set()
+     supplementary_reads: set[str] = set()
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+             for read in bam_file.fetch(until_eof=True):
+                 if not read.query_name or read.query_name not in target_names:
+                     continue
+                 if read.is_secondary:
+                     secondary_reads.add(read.query_name)
+                 if read.is_supplementary:
+                     supplementary_reads.add(read.query_name)
+     else:
+
+         def _collect(flag: int) -> set[str]:
+             cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
+             proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+             assert proc.stdout is not None
+             hits: set[str] = set()
+             for line in proc.stdout:
+                 if not line.strip() or line.startswith("@"):
+                     continue
+                 read_name = line.split("\t", 1)[0]
+                 if read_name in target_names:
+                     hits.add(read_name)
+             rc = proc.wait()
+             if rc != 0:
+                 stderr = proc.stderr.read() if proc.stderr else ""
+                 raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+             return hits
+
+         secondary_reads = _collect(0x100)
+         supplementary_reads = _collect(0x800)
+
+     return secondary_reads, supplementary_reads
+
+
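Usage sketch for find_secondary_supplementary_read_names (path and read names are placeholders). Only names drawn from the supplied read_names set are reported, so reads outside that set are ignored even if they have secondary or supplementary alignments.

# Placeholder inputs for illustration.
candidate_reads = {"read_0001", "read_0002", "read_0003"}
secondary, supplementary = find_secondary_supplementary_read_names(
    "aligned_sorted.bam",
    candidate_reads,
)
multi_mapped = secondary | supplementary
print(f"{len(multi_mapped)} of {len(candidate_reads)} reads have extra alignments")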
+ def extract_secondary_supplementary_alignment_spans(
+     bam_file_path: str | Path,
+     read_names: Iterable[str],
+     samtools_backend: str | None = "auto",
+ ) -> tuple[
+     dict[str, list[tuple[float, float, float]]], dict[str, list[tuple[float, float, float]]]
+ ]:
+     """Extract reference/read span data for secondary/supplementary alignments.
+
+     Args:
+         bam_file_path: Path to the BAM file to scan.
+         read_names: Iterable of read names to check.
+         samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
+     Returns:
+         Tuple of (secondary_spans, supplementary_spans) where each mapping contains
+         read names mapped to lists of (reference_start, reference_end, read_span).
+     """
+     target_names = set(read_names)
+     if not target_names:
+         return {}, {}
+
+     secondary_spans: dict[str, list[tuple[float, float, float]]] = {}
+     supplementary_spans: dict[str, list[tuple[float, float, float]]] = {}
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+             for read in bam_file.fetch(until_eof=True):
+                 if not read.query_name or read.query_name not in target_names:
+                     continue
+                 if not (read.is_secondary or read.is_supplementary):
+                     continue
+                 reference_start = (
+                     float(read.reference_start)
+                     if read.reference_start is not None
+                     else float("nan")
+                 )
+                 reference_end = (
+                     float(read.reference_end) if read.reference_end is not None else float("nan")
+                 )
+                 read_span = (
+                     float(read.query_alignment_length)
+                     if read.query_alignment_length is not None
+                     else float("nan")
+                 )
+                 if read.is_secondary:
+                     secondary_spans.setdefault(read.query_name, []).append(
+                         (reference_start, reference_end, read_span)
+                     )
+                 if read.is_supplementary:
+                     supplementary_spans.setdefault(read.query_name, []).append(
+                         (reference_start, reference_end, read_span)
+                     )
+         return secondary_spans, supplementary_spans
+
+     def _mapped_length_from_cigar(cigar: str) -> int:
+         mapped = 0
+         for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+             length = int(length_str)
+             if op in {"M", "=", "X"}:
+                 mapped += length
+         return mapped
+
+     def _reference_span_from_cigar(cigar: str) -> int:
+         reference_span = 0
+         for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+             length = int(length_str)
+             if op in {"M", "D", "N", "=", "X"}:
+                 reference_span += length
+         return reference_span
+
+     def _collect(flag: int) -> dict[str, list[tuple[float, float, float]]]:
+         cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
+         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+         assert proc.stdout is not None
+         spans: dict[str, list[tuple[float, float, float]]] = {}
+         for line in proc.stdout:
+             if not line.strip() or line.startswith("@"):
+                 continue
+             fields = line.rstrip("\n").split("\t")
+             if len(fields) < 11:
+                 continue
+             read_name = fields[0]
+             if read_name not in target_names:
+                 continue
+             cigar = fields[5]
+             reference_start = float(int(fields[3]) - 1)
+             if cigar != "*":
+                 reference_end = float(reference_start + _reference_span_from_cigar(cigar))
+                 read_span = float(_mapped_length_from_cigar(cigar))
+             else:
+                 reference_end = float("nan")
+                 read_span = float("nan")
+             spans.setdefault(read_name, []).append((reference_start, reference_end, read_span))
+         rc = proc.wait()
+         if rc != 0:
+             stderr = proc.stderr.read() if proc.stderr else ""
+             raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+         return spans
+
+     secondary_spans = _collect(0x100)
+     supplementary_spans = _collect(0x800)
+
+     return secondary_spans, supplementary_spans
+
+
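Usage sketch for extract_secondary_supplementary_alignment_spans (placeholder inputs). Each value is a list because one read can carry several secondary or supplementary alignments, and coordinates that cannot be computed (for example a "*" CIGAR in the CLI path) come back as NaN.

import math

secondary_spans, supplementary_spans = extract_secondary_supplementary_alignment_spans(
    "aligned_sorted.bam",        # placeholder path
    {"read_0001", "read_0002"},  # placeholder read names
)
for read_name, spans in supplementary_spans.items():
    for reference_start, reference_end, read_span in spans:
        if math.isnan(reference_end):
            continue  # span could not be computed for this alignment
        print(read_name, int(reference_start), int(reference_end), int(read_span))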
  def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
      """
      Takes a BAM and writes out a txt file containing read names from the BAM