smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,26 +3,131 @@ from __future__ import annotations
3
3
  import glob
4
4
  import os
5
5
  import re
6
+ import shutil
6
7
  import subprocess
7
8
  import time
8
9
  from collections import Counter, defaultdict, deque
9
10
  from concurrent.futures import ThreadPoolExecutor, as_completed
10
11
  from itertools import zip_longest
11
12
  from pathlib import Path
12
- from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
13
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
13
14
 
14
15
  import numpy as np
15
- import pysam
16
16
  from tqdm import tqdm
17
17
 
18
+ from smftools.constants import MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
18
19
  from smftools.logging_utils import get_logger
20
+ from smftools.optional_imports import require
19
21
 
20
22
  from ..readwrite import date_string, time_string
21
23
 
24
+ if TYPE_CHECKING:
25
+ import pysam as pysam_types
26
+
27
+ try:
28
+ import pysam
29
+ except Exception:
30
+ pysam = None # type: ignore
31
+
22
32
  logger = get_logger(__name__)
23
33
 
24
34
  _PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
25
35
  _EMPTY_RE = re.compile(r"^\s*$")
36
+ _BAM_FLAG_BITS: Tuple[Tuple[int, str], ...] = (
37
+ (0x1, "paired"),
38
+ (0x2, "proper_pair"),
39
+ (0x4, "unmapped"),
40
+ (0x8, "mate_unmapped"),
41
+ (0x10, "reverse"),
42
+ (0x20, "mate_reverse"),
43
+ (0x40, "read1"),
44
+ (0x80, "read2"),
45
+ (0x100, "secondary"),
46
+ (0x200, "qc_fail"),
47
+ (0x400, "duplicate"),
48
+ (0x800, "supplementary"),
49
+ )
50
+
51
+
52
+ def _require_pysam() -> "pysam_types":
53
+ """Return the pysam module or raise if unavailable."""
54
+ if pysam is not None:
55
+ return pysam
56
+ return require("pysam", extra="pysam", purpose="samtools-compatible Python backend")
57
+
58
+
59
+ def _resolve_samtools_backend(backend: str | None) -> str:
60
+ """Resolve backend choice for samtools-compatible operations.
61
+
62
+ Args:
63
+ backend: One of {"auto", "python", "cli"} (case-insensitive).
64
+
65
+ Returns:
66
+ Resolved backend string ("python" or "cli").
67
+ """
68
+ choice = (backend or "auto").strip().lower()
69
+ if choice not in {"auto", "python", "cli"}:
70
+ raise ValueError("samtools_backend must be one of: auto, python, cli")
71
+
72
+ have_pysam = pysam is not None
73
+ have_samtools = shutil.which("samtools") is not None
74
+
75
+ if choice == "python":
76
+ if not have_pysam:
77
+ raise RuntimeError("samtools_backend=python requires pysam to be installed.")
78
+ return "python"
79
+ if choice == "cli":
80
+ if not have_samtools:
81
+ raise RuntimeError("samtools_backend=cli requires samtools in PATH.")
82
+ return "cli"
83
+
84
+ if have_samtools:
85
+ return "cli"
86
+ if have_pysam:
87
+ return "python"
88
+ raise RuntimeError("Neither pysam nor samtools is available in PATH.")
89
+
90
+
91
+ def _has_bam_index(bam_path: Path) -> bool:
92
+ """Return True if the BAM index exists alongside the BAM."""
93
+ return (
94
+ bam_path.with_suffix(bam_path.suffix + ".bai").exists()
95
+ or Path(str(bam_path) + ".bai").exists()
96
+ )
97
+
98
+
99
+ def _ensure_bam_index(bam_path: Path, backend: str) -> None:
100
+ """Ensure a BAM index exists, creating one if needed."""
101
+ if _has_bam_index(bam_path):
102
+ return
103
+ if backend == "python":
104
+ _index_bam_with_pysam(bam_path)
105
+ else:
106
+ _index_bam_with_samtools(bam_path)
107
+
108
+
109
+ def _parse_idxstats_output(output: str) -> Tuple[int, int, Dict[str, Tuple[int, float]]]:
110
+ """Parse samtools idxstats output into counts and proportions."""
111
+ aligned_reads_count = 0
112
+ unaligned_reads_count = 0
113
+ record_counts: Dict[str, int] = {}
114
+ for line in output.splitlines():
115
+ if not line.strip():
116
+ continue
117
+ ref, _length, mapped, unmapped = line.split("\t")[:4]
118
+ if ref == "*":
119
+ unaligned_reads_count += int(unmapped)
120
+ continue
121
+ mapped_count = int(mapped)
122
+ aligned_reads_count += mapped_count
123
+ record_counts[ref] = mapped_count
124
+
125
+ proportions: Dict[str, Tuple[int, float]] = {}
126
+ for ref, count in record_counts.items():
127
+ proportion = count / aligned_reads_count if aligned_reads_count else 0.0
128
+ proportions[ref] = (count, proportion)
129
+
130
+ return aligned_reads_count, unaligned_reads_count, proportions
26
131
 
27
132
 
28
133
  def _stream_dorado_logs(stderr_iter) -> None:
@@ -60,8 +165,9 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
60
165
 
61
166
  logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
62
167
 
168
+ pysam_mod = _require_pysam()
63
169
  with (
64
- pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
170
+ pysam_mod.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
65
171
  open(fastq_path, "w", encoding="utf-8") as fq,
66
172
  ):
67
173
  for r in bam.fetch(until_eof=True):
@@ -103,7 +209,8 @@ def _sort_bam_with_pysam(
103
209
  if threads:
104
210
  args += ["-@", str(threads)]
105
211
  args += ["-o", out_bam, in_bam]
106
- pysam.sort(*args)
212
+ pysam_mod = _require_pysam()
213
+ pysam_mod.sort(*args)
107
214
 
108
215
 
109
216
  def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
@@ -115,16 +222,60 @@ def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = N
115
222
  """
116
223
  bam_path = str(bam_path)
117
224
  logger.debug(f"Indexing BAM using _index_bam_with_pysam")
225
+ pysam_mod = _require_pysam()
118
226
  # pysam.index supports samtools-style args
119
227
  if threads:
120
- pysam.index("-@", str(threads), bam_path)
228
+ pysam_mod.index("-@", str(threads), bam_path)
121
229
  else:
122
- pysam.index(bam_path)
230
+ pysam_mod.index(bam_path)
231
+
232
+
233
+ def _bam_to_fastq_with_samtools(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
234
+ """Convert BAM to FASTQ using samtools."""
235
+ if not shutil.which("samtools"):
236
+ raise RuntimeError("samtools is required but not available in PATH.")
237
+ cmd = ["samtools", "fastq", str(bam_path)]
238
+ logger.debug("Converting BAM to FASTQ using samtools: %s", " ".join(cmd))
239
+ with open(fastq_path, "w", encoding="utf-8") as fq:
240
+ cp = subprocess.run(cmd, stdout=fq, stderr=subprocess.PIPE, text=True)
241
+ if cp.returncode != 0:
242
+ raise RuntimeError(f"samtools fastq failed (exit {cp.returncode}):\n{cp.stderr}")
243
+
244
+
245
+ def _sort_bam_with_samtools(
246
+ in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
247
+ ) -> None:
248
+ """Sort a BAM file using samtools."""
249
+ if not shutil.which("samtools"):
250
+ raise RuntimeError("samtools is required but not available in PATH.")
251
+ cmd = ["samtools", "sort", "-o", str(out_bam)]
252
+ if threads:
253
+ cmd += ["-@", str(threads)]
254
+ cmd.append(str(in_bam))
255
+ logger.debug("Sorting BAM using samtools: %s", " ".join(cmd))
256
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
257
+ if cp.returncode != 0:
258
+ raise RuntimeError(f"samtools sort failed (exit {cp.returncode}):\n{cp.stderr}")
259
+
260
+
261
+ def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
262
+ """Index a BAM file using samtools."""
263
+ if not shutil.which("samtools"):
264
+ raise RuntimeError("samtools is required but not available in PATH.")
265
+ cmd = ["samtools", "index"]
266
+ if threads:
267
+ cmd += ["-@", str(threads)]
268
+ cmd.append(str(bam_path))
269
+ logger.debug("Indexing BAM using samtools: %s", " ".join(cmd))
270
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
271
+ if cp.returncode != 0:
272
+ raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
123
273
 
124
274
 
125
275
  def align_and_sort_BAM(
126
276
  fasta,
127
277
  input,
278
+ output,
128
279
  cfg,
129
280
  ):
130
281
  """
@@ -144,10 +295,9 @@ def align_and_sort_BAM(
144
295
  input_suffix = input.suffix
145
296
  input_as_fastq = input.with_name(input.stem + ".fastq")
146
297
 
147
- output_path_minus_suffix = cfg.output_directory / input.stem
148
-
149
- aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
298
+ aligned_BAM = output.parent / output.stem
150
299
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
300
+
151
301
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
152
302
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
153
303
 
@@ -156,10 +306,15 @@ def align_and_sort_BAM(
156
306
  else:
157
307
  threads = None
158
308
 
309
+ samtools_backend = _resolve_samtools_backend(getattr(cfg, "samtools_backend", "auto"))
310
+
159
311
  if cfg.aligner == "minimap2":
160
312
  if not cfg.align_from_bam:
161
313
  logger.debug(f"Converting BAM to FASTQ: {input}")
162
- _bam_to_fastq_with_pysam(input, input_as_fastq)
314
+ if samtools_backend == "python":
315
+ _bam_to_fastq_with_pysam(input, input_as_fastq)
316
+ else:
317
+ _bam_to_fastq_with_samtools(input, input_as_fastq)
163
318
  logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
164
319
  mm_input = input_as_fastq
165
320
  else:
@@ -220,12 +375,18 @@ def align_and_sort_BAM(
220
375
  logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
221
376
  return
222
377
 
223
- # --- Sort & Index with pysam ---
378
+ # --- Sort & Index ---
224
379
  logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
225
- _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
380
+ if samtools_backend == "python":
381
+ _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
382
+ else:
383
+ _sort_bam_with_samtools(aligned_output, aligned_sorted_output, threads=threads)
226
384
 
227
385
  logger.debug(f"Indexing: {aligned_sorted_output}")
228
- _index_bam_with_pysam(aligned_sorted_output, threads=threads)
386
+ if samtools_backend == "python":
387
+ _index_bam_with_pysam(aligned_sorted_output, threads=threads)
388
+ else:
389
+ _index_bam_with_samtools(aligned_sorted_output, threads=threads)
229
390
 
230
391
 
231
392
  def bam_qc(
@@ -236,25 +397,20 @@ def bam_qc(
236
397
  stats: bool = True,
237
398
  flagstats: bool = True,
238
399
  idxstats: bool = True,
400
+ samtools_backend: str | None = "auto",
239
401
  ) -> None:
240
402
  """
241
403
  QC for BAM/CRAMs: stats, flagstat, idxstats.
242
404
  Prefers pysam; falls back to `samtools` if needed.
243
405
  Runs BAMs in parallel (up to `threads`, default serial).
244
406
  """
245
- import shutil
246
407
  import subprocess
247
408
 
248
409
  logger.debug("Performing BAM QC using bam_qc")
249
410
 
250
- # Try to import pysam once
251
- try:
252
- import pysam # type: ignore
253
-
254
- have_pysam = True
255
- except Exception:
256
- pysam = None # type: ignore
257
- have_pysam = False
411
+ backend_choice = _resolve_samtools_backend(samtools_backend)
412
+ have_pysam = backend_choice == "python"
413
+ pysam_mod = _require_pysam() if have_pysam else None
258
414
 
259
415
  bam_qc_dir = Path(bam_qc_dir)
260
416
  bam_qc_dir.mkdir(parents=True, exist_ok=True)
@@ -275,11 +431,9 @@ def bam_qc(
275
431
  if _has_index(p):
276
432
  return
277
433
  if have_pysam:
278
- assert pysam is not None
279
- pysam.index(str(p)) # supports BAM & CRAM
434
+ assert pysam_mod is not None
435
+ pysam_mod.index(str(p)) # supports BAM & CRAM
280
436
  else:
281
- if not shutil.which("samtools"):
282
- raise RuntimeError("Neither pysam nor samtools is available in PATH.")
283
437
  cmd = ["samtools", "index", str(p)]
284
438
  # capture text so errors are readable; raise on failure
285
439
  cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
@@ -300,7 +454,7 @@ def bam_qc(
300
454
  line = line.rstrip()
301
455
  if line:
302
456
  last_err.append(line)
303
- logger.info("[%s][%s] %s", tag, bam.name, line)
457
+ logger.debug("[%s][%s] %s", tag, bam.name, line)
304
458
  rc = proc.wait()
305
459
 
306
460
  if rc != 0:
@@ -332,16 +486,13 @@ def bam_qc(
332
486
  # Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
333
487
  logger.warning("Indexing failed for %s: %s", bam, e)
334
488
 
335
- if not have_pysam:
336
- import shutil
337
-
338
- if not shutil.which("samtools"):
339
- raise RuntimeError("Neither pysam nor samtools is available in PATH.")
340
-
341
489
  # --- stats ---
342
490
  if stats:
343
- if have_pysam and pysam is not None and hasattr(pysam, "stats"):
344
- txt = pysam.stats(str(bam))
491
+ if have_pysam:
492
+ assert pysam_mod is not None
493
+ if not hasattr(pysam_mod, "stats"):
494
+ raise RuntimeError("pysam.stats is unavailable in this pysam build.")
495
+ txt = pysam_mod.stats(str(bam))
345
496
  out_stats.write_text(txt)
346
497
  results.append(("stats(pysam)", 0))
347
498
  else:
@@ -351,8 +502,11 @@ def bam_qc(
351
502
 
352
503
  # --- flagstat ---
353
504
  if flagstats:
354
- if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
355
- txt = pysam.flagstat(str(bam))
505
+ if have_pysam:
506
+ assert pysam_mod is not None
507
+ if not hasattr(pysam_mod, "flagstat"):
508
+ raise RuntimeError("pysam.flagstat is unavailable in this pysam build.")
509
+ txt = pysam_mod.flagstat(str(bam))
356
510
  out_flag.write_text(txt)
357
511
  results.append(("flagstat(pysam)", 0))
358
512
  else:
@@ -362,8 +516,11 @@ def bam_qc(
362
516
 
363
517
  # --- idxstats ---
364
518
  if idxstats:
365
- if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
366
- txt = pysam.idxstats(str(bam))
519
+ if have_pysam:
520
+ assert pysam_mod is not None
521
+ if not hasattr(pysam_mod, "idxstats"):
522
+ raise RuntimeError("pysam.idxstats is unavailable in this pysam build.")
523
+ txt = pysam_mod.idxstats(str(bam))
367
524
  out_idx.write_text(txt)
368
525
  results.append(("idxstats(pysam)", 0))
369
526
  else:
@@ -400,6 +557,8 @@ def concatenate_fastqs_to_bam(
400
557
  rg_sample_field: Optional[str] = None,
401
558
  progress: bool = True,
402
559
  auto_pair: bool = True,
560
+ gzip_suffixes: Tuple[str, ...] = (".gz", ".gzip"),
561
+ samtools_backend: str | None = "auto",
403
562
  ) -> Dict[str, Any]:
404
563
  """
405
564
  Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
@@ -422,6 +581,10 @@ def concatenate_fastqs_to_bam(
422
581
  Show tqdm progress bars.
423
582
  auto_pair : bool
424
583
  Auto-pair R1/R2 based on filename patterns if given a flat list.
584
+ gzip_suffixes : tuple[str, ...]
585
+ Suffixes treated as gzip-compressed FASTQ files.
586
+ samtools_backend : str | None
587
+ Backend selection for samtools-compatible operations (auto|python|cli).
425
588
 
426
589
  Returns
427
590
  -------
@@ -436,9 +599,10 @@ def concatenate_fastqs_to_bam(
436
599
  """
437
600
  name = p.name
438
601
  lowers = name.lower()
602
+ gzip_exts = tuple(s.lower() for s in gzip_suffixes)
439
603
  for ext in (
440
- ".fastq.gz",
441
- ".fq.gz",
604
+ *(f".fastq{suf}" for suf in gzip_exts),
605
+ *(f".fq{suf}" for suf in gzip_exts),
442
606
  ".fastq.bz2",
443
607
  ".fq.bz2",
444
608
  ".fastq.xz",
@@ -525,10 +689,50 @@ def concatenate_fastqs_to_bam(
525
689
  Pysam Fastx records.
526
690
  """
527
691
  # pysam.FastxFile handles compressed extensions transparently
528
- with pysam.FastxFile(str(p)) as fx:
692
+ pysam_mod = _require_pysam()
693
+ with pysam_mod.FastxFile(str(p)) as fx:
529
694
  for rec in fx:
530
695
  yield rec # rec.name, rec.sequence, rec.quality
531
696
 
697
+ def _fastq_iter_plain(p: Path) -> Iterable[Tuple[str, str, str]]:
698
+ """Yield FASTQ records from plain-text parsing.
699
+
700
+ Args:
701
+ p: FASTQ path.
702
+
703
+ Yields:
704
+ Tuple of (name, sequence, quality).
705
+ """
706
+ import bz2
707
+ import gzip
708
+ import lzma
709
+
710
+ lowers = p.name.lower()
711
+ if any(lowers.endswith(suf) for suf in (s.lower() for s in gzip_suffixes)):
712
+ handle = gzip.open(p, "rt", encoding="utf-8")
713
+ elif lowers.endswith(".bz2"):
714
+ handle = bz2.open(p, "rt", encoding="utf-8")
715
+ elif lowers.endswith(".xz"):
716
+ handle = lzma.open(p, "rt", encoding="utf-8")
717
+ else:
718
+ handle = p.open("r", encoding="utf-8")
719
+
720
+ with handle as fh:
721
+ while True:
722
+ header = fh.readline()
723
+ if not header:
724
+ break
725
+ seq = fh.readline()
726
+ fh.readline()
727
+ qual = fh.readline()
728
+ if not qual:
729
+ break
730
+ name = header.strip()
731
+ if name.startswith("@"):
732
+ name = name[1:]
733
+ name = name.split()[0]
734
+ yield name, seq.strip(), qual.strip()
735
+
532
736
  def _make_unaligned_segment(
533
737
  name: str,
534
738
  seq: str,
@@ -550,11 +754,12 @@ def concatenate_fastqs_to_bam(
550
754
  Returns:
551
755
  Unaligned pysam.AlignedSegment.
552
756
  """
553
- a = pysam.AlignedSegment()
757
+ pysam_mod = _require_pysam()
758
+ a = pysam_mod.AlignedSegment()
554
759
  a.query_name = name
555
760
  a.query_sequence = seq
556
761
  if qual is not None:
557
- a.query_qualities = pysam.qualitystring_to_array(qual)
762
+ a.query_qualities = pysam_mod.qualitystring_to_array(qual)
558
763
  a.is_unmapped = True
559
764
  a.is_paired = read1 or read2
560
765
  a.is_read1 = read1
@@ -570,6 +775,48 @@ def concatenate_fastqs_to_bam(
570
775
  a.set_tag("RG", str(bc), value_type="Z")
571
776
  return a
572
777
 
778
+ def _write_sam_line(
779
+ handle,
780
+ name: str,
781
+ seq: str,
782
+ qual: str,
783
+ bc: str,
784
+ *,
785
+ read1: bool,
786
+ read2: bool,
787
+ add_read_group: bool,
788
+ ) -> None:
789
+ """Write a single unaligned SAM record to a text stream."""
790
+ if read1:
791
+ flag = 77
792
+ elif read2:
793
+ flag = 141
794
+ else:
795
+ flag = 4
796
+ tags = [f"{barcode_tag}:Z:{bc}"]
797
+ if add_read_group:
798
+ tags.append(f"RG:Z:{bc}")
799
+ tag_str = "\t".join(tags)
800
+ if not qual:
801
+ qual = "*"
802
+ line = "\t".join(
803
+ [
804
+ name,
805
+ str(flag),
806
+ "*",
807
+ "0",
808
+ "0",
809
+ "*",
810
+ "*",
811
+ "0",
812
+ "0",
813
+ seq,
814
+ qual,
815
+ tag_str,
816
+ ]
817
+ )
818
+ handle.write(f"{line}\n")
819
+
573
820
  # ---------- normalize inputs to Path ----------
574
821
  def _to_path_pair(x) -> Tuple[Path, Path]:
575
822
  """Convert a tuple of path-like objects to Path instances."""
@@ -630,7 +877,29 @@ def concatenate_fastqs_to_bam(
630
877
  singletons_written = 0
631
878
 
632
879
  # ---------- write BAM ----------
633
- with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
880
+ backend_choice = _resolve_samtools_backend(samtools_backend)
881
+ if backend_choice == "python":
882
+ pysam_mod = _require_pysam()
883
+ bam_out_ctx = pysam_mod.AlignmentFile(str(output_bam), "wb", header=header)
884
+ else:
885
+ cmd = ["samtools", "view", "-b", "-o", str(output_bam), "-"]
886
+ logger.debug("Writing BAM using samtools: %s", " ".join(cmd))
887
+ bam_out_ctx = subprocess.Popen(
888
+ cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
889
+ )
890
+ assert bam_out_ctx.stdin is not None
891
+ header_lines = ["@HD\tVN:1.6\tSO:unknown"]
892
+ if add_read_group:
893
+ for bc in barcodes_in_order:
894
+ rg_fields = [f"ID:{bc}"]
895
+ if rg_sample_field:
896
+ rg_fields.append(f"SM:{rg_sample_field}")
897
+ rg_body = "\t".join(rg_fields)
898
+ header_lines.append(f"@RG\t{rg_body}")
899
+ header_lines.append("@PG\tID:concat-fastq\tPN:concatenate_fastqs_to_bam\tVN:1")
900
+ bam_out_ctx.stdin.write("\n".join(header_lines) + "\n")
901
+
902
+ try:
634
903
  # Paired
635
904
  it_pairs = explicit_pairs
636
905
  if progress and it_pairs:
@@ -640,8 +909,12 @@ def concatenate_fastqs_to_bam(
640
909
  raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
641
910
  bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
642
911
 
643
- it1 = _fastq_iter(r1_path)
644
- it2 = _fastq_iter(r2_path)
912
+ if backend_choice == "python":
913
+ it1 = _fastq_iter(r1_path)
914
+ it2 = _fastq_iter(r2_path)
915
+ else:
916
+ it1 = _fastq_iter_plain(r1_path)
917
+ it2 = _fastq_iter_plain(r2_path)
645
918
 
646
919
  for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
647
920
 
@@ -652,24 +925,67 @@ def concatenate_fastqs_to_bam(
652
925
  return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
653
926
 
654
927
  name = (
655
- _clean(getattr(rec1, "name", None))
656
- or _clean(getattr(rec2, "name", None))
657
- or getattr(rec1, "name", None)
658
- or getattr(rec2, "name", None)
928
+ _clean(getattr(rec1, "name", None) if backend_choice == "python" else rec1[0])
929
+ if rec1 is not None
930
+ else None
659
931
  )
932
+ if name is None:
933
+ name = (
934
+ _clean(
935
+ getattr(rec2, "name", None) if backend_choice == "python" else rec2[0]
936
+ )
937
+ if rec2 is not None
938
+ else None
939
+ )
940
+ if name is None:
941
+ name = (
942
+ getattr(rec1, "name", None)
943
+ if backend_choice == "python" and rec1 is not None
944
+ else (rec1[0] if rec1 is not None else None)
945
+ )
946
+ if name is None:
947
+ name = (
948
+ getattr(rec2, "name", None)
949
+ if backend_choice == "python" and rec2 is not None
950
+ else (rec2[0] if rec2 is not None else None)
951
+ )
660
952
 
661
953
  if rec1 is not None:
662
- a1 = _make_unaligned_segment(
663
- name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
664
- )
665
- bam_out.write(a1)
954
+ if backend_choice == "python":
955
+ a1 = _make_unaligned_segment(
956
+ name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
957
+ )
958
+ bam_out_ctx.write(a1)
959
+ else:
960
+ _write_sam_line(
961
+ bam_out_ctx.stdin,
962
+ name,
963
+ rec1[1],
964
+ rec1[2],
965
+ bc,
966
+ read1=True,
967
+ read2=False,
968
+ add_read_group=add_read_group,
969
+ )
666
970
  per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
667
971
  total_written += 1
668
972
  if rec2 is not None:
669
- a2 = _make_unaligned_segment(
670
- name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
671
- )
672
- bam_out.write(a2)
973
+ if backend_choice == "python":
974
+ a2 = _make_unaligned_segment(
975
+ name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
976
+ )
977
+ bam_out_ctx.write(a2)
978
+ else:
979
+ _write_sam_line(
980
+ bam_out_ctx.stdin,
981
+ name,
982
+ rec2[1],
983
+ rec2[2],
984
+ bc,
985
+ read1=False,
986
+ read2=True,
987
+ add_read_group=add_read_group,
988
+ )
673
989
  per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
674
990
  total_written += 1
675
991
 
@@ -689,14 +1005,40 @@ def concatenate_fastqs_to_bam(
689
1005
  if not pth.exists():
690
1006
  raise FileNotFoundError(pth)
691
1007
  bc = per_path_barcode.get(pth, "barcode")
692
- for rec in _fastq_iter(pth):
693
- a = _make_unaligned_segment(
694
- rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
695
- )
696
- bam_out.write(a)
1008
+ if backend_choice == "python":
1009
+ iterator = _fastq_iter(pth)
1010
+ else:
1011
+ iterator = _fastq_iter_plain(pth)
1012
+ for rec in iterator:
1013
+ if backend_choice == "python":
1014
+ a = _make_unaligned_segment(
1015
+ rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
1016
+ )
1017
+ bam_out_ctx.write(a)
1018
+ else:
1019
+ _write_sam_line(
1020
+ bam_out_ctx.stdin,
1021
+ rec[0],
1022
+ rec[1],
1023
+ rec[2],
1024
+ bc,
1025
+ read1=False,
1026
+ read2=False,
1027
+ add_read_group=add_read_group,
1028
+ )
697
1029
  per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
698
1030
  total_written += 1
699
1031
  singletons_written += 1
1032
+ finally:
1033
+ if backend_choice == "python":
1034
+ bam_out_ctx.close()
1035
+ else:
1036
+ if bam_out_ctx.stdin is not None:
1037
+ bam_out_ctx.stdin.close()
1038
+ rc = bam_out_ctx.wait()
1039
+ if rc != 0:
1040
+ stderr = bam_out_ctx.stderr.read() if bam_out_ctx.stderr else ""
1041
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
700
1042
 
701
1043
  return {
702
1044
  "total_reads": total_written,
@@ -707,7 +1049,7 @@ def concatenate_fastqs_to_bam(
707
1049
  }
708
1050
 
709
1051
 
710
- def count_aligned_reads(bam_file):
1052
+ def count_aligned_reads(bam_file, samtools_backend: str | None = "auto"):
711
1053
  """
712
1054
  Counts the number of aligned reads in a bam file that map to each reference record.
713
1055
 
@@ -720,30 +1062,42 @@ def count_aligned_reads(bam_file):
720
1062
  record_counts (dict): A dictionary keyed by reference record instance that points toa tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
721
1063
 
722
1064
  """
723
- print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
1065
+ logger.info("Counting aligned reads in BAM > {}".format(bam_file.name))
1066
+ backend_choice = _resolve_samtools_backend(samtools_backend)
724
1067
  aligned_reads_count = 0
725
1068
  unaligned_reads_count = 0
726
- # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
727
- record_counts = defaultdict(int)
728
-
729
- with pysam.AlignmentFile(str(bam_file), "rb") as bam:
730
- total_reads = bam.mapped + bam.unmapped
731
- # Iterate over reads to get the total mapped read counts and the reads that map to each reference
732
- for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
733
- if read.is_unmapped:
734
- unaligned_reads_count += 1
735
- else:
736
- aligned_reads_count += 1
737
- record_counts[read.reference_name] += (
738
- 1 # Automatically increments if key exists, adds if not
739
- )
740
1069
 
741
- # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
742
- for reference in record_counts:
743
- proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
744
- record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
1070
+ if backend_choice == "python":
1071
+ pysam_mod = _require_pysam()
1072
+ record_counts = defaultdict(int)
1073
+ with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
1074
+ total_reads = bam.mapped + bam.unmapped
1075
+ # Iterate over reads to get the total mapped read counts and the reads that map to each reference
1076
+ for read in bam:
1077
+ if read.is_unmapped:
1078
+ unaligned_reads_count += 1
1079
+ else:
1080
+ aligned_reads_count += 1
1081
+ record_counts[read.reference_name] += (
1082
+ 1 # Automatically increments if key exists, adds if not
1083
+ )
1084
+
1085
+ # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
1086
+ for reference in record_counts:
1087
+ proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
1088
+ record_counts[reference] = (
1089
+ record_counts[reference],
1090
+ proportion_mapped_reads_in_record,
1091
+ )
1092
+ return aligned_reads_count, unaligned_reads_count, dict(record_counts)
745
1093
 
746
- return aligned_reads_count, unaligned_reads_count, dict(record_counts)
1094
+ bam_path = Path(bam_file)
1095
+ _ensure_bam_index(bam_path, backend_choice)
1096
+ cmd = ["samtools", "idxstats", str(bam_path)]
1097
+ cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
1098
+ if cp.returncode != 0:
1099
+ raise RuntimeError(f"samtools idxstats failed (exit {cp.returncode}):\n{cp.stderr}")
1100
+ return _parse_idxstats_output(cp.stdout)
747
1101
 
748
1102
 
749
1103
  def demux_and_index_BAM(
@@ -827,13 +1181,20 @@ def demux_and_index_BAM(
827
1181
  return renamed_bams
828
1182
 
829
1183
 
830
- def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
1184
+ def extract_base_identities(
1185
+ bam_file,
1186
+ record,
1187
+ positions,
1188
+ max_reference_length,
1189
+ sequence,
1190
+ samtools_backend: str | None = "auto",
1191
+ ):
831
1192
  """
832
1193
  Efficiently extracts base identities from mapped reads with reference coordinates.
833
1194
 
834
1195
  Parameters:
835
1196
  bam_file (str): Path to the BAM file.
836
- chromosome (str): Name of the reference chromosome.
1197
+ record (str): Name of the reference record.
837
1198
  positions (list): Positions to extract (0-based).
838
1199
  max_reference_length (int): Maximum reference length for padding.
839
1200
  sequence (str): The sequence of the record fasta
@@ -841,6 +1202,11 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
841
1202
  Returns:
842
1203
  dict: Base identities from forward mapped reads.
843
1204
  dict: Base identities from reverse mapped reads.
1205
+ dict: Mismatch counts per read.
1206
+ dict: Mismatch trends per read.
1207
+ dict: Integer-encoded mismatch bases per read.
1208
+ dict: Base quality scores per read aligned to reference positions.
1209
+ dict: Read span masks per read (1 within span, 0 outside).
844
1210
  """
845
1211
  logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
846
1212
  timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
@@ -849,32 +1215,144 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
849
1215
  fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
850
1216
  rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
851
1217
  mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
1218
+ mismatch_base_identities = defaultdict(
1219
+ lambda: np.full(
1220
+ max_reference_length,
1221
+ MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"],
1222
+ dtype=np.int16,
1223
+ )
1224
+ )
1225
+ base_quality_scores = defaultdict(lambda: np.full(max_reference_length, -1, dtype=np.int16))
1226
+ read_span_masks = defaultdict(lambda: np.zeros(max_reference_length, dtype=np.int8))
852
1227
 
853
- # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
854
- with pysam.AlignmentFile(str(bam_file), "rb") as bam:
855
- total_reads = bam.mapped
856
- ref_seq = sequence.upper()
857
- for read in bam.fetch(chromosome):
858
- if not read.is_mapped:
859
- continue # Skip unmapped reads
860
-
861
- read_name = read.query_name
862
- query_sequence = read.query_sequence
863
- base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
1228
+ backend_choice = _resolve_samtools_backend(samtools_backend)
1229
+ ref_seq = sequence.upper()
1230
+ sequence_length = len(sequence)
864
1231
 
865
- # Use get_aligned_pairs directly with positions filtering
866
- aligned_pairs = read.get_aligned_pairs(matches_only=True)
1232
+ def _encode_mismatch_base(base: str) -> int:
1233
+ return MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT.get(
1234
+ base.upper(), MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"]
1235
+ )
867
1236
 
868
- for read_position, reference_position in aligned_pairs:
869
- if reference_position in positions:
1237
+ if backend_choice == "python":
1238
+ logger.debug("Extracting base identities using python")
1239
+ pysam_mod = _require_pysam()
1240
+ # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
1241
+ with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
1242
+ total_reads = bam.mapped
1243
+ for read in bam.fetch(record):
1244
+ if not read.is_mapped:
1245
+ continue # Skip unmapped reads
1246
+
1247
+ read_name = read.query_name
1248
+ query_sequence = read.query_sequence
1249
+ query_qualities = read.query_qualities or []
1250
+ base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
1251
+
1252
+ # Init arrays for each read in each dict
1253
+ mismatch_base_identities[read_name]
1254
+ base_quality_scores[read_name]
1255
+ read_span_masks[read_name]
1256
+
1257
+ if read.reference_start is not None and read.reference_end is not None:
1258
+ span_end = min(read.reference_end, max_reference_length)
1259
+ read_span_masks[read_name][read.reference_start : span_end] = 1
1260
+
1261
+ # Use get_aligned_pairs directly with positions filtering
1262
+ aligned_pairs = read.get_aligned_pairs(matches_only=True)
1263
+
1264
+ for read_position, reference_position in aligned_pairs:
1265
+ if reference_position is None or read_position is None:
1266
+ continue
870
1267
  read_base = query_sequence[read_position]
871
1268
  ref_base = ref_seq[reference_position]
872
-
873
- base_dict[read_name][reference_position] = read_base
874
-
875
- # Track mismatches (excluding Ns)
1269
+ if reference_position in positions:
1270
+ base_dict[read_name][reference_position] = read_base
1271
+ if read_position < len(query_qualities):
1272
+ base_quality_scores[read_name][reference_position] = query_qualities[
1273
+ read_position
1274
+ ]
1275
+
1276
+ # Track mismatches (excluding Ns)
1277
+ if read_base != ref_base and read_base != "N" and ref_base != "N":
1278
+ mismatch_counts_per_read[read_name][ref_base][read_base] += 1
1279
+ mismatch_base_identities[read_name][reference_position] = (
1280
+ _encode_mismatch_base(read_base)
1281
+ )
1282
+ else:
1283
+ bam_path = Path(bam_file)
1284
+ logger.debug("Extracting base identities using samtools")
1285
+ _ensure_bam_index(bam_path, backend_choice)
1286
+
1287
+ def _iter_aligned_pairs(cigar: str, start: int) -> Iterable[Tuple[int, int]]:
1288
+ qpos = 0
1289
+ rpos = start
1290
+ for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
1291
+ length = int(length_str)
1292
+ if op in {"M", "=", "X"}:
1293
+ for _ in range(length):
1294
+ yield qpos, rpos
1295
+ qpos += 1
1296
+ rpos += 1
1297
+ elif op in {"I", "S"}:
1298
+ qpos += length
1299
+ elif op in {"D", "N"}:
1300
+ rpos += length
1301
+ elif op in {"H", "P"}:
1302
+ continue
1303
+
1304
+ def _reference_span_from_cigar(cigar: str) -> int:
1305
+ span = 0
1306
+ for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
1307
+ if op in {"M", "D", "N", "=", "X"}:
1308
+ span += int(length_str)
1309
+ return span
1310
+
1311
+ cmd = ["samtools", "view", "-F", "4", str(bam_path), record]
1312
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
1313
+ assert proc.stdout is not None
1314
+ for line in proc.stdout:
1315
+ if not line.strip() or line.startswith("@"):
1316
+ continue
1317
+ fields = line.rstrip("\n").split("\t")
1318
+ if len(fields) < 11:
1319
+ continue
1320
+ read_name = fields[0]
1321
+ flag = int(fields[1])
1322
+ pos = int(fields[3])
1323
+ cigar = fields[5]
1324
+ query_sequence = fields[9]
1325
+ qual_string = fields[10]
1326
+ if cigar == "*" or query_sequence == "*":
1327
+ continue
1328
+ base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
1329
+ mismatch_base_identities[read_name]
1330
+ base_quality_scores[read_name]
1331
+ read_span_masks[read_name]
1332
+ qualities = (
1333
+ [ord(ch) - 33 for ch in qual_string] if qual_string and qual_string != "*" else []
1334
+ )
1335
+ ref_start = pos - 1
1336
+ ref_end = ref_start + _reference_span_from_cigar(cigar)
1337
+ span_end = min(ref_end, max_reference_length)
1338
+ if ref_start < max_reference_length:
1339
+ read_span_masks[read_name][ref_start:span_end] = 1
1340
+ for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
1341
+ if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
1342
+ continue
1343
+ read_base = query_sequence[read_pos]
1344
+ ref_base = ref_seq[ref_pos]
1345
+ if ref_pos in positions:
1346
+ base_dict[read_name][ref_pos] = read_base
1347
+ if read_pos < len(qualities):
1348
+ base_quality_scores[read_name][ref_pos] = qualities[read_pos]
876
1349
  if read_base != ref_base and read_base != "N" and ref_base != "N":
877
1350
  mismatch_counts_per_read[read_name][ref_base][read_base] += 1
1351
+ mismatch_base_identities[read_name][ref_pos] = _encode_mismatch_base(read_base)
1352
+ rc = proc.wait()
1353
+ if rc != 0:
1354
+ stderr = proc.stderr.read() if proc.stderr else ""
1355
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
878
1356
 
879
1357
  # Determine C→T vs G→A dominance per read
880
1358
  mismatch_trend_per_read = {}
@@ -891,54 +1369,419 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
891
1369
  else:
892
1370
  mismatch_trend_per_read[read_name] = "none"
893
1371
 
1372
+ if sequence_length < max_reference_length:
1373
+ padding_value = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"]
1374
+ for mismatch_values in mismatch_base_identities.values():
1375
+ mismatch_values[sequence_length:] = padding_value
1376
+
894
1377
  return (
895
1378
  dict(fwd_base_identities),
896
1379
  dict(rev_base_identities),
897
1380
  dict(mismatch_counts_per_read),
898
1381
  mismatch_trend_per_read,
1382
+ dict(mismatch_base_identities),
1383
+ dict(base_quality_scores),
1384
+ dict(read_span_masks),
899
1385
  )
900
1386
 
901
1387
 
902
- def extract_read_features_from_bam(bam_file_path):
903
- """
904
- Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
905
- Params:
906
- bam_file_path (str):
1388
+ def extract_read_features_from_bam(
1389
+ bam_file_path: str | Path, samtools_backend: str | None = "auto"
1390
+ ) -> Dict[str, List[float]]:
1391
+ """Extract read metrics from a BAM file.
1392
+
1393
+ Args:
1394
+ bam_file_path: Path to the BAM file.
1395
+ samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
1396
+
907
1397
  Returns:
908
- read_metrics (dict)
1398
+ Mapping of read name to [read_length, read_median_qscore, reference_length,
1399
+ mapped_length, mapping_quality, reference_start, reference_end].
909
1400
  """
910
- # Open the BAM file
911
1401
  logger.debug(
912
- f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
1402
+ "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
1403
+ bam_file_path,
913
1404
  )
914
- with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
915
- read_metrics = {}
916
- reference_lengths = bam_file.lengths # List of lengths for each reference (chromosome)
917
- for read in bam_file:
918
- # Skip unmapped reads
919
- if read.is_unmapped:
1405
+ backend_choice = _resolve_samtools_backend(samtools_backend)
1406
+ read_metrics: Dict[str, List[float]] = {}
1407
+
1408
+ if backend_choice == "python":
1409
+ pysam_mod = _require_pysam()
1410
+ with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
1411
+ reference_lengths = dict(zip(bam_file.references, bam_file.lengths))
1412
+ for read in bam_file:
1413
+ if read.is_unmapped:
1414
+ continue
1415
+ read_quality = read.query_qualities
1416
+ if read_quality is None:
1417
+ median_read_quality = float("nan")
1418
+ else:
1419
+ median_read_quality = float(np.median(read_quality))
1420
+ reference_length = reference_lengths.get(read.reference_name, float("nan"))
1421
+ mapped_length = sum(end - start for start, end in read.get_blocks())
1422
+ mapping_quality = float(read.mapping_quality)
1423
+ reference_start = float(read.reference_start)
1424
+ reference_end = float(read.reference_end)
1425
+ read_metrics[read.query_name] = [
1426
+ float(read.query_length),
1427
+ median_read_quality,
1428
+ float(reference_length),
1429
+ float(mapped_length),
1430
+ mapping_quality,
1431
+ reference_start,
1432
+ reference_end,
1433
+ ]
1434
+ return read_metrics
1435
+
1436
+ bam_path = Path(bam_file_path)
1437
+
1438
+ def _parse_reference_lengths(header_text: str) -> Dict[str, int]:
1439
+ ref_lengths: Dict[str, int] = {}
1440
+ for line in header_text.splitlines():
1441
+ if not line.startswith("@SQ"):
920
1442
  continue
921
- # Extract the read metrics
922
- read_quality = read.query_qualities
923
- median_read_quality = np.median(read_quality)
924
- # Extract the reference (chromosome) name and its length
925
- reference_name = read.reference_name
926
- reference_index = bam_file.references.index(reference_name)
927
- reference_length = reference_lengths[reference_index]
928
- mapped_length = sum(end - start for start, end in read.get_blocks())
929
- mapping_quality = read.mapping_quality # Phred-scaled MAPQ
930
- read_metrics[read.query_name] = [
931
- read.query_length,
932
- median_read_quality,
933
- reference_length,
934
- mapped_length,
935
- mapping_quality,
936
- ]
1443
+ fields = line.split("\t")
1444
+ name = None
1445
+ length = None
1446
+ for field in fields[1:]:
1447
+ if field.startswith("SN:"):
1448
+ name = field.split(":", 1)[1]
1449
+ elif field.startswith("LN:"):
1450
+ length = int(field.split(":", 1)[1])
1451
+ if name is not None and length is not None:
1452
+ ref_lengths[name] = length
1453
+ return ref_lengths
1454
+
1455
+ def _mapped_length_from_cigar(cigar: str) -> int:
1456
+ mapped = 0
1457
+ for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
1458
+ length = int(length_str)
1459
+ if op in {"M", "=", "X"}:
1460
+ mapped += length
1461
+ return mapped
1462
+
1463
+ def _reference_span_from_cigar(cigar: str) -> int:
1464
+ reference_span = 0
1465
+ for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
1466
+ length = int(length_str)
1467
+ if op in {"M", "D", "N", "=", "X"}:
1468
+ reference_span += length
1469
+ return reference_span
1470
+
1471
+ header_cp = subprocess.run(
1472
+ ["samtools", "view", "-H", str(bam_path)],
1473
+ stdout=subprocess.PIPE,
1474
+ stderr=subprocess.PIPE,
1475
+ text=True,
1476
+ check=False,
1477
+ )
1478
+ if header_cp.returncode != 0:
1479
+ raise RuntimeError(
1480
+ f"samtools view -H failed (exit {header_cp.returncode}):\n{header_cp.stderr}"
1481
+ )
1482
+ reference_lengths = _parse_reference_lengths(header_cp.stdout)
1483
+
1484
+ proc = subprocess.Popen(
1485
+ ["samtools", "view", "-F", "4", str(bam_path)],
1486
+ stdout=subprocess.PIPE,
1487
+ stderr=subprocess.PIPE,
1488
+ text=True,
1489
+ )
1490
+ assert proc.stdout is not None
1491
+ for line in proc.stdout:
1492
+ if not line.strip() or line.startswith("@"):
1493
+ continue
1494
+ fields = line.rstrip("\n").split("\t")
1495
+ if len(fields) < 11:
1496
+ continue
1497
+ read_name = fields[0]
1498
+ reference_name = fields[2]
1499
+ mapping_quality = float(fields[4])
1500
+ cigar = fields[5]
1501
+ reference_start = float(int(fields[3]) - 1)
1502
+ sequence = fields[9]
1503
+ quality = fields[10]
1504
+ if sequence == "*":
1505
+ read_length = float("nan")
1506
+ else:
1507
+ read_length = float(len(sequence))
1508
+ if quality == "*" or not quality:
1509
+ median_read_quality = float("nan")
1510
+ else:
1511
+ phreds = [ord(char) - 33 for char in quality]
1512
+ median_read_quality = float(np.median(phreds))
1513
+ reference_length = float(reference_lengths.get(reference_name, float("nan")))
1514
+ mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
1515
+ if cigar != "*":
1516
+ reference_end = float(reference_start + _reference_span_from_cigar(cigar))
1517
+ else:
1518
+ reference_end = float("nan")
1519
+ read_metrics[read_name] = [
1520
+ read_length,
1521
+ median_read_quality,
1522
+ reference_length,
1523
+ mapped_length,
1524
+ mapping_quality,
1525
+ reference_start,
1526
+ reference_end,
1527
+ ]
1528
+
1529
+ rc = proc.wait()
1530
+ if rc != 0:
1531
+ stderr = proc.stderr.read() if proc.stderr else ""
1532
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
937
1533
 
938
1534
  return read_metrics
939
1535
 
940
1536
 
- def extract_readnames_from_bam(aligned_BAM):
+ def extract_read_tags_from_bam(
+     bam_file_path: str | Path,
+     tag_names: Iterable[str] | None = None,
+     include_flags: bool = True,
+     include_cigar: bool = True,
+     samtools_backend: str | None = "auto",
+ ) -> Dict[str, Dict[str, object]]:
+     """Extract per-read tag metadata from a BAM file.
+
+     Args:
+         bam_file_path: Path to the BAM file.
+         tag_names: Iterable of BAM tag names to extract (e.g., ["NM", "MD", "MM", "ML"]).
+             If None, only flags/cigar are populated.
+         include_flags: Whether to include a list of flag names for each read.
+         include_cigar: Whether to include the CIGAR string for each read.
+         samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
+     Returns:
+         Mapping of read name to a dict of extracted tag values.
+     """
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+     tag_names_list = [tag.upper() for tag in tag_names] if tag_names else []
+     read_tags: Dict[str, Dict[str, object]] = {}
+
+     def _decode_flags(flag: int) -> list[str]:
+         return [name for bit, name in _BAM_FLAG_BITS if flag & bit]
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+             for read in bam_file.fetch(until_eof=True):
+                 if not read.query_name:
+                     continue
+                 tag_map: Dict[str, object] = {}
+                 if include_cigar:
+                     tag_map["CIGAR"] = read.cigarstring
+                 if include_flags:
+                     tag_map["FLAGS"] = _decode_flags(read.flag)
+                 for tag in tag_names_list:
+                     try:
+                         tag_map[tag] = read.get_tag(tag)
+                     except Exception:
+                         tag_map[tag] = None
+                 read_tags[read.query_name] = tag_map
+     else:
+         cmd = ["samtools", "view", "-F", "4", str(bam_file_path)]
+         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+         assert proc.stdout is not None
+         for line in proc.stdout:
+             if not line.strip() or line.startswith("@"):
+                 continue
+             fields = line.rstrip("\n").split("\t")
+             if len(fields) < 11:
+                 continue
+             read_name = fields[0]
+             flag = int(fields[1])
+             cigar = fields[5]
+             tag_map: Dict[str, object] = {}
+             if include_cigar:
+                 tag_map["CIGAR"] = cigar
+             if include_flags:
+                 tag_map["FLAGS"] = _decode_flags(flag)
+             if tag_names_list:
+                 raw_tags = fields[11:]
+                 parsed_tags: Dict[str, str] = {}
+                 for raw_tag in raw_tags:
+                     parts = raw_tag.split(":", 2)
+                     if len(parts) == 3:
+                         tag_name, _tag_type, value = parts
+                         parsed_tags[tag_name.upper()] = value
+                 for tag in tag_names_list:
+                     tag_map[tag] = parsed_tags.get(tag)
+             read_tags[read_name] = tag_map
+         rc = proc.wait()
+         if rc != 0:
+             stderr = proc.stderr.read() if proc.stderr else ""
+             raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+
+     return read_tags
+
+
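A minimal usage sketch for the new tag extractor (illustration only, not part of the diff; the import path, BAM filename, and tag list are assumptions):

# Hypothetical usage of extract_read_tags_from_bam; names and paths are placeholders.
from smftools.informatics import extract_read_tags_from_bam  # assumed export location

tags = extract_read_tags_from_bam(
    "aligned.sorted.bam",
    tag_names=["NM", "MM", "ML"],
    samtools_backend="auto",  # resolved to the pysam or samtools-CLI backend by _resolve_samtools_backend
)
for read_name, tag_map in list(tags.items())[:3]:
    print(read_name, tag_map.get("CIGAR"), tag_map.get("FLAGS"), tag_map.get("NM"))

Note that the CLI branch parses raw TAG:TYPE:VALUE fields and therefore returns tag values as strings, while the pysam branch returns typed values from read.get_tag.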
+ def find_secondary_supplementary_read_names(
+     bam_file_path: str | Path,
+     read_names: Iterable[str],
+     samtools_backend: str | None = "auto",
+ ) -> tuple[set[str], set[str]]:
+     """Find read names with secondary or supplementary alignments in a BAM.
+
+     Args:
+         bam_file_path: Path to the BAM file to scan.
+         read_names: Iterable of read names to check.
+         samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
+     Returns:
+         Tuple of (secondary_read_names, supplementary_read_names).
+     """
+     target_names = set(read_names)
+     if not target_names:
+         return set(), set()
+
+     secondary_reads: set[str] = set()
+     supplementary_reads: set[str] = set()
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+             for read in bam_file.fetch(until_eof=True):
+                 if not read.query_name or read.query_name not in target_names:
+                     continue
+                 if read.is_secondary:
+                     secondary_reads.add(read.query_name)
+                 if read.is_supplementary:
+                     supplementary_reads.add(read.query_name)
+     else:
+
+         def _collect(flag: int) -> set[str]:
+             cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
+             proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+             assert proc.stdout is not None
+             hits: set[str] = set()
+             for line in proc.stdout:
+                 if not line.strip() or line.startswith("@"):
+                     continue
+                 read_name = line.split("\t", 1)[0]
+                 if read_name in target_names:
+                     hits.add(read_name)
+             rc = proc.wait()
+             if rc != 0:
+                 stderr = proc.stderr.read() if proc.stderr else ""
+                 raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+             return hits
+
+         secondary_reads = _collect(0x100)
+         supplementary_reads = _collect(0x800)
+
+     return secondary_reads, supplementary_reads
+
+
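The 0x100 and 0x800 values used by the CLI branch are the standard SAM flag bits for secondary and supplementary alignments, so both backends answer the same question. A small illustrative call (file name and read names are placeholders):

# Hypothetical check for reads that also carry secondary/supplementary records.
secondary, supplementary = find_secondary_supplementary_read_names(
    "aligned.sorted.bam",
    read_names={"read_0001", "read_0002"},
    samtools_backend="cli",  # exercises the `samtools view -f 0x100` / `-f 0x800` path
)
print(f"{len(secondary)} secondary, {len(supplementary)} supplementary")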
+ def extract_secondary_supplementary_alignment_spans(
+     bam_file_path: str | Path,
+     read_names: Iterable[str],
+     samtools_backend: str | None = "auto",
+ ) -> tuple[
+     dict[str, list[tuple[float, float, float]]], dict[str, list[tuple[float, float, float]]]
+ ]:
+     """Extract reference/read span data for secondary/supplementary alignments.
+
+     Args:
+         bam_file_path: Path to the BAM file to scan.
+         read_names: Iterable of read names to check.
+         samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
+     Returns:
+         Tuple of (secondary_spans, supplementary_spans) where each mapping contains
+         read names mapped to lists of (reference_start, reference_end, read_span).
+     """
+     target_names = set(read_names)
+     if not target_names:
+         return {}, {}
+
+     secondary_spans: dict[str, list[tuple[float, float, float]]] = {}
+     supplementary_spans: dict[str, list[tuple[float, float, float]]] = {}
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+             for read in bam_file.fetch(until_eof=True):
+                 if not read.query_name or read.query_name not in target_names:
+                     continue
+                 if not (read.is_secondary or read.is_supplementary):
+                     continue
+                 reference_start = (
+                     float(read.reference_start)
+                     if read.reference_start is not None
+                     else float("nan")
+                 )
+                 reference_end = (
+                     float(read.reference_end) if read.reference_end is not None else float("nan")
+                 )
+                 read_span = (
+                     float(read.query_alignment_length)
+                     if read.query_alignment_length is not None
+                     else float("nan")
+                 )
+                 if read.is_secondary:
+                     secondary_spans.setdefault(read.query_name, []).append(
+                         (reference_start, reference_end, read_span)
+                     )
+                 if read.is_supplementary:
+                     supplementary_spans.setdefault(read.query_name, []).append(
+                         (reference_start, reference_end, read_span)
+                     )
+         return secondary_spans, supplementary_spans
+
+     def _mapped_length_from_cigar(cigar: str) -> int:
+         mapped = 0
+         for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+             length = int(length_str)
+             if op in {"M", "=", "X"}:
+                 mapped += length
+         return mapped
+
+     def _reference_span_from_cigar(cigar: str) -> int:
+         reference_span = 0
+         for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+             length = int(length_str)
+             if op in {"M", "D", "N", "=", "X"}:
+                 reference_span += length
+         return reference_span
+
+     def _collect(flag: int) -> dict[str, list[tuple[float, float, float]]]:
+         cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
+         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+         assert proc.stdout is not None
+         spans: dict[str, list[tuple[float, float, float]]] = {}
+         for line in proc.stdout:
+             if not line.strip() or line.startswith("@"):
+                 continue
+             fields = line.rstrip("\n").split("\t")
+             if len(fields) < 11:
+                 continue
+             read_name = fields[0]
+             if read_name not in target_names:
+                 continue
+             cigar = fields[5]
+             reference_start = float(int(fields[3]) - 1)
+             if cigar != "*":
+                 reference_end = float(reference_start + _reference_span_from_cigar(cigar))
+                 read_span = float(_mapped_length_from_cigar(cigar))
+             else:
+                 reference_end = float("nan")
+                 read_span = float("nan")
+             spans.setdefault(read_name, []).append((reference_start, reference_end, read_span))
+         rc = proc.wait()
+         if rc != 0:
+             stderr = proc.stderr.read() if proc.stderr else ""
+             raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+         return spans
+
+     secondary_spans = _collect(0x100)
+     supplementary_spans = _collect(0x800)
+
+     return secondary_spans, supplementary_spans
+
+
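For reference, the nested CIGAR helpers above sum reference-consuming operations (M, D, N, =, X) for the reference span and aligned-query operations (M, =, X) for the mapped length; insertions and clips are excluded. A standalone sketch of the same arithmetic with a worked example (illustration only, not part of the package):

import re

# Standalone mirror of the nested CIGAR helpers, for illustration only.
def reference_span(cigar: str) -> int:
    return sum(int(n) for n, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar) if op in "MDN=X")

def mapped_length(cigar: str) -> int:
    return sum(int(n) for n, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar) if op in "M=X")

# 10M2D5M3I4M: the 2-base deletion consumes reference but not query; the 3-base insertion does the opposite.
assert reference_span("10M2D5M3I4M") == 21  # 10 + 2 + 5 + 4
assert mapped_length("10M2D5M3I4M") == 19   # 10 + 5 + 4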
+ def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
      """
      Takes a BAM and writes out a txt file containing read names from the BAM

@@ -949,21 +1792,39 @@ def extract_readnames_from_bam(aligned_BAM):
          None

      """
-     import subprocess
-
      # Make a text file of reads for the BAM
+     backend_choice = _resolve_samtools_backend(samtools_backend)
      txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
-     samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
-     with open(txt_output, "w") as output_file:
-         cut_process = subprocess.Popen(
-             ["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
-         )
-         samtools_view.stdout.close()
-         cut_process.wait()
-     samtools_view.wait()
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         with (
+             pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam,
+             open(txt_output, "w", encoding="utf-8") as output_file,
+         ):
+             for read in bam:
+                 output_file.write(f"{read.query_name}\n")
+         return
+
+     samtools_view = subprocess.Popen(
+         ["samtools", "view", aligned_BAM], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+     )
+     assert samtools_view.stdout is not None
+     with open(txt_output, "w", encoding="utf-8") as output_file:
+         for line in samtools_view.stdout:
+             if not line.strip():
+                 continue
+             qname = line.split("\t", 1)[0]
+             output_file.write(f"{qname}\n")
+     rc = samtools_view.wait()
+     if rc != 0:
+         stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
+         raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

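A minimal call sketch (the BAM path is a placeholder):

# Hypothetical call; writes aligned.sorted_read_names.txt next to the input BAM,
# one query name per line.
extract_readnames_from_bam("aligned.sorted.bam", samtools_backend="python")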
 
- def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
+ def separate_bam_by_bc(
+     input_bam, output_prefix, bam_suffix, split_dir, samtools_backend: str | None = "auto"
+ ):
      """
      Separates an input BAM file on the BC SAM tag values.

@@ -981,34 +1842,80 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
      bam_base = input_bam.name
      bam_base_minus_suffix = input_bam.stem

-     # Open the input BAM file for reading
-     with pysam.AlignmentFile(str(input_bam), "rb") as bam:
-         # Create a dictionary to store output BAM files
-         output_files = {}
-         # Iterate over each read in the BAM file
-         for read in bam:
-             try:
-                 # Get the barcode tag value
-                 bc_tag = read.get_tag("BC", with_value_type=True)[0]
-                 # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
-                 # Open the output BAM file corresponding to the barcode
-                 if bc_tag not in output_files:
-                     output_path = (
-                         split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
-                     )
-                     output_files[bc_tag] = pysam.AlignmentFile(
-                         str(output_path), "wb", header=bam.header
-                     )
-                 # Write the read to the corresponding output BAM file
-                 output_files[bc_tag].write(read)
-             except KeyError:
-                 logger.warning(f"BC tag not present for read: {read.query_name}")
-         # Close all output BAM files
-         for output_file in output_files.values():
-             output_file.close()
+     backend_choice = _resolve_samtools_backend(samtools_backend)
+
+     if backend_choice == "python":
+         pysam_mod = _require_pysam()
+         # Open the input BAM file for reading
+         with pysam_mod.AlignmentFile(str(input_bam), "rb") as bam:
+             # Create a dictionary to store output BAM files
+             output_files = {}
+             # Iterate over each read in the BAM file
+             for read in bam:
+                 try:
+                     # Get the barcode tag value
+                     bc_tag = read.get_tag("BC", with_value_type=True)[0]
+                     # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
+                     # Open the output BAM file corresponding to the barcode
+                     if bc_tag not in output_files:
+                         output_path = (
+                             split_dir
+                             / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                         )
+                         output_files[bc_tag] = pysam_mod.AlignmentFile(
+                             str(output_path), "wb", header=bam.header
+                         )
+                     # Write the read to the corresponding output BAM file
+                     output_files[bc_tag].write(read)
+                 except KeyError:
+                     logger.warning(f"BC tag not present for read: {read.query_name}")
+             # Close all output BAM files
+             for output_file in output_files.values():
+                 output_file.close()
+         return
+
+     def _collect_bc_tags() -> set[str]:
+         bc_tags: set[str] = set()
+         proc = subprocess.Popen(
+             ["samtools", "view", str(input_bam)],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+         )
+         assert proc.stdout is not None
+         for line in proc.stdout:
+             if not line.strip():
+                 continue
+             fields = line.rstrip("\n").split("\t")
+             for tag in fields[11:]:
+                 if tag.startswith("BC:"):
+                     bc_tags.add(tag.split(":", 2)[2])
+                     break
+         rc = proc.wait()
+         if rc != 0:
+             stderr = proc.stderr.read() if proc.stderr else ""
+             raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+         return bc_tags
+
+     bc_tags = _collect_bc_tags()
+     if not bc_tags:
+         logger.warning("No BC tags found in %s", input_bam)
+         return
+
+     for bc_tag in bc_tags:
+         output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+         cmd = ["samtools", "view", "-b", "-d", f"BC:{bc_tag}", "-o", str(output_path)]
+         cmd.append(str(input_bam))
+         cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+         if cp.returncode != 0:
+             raise RuntimeError(
+                 f"samtools view failed for BC={bc_tag} (exit {cp.returncode}):\n{cp.stderr}"
+             )

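A demultiplexing sketch for the CLI branch (paths and prefix are placeholders; the `-d BC:<value>` filter assumes a samtools release that supports tag-value filtering in `samtools view`):

from pathlib import Path

# Hypothetical demultiplexing call; one output BAM per distinct BC tag value, e.g.
# split_bams/20240101_demuxed_<BC>.bam
separate_bam_by_bc(
    Path("demuxed.bam"),
    output_prefix="20240101",
    bam_suffix=".bam",
    split_dir=Path("split_bams"),
    samtools_backend="cli",
)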
 
- def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
+ def split_and_index_BAM(
+     aligned_sorted_BAM, split_dir, bam_suffix, samtools_backend: str | None = "auto"
+ ):
      """
      A wrapper function for splitting BAMS and indexing them.
      Parameters:
@@ -1023,12 +1930,22 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
      logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
      aligned_sorted_output = aligned_sorted_BAM + bam_suffix
      file_prefix = date_string()
-     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
+     separate_bam_by_bc(
+         aligned_sorted_output,
+         file_prefix,
+         bam_suffix,
+         split_dir,
+         samtools_backend=samtools_backend,
+     )
      # Make a BAM index file for the BAMs in that directory
      bam_pattern = "*" + bam_suffix
      bam_files = glob.glob(split_dir / bam_pattern)
      bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
+     backend_choice = _resolve_samtools_backend(samtools_backend)
      for input_file in bam_files:
-         pysam.index(input_file)
+         if backend_choice == "python":
+             _index_bam_with_pysam(input_file)
+         else:
+             _index_bam_with_samtools(input_file)

      return bam_files
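A wrapper-level sketch (prefix and directory are placeholders; note that the function appends bam_suffix to the first argument itself):

from pathlib import Path

# Hypothetical call; returns the demultiplexed BAM paths and indexes each one.
bam_files = split_and_index_BAM(
    "aligned.sorted",
    split_dir=Path("split_bams"),
    bam_suffix=".bam",
    samtools_backend="auto",
)
print(bam_files)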