smftools 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +2 -0
  7. smftools/cli/hmm_adata.py +7 -2
  8. smftools/cli/load_adata.py +130 -98
  9. smftools/cli/preprocess_adata.py +2 -0
  10. smftools/cli/spatial_adata.py +5 -1
  11. smftools/cli_entry.py +26 -1
  12. smftools/config/__init__.py +2 -0
  13. smftools/config/default.yaml +4 -1
  14. smftools/config/experiment_config.py +6 -0
  15. smftools/datasets/__init__.py +2 -0
  16. smftools/hmm/HMM.py +9 -3
  17. smftools/hmm/__init__.py +24 -13
  18. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  19. smftools/hmm/archived/calculate_distances.py +2 -0
  20. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  21. smftools/hmm/archived/train_hmm.py +2 -0
  22. smftools/hmm/call_hmm_peaks.py +5 -2
  23. smftools/hmm/display_hmm.py +4 -1
  24. smftools/hmm/hmm_readwrite.py +7 -2
  25. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  26. smftools/informatics/__init__.py +53 -34
  27. smftools/informatics/archived/bam_conversion.py +2 -0
  28. smftools/informatics/archived/bam_direct.py +2 -0
  29. smftools/informatics/archived/basecall_pod5s.py +2 -0
  30. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  31. smftools/informatics/archived/conversion_smf.py +2 -0
  32. smftools/informatics/archived/deaminase_smf.py +1 -0
  33. smftools/informatics/archived/direct_smf.py +2 -0
  34. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  35. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  36. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  37. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  38. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  39. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  40. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  41. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  42. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  43. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  44. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  45. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  48. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  49. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  50. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  51. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  52. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  53. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  54. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  55. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  56. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  57. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  58. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  59. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  60. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  61. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  62. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  63. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  64. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  65. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  66. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  67. smftools/informatics/archived/subsample_pod5.py +2 -0
  68. smftools/informatics/bam_functions.py +737 -170
  69. smftools/informatics/basecalling.py +2 -0
  70. smftools/informatics/bed_functions.py +271 -61
  71. smftools/informatics/binarize_converted_base_identities.py +3 -0
  72. smftools/informatics/complement_base_list.py +2 -0
  73. smftools/informatics/converted_BAM_to_adata.py +66 -22
  74. smftools/informatics/fasta_functions.py +94 -10
  75. smftools/informatics/h5ad_functions.py +8 -2
  76. smftools/informatics/modkit_extract_to_adata.py +16 -6
  77. smftools/informatics/modkit_functions.py +2 -0
  78. smftools/informatics/ohe.py +2 -0
  79. smftools/informatics/pod5_functions.py +3 -2
  80. smftools/machine_learning/__init__.py +22 -6
  81. smftools/machine_learning/data/__init__.py +2 -0
  82. smftools/machine_learning/data/anndata_data_module.py +18 -4
  83. smftools/machine_learning/data/preprocessing.py +2 -0
  84. smftools/machine_learning/evaluation/__init__.py +2 -0
  85. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  86. smftools/machine_learning/evaluation/evaluators.py +14 -9
  87. smftools/machine_learning/inference/__init__.py +2 -0
  88. smftools/machine_learning/inference/inference_utils.py +2 -0
  89. smftools/machine_learning/inference/lightning_inference.py +6 -1
  90. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  91. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  92. smftools/machine_learning/models/__init__.py +2 -0
  93. smftools/machine_learning/models/base.py +7 -2
  94. smftools/machine_learning/models/cnn.py +7 -2
  95. smftools/machine_learning/models/lightning_base.py +16 -11
  96. smftools/machine_learning/models/mlp.py +5 -1
  97. smftools/machine_learning/models/positional.py +7 -2
  98. smftools/machine_learning/models/rnn.py +5 -1
  99. smftools/machine_learning/models/sklearn_models.py +14 -9
  100. smftools/machine_learning/models/transformer.py +7 -2
  101. smftools/machine_learning/models/wrappers.py +6 -2
  102. smftools/machine_learning/training/__init__.py +2 -0
  103. smftools/machine_learning/training/train_lightning_model.py +13 -3
  104. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  105. smftools/machine_learning/utils/__init__.py +2 -0
  106. smftools/machine_learning/utils/device.py +5 -1
  107. smftools/machine_learning/utils/grl.py +5 -1
  108. smftools/optional_imports.py +31 -0
  109. smftools/plotting/__init__.py +32 -31
  110. smftools/plotting/autocorrelation_plotting.py +9 -5
  111. smftools/plotting/classifiers.py +16 -4
  112. smftools/plotting/general_plotting.py +6 -3
  113. smftools/plotting/hmm_plotting.py +12 -2
  114. smftools/plotting/position_stats.py +15 -7
  115. smftools/plotting/qc_plotting.py +6 -1
  116. smftools/preprocessing/__init__.py +35 -37
  117. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  118. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  119. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  120. smftools/preprocessing/archived/preprocessing.py +2 -0
  121. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  122. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  123. smftools/preprocessing/calculate_complexity_II.py +4 -1
  124. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  125. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  126. smftools/preprocessing/calculate_position_Youden.py +9 -2
  127. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  128. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  129. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  130. smftools/preprocessing/make_dirs.py +2 -1
  131. smftools/preprocessing/min_non_diagonal.py +2 -0
  132. smftools/preprocessing/recipes.py +2 -0
  133. smftools/tools/__init__.py +26 -18
  134. smftools/tools/archived/apply_hmm.py +2 -0
  135. smftools/tools/archived/classifiers.py +2 -0
  136. smftools/tools/archived/classify_methylated_features.py +2 -0
  137. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  138. smftools/tools/archived/subset_adata_v1.py +2 -0
  139. smftools/tools/archived/subset_adata_v2.py +2 -0
  140. smftools/tools/calculate_umap.py +3 -1
  141. smftools/tools/cluster_adata_on_methylation.py +7 -1
  142. smftools/tools/position_stats.py +17 -27
  143. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/METADATA +67 -33
  144. smftools-0.3.0.dist-info/RECORD +182 -0
  145. smftools-0.2.5.dist-info/RECORD +0 -181
  146. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  147. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  148. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -3,28 +3,118 @@ from __future__ import annotations
 import glob
 import os
 import re
+import shutil
 import subprocess
 import time
 from collections import Counter, defaultdict, deque
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from itertools import zip_longest
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

 import numpy as np
-import pysam
 from tqdm import tqdm

 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require

 from ..readwrite import date_string, time_string

+if TYPE_CHECKING:
+    import pysam as pysam_types
+
+try:
+    import pysam
+except Exception:
+    pysam = None  # type: ignore
+
 logger = get_logger(__name__)

 _PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
 _EMPTY_RE = re.compile(r"^\s*$")


+def _require_pysam() -> "pysam_types":
+    """Return the pysam module or raise if unavailable."""
+    if pysam is not None:
+        return pysam
+    return require("pysam", extra="pysam", purpose="samtools-compatible Python backend")
+
+
+def _resolve_samtools_backend(backend: str | None) -> str:
+    """Resolve backend choice for samtools-compatible operations.
+
+    Args:
+        backend: One of {"auto", "python", "cli"} (case-insensitive).
+
+    Returns:
+        Resolved backend string ("python" or "cli").
+    """
+    choice = (backend or "auto").strip().lower()
+    if choice not in {"auto", "python", "cli"}:
+        raise ValueError("samtools_backend must be one of: auto, python, cli")
+
+    have_pysam = pysam is not None
+    have_samtools = shutil.which("samtools") is not None
+
+    if choice == "python":
+        if not have_pysam:
+            raise RuntimeError("samtools_backend=python requires pysam to be installed.")
+        return "python"
+    if choice == "cli":
+        if not have_samtools:
+            raise RuntimeError("samtools_backend=cli requires samtools in PATH.")
+        return "cli"
+
+    if have_samtools:
+        return "cli"
+    if have_pysam:
+        return "python"
+    raise RuntimeError("Neither pysam nor samtools is available in PATH.")
+
+
+def _has_bam_index(bam_path: Path) -> bool:
+    """Return True if the BAM index exists alongside the BAM."""
+    return (
+        bam_path.with_suffix(bam_path.suffix + ".bai").exists()
+        or Path(str(bam_path) + ".bai").exists()
+    )
+
+
+def _ensure_bam_index(bam_path: Path, backend: str) -> None:
+    """Ensure a BAM index exists, creating one if needed."""
+    if _has_bam_index(bam_path):
+        return
+    if backend == "python":
+        _index_bam_with_pysam(bam_path)
+    else:
+        _index_bam_with_samtools(bam_path)
+
+
+def _parse_idxstats_output(output: str) -> Tuple[int, int, Dict[str, Tuple[int, float]]]:
+    """Parse samtools idxstats output into counts and proportions."""
+    aligned_reads_count = 0
+    unaligned_reads_count = 0
+    record_counts: Dict[str, int] = {}
+    for line in output.splitlines():
+        if not line.strip():
+            continue
+        ref, _length, mapped, unmapped = line.split("\t")[:4]
+        if ref == "*":
+            unaligned_reads_count += int(unmapped)
+            continue
+        mapped_count = int(mapped)
+        aligned_reads_count += mapped_count
+        record_counts[ref] = mapped_count
+
+    proportions: Dict[str, Tuple[int, float]] = {}
+    for ref, count in record_counts.items():
+        proportion = count / aligned_reads_count if aligned_reads_count else 0.0
+        proportions[ref] = (count, proportion)
+
+    return aligned_reads_count, unaligned_reads_count, proportions
+
+
 def _stream_dorado_logs(stderr_iter) -> None:
     """Stream dorado stderr and emit structured log messages.

@@ -60,8 +150,9 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,

     logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")

+    pysam_mod = _require_pysam()
     with (
-        pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
+        pysam_mod.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
         open(fastq_path, "w", encoding="utf-8") as fq,
     ):
         for r in bam.fetch(until_eof=True):
@@ -103,7 +194,8 @@ def _sort_bam_with_pysam(
     if threads:
         args += ["-@", str(threads)]
     args += ["-o", out_bam, in_bam]
-    pysam.sort(*args)
+    pysam_mod = _require_pysam()
+    pysam_mod.sort(*args)


 def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
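Note: the _resolve_samtools_backend helper introduced in the first hunk honors an explicit "python" or "cli" request (validating availability) and, under "auto", prefers the samtools CLI before falling back to pysam. A standalone sketch of that decision, assuming only the two availability flags, not the packaged function itself:

    # Illustrative restatement of the "auto" resolution order used in this release.
    def resolve(choice: str, have_pysam: bool, have_samtools: bool) -> str:
        choice = (choice or "auto").strip().lower()
        if choice == "python":
            if not have_pysam:
                raise RuntimeError("pysam not installed")
            return "python"
        if choice == "cli":
            if not have_samtools:
                raise RuntimeError("samtools not on PATH")
            return "cli"
        # "auto": prefer the samtools CLI, then pysam
        if have_samtools:
            return "cli"
        if have_pysam:
            return "python"
        raise RuntimeError("neither backend available")

    assert resolve("auto", have_pysam=True, have_samtools=True) == "cli"
    assert resolve("auto", have_pysam=True, have_samtools=False) == "python"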
@@ -115,11 +207,54 @@ def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = N
     """
     bam_path = str(bam_path)
     logger.debug(f"Indexing BAM using _index_bam_with_pysam")
+    pysam_mod = _require_pysam()
     # pysam.index supports samtools-style args
     if threads:
-        pysam.index("-@", str(threads), bam_path)
+        pysam_mod.index("-@", str(threads), bam_path)
     else:
-        pysam.index(bam_path)
+        pysam_mod.index(bam_path)
+
+
+def _bam_to_fastq_with_samtools(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
+    """Convert BAM to FASTQ using samtools."""
+    if not shutil.which("samtools"):
+        raise RuntimeError("samtools is required but not available in PATH.")
+    cmd = ["samtools", "fastq", str(bam_path)]
+    logger.debug("Converting BAM to FASTQ using samtools: %s", " ".join(cmd))
+    with open(fastq_path, "w", encoding="utf-8") as fq:
+        cp = subprocess.run(cmd, stdout=fq, stderr=subprocess.PIPE, text=True)
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools fastq failed (exit {cp.returncode}):\n{cp.stderr}")
+
+
+def _sort_bam_with_samtools(
+    in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
+) -> None:
+    """Sort a BAM file using samtools."""
+    if not shutil.which("samtools"):
+        raise RuntimeError("samtools is required but not available in PATH.")
+    cmd = ["samtools", "sort", "-o", str(out_bam)]
+    if threads:
+        cmd += ["-@", str(threads)]
+    cmd.append(str(in_bam))
+    logger.debug("Sorting BAM using samtools: %s", " ".join(cmd))
+    cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools sort failed (exit {cp.returncode}):\n{cp.stderr}")
+
+
+def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
+    """Index a BAM file using samtools."""
+    if not shutil.which("samtools"):
+        raise RuntimeError("samtools is required but not available in PATH.")
+    cmd = ["samtools", "index"]
+    if threads:
+        cmd += ["-@", str(threads)]
+    cmd.append(str(bam_path))
+    logger.debug("Indexing BAM using samtools: %s", " ".join(cmd))
+    cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")


 def align_and_sort_BAM(
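The three samtools wrappers added above shell out with stderr captured so failures surface the tool's own message. The same pattern, reduced to a generic runner for illustration (not part of the package):

    import shutil
    import subprocess

    def run_samtools(*args: str) -> str:
        """Run a samtools subcommand, returning stdout and raising with stderr on failure (sketch)."""
        if not shutil.which("samtools"):
            raise RuntimeError("samtools is required but not available in PATH.")
        cp = subprocess.run(
            ["samtools", *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        if cp.returncode != 0:
            raise RuntimeError(f"samtools {args[0]} failed (exit {cp.returncode}):\n{cp.stderr}")
        return cp.stdout

    # e.g. run_samtools("sort", "-o", "sorted.bam", "-@", "4", "unsorted.bam")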
@@ -156,10 +291,15 @@ def align_and_sort_BAM(
     else:
         threads = None

+    samtools_backend = _resolve_samtools_backend(getattr(cfg, "samtools_backend", "auto"))
+
     if cfg.aligner == "minimap2":
         if not cfg.align_from_bam:
             logger.debug(f"Converting BAM to FASTQ: {input}")
-            _bam_to_fastq_with_pysam(input, input_as_fastq)
+            if samtools_backend == "python":
+                _bam_to_fastq_with_pysam(input, input_as_fastq)
+            else:
+                _bam_to_fastq_with_samtools(input, input_as_fastq)
             logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
             mm_input = input_as_fastq
         else:
@@ -220,12 +360,18 @@
         logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
         return

-    # --- Sort & Index with pysam ---
+    # --- Sort & Index ---
     logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
-    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
+    if samtools_backend == "python":
+        _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
+    else:
+        _sort_bam_with_samtools(aligned_output, aligned_sorted_output, threads=threads)

     logger.debug(f"Indexing: {aligned_sorted_output}")
-    _index_bam_with_pysam(aligned_sorted_output, threads=threads)
+    if samtools_backend == "python":
+        _index_bam_with_pysam(aligned_sorted_output, threads=threads)
+    else:
+        _index_bam_with_samtools(aligned_sorted_output, threads=threads)


 def bam_qc(
@@ -236,25 +382,20 @@
     stats: bool = True,
     flagstats: bool = True,
     idxstats: bool = True,
+    samtools_backend: str | None = "auto",
 ) -> None:
     """
     QC for BAM/CRAMs: stats, flagstat, idxstats.
     Prefers pysam; falls back to `samtools` if needed.
     Runs BAMs in parallel (up to `threads`, default serial).
     """
-    import shutil
     import subprocess

     logger.debug("Performing BAM QC using bam_qc")

-    # Try to import pysam once
-    try:
-        import pysam  # type: ignore
-
-        have_pysam = True
-    except Exception:
-        pysam = None  # type: ignore
-        have_pysam = False
+    backend_choice = _resolve_samtools_backend(samtools_backend)
+    have_pysam = backend_choice == "python"
+    pysam_mod = _require_pysam() if have_pysam else None

     bam_qc_dir = Path(bam_qc_dir)
     bam_qc_dir.mkdir(parents=True, exist_ok=True)
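With the new samtools_backend argument, QC can be pinned to either backend instead of auto-detection. A hypothetical call, using the keyword names visible in this hunk (the leading list-of-BAMs argument is assumed rather than shown in the diff):

    # Hypothetical usage sketch; argument values are placeholders.
    from smftools.informatics.bam_functions import bam_qc

    bam_qc(
        ["sample1.sorted.bam", "sample2.sorted.bam"],
        bam_qc_dir="qc_out",
        threads=4,
        stats=True,
        flagstats=True,
        idxstats=True,
        samtools_backend="cli",  # or "python" / "auto"
    )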
@@ -275,11 +416,9 @@
        if _has_index(p):
            return
        if have_pysam:
-            assert pysam is not None
-            pysam.index(str(p))  # supports BAM & CRAM
+            assert pysam_mod is not None
+            pysam_mod.index(str(p))  # supports BAM & CRAM
        else:
-            if not shutil.which("samtools"):
-                raise RuntimeError("Neither pysam nor samtools is available in PATH.")
            cmd = ["samtools", "index", str(p)]
            # capture text so errors are readable; raise on failure
            cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
@@ -300,7 +439,7 @@
                line = line.rstrip()
                if line:
                    last_err.append(line)
-                    logger.info("[%s][%s] %s", tag, bam.name, line)
+                    logger.debug("[%s][%s] %s", tag, bam.name, line)
        rc = proc.wait()

        if rc != 0:
@@ -332,16 +471,13 @@
            # Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
            logger.warning("Indexing failed for %s: %s", bam, e)

-        if not have_pysam:
-            import shutil
-
-            if not shutil.which("samtools"):
-                raise RuntimeError("Neither pysam nor samtools is available in PATH.")
-
        # --- stats ---
        if stats:
-            if have_pysam and pysam is not None and hasattr(pysam, "stats"):
-                txt = pysam.stats(str(bam))
+            if have_pysam:
+                assert pysam_mod is not None
+                if not hasattr(pysam_mod, "stats"):
+                    raise RuntimeError("pysam.stats is unavailable in this pysam build.")
+                txt = pysam_mod.stats(str(bam))
                out_stats.write_text(txt)
                results.append(("stats(pysam)", 0))
            else:
@@ -351,8 +487,11 @@

        # --- flagstat ---
        if flagstats:
-            if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
-                txt = pysam.flagstat(str(bam))
+            if have_pysam:
+                assert pysam_mod is not None
+                if not hasattr(pysam_mod, "flagstat"):
+                    raise RuntimeError("pysam.flagstat is unavailable in this pysam build.")
+                txt = pysam_mod.flagstat(str(bam))
                out_flag.write_text(txt)
                results.append(("flagstat(pysam)", 0))
            else:
@@ -362,8 +501,11 @@

        # --- idxstats ---
        if idxstats:
-            if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
-                txt = pysam.idxstats(str(bam))
+            if have_pysam:
+                assert pysam_mod is not None
+                if not hasattr(pysam_mod, "idxstats"):
+                    raise RuntimeError("pysam.idxstats is unavailable in this pysam build.")
+                txt = pysam_mod.idxstats(str(bam))
                out_idx.write_text(txt)
                results.append(("idxstats(pysam)", 0))
            else:
@@ -400,6 +542,8 @@ def concatenate_fastqs_to_bam(
     rg_sample_field: Optional[str] = None,
     progress: bool = True,
     auto_pair: bool = True,
+    gzip_suffixes: Tuple[str, ...] = (".gz", ".gzip"),
+    samtools_backend: str | None = "auto",
 ) -> Dict[str, Any]:
     """
     Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
@@ -422,6 +566,10 @@
        Show tqdm progress bars.
    auto_pair : bool
        Auto-pair R1/R2 based on filename patterns if given a flat list.
+    gzip_suffixes : tuple[str, ...]
+        Suffixes treated as gzip-compressed FASTQ files.
+    samtools_backend : str | None
+        Backend selection for samtools-compatible operations (auto|python|cli).

    Returns
    -------
@@ -436,9 +584,10 @@
        """
        name = p.name
        lowers = name.lower()
+        gzip_exts = tuple(s.lower() for s in gzip_suffixes)
        for ext in (
-            ".fastq.gz",
-            ".fq.gz",
+            *(f".fastq{suf}" for suf in gzip_exts),
+            *(f".fq{suf}" for suf in gzip_exts),
            ".fastq.bz2",
            ".fq.bz2",
            ".fastq.xz",
@@ -525,10 +674,50 @@
            Pysam Fastx records.
        """
        # pysam.FastxFile handles compressed extensions transparently
-        with pysam.FastxFile(str(p)) as fx:
+        pysam_mod = _require_pysam()
+        with pysam_mod.FastxFile(str(p)) as fx:
            for rec in fx:
                yield rec  # rec.name, rec.sequence, rec.quality

+    def _fastq_iter_plain(p: Path) -> Iterable[Tuple[str, str, str]]:
+        """Yield FASTQ records from plain-text parsing.
+
+        Args:
+            p: FASTQ path.
+
+        Yields:
+            Tuple of (name, sequence, quality).
+        """
+        import bz2
+        import gzip
+        import lzma
+
+        lowers = p.name.lower()
+        if any(lowers.endswith(suf) for suf in (s.lower() for s in gzip_suffixes)):
+            handle = gzip.open(p, "rt", encoding="utf-8")
+        elif lowers.endswith(".bz2"):
+            handle = bz2.open(p, "rt", encoding="utf-8")
+        elif lowers.endswith(".xz"):
+            handle = lzma.open(p, "rt", encoding="utf-8")
+        else:
+            handle = p.open("r", encoding="utf-8")
+
+        with handle as fh:
+            while True:
+                header = fh.readline()
+                if not header:
+                    break
+                seq = fh.readline()
+                fh.readline()
+                qual = fh.readline()
+                if not qual:
+                    break
+                name = header.strip()
+                if name.startswith("@"):
+                    name = name[1:]
+                name = name.split()[0]
+                yield name, seq.strip(), qual.strip()
+
    def _make_unaligned_segment(
        name: str,
        seq: str,
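_fastq_iter_plain above consumes the standard 4-line FASTQ layout (header, sequence, "+" separator, quality) and trims the read name at the first whitespace. The same loop, run over an in-memory record as a self-contained sketch:

    import io

    fastq_text = "@read1 extra description\nACGT\n+\nIIII\n"
    fh = io.StringIO(fastq_text)

    records = []
    while True:
        header = fh.readline()
        if not header:
            break
        seq = fh.readline()
        fh.readline()            # the "+" separator line is discarded
        qual = fh.readline()
        if not qual:
            break
        name = header.strip().lstrip("@").split()[0]
        records.append((name, seq.strip(), qual.strip()))

    assert records == [("read1", "ACGT", "IIII")]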
@@ -550,11 +739,12 @@
        Returns:
            Unaligned pysam.AlignedSegment.
        """
-        a = pysam.AlignedSegment()
+        pysam_mod = _require_pysam()
+        a = pysam_mod.AlignedSegment()
        a.query_name = name
        a.query_sequence = seq
        if qual is not None:
-            a.query_qualities = pysam.qualitystring_to_array(qual)
+            a.query_qualities = pysam_mod.qualitystring_to_array(qual)
        a.is_unmapped = True
        a.is_paired = read1 or read2
        a.is_read1 = read1
@@ -570,6 +760,48 @@
            a.set_tag("RG", str(bc), value_type="Z")
        return a

+    def _write_sam_line(
+        handle,
+        name: str,
+        seq: str,
+        qual: str,
+        bc: str,
+        *,
+        read1: bool,
+        read2: bool,
+        add_read_group: bool,
+    ) -> None:
+        """Write a single unaligned SAM record to a text stream."""
+        if read1:
+            flag = 77
+        elif read2:
+            flag = 141
+        else:
+            flag = 4
+        tags = [f"{barcode_tag}:Z:{bc}"]
+        if add_read_group:
+            tags.append(f"RG:Z:{bc}")
+        tag_str = "\t".join(tags)
+        if not qual:
+            qual = "*"
+        line = "\t".join(
+            [
+                name,
+                str(flag),
+                "*",
+                "0",
+                "0",
+                "*",
+                "*",
+                "0",
+                "0",
+                seq,
+                qual,
+                tag_str,
+            ]
+        )
+        handle.write(f"{line}\n")
+
    # ---------- normalize inputs to Path ----------
    def _to_path_pair(x) -> Tuple[Path, Path]:
        """Convert a tuple of path-like objects to Path instances."""
@@ -630,7 +862,29 @@
    singletons_written = 0

    # ---------- write BAM ----------
-    with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
+    backend_choice = _resolve_samtools_backend(samtools_backend)
+    if backend_choice == "python":
+        pysam_mod = _require_pysam()
+        bam_out_ctx = pysam_mod.AlignmentFile(str(output_bam), "wb", header=header)
+    else:
+        cmd = ["samtools", "view", "-b", "-o", str(output_bam), "-"]
+        logger.debug("Writing BAM using samtools: %s", " ".join(cmd))
+        bam_out_ctx = subprocess.Popen(
+            cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
+        )
+        assert bam_out_ctx.stdin is not None
+        header_lines = ["@HD\tVN:1.6\tSO:unknown"]
+        if add_read_group:
+            for bc in barcodes_in_order:
+                rg_fields = [f"ID:{bc}"]
+                if rg_sample_field:
+                    rg_fields.append(f"SM:{rg_sample_field}")
+                rg_body = "\t".join(rg_fields)
+                header_lines.append(f"@RG\t{rg_body}")
+        header_lines.append("@PG\tID:concat-fastq\tPN:concatenate_fastqs_to_bam\tVN:1")
+        bam_out_ctx.stdin.write("\n".join(header_lines) + "\n")
+
+    try:
        # Paired
        it_pairs = explicit_pairs
        if progress and it_pairs:
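The FLAG constants hard-coded in _write_sam_line above decompose into standard SAM bit flags: 77 = paired (0x1) + unmapped (0x4) + mate unmapped (0x8) + first in pair (0x40), 141 swaps 0x40 for second in pair (0x80), and 4 is plain unmapped. A quick check:

    PAIRED, UNMAPPED, MATE_UNMAPPED, READ1, READ2 = 0x1, 0x4, 0x8, 0x40, 0x80

    assert 77 == PAIRED | UNMAPPED | MATE_UNMAPPED | READ1
    assert 141 == PAIRED | UNMAPPED | MATE_UNMAPPED | READ2
    assert 4 == UNMAPPED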
@@ -640,8 +894,12 @@
            raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
        bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"

-        it1 = _fastq_iter(r1_path)
-        it2 = _fastq_iter(r2_path)
+        if backend_choice == "python":
+            it1 = _fastq_iter(r1_path)
+            it2 = _fastq_iter(r2_path)
+        else:
+            it1 = _fastq_iter_plain(r1_path)
+            it2 = _fastq_iter_plain(r2_path)

        for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):

@@ -652,24 +910,67 @@
                return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)

            name = (
-                _clean(getattr(rec1, "name", None))
-                or _clean(getattr(rec2, "name", None))
-                or getattr(rec1, "name", None)
-                or getattr(rec2, "name", None)
+                _clean(getattr(rec1, "name", None) if backend_choice == "python" else rec1[0])
+                if rec1 is not None
+                else None
            )
+            if name is None:
+                name = (
+                    _clean(
+                        getattr(rec2, "name", None) if backend_choice == "python" else rec2[0]
+                    )
+                    if rec2 is not None
+                    else None
+                )
+            if name is None:
+                name = (
+                    getattr(rec1, "name", None)
+                    if backend_choice == "python" and rec1 is not None
+                    else (rec1[0] if rec1 is not None else None)
+                )
+            if name is None:
+                name = (
+                    getattr(rec2, "name", None)
+                    if backend_choice == "python" and rec2 is not None
+                    else (rec2[0] if rec2 is not None else None)
+                )

            if rec1 is not None:
-                a1 = _make_unaligned_segment(
-                    name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
-                )
-                bam_out.write(a1)
+                if backend_choice == "python":
+                    a1 = _make_unaligned_segment(
+                        name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
+                    )
+                    bam_out_ctx.write(a1)
+                else:
+                    _write_sam_line(
+                        bam_out_ctx.stdin,
+                        name,
+                        rec1[1],
+                        rec1[2],
+                        bc,
+                        read1=True,
+                        read2=False,
+                        add_read_group=add_read_group,
+                    )
                per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
                total_written += 1
            if rec2 is not None:
-                a2 = _make_unaligned_segment(
-                    name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
-                )
-                bam_out.write(a2)
+                if backend_choice == "python":
+                    a2 = _make_unaligned_segment(
+                        name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
+                    )
+                    bam_out_ctx.write(a2)
+                else:
+                    _write_sam_line(
+                        bam_out_ctx.stdin,
+                        name,
+                        rec2[1],
+                        rec2[2],
+                        bc,
+                        read1=False,
+                        read2=True,
+                        add_read_group=add_read_group,
+                    )
                per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
                total_written += 1

@@ -689,14 +990,40 @@
            if not pth.exists():
                raise FileNotFoundError(pth)
            bc = per_path_barcode.get(pth, "barcode")
-            for rec in _fastq_iter(pth):
-                a = _make_unaligned_segment(
-                    rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
-                )
-                bam_out.write(a)
+            if backend_choice == "python":
+                iterator = _fastq_iter(pth)
+            else:
+                iterator = _fastq_iter_plain(pth)
+            for rec in iterator:
+                if backend_choice == "python":
+                    a = _make_unaligned_segment(
+                        rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
+                    )
+                    bam_out_ctx.write(a)
+                else:
+                    _write_sam_line(
+                        bam_out_ctx.stdin,
+                        rec[0],
+                        rec[1],
+                        rec[2],
+                        bc,
+                        read1=False,
+                        read2=False,
+                        add_read_group=add_read_group,
+                    )
                per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
                total_written += 1
                singletons_written += 1
+    finally:
+        if backend_choice == "python":
+            bam_out_ctx.close()
+        else:
+            if bam_out_ctx.stdin is not None:
+                bam_out_ctx.stdin.close()
+            rc = bam_out_ctx.wait()
+            if rc != 0:
+                stderr = bam_out_ctx.stderr.read() if bam_out_ctx.stderr else ""
+                raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    return {
        "total_reads": total_written,
@@ -707,7 +1034,7 @@
    }


-def count_aligned_reads(bam_file):
+def count_aligned_reads(bam_file, samtools_backend: str | None = "auto"):
    """
    Counts the number of aligned reads in a bam file that map to each reference record.

@@ -720,30 +1047,42 @@
        record_counts (dict): A dictionary keyed by reference record instance that points toa tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.

    """
-    print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
+    logger.info("Counting aligned reads in BAM > {}".format(bam_file.name))
+    backend_choice = _resolve_samtools_backend(samtools_backend)
    aligned_reads_count = 0
    unaligned_reads_count = 0
-    # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
-    record_counts = defaultdict(int)
-
-    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
-        total_reads = bam.mapped + bam.unmapped
-        # Iterate over reads to get the total mapped read counts and the reads that map to each reference
-        for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
-            if read.is_unmapped:
-                unaligned_reads_count += 1
-            else:
-                aligned_reads_count += 1
-                record_counts[read.reference_name] += (
-                    1  # Automatically increments if key exists, adds if not
-                )

-    # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
-    for reference in record_counts:
-        proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
-        record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
+    if backend_choice == "python":
+        pysam_mod = _require_pysam()
+        record_counts = defaultdict(int)
+        with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
+            total_reads = bam.mapped + bam.unmapped
+            # Iterate over reads to get the total mapped read counts and the reads that map to each reference
+            for read in bam:
+                if read.is_unmapped:
+                    unaligned_reads_count += 1
+                else:
+                    aligned_reads_count += 1
+                    record_counts[read.reference_name] += (
+                        1  # Automatically increments if key exists, adds if not
+                    )

-    return aligned_reads_count, unaligned_reads_count, dict(record_counts)
+        # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
+        for reference in record_counts:
+            proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
+            record_counts[reference] = (
+                record_counts[reference],
+                proportion_mapped_reads_in_record,
+            )
+        return aligned_reads_count, unaligned_reads_count, dict(record_counts)
+
+    bam_path = Path(bam_file)
+    _ensure_bam_index(bam_path, backend_choice)
+    cmd = ["samtools", "idxstats", str(bam_path)]
+    cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools idxstats failed (exit {cp.returncode}):\n{cp.stderr}")
+    return _parse_idxstats_output(cp.stdout)


 def demux_and_index_BAM(
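The CLI path of count_aligned_reads relies on samtools idxstats, which prints one tab-separated line per reference (name, length, mapped, unmapped) plus a trailing "*" line for unplaced reads. A worked example of the parsing performed by _parse_idxstats_output, on toy input:

    # Illustrative restatement of the idxstats parsing added in this release.
    idxstats_text = "chr1\t1000\t80\t0\nchr2\t2000\t20\t0\n*\t0\t0\t5\n"

    aligned, unaligned, counts = 0, 0, {}
    for line in idxstats_text.splitlines():
        ref, _length, mapped, unmapped = line.split("\t")[:4]
        if ref == "*":
            unaligned += int(unmapped)
            continue
        counts[ref] = int(mapped)
        aligned += int(mapped)
    proportions = {ref: (n, n / aligned) for ref, n in counts.items()}

    assert (aligned, unaligned) == (100, 5)
    assert proportions == {"chr1": (80, 0.8), "chr2": (20, 0.2)}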
@@ -827,7 +1166,14 @@
    return renamed_bams


-def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
+def extract_base_identities(
+    bam_file,
+    chromosome,
+    positions,
+    max_reference_length,
+    sequence,
+    samtools_backend: str | None = "auto",
+):
    """
    Efficiently extracts base identities from mapped reads with reference coordinates.

@@ -850,31 +1196,87 @@
    rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
    mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))

-    # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
-        total_reads = bam.mapped
-        ref_seq = sequence.upper()
-        for read in bam.fetch(chromosome):
-            if not read.is_mapped:
-                continue  # Skip unmapped reads
+    backend_choice = _resolve_samtools_backend(samtools_backend)
+    ref_seq = sequence.upper()

-            read_name = read.query_name
-            query_sequence = read.query_sequence
-            base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
+    if backend_choice == "python":
+        logger.debug("Extracting base identities using python")
+        pysam_mod = _require_pysam()
+        # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
+        with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
+            total_reads = bam.mapped
+            for read in bam.fetch(chromosome):
+                if not read.is_mapped:
+                    continue  # Skip unmapped reads

-            # Use get_aligned_pairs directly with positions filtering
-            aligned_pairs = read.get_aligned_pairs(matches_only=True)
+                read_name = read.query_name
+                query_sequence = read.query_sequence
+                base_dict = rev_base_identities if read.is_reverse else fwd_base_identities

-            for read_position, reference_position in aligned_pairs:
-                if reference_position in positions:
+                # Use get_aligned_pairs directly with positions filtering
+                aligned_pairs = read.get_aligned_pairs(matches_only=True)
+
+                for read_position, reference_position in aligned_pairs:
                    read_base = query_sequence[read_position]
                    ref_base = ref_seq[reference_position]
+                    if reference_position in positions:
+                        base_dict[read_name][reference_position] = read_base

-                    base_dict[read_name][reference_position] = read_base
-
-                    # Track mismatches (excluding Ns)
+                    # Track mismatches (excluding Ns)
+                    if read_base != ref_base and read_base != "N" and ref_base != "N":
+                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+    else:
+        bam_path = Path(bam_file)
+        logger.debug("Extracting base identities using samtools")
+        _ensure_bam_index(bam_path, backend_choice)
+
+        def _iter_aligned_pairs(cigar: str, start: int) -> Iterable[Tuple[int, int]]:
+            qpos = 0
+            rpos = start
+            for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+                length = int(length_str)
+                if op in {"M", "=", "X"}:
+                    for _ in range(length):
+                        yield qpos, rpos
+                        qpos += 1
+                        rpos += 1
+                elif op in {"I", "S"}:
+                    qpos += length
+                elif op in {"D", "N"}:
+                    rpos += length
+                elif op in {"H", "P"}:
+                    continue
+
+        cmd = ["samtools", "view", "-F", "4", str(bam_path), chromosome]
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            if not line.strip() or line.startswith("@"):
+                continue
+            fields = line.rstrip("\n").split("\t")
+            if len(fields) < 11:
+                continue
+            read_name = fields[0]
+            flag = int(fields[1])
+            pos = int(fields[3])
+            cigar = fields[5]
+            query_sequence = fields[9]
+            if cigar == "*" or query_sequence == "*":
+                continue
+            base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
+            for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
+                if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
+                    continue
+                read_base = query_sequence[read_pos]
+                ref_base = ref_seq[ref_pos]
+                if ref_pos in positions:
+                    base_dict[read_name][ref_pos] = read_base
                    if read_base != ref_base and read_base != "N" and ref_base != "N":
                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+        rc = proc.wait()
+        if rc != 0:
+            stderr = proc.stderr.read() if proc.stderr else ""
+            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    # Determine C→T vs G→A dominance per read
    mismatch_trend_per_read = {}
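The _iter_aligned_pairs helper added above walks the CIGAR string to pair query offsets with 0-based reference positions: M/=/X advance both coordinates, I/S advance only the query, D/N advance only the reference. A small worked example (illustrative restatement, not the packaged function):

    import re

    def iter_aligned_pairs(cigar: str, start: int):
        # Same coordinate bookkeeping as the helper above.
        qpos, rpos = 0, start
        for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
            length = int(length_str)
            if op in {"M", "=", "X"}:
                for _ in range(length):
                    yield qpos, rpos
                    qpos += 1
                    rpos += 1
            elif op in {"I", "S"}:
                qpos += length
            elif op in {"D", "N"}:
                rpos += length

    assert list(iter_aligned_pairs("3M1D2M", 10)) == [
        (0, 10), (1, 11), (2, 12),   # 3M: query bases 0-2 align to ref 10-12
        (3, 14), (4, 15),            # 1D skips ref 13; 2M resumes at ref 14
    ]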
@@ -899,46 +1301,137 @@
    )


-def extract_read_features_from_bam(bam_file_path):
-    """
-    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
-    Params:
-        bam_file_path (str):
+def extract_read_features_from_bam(
+    bam_file_path: str | Path, samtools_backend: str | None = "auto"
+) -> Dict[str, List[float]]:
+    """Extract read metrics from a BAM file.
+
+    Args:
+        bam_file_path: Path to the BAM file.
+        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
+
    Returns:
-        read_metrics (dict)
+        Mapping of read name to [read_length, read_median_qscore, reference_length,
+        mapped_length, mapping_quality].
    """
-    # Open the BAM file
    logger.debug(
-        f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
+        "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
+        bam_file_path,
    )
-    with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
-        read_metrics = {}
-        reference_lengths = bam_file.lengths  # List of lengths for each reference (chromosome)
-        for read in bam_file:
-            # Skip unmapped reads
-            if read.is_unmapped:
+    backend_choice = _resolve_samtools_backend(samtools_backend)
+    read_metrics: Dict[str, List[float]] = {}
+
+    if backend_choice == "python":
+        pysam_mod = _require_pysam()
+        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
+            reference_lengths = dict(zip(bam_file.references, bam_file.lengths))
+            for read in bam_file:
+                if read.is_unmapped:
+                    continue
+                read_quality = read.query_qualities
+                if read_quality is None:
+                    median_read_quality = float("nan")
+                else:
+                    median_read_quality = float(np.median(read_quality))
+                reference_length = reference_lengths.get(read.reference_name, float("nan"))
+                mapped_length = sum(end - start for start, end in read.get_blocks())
+                mapping_quality = float(read.mapping_quality)
+                read_metrics[read.query_name] = [
+                    float(read.query_length),
+                    median_read_quality,
+                    float(reference_length),
+                    float(mapped_length),
+                    mapping_quality,
+                ]
+        return read_metrics
+
+    bam_path = Path(bam_file_path)
+
+    def _parse_reference_lengths(header_text: str) -> Dict[str, int]:
+        ref_lengths: Dict[str, int] = {}
+        for line in header_text.splitlines():
+            if not line.startswith("@SQ"):
                continue
-            # Extract the read metrics
-            read_quality = read.query_qualities
-            median_read_quality = np.median(read_quality)
-            # Extract the reference (chromosome) name and its length
-            reference_name = read.reference_name
-            reference_index = bam_file.references.index(reference_name)
-            reference_length = reference_lengths[reference_index]
-            mapped_length = sum(end - start for start, end in read.get_blocks())
-            mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
-            read_metrics[read.query_name] = [
-                read.query_length,
-                median_read_quality,
-                reference_length,
-                mapped_length,
-                mapping_quality,
-            ]
+            fields = line.split("\t")
+            name = None
+            length = None
+            for field in fields[1:]:
+                if field.startswith("SN:"):
+                    name = field.split(":", 1)[1]
+                elif field.startswith("LN:"):
+                    length = int(field.split(":", 1)[1])
+            if name is not None and length is not None:
+                ref_lengths[name] = length
+        return ref_lengths
+
+    def _mapped_length_from_cigar(cigar: str) -> int:
+        mapped = 0
+        for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
+            length = int(length_str)
+            if op in {"M", "=", "X"}:
+                mapped += length
+        return mapped
+
+    header_cp = subprocess.run(
+        ["samtools", "view", "-H", str(bam_path)],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+    if header_cp.returncode != 0:
+        raise RuntimeError(
+            f"samtools view -H failed (exit {header_cp.returncode}):\n{header_cp.stderr}"
+        )
+    reference_lengths = _parse_reference_lengths(header_cp.stdout)
+
+    proc = subprocess.Popen(
+        ["samtools", "view", "-F", "4", str(bam_path)],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    assert proc.stdout is not None
+    for line in proc.stdout:
+        if not line.strip() or line.startswith("@"):
+            continue
+        fields = line.rstrip("\n").split("\t")
+        if len(fields) < 11:
+            continue
+        read_name = fields[0]
+        reference_name = fields[2]
+        mapping_quality = float(fields[4])
+        cigar = fields[5]
+        sequence = fields[9]
+        quality = fields[10]
+        if sequence == "*":
+            read_length = float("nan")
+        else:
+            read_length = float(len(sequence))
+        if quality == "*" or not quality:
+            median_read_quality = float("nan")
+        else:
+            phreds = [ord(char) - 33 for char in quality]
+            median_read_quality = float(np.median(phreds))
+        reference_length = float(reference_lengths.get(reference_name, float("nan")))
+        mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
+        read_metrics[read_name] = [
+            read_length,
+            median_read_quality,
+            reference_length,
+            mapped_length,
+            mapping_quality,
+        ]
+
+    rc = proc.wait()
+    if rc != 0:
+        stderr = proc.stderr.read() if proc.stderr else ""
+        raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    return read_metrics


-def extract_readnames_from_bam(aligned_BAM):
+def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
    """
    Takes a BAM and writes out a txt file containing read names from the BAM

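In the samtools branch of extract_read_features_from_bam above, per-base qualities are recovered from the SAM QUAL string with the standard Phred+33 offset before the median is taken; for example:

    import numpy as np

    qual = "II5!"                      # SAM QUAL string (Phred+33 ASCII)
    phreds = [ord(c) - 33 for c in qual]
    assert phreds == [40, 40, 20, 0]   # 'I' -> 40, '5' -> 20, '!' -> 0
    assert float(np.median(phreds)) == 30.0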
@@ -949,21 +1442,39 @@
        None

    """
-    import subprocess
-
    # Make a text file of reads for the BAM
+    backend_choice = _resolve_samtools_backend(samtools_backend)
    txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
-    samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
-    with open(txt_output, "w") as output_file:
-        cut_process = subprocess.Popen(
-            ["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
-        )
-        samtools_view.stdout.close()
-        cut_process.wait()
-        samtools_view.wait()
+
+    if backend_choice == "python":
+        pysam_mod = _require_pysam()
+        with (
+            pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam,
+            open(txt_output, "w", encoding="utf-8") as output_file,
+        ):
+            for read in bam:
+                output_file.write(f"{read.query_name}\n")
+        return
+
+    samtools_view = subprocess.Popen(
+        ["samtools", "view", aligned_BAM], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+    )
+    assert samtools_view.stdout is not None
+    with open(txt_output, "w", encoding="utf-8") as output_file:
+        for line in samtools_view.stdout:
+            if not line.strip():
+                continue
+            qname = line.split("\t", 1)[0]
+            output_file.write(f"{qname}\n")
+    rc = samtools_view.wait()
+    if rc != 0:
+        stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
+        raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")


-def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
+def separate_bam_by_bc(
+    input_bam, output_prefix, bam_suffix, split_dir, samtools_backend: str | None = "auto"
+):
    """
    Separates an input BAM file on the BC SAM tag values.

@@ -981,34 +1492,80 @@
    bam_base = input_bam.name
    bam_base_minus_suffix = input_bam.stem

-    # Open the input BAM file for reading
-    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
-        # Create a dictionary to store output BAM files
-        output_files = {}
-        # Iterate over each read in the BAM file
-        for read in bam:
-            try:
-                # Get the barcode tag value
-                bc_tag = read.get_tag("BC", with_value_type=True)[0]
-                # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
-                # Open the output BAM file corresponding to the barcode
-                if bc_tag not in output_files:
-                    output_path = (
-                        split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
-                    )
-                    output_files[bc_tag] = pysam.AlignmentFile(
-                        str(output_path), "wb", header=bam.header
-                    )
-                # Write the read to the corresponding output BAM file
-                output_files[bc_tag].write(read)
-            except KeyError:
-                logger.warning(f"BC tag not present for read: {read.query_name}")
-        # Close all output BAM files
-        for output_file in output_files.values():
-            output_file.close()
+    backend_choice = _resolve_samtools_backend(samtools_backend)
+
+    if backend_choice == "python":
+        pysam_mod = _require_pysam()
+        # Open the input BAM file for reading
+        with pysam_mod.AlignmentFile(str(input_bam), "rb") as bam:
+            # Create a dictionary to store output BAM files
+            output_files = {}
+            # Iterate over each read in the BAM file
+            for read in bam:
+                try:
+                    # Get the barcode tag value
+                    bc_tag = read.get_tag("BC", with_value_type=True)[0]
+                    # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
+                    # Open the output BAM file corresponding to the barcode
+                    if bc_tag not in output_files:
+                        output_path = (
+                            split_dir
+                            / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                        )
+                        output_files[bc_tag] = pysam_mod.AlignmentFile(
+                            str(output_path), "wb", header=bam.header
+                        )
+                    # Write the read to the corresponding output BAM file
+                    output_files[bc_tag].write(read)
+                except KeyError:
+                    logger.warning(f"BC tag not present for read: {read.query_name}")
+            # Close all output BAM files
+            for output_file in output_files.values():
+                output_file.close()
+        return
+
+    def _collect_bc_tags() -> set[str]:
+        bc_tags: set[str] = set()
+        proc = subprocess.Popen(
+            ["samtools", "view", str(input_bam)],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        assert proc.stdout is not None
+        for line in proc.stdout:
+            if not line.strip():
+                continue
+            fields = line.rstrip("\n").split("\t")
+            for tag in fields[11:]:
+                if tag.startswith("BC:"):
+                    bc_tags.add(tag.split(":", 2)[2])
+                    break
+        rc = proc.wait()
+        if rc != 0:
+            stderr = proc.stderr.read() if proc.stderr else ""
+            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+        return bc_tags

+    bc_tags = _collect_bc_tags()
+    if not bc_tags:
+        logger.warning("No BC tags found in %s", input_bam)
+        return
+
+    for bc_tag in bc_tags:
+        output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+        cmd = ["samtools", "view", "-b", "-d", f"BC:{bc_tag}", "-o", str(output_path)]
+        cmd.append(str(input_bam))
+        cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(
+                f"samtools view failed for BC={bc_tag} (exit {cp.returncode}):\n{cp.stderr}"
+            )

-def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
+
+def split_and_index_BAM(
+    aligned_sorted_BAM, split_dir, bam_suffix, samtools_backend: str | None = "auto"
+):
    """
    A wrapper function for splitting BAMS and indexing them.
    Parameters:
@@ -1023,12 +1580,22 @@
    logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix
    file_prefix = date_string()
-    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
+    separate_bam_by_bc(
+        aligned_sorted_output,
+        file_prefix,
+        bam_suffix,
+        split_dir,
+        samtools_backend=samtools_backend,
+    )
    # Make a BAM index file for the BAMs in that directory
    bam_pattern = "*" + bam_suffix
    bam_files = glob.glob(split_dir / bam_pattern)
    bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
+    backend_choice = _resolve_samtools_backend(samtools_backend)
    for input_file in bam_files:
-        pysam.index(input_file)
+        if backend_choice == "python":
+            _index_bam_with_pysam(input_file)
+        else:
+            _index_bam_with_samtools(input_file)

    return bam_files