smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,145 @@
1
1
  from __future__ import annotations
2
2
 
3
- from pathlib import Path
3
+ import glob
4
4
  import os
5
+ import re
6
+ import shutil
5
7
  import subprocess
6
- import glob
7
8
  import time
8
- from typing import Dict, List, Any, Tuple, Union, Optional, Iterable
9
- import re
9
+ from collections import Counter, defaultdict, deque
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
11
  from itertools import zip_longest
11
- import pysam
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
12
14
 
13
15
  import numpy as np
14
- import concurrent.futures
15
- from concurrent.futures import ThreadPoolExecutor, as_completed
16
- from concurrent.futures import ProcessPoolExecutor
17
-
18
16
  from tqdm import tqdm
19
- from collections import defaultdict, Counter
20
17
 
21
- from ..readwrite import make_dirs, time_string, date_string
18
+ from smftools.logging_utils import get_logger
19
+ from smftools.optional_imports import require
20
+
21
+ from ..readwrite import date_string, time_string
22
+
23
+ if TYPE_CHECKING:
24
+ import pysam as pysam_types
25
+
26
+ try:
27
+ import pysam
28
+ except Exception:
29
+ pysam = None # type: ignore
30
+
31
+ logger = get_logger(__name__)
32
+
33
+ _PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
34
+ _EMPTY_RE = re.compile(r"^\s*$")
35
+
36
+
37
+ def _require_pysam() -> "pysam_types":
38
+ """Return the pysam module or raise if unavailable."""
39
+ if pysam is not None:
40
+ return pysam
41
+ return require("pysam", extra="pysam", purpose="samtools-compatible Python backend")
42
+
43
+
44
+ def _resolve_samtools_backend(backend: str | None) -> str:
45
+ """Resolve backend choice for samtools-compatible operations.
46
+
47
+ Args:
48
+ backend: One of {"auto", "python", "cli"} (case-insensitive).
49
+
50
+ Returns:
51
+ Resolved backend string ("python" or "cli").
52
+ """
53
+ choice = (backend or "auto").strip().lower()
54
+ if choice not in {"auto", "python", "cli"}:
55
+ raise ValueError("samtools_backend must be one of: auto, python, cli")
56
+
57
+ have_pysam = pysam is not None
58
+ have_samtools = shutil.which("samtools") is not None
59
+
60
+ if choice == "python":
61
+ if not have_pysam:
62
+ raise RuntimeError("samtools_backend=python requires pysam to be installed.")
63
+ return "python"
64
+ if choice == "cli":
65
+ if not have_samtools:
66
+ raise RuntimeError("samtools_backend=cli requires samtools in PATH.")
67
+ return "cli"
68
+
69
+ if have_samtools:
70
+ return "cli"
71
+ if have_pysam:
72
+ return "python"
73
+ raise RuntimeError("Neither pysam nor samtools is available in PATH.")
74
+
75
+
76
+ def _has_bam_index(bam_path: Path) -> bool:
77
+ """Return True if the BAM index exists alongside the BAM."""
78
+ return (
79
+ bam_path.with_suffix(bam_path.suffix + ".bai").exists()
80
+ or Path(str(bam_path) + ".bai").exists()
81
+ )
82
+
83
+
84
+ def _ensure_bam_index(bam_path: Path, backend: str) -> None:
85
+ """Ensure a BAM index exists, creating one if needed."""
86
+ if _has_bam_index(bam_path):
87
+ return
88
+ if backend == "python":
89
+ _index_bam_with_pysam(bam_path)
90
+ else:
91
+ _index_bam_with_samtools(bam_path)
92
+
93
+
94
+ def _parse_idxstats_output(output: str) -> Tuple[int, int, Dict[str, Tuple[int, float]]]:
95
+ """Parse samtools idxstats output into counts and proportions."""
96
+ aligned_reads_count = 0
97
+ unaligned_reads_count = 0
98
+ record_counts: Dict[str, int] = {}
99
+ for line in output.splitlines():
100
+ if not line.strip():
101
+ continue
102
+ ref, _length, mapped, unmapped = line.split("\t")[:4]
103
+ if ref == "*":
104
+ unaligned_reads_count += int(unmapped)
105
+ continue
106
+ mapped_count = int(mapped)
107
+ aligned_reads_count += mapped_count
108
+ record_counts[ref] = mapped_count
109
+
110
+ proportions: Dict[str, Tuple[int, float]] = {}
111
+ for ref, count in record_counts.items():
112
+ proportion = count / aligned_reads_count if aligned_reads_count else 0.0
113
+ proportions[ref] = (count, proportion)
114
+
115
+ return aligned_reads_count, unaligned_reads_count, proportions
116
+
117
+
118
+ def _stream_dorado_logs(stderr_iter) -> None:
119
+ """Stream dorado stderr and emit structured log messages.
120
+
121
+ Args:
122
+ stderr_iter: Iterable of stderr lines.
123
+ """
124
+ last_n: int | None = None
125
+
126
+ for raw in stderr_iter:
127
+ line = raw.rstrip("\n")
128
+ if _EMPTY_RE.match(line):
129
+ continue
130
+
131
+ m = _PROGRESS_RE.search(line)
132
+ if m:
133
+ n = int(m.group(1))
134
+ logger.debug("[dorado] Output records written: %d", n)
135
+ last_n = n
136
+ continue
137
+
138
+ logger.info("[dorado] %s", line)
139
+
140
+ if last_n is not None:
141
+ logger.info("[dorado] Final output records written: %d", last_n)
142
+
22
143
 
23
144
  def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
24
145
  """
@@ -26,7 +147,14 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
26
147
  """
27
148
  bam_path = str(bam_path)
28
149
  fastq_path = str(fastq_path)
29
- with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w", encoding="utf-8") as fq:
150
+
151
+ logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
152
+
153
+ pysam_mod = _require_pysam()
154
+ with (
155
+ pysam_mod.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
156
+ open(fastq_path, "w", encoding="utf-8") as fq,
157
+ ):
30
158
  for r in bam.fetch(until_eof=True):
31
159
  # Optionally skip secondary/supplementary:
32
160
  # if r.is_secondary or r.is_supplementary:
@@ -45,36 +173,98 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
45
173
  # q is an array/list of ints (Phred scores).
46
174
  # Convert to FASTQ string with Phred+33 encoding,
47
175
  # clamping to sane range [0, 93] to stay in printable ASCII.
48
- qual_str = "".join(
49
- chr(min(max(int(qv), 0), 93) + 33)
50
- for qv in q
51
- )
176
+ qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
52
177
 
53
178
  fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
54
179
 
55
- def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
180
+
181
+ def _sort_bam_with_pysam(
182
+ in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
183
+ ) -> None:
184
+ """Sort a BAM file using pysam.
185
+
186
+ Args:
187
+ in_bam: Input BAM path.
188
+ out_bam: Output BAM path.
189
+ threads: Optional thread count.
190
+ """
191
+ logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
56
192
  in_bam, out_bam = str(in_bam), str(out_bam)
57
193
  args = []
58
194
  if threads:
59
195
  args += ["-@", str(threads)]
60
196
  args += ["-o", out_bam, in_bam]
61
- pysam.sort(*args)
197
+ pysam_mod = _require_pysam()
198
+ pysam_mod.sort(*args)
199
+
62
200
 
63
201
  def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
202
+ """Index a BAM file using pysam.
203
+
204
+ Args:
205
+ bam_path: BAM path to index.
206
+ threads: Optional thread count.
207
+ """
64
208
  bam_path = str(bam_path)
209
+ logger.debug(f"Indexing BAM using _index_bam_with_pysam")
210
+ pysam_mod = _require_pysam()
65
211
  # pysam.index supports samtools-style args
66
212
  if threads:
67
- pysam.index("-@", str(threads), bam_path)
213
+ pysam_mod.index("-@", str(threads), bam_path)
68
214
  else:
69
- pysam.index(bam_path)
215
+ pysam_mod.index(bam_path)
216
+
217
+
218
+ def _bam_to_fastq_with_samtools(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
219
+ """Convert BAM to FASTQ using samtools."""
220
+ if not shutil.which("samtools"):
221
+ raise RuntimeError("samtools is required but not available in PATH.")
222
+ cmd = ["samtools", "fastq", str(bam_path)]
223
+ logger.debug("Converting BAM to FASTQ using samtools: %s", " ".join(cmd))
224
+ with open(fastq_path, "w", encoding="utf-8") as fq:
225
+ cp = subprocess.run(cmd, stdout=fq, stderr=subprocess.PIPE, text=True)
226
+ if cp.returncode != 0:
227
+ raise RuntimeError(f"samtools fastq failed (exit {cp.returncode}):\n{cp.stderr}")
70
228
 
71
- def align_and_sort_BAM(fasta,
72
- input,
73
- cfg,
229
+
230
+ def _sort_bam_with_samtools(
231
+ in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
232
+ ) -> None:
233
+ """Sort a BAM file using samtools."""
234
+ if not shutil.which("samtools"):
235
+ raise RuntimeError("samtools is required but not available in PATH.")
236
+ cmd = ["samtools", "sort", "-o", str(out_bam)]
237
+ if threads:
238
+ cmd += ["-@", str(threads)]
239
+ cmd.append(str(in_bam))
240
+ logger.debug("Sorting BAM using samtools: %s", " ".join(cmd))
241
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
242
+ if cp.returncode != 0:
243
+ raise RuntimeError(f"samtools sort failed (exit {cp.returncode}):\n{cp.stderr}")
244
+
245
+
246
+ def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
247
+ """Index a BAM file using samtools."""
248
+ if not shutil.which("samtools"):
249
+ raise RuntimeError("samtools is required but not available in PATH.")
250
+ cmd = ["samtools", "index"]
251
+ if threads:
252
+ cmd += ["-@", str(threads)]
253
+ cmd.append(str(bam_path))
254
+ logger.debug("Indexing BAM using samtools: %s", " ".join(cmd))
255
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
256
+ if cp.returncode != 0:
257
+ raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
258
+
259
+
260
+ def align_and_sort_BAM(
261
+ fasta,
262
+ input,
263
+ cfg,
74
264
  ):
75
265
  """
76
266
  A wrapper for running dorado aligner and samtools functions
77
-
267
+
78
268
  Parameters:
79
269
  fasta (str): File path to the reference genome to align to.
80
270
  input (str): File path to the basecalled file to align. Works for .bam and .fastq files
@@ -84,60 +274,105 @@ def align_and_sort_BAM(fasta,
84
274
  None
85
275
  The function writes out files for: 1) An aligned BAM, 2) and aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
86
276
  """
277
+ logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
87
278
  input_basename = input.name
88
279
  input_suffix = input.suffix
89
- input_as_fastq = input.with_name(input.stem + '.fastq')
280
+ input_as_fastq = input.with_name(input.stem + ".fastq")
90
281
 
91
282
  output_path_minus_suffix = cfg.output_directory / input.stem
92
-
283
+
93
284
  aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
94
285
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
95
- aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
286
+ aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
96
287
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
97
288
 
98
289
  if cfg.threads:
99
290
  threads = str(cfg.threads)
100
291
  else:
101
292
  threads = None
102
-
103
- if cfg.aligner == 'minimap2':
293
+
294
+ samtools_backend = _resolve_samtools_backend(getattr(cfg, "samtools_backend", "auto"))
295
+
296
+ if cfg.aligner == "minimap2":
104
297
  if not cfg.align_from_bam:
105
- print(f"Converting BAM to FASTQ: {input}")
106
- _bam_to_fastq_with_pysam(input, input_as_fastq)
107
- print(f"Aligning FASTQ to Reference: {input_as_fastq}")
298
+ logger.debug(f"Converting BAM to FASTQ: {input}")
299
+ if samtools_backend == "python":
300
+ _bam_to_fastq_with_pysam(input, input_as_fastq)
301
+ else:
302
+ _bam_to_fastq_with_samtools(input, input_as_fastq)
303
+ logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
108
304
  mm_input = input_as_fastq
109
- else:
110
- print(f"Aligning BAM to Reference: {input}")
305
+ else:
306
+ logger.debug(f"Aligning BAM to Reference: {input}")
111
307
  mm_input = input
112
308
 
113
309
  if threads:
114
- minimap_command = ['minimap2'] + cfg.aligner_args + ['-t', threads, str(fasta), str(mm_input)]
310
+ minimap_command = (
311
+ ["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
312
+ )
115
313
  else:
116
- minimap_command = ['minimap2'] + cfg.aligner_args + [str(fasta), str(mm_input)]
117
- subprocess.run(minimap_command, stdout=open(aligned_output, "wb"))
314
+ minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
315
+
316
+ with open(aligned_output, "wb") as out:
317
+ proc = subprocess.Popen(
318
+ minimap_command,
319
+ stdout=out,
320
+ stderr=subprocess.PIPE,
321
+ text=True,
322
+ )
323
+
324
+ assert proc.stderr is not None
325
+ for line in proc.stderr:
326
+ logger.info("[minimap2] %s", line.rstrip())
327
+
328
+ ret = proc.wait()
329
+ if ret != 0:
330
+ raise RuntimeError(f"minimap2 failed with exit code {ret}")
118
331
 
119
332
  if not cfg.align_from_bam:
120
333
  os.remove(input_as_fastq)
121
334
 
122
- elif cfg.aligner == 'dorado':
335
+ elif cfg.aligner == "dorado":
123
336
  # Run dorado aligner
124
337
  print(f"Aligning BAM to Reference: {input}")
125
338
  if threads:
126
- alignment_command = ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
339
+ alignment_command = (
340
+ ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
341
+ )
127
342
  else:
128
343
  alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
129
- subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
130
344
 
345
+ with open(aligned_output, "wb") as out:
346
+ proc = subprocess.Popen(
347
+ alignment_command,
348
+ stdout=out,
349
+ stderr=subprocess.PIPE,
350
+ text=True,
351
+ )
352
+
353
+ assert proc.stderr is not None
354
+ _stream_dorado_logs(proc.stderr)
355
+ ret = proc.wait()
356
+
357
+ if ret != 0:
358
+ raise RuntimeError(f"dorado failed with exit code {ret}")
131
359
  else:
132
- print(f'Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado')
360
+ logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
133
361
  return
134
-
135
- # --- Sort & Index with pysam ---
136
- print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
137
- _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
138
362
 
139
- print(f"[pysam] Indexing: {aligned_sorted_output}")
140
- _index_bam_with_pysam(aligned_sorted_output, threads=threads)
363
+ # --- Sort & Index ---
364
+ logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
365
+ if samtools_backend == "python":
366
+ _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
367
+ else:
368
+ _sort_bam_with_samtools(aligned_output, aligned_sorted_output, threads=threads)
369
+
370
+ logger.debug(f"Indexing: {aligned_sorted_output}")
371
+ if samtools_backend == "python":
372
+ _index_bam_with_pysam(aligned_sorted_output, threads=threads)
373
+ else:
374
+ _index_bam_with_samtools(aligned_sorted_output, threads=threads)
375
+
141
376
 
142
377
  def bam_qc(
143
378
  bam_files: Iterable[str | Path],
@@ -147,6 +382,7 @@ def bam_qc(
147
382
  stats: bool = True,
148
383
  flagstats: bool = True,
149
384
  idxstats: bool = True,
385
+ samtools_backend: str | None = "auto",
150
386
  ) -> None:
151
387
  """
152
388
  QC for BAM/CRAMs: stats, flagstat, idxstats.
@@ -154,132 +390,148 @@ def bam_qc(
154
390
  Runs BAMs in parallel (up to `threads`, default serial).
155
391
  """
156
392
  import subprocess
157
- import shutil
158
393
 
159
- # Try to import pysam once
160
- try:
161
- import pysam
162
- HAVE_PYSAM = True
163
- except Exception:
164
- HAVE_PYSAM = False
394
+ logger.debug("Performing BAM QC using bam_qc")
395
+
396
+ backend_choice = _resolve_samtools_backend(samtools_backend)
397
+ have_pysam = backend_choice == "python"
398
+ pysam_mod = _require_pysam() if have_pysam else None
165
399
 
166
400
  bam_qc_dir = Path(bam_qc_dir)
167
401
  bam_qc_dir.mkdir(parents=True, exist_ok=True)
168
402
 
169
- bam_files = [Path(b) for b in bam_files]
403
+ bam_paths = [Path(b) for b in bam_files]
170
404
 
171
405
  def _has_index(p: Path) -> bool:
172
- if p.suffix.lower() == ".bam":
173
- bai = p.with_suffix(p.suffix + ".bai")
174
- bai_alt = Path(str(p) + ".bai")
175
- return bai.exists() or bai_alt.exists()
176
- if p.suffix.lower() == ".cram":
177
- crai = Path(str(p) + ".crai")
178
- return crai.exists()
406
+ """Return True if a BAM/CRAM index exists for the path."""
407
+ suf = p.suffix.lower()
408
+ if suf == ".bam":
409
+ return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
410
+ if suf == ".cram":
411
+ return Path(str(p) + ".crai").exists()
179
412
  return False
180
413
 
181
414
  def _ensure_index(p: Path) -> None:
415
+ """Ensure a BAM/CRAM index exists, creating one if needed."""
182
416
  if _has_index(p):
183
417
  return
184
- if HAVE_PYSAM:
185
- # pysam.index supports both BAM & CRAM
186
- pysam.index(str(p))
418
+ if have_pysam:
419
+ assert pysam_mod is not None
420
+ pysam_mod.index(str(p)) # supports BAM & CRAM
187
421
  else:
188
422
  cmd = ["samtools", "index", str(p)]
189
- subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
423
+ # capture text so errors are readable; raise on failure
424
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
425
+ if cp.returncode != 0:
426
+ raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
190
427
 
191
- def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
192
- # outputs + return (file, [(task_name, returncode)])
193
- results: List[Tuple[str, int]] = []
194
- base = bam.stem # filename without .bam
428
+ def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
429
+ """
430
+ Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
431
+ """
432
+ last_err = deque(maxlen=80)
433
+ out_path.parent.mkdir(parents=True, exist_ok=True)
434
+
435
+ with open(out_path, "w") as fh:
436
+ proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
437
+ assert proc.stderr is not None
438
+ for line in proc.stderr:
439
+ line = line.rstrip()
440
+ if line:
441
+ last_err.append(line)
442
+ logger.debug("[%s][%s] %s", tag, bam.name, line)
443
+ rc = proc.wait()
444
+
445
+ if rc != 0:
446
+ tail = "\n".join(last_err)
447
+ raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
448
+ return rc
449
+
450
+ def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
451
+ """Run stats/flagstat/idxstats for a single BAM.
452
+
453
+ Args:
454
+ bam: Path to the BAM file.
455
+
456
+ Returns:
457
+ Tuple of (bam_path, list of (stage, return_code)).
458
+ """
459
+ import subprocess
460
+
461
+ results: list[tuple[str, int]] = []
462
+ base = bam.stem # e.g. sample.bam -> sample
195
463
  out_stats = bam_qc_dir / f"{base}_stats.txt"
196
464
  out_flag = bam_qc_dir / f"{base}_flagstat.txt"
197
- out_idx = bam_qc_dir / f"{base}_idxstats.txt"
465
+ out_idx = bam_qc_dir / f"{base}_idxstats.txt"
198
466
 
199
- # Make sure index exists (samtools stats/flagstat dont require, idxstats does)
467
+ # Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
200
468
  try:
201
469
  _ensure_index(bam)
202
470
  except Exception as e:
203
- # Still attempt stats/flagstat if requested
204
- print(f"[warn] Indexing failed for {bam}: {e}")
205
-
206
- # Choose runner per task
207
- def run_stats():
208
- if not stats:
209
- return
210
- if HAVE_PYSAM and hasattr(pysam, "stats"):
211
- txt = pysam.stats(str(bam))
471
+ # Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
472
+ logger.warning("Indexing failed for %s: %s", bam, e)
473
+
474
+ # --- stats ---
475
+ if stats:
476
+ if have_pysam:
477
+ assert pysam_mod is not None
478
+ if not hasattr(pysam_mod, "stats"):
479
+ raise RuntimeError("pysam.stats is unavailable in this pysam build.")
480
+ txt = pysam_mod.stats(str(bam))
212
481
  out_stats.write_text(txt)
213
482
  results.append(("stats(pysam)", 0))
214
483
  else:
215
484
  cmd = ["samtools", "stats", str(bam)]
216
- with open(out_stats, "w") as fh:
217
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
218
- results.append(("stats(samtools)", cp.returncode))
219
- if cp.returncode != 0:
220
- raise RuntimeError(cp.stderr.decode(errors="replace"))
221
-
222
- def run_flagstat():
223
- if not flagstats:
224
- return
225
- if HAVE_PYSAM and hasattr(pysam, "flagstat"):
226
- txt = pysam.flagstat(str(bam))
485
+ rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
486
+ results.append(("stats(samtools)", rc))
487
+
488
+ # --- flagstat ---
489
+ if flagstats:
490
+ if have_pysam:
491
+ assert pysam_mod is not None
492
+ if not hasattr(pysam_mod, "flagstat"):
493
+ raise RuntimeError("pysam.flagstat is unavailable in this pysam build.")
494
+ txt = pysam_mod.flagstat(str(bam))
227
495
  out_flag.write_text(txt)
228
496
  results.append(("flagstat(pysam)", 0))
229
497
  else:
230
498
  cmd = ["samtools", "flagstat", str(bam)]
231
- with open(out_flag, "w") as fh:
232
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
233
- results.append(("flagstat(samtools)", cp.returncode))
234
- if cp.returncode != 0:
235
- raise RuntimeError(cp.stderr.decode(errors="replace"))
236
-
237
- def run_idxstats():
238
- if not idxstats:
239
- return
240
- if HAVE_PYSAM and hasattr(pysam, "idxstats"):
241
- txt = pysam.idxstats(str(bam))
499
+ rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
500
+ results.append(("flagstat(samtools)", rc))
501
+
502
+ # --- idxstats ---
503
+ if idxstats:
504
+ if have_pysam:
505
+ assert pysam_mod is not None
506
+ if not hasattr(pysam_mod, "idxstats"):
507
+ raise RuntimeError("pysam.idxstats is unavailable in this pysam build.")
508
+ txt = pysam_mod.idxstats(str(bam))
242
509
  out_idx.write_text(txt)
243
510
  results.append(("idxstats(pysam)", 0))
244
511
  else:
245
512
  cmd = ["samtools", "idxstats", str(bam)]
246
- with open(out_idx, "w") as fh:
247
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
248
- results.append(("idxstats(samtools)", cp.returncode))
249
- if cp.returncode != 0:
250
- raise RuntimeError(cp.stderr.decode(errors="replace"))
251
-
252
- # Sanity: ensure samtools exists if pysam missing
253
- if not HAVE_PYSAM:
254
- if not shutil.which("samtools"):
255
- raise RuntimeError("Neither pysam nor samtools is available in PATH.")
256
-
257
- # Execute tasks (serial per file; parallelized across files)
258
- run_stats()
259
- run_flagstat()
260
- run_idxstats()
513
+ rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
514
+ results.append(("idxstats(samtools)", rc))
515
+
261
516
  return bam, results
262
517
 
263
- # Parallel across BAMs
264
518
  max_workers = int(threads) if threads and int(threads) > 0 else 1
265
- futures = []
266
- with ThreadPoolExecutor(max_workers=max_workers) as ex:
267
- for b in bam_files:
268
- futures.append(ex.submit(_run_one, b))
269
519
 
270
- for fut in as_completed(futures):
520
+ with ThreadPoolExecutor(max_workers=max_workers) as ex:
521
+ futs = [ex.submit(_run_one, b) for b in bam_paths]
522
+ for fut in as_completed(futs):
271
523
  try:
272
524
  bam, res = fut.result()
273
525
  summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
274
- print(f"[qc] {bam.name}: {summary}")
526
+ logger.info("[qc] %s: %s", bam.name, summary)
275
527
  except Exception as e:
276
- print(f"[error] QC failed: {e}")
528
+ logger.exception("QC failed: %s", e)
529
+
530
+ if modality not in {"conversion", "direct", "deaminase"}:
531
+ logger.warning("Unknown modality '%s', continuing.", modality)
277
532
 
278
- # Placeholders to keep your signature stable
279
- if modality not in {"conversion", "direct"}:
280
- print(f"[warn] Unknown modality '{modality}', continuing.")
533
+ logger.info("QC processing completed.")
281
534
 
282
- print("QC processing completed.")
283
535
 
284
536
  def concatenate_fastqs_to_bam(
285
537
  fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
@@ -290,6 +542,8 @@ def concatenate_fastqs_to_bam(
290
542
  rg_sample_field: Optional[str] = None,
291
543
  progress: bool = True,
292
544
  auto_pair: bool = True,
545
+ gzip_suffixes: Tuple[str, ...] = (".gz", ".gzip"),
546
+ samtools_backend: str | None = "auto",
293
547
  ) -> Dict[str, Any]:
294
548
  """
295
549
  Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
@@ -312,6 +566,10 @@ def concatenate_fastqs_to_bam(
312
566
  Show tqdm progress bars.
313
567
  auto_pair : bool
314
568
  Auto-pair R1/R2 based on filename patterns if given a flat list.
569
+ gzip_suffixes : tuple[str, ...]
570
+ Suffixes treated as gzip-compressed FASTQ files.
571
+ samtools_backend : str | None
572
+ Backend selection for samtools-compatible operations (auto|python|cli).
315
573
 
316
574
  Returns
317
575
  -------
@@ -326,12 +584,30 @@ def concatenate_fastqs_to_bam(
326
584
  """
327
585
  name = p.name
328
586
  lowers = name.lower()
329
- for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
587
+ gzip_exts = tuple(s.lower() for s in gzip_suffixes)
588
+ for ext in (
589
+ *(f".fastq{suf}" for suf in gzip_exts),
590
+ *(f".fq{suf}" for suf in gzip_exts),
591
+ ".fastq.bz2",
592
+ ".fq.bz2",
593
+ ".fastq.xz",
594
+ ".fq.xz",
595
+ ".fastq",
596
+ ".fq",
597
+ ):
330
598
  if lowers.endswith(ext):
331
599
  return name[: -len(ext)]
332
600
  return p.stem # fallback: remove last suffix only
333
601
 
334
602
  def _extract_barcode_from_filename(p: Path) -> str:
603
+ """Extract a barcode token from a FASTQ filename.
604
+
605
+ Args:
606
+ p: FASTQ path.
607
+
608
+ Returns:
609
+ Barcode token string.
610
+ """
335
611
  stem = _strip_fastq_ext(p)
336
612
  if "_" in stem:
337
613
  token = stem.split("_")[-1]
@@ -340,10 +616,18 @@ def concatenate_fastqs_to_bam(
340
616
  return stem
341
617
 
342
618
  def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
619
+ """Classify a FASTQ filename stem into (prefix, read_number).
620
+
621
+ Args:
622
+ stem: Filename stem.
623
+
624
+ Returns:
625
+ Tuple of (prefix, read_number) or (None, None) if not matched.
626
+ """
343
627
  # return (prefix, readnum) if matches; else (None, None)
344
628
  patterns = [
345
- r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
346
- r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
629
+ r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
630
+ r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
347
631
  ]
348
632
  for pat in patterns:
349
633
  m = re.match(pat, stem)
@@ -352,6 +636,14 @@ def concatenate_fastqs_to_bam(
352
636
  return None, None
353
637
 
354
638
  def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
639
+ """Pair FASTQ files based on filename conventions.
640
+
641
+ Args:
642
+ paths: FASTQ paths to pair.
643
+
644
+ Returns:
645
+ Tuple of (paired list, leftover list).
646
+ """
355
647
  pref_map: Dict[str, Dict[int, Path]] = {}
356
648
  unpaired: List[Path] = []
357
649
  for pth in paths:
@@ -373,11 +665,59 @@ def concatenate_fastqs_to_bam(
373
665
  return pairs, leftovers
374
666
 
375
667
  def _fastq_iter(p: Path):
668
+ """Yield FASTQ records using pysam.FastxFile.
669
+
670
+ Args:
671
+ p: FASTQ path.
672
+
673
+ Yields:
674
+ Pysam Fastx records.
675
+ """
376
676
  # pysam.FastxFile handles compressed extensions transparently
377
- with pysam.FastxFile(str(p)) as fx:
677
+ pysam_mod = _require_pysam()
678
+ with pysam_mod.FastxFile(str(p)) as fx:
378
679
  for rec in fx:
379
680
  yield rec # rec.name, rec.sequence, rec.quality
380
681
 
682
+ def _fastq_iter_plain(p: Path) -> Iterable[Tuple[str, str, str]]:
683
+ """Yield FASTQ records from plain-text parsing.
684
+
685
+ Args:
686
+ p: FASTQ path.
687
+
688
+ Yields:
689
+ Tuple of (name, sequence, quality).
690
+ """
691
+ import bz2
692
+ import gzip
693
+ import lzma
694
+
695
+ lowers = p.name.lower()
696
+ if any(lowers.endswith(suf) for suf in (s.lower() for s in gzip_suffixes)):
697
+ handle = gzip.open(p, "rt", encoding="utf-8")
698
+ elif lowers.endswith(".bz2"):
699
+ handle = bz2.open(p, "rt", encoding="utf-8")
700
+ elif lowers.endswith(".xz"):
701
+ handle = lzma.open(p, "rt", encoding="utf-8")
702
+ else:
703
+ handle = p.open("r", encoding="utf-8")
704
+
705
+ with handle as fh:
706
+ while True:
707
+ header = fh.readline()
708
+ if not header:
709
+ break
710
+ seq = fh.readline()
711
+ fh.readline()
712
+ qual = fh.readline()
713
+ if not qual:
714
+ break
715
+ name = header.strip()
716
+ if name.startswith("@"):
717
+ name = name[1:]
718
+ name = name.split()[0]
719
+ yield name, seq.strip(), qual.strip()
720
+
381
721
  def _make_unaligned_segment(
382
722
  name: str,
383
723
  seq: str,
@@ -386,11 +726,25 @@ def concatenate_fastqs_to_bam(
386
726
  read1: bool,
387
727
  read2: bool,
388
728
  ) -> pysam.AlignedSegment:
389
- a = pysam.AlignedSegment()
729
+ """Construct an unaligned pysam.AlignedSegment.
730
+
731
+ Args:
732
+ name: Read name.
733
+ seq: Read sequence.
734
+ qual: FASTQ quality string.
735
+ bc: Barcode string.
736
+ read1: Whether this is read 1.
737
+ read2: Whether this is read 2.
738
+
739
+ Returns:
740
+ Unaligned pysam.AlignedSegment.
741
+ """
742
+ pysam_mod = _require_pysam()
743
+ a = pysam_mod.AlignedSegment()
390
744
  a.query_name = name
391
745
  a.query_sequence = seq
392
746
  if qual is not None:
393
- a.query_qualities = pysam.qualitystring_to_array(qual)
747
+ a.query_qualities = pysam_mod.qualitystring_to_array(qual)
394
748
  a.is_unmapped = True
395
749
  a.is_paired = read1 or read2
396
750
  a.is_read1 = read1
@@ -406,8 +760,51 @@ def concatenate_fastqs_to_bam(
406
760
  a.set_tag("RG", str(bc), value_type="Z")
407
761
  return a
408
762
 
763
+ def _write_sam_line(
764
+ handle,
765
+ name: str,
766
+ seq: str,
767
+ qual: str,
768
+ bc: str,
769
+ *,
770
+ read1: bool,
771
+ read2: bool,
772
+ add_read_group: bool,
773
+ ) -> None:
774
+ """Write a single unaligned SAM record to a text stream."""
775
+ if read1:
776
+ flag = 77
777
+ elif read2:
778
+ flag = 141
779
+ else:
780
+ flag = 4
781
+ tags = [f"{barcode_tag}:Z:{bc}"]
782
+ if add_read_group:
783
+ tags.append(f"RG:Z:{bc}")
784
+ tag_str = "\t".join(tags)
785
+ if not qual:
786
+ qual = "*"
787
+ line = "\t".join(
788
+ [
789
+ name,
790
+ str(flag),
791
+ "*",
792
+ "0",
793
+ "0",
794
+ "*",
795
+ "*",
796
+ "0",
797
+ "0",
798
+ seq,
799
+ qual,
800
+ tag_str,
801
+ ]
802
+ )
803
+ handle.write(f"{line}\n")
804
+
409
805
  # ---------- normalize inputs to Path ----------
410
806
  def _to_path_pair(x) -> Tuple[Path, Path]:
807
+ """Convert a tuple of path-like objects to Path instances."""
411
808
  a, b = x
412
809
  return Path(a), Path(b)
413
810
 
@@ -450,7 +847,10 @@ def concatenate_fastqs_to_bam(
450
847
  # ---------- BAM header ----------
451
848
  header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
452
849
  if add_read_group:
453
- header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
850
+ header["RG"] = [
851
+ {"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
852
+ for bc in barcodes_in_order
853
+ ]
454
854
  header.setdefault("PG", []).append(
455
855
  {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
456
856
  )
@@ -462,7 +862,29 @@ def concatenate_fastqs_to_bam(
462
862
  singletons_written = 0
463
863
 
464
864
  # ---------- write BAM ----------
465
- with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
865
+ backend_choice = _resolve_samtools_backend(samtools_backend)
866
+ if backend_choice == "python":
867
+ pysam_mod = _require_pysam()
868
+ bam_out_ctx = pysam_mod.AlignmentFile(str(output_bam), "wb", header=header)
869
+ else:
870
+ cmd = ["samtools", "view", "-b", "-o", str(output_bam), "-"]
871
+ logger.debug("Writing BAM using samtools: %s", " ".join(cmd))
872
+ bam_out_ctx = subprocess.Popen(
873
+ cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
874
+ )
875
+ assert bam_out_ctx.stdin is not None
876
+ header_lines = ["@HD\tVN:1.6\tSO:unknown"]
877
+ if add_read_group:
878
+ for bc in barcodes_in_order:
879
+ rg_fields = [f"ID:{bc}"]
880
+ if rg_sample_field:
881
+ rg_fields.append(f"SM:{rg_sample_field}")
882
+ rg_body = "\t".join(rg_fields)
883
+ header_lines.append(f"@RG\t{rg_body}")
884
+ header_lines.append("@PG\tID:concat-fastq\tPN:concatenate_fastqs_to_bam\tVN:1")
885
+ bam_out_ctx.stdin.write("\n".join(header_lines) + "\n")
886
+
887
+ try:
466
888
  # Paired
467
889
  it_pairs = explicit_pairs
468
890
  if progress and it_pairs:
@@ -472,30 +894,83 @@ def concatenate_fastqs_to_bam(
472
894
  raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
473
895
  bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
474
896
 
475
- it1 = _fastq_iter(r1_path)
476
- it2 = _fastq_iter(r2_path)
897
+ if backend_choice == "python":
898
+ it1 = _fastq_iter(r1_path)
899
+ it2 = _fastq_iter(r2_path)
900
+ else:
901
+ it1 = _fastq_iter_plain(r1_path)
902
+ it2 = _fastq_iter_plain(r2_path)
477
903
 
478
904
  for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
905
+
479
906
  def _clean(n: Optional[str]) -> Optional[str]:
907
+ """Normalize FASTQ read names by trimming read suffixes."""
480
908
  if n is None:
481
909
  return None
482
910
  return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
483
911
 
484
912
  name = (
485
- _clean(getattr(rec1, "name", None))
486
- or _clean(getattr(rec2, "name", None))
487
- or getattr(rec1, "name", None)
488
- or getattr(rec2, "name", None)
913
+ _clean(getattr(rec1, "name", None) if backend_choice == "python" else rec1[0])
914
+ if rec1 is not None
915
+ else None
489
916
  )
917
+ if name is None:
918
+ name = (
919
+ _clean(
920
+ getattr(rec2, "name", None) if backend_choice == "python" else rec2[0]
921
+ )
922
+ if rec2 is not None
923
+ else None
924
+ )
925
+ if name is None:
926
+ name = (
927
+ getattr(rec1, "name", None)
928
+ if backend_choice == "python" and rec1 is not None
929
+ else (rec1[0] if rec1 is not None else None)
930
+ )
931
+ if name is None:
932
+ name = (
933
+ getattr(rec2, "name", None)
934
+ if backend_choice == "python" and rec2 is not None
935
+ else (rec2[0] if rec2 is not None else None)
936
+ )
490
937
 
491
938
  if rec1 is not None:
492
- a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
493
- bam_out.write(a1)
939
+ if backend_choice == "python":
940
+ a1 = _make_unaligned_segment(
941
+ name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
942
+ )
943
+ bam_out_ctx.write(a1)
944
+ else:
945
+ _write_sam_line(
946
+ bam_out_ctx.stdin,
947
+ name,
948
+ rec1[1],
949
+ rec1[2],
950
+ bc,
951
+ read1=True,
952
+ read2=False,
953
+ add_read_group=add_read_group,
954
+ )
494
955
  per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
495
956
  total_written += 1
496
957
  if rec2 is not None:
497
- a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
498
- bam_out.write(a2)
958
+ if backend_choice == "python":
959
+ a2 = _make_unaligned_segment(
960
+ name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
961
+ )
962
+ bam_out_ctx.write(a2)
963
+ else:
964
+ _write_sam_line(
965
+ bam_out_ctx.stdin,
966
+ name,
967
+ rec2[1],
968
+ rec2[2],
969
+ bc,
970
+ read1=False,
971
+ read2=True,
972
+ add_read_group=add_read_group,
973
+ )
499
974
  per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
500
975
  total_written += 1
501
976
 
@@ -515,12 +990,40 @@ def concatenate_fastqs_to_bam(
515
990
  if not pth.exists():
516
991
  raise FileNotFoundError(pth)
517
992
  bc = per_path_barcode.get(pth, "barcode")
518
- for rec in _fastq_iter(pth):
519
- a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
520
- bam_out.write(a)
993
+ if backend_choice == "python":
994
+ iterator = _fastq_iter(pth)
995
+ else:
996
+ iterator = _fastq_iter_plain(pth)
997
+ for rec in iterator:
998
+ if backend_choice == "python":
999
+ a = _make_unaligned_segment(
1000
+ rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
1001
+ )
1002
+ bam_out_ctx.write(a)
1003
+ else:
1004
+ _write_sam_line(
1005
+ bam_out_ctx.stdin,
1006
+ rec[0],
1007
+ rec[1],
1008
+ rec[2],
1009
+ bc,
1010
+ read1=False,
1011
+ read2=False,
1012
+ add_read_group=add_read_group,
1013
+ )
521
1014
  per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
522
1015
  total_written += 1
523
1016
  singletons_written += 1
1017
+ finally:
1018
+ if backend_choice == "python":
1019
+ bam_out_ctx.close()
1020
+ else:
1021
+ if bam_out_ctx.stdin is not None:
1022
+ bam_out_ctx.stdin.close()
1023
+ rc = bam_out_ctx.wait()
1024
+ if rc != 0:
1025
+ stderr = bam_out_ctx.stderr.read() if bam_out_ctx.stderr else ""
1026
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
524
1027
 
525
1028
  return {
526
1029
  "total_reads": total_written,
@@ -530,43 +1033,61 @@ def concatenate_fastqs_to_bam(
530
1033
  "barcodes": barcodes_in_order,
531
1034
  }
532
1035
 
533
- def count_aligned_reads(bam_file):
1036
+
1037
+ def count_aligned_reads(bam_file, samtools_backend: str | None = "auto"):
534
1038
  """
535
1039
  Counts the number of aligned reads in a bam file that map to each reference record.
536
-
1040
+
537
1041
  Parameters:
538
1042
  bam_file (str): A string representing the path to an aligned BAM file.
539
-
1043
+
540
1044
  Returns:
541
1045
  aligned_reads_count (int): The total number or reads aligned in the BAM.
542
1046
  unaligned_reads_count (int): The total number of reads not aligned in the BAM.
543
1047
  record_counts (dict): A dictionary keyed by reference record instance that points toa tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
544
1048
 
545
1049
  """
546
- print('{0}: Counting aligned reads in BAM > {1}'.format(time_string(), bam_file))
1050
+ logger.info("Counting aligned reads in BAM > {}".format(bam_file.name))
1051
+ backend_choice = _resolve_samtools_backend(samtools_backend)
547
1052
  aligned_reads_count = 0
548
1053
  unaligned_reads_count = 0
549
- # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
550
- record_counts = defaultdict(int)
551
-
552
- with pysam.AlignmentFile(str(bam_file), "rb") as bam:
553
- total_reads = bam.mapped + bam.unmapped
554
- # Iterate over reads to get the total mapped read counts and the reads that map to each reference
555
- for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
556
- if read.is_unmapped:
557
- unaligned_reads_count += 1
558
- else:
559
- aligned_reads_count += 1
560
- record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
561
1054
 
562
- # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
563
- for reference in record_counts:
564
- proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
565
- record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
1055
+ if backend_choice == "python":
1056
+ pysam_mod = _require_pysam()
1057
+ record_counts = defaultdict(int)
1058
+ with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
1059
+ total_reads = bam.mapped + bam.unmapped
1060
+ # Iterate over reads to get the total mapped read counts and the reads that map to each reference
1061
+ for read in bam:
1062
+ if read.is_unmapped:
1063
+ unaligned_reads_count += 1
1064
+ else:
1065
+ aligned_reads_count += 1
1066
+ record_counts[read.reference_name] += (
1067
+ 1 # Automatically increments if key exists, adds if not
1068
+ )
1069
+
1070
+ # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
1071
+ for reference in record_counts:
1072
+ proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
1073
+ record_counts[reference] = (
1074
+ record_counts[reference],
1075
+ proportion_mapped_reads_in_record,
1076
+ )
1077
+ return aligned_reads_count, unaligned_reads_count, dict(record_counts)
1078
+
1079
+ bam_path = Path(bam_file)
1080
+ _ensure_bam_index(bam_path, backend_choice)
1081
+ cmd = ["samtools", "idxstats", str(bam_path)]
1082
+ cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
1083
+ if cp.returncode != 0:
1084
+ raise RuntimeError(f"samtools idxstats failed (exit {cp.returncode}):\n{cp.stderr}")
1085
+ return _parse_idxstats_output(cp.stdout)
566
1086
 
567
- return aligned_reads_count, unaligned_reads_count, dict(record_counts)
568
1087
 
569
- def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads):
1088
+ def demux_and_index_BAM(
1089
+ aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
1090
+ ):
570
1091
  """
571
1092
  A wrapper function for splitting BAMS and indexing them.
572
1093
  Parameters:
@@ -577,11 +1098,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
577
1098
  barcode_both_ends (bool): Whether to require both ends to be barcoded.
578
1099
  trim (bool): Whether to trim off barcodes after demultiplexing.
579
1100
  threads (int): Number of threads to use.
580
-
1101
+
581
1102
  Returns:
582
1103
  bam_files (list): List of split BAM file path strings
583
1104
  Splits an input BAM file on barcode value and makes a BAM index file.
584
1105
  """
1106
+
585
1107
  input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
586
1108
  command = ["dorado", "demux", "--kit-name", barcode_kit]
587
1109
  if barcode_both_ends:
@@ -594,25 +1116,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
594
1116
  pass
595
1117
  command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
596
1118
  command.append(str(input_bam))
597
- command_string = ' '.join(command)
598
- print(f"Running: {command_string}")
599
- subprocess.run(command)
1119
+ command_string = " ".join(command)
1120
+ logger.info("Running dorado demux: %s", " ".join(command))
1121
+
1122
+ proc = subprocess.Popen(
1123
+ command,
1124
+ stdout=subprocess.PIPE,
1125
+ stderr=subprocess.PIPE,
1126
+ text=True,
1127
+ )
1128
+
1129
+ assert proc.stderr is not None
1130
+ _stream_dorado_logs(proc.stderr)
1131
+ rc = proc.wait()
1132
+
1133
+ if rc != 0:
1134
+ raise RuntimeError(f"dorado demux failed with exit code {rc}")
600
1135
 
601
1136
  bam_files = sorted(
602
- p for p in split_dir.glob(f"*{bam_suffix}")
603
- if p.is_file() and p.suffix == bam_suffix
1137
+ p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
604
1138
  )
605
1139
 
606
1140
  if not bam_files:
607
1141
  raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
608
-
1142
+
609
1143
  # ---- Optional renaming with prefix ----
610
1144
  renamed_bams = []
611
1145
  prefix = "de" if barcode_both_ends else "se"
612
1146
 
613
1147
  for bam in bam_files:
614
1148
  bam = Path(bam)
615
- bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
1149
+ bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
616
1150
 
617
1151
  if prefix:
618
1152
  new_name = f"{prefix}_{bam.name}"
@@ -628,10 +1162,18 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
628
1162
  bai.rename(new_bai)
629
1163
 
630
1164
  renamed_bams.append(new_bam)
631
-
1165
+
632
1166
  return renamed_bams
633
1167
 
634
- def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
1168
+
1169
+ def extract_base_identities(
1170
+ bam_file,
1171
+ chromosome,
1172
+ positions,
1173
+ max_reference_length,
1174
+ sequence,
1175
+ samtools_backend: str | None = "auto",
1176
+ ):
635
1177
  """
636
1178
  Efficiently extracts base identities from mapped reads with reference coordinates.
637
1179
 
@@ -646,38 +1188,95 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
646
1188
  dict: Base identities from forward mapped reads.
647
1189
  dict: Base identities from reverse mapped reads.
648
1190
  """
1191
+ logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
649
1192
  timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
650
1193
 
651
1194
  positions = set(positions)
652
- fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
653
- rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
1195
+ fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
1196
+ rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
654
1197
  mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
655
1198
 
656
- #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
657
- with pysam.AlignmentFile(str(bam_file), "rb") as bam:
658
- total_reads = bam.mapped
659
- ref_seq = sequence.upper()
660
- for read in bam.fetch(chromosome):
661
- if not read.is_mapped:
662
- continue # Skip unmapped reads
1199
+ backend_choice = _resolve_samtools_backend(samtools_backend)
1200
+ ref_seq = sequence.upper()
1201
+
1202
+ if backend_choice == "python":
1203
+ logger.debug("Extracting base identities using python")
1204
+ pysam_mod = _require_pysam()
1205
+ # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
1206
+ with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
1207
+ total_reads = bam.mapped
1208
+ for read in bam.fetch(chromosome):
1209
+ if not read.is_mapped:
1210
+ continue # Skip unmapped reads
663
1211
 
664
- read_name = read.query_name
665
- query_sequence = read.query_sequence
666
- base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
1212
+ read_name = read.query_name
1213
+ query_sequence = read.query_sequence
1214
+ base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
667
1215
 
668
- # Use get_aligned_pairs directly with positions filtering
669
- aligned_pairs = read.get_aligned_pairs(matches_only=True)
1216
+ # Use get_aligned_pairs directly with positions filtering
1217
+ aligned_pairs = read.get_aligned_pairs(matches_only=True)
670
1218
 
671
- for read_position, reference_position in aligned_pairs:
672
- if reference_position in positions:
1219
+ for read_position, reference_position in aligned_pairs:
673
1220
  read_base = query_sequence[read_position]
674
1221
  ref_base = ref_seq[reference_position]
1222
+ if reference_position in positions:
1223
+ base_dict[read_name][reference_position] = read_base
675
1224
 
676
- base_dict[read_name][reference_position] = read_base
677
-
678
- # Track mismatches (excluding Ns)
679
- if read_base != ref_base and read_base != 'N' and ref_base != 'N':
1225
+ # Track mismatches (excluding Ns)
1226
+ if read_base != ref_base and read_base != "N" and ref_base != "N":
1227
+ mismatch_counts_per_read[read_name][ref_base][read_base] += 1
1228
+ else:
1229
+ bam_path = Path(bam_file)
1230
+ logger.debug("Extracting base identities using samtools")
1231
+ _ensure_bam_index(bam_path, backend_choice)
1232
+
1233
+ def _iter_aligned_pairs(cigar: str, start: int) -> Iterable[Tuple[int, int]]:
1234
+ qpos = 0
1235
+ rpos = start
1236
+ for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
1237
+ length = int(length_str)
1238
+ if op in {"M", "=", "X"}:
1239
+ for _ in range(length):
1240
+ yield qpos, rpos
1241
+ qpos += 1
1242
+ rpos += 1
1243
+ elif op in {"I", "S"}:
1244
+ qpos += length
1245
+ elif op in {"D", "N"}:
1246
+ rpos += length
1247
+ elif op in {"H", "P"}:
1248
+ continue
1249
+
1250
+ cmd = ["samtools", "view", "-F", "4", str(bam_path), chromosome]
1251
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
1252
+ assert proc.stdout is not None
1253
+ for line in proc.stdout:
1254
+ if not line.strip() or line.startswith("@"):
1255
+ continue
1256
+ fields = line.rstrip("\n").split("\t")
1257
+ if len(fields) < 11:
1258
+ continue
1259
+ read_name = fields[0]
1260
+ flag = int(fields[1])
1261
+ pos = int(fields[3])
1262
+ cigar = fields[5]
1263
+ query_sequence = fields[9]
1264
+ if cigar == "*" or query_sequence == "*":
1265
+ continue
1266
+ base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
1267
+ for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
1268
+ if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
1269
+ continue
1270
+ read_base = query_sequence[read_pos]
1271
+ ref_base = ref_seq[ref_pos]
1272
+ if ref_pos in positions:
1273
+ base_dict[read_name][ref_pos] = read_base
1274
+ if read_base != ref_base and read_base != "N" and ref_base != "N":
680
1275
  mismatch_counts_per_read[read_name][ref_base][read_base] += 1
1276
+ rc = proc.wait()
1277
+ if rc != 0:
1278
+ stderr = proc.stderr.read() if proc.stderr else ""
1279
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
681
1280
 
682
1281
  # Determine C→T vs G→A dominance per read
683
1282
  mismatch_trend_per_read = {}
@@ -694,39 +1293,145 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
694
1293
  else:
695
1294
  mismatch_trend_per_read[read_name] = "none"
696
1295
 
697
- return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
1296
+ return (
1297
+ dict(fwd_base_identities),
1298
+ dict(rev_base_identities),
1299
+ dict(mismatch_counts_per_read),
1300
+ mismatch_trend_per_read,
1301
+ )
1302
+
1303
+
1304
+ def extract_read_features_from_bam(
1305
+ bam_file_path: str | Path, samtools_backend: str | None = "auto"
1306
+ ) -> Dict[str, List[float]]:
1307
+ """Extract read metrics from a BAM file.
1308
+
1309
+ Args:
1310
+ bam_file_path: Path to the BAM file.
1311
+ samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
698
1312
 
699
- def extract_read_features_from_bam(bam_file_path):
700
- """
701
- Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
702
- Params:
703
- bam_file_path (str):
704
1313
  Returns:
705
- read_metrics (dict)
1314
+ Mapping of read name to [read_length, read_median_qscore, reference_length,
1315
+ mapped_length, mapping_quality].
706
1316
  """
707
- # Open the BAM file
708
- print(f'Extracting read features from BAM: {bam_file_path}')
709
- with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
710
- read_metrics = {}
711
- reference_lengths = bam_file.lengths # List of lengths for each reference (chromosome)
712
- for read in bam_file:
713
- # Skip unmapped reads
714
- if read.is_unmapped:
1317
+ logger.debug(
1318
+ "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
1319
+ bam_file_path,
1320
+ )
1321
+ backend_choice = _resolve_samtools_backend(samtools_backend)
1322
+ read_metrics: Dict[str, List[float]] = {}
1323
+
1324
+ if backend_choice == "python":
1325
+ pysam_mod = _require_pysam()
1326
+ with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
1327
+ reference_lengths = dict(zip(bam_file.references, bam_file.lengths))
1328
+ for read in bam_file:
1329
+ if read.is_unmapped:
1330
+ continue
1331
+ read_quality = read.query_qualities
1332
+ if read_quality is None:
1333
+ median_read_quality = float("nan")
1334
+ else:
1335
+ median_read_quality = float(np.median(read_quality))
1336
+ reference_length = reference_lengths.get(read.reference_name, float("nan"))
1337
+ mapped_length = sum(end - start for start, end in read.get_blocks())
1338
+ mapping_quality = float(read.mapping_quality)
1339
+ read_metrics[read.query_name] = [
1340
+ float(read.query_length),
1341
+ median_read_quality,
1342
+ float(reference_length),
1343
+ float(mapped_length),
1344
+ mapping_quality,
1345
+ ]
1346
+ return read_metrics
1347
+
1348
+ bam_path = Path(bam_file_path)
1349
+
1350
+ def _parse_reference_lengths(header_text: str) -> Dict[str, int]:
1351
+ ref_lengths: Dict[str, int] = {}
1352
+ for line in header_text.splitlines():
1353
+ if not line.startswith("@SQ"):
715
1354
  continue
716
- # Extract the read metrics
717
- read_quality = read.query_qualities
718
- median_read_quality = np.median(read_quality)
719
- # Extract the reference (chromosome) name and its length
720
- reference_name = read.reference_name
721
- reference_index = bam_file.references.index(reference_name)
722
- reference_length = reference_lengths[reference_index]
723
- mapped_length = sum(end - start for start, end in read.get_blocks())
724
- mapping_quality = read.mapping_quality # Phred-scaled MAPQ
725
- read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
1355
+ fields = line.split("\t")
1356
+ name = None
1357
+ length = None
1358
+ for field in fields[1:]:
1359
+ if field.startswith("SN:"):
1360
+ name = field.split(":", 1)[1]
1361
+ elif field.startswith("LN:"):
1362
+ length = int(field.split(":", 1)[1])
1363
+ if name is not None and length is not None:
1364
+ ref_lengths[name] = length
1365
+ return ref_lengths
1366
+
1367
+ def _mapped_length_from_cigar(cigar: str) -> int:
1368
+ mapped = 0
1369
+ for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
1370
+ length = int(length_str)
1371
+ if op in {"M", "=", "X"}:
1372
+ mapped += length
1373
+ return mapped
1374
+
1375
+ header_cp = subprocess.run(
1376
+ ["samtools", "view", "-H", str(bam_path)],
1377
+ stdout=subprocess.PIPE,
1378
+ stderr=subprocess.PIPE,
1379
+ text=True,
1380
+ check=False,
1381
+ )
1382
+ if header_cp.returncode != 0:
1383
+ raise RuntimeError(
1384
+ f"samtools view -H failed (exit {header_cp.returncode}):\n{header_cp.stderr}"
1385
+ )
1386
+ reference_lengths = _parse_reference_lengths(header_cp.stdout)
1387
+
1388
+ proc = subprocess.Popen(
1389
+ ["samtools", "view", "-F", "4", str(bam_path)],
1390
+ stdout=subprocess.PIPE,
1391
+ stderr=subprocess.PIPE,
1392
+ text=True,
1393
+ )
1394
+ assert proc.stdout is not None
1395
+ for line in proc.stdout:
1396
+ if not line.strip() or line.startswith("@"):
1397
+ continue
1398
+ fields = line.rstrip("\n").split("\t")
1399
+ if len(fields) < 11:
1400
+ continue
1401
+ read_name = fields[0]
1402
+ reference_name = fields[2]
1403
+ mapping_quality = float(fields[4])
1404
+ cigar = fields[5]
1405
+ sequence = fields[9]
1406
+ quality = fields[10]
1407
+ if sequence == "*":
1408
+ read_length = float("nan")
1409
+ else:
1410
+ read_length = float(len(sequence))
1411
+ if quality == "*" or not quality:
1412
+ median_read_quality = float("nan")
1413
+ else:
1414
+ phreds = [ord(char) - 33 for char in quality]
1415
+ median_read_quality = float(np.median(phreds))
1416
+ reference_length = float(reference_lengths.get(reference_name, float("nan")))
1417
+ mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
1418
+ read_metrics[read_name] = [
1419
+ read_length,
1420
+ median_read_quality,
1421
+ reference_length,
1422
+ mapped_length,
1423
+ mapping_quality,
1424
+ ]
1425
+
1426
+ rc = proc.wait()
1427
+ if rc != 0:
1428
+ stderr = proc.stderr.read() if proc.stderr else ""
1429
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
726
1430
 
727
1431
  return read_metrics
728
1432
 
729
- def extract_readnames_from_bam(aligned_BAM):
1433
+
1434
+ def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
730
1435
  """
731
1436
  Takes a BAM and writes out a txt file containing read names from the BAM
732
1437
 
@@ -737,17 +1442,39 @@ def extract_readnames_from_bam(aligned_BAM):
737
1442
  None
738
1443
 
739
1444
  """
740
- import subprocess
741
1445
  # Make a text file of reads for the BAM
742
- txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
743
- samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
744
- with open(txt_output, "w") as output_file:
745
- cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
746
- samtools_view.stdout.close()
747
- cut_process.wait()
748
- samtools_view.wait()
749
-
750
- def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
1446
+ backend_choice = _resolve_samtools_backend(samtools_backend)
1447
+ txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
1448
+
1449
+ if backend_choice == "python":
1450
+ pysam_mod = _require_pysam()
1451
+ with (
1452
+ pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam,
1453
+ open(txt_output, "w", encoding="utf-8") as output_file,
1454
+ ):
1455
+ for read in bam:
1456
+ output_file.write(f"{read.query_name}\n")
1457
+ return
1458
+
1459
+ samtools_view = subprocess.Popen(
1460
+ ["samtools", "view", aligned_BAM], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
1461
+ )
1462
+ assert samtools_view.stdout is not None
1463
+ with open(txt_output, "w", encoding="utf-8") as output_file:
1464
+ for line in samtools_view.stdout:
1465
+ if not line.strip():
1466
+ continue
1467
+ qname = line.split("\t", 1)[0]
1468
+ output_file.write(f"{qname}\n")
1469
+ rc = samtools_view.wait()
1470
+ if rc != 0:
1471
+ stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
1472
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
1473
+
1474
+
1475
+ def separate_bam_by_bc(
1476
+ input_bam, output_prefix, bam_suffix, split_dir, samtools_backend: str | None = "auto"
1477
+ ):
751
1478
  """
752
1479
  Separates an input BAM file on the BC SAM tag values.
753
1480
 
@@ -756,56 +1483,119 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
756
1483
  output_prefix (str): A prefix to append to the output BAM.
757
1484
  bam_suffix (str): A suffix to add to the bam file.
758
1485
  split_dir (str): String indicating path to directory to split BAMs into
759
-
1486
+
760
1487
  Returns:
761
1488
  None
762
1489
  Writes out split BAM files.
763
1490
  """
1491
+ logger.debug("Demultiplexing BAM based on the BC tag")
764
1492
  bam_base = input_bam.name
765
1493
  bam_base_minus_suffix = input_bam.stem
766
1494
 
767
- # Open the input BAM file for reading
768
- with pysam.AlignmentFile(str(input_bam), "rb") as bam:
769
- # Create a dictionary to store output BAM files
770
- output_files = {}
771
- # Iterate over each read in the BAM file
772
- for read in bam:
773
- try:
774
- # Get the barcode tag value
775
- bc_tag = read.get_tag("BC", with_value_type=True)[0]
776
- #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
777
- # Open the output BAM file corresponding to the barcode
778
- if bc_tag not in output_files:
779
- output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
780
- output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
781
- # Write the read to the corresponding output BAM file
782
- output_files[bc_tag].write(read)
783
- except KeyError:
784
- print(f"BC tag not present for read: {read.query_name}")
785
- # Close all output BAM files
786
- for output_file in output_files.values():
787
- output_file.close()
788
-
789
- def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
1495
+ backend_choice = _resolve_samtools_backend(samtools_backend)
1496
+
1497
+ if backend_choice == "python":
1498
+ pysam_mod = _require_pysam()
1499
+ # Open the input BAM file for reading
1500
+ with pysam_mod.AlignmentFile(str(input_bam), "rb") as bam:
1501
+ # Create a dictionary to store output BAM files
1502
+ output_files = {}
1503
+ # Iterate over each read in the BAM file
1504
+ for read in bam:
1505
+ try:
1506
+ # Get the barcode tag value
1507
+ bc_tag = read.get_tag("BC", with_value_type=True)[0]
1508
+ # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
1509
+ # Open the output BAM file corresponding to the barcode
1510
+ if bc_tag not in output_files:
1511
+ output_path = (
1512
+ split_dir
1513
+ / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
1514
+ )
1515
+ output_files[bc_tag] = pysam_mod.AlignmentFile(
1516
+ str(output_path), "wb", header=bam.header
1517
+ )
1518
+ # Write the read to the corresponding output BAM file
1519
+ output_files[bc_tag].write(read)
1520
+ except KeyError:
1521
+ logger.warning(f"BC tag not present for read: {read.query_name}")
1522
+ # Close all output BAM files
1523
+ for output_file in output_files.values():
1524
+ output_file.close()
1525
+ return
1526
+
1527
+ def _collect_bc_tags() -> set[str]:
1528
+ bc_tags: set[str] = set()
1529
+ proc = subprocess.Popen(
1530
+ ["samtools", "view", str(input_bam)],
1531
+ stdout=subprocess.PIPE,
1532
+ stderr=subprocess.PIPE,
1533
+ text=True,
1534
+ )
1535
+ assert proc.stdout is not None
1536
+ for line in proc.stdout:
1537
+ if not line.strip():
1538
+ continue
1539
+ fields = line.rstrip("\n").split("\t")
1540
+ for tag in fields[11:]:
1541
+ if tag.startswith("BC:"):
1542
+ bc_tags.add(tag.split(":", 2)[2])
1543
+ break
1544
+ rc = proc.wait()
1545
+ if rc != 0:
1546
+ stderr = proc.stderr.read() if proc.stderr else ""
1547
+ raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
1548
+ return bc_tags
1549
+
1550
+ bc_tags = _collect_bc_tags()
1551
+ if not bc_tags:
1552
+ logger.warning("No BC tags found in %s", input_bam)
1553
+ return
1554
+
1555
+ for bc_tag in bc_tags:
1556
+ output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
1557
+ cmd = ["samtools", "view", "-b", "-d", f"BC:{bc_tag}", "-o", str(output_path)]
1558
+ cmd.append(str(input_bam))
1559
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
1560
+ if cp.returncode != 0:
1561
+ raise RuntimeError(
1562
+ f"samtools view failed for BC={bc_tag} (exit {cp.returncode}):\n{cp.stderr}"
1563
+ )
1564
+
1565
+
1566
+ def split_and_index_BAM(
1567
+ aligned_sorted_BAM, split_dir, bam_suffix, samtools_backend: str | None = "auto"
1568
+ ):
790
1569
  """
791
1570
  A wrapper function for splitting BAMS and indexing them.
792
1571
  Parameters:
793
1572
  aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
794
1573
  split_dir (str): A string representing the file path to the directory to split the BAMs into.
795
1574
  bam_suffix (str): A suffix to add to the bam file.
796
-
1575
+
797
1576
  Returns:
798
1577
  None
799
1578
  Splits an input BAM file on barcode value and makes a BAM index file.
800
1579
  """
1580
+ logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
801
1581
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
802
1582
  file_prefix = date_string()
803
- separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
1583
+ separate_bam_by_bc(
1584
+ aligned_sorted_output,
1585
+ file_prefix,
1586
+ bam_suffix,
1587
+ split_dir,
1588
+ samtools_backend=samtools_backend,
1589
+ )
804
1590
  # Make a BAM index file for the BAMs in that directory
805
- bam_pattern = '*' + bam_suffix
1591
+ bam_pattern = "*" + bam_suffix
806
1592
  bam_files = glob.glob(split_dir / bam_pattern)
807
- bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
1593
+ bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
1594
+ backend_choice = _resolve_samtools_backend(samtools_backend)
808
1595
  for input_file in bam_files:
809
- pysam.index(input_file)
1596
+ if backend_choice == "python":
1597
+ _index_bam_with_pysam(input_file)
1598
+ else:
1599
+ _index_bam_with_samtools(input_file)
810
1600
 
811
- return bam_files
1601
+ return bam_files