smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,26 +3,131 @@ from __future__ import annotations
|
|
|
3
3
|
import glob
|
|
4
4
|
import os
|
|
5
5
|
import re
|
|
6
|
+
import shutil
|
|
6
7
|
import subprocess
|
|
7
8
|
import time
|
|
8
9
|
from collections import Counter, defaultdict, deque
|
|
9
10
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
11
|
from itertools import zip_longest
|
|
11
12
|
from pathlib import Path
|
|
12
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
13
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
13
14
|
|
|
14
15
|
import numpy as np
|
|
15
|
-
import pysam
|
|
16
16
|
from tqdm import tqdm
|
|
17
17
|
|
|
18
|
+
from smftools.constants import MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
|
|
18
19
|
from smftools.logging_utils import get_logger
|
|
20
|
+
from smftools.optional_imports import require
|
|
19
21
|
|
|
20
22
|
from ..readwrite import date_string, time_string
|
|
21
23
|
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
import pysam as pysam_types
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
import pysam
|
|
29
|
+
except Exception:
|
|
30
|
+
pysam = None # type: ignore
|
|
31
|
+
|
|
22
32
|
logger = get_logger(__name__)
|
|
23
33
|
|
|
24
34
|
_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
|
|
25
35
|
_EMPTY_RE = re.compile(r"^\s*$")
|
|
36
|
+
_BAM_FLAG_BITS: Tuple[Tuple[int, str], ...] = (
|
|
37
|
+
(0x1, "paired"),
|
|
38
|
+
(0x2, "proper_pair"),
|
|
39
|
+
(0x4, "unmapped"),
|
|
40
|
+
(0x8, "mate_unmapped"),
|
|
41
|
+
(0x10, "reverse"),
|
|
42
|
+
(0x20, "mate_reverse"),
|
|
43
|
+
(0x40, "read1"),
|
|
44
|
+
(0x80, "read2"),
|
|
45
|
+
(0x100, "secondary"),
|
|
46
|
+
(0x200, "qc_fail"),
|
|
47
|
+
(0x400, "duplicate"),
|
|
48
|
+
(0x800, "supplementary"),
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _require_pysam() -> Any:
    """Return the ``pysam`` module, deferring to ``require`` when it is missing.

    Returns:
        The module-level ``pysam`` object when the guarded import at the top of
        this file succeeded; otherwise whatever ``require`` returns.

    Note:
        ``require`` presumably raises an informative error when the package
        cannot be imported -- confirm against ``smftools.optional_imports``.
    """
    # Annotated as ``Any``: the previous annotation named a module object
    # (``pysam_types``), and a module is not a valid type expression.
    if pysam is None:
        return require("pysam", extra="pysam", purpose="samtools-compatible Python backend")
    return pysam
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _resolve_samtools_backend(backend: str | None) -> str:
|
|
60
|
+
"""Resolve backend choice for samtools-compatible operations.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
backend: One of {"auto", "python", "cli"} (case-insensitive).
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
Resolved backend string ("python" or "cli").
|
|
67
|
+
"""
|
|
68
|
+
choice = (backend or "auto").strip().lower()
|
|
69
|
+
if choice not in {"auto", "python", "cli"}:
|
|
70
|
+
raise ValueError("samtools_backend must be one of: auto, python, cli")
|
|
71
|
+
|
|
72
|
+
have_pysam = pysam is not None
|
|
73
|
+
have_samtools = shutil.which("samtools") is not None
|
|
74
|
+
|
|
75
|
+
if choice == "python":
|
|
76
|
+
if not have_pysam:
|
|
77
|
+
raise RuntimeError("samtools_backend=python requires pysam to be installed.")
|
|
78
|
+
return "python"
|
|
79
|
+
if choice == "cli":
|
|
80
|
+
if not have_samtools:
|
|
81
|
+
raise RuntimeError("samtools_backend=cli requires samtools in PATH.")
|
|
82
|
+
return "cli"
|
|
83
|
+
|
|
84
|
+
if have_samtools:
|
|
85
|
+
return "cli"
|
|
86
|
+
if have_pysam:
|
|
87
|
+
return "python"
|
|
88
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _has_bam_index(bam_path: Path) -> bool:
|
|
92
|
+
"""Return True if the BAM index exists alongside the BAM."""
|
|
93
|
+
return (
|
|
94
|
+
bam_path.with_suffix(bam_path.suffix + ".bai").exists()
|
|
95
|
+
or Path(str(bam_path) + ".bai").exists()
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _ensure_bam_index(bam_path: Path, backend: str) -> None:
    """Build an index for ``bam_path`` unless one is already present.

    Args:
        bam_path: BAM file that needs an index.
        backend: "python" builds the index through pysam; any other value goes
            through the samtools executable.
    """
    if not _has_bam_index(bam_path):
        build_index = _index_bam_with_pysam if backend == "python" else _index_bam_with_samtools
        build_index(bam_path)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _parse_idxstats_output(output: str) -> Tuple[int, int, Dict[str, Tuple[int, float]]]:
|
|
110
|
+
"""Parse samtools idxstats output into counts and proportions."""
|
|
111
|
+
aligned_reads_count = 0
|
|
112
|
+
unaligned_reads_count = 0
|
|
113
|
+
record_counts: Dict[str, int] = {}
|
|
114
|
+
for line in output.splitlines():
|
|
115
|
+
if not line.strip():
|
|
116
|
+
continue
|
|
117
|
+
ref, _length, mapped, unmapped = line.split("\t")[:4]
|
|
118
|
+
if ref == "*":
|
|
119
|
+
unaligned_reads_count += int(unmapped)
|
|
120
|
+
continue
|
|
121
|
+
mapped_count = int(mapped)
|
|
122
|
+
aligned_reads_count += mapped_count
|
|
123
|
+
record_counts[ref] = mapped_count
|
|
124
|
+
|
|
125
|
+
proportions: Dict[str, Tuple[int, float]] = {}
|
|
126
|
+
for ref, count in record_counts.items():
|
|
127
|
+
proportion = count / aligned_reads_count if aligned_reads_count else 0.0
|
|
128
|
+
proportions[ref] = (count, proportion)
|
|
129
|
+
|
|
130
|
+
return aligned_reads_count, unaligned_reads_count, proportions
|
|
26
131
|
|
|
27
132
|
|
|
28
133
|
def _stream_dorado_logs(stderr_iter) -> None:
|
|
@@ -60,8 +165,9 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
60
165
|
|
|
61
166
|
logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
|
|
62
167
|
|
|
168
|
+
pysam_mod = _require_pysam()
|
|
63
169
|
with (
|
|
64
|
-
|
|
170
|
+
pysam_mod.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
|
|
65
171
|
open(fastq_path, "w", encoding="utf-8") as fq,
|
|
66
172
|
):
|
|
67
173
|
for r in bam.fetch(until_eof=True):
|
|
@@ -103,7 +209,8 @@ def _sort_bam_with_pysam(
|
|
|
103
209
|
if threads:
|
|
104
210
|
args += ["-@", str(threads)]
|
|
105
211
|
args += ["-o", out_bam, in_bam]
|
|
106
|
-
|
|
212
|
+
pysam_mod = _require_pysam()
|
|
213
|
+
pysam_mod.sort(*args)
|
|
107
214
|
|
|
108
215
|
|
|
109
216
|
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
@@ -115,16 +222,60 @@ def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = N
|
|
|
115
222
|
"""
|
|
116
223
|
bam_path = str(bam_path)
|
|
117
224
|
logger.debug(f"Indexing BAM using _index_bam_with_pysam")
|
|
225
|
+
pysam_mod = _require_pysam()
|
|
118
226
|
# pysam.index supports samtools-style args
|
|
119
227
|
if threads:
|
|
120
|
-
|
|
228
|
+
pysam_mod.index("-@", str(threads), bam_path)
|
|
121
229
|
else:
|
|
122
|
-
|
|
230
|
+
pysam_mod.index(bam_path)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _bam_to_fastq_with_samtools(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """Convert a BAM file to FASTQ by running ``samtools fastq``.

    The FASTQ text that samtools prints to stdout is written to ``fastq_path``
    (created or truncated). If samtools exits with a non-zero status, the
    partially written FASTQ is removed so it cannot be mistaken for a complete
    conversion.

    Args:
        bam_path: Input BAM path.
        fastq_path: Output FASTQ path.

    Raises:
        RuntimeError: If ``samtools`` is not on PATH, or the command exits with
            a non-zero status (the message includes samtools' stderr).
    """
    if not shutil.which("samtools"):
        raise RuntimeError("samtools is required but not available in PATH.")
    cmd = ["samtools", "fastq", str(bam_path)]
    logger.debug("Converting BAM to FASTQ using samtools: %s", " ".join(cmd))
    with open(fastq_path, "w", encoding="utf-8") as fq:
        cp = subprocess.run(cmd, stdout=fq, stderr=subprocess.PIPE, text=True)
    # The handle is closed here, so the file can be deleted on every platform.
    if cp.returncode != 0:
        try:
            os.remove(fastq_path)
        except OSError as cleanup_error:
            # Best effort only: the samtools failure below is the error that matters.
            logger.debug("Could not remove partial FASTQ %s: %s", fastq_path, cleanup_error)
        raise RuntimeError(f"samtools fastq failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _sort_bam_with_samtools(
    in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
) -> None:
    """Run ``samtools sort`` on ``in_bam`` and write the result to ``out_bam``.

    Args:
        in_bam: BAM file to sort.
        out_bam: Destination passed to ``-o``.
        threads: When truthy, forwarded to samtools as ``-@ <threads>``.

    Raises:
        RuntimeError: If ``samtools`` is not on PATH, or the command exits with
            a non-zero status (the message includes samtools' stderr).
    """
    if not shutil.which("samtools"):
        raise RuntimeError("samtools is required but not available in PATH.")

    thread_args = ["-@", str(threads)] if threads else []
    cmd = ["samtools", "sort", "-o", str(out_bam), *thread_args, str(in_bam)]
    logger.debug("Sorting BAM using samtools: %s", " ".join(cmd))

    result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
    if result.returncode == 0:
        return
    raise RuntimeError(f"samtools sort failed (exit {result.returncode}):\n{result.stderr}")
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Run ``samtools index`` on ``bam_path``.

    Args:
        bam_path: BAM file to index.
        threads: When truthy, forwarded to samtools as ``-@ <threads>``.

    Raises:
        RuntimeError: If ``samtools`` is not on PATH, or the command exits with
            a non-zero status (the message includes samtools' stderr).
    """
    if not shutil.which("samtools"):
        raise RuntimeError("samtools is required but not available in PATH.")

    thread_args = ["-@", str(threads)] if threads else []
    cmd = ["samtools", "index", *thread_args, str(bam_path)]
    logger.debug("Indexing BAM using samtools: %s", " ".join(cmd))

    result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
    if result.returncode == 0:
        return
    raise RuntimeError(f"samtools index failed (exit {result.returncode}):\n{result.stderr}")
|
|
123
273
|
|
|
124
274
|
|
|
125
275
|
def align_and_sort_BAM(
|
|
126
276
|
fasta,
|
|
127
277
|
input,
|
|
278
|
+
output,
|
|
128
279
|
cfg,
|
|
129
280
|
):
|
|
130
281
|
"""
|
|
@@ -144,10 +295,9 @@ def align_and_sort_BAM(
|
|
|
144
295
|
input_suffix = input.suffix
|
|
145
296
|
input_as_fastq = input.with_name(input.stem + ".fastq")
|
|
146
297
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
|
|
298
|
+
aligned_BAM = output.parent / output.stem
|
|
150
299
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
300
|
+
|
|
151
301
|
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
152
302
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
153
303
|
|
|
@@ -156,10 +306,15 @@ def align_and_sort_BAM(
|
|
|
156
306
|
else:
|
|
157
307
|
threads = None
|
|
158
308
|
|
|
309
|
+
samtools_backend = _resolve_samtools_backend(getattr(cfg, "samtools_backend", "auto"))
|
|
310
|
+
|
|
159
311
|
if cfg.aligner == "minimap2":
|
|
160
312
|
if not cfg.align_from_bam:
|
|
161
313
|
logger.debug(f"Converting BAM to FASTQ: {input}")
|
|
162
|
-
|
|
314
|
+
if samtools_backend == "python":
|
|
315
|
+
_bam_to_fastq_with_pysam(input, input_as_fastq)
|
|
316
|
+
else:
|
|
317
|
+
_bam_to_fastq_with_samtools(input, input_as_fastq)
|
|
163
318
|
logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
|
|
164
319
|
mm_input = input_as_fastq
|
|
165
320
|
else:
|
|
@@ -220,12 +375,18 @@ def align_and_sort_BAM(
|
|
|
220
375
|
logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
|
|
221
376
|
return
|
|
222
377
|
|
|
223
|
-
# --- Sort & Index
|
|
378
|
+
# --- Sort & Index ---
|
|
224
379
|
logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
|
|
225
|
-
|
|
380
|
+
if samtools_backend == "python":
|
|
381
|
+
_sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
|
|
382
|
+
else:
|
|
383
|
+
_sort_bam_with_samtools(aligned_output, aligned_sorted_output, threads=threads)
|
|
226
384
|
|
|
227
385
|
logger.debug(f"Indexing: {aligned_sorted_output}")
|
|
228
|
-
|
|
386
|
+
if samtools_backend == "python":
|
|
387
|
+
_index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
388
|
+
else:
|
|
389
|
+
_index_bam_with_samtools(aligned_sorted_output, threads=threads)
|
|
229
390
|
|
|
230
391
|
|
|
231
392
|
def bam_qc(
|
|
@@ -236,25 +397,20 @@ def bam_qc(
|
|
|
236
397
|
stats: bool = True,
|
|
237
398
|
flagstats: bool = True,
|
|
238
399
|
idxstats: bool = True,
|
|
400
|
+
samtools_backend: str | None = "auto",
|
|
239
401
|
) -> None:
|
|
240
402
|
"""
|
|
241
403
|
QC for BAM/CRAMs: stats, flagstat, idxstats.
|
|
242
404
|
Prefers pysam; falls back to `samtools` if needed.
|
|
243
405
|
Runs BAMs in parallel (up to `threads`, default serial).
|
|
244
406
|
"""
|
|
245
|
-
import shutil
|
|
246
407
|
import subprocess
|
|
247
408
|
|
|
248
409
|
logger.debug("Performing BAM QC using bam_qc")
|
|
249
410
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
have_pysam = True
|
|
255
|
-
except Exception:
|
|
256
|
-
pysam = None # type: ignore
|
|
257
|
-
have_pysam = False
|
|
411
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
412
|
+
have_pysam = backend_choice == "python"
|
|
413
|
+
pysam_mod = _require_pysam() if have_pysam else None
|
|
258
414
|
|
|
259
415
|
bam_qc_dir = Path(bam_qc_dir)
|
|
260
416
|
bam_qc_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -275,11 +431,9 @@ def bam_qc(
|
|
|
275
431
|
if _has_index(p):
|
|
276
432
|
return
|
|
277
433
|
if have_pysam:
|
|
278
|
-
assert
|
|
279
|
-
|
|
434
|
+
assert pysam_mod is not None
|
|
435
|
+
pysam_mod.index(str(p)) # supports BAM & CRAM
|
|
280
436
|
else:
|
|
281
|
-
if not shutil.which("samtools"):
|
|
282
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
283
437
|
cmd = ["samtools", "index", str(p)]
|
|
284
438
|
# capture text so errors are readable; raise on failure
|
|
285
439
|
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
@@ -300,7 +454,7 @@ def bam_qc(
|
|
|
300
454
|
line = line.rstrip()
|
|
301
455
|
if line:
|
|
302
456
|
last_err.append(line)
|
|
303
|
-
logger.
|
|
457
|
+
logger.debug("[%s][%s] %s", tag, bam.name, line)
|
|
304
458
|
rc = proc.wait()
|
|
305
459
|
|
|
306
460
|
if rc != 0:
|
|
@@ -332,16 +486,13 @@ def bam_qc(
|
|
|
332
486
|
# Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
|
|
333
487
|
logger.warning("Indexing failed for %s: %s", bam, e)
|
|
334
488
|
|
|
335
|
-
if not have_pysam:
|
|
336
|
-
import shutil
|
|
337
|
-
|
|
338
|
-
if not shutil.which("samtools"):
|
|
339
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
340
|
-
|
|
341
489
|
# --- stats ---
|
|
342
490
|
if stats:
|
|
343
|
-
if have_pysam
|
|
344
|
-
|
|
491
|
+
if have_pysam:
|
|
492
|
+
assert pysam_mod is not None
|
|
493
|
+
if not hasattr(pysam_mod, "stats"):
|
|
494
|
+
raise RuntimeError("pysam.stats is unavailable in this pysam build.")
|
|
495
|
+
txt = pysam_mod.stats(str(bam))
|
|
345
496
|
out_stats.write_text(txt)
|
|
346
497
|
results.append(("stats(pysam)", 0))
|
|
347
498
|
else:
|
|
@@ -351,8 +502,11 @@ def bam_qc(
|
|
|
351
502
|
|
|
352
503
|
# --- flagstat ---
|
|
353
504
|
if flagstats:
|
|
354
|
-
if have_pysam
|
|
355
|
-
|
|
505
|
+
if have_pysam:
|
|
506
|
+
assert pysam_mod is not None
|
|
507
|
+
if not hasattr(pysam_mod, "flagstat"):
|
|
508
|
+
raise RuntimeError("pysam.flagstat is unavailable in this pysam build.")
|
|
509
|
+
txt = pysam_mod.flagstat(str(bam))
|
|
356
510
|
out_flag.write_text(txt)
|
|
357
511
|
results.append(("flagstat(pysam)", 0))
|
|
358
512
|
else:
|
|
@@ -362,8 +516,11 @@ def bam_qc(
|
|
|
362
516
|
|
|
363
517
|
# --- idxstats ---
|
|
364
518
|
if idxstats:
|
|
365
|
-
if have_pysam
|
|
366
|
-
|
|
519
|
+
if have_pysam:
|
|
520
|
+
assert pysam_mod is not None
|
|
521
|
+
if not hasattr(pysam_mod, "idxstats"):
|
|
522
|
+
raise RuntimeError("pysam.idxstats is unavailable in this pysam build.")
|
|
523
|
+
txt = pysam_mod.idxstats(str(bam))
|
|
367
524
|
out_idx.write_text(txt)
|
|
368
525
|
results.append(("idxstats(pysam)", 0))
|
|
369
526
|
else:
|
|
@@ -400,6 +557,8 @@ def concatenate_fastqs_to_bam(
|
|
|
400
557
|
rg_sample_field: Optional[str] = None,
|
|
401
558
|
progress: bool = True,
|
|
402
559
|
auto_pair: bool = True,
|
|
560
|
+
gzip_suffixes: Tuple[str, ...] = (".gz", ".gzip"),
|
|
561
|
+
samtools_backend: str | None = "auto",
|
|
403
562
|
) -> Dict[str, Any]:
|
|
404
563
|
"""
|
|
405
564
|
Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
|
|
@@ -422,6 +581,10 @@ def concatenate_fastqs_to_bam(
|
|
|
422
581
|
Show tqdm progress bars.
|
|
423
582
|
auto_pair : bool
|
|
424
583
|
Auto-pair R1/R2 based on filename patterns if given a flat list.
|
|
584
|
+
gzip_suffixes : tuple[str, ...]
|
|
585
|
+
Suffixes treated as gzip-compressed FASTQ files.
|
|
586
|
+
samtools_backend : str | None
|
|
587
|
+
Backend selection for samtools-compatible operations (auto|python|cli).
|
|
425
588
|
|
|
426
589
|
Returns
|
|
427
590
|
-------
|
|
@@ -436,9 +599,10 @@ def concatenate_fastqs_to_bam(
|
|
|
436
599
|
"""
|
|
437
600
|
name = p.name
|
|
438
601
|
lowers = name.lower()
|
|
602
|
+
gzip_exts = tuple(s.lower() for s in gzip_suffixes)
|
|
439
603
|
for ext in (
|
|
440
|
-
".fastq
|
|
441
|
-
".fq
|
|
604
|
+
*(f".fastq{suf}" for suf in gzip_exts),
|
|
605
|
+
*(f".fq{suf}" for suf in gzip_exts),
|
|
442
606
|
".fastq.bz2",
|
|
443
607
|
".fq.bz2",
|
|
444
608
|
".fastq.xz",
|
|
@@ -525,10 +689,50 @@ def concatenate_fastqs_to_bam(
|
|
|
525
689
|
Pysam Fastx records.
|
|
526
690
|
"""
|
|
527
691
|
# pysam.FastxFile handles compressed extensions transparently
|
|
528
|
-
|
|
692
|
+
pysam_mod = _require_pysam()
|
|
693
|
+
with pysam_mod.FastxFile(str(p)) as fx:
|
|
529
694
|
for rec in fx:
|
|
530
695
|
yield rec # rec.name, rec.sequence, rec.quality
|
|
531
696
|
|
|
697
|
+
def _fastq_iter_plain(p: Path) -> Iterable[Tuple[str, str, str]]:
    """Yield FASTQ records parsed as plain text, four lines per record.

    The decompressor is chosen from the file name: names ending in one of the
    enclosing function's ``gzip_suffixes`` are read with ``gzip``, ``.bz2`` with
    ``bz2`` and ``.xz`` with ``lzma``; anything else is opened as plain text.

    Blank lines found where a header is expected are skipped, so stray or
    trailing empty lines neither shift the four-line framing nor crash the
    parser. A final record that ends before its quality line is dropped.

    Args:
        p: FASTQ path.

    Yields:
        Tuple of (name, sequence, quality). ``name`` is the first
        whitespace-delimited token of the header with a leading ``@`` removed.

    Raises:
        ValueError: If a header line carries no read name.
    """
    import bz2
    import gzip
    import lzma

    lowers = p.name.lower()
    gzip_exts = tuple(s.lower() for s in gzip_suffixes)
    if lowers.endswith(gzip_exts):
        handle = gzip.open(p, "rt", encoding="utf-8")
    elif lowers.endswith(".bz2"):
        handle = bz2.open(p, "rt", encoding="utf-8")
    elif lowers.endswith(".xz"):
        handle = lzma.open(p, "rt", encoding="utf-8")
    else:
        handle = p.open("r", encoding="utf-8")

    with handle as fh:
        while True:
            header = fh.readline()
            if not header:
                break  # end of file
            if not header.strip():
                continue  # blank line between or after records
            seq = fh.readline()
            fh.readline()  # "+" separator line, content unused
            qual = fh.readline()
            if not qual:
                break  # truncated final record
            name = header.strip()
            if name.startswith("@"):
                name = name[1:]
            tokens = name.split()
            if not tokens:
                # Previously surfaced as an IndexError from ``split()[0]``.
                raise ValueError(f"FASTQ header without a read name in {p}: {header!r}")
            yield tokens[0], seq.strip(), qual.strip()
|
|
735
|
+
|
|
532
736
|
def _make_unaligned_segment(
|
|
533
737
|
name: str,
|
|
534
738
|
seq: str,
|
|
@@ -550,11 +754,12 @@ def concatenate_fastqs_to_bam(
|
|
|
550
754
|
Returns:
|
|
551
755
|
Unaligned pysam.AlignedSegment.
|
|
552
756
|
"""
|
|
553
|
-
|
|
757
|
+
pysam_mod = _require_pysam()
|
|
758
|
+
a = pysam_mod.AlignedSegment()
|
|
554
759
|
a.query_name = name
|
|
555
760
|
a.query_sequence = seq
|
|
556
761
|
if qual is not None:
|
|
557
|
-
a.query_qualities =
|
|
762
|
+
a.query_qualities = pysam_mod.qualitystring_to_array(qual)
|
|
558
763
|
a.is_unmapped = True
|
|
559
764
|
a.is_paired = read1 or read2
|
|
560
765
|
a.is_read1 = read1
|
|
@@ -570,6 +775,48 @@ def concatenate_fastqs_to_bam(
|
|
|
570
775
|
a.set_tag("RG", str(bc), value_type="Z")
|
|
571
776
|
return a
|
|
572
777
|
|
|
778
|
+
def _write_sam_line(
    handle,
    name: str,
    seq: str,
    qual: str,
    bc: str,
    *,
    read1: bool,
    read2: bool,
    add_read_group: bool,
) -> None:
    """Write a single unaligned SAM record to a text stream.

    Args:
        handle: Writable text stream; receives one tab-separated line.
        name: Read name (QNAME).
        seq: Read sequence; written as ``*`` when empty.
        qual: Quality string; written as ``*`` when empty or when ``seq`` is
            empty (SAM does not allow a QUAL value next to a ``*`` SEQ).
        bc: Barcode value, stored under the enclosing function's
            ``barcode_tag`` and, optionally, as the ``RG`` tag.
        read1: Mark the record as the first read of a pair (FLAG 77).
        read2: Mark the record as the second read of a pair (FLAG 141);
            ignored when ``read1`` is true.
        add_read_group: Also emit ``RG:Z:<bc>``.
    """
    if read1:
        flag = 77  # 0x1 paired | 0x4 unmapped | 0x8 mate unmapped | 0x40 read1
    elif read2:
        flag = 141  # 0x1 paired | 0x4 unmapped | 0x8 mate unmapped | 0x80 read2
    else:
        flag = 4  # 0x4 unmapped, single-end
    tags = [f"{barcode_tag}:Z:{bc}"]
    if add_read_group:
        tags.append(f"RG:Z:{bc}")
    if not seq:
        # An empty SEQ column is not valid SAM; "*" marks it as absent, and
        # QUAL must then be "*" as well.
        seq = "*"
        qual = "*"
    elif not qual:
        qual = "*"
    # Columns: QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL tags...
    fields = [name, str(flag), "*", "0", "0", "*", "*", "0", "0", seq, qual, *tags]
    handle.write("\t".join(fields) + "\n")
|
|
819
|
+
|
|
573
820
|
# ---------- normalize inputs to Path ----------
|
|
574
821
|
def _to_path_pair(x) -> Tuple[Path, Path]:
|
|
575
822
|
"""Convert a tuple of path-like objects to Path instances."""
|
|
@@ -630,7 +877,29 @@ def concatenate_fastqs_to_bam(
|
|
|
630
877
|
singletons_written = 0
|
|
631
878
|
|
|
632
879
|
# ---------- write BAM ----------
|
|
633
|
-
|
|
880
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
881
|
+
if backend_choice == "python":
|
|
882
|
+
pysam_mod = _require_pysam()
|
|
883
|
+
bam_out_ctx = pysam_mod.AlignmentFile(str(output_bam), "wb", header=header)
|
|
884
|
+
else:
|
|
885
|
+
cmd = ["samtools", "view", "-b", "-o", str(output_bam), "-"]
|
|
886
|
+
logger.debug("Writing BAM using samtools: %s", " ".join(cmd))
|
|
887
|
+
bam_out_ctx = subprocess.Popen(
|
|
888
|
+
cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
|
|
889
|
+
)
|
|
890
|
+
assert bam_out_ctx.stdin is not None
|
|
891
|
+
header_lines = ["@HD\tVN:1.6\tSO:unknown"]
|
|
892
|
+
if add_read_group:
|
|
893
|
+
for bc in barcodes_in_order:
|
|
894
|
+
rg_fields = [f"ID:{bc}"]
|
|
895
|
+
if rg_sample_field:
|
|
896
|
+
rg_fields.append(f"SM:{rg_sample_field}")
|
|
897
|
+
rg_body = "\t".join(rg_fields)
|
|
898
|
+
header_lines.append(f"@RG\t{rg_body}")
|
|
899
|
+
header_lines.append("@PG\tID:concat-fastq\tPN:concatenate_fastqs_to_bam\tVN:1")
|
|
900
|
+
bam_out_ctx.stdin.write("\n".join(header_lines) + "\n")
|
|
901
|
+
|
|
902
|
+
try:
|
|
634
903
|
# Paired
|
|
635
904
|
it_pairs = explicit_pairs
|
|
636
905
|
if progress and it_pairs:
|
|
@@ -640,8 +909,12 @@ def concatenate_fastqs_to_bam(
|
|
|
640
909
|
raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
|
|
641
910
|
bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
|
|
642
911
|
|
|
643
|
-
|
|
644
|
-
|
|
912
|
+
if backend_choice == "python":
|
|
913
|
+
it1 = _fastq_iter(r1_path)
|
|
914
|
+
it2 = _fastq_iter(r2_path)
|
|
915
|
+
else:
|
|
916
|
+
it1 = _fastq_iter_plain(r1_path)
|
|
917
|
+
it2 = _fastq_iter_plain(r2_path)
|
|
645
918
|
|
|
646
919
|
for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
|
|
647
920
|
|
|
@@ -652,24 +925,67 @@ def concatenate_fastqs_to_bam(
|
|
|
652
925
|
return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
|
|
653
926
|
|
|
654
927
|
name = (
|
|
655
|
-
_clean(getattr(rec1, "name", None))
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
or getattr(rec2, "name", None)
|
|
928
|
+
_clean(getattr(rec1, "name", None) if backend_choice == "python" else rec1[0])
|
|
929
|
+
if rec1 is not None
|
|
930
|
+
else None
|
|
659
931
|
)
|
|
932
|
+
if name is None:
|
|
933
|
+
name = (
|
|
934
|
+
_clean(
|
|
935
|
+
getattr(rec2, "name", None) if backend_choice == "python" else rec2[0]
|
|
936
|
+
)
|
|
937
|
+
if rec2 is not None
|
|
938
|
+
else None
|
|
939
|
+
)
|
|
940
|
+
if name is None:
|
|
941
|
+
name = (
|
|
942
|
+
getattr(rec1, "name", None)
|
|
943
|
+
if backend_choice == "python" and rec1 is not None
|
|
944
|
+
else (rec1[0] if rec1 is not None else None)
|
|
945
|
+
)
|
|
946
|
+
if name is None:
|
|
947
|
+
name = (
|
|
948
|
+
getattr(rec2, "name", None)
|
|
949
|
+
if backend_choice == "python" and rec2 is not None
|
|
950
|
+
else (rec2[0] if rec2 is not None else None)
|
|
951
|
+
)
|
|
660
952
|
|
|
661
953
|
if rec1 is not None:
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
954
|
+
if backend_choice == "python":
|
|
955
|
+
a1 = _make_unaligned_segment(
|
|
956
|
+
name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
|
|
957
|
+
)
|
|
958
|
+
bam_out_ctx.write(a1)
|
|
959
|
+
else:
|
|
960
|
+
_write_sam_line(
|
|
961
|
+
bam_out_ctx.stdin,
|
|
962
|
+
name,
|
|
963
|
+
rec1[1],
|
|
964
|
+
rec1[2],
|
|
965
|
+
bc,
|
|
966
|
+
read1=True,
|
|
967
|
+
read2=False,
|
|
968
|
+
add_read_group=add_read_group,
|
|
969
|
+
)
|
|
666
970
|
per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
|
|
667
971
|
total_written += 1
|
|
668
972
|
if rec2 is not None:
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
973
|
+
if backend_choice == "python":
|
|
974
|
+
a2 = _make_unaligned_segment(
|
|
975
|
+
name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
|
|
976
|
+
)
|
|
977
|
+
bam_out_ctx.write(a2)
|
|
978
|
+
else:
|
|
979
|
+
_write_sam_line(
|
|
980
|
+
bam_out_ctx.stdin,
|
|
981
|
+
name,
|
|
982
|
+
rec2[1],
|
|
983
|
+
rec2[2],
|
|
984
|
+
bc,
|
|
985
|
+
read1=False,
|
|
986
|
+
read2=True,
|
|
987
|
+
add_read_group=add_read_group,
|
|
988
|
+
)
|
|
673
989
|
per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
|
|
674
990
|
total_written += 1
|
|
675
991
|
|
|
@@ -689,14 +1005,40 @@ def concatenate_fastqs_to_bam(
|
|
|
689
1005
|
if not pth.exists():
|
|
690
1006
|
raise FileNotFoundError(pth)
|
|
691
1007
|
bc = per_path_barcode.get(pth, "barcode")
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
)
|
|
696
|
-
|
|
1008
|
+
if backend_choice == "python":
|
|
1009
|
+
iterator = _fastq_iter(pth)
|
|
1010
|
+
else:
|
|
1011
|
+
iterator = _fastq_iter_plain(pth)
|
|
1012
|
+
for rec in iterator:
|
|
1013
|
+
if backend_choice == "python":
|
|
1014
|
+
a = _make_unaligned_segment(
|
|
1015
|
+
rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
|
|
1016
|
+
)
|
|
1017
|
+
bam_out_ctx.write(a)
|
|
1018
|
+
else:
|
|
1019
|
+
_write_sam_line(
|
|
1020
|
+
bam_out_ctx.stdin,
|
|
1021
|
+
rec[0],
|
|
1022
|
+
rec[1],
|
|
1023
|
+
rec[2],
|
|
1024
|
+
bc,
|
|
1025
|
+
read1=False,
|
|
1026
|
+
read2=False,
|
|
1027
|
+
add_read_group=add_read_group,
|
|
1028
|
+
)
|
|
697
1029
|
per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
|
|
698
1030
|
total_written += 1
|
|
699
1031
|
singletons_written += 1
|
|
1032
|
+
finally:
|
|
1033
|
+
if backend_choice == "python":
|
|
1034
|
+
bam_out_ctx.close()
|
|
1035
|
+
else:
|
|
1036
|
+
if bam_out_ctx.stdin is not None:
|
|
1037
|
+
bam_out_ctx.stdin.close()
|
|
1038
|
+
rc = bam_out_ctx.wait()
|
|
1039
|
+
if rc != 0:
|
|
1040
|
+
stderr = bam_out_ctx.stderr.read() if bam_out_ctx.stderr else ""
|
|
1041
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
700
1042
|
|
|
701
1043
|
return {
|
|
702
1044
|
"total_reads": total_written,
|
|
@@ -707,7 +1049,7 @@ def concatenate_fastqs_to_bam(
|
|
|
707
1049
|
}
|
|
708
1050
|
|
|
709
1051
|
|
|
710
|
-
def count_aligned_reads(bam_file):
|
|
1052
|
+
def count_aligned_reads(bam_file, samtools_backend: str | None = "auto"):
|
|
711
1053
|
"""
|
|
712
1054
|
Counts the number of aligned reads in a bam file that map to each reference record.
|
|
713
1055
|
|
|
@@ -720,30 +1062,42 @@ def count_aligned_reads(bam_file):
|
|
|
720
1062
|
record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
|
|
721
1063
|
|
|
722
1064
|
"""
|
|
723
|
-
|
|
1065
|
+
logger.info("Counting aligned reads in BAM > {}".format(bam_file.name))
|
|
1066
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
724
1067
|
aligned_reads_count = 0
|
|
725
1068
|
unaligned_reads_count = 0
|
|
726
|
-
# Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
|
|
727
|
-
record_counts = defaultdict(int)
|
|
728
|
-
|
|
729
|
-
with pysam.AlignmentFile(str(bam_file), "rb") as bam:
|
|
730
|
-
total_reads = bam.mapped + bam.unmapped
|
|
731
|
-
# Iterate over reads to get the total mapped read counts and the reads that map to each reference
|
|
732
|
-
for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
|
|
733
|
-
if read.is_unmapped:
|
|
734
|
-
unaligned_reads_count += 1
|
|
735
|
-
else:
|
|
736
|
-
aligned_reads_count += 1
|
|
737
|
-
record_counts[read.reference_name] += (
|
|
738
|
-
1 # Automatically increments if key exists, adds if not
|
|
739
|
-
)
|
|
740
1069
|
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
1070
|
+
if backend_choice == "python":
|
|
1071
|
+
pysam_mod = _require_pysam()
|
|
1072
|
+
record_counts = defaultdict(int)
|
|
1073
|
+
with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
|
|
1074
|
+
total_reads = bam.mapped + bam.unmapped
|
|
1075
|
+
# Iterate over reads to get the total mapped read counts and the reads that map to each reference
|
|
1076
|
+
for read in bam:
|
|
1077
|
+
if read.is_unmapped:
|
|
1078
|
+
unaligned_reads_count += 1
|
|
1079
|
+
else:
|
|
1080
|
+
aligned_reads_count += 1
|
|
1081
|
+
record_counts[read.reference_name] += (
|
|
1082
|
+
1 # Automatically increments if key exists, adds if not
|
|
1083
|
+
)
|
|
1084
|
+
|
|
1085
|
+
# reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
|
|
1086
|
+
for reference in record_counts:
|
|
1087
|
+
proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
|
|
1088
|
+
record_counts[reference] = (
|
|
1089
|
+
record_counts[reference],
|
|
1090
|
+
proportion_mapped_reads_in_record,
|
|
1091
|
+
)
|
|
1092
|
+
return aligned_reads_count, unaligned_reads_count, dict(record_counts)
|
|
745
1093
|
|
|
746
|
-
|
|
1094
|
+
bam_path = Path(bam_file)
|
|
1095
|
+
_ensure_bam_index(bam_path, backend_choice)
|
|
1096
|
+
cmd = ["samtools", "idxstats", str(bam_path)]
|
|
1097
|
+
cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1098
|
+
if cp.returncode != 0:
|
|
1099
|
+
raise RuntimeError(f"samtools idxstats failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
1100
|
+
return _parse_idxstats_output(cp.stdout)
|
|
747
1101
|
|
|
748
1102
|
|
|
749
1103
|
def demux_and_index_BAM(
|
|
@@ -827,13 +1181,20 @@ def demux_and_index_BAM(
|
|
|
827
1181
|
return renamed_bams
|
|
828
1182
|
|
|
829
1183
|
|
|
830
|
-
def extract_base_identities(
|
|
1184
|
+
def extract_base_identities(
|
|
1185
|
+
bam_file,
|
|
1186
|
+
record,
|
|
1187
|
+
positions,
|
|
1188
|
+
max_reference_length,
|
|
1189
|
+
sequence,
|
|
1190
|
+
samtools_backend: str | None = "auto",
|
|
1191
|
+
):
|
|
831
1192
|
"""
|
|
832
1193
|
Efficiently extracts base identities from mapped reads with reference coordinates.
|
|
833
1194
|
|
|
834
1195
|
Parameters:
|
|
835
1196
|
bam_file (str): Path to the BAM file.
|
|
836
|
-
|
|
1197
|
+
record (str): Name of the reference record.
|
|
837
1198
|
positions (list): Positions to extract (0-based).
|
|
838
1199
|
max_reference_length (int): Maximum reference length for padding.
|
|
839
1200
|
sequence (str): The sequence of the record fasta
|
|
@@ -841,6 +1202,11 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
841
1202
|
Returns:
|
|
842
1203
|
dict: Base identities from forward mapped reads.
|
|
843
1204
|
dict: Base identities from reverse mapped reads.
|
|
1205
|
+
dict: Mismatch counts per read.
|
|
1206
|
+
dict: Mismatch trends per read.
|
|
1207
|
+
dict: Integer-encoded mismatch bases per read.
|
|
1208
|
+
dict: Base quality scores per read aligned to reference positions.
|
|
1209
|
+
dict: Read span masks per read (1 within span, 0 outside).
|
|
844
1210
|
"""
|
|
845
1211
|
logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
|
|
846
1212
|
timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
|
|
@@ -849,32 +1215,144 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
849
1215
|
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
850
1216
|
rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
851
1217
|
mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
|
|
1218
|
+
mismatch_base_identities = defaultdict(
|
|
1219
|
+
lambda: np.full(
|
|
1220
|
+
max_reference_length,
|
|
1221
|
+
MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"],
|
|
1222
|
+
dtype=np.int16,
|
|
1223
|
+
)
|
|
1224
|
+
)
|
|
1225
|
+
base_quality_scores = defaultdict(lambda: np.full(max_reference_length, -1, dtype=np.int16))
|
|
1226
|
+
read_span_masks = defaultdict(lambda: np.zeros(max_reference_length, dtype=np.int8))
|
|
852
1227
|
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
ref_seq = sequence.upper()
|
|
857
|
-
for read in bam.fetch(chromosome):
|
|
858
|
-
if not read.is_mapped:
|
|
859
|
-
continue # Skip unmapped reads
|
|
860
|
-
|
|
861
|
-
read_name = read.query_name
|
|
862
|
-
query_sequence = read.query_sequence
|
|
863
|
-
base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
|
|
1228
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1229
|
+
ref_seq = sequence.upper()
|
|
1230
|
+
sequence_length = len(sequence)
|
|
864
1231
|
|
|
865
|
-
|
|
866
|
-
|
|
1232
|
+
def _encode_mismatch_base(base: str) -> int:
|
|
1233
|
+
return MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT.get(
|
|
1234
|
+
base.upper(), MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"]
|
|
1235
|
+
)
|
|
867
1236
|
|
|
868
|
-
|
|
869
|
-
|
|
1237
|
+
if backend_choice == "python":
|
|
1238
|
+
logger.debug("Extracting base identities using python")
|
|
1239
|
+
pysam_mod = _require_pysam()
|
|
1240
|
+
# print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
1241
|
+
with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
|
|
1242
|
+
total_reads = bam.mapped
|
|
1243
|
+
for read in bam.fetch(record):
|
|
1244
|
+
if not read.is_mapped:
|
|
1245
|
+
continue # Skip unmapped reads
|
|
1246
|
+
|
|
1247
|
+
read_name = read.query_name
|
|
1248
|
+
query_sequence = read.query_sequence
|
|
1249
|
+
query_qualities = read.query_qualities or []
|
|
1250
|
+
base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
|
|
1251
|
+
|
|
1252
|
+
# Init arrays for each read in each dict
|
|
1253
|
+
mismatch_base_identities[read_name]
|
|
1254
|
+
base_quality_scores[read_name]
|
|
1255
|
+
read_span_masks[read_name]
|
|
1256
|
+
|
|
1257
|
+
if read.reference_start is not None and read.reference_end is not None:
|
|
1258
|
+
span_end = min(read.reference_end, max_reference_length)
|
|
1259
|
+
read_span_masks[read_name][read.reference_start : span_end] = 1
|
|
1260
|
+
|
|
1261
|
+
# Use get_aligned_pairs directly with positions filtering
|
|
1262
|
+
aligned_pairs = read.get_aligned_pairs(matches_only=True)
|
|
1263
|
+
|
|
1264
|
+
for read_position, reference_position in aligned_pairs:
|
|
1265
|
+
if reference_position is None or read_position is None:
|
|
1266
|
+
continue
|
|
870
1267
|
read_base = query_sequence[read_position]
|
|
871
1268
|
ref_base = ref_seq[reference_position]
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
1269
|
+
if reference_position in positions:
|
|
1270
|
+
base_dict[read_name][reference_position] = read_base
|
|
1271
|
+
if read_position < len(query_qualities):
|
|
1272
|
+
base_quality_scores[read_name][reference_position] = query_qualities[
|
|
1273
|
+
read_position
|
|
1274
|
+
]
|
|
1275
|
+
|
|
1276
|
+
# Track mismatches (excluding Ns)
|
|
1277
|
+
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
1278
|
+
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1279
|
+
mismatch_base_identities[read_name][reference_position] = (
|
|
1280
|
+
_encode_mismatch_base(read_base)
|
|
1281
|
+
)
|
|
1282
|
+
else:
|
|
1283
|
+
bam_path = Path(bam_file)
|
|
1284
|
+
logger.debug("Extracting base identities using samtools")
|
|
1285
|
+
_ensure_bam_index(bam_path, backend_choice)
|
|
1286
|
+
|
|
1287
|
+
def _iter_aligned_pairs(cigar: str, start: int) -> Iterable[Tuple[int, int]]:
|
|
1288
|
+
qpos = 0
|
|
1289
|
+
rpos = start
|
|
1290
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1291
|
+
length = int(length_str)
|
|
1292
|
+
if op in {"M", "=", "X"}:
|
|
1293
|
+
for _ in range(length):
|
|
1294
|
+
yield qpos, rpos
|
|
1295
|
+
qpos += 1
|
|
1296
|
+
rpos += 1
|
|
1297
|
+
elif op in {"I", "S"}:
|
|
1298
|
+
qpos += length
|
|
1299
|
+
elif op in {"D", "N"}:
|
|
1300
|
+
rpos += length
|
|
1301
|
+
elif op in {"H", "P"}:
|
|
1302
|
+
continue
|
|
1303
|
+
|
|
1304
|
+
def _reference_span_from_cigar(cigar: str) -> int:
|
|
1305
|
+
span = 0
|
|
1306
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1307
|
+
if op in {"M", "D", "N", "=", "X"}:
|
|
1308
|
+
span += int(length_str)
|
|
1309
|
+
return span
|
|
1310
|
+
|
|
1311
|
+
cmd = ["samtools", "view", "-F", "4", str(bam_path), record]
|
|
1312
|
+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1313
|
+
assert proc.stdout is not None
|
|
1314
|
+
for line in proc.stdout:
|
|
1315
|
+
if not line.strip() or line.startswith("@"):
|
|
1316
|
+
continue
|
|
1317
|
+
fields = line.rstrip("\n").split("\t")
|
|
1318
|
+
if len(fields) < 11:
|
|
1319
|
+
continue
|
|
1320
|
+
read_name = fields[0]
|
|
1321
|
+
flag = int(fields[1])
|
|
1322
|
+
pos = int(fields[3])
|
|
1323
|
+
cigar = fields[5]
|
|
1324
|
+
query_sequence = fields[9]
|
|
1325
|
+
qual_string = fields[10]
|
|
1326
|
+
if cigar == "*" or query_sequence == "*":
|
|
1327
|
+
continue
|
|
1328
|
+
base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
|
|
1329
|
+
mismatch_base_identities[read_name]
|
|
1330
|
+
base_quality_scores[read_name]
|
|
1331
|
+
read_span_masks[read_name]
|
|
1332
|
+
qualities = (
|
|
1333
|
+
[ord(ch) - 33 for ch in qual_string] if qual_string and qual_string != "*" else []
|
|
1334
|
+
)
|
|
1335
|
+
ref_start = pos - 1
|
|
1336
|
+
ref_end = ref_start + _reference_span_from_cigar(cigar)
|
|
1337
|
+
span_end = min(ref_end, max_reference_length)
|
|
1338
|
+
if ref_start < max_reference_length:
|
|
1339
|
+
read_span_masks[read_name][ref_start:span_end] = 1
|
|
1340
|
+
for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
|
|
1341
|
+
if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
|
|
1342
|
+
continue
|
|
1343
|
+
read_base = query_sequence[read_pos]
|
|
1344
|
+
ref_base = ref_seq[ref_pos]
|
|
1345
|
+
if ref_pos in positions:
|
|
1346
|
+
base_dict[read_name][ref_pos] = read_base
|
|
1347
|
+
if read_pos < len(qualities):
|
|
1348
|
+
base_quality_scores[read_name][ref_pos] = qualities[read_pos]
|
|
876
1349
|
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
877
1350
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1351
|
+
mismatch_base_identities[read_name][ref_pos] = _encode_mismatch_base(read_base)
|
|
1352
|
+
rc = proc.wait()
|
|
1353
|
+
if rc != 0:
|
|
1354
|
+
stderr = proc.stderr.read() if proc.stderr else ""
|
|
1355
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
878
1356
|
|
|
879
1357
|
# Determine C→T vs G→A dominance per read
|
|
880
1358
|
mismatch_trend_per_read = {}
|
|
@@ -891,54 +1369,419 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
891
1369
|
else:
|
|
892
1370
|
mismatch_trend_per_read[read_name] = "none"
|
|
893
1371
|
|
|
1372
|
+
if sequence_length < max_reference_length:
|
|
1373
|
+
padding_value = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"]
|
|
1374
|
+
for mismatch_values in mismatch_base_identities.values():
|
|
1375
|
+
mismatch_values[sequence_length:] = padding_value
|
|
1376
|
+
|
|
894
1377
|
return (
|
|
895
1378
|
dict(fwd_base_identities),
|
|
896
1379
|
dict(rev_base_identities),
|
|
897
1380
|
dict(mismatch_counts_per_read),
|
|
898
1381
|
mismatch_trend_per_read,
|
|
1382
|
+
dict(mismatch_base_identities),
|
|
1383
|
+
dict(base_quality_scores),
|
|
1384
|
+
dict(read_span_masks),
|
|
899
1385
|
)
|
|
900
1386
|
|
|
901
1387
|
|
|
902
|
-
def extract_read_features_from_bam(
|
|
903
|
-
""
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
1388
|
+
def extract_read_features_from_bam(
    bam_file_path: str | Path, samtools_backend: str | None = "auto"
) -> Dict[str, List[float]]:
    """Extract read metrics from a BAM file.

    Unmapped reads are skipped by both backends. Results are keyed by read name, so
    when several records share a name the last one read replaces the earlier ones.

    Args:
        bam_file_path: Path to the BAM file.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        Mapping of read name to [read_length, read_median_qscore, reference_length,
        mapped_length, mapping_quality, reference_start, reference_end]. The median
        quality, reference length and reference end fall back to NaN when unavailable.

    Raises:
        RuntimeError: If a ``samtools`` command exits with a non-zero status.
    """
    logger.debug(
        "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
        bam_file_path,
    )
    backend_choice = _resolve_samtools_backend(samtools_backend)
    read_metrics: Dict[str, List[float]] = {}
    nan = float("nan")

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
            reference_lengths = dict(zip(bam_file.references, bam_file.lengths))
            for read in bam_file:
                if read.is_unmapped:
                    continue
                read_quality = read.query_qualities
                median_read_quality = (
                    nan if read_quality is None else float(np.median(read_quality))
                )
                reference_length = reference_lengths.get(read.reference_name, nan)
                mapped_length = sum(end - start for start, end in read.get_blocks())
                # Guard against a missing reference_end instead of crashing on float(None);
                # the CLI branch below reports NaN in the equivalent "no CIGAR" case.
                reference_end = nan if read.reference_end is None else float(read.reference_end)
                read_metrics[read.query_name] = [
                    float(read.query_length),
                    median_read_quality,
                    float(reference_length),
                    float(mapped_length),
                    float(read.mapping_quality),
                    float(read.reference_start),
                    reference_end,
                ]
        return read_metrics

    import tempfile  # local import: only the CLI backend spools samtools' stderr to disk

    bam_path = Path(bam_file_path)
    cigar_op_pattern = re.compile(r"(\d+)([MIDNSHP=XB])")

    def _parse_reference_lengths(header_text: str) -> Dict[str, int]:
        """Map each @SQ reference name (SN) to its length (LN)."""
        ref_lengths: Dict[str, int] = {}
        for line in header_text.splitlines():
            if not line.startswith("@SQ"):
                continue
            name = None
            length = None
            for field in line.split("\t")[1:]:
                if field.startswith("SN:"):
                    name = field.split(":", 1)[1]
                elif field.startswith("LN:"):
                    length = int(field.split(":", 1)[1])
            if name is not None and length is not None:
                ref_lengths[name] = length
        return ref_lengths

    def _cigar_lengths(cigar: str) -> tuple[int, int]:
        """Return (aligned bases, reference span) for a CIGAR string in a single pass."""
        mapped = 0
        reference_span = 0
        for length_str, op in cigar_op_pattern.findall(cigar):
            length = int(length_str)
            if op in ("M", "=", "X"):
                # Alignment match/mismatch consumes both read and reference.
                mapped += length
                reference_span += length
            elif op in ("D", "N"):
                # Deletions and skips consume reference only.
                reference_span += length
        return mapped, reference_span

    header_cp = subprocess.run(
        ["samtools", "view", "-H", str(bam_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    if header_cp.returncode != 0:
        raise RuntimeError(
            f"samtools view -H failed (exit {header_cp.returncode}):\n{header_cp.stderr}"
        )
    reference_lengths = _parse_reference_lengths(header_cp.stdout)

    # stderr goes to a temporary file rather than a pipe: an undrained stderr pipe can
    # fill up and block samtools while this process is still waiting on stdout.
    with tempfile.TemporaryFile(mode="w+") as stderr_file:
        with subprocess.Popen(
            ["samtools", "view", "-F", "4", str(bam_path)],
            stdout=subprocess.PIPE,
            stderr=stderr_file,
            text=True,
        ) as proc:
            assert proc.stdout is not None
            try:
                for line in proc.stdout:
                    if not line.strip() or line.startswith("@"):
                        continue
                    fields = line.rstrip("\n").split("\t")
                    if len(fields) < 11:
                        continue  # not a complete alignment record
                    read_name = fields[0]
                    reference_name = fields[2]
                    mapping_quality = float(fields[4])
                    cigar = fields[5]
                    # SAM POS is 1-based; convert to a 0-based start.
                    reference_start = float(int(fields[3]) - 1)
                    sequence = fields[9]
                    quality = fields[10]

                    read_length = nan if sequence == "*" else float(len(sequence))
                    if quality == "*" or not quality:
                        median_read_quality = nan
                    else:
                        # Qualities are Phred+33 encoded characters.
                        phreds = [ord(char) - 33 for char in quality]
                        median_read_quality = float(np.median(phreds))
                    reference_length = float(reference_lengths.get(reference_name, nan))
                    if cigar != "*":
                        mapped, reference_span = _cigar_lengths(cigar)
                        mapped_length = float(mapped)
                        reference_end = float(reference_start + reference_span)
                    else:
                        mapped_length = 0.0
                        reference_end = nan
                    read_metrics[read_name] = [
                        read_length,
                        median_read_quality,
                        reference_length,
                        mapped_length,
                        mapping_quality,
                        reference_start,
                        reference_end,
                    ]
            except BaseException:
                # Do not leave samtools running if parsing fails part-way through.
                proc.kill()
                raise
        rc = proc.returncode
        if rc != 0:
            stderr_file.seek(0)
            stderr = stderr_file.read()
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    return read_metrics
|
|
939
1535
|
|
|
940
1536
|
|
|
941
|
-
def
|
|
1537
|
+
def extract_read_tags_from_bam(
    bam_file_path: str | Path,
    tag_names: Iterable[str] | None = None,
    include_flags: bool = True,
    include_cigar: bool = True,
    samtools_backend: str | None = "auto",
) -> Dict[str, Dict[str, object]]:
    """Extract per-read tag metadata from a BAM file.

    Results are keyed by read name, so when several records share a name the last one
    read replaces the earlier ones. Requested tag names are upper-cased before lookup.

    Args:
        bam_file_path: Path to the BAM file.
        tag_names: Iterable of BAM tag names to extract (e.g., ["NM", "MD", "MM", "ML"]).
            If None, only flags/cigar are populated.
        include_flags: Whether to include a list of flag names for each read.
        include_cigar: Whether to include the CIGAR string for each read.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        Mapping of read name to a dict of extracted tag values. A requested tag that is
        absent from a read maps to None. With the CLI backend, values are converted
        from SAM text according to their type code (see ``_parse_sam_tag_value``).

    Raises:
        RuntimeError: If ``samtools view`` exits with a non-zero status.
    """
    backend_choice = _resolve_samtools_backend(samtools_backend)
    tag_names_list = [tag.upper() for tag in tag_names] if tag_names else []
    read_tags: Dict[str, Dict[str, object]] = {}

    def _decode_flags(flag: int) -> list[str]:
        """Return the names of all flag bits set in ``flag``."""
        return [name for bit, name in _BAM_FLAG_BITS if flag & bit]

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
            for read in bam_file.fetch(until_eof=True):
                if not read.query_name:
                    continue
                tag_map: Dict[str, object] = {}
                if include_cigar:
                    tag_map["CIGAR"] = read.cigarstring
                if include_flags:
                    tag_map["FLAGS"] = _decode_flags(read.flag)
                for tag in tag_names_list:
                    try:
                        tag_map[tag] = read.get_tag(tag)
                    except Exception:
                        # Best effort: a tag that cannot be read is reported as None.
                        tag_map[tag] = None
                read_tags[read.query_name] = tag_map
        return read_tags

    import tempfile  # local import: only the CLI backend spools samtools' stderr to disk

    wanted_tags = set(tag_names_list)
    # NOTE(review): "-F 4" drops unmapped reads here, while the pysam branch above
    # iterates every record (until_eof=True) — confirm which behaviour is intended.
    cmd = ["samtools", "view", "-F", "4", str(bam_file_path)]

    # stderr goes to a temporary file rather than a pipe: an undrained stderr pipe can
    # fill up and block samtools while this process is still waiting on stdout.
    with tempfile.TemporaryFile(mode="w+") as stderr_file:
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True) as proc:
            assert proc.stdout is not None
            try:
                for line in proc.stdout:
                    if not line.strip() or line.startswith("@"):
                        continue
                    fields = line.rstrip("\n").split("\t")
                    if len(fields) < 11:
                        continue  # not a complete alignment record
                    read_name = fields[0]
                    tag_map = {}
                    if include_cigar:
                        tag_map["CIGAR"] = fields[5]
                    if include_flags:
                        tag_map["FLAGS"] = _decode_flags(int(fields[1]))
                    if tag_names_list:
                        parsed_tags: Dict[str, object] = {}
                        # Optional fields are formatted TAG:TYPE:VALUE; the value itself
                        # may contain ":" so split at most twice.
                        for raw_tag in fields[11:]:
                            parts = raw_tag.split(":", 2)
                            if len(parts) != 3:
                                continue
                            tag_name = parts[0].upper()
                            if tag_name in wanted_tags:
                                parsed_tags[tag_name] = _parse_sam_tag_value(parts[1], parts[2])
                        for tag in tag_names_list:
                            tag_map[tag] = parsed_tags.get(tag)
                    read_tags[read_name] = tag_map
            except BaseException:
                # Do not leave samtools running if parsing fails part-way through.
                proc.kill()
                raise
        rc = proc.returncode
        if rc != 0:
            stderr_file.seek(0)
            stderr = stderr_file.read()
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    return read_tags


# SAM "B" array subtype letter -> array.array typecode.
_SAM_ARRAY_TYPECODES = {"c": "b", "C": "B", "s": "h", "S": "H", "i": "i", "I": "I", "f": "f"}


def _parse_sam_tag_value(tag_type: str, value: str) -> object:
    """Convert the text form of a SAM optional field into a native Python value.

    Args:
        tag_type: The SAM type code (the middle part of ``TAG:TYPE:VALUE``).
        value: The textual value.

    Returns:
        ``int`` for type ``i``, ``float`` for ``f``, ``array.array`` for ``B`` arrays,
        and the unchanged string for every other type or for text that fails to parse.
    """
    from array import array

    try:
        if tag_type == "i":
            return int(value)
        if tag_type == "f":
            return float(value)
        if tag_type == "B":
            # Array values look like "C,255,0,12": subtype letter, then the items.
            subtype, _, body = value.partition(",")
            typecode = _SAM_ARRAY_TYPECODES.get(subtype)
            if typecode is None:
                return value
            convert = float if subtype == "f" else int
            return array(typecode, [convert(item) for item in body.split(",") if item])
    except (ValueError, OverflowError):
        # Malformed or out-of-range text: fall back to the raw string.
        return value
    return value
|
|
1616
|
+
|
|
1617
|
+
|
|
1618
|
+
def find_secondary_supplementary_read_names(
    bam_file_path: str | Path,
    read_names: Iterable[str],
    samtools_backend: str | None = "auto",
) -> tuple[set[str], set[str]]:
    """Find read names with secondary or supplementary alignments in a BAM.

    Args:
        bam_file_path: Path to the BAM file to scan.
        read_names: Iterable of read names to check. When empty, two empty sets are
            returned without resolving a backend or opening the file.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        Tuple of (secondary_read_names, supplementary_read_names).

    Raises:
        RuntimeError: If ``samtools view`` exits with a non-zero status.
    """
    target_names = set(read_names)
    if not target_names:
        return set(), set()

    backend_choice = _resolve_samtools_backend(samtools_backend)

    if backend_choice == "python":
        secondary_reads: set[str] = set()
        supplementary_reads: set[str] = set()
        pysam_mod = _require_pysam()
        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
            for read in bam_file.fetch(until_eof=True):
                query_name = read.query_name
                if not query_name or query_name not in target_names:
                    continue
                if read.is_secondary:
                    secondary_reads.add(query_name)
                if read.is_supplementary:
                    supplementary_reads.add(query_name)
        return secondary_reads, supplementary_reads

    import tempfile  # local import: only the CLI backend spools samtools' stderr to disk

    def _collect(flag: int) -> set[str]:
        """Return the target read names having a record with every bit of ``flag`` set."""
        cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
        hits: set[str] = set()
        # stderr goes to a temporary file rather than a pipe: an undrained stderr pipe
        # can fill up and block samtools while this process is still waiting on stdout.
        with tempfile.TemporaryFile(mode="w+") as stderr_file:
            with subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
            ) as proc:
                assert proc.stdout is not None
                try:
                    for line in proc.stdout:
                        if not line.strip() or line.startswith("@"):
                            continue
                        # Only the first column (the read name) is needed.
                        read_name = line.split("\t", 1)[0]
                        if read_name in target_names:
                            hits.add(read_name)
                except BaseException:
                    # Do not leave samtools running if parsing fails part-way through.
                    proc.kill()
                    raise
            rc = proc.returncode
            if rc != 0:
                stderr_file.seek(0)
                stderr = stderr_file.read()
                raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
        return hits

    # 0x100 = secondary alignment, 0x800 = supplementary alignment.
    secondary_hits = _collect(0x100)
    supplementary_hits = _collect(0x800)
    return secondary_hits, supplementary_hits
|
|
1674
|
+
|
|
1675
|
+
|
|
1676
|
+
def extract_secondary_supplementary_alignment_spans(
|
|
1677
|
+
bam_file_path: str | Path,
|
|
1678
|
+
read_names: Iterable[str],
|
|
1679
|
+
samtools_backend: str | None = "auto",
|
|
1680
|
+
) -> tuple[
|
|
1681
|
+
dict[str, list[tuple[float, float, float]]], dict[str, list[tuple[float, float, float]]]
|
|
1682
|
+
]:
|
|
1683
|
+
"""Extract reference/read span data for secondary/supplementary alignments.
|
|
1684
|
+
|
|
1685
|
+
Args:
|
|
1686
|
+
bam_file_path: Path to the BAM file to scan.
|
|
1687
|
+
read_names: Iterable of read names to check.
|
|
1688
|
+
samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
|
|
1689
|
+
|
|
1690
|
+
Returns:
|
|
1691
|
+
Tuple of (secondary_spans, supplementary_spans) where each mapping contains
|
|
1692
|
+
read names mapped to lists of (reference_start, reference_end, read_span).
|
|
1693
|
+
"""
|
|
1694
|
+
target_names = set(read_names)
|
|
1695
|
+
if not target_names:
|
|
1696
|
+
return {}, {}
|
|
1697
|
+
|
|
1698
|
+
secondary_spans: dict[str, list[tuple[float, float, float]]] = {}
|
|
1699
|
+
supplementary_spans: dict[str, list[tuple[float, float, float]]] = {}
|
|
1700
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1701
|
+
|
|
1702
|
+
if backend_choice == "python":
|
|
1703
|
+
pysam_mod = _require_pysam()
|
|
1704
|
+
with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
|
|
1705
|
+
for read in bam_file.fetch(until_eof=True):
|
|
1706
|
+
if not read.query_name or read.query_name not in target_names:
|
|
1707
|
+
continue
|
|
1708
|
+
if not (read.is_secondary or read.is_supplementary):
|
|
1709
|
+
continue
|
|
1710
|
+
reference_start = (
|
|
1711
|
+
float(read.reference_start)
|
|
1712
|
+
if read.reference_start is not None
|
|
1713
|
+
else float("nan")
|
|
1714
|
+
)
|
|
1715
|
+
reference_end = (
|
|
1716
|
+
float(read.reference_end) if read.reference_end is not None else float("nan")
|
|
1717
|
+
)
|
|
1718
|
+
read_span = (
|
|
1719
|
+
float(read.query_alignment_length)
|
|
1720
|
+
if read.query_alignment_length is not None
|
|
1721
|
+
else float("nan")
|
|
1722
|
+
)
|
|
1723
|
+
if read.is_secondary:
|
|
1724
|
+
secondary_spans.setdefault(read.query_name, []).append(
|
|
1725
|
+
(reference_start, reference_end, read_span)
|
|
1726
|
+
)
|
|
1727
|
+
if read.is_supplementary:
|
|
1728
|
+
supplementary_spans.setdefault(read.query_name, []).append(
|
|
1729
|
+
(reference_start, reference_end, read_span)
|
|
1730
|
+
)
|
|
1731
|
+
return secondary_spans, supplementary_spans
|
|
1732
|
+
|
|
1733
|
+
def _mapped_length_from_cigar(cigar: str) -> int:
    """Sum the lengths of the ``M``, ``=`` and ``X`` operations in a CIGAR string.

    All other operations recognised by the pattern (``I``, ``D``, ``N``,
    ``S``, ``H``, ``P``, ``B``) are parsed but contribute nothing.

    NOTE(review): insertions are not counted here, whereas the pysam branch
    reports ``query_alignment_length`` -- confirm both backends are meant to
    yield the same read span.

    Args:
        cigar: CIGAR string such as ``"10M2I5D"``.

    Returns:
        Combined length of the counted operations; 0 when none are present.
    """
    return sum(
        int(count)
        for count, operation in re.findall(r"(\d+)([MIDNSHP=XB])", cigar)
        if operation in ("M", "=", "X")
    )
|
|
1740
|
+
|
|
1741
|
+
def _reference_span_from_cigar(cigar: str) -> int:
    """Sum the lengths of the CIGAR operations counted toward the reference span.

    The counted operations are ``M``, ``D``, ``N``, ``=`` and ``X``; the
    remaining recognised operations (``I``, ``S``, ``H``, ``P``, ``B``) are
    skipped.

    Args:
        cigar: CIGAR string such as ``"10M2I5D"``.

    Returns:
        Combined length of the counted operations; 0 when none are present.
    """
    counted_ops = ("M", "D", "N", "=", "X")
    parsed_ops = re.findall(r"(\d+)([MIDNSHP=XB])", cigar)
    return sum(int(count) for count, operation in parsed_ops if operation in counted_ops)
|
|
1748
|
+
|
|
1749
|
+
def _collect(flag: int) -> dict[str, list[tuple[float, float, float]]]:
    """Collect alignment spans for the target reads that carry a SAM flag.

    Streams ``samtools view -f <flag>`` over ``bam_file_path`` and keeps only
    records whose read name is in ``target_names`` (both names come from the
    enclosing scope).

    Args:
        flag: SAM flag bits handed to ``samtools view -f``.

    Returns:
        Mapping of read name to a list of
        ``(reference_start, reference_end, read_span)`` tuples.
        ``reference_start`` is the POS column minus one; ``reference_end`` and
        ``read_span`` are derived from the CIGAR string and are NaN when the
        CIGAR is ``"*"``.

    Raises:
        RuntimeError: If ``samtools view`` exits with a non-zero status.
    """
    import tempfile

    cmd = ["samtools", "view", "-f", str(flag), str(bam_file_path)]
    spans: dict[str, list[tuple[float, float, float]]] = {}

    # stderr is spooled to a temporary file instead of a pipe: a pipe that is
    # only read after the process exits can fill up and block samtools while
    # we are still consuming stdout.
    with tempfile.TemporaryFile(mode="w+", encoding="utf-8", errors="replace") as stderr_file:
        # The context manager closes the stdout pipe and reaps the process,
        # even if parsing a record raises.
        with subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=stderr_file, text=True
        ) as proc:
            assert proc.stdout is not None
            for line in proc.stdout:
                if not line.strip() or line.startswith("@"):
                    continue
                fields = line.rstrip("\n").split("\t")
                # Records with fewer than the 11 mandatory columns are skipped.
                if len(fields) < 11:
                    continue
                read_name = fields[0]
                if read_name not in target_names:
                    continue
                cigar = fields[5]
                reference_start = float(int(fields[3]) - 1)
                if cigar != "*":
                    reference_end = float(reference_start + _reference_span_from_cigar(cigar))
                    read_span = float(_mapped_length_from_cigar(cigar))
                else:
                    reference_end = float("nan")
                    read_span = float("nan")
                spans.setdefault(read_name, []).append((reference_start, reference_end, read_span))

        rc = proc.returncode
        if rc != 0:
            stderr_file.seek(0)
            stderr = stderr_file.read()
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
    return spans
|
|
1777
|
+
|
|
1778
|
+
secondary_spans = _collect(0x100)
|
|
1779
|
+
supplementary_spans = _collect(0x800)
|
|
1780
|
+
|
|
1781
|
+
return secondary_spans, supplementary_spans
|
|
1782
|
+
|
|
1783
|
+
|
|
1784
|
+
def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
    """Write the read name of every record in a BAM to a text file.

    The output file sits next to the input and is named after it: a trailing
    ``.bam`` extension is removed and ``_read_names.txt`` is appended. One read
    name is written per line.

    Args:
        aligned_BAM: Path to the BAM file (string or path-like).
        samtools_backend: Backend selection forwarded to
            ``_resolve_samtools_backend`` (auto|python|cli).

    Returns:
        None

    Raises:
        RuntimeError: If the samtools CLI is used and exits with a non-zero
            status.
    """
    import tempfile

    # Make a text file of reads for the BAM
    backend_choice = _resolve_samtools_backend(samtools_backend)
    # Accept path-like input, and strip only a trailing ".bam" so that a
    # ".bam" appearing earlier in the path (e.g. in a directory name) does not
    # truncate the output location.
    bam_path = str(aligned_BAM)
    txt_output = bam_path.removesuffix(".bam") + "_read_names.txt"

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with (
            pysam_mod.AlignmentFile(bam_path, "rb") as bam,
            open(txt_output, "w", encoding="utf-8") as output_file,
        ):
            for read in bam:
                output_file.write(f"{read.query_name}\n")
        return

    # stderr is spooled to a temporary file instead of a pipe so samtools can
    # never block on a full stderr pipe while stdout is still being consumed.
    with tempfile.TemporaryFile(mode="w+", encoding="utf-8", errors="replace") as stderr_file:
        # The context manager closes the stdout pipe and reaps the process.
        with subprocess.Popen(
            ["samtools", "view", bam_path],
            stdout=subprocess.PIPE,
            stderr=stderr_file,
            text=True,
        ) as samtools_view:
            assert samtools_view.stdout is not None
            with open(txt_output, "w", encoding="utf-8") as output_file:
                for line in samtools_view.stdout:
                    if not line.strip():
                        continue
                    # The read name is the first tab-separated column.
                    qname = line.split("\t", 1)[0]
                    output_file.write(f"{qname}\n")

        rc = samtools_view.returncode
        if rc != 0:
            stderr_file.seek(0)
            stderr = stderr_file.read()
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
964
1823
|
|
|
965
1824
|
|
|
966
|
-
def separate_bam_by_bc(
|
|
1825
|
+
def separate_bam_by_bc(
|
|
1826
|
+
input_bam, output_prefix, bam_suffix, split_dir, samtools_backend: str | None = "auto"
|
|
1827
|
+
):
|
|
967
1828
|
"""
|
|
968
1829
|
Separates an input BAM file on the BC SAM tag values.
|
|
969
1830
|
|
|
@@ -981,34 +1842,80 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
|
981
1842
|
bam_base = input_bam.name
|
|
982
1843
|
bam_base_minus_suffix = input_bam.stem
|
|
983
1844
|
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
#
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1845
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1846
|
+
|
|
1847
|
+
if backend_choice == "python":
|
|
1848
|
+
pysam_mod = _require_pysam()
|
|
1849
|
+
# Open the input BAM file for reading
|
|
1850
|
+
with pysam_mod.AlignmentFile(str(input_bam), "rb") as bam:
|
|
1851
|
+
# Create a dictionary to store output BAM files
|
|
1852
|
+
output_files = {}
|
|
1853
|
+
# Iterate over each read in the BAM file
|
|
1854
|
+
for read in bam:
|
|
1855
|
+
try:
|
|
1856
|
+
# Get the barcode tag value
|
|
1857
|
+
bc_tag = read.get_tag("BC", with_value_type=True)[0]
|
|
1858
|
+
# bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
1859
|
+
# Open the output BAM file corresponding to the barcode
|
|
1860
|
+
if bc_tag not in output_files:
|
|
1861
|
+
output_path = (
|
|
1862
|
+
split_dir
|
|
1863
|
+
/ f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
|
|
1864
|
+
)
|
|
1865
|
+
output_files[bc_tag] = pysam_mod.AlignmentFile(
|
|
1866
|
+
str(output_path), "wb", header=bam.header
|
|
1867
|
+
)
|
|
1868
|
+
# Write the read to the corresponding output BAM file
|
|
1869
|
+
output_files[bc_tag].write(read)
|
|
1870
|
+
except KeyError:
|
|
1871
|
+
logger.warning(f"BC tag not present for read: {read.query_name}")
|
|
1872
|
+
# Close all output BAM files
|
|
1873
|
+
for output_file in output_files.values():
|
|
1874
|
+
output_file.close()
|
|
1875
|
+
return
|
|
1876
|
+
|
|
1877
|
+
def _collect_bc_tags() -> set[str]:
    """Return the distinct BC tag values found in ``input_bam``.

    Streams ``samtools view`` over ``input_bam`` (taken from the enclosing
    scope) and inspects the optional fields of each record. Only the first
    field starting with ``BC:`` is used per record; its value is the text
    after the second colon.

    Returns:
        Set of BC tag values; empty when no record carries a BC tag.

    Raises:
        RuntimeError: If ``samtools view`` exits with a non-zero status.
    """
    import tempfile

    bc_tags: set[str] = set()

    # stderr is spooled to a temporary file instead of a pipe so samtools can
    # never block on a full stderr pipe while stdout is still being consumed.
    with tempfile.TemporaryFile(mode="w+", encoding="utf-8", errors="replace") as stderr_file:
        # The context manager closes the stdout pipe and reaps the process.
        with subprocess.Popen(
            ["samtools", "view", str(input_bam)],
            stdout=subprocess.PIPE,
            stderr=stderr_file,
            text=True,
        ) as proc:
            assert proc.stdout is not None
            for line in proc.stdout:
                if not line.strip():
                    continue
                fields = line.rstrip("\n").split("\t")
                # Optional TAG:TYPE:VALUE fields follow the first 11 columns.
                for tag in fields[11:]:
                    if tag.startswith("BC:"):
                        bc_tags.add(tag.split(":", 2)[2])
                        break

        rc = proc.returncode
        if rc != 0:
            stderr_file.seek(0)
            stderr = stderr_file.read()
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
    return bc_tags
|
|
1899
|
+
|
|
1900
|
+
bc_tags = _collect_bc_tags()
|
|
1901
|
+
if not bc_tags:
|
|
1902
|
+
logger.warning("No BC tags found in %s", input_bam)
|
|
1903
|
+
return
|
|
1904
|
+
|
|
1905
|
+
for bc_tag in bc_tags:
|
|
1906
|
+
output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
|
|
1907
|
+
cmd = ["samtools", "view", "-b", "-d", f"BC:{bc_tag}", "-o", str(output_path)]
|
|
1908
|
+
cmd.append(str(input_bam))
|
|
1909
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
1910
|
+
if cp.returncode != 0:
|
|
1911
|
+
raise RuntimeError(
|
|
1912
|
+
f"samtools view failed for BC={bc_tag} (exit {cp.returncode}):\n{cp.stderr}"
|
|
1913
|
+
)
|
|
1009
1914
|
|
|
1010
1915
|
|
|
1011
|
-
def split_and_index_BAM(
|
|
1916
|
+
def split_and_index_BAM(
    aligned_sorted_BAM, split_dir, bam_suffix, samtools_backend: str | None = "auto"
):
    """Split a BAM by BC tag and index every resulting BAM.

    A wrapper that calls ``separate_bam_by_bc`` on
    ``aligned_sorted_BAM + bam_suffix`` and then indexes each file in
    ``split_dir`` whose name ends with ``bam_suffix``.

    Args:
        aligned_sorted_BAM: Path of the aligned, sorted BAM without its suffix.
        split_dir: Directory the split BAMs are written to.
        bam_suffix: Suffix appended to ``aligned_sorted_BAM`` and used to find
            the split BAMs.
        samtools_backend: Backend selection forwarded to
            ``_resolve_samtools_backend`` (auto|python|cli).

    Returns:
        List of paths (as strings) of the BAM files that were indexed.
    """
    import os

    logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix
    file_prefix = date_string()
    separate_bam_by_bc(
        aligned_sorted_output,
        file_prefix,
        bam_suffix,
        split_dir,
        samtools_backend=samtools_backend,
    )
    # Make a BAM index file for the BAMs in that directory
    bam_pattern = "*" + bam_suffix
    # glob.glob needs a string pattern; passing a Path object raises TypeError,
    # so the pattern is joined as a plain string.
    bam_files = glob.glob(os.path.join(os.fspath(split_dir), bam_pattern))
    bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
    backend_choice = _resolve_samtools_backend(samtools_backend)
    for input_file in bam_files:
        if backend_choice == "python":
            _index_bam_with_pysam(input_file)
        else:
            _index_bam_with_samtools(input_file)

    return bam_files
|