smftools 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +2 -0
- smftools/cli/hmm_adata.py +7 -2
- smftools/cli/load_adata.py +130 -98
- smftools/cli/preprocess_adata.py +2 -0
- smftools/cli/spatial_adata.py +5 -1
- smftools/cli_entry.py +26 -1
- smftools/config/__init__.py +2 -0
- smftools/config/default.yaml +4 -1
- smftools/config/experiment_config.py +6 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +9 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +53 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +737 -170
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +66 -22
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +8 -2
- smftools/informatics/modkit_extract_to_adata.py +16 -6
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +6 -3
- smftools/plotting/hmm_plotting.py +12 -2
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/tools/__init__.py +26 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_umap.py +3 -1
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/METADATA +67 -33
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,28 +3,118 @@ from __future__ import annotations
|
|
|
3
3
|
import glob
|
|
4
4
|
import os
|
|
5
5
|
import re
|
|
6
|
+
import shutil
|
|
6
7
|
import subprocess
|
|
7
8
|
import time
|
|
8
9
|
from collections import Counter, defaultdict, deque
|
|
9
10
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
11
|
from itertools import zip_longest
|
|
11
12
|
from pathlib import Path
|
|
12
|
-
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
13
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
13
14
|
|
|
14
15
|
import numpy as np
|
|
15
|
-
import pysam
|
|
16
16
|
from tqdm import tqdm
|
|
17
17
|
|
|
18
18
|
from smftools.logging_utils import get_logger
|
|
19
|
+
from smftools.optional_imports import require
|
|
19
20
|
|
|
20
21
|
from ..readwrite import date_string, time_string
|
|
21
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
import pysam as pysam_types
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
import pysam
|
|
28
|
+
except Exception:
|
|
29
|
+
pysam = None # type: ignore
|
|
30
|
+
|
|
22
31
|
logger = get_logger(__name__)
|
|
23
32
|
|
|
24
33
|
_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
|
|
25
34
|
_EMPTY_RE = re.compile(r"^\s*$")
|
|
26
35
|
|
|
27
36
|
|
|
37
|
+
def _require_pysam() -> Any:
    """Return the pysam module, or defer to ``require`` when it is unavailable.

    The module-level import of pysam is optional: when it failed, ``pysam`` is
    ``None`` and the lookup is delegated to ``require`` from
    ``smftools.optional_imports`` (presumably raising an informative error
    when the package is missing -- confirm against that helper).

    Returns:
        The pysam module object. Annotated as ``Any`` because a module is not
        a valid static type for an annotation.
    """
    if pysam is None:
        return require("pysam", extra="pysam", purpose="samtools-compatible Python backend")
    return pysam
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _resolve_samtools_backend(backend: str | None) -> str:
|
|
45
|
+
"""Resolve backend choice for samtools-compatible operations.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
backend: One of {"auto", "python", "cli"} (case-insensitive).
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Resolved backend string ("python" or "cli").
|
|
52
|
+
"""
|
|
53
|
+
choice = (backend or "auto").strip().lower()
|
|
54
|
+
if choice not in {"auto", "python", "cli"}:
|
|
55
|
+
raise ValueError("samtools_backend must be one of: auto, python, cli")
|
|
56
|
+
|
|
57
|
+
have_pysam = pysam is not None
|
|
58
|
+
have_samtools = shutil.which("samtools") is not None
|
|
59
|
+
|
|
60
|
+
if choice == "python":
|
|
61
|
+
if not have_pysam:
|
|
62
|
+
raise RuntimeError("samtools_backend=python requires pysam to be installed.")
|
|
63
|
+
return "python"
|
|
64
|
+
if choice == "cli":
|
|
65
|
+
if not have_samtools:
|
|
66
|
+
raise RuntimeError("samtools_backend=cli requires samtools in PATH.")
|
|
67
|
+
return "cli"
|
|
68
|
+
|
|
69
|
+
if have_samtools:
|
|
70
|
+
return "cli"
|
|
71
|
+
if have_pysam:
|
|
72
|
+
return "python"
|
|
73
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _has_bam_index(bam_path: Path) -> bool:
|
|
77
|
+
"""Return True if the BAM index exists alongside the BAM."""
|
|
78
|
+
return (
|
|
79
|
+
bam_path.with_suffix(bam_path.suffix + ".bai").exists()
|
|
80
|
+
or Path(str(bam_path) + ".bai").exists()
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _ensure_bam_index(bam_path: Path, backend: str) -> None:
    """Create a BAM index unless one is already present.

    Args:
        bam_path: Path to the BAM file.
        backend: "python" selects the pysam indexer; any other value selects
            the samtools command-line indexer.
    """
    if not _has_bam_index(bam_path):
        build_index = _index_bam_with_pysam if backend == "python" else _index_bam_with_samtools
        build_index(bam_path)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _parse_idxstats_output(output: str) -> Tuple[int, int, Dict[str, Tuple[int, float]]]:
|
|
95
|
+
"""Parse samtools idxstats output into counts and proportions."""
|
|
96
|
+
aligned_reads_count = 0
|
|
97
|
+
unaligned_reads_count = 0
|
|
98
|
+
record_counts: Dict[str, int] = {}
|
|
99
|
+
for line in output.splitlines():
|
|
100
|
+
if not line.strip():
|
|
101
|
+
continue
|
|
102
|
+
ref, _length, mapped, unmapped = line.split("\t")[:4]
|
|
103
|
+
if ref == "*":
|
|
104
|
+
unaligned_reads_count += int(unmapped)
|
|
105
|
+
continue
|
|
106
|
+
mapped_count = int(mapped)
|
|
107
|
+
aligned_reads_count += mapped_count
|
|
108
|
+
record_counts[ref] = mapped_count
|
|
109
|
+
|
|
110
|
+
proportions: Dict[str, Tuple[int, float]] = {}
|
|
111
|
+
for ref, count in record_counts.items():
|
|
112
|
+
proportion = count / aligned_reads_count if aligned_reads_count else 0.0
|
|
113
|
+
proportions[ref] = (count, proportion)
|
|
114
|
+
|
|
115
|
+
return aligned_reads_count, unaligned_reads_count, proportions
|
|
116
|
+
|
|
117
|
+
|
|
28
118
|
def _stream_dorado_logs(stderr_iter) -> None:
|
|
29
119
|
"""Stream dorado stderr and emit structured log messages.
|
|
30
120
|
|
|
@@ -60,8 +150,9 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
60
150
|
|
|
61
151
|
logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
|
|
62
152
|
|
|
153
|
+
pysam_mod = _require_pysam()
|
|
63
154
|
with (
|
|
64
|
-
|
|
155
|
+
pysam_mod.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
|
|
65
156
|
open(fastq_path, "w", encoding="utf-8") as fq,
|
|
66
157
|
):
|
|
67
158
|
for r in bam.fetch(until_eof=True):
|
|
@@ -103,7 +194,8 @@ def _sort_bam_with_pysam(
|
|
|
103
194
|
if threads:
|
|
104
195
|
args += ["-@", str(threads)]
|
|
105
196
|
args += ["-o", out_bam, in_bam]
|
|
106
|
-
|
|
197
|
+
pysam_mod = _require_pysam()
|
|
198
|
+
pysam_mod.sort(*args)
|
|
107
199
|
|
|
108
200
|
|
|
109
201
|
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
@@ -115,11 +207,54 @@ def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = N
|
|
|
115
207
|
"""
|
|
116
208
|
bam_path = str(bam_path)
|
|
117
209
|
logger.debug(f"Indexing BAM using _index_bam_with_pysam")
|
|
210
|
+
pysam_mod = _require_pysam()
|
|
118
211
|
# pysam.index supports samtools-style args
|
|
119
212
|
if threads:
|
|
120
|
-
|
|
213
|
+
pysam_mod.index("-@", str(threads), bam_path)
|
|
121
214
|
else:
|
|
122
|
-
|
|
215
|
+
pysam_mod.index(bam_path)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _bam_to_fastq_with_samtools(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """Write the reads of a BAM file as FASTQ by running ``samtools fastq``.

    Args:
        bam_path: Input BAM path.
        fastq_path: Output FASTQ path; it receives the command's stdout.

    Raises:
        RuntimeError: If samtools is not found on PATH or exits non-zero.
    """
    if shutil.which("samtools") is None:
        raise RuntimeError("samtools is required but not available in PATH.")
    command = ["samtools", "fastq", str(bam_path)]
    logger.debug("Converting BAM to FASTQ using samtools: %s", " ".join(command))
    # stdout is redirected straight into the output file; stderr is captured
    # so it can be reported on failure.
    with open(fastq_path, "w", encoding="utf-8") as fastq_handle:
        result = subprocess.run(command, stdout=fastq_handle, stderr=subprocess.PIPE, text=True)
    if result.returncode == 0:
        return
    raise RuntimeError(f"samtools fastq failed (exit {result.returncode}):\n{result.stderr}")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _sort_bam_with_samtools(
    in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
) -> None:
    """Sort a BAM file by running ``samtools sort``.

    Args:
        in_bam: Input BAM path.
        out_bam: Output path passed to ``-o``.
        threads: When truthy, passed to samtools as ``-@ <threads>``.

    Raises:
        RuntimeError: If samtools is not found on PATH or exits non-zero.
    """
    if shutil.which("samtools") is None:
        raise RuntimeError("samtools is required but not available in PATH.")
    thread_args = ["-@", str(threads)] if threads else []
    command = ["samtools", "sort", "-o", str(out_bam), *thread_args, str(in_bam)]
    logger.debug("Sorting BAM using samtools: %s", " ".join(command))
    outcome = subprocess.run(
        command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
    )
    if outcome.returncode:
        raise RuntimeError(f"samtools sort failed (exit {outcome.returncode}):\n{outcome.stderr}")
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Index a BAM file by running ``samtools index``.

    Args:
        bam_path: Path of the BAM file to index.
        threads: When truthy, passed to samtools as ``-@ <threads>``.

    Raises:
        RuntimeError: If samtools is not found on PATH or exits non-zero.
    """
    if shutil.which("samtools") is None:
        raise RuntimeError("samtools is required but not available in PATH.")
    thread_args = ["-@", str(threads)] if threads else []
    command = ["samtools", "index", *thread_args, str(bam_path)]
    logger.debug("Indexing BAM using samtools: %s", " ".join(command))
    outcome = subprocess.run(
        command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
    )
    if outcome.returncode:
        raise RuntimeError(f"samtools index failed (exit {outcome.returncode}):\n{outcome.stderr}")
|
|
123
258
|
|
|
124
259
|
|
|
125
260
|
def align_and_sort_BAM(
|
|
@@ -156,10 +291,15 @@ def align_and_sort_BAM(
|
|
|
156
291
|
else:
|
|
157
292
|
threads = None
|
|
158
293
|
|
|
294
|
+
samtools_backend = _resolve_samtools_backend(getattr(cfg, "samtools_backend", "auto"))
|
|
295
|
+
|
|
159
296
|
if cfg.aligner == "minimap2":
|
|
160
297
|
if not cfg.align_from_bam:
|
|
161
298
|
logger.debug(f"Converting BAM to FASTQ: {input}")
|
|
162
|
-
|
|
299
|
+
if samtools_backend == "python":
|
|
300
|
+
_bam_to_fastq_with_pysam(input, input_as_fastq)
|
|
301
|
+
else:
|
|
302
|
+
_bam_to_fastq_with_samtools(input, input_as_fastq)
|
|
163
303
|
logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
|
|
164
304
|
mm_input = input_as_fastq
|
|
165
305
|
else:
|
|
@@ -220,12 +360,18 @@ def align_and_sort_BAM(
|
|
|
220
360
|
logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
|
|
221
361
|
return
|
|
222
362
|
|
|
223
|
-
# --- Sort & Index
|
|
363
|
+
# --- Sort & Index ---
|
|
224
364
|
logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
|
|
225
|
-
|
|
365
|
+
if samtools_backend == "python":
|
|
366
|
+
_sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
|
|
367
|
+
else:
|
|
368
|
+
_sort_bam_with_samtools(aligned_output, aligned_sorted_output, threads=threads)
|
|
226
369
|
|
|
227
370
|
logger.debug(f"Indexing: {aligned_sorted_output}")
|
|
228
|
-
|
|
371
|
+
if samtools_backend == "python":
|
|
372
|
+
_index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
373
|
+
else:
|
|
374
|
+
_index_bam_with_samtools(aligned_sorted_output, threads=threads)
|
|
229
375
|
|
|
230
376
|
|
|
231
377
|
def bam_qc(
|
|
@@ -236,25 +382,20 @@ def bam_qc(
|
|
|
236
382
|
stats: bool = True,
|
|
237
383
|
flagstats: bool = True,
|
|
238
384
|
idxstats: bool = True,
|
|
385
|
+
samtools_backend: str | None = "auto",
|
|
239
386
|
) -> None:
|
|
240
387
|
"""
|
|
241
388
|
QC for BAM/CRAMs: stats, flagstat, idxstats.
|
|
242
389
|
Prefers pysam; falls back to `samtools` if needed.
|
|
243
390
|
Runs BAMs in parallel (up to `threads`, default serial).
|
|
244
391
|
"""
|
|
245
|
-
import shutil
|
|
246
392
|
import subprocess
|
|
247
393
|
|
|
248
394
|
logger.debug("Performing BAM QC using bam_qc")
|
|
249
395
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
have_pysam = True
|
|
255
|
-
except Exception:
|
|
256
|
-
pysam = None # type: ignore
|
|
257
|
-
have_pysam = False
|
|
396
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
397
|
+
have_pysam = backend_choice == "python"
|
|
398
|
+
pysam_mod = _require_pysam() if have_pysam else None
|
|
258
399
|
|
|
259
400
|
bam_qc_dir = Path(bam_qc_dir)
|
|
260
401
|
bam_qc_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -275,11 +416,9 @@ def bam_qc(
|
|
|
275
416
|
if _has_index(p):
|
|
276
417
|
return
|
|
277
418
|
if have_pysam:
|
|
278
|
-
assert
|
|
279
|
-
|
|
419
|
+
assert pysam_mod is not None
|
|
420
|
+
pysam_mod.index(str(p)) # supports BAM & CRAM
|
|
280
421
|
else:
|
|
281
|
-
if not shutil.which("samtools"):
|
|
282
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
283
422
|
cmd = ["samtools", "index", str(p)]
|
|
284
423
|
# capture text so errors are readable; raise on failure
|
|
285
424
|
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
@@ -300,7 +439,7 @@ def bam_qc(
|
|
|
300
439
|
line = line.rstrip()
|
|
301
440
|
if line:
|
|
302
441
|
last_err.append(line)
|
|
303
|
-
logger.
|
|
442
|
+
logger.debug("[%s][%s] %s", tag, bam.name, line)
|
|
304
443
|
rc = proc.wait()
|
|
305
444
|
|
|
306
445
|
if rc != 0:
|
|
@@ -332,16 +471,13 @@ def bam_qc(
|
|
|
332
471
|
# Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
|
|
333
472
|
logger.warning("Indexing failed for %s: %s", bam, e)
|
|
334
473
|
|
|
335
|
-
if not have_pysam:
|
|
336
|
-
import shutil
|
|
337
|
-
|
|
338
|
-
if not shutil.which("samtools"):
|
|
339
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
340
|
-
|
|
341
474
|
# --- stats ---
|
|
342
475
|
if stats:
|
|
343
|
-
if have_pysam
|
|
344
|
-
|
|
476
|
+
if have_pysam:
|
|
477
|
+
assert pysam_mod is not None
|
|
478
|
+
if not hasattr(pysam_mod, "stats"):
|
|
479
|
+
raise RuntimeError("pysam.stats is unavailable in this pysam build.")
|
|
480
|
+
txt = pysam_mod.stats(str(bam))
|
|
345
481
|
out_stats.write_text(txt)
|
|
346
482
|
results.append(("stats(pysam)", 0))
|
|
347
483
|
else:
|
|
@@ -351,8 +487,11 @@ def bam_qc(
|
|
|
351
487
|
|
|
352
488
|
# --- flagstat ---
|
|
353
489
|
if flagstats:
|
|
354
|
-
if have_pysam
|
|
355
|
-
|
|
490
|
+
if have_pysam:
|
|
491
|
+
assert pysam_mod is not None
|
|
492
|
+
if not hasattr(pysam_mod, "flagstat"):
|
|
493
|
+
raise RuntimeError("pysam.flagstat is unavailable in this pysam build.")
|
|
494
|
+
txt = pysam_mod.flagstat(str(bam))
|
|
356
495
|
out_flag.write_text(txt)
|
|
357
496
|
results.append(("flagstat(pysam)", 0))
|
|
358
497
|
else:
|
|
@@ -362,8 +501,11 @@ def bam_qc(
|
|
|
362
501
|
|
|
363
502
|
# --- idxstats ---
|
|
364
503
|
if idxstats:
|
|
365
|
-
if have_pysam
|
|
366
|
-
|
|
504
|
+
if have_pysam:
|
|
505
|
+
assert pysam_mod is not None
|
|
506
|
+
if not hasattr(pysam_mod, "idxstats"):
|
|
507
|
+
raise RuntimeError("pysam.idxstats is unavailable in this pysam build.")
|
|
508
|
+
txt = pysam_mod.idxstats(str(bam))
|
|
367
509
|
out_idx.write_text(txt)
|
|
368
510
|
results.append(("idxstats(pysam)", 0))
|
|
369
511
|
else:
|
|
@@ -400,6 +542,8 @@ def concatenate_fastqs_to_bam(
|
|
|
400
542
|
rg_sample_field: Optional[str] = None,
|
|
401
543
|
progress: bool = True,
|
|
402
544
|
auto_pair: bool = True,
|
|
545
|
+
gzip_suffixes: Tuple[str, ...] = (".gz", ".gzip"),
|
|
546
|
+
samtools_backend: str | None = "auto",
|
|
403
547
|
) -> Dict[str, Any]:
|
|
404
548
|
"""
|
|
405
549
|
Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
|
|
@@ -422,6 +566,10 @@ def concatenate_fastqs_to_bam(
|
|
|
422
566
|
Show tqdm progress bars.
|
|
423
567
|
auto_pair : bool
|
|
424
568
|
Auto-pair R1/R2 based on filename patterns if given a flat list.
|
|
569
|
+
gzip_suffixes : tuple[str, ...]
|
|
570
|
+
Suffixes treated as gzip-compressed FASTQ files.
|
|
571
|
+
samtools_backend : str | None
|
|
572
|
+
Backend selection for samtools-compatible operations (auto|python|cli).
|
|
425
573
|
|
|
426
574
|
Returns
|
|
427
575
|
-------
|
|
@@ -436,9 +584,10 @@ def concatenate_fastqs_to_bam(
|
|
|
436
584
|
"""
|
|
437
585
|
name = p.name
|
|
438
586
|
lowers = name.lower()
|
|
587
|
+
gzip_exts = tuple(s.lower() for s in gzip_suffixes)
|
|
439
588
|
for ext in (
|
|
440
|
-
".fastq
|
|
441
|
-
".fq
|
|
589
|
+
*(f".fastq{suf}" for suf in gzip_exts),
|
|
590
|
+
*(f".fq{suf}" for suf in gzip_exts),
|
|
442
591
|
".fastq.bz2",
|
|
443
592
|
".fq.bz2",
|
|
444
593
|
".fastq.xz",
|
|
@@ -525,10 +674,50 @@ def concatenate_fastqs_to_bam(
|
|
|
525
674
|
Pysam Fastx records.
|
|
526
675
|
"""
|
|
527
676
|
# pysam.FastxFile handles compressed extensions transparently
|
|
528
|
-
|
|
677
|
+
pysam_mod = _require_pysam()
|
|
678
|
+
with pysam_mod.FastxFile(str(p)) as fx:
|
|
529
679
|
for rec in fx:
|
|
530
680
|
yield rec # rec.name, rec.sequence, rec.quality
|
|
531
681
|
|
|
682
|
+
def _fastq_iter_plain(p: Path) -> Iterable[Tuple[str, str, str]]:
    """Yield FASTQ records from plain-text parsing.

    The opener is chosen from the lower-cased file name: a name ending in any
    of the enclosing function's ``gzip_suffixes`` is read with gzip, ``.bz2``
    with bz2, ``.xz`` with lzma, anything else as plain UTF-8 text. Records
    are read as four consecutive lines (header, sequence, separator, quality);
    the separator line's content is ignored.

    Args:
        p: FASTQ path.

    Yields:
        Tuple of (name, sequence, quality). ``name`` is the first
        whitespace-delimited token of the header with a leading "@" removed.
    """
    import bz2
    import gzip
    import lzma

    lowers = p.name.lower()
    if lowers.endswith(tuple(s.lower() for s in gzip_suffixes)):
        handle = gzip.open(p, "rt", encoding="utf-8")
    elif lowers.endswith(".bz2"):
        handle = bz2.open(p, "rt", encoding="utf-8")
    elif lowers.endswith(".xz"):
        handle = lzma.open(p, "rt", encoding="utf-8")
    else:
        handle = p.open("r", encoding="utf-8")

    with handle as fh:
        while True:
            header = fh.readline()
            if not header:
                break
            if not header.strip():
                # A blank line is not a header: skip it so the 4-line frame
                # stays aligned and name.split()[0] is not hit on empty text.
                continue
            seq = fh.readline()
            fh.readline()  # separator line; content ignored
            qual = fh.readline()
            if not qual:
                # Input ended before the quality line: drop the partial record.
                break
            name = header.strip()
            if name.startswith("@"):
                name = name[1:]
            name = name.split()[0]
            yield name, seq.strip(), qual.strip()
|
|
720
|
+
|
|
532
721
|
def _make_unaligned_segment(
|
|
533
722
|
name: str,
|
|
534
723
|
seq: str,
|
|
@@ -550,11 +739,12 @@ def concatenate_fastqs_to_bam(
|
|
|
550
739
|
Returns:
|
|
551
740
|
Unaligned pysam.AlignedSegment.
|
|
552
741
|
"""
|
|
553
|
-
|
|
742
|
+
pysam_mod = _require_pysam()
|
|
743
|
+
a = pysam_mod.AlignedSegment()
|
|
554
744
|
a.query_name = name
|
|
555
745
|
a.query_sequence = seq
|
|
556
746
|
if qual is not None:
|
|
557
|
-
a.query_qualities =
|
|
747
|
+
a.query_qualities = pysam_mod.qualitystring_to_array(qual)
|
|
558
748
|
a.is_unmapped = True
|
|
559
749
|
a.is_paired = read1 or read2
|
|
560
750
|
a.is_read1 = read1
|
|
@@ -570,6 +760,48 @@ def concatenate_fastqs_to_bam(
|
|
|
570
760
|
a.set_tag("RG", str(bc), value_type="Z")
|
|
571
761
|
return a
|
|
572
762
|
|
|
763
|
+
def _write_sam_line(
    handle,
    name: str,
    seq: str,
    qual: str,
    bc: str,
    *,
    read1: bool,
    read2: bool,
    add_read_group: bool,
) -> None:
    """Write a single unaligned SAM record to a text stream.

    Args:
        handle: Writable text stream receiving the record.
        name: Read name (QNAME column).
        seq: Read sequence; an empty value is written as "*".
        qual: Quality string; an empty value is written as "*".
        bc: Barcode value stored under the enclosing function's
            ``barcode_tag`` and, when requested, under ``RG``.
        read1: Mark the record as first in pair (flag 77).
        read2: Mark the record as second in pair (flag 141); only consulted
            when ``read1`` is false.
        add_read_group: Also emit an ``RG:Z:<bc>`` tag.
    """
    if read1:
        flag = 77  # 0x1 | 0x4 | 0x8 | 0x40
    elif read2:
        flag = 141  # 0x1 | 0x4 | 0x8 | 0x80
    else:
        flag = 4  # 0x4 only
    tags = [f"{barcode_tag}:Z:{bc}"]
    if add_read_group:
        tags.append(f"RG:Z:{bc}")
    tag_str = "\t".join(tags)
    if not seq:
        # An empty column is not a valid SAM field; "*" marks a missing SEQ,
        # and QUAL must then be "*" as well.
        seq = "*"
        qual = "*"
    if not qual:
        qual = "*"
    line = "\t".join(
        [
            name,
            str(flag),
            "*",
            "0",
            "0",
            "*",
            "*",
            "0",
            "0",
            seq,
            qual,
            tag_str,
        ]
    )
    handle.write(f"{line}\n")
|
|
804
|
+
|
|
573
805
|
# ---------- normalize inputs to Path ----------
|
|
574
806
|
def _to_path_pair(x) -> Tuple[Path, Path]:
|
|
575
807
|
"""Convert a tuple of path-like objects to Path instances."""
|
|
@@ -630,7 +862,29 @@ def concatenate_fastqs_to_bam(
|
|
|
630
862
|
singletons_written = 0
|
|
631
863
|
|
|
632
864
|
# ---------- write BAM ----------
|
|
633
|
-
|
|
865
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
866
|
+
if backend_choice == "python":
|
|
867
|
+
pysam_mod = _require_pysam()
|
|
868
|
+
bam_out_ctx = pysam_mod.AlignmentFile(str(output_bam), "wb", header=header)
|
|
869
|
+
else:
|
|
870
|
+
cmd = ["samtools", "view", "-b", "-o", str(output_bam), "-"]
|
|
871
|
+
logger.debug("Writing BAM using samtools: %s", " ".join(cmd))
|
|
872
|
+
bam_out_ctx = subprocess.Popen(
|
|
873
|
+
cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
|
|
874
|
+
)
|
|
875
|
+
assert bam_out_ctx.stdin is not None
|
|
876
|
+
header_lines = ["@HD\tVN:1.6\tSO:unknown"]
|
|
877
|
+
if add_read_group:
|
|
878
|
+
for bc in barcodes_in_order:
|
|
879
|
+
rg_fields = [f"ID:{bc}"]
|
|
880
|
+
if rg_sample_field:
|
|
881
|
+
rg_fields.append(f"SM:{rg_sample_field}")
|
|
882
|
+
rg_body = "\t".join(rg_fields)
|
|
883
|
+
header_lines.append(f"@RG\t{rg_body}")
|
|
884
|
+
header_lines.append("@PG\tID:concat-fastq\tPN:concatenate_fastqs_to_bam\tVN:1")
|
|
885
|
+
bam_out_ctx.stdin.write("\n".join(header_lines) + "\n")
|
|
886
|
+
|
|
887
|
+
try:
|
|
634
888
|
# Paired
|
|
635
889
|
it_pairs = explicit_pairs
|
|
636
890
|
if progress and it_pairs:
|
|
@@ -640,8 +894,12 @@ def concatenate_fastqs_to_bam(
|
|
|
640
894
|
raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
|
|
641
895
|
bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
|
|
642
896
|
|
|
643
|
-
|
|
644
|
-
|
|
897
|
+
if backend_choice == "python":
|
|
898
|
+
it1 = _fastq_iter(r1_path)
|
|
899
|
+
it2 = _fastq_iter(r2_path)
|
|
900
|
+
else:
|
|
901
|
+
it1 = _fastq_iter_plain(r1_path)
|
|
902
|
+
it2 = _fastq_iter_plain(r2_path)
|
|
645
903
|
|
|
646
904
|
for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
|
|
647
905
|
|
|
@@ -652,24 +910,67 @@ def concatenate_fastqs_to_bam(
|
|
|
652
910
|
return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
|
|
653
911
|
|
|
654
912
|
name = (
|
|
655
|
-
_clean(getattr(rec1, "name", None))
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
or getattr(rec2, "name", None)
|
|
913
|
+
_clean(getattr(rec1, "name", None) if backend_choice == "python" else rec1[0])
|
|
914
|
+
if rec1 is not None
|
|
915
|
+
else None
|
|
659
916
|
)
|
|
917
|
+
if name is None:
|
|
918
|
+
name = (
|
|
919
|
+
_clean(
|
|
920
|
+
getattr(rec2, "name", None) if backend_choice == "python" else rec2[0]
|
|
921
|
+
)
|
|
922
|
+
if rec2 is not None
|
|
923
|
+
else None
|
|
924
|
+
)
|
|
925
|
+
if name is None:
|
|
926
|
+
name = (
|
|
927
|
+
getattr(rec1, "name", None)
|
|
928
|
+
if backend_choice == "python" and rec1 is not None
|
|
929
|
+
else (rec1[0] if rec1 is not None else None)
|
|
930
|
+
)
|
|
931
|
+
if name is None:
|
|
932
|
+
name = (
|
|
933
|
+
getattr(rec2, "name", None)
|
|
934
|
+
if backend_choice == "python" and rec2 is not None
|
|
935
|
+
else (rec2[0] if rec2 is not None else None)
|
|
936
|
+
)
|
|
660
937
|
|
|
661
938
|
if rec1 is not None:
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
939
|
+
if backend_choice == "python":
|
|
940
|
+
a1 = _make_unaligned_segment(
|
|
941
|
+
name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
|
|
942
|
+
)
|
|
943
|
+
bam_out_ctx.write(a1)
|
|
944
|
+
else:
|
|
945
|
+
_write_sam_line(
|
|
946
|
+
bam_out_ctx.stdin,
|
|
947
|
+
name,
|
|
948
|
+
rec1[1],
|
|
949
|
+
rec1[2],
|
|
950
|
+
bc,
|
|
951
|
+
read1=True,
|
|
952
|
+
read2=False,
|
|
953
|
+
add_read_group=add_read_group,
|
|
954
|
+
)
|
|
666
955
|
per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
|
|
667
956
|
total_written += 1
|
|
668
957
|
if rec2 is not None:
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
958
|
+
if backend_choice == "python":
|
|
959
|
+
a2 = _make_unaligned_segment(
|
|
960
|
+
name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
|
|
961
|
+
)
|
|
962
|
+
bam_out_ctx.write(a2)
|
|
963
|
+
else:
|
|
964
|
+
_write_sam_line(
|
|
965
|
+
bam_out_ctx.stdin,
|
|
966
|
+
name,
|
|
967
|
+
rec2[1],
|
|
968
|
+
rec2[2],
|
|
969
|
+
bc,
|
|
970
|
+
read1=False,
|
|
971
|
+
read2=True,
|
|
972
|
+
add_read_group=add_read_group,
|
|
973
|
+
)
|
|
673
974
|
per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
|
|
674
975
|
total_written += 1
|
|
675
976
|
|
|
@@ -689,14 +990,40 @@ def concatenate_fastqs_to_bam(
|
|
|
689
990
|
if not pth.exists():
|
|
690
991
|
raise FileNotFoundError(pth)
|
|
691
992
|
bc = per_path_barcode.get(pth, "barcode")
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
)
|
|
696
|
-
|
|
993
|
+
if backend_choice == "python":
|
|
994
|
+
iterator = _fastq_iter(pth)
|
|
995
|
+
else:
|
|
996
|
+
iterator = _fastq_iter_plain(pth)
|
|
997
|
+
for rec in iterator:
|
|
998
|
+
if backend_choice == "python":
|
|
999
|
+
a = _make_unaligned_segment(
|
|
1000
|
+
rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
|
|
1001
|
+
)
|
|
1002
|
+
bam_out_ctx.write(a)
|
|
1003
|
+
else:
|
|
1004
|
+
_write_sam_line(
|
|
1005
|
+
bam_out_ctx.stdin,
|
|
1006
|
+
rec[0],
|
|
1007
|
+
rec[1],
|
|
1008
|
+
rec[2],
|
|
1009
|
+
bc,
|
|
1010
|
+
read1=False,
|
|
1011
|
+
read2=False,
|
|
1012
|
+
add_read_group=add_read_group,
|
|
1013
|
+
)
|
|
697
1014
|
per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
|
|
698
1015
|
total_written += 1
|
|
699
1016
|
singletons_written += 1
|
|
1017
|
+
finally:
|
|
1018
|
+
if backend_choice == "python":
|
|
1019
|
+
bam_out_ctx.close()
|
|
1020
|
+
else:
|
|
1021
|
+
if bam_out_ctx.stdin is not None:
|
|
1022
|
+
bam_out_ctx.stdin.close()
|
|
1023
|
+
rc = bam_out_ctx.wait()
|
|
1024
|
+
if rc != 0:
|
|
1025
|
+
stderr = bam_out_ctx.stderr.read() if bam_out_ctx.stderr else ""
|
|
1026
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
700
1027
|
|
|
701
1028
|
return {
|
|
702
1029
|
"total_reads": total_written,
|
|
@@ -707,7 +1034,7 @@ def concatenate_fastqs_to_bam(
|
|
|
707
1034
|
}
|
|
708
1035
|
|
|
709
1036
|
|
|
710
|
-
def count_aligned_reads(bam_file):
|
|
1037
|
+
def count_aligned_reads(bam_file, samtools_backend: str | None = "auto"):
    """Count the reads in a BAM file that align to each reference record.

    Args:
        bam_file: Path to the BAM file, as a ``str`` or ``pathlib.Path``.
        samtools_backend: Backend selection for samtools-compatible operations
            (auto|python|cli), resolved through ``_resolve_samtools_backend``.

    Returns:
        With the python backend, a tuple of:
            aligned_reads_count (int): Number of records that are mapped.
            unaligned_reads_count (int): Number of records that are unmapped.
            record_counts (dict): A dictionary keyed by reference name that points
                to a tuple containing the total reads mapped to the record and the
                fraction of mapped reads which map to the record.
        With the CLI backend, the value built by ``_parse_idxstats_output`` from the
        ``samtools idxstats`` report (presumably the same three-item shape; confirm
        against that helper).

    Raises:
        RuntimeError: If ``samtools idxstats`` exits with a non-zero status.
    """
    # Normalise once so both ``str`` and ``Path`` inputs can be logged by file name.
    bam_path = Path(bam_file)
    logger.info("Counting aligned reads in BAM > {}".format(bam_path.name))
    backend_choice = _resolve_samtools_backend(samtools_backend)

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        aligned_reads_count = 0
        unaligned_reads_count = 0
        reads_per_reference = defaultdict(int)
        # Every record is visited, so the totals come from the records themselves
        # and no index-derived statistics are consulted.
        with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
            for read in bam:
                if read.is_unmapped:
                    unaligned_reads_count += 1
                else:
                    aligned_reads_count += 1
                    reads_per_reference[read.reference_name] += 1

        # Pair each per-reference count with its share of all mapped reads.
        # The dict is empty when nothing aligned, so the division never sees zero.
        record_counts = {
            reference: (count, count / aligned_reads_count)
            for reference, count in reads_per_reference.items()
        }
        return aligned_reads_count, unaligned_reads_count, record_counts

    # CLI backend: idxstats reads its numbers from the BAM index, so make sure one exists.
    _ensure_bam_index(bam_path, backend_choice)
    cmd = ["samtools", "idxstats", str(bam_path)]
    cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if cp.returncode != 0:
        raise RuntimeError(f"samtools idxstats failed (exit {cp.returncode}):\n{cp.stderr}")
    return _parse_idxstats_output(cp.stdout)
|
|
747
1086
|
|
|
748
1087
|
|
|
749
1088
|
def demux_and_index_BAM(
|
|
@@ -827,7 +1166,14 @@ def demux_and_index_BAM(
|
|
|
827
1166
|
return renamed_bams
|
|
828
1167
|
|
|
829
1168
|
|
|
830
|
-
def extract_base_identities(
|
|
1169
|
+
def extract_base_identities(
|
|
1170
|
+
bam_file,
|
|
1171
|
+
chromosome,
|
|
1172
|
+
positions,
|
|
1173
|
+
max_reference_length,
|
|
1174
|
+
sequence,
|
|
1175
|
+
samtools_backend: str | None = "auto",
|
|
1176
|
+
):
|
|
831
1177
|
"""
|
|
832
1178
|
Efficiently extracts base identities from mapped reads with reference coordinates.
|
|
833
1179
|
|
|
@@ -850,31 +1196,87 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
850
1196
|
rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
851
1197
|
mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
|
|
852
1198
|
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
total_reads = bam.mapped
|
|
856
|
-
ref_seq = sequence.upper()
|
|
857
|
-
for read in bam.fetch(chromosome):
|
|
858
|
-
if not read.is_mapped:
|
|
859
|
-
continue # Skip unmapped reads
|
|
1199
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1200
|
+
ref_seq = sequence.upper()
|
|
860
1201
|
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
1202
|
+
if backend_choice == "python":
|
|
1203
|
+
logger.debug("Extracting base identities using python")
|
|
1204
|
+
pysam_mod = _require_pysam()
|
|
1205
|
+
# print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
1206
|
+
with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
|
|
1207
|
+
total_reads = bam.mapped
|
|
1208
|
+
for read in bam.fetch(chromosome):
|
|
1209
|
+
if not read.is_mapped:
|
|
1210
|
+
continue # Skip unmapped reads
|
|
864
1211
|
|
|
865
|
-
|
|
866
|
-
|
|
1212
|
+
read_name = read.query_name
|
|
1213
|
+
query_sequence = read.query_sequence
|
|
1214
|
+
base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
|
|
867
1215
|
|
|
868
|
-
|
|
869
|
-
|
|
1216
|
+
# Use get_aligned_pairs directly with positions filtering
|
|
1217
|
+
aligned_pairs = read.get_aligned_pairs(matches_only=True)
|
|
1218
|
+
|
|
1219
|
+
for read_position, reference_position in aligned_pairs:
|
|
870
1220
|
read_base = query_sequence[read_position]
|
|
871
1221
|
ref_base = ref_seq[reference_position]
|
|
1222
|
+
if reference_position in positions:
|
|
1223
|
+
base_dict[read_name][reference_position] = read_base
|
|
872
1224
|
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
1225
|
+
# Track mismatches (excluding Ns)
|
|
1226
|
+
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
1227
|
+
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1228
|
+
else:
|
|
1229
|
+
bam_path = Path(bam_file)
|
|
1230
|
+
logger.debug("Extracting base identities using samtools")
|
|
1231
|
+
_ensure_bam_index(bam_path, backend_choice)
|
|
1232
|
+
|
|
1233
|
+
def _iter_aligned_pairs(cigar: str, start: int) -> Iterable[Tuple[int, int]]:
|
|
1234
|
+
qpos = 0
|
|
1235
|
+
rpos = start
|
|
1236
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1237
|
+
length = int(length_str)
|
|
1238
|
+
if op in {"M", "=", "X"}:
|
|
1239
|
+
for _ in range(length):
|
|
1240
|
+
yield qpos, rpos
|
|
1241
|
+
qpos += 1
|
|
1242
|
+
rpos += 1
|
|
1243
|
+
elif op in {"I", "S"}:
|
|
1244
|
+
qpos += length
|
|
1245
|
+
elif op in {"D", "N"}:
|
|
1246
|
+
rpos += length
|
|
1247
|
+
elif op in {"H", "P"}:
|
|
1248
|
+
continue
|
|
1249
|
+
|
|
1250
|
+
cmd = ["samtools", "view", "-F", "4", str(bam_path), chromosome]
|
|
1251
|
+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1252
|
+
assert proc.stdout is not None
|
|
1253
|
+
for line in proc.stdout:
|
|
1254
|
+
if not line.strip() or line.startswith("@"):
|
|
1255
|
+
continue
|
|
1256
|
+
fields = line.rstrip("\n").split("\t")
|
|
1257
|
+
if len(fields) < 11:
|
|
1258
|
+
continue
|
|
1259
|
+
read_name = fields[0]
|
|
1260
|
+
flag = int(fields[1])
|
|
1261
|
+
pos = int(fields[3])
|
|
1262
|
+
cigar = fields[5]
|
|
1263
|
+
query_sequence = fields[9]
|
|
1264
|
+
if cigar == "*" or query_sequence == "*":
|
|
1265
|
+
continue
|
|
1266
|
+
base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
|
|
1267
|
+
for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
|
|
1268
|
+
if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
|
|
1269
|
+
continue
|
|
1270
|
+
read_base = query_sequence[read_pos]
|
|
1271
|
+
ref_base = ref_seq[ref_pos]
|
|
1272
|
+
if ref_pos in positions:
|
|
1273
|
+
base_dict[read_name][ref_pos] = read_base
|
|
876
1274
|
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
877
1275
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1276
|
+
rc = proc.wait()
|
|
1277
|
+
if rc != 0:
|
|
1278
|
+
stderr = proc.stderr.read() if proc.stderr else ""
|
|
1279
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
878
1280
|
|
|
879
1281
|
# Determine C→T vs G→A dominance per read
|
|
880
1282
|
mismatch_trend_per_read = {}
|
|
@@ -899,46 +1301,137 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
899
1301
|
)
|
|
900
1302
|
|
|
901
1303
|
|
|
902
|
-
def extract_read_features_from_bam(
|
|
903
|
-
""
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
1304
|
+
def extract_read_features_from_bam(
    bam_file_path: str | Path, samtools_backend: str | None = "auto"
) -> Dict[str, List[float]]:
    """Extract read metrics from a BAM file.

    Unmapped reads are skipped. When a read name occurs on more than one record,
    the metrics of the last record seen replace the earlier ones.

    Args:
        bam_file_path: Path to the BAM file.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        Mapping of read name to [read_length, read_median_qscore, reference_length,
        mapped_length, mapping_quality]. Missing base qualities and unknown
        reference lengths are reported as NaN.

    Raises:
        RuntimeError: If a ``samtools view`` call exits with a non-zero status
            (CLI backend only).
    """
    logger.debug(
        "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
        bam_file_path,
    )
    backend_choice = _resolve_samtools_backend(samtools_backend)
    read_metrics: Dict[str, List[float]] = {}

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
            reference_lengths = dict(zip(bam_file.references, bam_file.lengths))
            for read in bam_file:
                if read.is_unmapped:
                    continue
                read_quality = read.query_qualities
                if read_quality is None:
                    median_read_quality = float("nan")
                else:
                    median_read_quality = float(np.median(read_quality))
                reference_length = reference_lengths.get(read.reference_name, float("nan"))
                # Aligned length is the total size of the reference blocks of the read.
                mapped_length = sum(end - start for start, end in read.get_blocks())
                mapping_quality = float(read.mapping_quality)
                read_metrics[read.query_name] = [
                    float(read.query_length),
                    median_read_quality,
                    float(reference_length),
                    float(mapped_length),
                    mapping_quality,
                ]
        return read_metrics

    bam_path = Path(bam_file_path)

    def _parse_reference_lengths(header_text: str) -> Dict[str, int]:
        """Map the SN name of every ``@SQ`` header line to its LN length."""
        ref_lengths: Dict[str, int] = {}
        for line in header_text.splitlines():
            if not line.startswith("@SQ"):
                continue
            name = None
            length = None
            for field in line.split("\t")[1:]:
                if field.startswith("SN:"):
                    name = field.split(":", 1)[1]
                elif field.startswith("LN:"):
                    length = int(field.split(":", 1)[1])
            # Lines lacking either tag are ignored.
            if name is not None and length is not None:
                ref_lengths[name] = length
        return ref_lengths

    def _mapped_length_from_cigar(cigar: str) -> int:
        """Sum the lengths of the M, = and X operations of a CIGAR string."""
        return sum(
            int(length_str)
            for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar)
            if op in {"M", "=", "X"}
        )

    header_cp = subprocess.run(
        ["samtools", "view", "-H", str(bam_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    if header_cp.returncode != 0:
        raise RuntimeError(
            f"samtools view -H failed (exit {header_cp.returncode}):\n{header_cp.stderr}"
        )
    reference_lengths = _parse_reference_lengths(header_cp.stdout)

    # The context manager closes both pipes and reaps the child even when a
    # malformed record makes the parsing below raise.
    with subprocess.Popen(
        ["samtools", "view", "-F", "4", str(bam_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            if not line.strip() or line.startswith("@"):
                continue
            fields = line.rstrip("\n").split("\t")
            # Fewer than the eleven mandatory SAM columns: not a usable record.
            if len(fields) < 11:
                continue
            read_name = fields[0]
            reference_name = fields[2]
            mapping_quality = float(fields[4])
            cigar = fields[5]
            sequence = fields[9]
            quality = fields[10]
            if sequence == "*":
                read_length = float("nan")
            else:
                read_length = float(len(sequence))
            if quality == "*" or not quality:
                median_read_quality = float("nan")
            else:
                # Quality characters are decoded by subtracting the offset 33.
                phreds = [ord(char) - 33 for char in quality]
                median_read_quality = float(np.median(phreds))
            reference_length = float(reference_lengths.get(reference_name, float("nan")))
            mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
            read_metrics[read_name] = [
                read_length,
                median_read_quality,
                reference_length,
                mapped_length,
                mapping_quality,
            ]

        rc = proc.wait()
        if rc != 0:
            stderr = proc.stderr.read() if proc.stderr else ""
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

    return read_metrics
|
|
939
1432
|
|
|
940
1433
|
|
|
941
|
-
def extract_readnames_from_bam(aligned_BAM):
|
|
1434
|
+
def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
    """Write the read names of a BAM file to a text file, one name per line.

    The output path is the input path up to its first ``.bam`` with
    ``_read_names.txt`` appended.

    Args:
        aligned_BAM: Path to the BAM file, as a ``str`` or ``pathlib.Path``.
        samtools_backend: Backend selection for samtools-compatible operations
            (auto|python|cli), resolved through ``_resolve_samtools_backend``.

    Returns:
        None

    Raises:
        RuntimeError: If ``samtools view`` exits with a non-zero status (CLI backend only).
    """
    backend_choice = _resolve_samtools_backend(samtools_backend)
    # Work on the string form so path objects are accepted as well as plain strings.
    bam_path = str(aligned_BAM)
    txt_output = bam_path.split(".bam")[0] + "_read_names.txt"

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with (
            pysam_mod.AlignmentFile(bam_path, "rb") as bam,
            open(txt_output, "w", encoding="utf-8") as output_file,
        ):
            for read in bam:
                output_file.write(f"{read.query_name}\n")
        return

    # The context managers close the pipes, reap the child and close the output
    # file even if writing fails part-way through.
    with (
        subprocess.Popen(
            ["samtools", "view", bam_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        ) as samtools_view,
        open(txt_output, "w", encoding="utf-8") as output_file,
    ):
        assert samtools_view.stdout is not None
        for line in samtools_view.stdout:
            if not line.strip():
                continue
            # The read name is the first tab-separated column of a SAM record.
            qname = line.split("\t", 1)[0]
            output_file.write(f"{qname}\n")
        rc = samtools_view.wait()
        if rc != 0:
            stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
964
1473
|
|
|
965
1474
|
|
|
966
|
-
def separate_bam_by_bc(
|
|
1475
|
+
def separate_bam_by_bc(
|
|
1476
|
+
input_bam, output_prefix, bam_suffix, split_dir, samtools_backend: str | None = "auto"
|
|
1477
|
+
):
|
|
967
1478
|
"""
|
|
968
1479
|
Separates an input BAM file on the BC SAM tag values.
|
|
969
1480
|
|
|
@@ -981,34 +1492,80 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
|
981
1492
|
bam_base = input_bam.name
|
|
982
1493
|
bam_base_minus_suffix = input_bam.stem
|
|
983
1494
|
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
#
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1495
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1496
|
+
|
|
1497
|
+
if backend_choice == "python":
|
|
1498
|
+
pysam_mod = _require_pysam()
|
|
1499
|
+
# Open the input BAM file for reading
|
|
1500
|
+
with pysam_mod.AlignmentFile(str(input_bam), "rb") as bam:
|
|
1501
|
+
# Create a dictionary to store output BAM files
|
|
1502
|
+
output_files = {}
|
|
1503
|
+
# Iterate over each read in the BAM file
|
|
1504
|
+
for read in bam:
|
|
1505
|
+
try:
|
|
1506
|
+
# Get the barcode tag value
|
|
1507
|
+
bc_tag = read.get_tag("BC", with_value_type=True)[0]
|
|
1508
|
+
# bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
1509
|
+
# Open the output BAM file corresponding to the barcode
|
|
1510
|
+
if bc_tag not in output_files:
|
|
1511
|
+
output_path = (
|
|
1512
|
+
split_dir
|
|
1513
|
+
/ f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
|
|
1514
|
+
)
|
|
1515
|
+
output_files[bc_tag] = pysam_mod.AlignmentFile(
|
|
1516
|
+
str(output_path), "wb", header=bam.header
|
|
1517
|
+
)
|
|
1518
|
+
# Write the read to the corresponding output BAM file
|
|
1519
|
+
output_files[bc_tag].write(read)
|
|
1520
|
+
except KeyError:
|
|
1521
|
+
logger.warning(f"BC tag not present for read: {read.query_name}")
|
|
1522
|
+
# Close all output BAM files
|
|
1523
|
+
for output_file in output_files.values():
|
|
1524
|
+
output_file.close()
|
|
1525
|
+
return
|
|
1526
|
+
|
|
1527
|
+
def _collect_bc_tags() -> set[str]:
    """Return the distinct BC tag values found in ``input_bam``.

    Streams ``samtools view`` over the enclosing function's ``input_bam`` and
    records the value of the first ``BC:`` optional field on each record.

    Raises:
        RuntimeError: If ``samtools view`` exits with a non-zero status.
    """
    bc_tags: set[str] = set()
    # The context manager closes both pipes and reaps the child even when
    # parsing a record raises.
    with subprocess.Popen(
        ["samtools", "view", str(input_bam)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            if not line.strip():
                continue
            # Optional fields start after the eleven mandatory SAM columns.
            optional_fields = line.rstrip("\n").split("\t")[11:]
            for tag in optional_fields:
                if tag.startswith("BC:"):
                    # Fields look like TAG:TYPE:VALUE; keep the VALUE part.
                    bc_tags.add(tag.split(":", 2)[2])
                    break
        rc = proc.wait()
        if rc != 0:
            stderr = proc.stderr.read() if proc.stderr else ""
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
    return bc_tags
|
|
1009
1549
|
|
|
1550
|
+
bc_tags = _collect_bc_tags()
|
|
1551
|
+
if not bc_tags:
|
|
1552
|
+
logger.warning("No BC tags found in %s", input_bam)
|
|
1553
|
+
return
|
|
1554
|
+
|
|
1555
|
+
for bc_tag in bc_tags:
|
|
1556
|
+
output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
|
|
1557
|
+
cmd = ["samtools", "view", "-b", "-d", f"BC:{bc_tag}", "-o", str(output_path)]
|
|
1558
|
+
cmd.append(str(input_bam))
|
|
1559
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
1560
|
+
if cp.returncode != 0:
|
|
1561
|
+
raise RuntimeError(
|
|
1562
|
+
f"samtools view failed for BC={bc_tag} (exit {cp.returncode}):\n{cp.stderr}"
|
|
1563
|
+
)
|
|
1010
1564
|
|
|
1011
|
-
|
|
1565
|
+
|
|
1566
|
+
def split_and_index_BAM(
|
|
1567
|
+
aligned_sorted_BAM, split_dir, bam_suffix, samtools_backend: str | None = "auto"
|
|
1568
|
+
):
|
|
1012
1569
|
"""
|
|
1013
1570
|
A wrapper function for splitting BAMS and indexing them.
|
|
1014
1571
|
Parameters:
|
|
@@ -1023,12 +1580,22 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
|
|
|
1023
1580
|
logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
|
|
1024
1581
|
aligned_sorted_output = aligned_sorted_BAM + bam_suffix
|
|
1025
1582
|
file_prefix = date_string()
|
|
1026
|
-
separate_bam_by_bc(
|
|
1583
|
+
separate_bam_by_bc(
|
|
1584
|
+
aligned_sorted_output,
|
|
1585
|
+
file_prefix,
|
|
1586
|
+
bam_suffix,
|
|
1587
|
+
split_dir,
|
|
1588
|
+
samtools_backend=samtools_backend,
|
|
1589
|
+
)
|
|
1027
1590
|
# Make a BAM index file for the BAMs in that directory
|
|
1028
1591
|
bam_pattern = "*" + bam_suffix
|
|
1029
1592
|
bam_files = glob.glob(split_dir / bam_pattern)
|
|
1030
1593
|
bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
|
|
1594
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1031
1595
|
for input_file in bam_files:
|
|
1032
|
-
|
|
1596
|
+
if backend_choice == "python":
|
|
1597
|
+
_index_bam_with_pysam(input_file)
|
|
1598
|
+
else:
|
|
1599
|
+
_index_bam_with_samtools(input_file)
|
|
1033
1600
|
|
|
1034
1601
|
return bam_files
|