smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,24 +1,145 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import glob
|
|
4
4
|
import os
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
5
7
|
import subprocess
|
|
6
|
-
import glob
|
|
7
8
|
import time
|
|
8
|
-
from
|
|
9
|
-
import
|
|
9
|
+
from collections import Counter, defaultdict, deque
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
11
|
from itertools import zip_longest
|
|
11
|
-
import
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
12
14
|
|
|
13
15
|
import numpy as np
|
|
14
|
-
import concurrent.futures
|
|
15
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
|
-
from concurrent.futures import ProcessPoolExecutor
|
|
17
|
-
|
|
18
16
|
from tqdm import tqdm
|
|
19
|
-
from collections import defaultdict, Counter
|
|
20
17
|
|
|
21
|
-
from
|
|
18
|
+
from smftools.logging_utils import get_logger
|
|
19
|
+
from smftools.optional_imports import require
|
|
20
|
+
|
|
21
|
+
from ..readwrite import date_string, time_string
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
import pysam as pysam_types
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
import pysam
|
|
28
|
+
except Exception:
|
|
29
|
+
pysam = None # type: ignore
|
|
30
|
+
|
|
31
|
+
logger = get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
|
|
34
|
+
_EMPTY_RE = re.compile(r"^\s*$")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _require_pysam() -> "pysam_types":
|
|
38
|
+
"""Return the pysam module or raise if unavailable."""
|
|
39
|
+
if pysam is not None:
|
|
40
|
+
return pysam
|
|
41
|
+
return require("pysam", extra="pysam", purpose="samtools-compatible Python backend")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _resolve_samtools_backend(backend: str | None) -> str:
|
|
45
|
+
"""Resolve backend choice for samtools-compatible operations.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
backend: One of {"auto", "python", "cli"} (case-insensitive).
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Resolved backend string ("python" or "cli").
|
|
52
|
+
"""
|
|
53
|
+
choice = (backend or "auto").strip().lower()
|
|
54
|
+
if choice not in {"auto", "python", "cli"}:
|
|
55
|
+
raise ValueError("samtools_backend must be one of: auto, python, cli")
|
|
56
|
+
|
|
57
|
+
have_pysam = pysam is not None
|
|
58
|
+
have_samtools = shutil.which("samtools") is not None
|
|
59
|
+
|
|
60
|
+
if choice == "python":
|
|
61
|
+
if not have_pysam:
|
|
62
|
+
raise RuntimeError("samtools_backend=python requires pysam to be installed.")
|
|
63
|
+
return "python"
|
|
64
|
+
if choice == "cli":
|
|
65
|
+
if not have_samtools:
|
|
66
|
+
raise RuntimeError("samtools_backend=cli requires samtools in PATH.")
|
|
67
|
+
return "cli"
|
|
68
|
+
|
|
69
|
+
if have_samtools:
|
|
70
|
+
return "cli"
|
|
71
|
+
if have_pysam:
|
|
72
|
+
return "python"
|
|
73
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _has_bam_index(bam_path: Path) -> bool:
|
|
77
|
+
"""Return True if the BAM index exists alongside the BAM."""
|
|
78
|
+
return (
|
|
79
|
+
bam_path.with_suffix(bam_path.suffix + ".bai").exists()
|
|
80
|
+
or Path(str(bam_path) + ".bai").exists()
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _ensure_bam_index(bam_path: Path, backend: str) -> None:
|
|
85
|
+
"""Ensure a BAM index exists, creating one if needed."""
|
|
86
|
+
if _has_bam_index(bam_path):
|
|
87
|
+
return
|
|
88
|
+
if backend == "python":
|
|
89
|
+
_index_bam_with_pysam(bam_path)
|
|
90
|
+
else:
|
|
91
|
+
_index_bam_with_samtools(bam_path)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _parse_idxstats_output(output: str) -> Tuple[int, int, Dict[str, Tuple[int, float]]]:
|
|
95
|
+
"""Parse samtools idxstats output into counts and proportions."""
|
|
96
|
+
aligned_reads_count = 0
|
|
97
|
+
unaligned_reads_count = 0
|
|
98
|
+
record_counts: Dict[str, int] = {}
|
|
99
|
+
for line in output.splitlines():
|
|
100
|
+
if not line.strip():
|
|
101
|
+
continue
|
|
102
|
+
ref, _length, mapped, unmapped = line.split("\t")[:4]
|
|
103
|
+
if ref == "*":
|
|
104
|
+
unaligned_reads_count += int(unmapped)
|
|
105
|
+
continue
|
|
106
|
+
mapped_count = int(mapped)
|
|
107
|
+
aligned_reads_count += mapped_count
|
|
108
|
+
record_counts[ref] = mapped_count
|
|
109
|
+
|
|
110
|
+
proportions: Dict[str, Tuple[int, float]] = {}
|
|
111
|
+
for ref, count in record_counts.items():
|
|
112
|
+
proportion = count / aligned_reads_count if aligned_reads_count else 0.0
|
|
113
|
+
proportions[ref] = (count, proportion)
|
|
114
|
+
|
|
115
|
+
return aligned_reads_count, unaligned_reads_count, proportions
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _stream_dorado_logs(stderr_iter) -> None:
|
|
119
|
+
"""Stream dorado stderr and emit structured log messages.
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
stderr_iter: Iterable of stderr lines.
|
|
123
|
+
"""
|
|
124
|
+
last_n: int | None = None
|
|
125
|
+
|
|
126
|
+
for raw in stderr_iter:
|
|
127
|
+
line = raw.rstrip("\n")
|
|
128
|
+
if _EMPTY_RE.match(line):
|
|
129
|
+
continue
|
|
130
|
+
|
|
131
|
+
m = _PROGRESS_RE.search(line)
|
|
132
|
+
if m:
|
|
133
|
+
n = int(m.group(1))
|
|
134
|
+
logger.debug("[dorado] Output records written: %d", n)
|
|
135
|
+
last_n = n
|
|
136
|
+
continue
|
|
137
|
+
|
|
138
|
+
logger.info("[dorado] %s", line)
|
|
139
|
+
|
|
140
|
+
if last_n is not None:
|
|
141
|
+
logger.info("[dorado] Final output records written: %d", last_n)
|
|
142
|
+
|
|
22
143
|
|
|
23
144
|
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
|
|
24
145
|
"""
|
|
@@ -26,7 +147,14 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
26
147
|
"""
|
|
27
148
|
bam_path = str(bam_path)
|
|
28
149
|
fastq_path = str(fastq_path)
|
|
29
|
-
|
|
150
|
+
|
|
151
|
+
logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
|
|
152
|
+
|
|
153
|
+
pysam_mod = _require_pysam()
|
|
154
|
+
with (
|
|
155
|
+
pysam_mod.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
|
|
156
|
+
open(fastq_path, "w", encoding="utf-8") as fq,
|
|
157
|
+
):
|
|
30
158
|
for r in bam.fetch(until_eof=True):
|
|
31
159
|
# Optionally skip secondary/supplementary:
|
|
32
160
|
# if r.is_secondary or r.is_supplementary:
|
|
@@ -45,36 +173,98 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
45
173
|
# q is an array/list of ints (Phred scores).
|
|
46
174
|
# Convert to FASTQ string with Phred+33 encoding,
|
|
47
175
|
# clamping to sane range [0, 93] to stay in printable ASCII.
|
|
48
|
-
qual_str = "".join(
|
|
49
|
-
chr(min(max(int(qv), 0), 93) + 33)
|
|
50
|
-
for qv in q
|
|
51
|
-
)
|
|
176
|
+
qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
|
|
52
177
|
|
|
53
178
|
fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
|
|
54
179
|
|
|
55
|
-
|
|
180
|
+
|
|
181
|
+
def _sort_bam_with_pysam(
|
|
182
|
+
in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
|
|
183
|
+
) -> None:
|
|
184
|
+
"""Sort a BAM file using pysam.
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
in_bam: Input BAM path.
|
|
188
|
+
out_bam: Output BAM path.
|
|
189
|
+
threads: Optional thread count.
|
|
190
|
+
"""
|
|
191
|
+
logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
|
|
56
192
|
in_bam, out_bam = str(in_bam), str(out_bam)
|
|
57
193
|
args = []
|
|
58
194
|
if threads:
|
|
59
195
|
args += ["-@", str(threads)]
|
|
60
196
|
args += ["-o", out_bam, in_bam]
|
|
61
|
-
|
|
197
|
+
pysam_mod = _require_pysam()
|
|
198
|
+
pysam_mod.sort(*args)
|
|
199
|
+
|
|
62
200
|
|
|
63
201
|
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
202
|
+
"""Index a BAM file using pysam.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
bam_path: BAM path to index.
|
|
206
|
+
threads: Optional thread count.
|
|
207
|
+
"""
|
|
64
208
|
bam_path = str(bam_path)
|
|
209
|
+
logger.debug(f"Indexing BAM using _index_bam_with_pysam")
|
|
210
|
+
pysam_mod = _require_pysam()
|
|
65
211
|
# pysam.index supports samtools-style args
|
|
66
212
|
if threads:
|
|
67
|
-
|
|
213
|
+
pysam_mod.index("-@", str(threads), bam_path)
|
|
68
214
|
else:
|
|
69
|
-
|
|
215
|
+
pysam_mod.index(bam_path)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _bam_to_fastq_with_samtools(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
|
|
219
|
+
"""Convert BAM to FASTQ using samtools."""
|
|
220
|
+
if not shutil.which("samtools"):
|
|
221
|
+
raise RuntimeError("samtools is required but not available in PATH.")
|
|
222
|
+
cmd = ["samtools", "fastq", str(bam_path)]
|
|
223
|
+
logger.debug("Converting BAM to FASTQ using samtools: %s", " ".join(cmd))
|
|
224
|
+
with open(fastq_path, "w", encoding="utf-8") as fq:
|
|
225
|
+
cp = subprocess.run(cmd, stdout=fq, stderr=subprocess.PIPE, text=True)
|
|
226
|
+
if cp.returncode != 0:
|
|
227
|
+
raise RuntimeError(f"samtools fastq failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
70
228
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
229
|
+
|
|
230
|
+
def _sort_bam_with_samtools(
|
|
231
|
+
in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
|
|
232
|
+
) -> None:
|
|
233
|
+
"""Sort a BAM file using samtools."""
|
|
234
|
+
if not shutil.which("samtools"):
|
|
235
|
+
raise RuntimeError("samtools is required but not available in PATH.")
|
|
236
|
+
cmd = ["samtools", "sort", "-o", str(out_bam)]
|
|
237
|
+
if threads:
|
|
238
|
+
cmd += ["-@", str(threads)]
|
|
239
|
+
cmd.append(str(in_bam))
|
|
240
|
+
logger.debug("Sorting BAM using samtools: %s", " ".join(cmd))
|
|
241
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
242
|
+
if cp.returncode != 0:
|
|
243
|
+
raise RuntimeError(f"samtools sort failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _index_bam_with_samtools(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
247
|
+
"""Index a BAM file using samtools."""
|
|
248
|
+
if not shutil.which("samtools"):
|
|
249
|
+
raise RuntimeError("samtools is required but not available in PATH.")
|
|
250
|
+
cmd = ["samtools", "index"]
|
|
251
|
+
if threads:
|
|
252
|
+
cmd += ["-@", str(threads)]
|
|
253
|
+
cmd.append(str(bam_path))
|
|
254
|
+
logger.debug("Indexing BAM using samtools: %s", " ".join(cmd))
|
|
255
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
256
|
+
if cp.returncode != 0:
|
|
257
|
+
raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def align_and_sort_BAM(
|
|
261
|
+
fasta,
|
|
262
|
+
input,
|
|
263
|
+
cfg,
|
|
74
264
|
):
|
|
75
265
|
"""
|
|
76
266
|
A wrapper for running dorado aligner and samtools functions
|
|
77
|
-
|
|
267
|
+
|
|
78
268
|
Parameters:
|
|
79
269
|
fasta (str): File path to the reference genome to align to.
|
|
80
270
|
input (str): File path to the basecalled file to align. Works for .bam and .fastq files
|
|
@@ -84,60 +274,105 @@ def align_and_sort_BAM(fasta,
|
|
|
84
274
|
None
|
|
85
275
|
The function writes out files for: 1) An aligned BAM, 2) and aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
|
|
86
276
|
"""
|
|
277
|
+
logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
|
|
87
278
|
input_basename = input.name
|
|
88
279
|
input_suffix = input.suffix
|
|
89
|
-
input_as_fastq = input.with_name(input.stem +
|
|
280
|
+
input_as_fastq = input.with_name(input.stem + ".fastq")
|
|
90
281
|
|
|
91
282
|
output_path_minus_suffix = cfg.output_directory / input.stem
|
|
92
|
-
|
|
283
|
+
|
|
93
284
|
aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
|
|
94
285
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
95
|
-
aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
286
|
+
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
96
287
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
97
288
|
|
|
98
289
|
if cfg.threads:
|
|
99
290
|
threads = str(cfg.threads)
|
|
100
291
|
else:
|
|
101
292
|
threads = None
|
|
102
|
-
|
|
103
|
-
|
|
293
|
+
|
|
294
|
+
samtools_backend = _resolve_samtools_backend(getattr(cfg, "samtools_backend", "auto"))
|
|
295
|
+
|
|
296
|
+
if cfg.aligner == "minimap2":
|
|
104
297
|
if not cfg.align_from_bam:
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
298
|
+
logger.debug(f"Converting BAM to FASTQ: {input}")
|
|
299
|
+
if samtools_backend == "python":
|
|
300
|
+
_bam_to_fastq_with_pysam(input, input_as_fastq)
|
|
301
|
+
else:
|
|
302
|
+
_bam_to_fastq_with_samtools(input, input_as_fastq)
|
|
303
|
+
logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
|
|
108
304
|
mm_input = input_as_fastq
|
|
109
|
-
else:
|
|
110
|
-
|
|
305
|
+
else:
|
|
306
|
+
logger.debug(f"Aligning BAM to Reference: {input}")
|
|
111
307
|
mm_input = input
|
|
112
308
|
|
|
113
309
|
if threads:
|
|
114
|
-
minimap_command =
|
|
310
|
+
minimap_command = (
|
|
311
|
+
["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
|
|
312
|
+
)
|
|
115
313
|
else:
|
|
116
|
-
minimap_command = [
|
|
117
|
-
|
|
314
|
+
minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
|
|
315
|
+
|
|
316
|
+
with open(aligned_output, "wb") as out:
|
|
317
|
+
proc = subprocess.Popen(
|
|
318
|
+
minimap_command,
|
|
319
|
+
stdout=out,
|
|
320
|
+
stderr=subprocess.PIPE,
|
|
321
|
+
text=True,
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
assert proc.stderr is not None
|
|
325
|
+
for line in proc.stderr:
|
|
326
|
+
logger.info("[minimap2] %s", line.rstrip())
|
|
327
|
+
|
|
328
|
+
ret = proc.wait()
|
|
329
|
+
if ret != 0:
|
|
330
|
+
raise RuntimeError(f"minimap2 failed with exit code {ret}")
|
|
118
331
|
|
|
119
332
|
if not cfg.align_from_bam:
|
|
120
333
|
os.remove(input_as_fastq)
|
|
121
334
|
|
|
122
|
-
elif cfg.aligner ==
|
|
335
|
+
elif cfg.aligner == "dorado":
|
|
123
336
|
# Run dorado aligner
|
|
124
337
|
print(f"Aligning BAM to Reference: {input}")
|
|
125
338
|
if threads:
|
|
126
|
-
alignment_command =
|
|
339
|
+
alignment_command = (
|
|
340
|
+
["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
|
|
341
|
+
)
|
|
127
342
|
else:
|
|
128
343
|
alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
|
|
129
|
-
subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
|
|
130
344
|
|
|
345
|
+
with open(aligned_output, "wb") as out:
|
|
346
|
+
proc = subprocess.Popen(
|
|
347
|
+
alignment_command,
|
|
348
|
+
stdout=out,
|
|
349
|
+
stderr=subprocess.PIPE,
|
|
350
|
+
text=True,
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
assert proc.stderr is not None
|
|
354
|
+
_stream_dorado_logs(proc.stderr)
|
|
355
|
+
ret = proc.wait()
|
|
356
|
+
|
|
357
|
+
if ret != 0:
|
|
358
|
+
raise RuntimeError(f"dorado failed with exit code {ret}")
|
|
131
359
|
else:
|
|
132
|
-
|
|
360
|
+
logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
|
|
133
361
|
return
|
|
134
|
-
|
|
135
|
-
# --- Sort & Index with pysam ---
|
|
136
|
-
print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
|
|
137
|
-
_sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
|
|
138
362
|
|
|
139
|
-
|
|
140
|
-
|
|
363
|
+
# --- Sort & Index ---
|
|
364
|
+
logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
|
|
365
|
+
if samtools_backend == "python":
|
|
366
|
+
_sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
|
|
367
|
+
else:
|
|
368
|
+
_sort_bam_with_samtools(aligned_output, aligned_sorted_output, threads=threads)
|
|
369
|
+
|
|
370
|
+
logger.debug(f"Indexing: {aligned_sorted_output}")
|
|
371
|
+
if samtools_backend == "python":
|
|
372
|
+
_index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
373
|
+
else:
|
|
374
|
+
_index_bam_with_samtools(aligned_sorted_output, threads=threads)
|
|
375
|
+
|
|
141
376
|
|
|
142
377
|
def bam_qc(
|
|
143
378
|
bam_files: Iterable[str | Path],
|
|
@@ -147,6 +382,7 @@ def bam_qc(
|
|
|
147
382
|
stats: bool = True,
|
|
148
383
|
flagstats: bool = True,
|
|
149
384
|
idxstats: bool = True,
|
|
385
|
+
samtools_backend: str | None = "auto",
|
|
150
386
|
) -> None:
|
|
151
387
|
"""
|
|
152
388
|
QC for BAM/CRAMs: stats, flagstat, idxstats.
|
|
@@ -154,132 +390,148 @@ def bam_qc(
|
|
|
154
390
|
Runs BAMs in parallel (up to `threads`, default serial).
|
|
155
391
|
"""
|
|
156
392
|
import subprocess
|
|
157
|
-
import shutil
|
|
158
393
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
HAVE_PYSAM = False
|
|
394
|
+
logger.debug("Performing BAM QC using bam_qc")
|
|
395
|
+
|
|
396
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
397
|
+
have_pysam = backend_choice == "python"
|
|
398
|
+
pysam_mod = _require_pysam() if have_pysam else None
|
|
165
399
|
|
|
166
400
|
bam_qc_dir = Path(bam_qc_dir)
|
|
167
401
|
bam_qc_dir.mkdir(parents=True, exist_ok=True)
|
|
168
402
|
|
|
169
|
-
|
|
403
|
+
bam_paths = [Path(b) for b in bam_files]
|
|
170
404
|
|
|
171
405
|
def _has_index(p: Path) -> bool:
|
|
172
|
-
if
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
return bai.exists() or
|
|
176
|
-
if
|
|
177
|
-
|
|
178
|
-
return crai.exists()
|
|
406
|
+
"""Return True if a BAM/CRAM index exists for the path."""
|
|
407
|
+
suf = p.suffix.lower()
|
|
408
|
+
if suf == ".bam":
|
|
409
|
+
return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
|
|
410
|
+
if suf == ".cram":
|
|
411
|
+
return Path(str(p) + ".crai").exists()
|
|
179
412
|
return False
|
|
180
413
|
|
|
181
414
|
def _ensure_index(p: Path) -> None:
|
|
415
|
+
"""Ensure a BAM/CRAM index exists, creating one if needed."""
|
|
182
416
|
if _has_index(p):
|
|
183
417
|
return
|
|
184
|
-
if
|
|
185
|
-
|
|
186
|
-
|
|
418
|
+
if have_pysam:
|
|
419
|
+
assert pysam_mod is not None
|
|
420
|
+
pysam_mod.index(str(p)) # supports BAM & CRAM
|
|
187
421
|
else:
|
|
188
422
|
cmd = ["samtools", "index", str(p)]
|
|
189
|
-
|
|
423
|
+
# capture text so errors are readable; raise on failure
|
|
424
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
425
|
+
if cp.returncode != 0:
|
|
426
|
+
raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
190
427
|
|
|
191
|
-
def
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
428
|
+
def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
|
|
429
|
+
"""
|
|
430
|
+
Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
|
|
431
|
+
"""
|
|
432
|
+
last_err = deque(maxlen=80)
|
|
433
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
434
|
+
|
|
435
|
+
with open(out_path, "w") as fh:
|
|
436
|
+
proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
|
|
437
|
+
assert proc.stderr is not None
|
|
438
|
+
for line in proc.stderr:
|
|
439
|
+
line = line.rstrip()
|
|
440
|
+
if line:
|
|
441
|
+
last_err.append(line)
|
|
442
|
+
logger.debug("[%s][%s] %s", tag, bam.name, line)
|
|
443
|
+
rc = proc.wait()
|
|
444
|
+
|
|
445
|
+
if rc != 0:
|
|
446
|
+
tail = "\n".join(last_err)
|
|
447
|
+
raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
|
|
448
|
+
return rc
|
|
449
|
+
|
|
450
|
+
def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
|
|
451
|
+
"""Run stats/flagstat/idxstats for a single BAM.
|
|
452
|
+
|
|
453
|
+
Args:
|
|
454
|
+
bam: Path to the BAM file.
|
|
455
|
+
|
|
456
|
+
Returns:
|
|
457
|
+
Tuple of (bam_path, list of (stage, return_code)).
|
|
458
|
+
"""
|
|
459
|
+
import subprocess
|
|
460
|
+
|
|
461
|
+
results: list[tuple[str, int]] = []
|
|
462
|
+
base = bam.stem # e.g. sample.bam -> sample
|
|
195
463
|
out_stats = bam_qc_dir / f"{base}_stats.txt"
|
|
196
464
|
out_flag = bam_qc_dir / f"{base}_flagstat.txt"
|
|
197
|
-
out_idx
|
|
465
|
+
out_idx = bam_qc_dir / f"{base}_idxstats.txt"
|
|
198
466
|
|
|
199
|
-
# Make sure index exists (
|
|
467
|
+
# Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
|
|
200
468
|
try:
|
|
201
469
|
_ensure_index(bam)
|
|
202
470
|
except Exception as e:
|
|
203
|
-
# Still attempt stats/flagstat if requested
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
#
|
|
207
|
-
|
|
208
|
-
if
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
471
|
+
# Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
|
|
472
|
+
logger.warning("Indexing failed for %s: %s", bam, e)
|
|
473
|
+
|
|
474
|
+
# --- stats ---
|
|
475
|
+
if stats:
|
|
476
|
+
if have_pysam:
|
|
477
|
+
assert pysam_mod is not None
|
|
478
|
+
if not hasattr(pysam_mod, "stats"):
|
|
479
|
+
raise RuntimeError("pysam.stats is unavailable in this pysam build.")
|
|
480
|
+
txt = pysam_mod.stats(str(bam))
|
|
212
481
|
out_stats.write_text(txt)
|
|
213
482
|
results.append(("stats(pysam)", 0))
|
|
214
483
|
else:
|
|
215
484
|
cmd = ["samtools", "stats", str(bam)]
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
txt = pysam.flagstat(str(bam))
|
|
485
|
+
rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
|
|
486
|
+
results.append(("stats(samtools)", rc))
|
|
487
|
+
|
|
488
|
+
# --- flagstat ---
|
|
489
|
+
if flagstats:
|
|
490
|
+
if have_pysam:
|
|
491
|
+
assert pysam_mod is not None
|
|
492
|
+
if not hasattr(pysam_mod, "flagstat"):
|
|
493
|
+
raise RuntimeError("pysam.flagstat is unavailable in this pysam build.")
|
|
494
|
+
txt = pysam_mod.flagstat(str(bam))
|
|
227
495
|
out_flag.write_text(txt)
|
|
228
496
|
results.append(("flagstat(pysam)", 0))
|
|
229
497
|
else:
|
|
230
498
|
cmd = ["samtools", "flagstat", str(bam)]
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
txt = pysam.idxstats(str(bam))
|
|
499
|
+
rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
|
|
500
|
+
results.append(("flagstat(samtools)", rc))
|
|
501
|
+
|
|
502
|
+
# --- idxstats ---
|
|
503
|
+
if idxstats:
|
|
504
|
+
if have_pysam:
|
|
505
|
+
assert pysam_mod is not None
|
|
506
|
+
if not hasattr(pysam_mod, "idxstats"):
|
|
507
|
+
raise RuntimeError("pysam.idxstats is unavailable in this pysam build.")
|
|
508
|
+
txt = pysam_mod.idxstats(str(bam))
|
|
242
509
|
out_idx.write_text(txt)
|
|
243
510
|
results.append(("idxstats(pysam)", 0))
|
|
244
511
|
else:
|
|
245
512
|
cmd = ["samtools", "idxstats", str(bam)]
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
if cp.returncode != 0:
|
|
250
|
-
raise RuntimeError(cp.stderr.decode(errors="replace"))
|
|
251
|
-
|
|
252
|
-
# Sanity: ensure samtools exists if pysam missing
|
|
253
|
-
if not HAVE_PYSAM:
|
|
254
|
-
if not shutil.which("samtools"):
|
|
255
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
256
|
-
|
|
257
|
-
# Execute tasks (serial per file; parallelized across files)
|
|
258
|
-
run_stats()
|
|
259
|
-
run_flagstat()
|
|
260
|
-
run_idxstats()
|
|
513
|
+
rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
|
|
514
|
+
results.append(("idxstats(samtools)", rc))
|
|
515
|
+
|
|
261
516
|
return bam, results
|
|
262
517
|
|
|
263
|
-
# Parallel across BAMs
|
|
264
518
|
max_workers = int(threads) if threads and int(threads) > 0 else 1
|
|
265
|
-
futures = []
|
|
266
|
-
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
|
267
|
-
for b in bam_files:
|
|
268
|
-
futures.append(ex.submit(_run_one, b))
|
|
269
519
|
|
|
270
|
-
|
|
520
|
+
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
|
521
|
+
futs = [ex.submit(_run_one, b) for b in bam_paths]
|
|
522
|
+
for fut in as_completed(futs):
|
|
271
523
|
try:
|
|
272
524
|
bam, res = fut.result()
|
|
273
525
|
summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
|
|
274
|
-
|
|
526
|
+
logger.info("[qc] %s: %s", bam.name, summary)
|
|
275
527
|
except Exception as e:
|
|
276
|
-
|
|
528
|
+
logger.exception("QC failed: %s", e)
|
|
529
|
+
|
|
530
|
+
if modality not in {"conversion", "direct", "deaminase"}:
|
|
531
|
+
logger.warning("Unknown modality '%s', continuing.", modality)
|
|
277
532
|
|
|
278
|
-
|
|
279
|
-
if modality not in {"conversion", "direct"}:
|
|
280
|
-
print(f"[warn] Unknown modality '{modality}', continuing.")
|
|
533
|
+
logger.info("QC processing completed.")
|
|
281
534
|
|
|
282
|
-
print("QC processing completed.")
|
|
283
535
|
|
|
284
536
|
def concatenate_fastqs_to_bam(
|
|
285
537
|
fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
|
|
@@ -290,6 +542,8 @@ def concatenate_fastqs_to_bam(
|
|
|
290
542
|
rg_sample_field: Optional[str] = None,
|
|
291
543
|
progress: bool = True,
|
|
292
544
|
auto_pair: bool = True,
|
|
545
|
+
gzip_suffixes: Tuple[str, ...] = (".gz", ".gzip"),
|
|
546
|
+
samtools_backend: str | None = "auto",
|
|
293
547
|
) -> Dict[str, Any]:
|
|
294
548
|
"""
|
|
295
549
|
Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
|
|
@@ -312,6 +566,10 @@ def concatenate_fastqs_to_bam(
|
|
|
312
566
|
Show tqdm progress bars.
|
|
313
567
|
auto_pair : bool
|
|
314
568
|
Auto-pair R1/R2 based on filename patterns if given a flat list.
|
|
569
|
+
gzip_suffixes : tuple[str, ...]
|
|
570
|
+
Suffixes treated as gzip-compressed FASTQ files.
|
|
571
|
+
samtools_backend : str | None
|
|
572
|
+
Backend selection for samtools-compatible operations (auto|python|cli).
|
|
315
573
|
|
|
316
574
|
Returns
|
|
317
575
|
-------
|
|
@@ -326,12 +584,30 @@ def concatenate_fastqs_to_bam(
|
|
|
326
584
|
"""
|
|
327
585
|
name = p.name
|
|
328
586
|
lowers = name.lower()
|
|
329
|
-
|
|
587
|
+
gzip_exts = tuple(s.lower() for s in gzip_suffixes)
|
|
588
|
+
for ext in (
|
|
589
|
+
*(f".fastq{suf}" for suf in gzip_exts),
|
|
590
|
+
*(f".fq{suf}" for suf in gzip_exts),
|
|
591
|
+
".fastq.bz2",
|
|
592
|
+
".fq.bz2",
|
|
593
|
+
".fastq.xz",
|
|
594
|
+
".fq.xz",
|
|
595
|
+
".fastq",
|
|
596
|
+
".fq",
|
|
597
|
+
):
|
|
330
598
|
if lowers.endswith(ext):
|
|
331
599
|
return name[: -len(ext)]
|
|
332
600
|
return p.stem # fallback: remove last suffix only
|
|
333
601
|
|
|
334
602
|
def _extract_barcode_from_filename(p: Path) -> str:
|
|
603
|
+
"""Extract a barcode token from a FASTQ filename.
|
|
604
|
+
|
|
605
|
+
Args:
|
|
606
|
+
p: FASTQ path.
|
|
607
|
+
|
|
608
|
+
Returns:
|
|
609
|
+
Barcode token string.
|
|
610
|
+
"""
|
|
335
611
|
stem = _strip_fastq_ext(p)
|
|
336
612
|
if "_" in stem:
|
|
337
613
|
token = stem.split("_")[-1]
|
|
@@ -340,10 +616,18 @@ def concatenate_fastqs_to_bam(
|
|
|
340
616
|
return stem
|
|
341
617
|
|
|
342
618
|
def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
|
|
619
|
+
"""Classify a FASTQ filename stem into (prefix, read_number).
|
|
620
|
+
|
|
621
|
+
Args:
|
|
622
|
+
stem: Filename stem.
|
|
623
|
+
|
|
624
|
+
Returns:
|
|
625
|
+
Tuple of (prefix, read_number) or (None, None) if not matched.
|
|
626
|
+
"""
|
|
343
627
|
# return (prefix, readnum) if matches; else (None, None)
|
|
344
628
|
patterns = [
|
|
345
|
-
r"(?i)(.*?)[._-]r?([12])$",
|
|
346
|
-
r"(?i)(.*?)[._-]read[_-]?([12])$",
|
|
629
|
+
r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
|
|
630
|
+
r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
|
|
347
631
|
]
|
|
348
632
|
for pat in patterns:
|
|
349
633
|
m = re.match(pat, stem)
|
|
@@ -352,6 +636,14 @@ def concatenate_fastqs_to_bam(
|
|
|
352
636
|
return None, None
|
|
353
637
|
|
|
354
638
|
def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
|
|
639
|
+
"""Pair FASTQ files based on filename conventions.
|
|
640
|
+
|
|
641
|
+
Args:
|
|
642
|
+
paths: FASTQ paths to pair.
|
|
643
|
+
|
|
644
|
+
Returns:
|
|
645
|
+
Tuple of (paired list, leftover list).
|
|
646
|
+
"""
|
|
355
647
|
pref_map: Dict[str, Dict[int, Path]] = {}
|
|
356
648
|
unpaired: List[Path] = []
|
|
357
649
|
for pth in paths:
|
|
@@ -373,11 +665,59 @@ def concatenate_fastqs_to_bam(
|
|
|
373
665
|
return pairs, leftovers
|
|
374
666
|
|
|
375
667
|
def _fastq_iter(p: Path):
|
|
668
|
+
"""Yield FASTQ records using pysam.FastxFile.
|
|
669
|
+
|
|
670
|
+
Args:
|
|
671
|
+
p: FASTQ path.
|
|
672
|
+
|
|
673
|
+
Yields:
|
|
674
|
+
Pysam Fastx records.
|
|
675
|
+
"""
|
|
376
676
|
# pysam.FastxFile handles compressed extensions transparently
|
|
377
|
-
|
|
677
|
+
pysam_mod = _require_pysam()
|
|
678
|
+
with pysam_mod.FastxFile(str(p)) as fx:
|
|
378
679
|
for rec in fx:
|
|
379
680
|
yield rec # rec.name, rec.sequence, rec.quality
|
|
380
681
|
|
|
682
|
+
def _fastq_iter_plain(p: Path) -> Iterable[Tuple[str, str, str]]:
|
|
683
|
+
"""Yield FASTQ records from plain-text parsing.
|
|
684
|
+
|
|
685
|
+
Args:
|
|
686
|
+
p: FASTQ path.
|
|
687
|
+
|
|
688
|
+
Yields:
|
|
689
|
+
Tuple of (name, sequence, quality).
|
|
690
|
+
"""
|
|
691
|
+
import bz2
|
|
692
|
+
import gzip
|
|
693
|
+
import lzma
|
|
694
|
+
|
|
695
|
+
lowers = p.name.lower()
|
|
696
|
+
if any(lowers.endswith(suf) for suf in (s.lower() for s in gzip_suffixes)):
|
|
697
|
+
handle = gzip.open(p, "rt", encoding="utf-8")
|
|
698
|
+
elif lowers.endswith(".bz2"):
|
|
699
|
+
handle = bz2.open(p, "rt", encoding="utf-8")
|
|
700
|
+
elif lowers.endswith(".xz"):
|
|
701
|
+
handle = lzma.open(p, "rt", encoding="utf-8")
|
|
702
|
+
else:
|
|
703
|
+
handle = p.open("r", encoding="utf-8")
|
|
704
|
+
|
|
705
|
+
with handle as fh:
|
|
706
|
+
while True:
|
|
707
|
+
header = fh.readline()
|
|
708
|
+
if not header:
|
|
709
|
+
break
|
|
710
|
+
seq = fh.readline()
|
|
711
|
+
fh.readline()
|
|
712
|
+
qual = fh.readline()
|
|
713
|
+
if not qual:
|
|
714
|
+
break
|
|
715
|
+
name = header.strip()
|
|
716
|
+
if name.startswith("@"):
|
|
717
|
+
name = name[1:]
|
|
718
|
+
name = name.split()[0]
|
|
719
|
+
yield name, seq.strip(), qual.strip()
|
|
720
|
+
|
|
381
721
|
def _make_unaligned_segment(
|
|
382
722
|
name: str,
|
|
383
723
|
seq: str,
|
|
@@ -386,11 +726,25 @@ def concatenate_fastqs_to_bam(
|
|
|
386
726
|
read1: bool,
|
|
387
727
|
read2: bool,
|
|
388
728
|
) -> pysam.AlignedSegment:
|
|
389
|
-
|
|
729
|
+
"""Construct an unaligned pysam.AlignedSegment.
|
|
730
|
+
|
|
731
|
+
Args:
|
|
732
|
+
name: Read name.
|
|
733
|
+
seq: Read sequence.
|
|
734
|
+
qual: FASTQ quality string.
|
|
735
|
+
bc: Barcode string.
|
|
736
|
+
read1: Whether this is read 1.
|
|
737
|
+
read2: Whether this is read 2.
|
|
738
|
+
|
|
739
|
+
Returns:
|
|
740
|
+
Unaligned pysam.AlignedSegment.
|
|
741
|
+
"""
|
|
742
|
+
pysam_mod = _require_pysam()
|
|
743
|
+
a = pysam_mod.AlignedSegment()
|
|
390
744
|
a.query_name = name
|
|
391
745
|
a.query_sequence = seq
|
|
392
746
|
if qual is not None:
|
|
393
|
-
a.query_qualities =
|
|
747
|
+
a.query_qualities = pysam_mod.qualitystring_to_array(qual)
|
|
394
748
|
a.is_unmapped = True
|
|
395
749
|
a.is_paired = read1 or read2
|
|
396
750
|
a.is_read1 = read1
|
|
@@ -406,8 +760,51 @@ def concatenate_fastqs_to_bam(
|
|
|
406
760
|
a.set_tag("RG", str(bc), value_type="Z")
|
|
407
761
|
return a
|
|
408
762
|
|
|
763
|
+
def _write_sam_line(
|
|
764
|
+
handle,
|
|
765
|
+
name: str,
|
|
766
|
+
seq: str,
|
|
767
|
+
qual: str,
|
|
768
|
+
bc: str,
|
|
769
|
+
*,
|
|
770
|
+
read1: bool,
|
|
771
|
+
read2: bool,
|
|
772
|
+
add_read_group: bool,
|
|
773
|
+
) -> None:
|
|
774
|
+
"""Write a single unaligned SAM record to a text stream."""
|
|
775
|
+
if read1:
|
|
776
|
+
flag = 77
|
|
777
|
+
elif read2:
|
|
778
|
+
flag = 141
|
|
779
|
+
else:
|
|
780
|
+
flag = 4
|
|
781
|
+
tags = [f"{barcode_tag}:Z:{bc}"]
|
|
782
|
+
if add_read_group:
|
|
783
|
+
tags.append(f"RG:Z:{bc}")
|
|
784
|
+
tag_str = "\t".join(tags)
|
|
785
|
+
if not qual:
|
|
786
|
+
qual = "*"
|
|
787
|
+
line = "\t".join(
|
|
788
|
+
[
|
|
789
|
+
name,
|
|
790
|
+
str(flag),
|
|
791
|
+
"*",
|
|
792
|
+
"0",
|
|
793
|
+
"0",
|
|
794
|
+
"*",
|
|
795
|
+
"*",
|
|
796
|
+
"0",
|
|
797
|
+
"0",
|
|
798
|
+
seq,
|
|
799
|
+
qual,
|
|
800
|
+
tag_str,
|
|
801
|
+
]
|
|
802
|
+
)
|
|
803
|
+
handle.write(f"{line}\n")
|
|
804
|
+
|
|
409
805
|
# ---------- normalize inputs to Path ----------
|
|
410
806
|
def _to_path_pair(x) -> Tuple[Path, Path]:
|
|
807
|
+
"""Convert a tuple of path-like objects to Path instances."""
|
|
411
808
|
a, b = x
|
|
412
809
|
return Path(a), Path(b)
|
|
413
810
|
|
|
@@ -450,7 +847,10 @@ def concatenate_fastqs_to_bam(
|
|
|
450
847
|
# ---------- BAM header ----------
|
|
451
848
|
header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
|
|
452
849
|
if add_read_group:
|
|
453
|
-
header["RG"] = [
|
|
850
|
+
header["RG"] = [
|
|
851
|
+
{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
|
|
852
|
+
for bc in barcodes_in_order
|
|
853
|
+
]
|
|
454
854
|
header.setdefault("PG", []).append(
|
|
455
855
|
{"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
|
|
456
856
|
)
|
|
@@ -462,7 +862,29 @@ def concatenate_fastqs_to_bam(
|
|
|
462
862
|
singletons_written = 0
|
|
463
863
|
|
|
464
864
|
# ---------- write BAM ----------
|
|
465
|
-
|
|
865
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
866
|
+
if backend_choice == "python":
|
|
867
|
+
pysam_mod = _require_pysam()
|
|
868
|
+
bam_out_ctx = pysam_mod.AlignmentFile(str(output_bam), "wb", header=header)
|
|
869
|
+
else:
|
|
870
|
+
cmd = ["samtools", "view", "-b", "-o", str(output_bam), "-"]
|
|
871
|
+
logger.debug("Writing BAM using samtools: %s", " ".join(cmd))
|
|
872
|
+
bam_out_ctx = subprocess.Popen(
|
|
873
|
+
cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True
|
|
874
|
+
)
|
|
875
|
+
assert bam_out_ctx.stdin is not None
|
|
876
|
+
header_lines = ["@HD\tVN:1.6\tSO:unknown"]
|
|
877
|
+
if add_read_group:
|
|
878
|
+
for bc in barcodes_in_order:
|
|
879
|
+
rg_fields = [f"ID:{bc}"]
|
|
880
|
+
if rg_sample_field:
|
|
881
|
+
rg_fields.append(f"SM:{rg_sample_field}")
|
|
882
|
+
rg_body = "\t".join(rg_fields)
|
|
883
|
+
header_lines.append(f"@RG\t{rg_body}")
|
|
884
|
+
header_lines.append("@PG\tID:concat-fastq\tPN:concatenate_fastqs_to_bam\tVN:1")
|
|
885
|
+
bam_out_ctx.stdin.write("\n".join(header_lines) + "\n")
|
|
886
|
+
|
|
887
|
+
try:
|
|
466
888
|
# Paired
|
|
467
889
|
it_pairs = explicit_pairs
|
|
468
890
|
if progress and it_pairs:
|
|
@@ -472,30 +894,83 @@ def concatenate_fastqs_to_bam(
|
|
|
472
894
|
raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
|
|
473
895
|
bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
|
|
474
896
|
|
|
475
|
-
|
|
476
|
-
|
|
897
|
+
if backend_choice == "python":
|
|
898
|
+
it1 = _fastq_iter(r1_path)
|
|
899
|
+
it2 = _fastq_iter(r2_path)
|
|
900
|
+
else:
|
|
901
|
+
it1 = _fastq_iter_plain(r1_path)
|
|
902
|
+
it2 = _fastq_iter_plain(r2_path)
|
|
477
903
|
|
|
478
904
|
for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
|
|
905
|
+
|
|
479
906
|
def _clean(n: Optional[str]) -> Optional[str]:
|
|
907
|
+
"""Normalize FASTQ read names by trimming read suffixes."""
|
|
480
908
|
if n is None:
|
|
481
909
|
return None
|
|
482
910
|
return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
|
|
483
911
|
|
|
484
912
|
name = (
|
|
485
|
-
_clean(getattr(rec1, "name", None))
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
or getattr(rec2, "name", None)
|
|
913
|
+
_clean(getattr(rec1, "name", None) if backend_choice == "python" else rec1[0])
|
|
914
|
+
if rec1 is not None
|
|
915
|
+
else None
|
|
489
916
|
)
|
|
917
|
+
if name is None:
|
|
918
|
+
name = (
|
|
919
|
+
_clean(
|
|
920
|
+
getattr(rec2, "name", None) if backend_choice == "python" else rec2[0]
|
|
921
|
+
)
|
|
922
|
+
if rec2 is not None
|
|
923
|
+
else None
|
|
924
|
+
)
|
|
925
|
+
if name is None:
|
|
926
|
+
name = (
|
|
927
|
+
getattr(rec1, "name", None)
|
|
928
|
+
if backend_choice == "python" and rec1 is not None
|
|
929
|
+
else (rec1[0] if rec1 is not None else None)
|
|
930
|
+
)
|
|
931
|
+
if name is None:
|
|
932
|
+
name = (
|
|
933
|
+
getattr(rec2, "name", None)
|
|
934
|
+
if backend_choice == "python" and rec2 is not None
|
|
935
|
+
else (rec2[0] if rec2 is not None else None)
|
|
936
|
+
)
|
|
490
937
|
|
|
491
938
|
if rec1 is not None:
|
|
492
|
-
|
|
493
|
-
|
|
939
|
+
if backend_choice == "python":
|
|
940
|
+
a1 = _make_unaligned_segment(
|
|
941
|
+
name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
|
|
942
|
+
)
|
|
943
|
+
bam_out_ctx.write(a1)
|
|
944
|
+
else:
|
|
945
|
+
_write_sam_line(
|
|
946
|
+
bam_out_ctx.stdin,
|
|
947
|
+
name,
|
|
948
|
+
rec1[1],
|
|
949
|
+
rec1[2],
|
|
950
|
+
bc,
|
|
951
|
+
read1=True,
|
|
952
|
+
read2=False,
|
|
953
|
+
add_read_group=add_read_group,
|
|
954
|
+
)
|
|
494
955
|
per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
|
|
495
956
|
total_written += 1
|
|
496
957
|
if rec2 is not None:
|
|
497
|
-
|
|
498
|
-
|
|
958
|
+
if backend_choice == "python":
|
|
959
|
+
a2 = _make_unaligned_segment(
|
|
960
|
+
name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
|
|
961
|
+
)
|
|
962
|
+
bam_out_ctx.write(a2)
|
|
963
|
+
else:
|
|
964
|
+
_write_sam_line(
|
|
965
|
+
bam_out_ctx.stdin,
|
|
966
|
+
name,
|
|
967
|
+
rec2[1],
|
|
968
|
+
rec2[2],
|
|
969
|
+
bc,
|
|
970
|
+
read1=False,
|
|
971
|
+
read2=True,
|
|
972
|
+
add_read_group=add_read_group,
|
|
973
|
+
)
|
|
499
974
|
per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
|
|
500
975
|
total_written += 1
|
|
501
976
|
|
|
@@ -515,12 +990,40 @@ def concatenate_fastqs_to_bam(
|
|
|
515
990
|
if not pth.exists():
|
|
516
991
|
raise FileNotFoundError(pth)
|
|
517
992
|
bc = per_path_barcode.get(pth, "barcode")
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
993
|
+
if backend_choice == "python":
|
|
994
|
+
iterator = _fastq_iter(pth)
|
|
995
|
+
else:
|
|
996
|
+
iterator = _fastq_iter_plain(pth)
|
|
997
|
+
for rec in iterator:
|
|
998
|
+
if backend_choice == "python":
|
|
999
|
+
a = _make_unaligned_segment(
|
|
1000
|
+
rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
|
|
1001
|
+
)
|
|
1002
|
+
bam_out_ctx.write(a)
|
|
1003
|
+
else:
|
|
1004
|
+
_write_sam_line(
|
|
1005
|
+
bam_out_ctx.stdin,
|
|
1006
|
+
rec[0],
|
|
1007
|
+
rec[1],
|
|
1008
|
+
rec[2],
|
|
1009
|
+
bc,
|
|
1010
|
+
read1=False,
|
|
1011
|
+
read2=False,
|
|
1012
|
+
add_read_group=add_read_group,
|
|
1013
|
+
)
|
|
521
1014
|
per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
|
|
522
1015
|
total_written += 1
|
|
523
1016
|
singletons_written += 1
|
|
1017
|
+
finally:
|
|
1018
|
+
if backend_choice == "python":
|
|
1019
|
+
bam_out_ctx.close()
|
|
1020
|
+
else:
|
|
1021
|
+
if bam_out_ctx.stdin is not None:
|
|
1022
|
+
bam_out_ctx.stdin.close()
|
|
1023
|
+
rc = bam_out_ctx.wait()
|
|
1024
|
+
if rc != 0:
|
|
1025
|
+
stderr = bam_out_ctx.stderr.read() if bam_out_ctx.stderr else ""
|
|
1026
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
524
1027
|
|
|
525
1028
|
return {
|
|
526
1029
|
"total_reads": total_written,
|
|
@@ -530,43 +1033,61 @@ def concatenate_fastqs_to_bam(
|
|
|
530
1033
|
"barcodes": barcodes_in_order,
|
|
531
1034
|
}
|
|
532
1035
|
|
|
533
|
-
|
|
1036
|
+
|
|
1037
|
+
def count_aligned_reads(bam_file, samtools_backend: str | None = "auto"):
|
|
534
1038
|
"""
|
|
535
1039
|
Counts the number of aligned reads in a bam file that map to each reference record.
|
|
536
|
-
|
|
1040
|
+
|
|
537
1041
|
Parameters:
|
|
538
1042
|
bam_file (str): A string representing the path to an aligned BAM file.
|
|
539
|
-
|
|
1043
|
+
|
|
540
1044
|
Returns:
|
|
541
1045
|
aligned_reads_count (int): The total number or reads aligned in the BAM.
|
|
542
1046
|
unaligned_reads_count (int): The total number of reads not aligned in the BAM.
|
|
543
1047
|
record_counts (dict): A dictionary keyed by reference record instance that points toa tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
|
|
544
1048
|
|
|
545
1049
|
"""
|
|
546
|
-
|
|
1050
|
+
logger.info("Counting aligned reads in BAM > {}".format(bam_file.name))
|
|
1051
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
547
1052
|
aligned_reads_count = 0
|
|
548
1053
|
unaligned_reads_count = 0
|
|
549
|
-
# Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
|
|
550
|
-
record_counts = defaultdict(int)
|
|
551
|
-
|
|
552
|
-
with pysam.AlignmentFile(str(bam_file), "rb") as bam:
|
|
553
|
-
total_reads = bam.mapped + bam.unmapped
|
|
554
|
-
# Iterate over reads to get the total mapped read counts and the reads that map to each reference
|
|
555
|
-
for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
|
|
556
|
-
if read.is_unmapped:
|
|
557
|
-
unaligned_reads_count += 1
|
|
558
|
-
else:
|
|
559
|
-
aligned_reads_count += 1
|
|
560
|
-
record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
|
|
561
1054
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
1055
|
+
if backend_choice == "python":
|
|
1056
|
+
pysam_mod = _require_pysam()
|
|
1057
|
+
record_counts = defaultdict(int)
|
|
1058
|
+
with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
|
|
1059
|
+
total_reads = bam.mapped + bam.unmapped
|
|
1060
|
+
# Iterate over reads to get the total mapped read counts and the reads that map to each reference
|
|
1061
|
+
for read in bam:
|
|
1062
|
+
if read.is_unmapped:
|
|
1063
|
+
unaligned_reads_count += 1
|
|
1064
|
+
else:
|
|
1065
|
+
aligned_reads_count += 1
|
|
1066
|
+
record_counts[read.reference_name] += (
|
|
1067
|
+
1 # Automatically increments if key exists, adds if not
|
|
1068
|
+
)
|
|
1069
|
+
|
|
1070
|
+
# reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
|
|
1071
|
+
for reference in record_counts:
|
|
1072
|
+
proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
|
|
1073
|
+
record_counts[reference] = (
|
|
1074
|
+
record_counts[reference],
|
|
1075
|
+
proportion_mapped_reads_in_record,
|
|
1076
|
+
)
|
|
1077
|
+
return aligned_reads_count, unaligned_reads_count, dict(record_counts)
|
|
1078
|
+
|
|
1079
|
+
bam_path = Path(bam_file)
|
|
1080
|
+
_ensure_bam_index(bam_path, backend_choice)
|
|
1081
|
+
cmd = ["samtools", "idxstats", str(bam_path)]
|
|
1082
|
+
cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1083
|
+
if cp.returncode != 0:
|
|
1084
|
+
raise RuntimeError(f"samtools idxstats failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
1085
|
+
return _parse_idxstats_output(cp.stdout)
|
|
566
1086
|
|
|
567
|
-
return aligned_reads_count, unaligned_reads_count, dict(record_counts)
|
|
568
1087
|
|
|
569
|
-
def demux_and_index_BAM(
|
|
1088
|
+
def demux_and_index_BAM(
|
|
1089
|
+
aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
|
|
1090
|
+
):
|
|
570
1091
|
"""
|
|
571
1092
|
A wrapper function for splitting BAMS and indexing them.
|
|
572
1093
|
Parameters:
|
|
@@ -577,11 +1098,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
577
1098
|
barcode_both_ends (bool): Whether to require both ends to be barcoded.
|
|
578
1099
|
trim (bool): Whether to trim off barcodes after demultiplexing.
|
|
579
1100
|
threads (int): Number of threads to use.
|
|
580
|
-
|
|
1101
|
+
|
|
581
1102
|
Returns:
|
|
582
1103
|
bam_files (list): List of split BAM file path strings
|
|
583
1104
|
Splits an input BAM file on barcode value and makes a BAM index file.
|
|
584
1105
|
"""
|
|
1106
|
+
|
|
585
1107
|
input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
|
|
586
1108
|
command = ["dorado", "demux", "--kit-name", barcode_kit]
|
|
587
1109
|
if barcode_both_ends:
|
|
@@ -594,25 +1116,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
594
1116
|
pass
|
|
595
1117
|
command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
|
|
596
1118
|
command.append(str(input_bam))
|
|
597
|
-
command_string =
|
|
598
|
-
|
|
599
|
-
|
|
1119
|
+
command_string = " ".join(command)
|
|
1120
|
+
logger.info("Running dorado demux: %s", " ".join(command))
|
|
1121
|
+
|
|
1122
|
+
proc = subprocess.Popen(
|
|
1123
|
+
command,
|
|
1124
|
+
stdout=subprocess.PIPE,
|
|
1125
|
+
stderr=subprocess.PIPE,
|
|
1126
|
+
text=True,
|
|
1127
|
+
)
|
|
1128
|
+
|
|
1129
|
+
assert proc.stderr is not None
|
|
1130
|
+
_stream_dorado_logs(proc.stderr)
|
|
1131
|
+
rc = proc.wait()
|
|
1132
|
+
|
|
1133
|
+
if rc != 0:
|
|
1134
|
+
raise RuntimeError(f"dorado demux failed with exit code {rc}")
|
|
600
1135
|
|
|
601
1136
|
bam_files = sorted(
|
|
602
|
-
p for p in split_dir.glob(f"*{bam_suffix}")
|
|
603
|
-
if p.is_file() and p.suffix == bam_suffix
|
|
1137
|
+
p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
|
|
604
1138
|
)
|
|
605
1139
|
|
|
606
1140
|
if not bam_files:
|
|
607
1141
|
raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
|
|
608
|
-
|
|
1142
|
+
|
|
609
1143
|
# ---- Optional renaming with prefix ----
|
|
610
1144
|
renamed_bams = []
|
|
611
1145
|
prefix = "de" if barcode_both_ends else "se"
|
|
612
1146
|
|
|
613
1147
|
for bam in bam_files:
|
|
614
1148
|
bam = Path(bam)
|
|
615
|
-
bai = bam.with_suffix(bam_suffix + ".bai")
|
|
1149
|
+
bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
|
|
616
1150
|
|
|
617
1151
|
if prefix:
|
|
618
1152
|
new_name = f"{prefix}_{bam.name}"
|
|
@@ -628,10 +1162,18 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
628
1162
|
bai.rename(new_bai)
|
|
629
1163
|
|
|
630
1164
|
renamed_bams.append(new_bam)
|
|
631
|
-
|
|
1165
|
+
|
|
632
1166
|
return renamed_bams
|
|
633
1167
|
|
|
634
|
-
|
|
1168
|
+
|
|
1169
|
+
def extract_base_identities(
|
|
1170
|
+
bam_file,
|
|
1171
|
+
chromosome,
|
|
1172
|
+
positions,
|
|
1173
|
+
max_reference_length,
|
|
1174
|
+
sequence,
|
|
1175
|
+
samtools_backend: str | None = "auto",
|
|
1176
|
+
):
|
|
635
1177
|
"""
|
|
636
1178
|
Efficiently extracts base identities from mapped reads with reference coordinates.
|
|
637
1179
|
|
|
@@ -646,38 +1188,95 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
646
1188
|
dict: Base identities from forward mapped reads.
|
|
647
1189
|
dict: Base identities from reverse mapped reads.
|
|
648
1190
|
"""
|
|
1191
|
+
logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
|
|
649
1192
|
timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
|
|
650
1193
|
|
|
651
1194
|
positions = set(positions)
|
|
652
|
-
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length,
|
|
653
|
-
rev_base_identities = defaultdict(lambda: np.full(max_reference_length,
|
|
1195
|
+
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
1196
|
+
rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
654
1197
|
mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
|
|
655
1198
|
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
1199
|
+
backend_choice = _resolve_samtools_backend(samtools_backend)
|
|
1200
|
+
ref_seq = sequence.upper()
|
|
1201
|
+
|
|
1202
|
+
if backend_choice == "python":
|
|
1203
|
+
logger.debug("Extracting base identities using python")
|
|
1204
|
+
pysam_mod = _require_pysam()
|
|
1205
|
+
# print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
1206
|
+
with pysam_mod.AlignmentFile(str(bam_file), "rb") as bam:
|
|
1207
|
+
total_reads = bam.mapped
|
|
1208
|
+
for read in bam.fetch(chromosome):
|
|
1209
|
+
if not read.is_mapped:
|
|
1210
|
+
continue # Skip unmapped reads
|
|
663
1211
|
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
1212
|
+
read_name = read.query_name
|
|
1213
|
+
query_sequence = read.query_sequence
|
|
1214
|
+
base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
|
|
667
1215
|
|
|
668
|
-
|
|
669
|
-
|
|
1216
|
+
# Use get_aligned_pairs directly with positions filtering
|
|
1217
|
+
aligned_pairs = read.get_aligned_pairs(matches_only=True)
|
|
670
1218
|
|
|
671
|
-
|
|
672
|
-
if reference_position in positions:
|
|
1219
|
+
for read_position, reference_position in aligned_pairs:
|
|
673
1220
|
read_base = query_sequence[read_position]
|
|
674
1221
|
ref_base = ref_seq[reference_position]
|
|
1222
|
+
if reference_position in positions:
|
|
1223
|
+
base_dict[read_name][reference_position] = read_base
|
|
675
1224
|
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
1225
|
+
# Track mismatches (excluding Ns)
|
|
1226
|
+
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
1227
|
+
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1228
|
+
else:
|
|
1229
|
+
bam_path = Path(bam_file)
|
|
1230
|
+
logger.debug("Extracting base identities using samtools")
|
|
1231
|
+
_ensure_bam_index(bam_path, backend_choice)
|
|
1232
|
+
|
|
1233
|
+
def _iter_aligned_pairs(cigar: str, start: int) -> Iterable[Tuple[int, int]]:
|
|
1234
|
+
qpos = 0
|
|
1235
|
+
rpos = start
|
|
1236
|
+
for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
|
|
1237
|
+
length = int(length_str)
|
|
1238
|
+
if op in {"M", "=", "X"}:
|
|
1239
|
+
for _ in range(length):
|
|
1240
|
+
yield qpos, rpos
|
|
1241
|
+
qpos += 1
|
|
1242
|
+
rpos += 1
|
|
1243
|
+
elif op in {"I", "S"}:
|
|
1244
|
+
qpos += length
|
|
1245
|
+
elif op in {"D", "N"}:
|
|
1246
|
+
rpos += length
|
|
1247
|
+
elif op in {"H", "P"}:
|
|
1248
|
+
continue
|
|
1249
|
+
|
|
1250
|
+
cmd = ["samtools", "view", "-F", "4", str(bam_path), chromosome]
|
|
1251
|
+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
|
1252
|
+
assert proc.stdout is not None
|
|
1253
|
+
for line in proc.stdout:
|
|
1254
|
+
if not line.strip() or line.startswith("@"):
|
|
1255
|
+
continue
|
|
1256
|
+
fields = line.rstrip("\n").split("\t")
|
|
1257
|
+
if len(fields) < 11:
|
|
1258
|
+
continue
|
|
1259
|
+
read_name = fields[0]
|
|
1260
|
+
flag = int(fields[1])
|
|
1261
|
+
pos = int(fields[3])
|
|
1262
|
+
cigar = fields[5]
|
|
1263
|
+
query_sequence = fields[9]
|
|
1264
|
+
if cigar == "*" or query_sequence == "*":
|
|
1265
|
+
continue
|
|
1266
|
+
base_dict = rev_base_identities if (flag & 16) else fwd_base_identities
|
|
1267
|
+
for read_pos, ref_pos in _iter_aligned_pairs(cigar, pos - 1):
|
|
1268
|
+
if read_pos >= len(query_sequence) or ref_pos >= len(ref_seq):
|
|
1269
|
+
continue
|
|
1270
|
+
read_base = query_sequence[read_pos]
|
|
1271
|
+
ref_base = ref_seq[ref_pos]
|
|
1272
|
+
if ref_pos in positions:
|
|
1273
|
+
base_dict[read_name][ref_pos] = read_base
|
|
1274
|
+
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
680
1275
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
1276
|
+
rc = proc.wait()
|
|
1277
|
+
if rc != 0:
|
|
1278
|
+
stderr = proc.stderr.read() if proc.stderr else ""
|
|
1279
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
681
1280
|
|
|
682
1281
|
# Determine C→T vs G→A dominance per read
|
|
683
1282
|
mismatch_trend_per_read = {}
|
|
@@ -694,39 +1293,145 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
694
1293
|
else:
|
|
695
1294
|
mismatch_trend_per_read[read_name] = "none"
|
|
696
1295
|
|
|
697
|
-
return
|
|
1296
|
+
return (
|
|
1297
|
+
dict(fwd_base_identities),
|
|
1298
|
+
dict(rev_base_identities),
|
|
1299
|
+
dict(mismatch_counts_per_read),
|
|
1300
|
+
mismatch_trend_per_read,
|
|
1301
|
+
)
|
|
1302
|
+
|
|
1303
|
+
|
|
1304
|
+
def extract_read_features_from_bam(
    bam_file_path: str | Path, samtools_backend: str | None = "auto"
) -> Dict[str, List[float]]:
    """Extract read metrics from a BAM file.

    Args:
        bam_file_path: Path to the BAM file.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        Mapping of read name to [read_length, read_median_qscore, reference_length,
        mapped_length, mapping_quality].
    """
    logger.debug(
        "Extracting read metrics from BAM using extract_read_features_from_bam: %s",
        bam_file_path,
    )
    backend_choice = _resolve_samtools_backend(samtools_backend)
    read_metrics: Dict[str, List[float]] = {}

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with pysam_mod.AlignmentFile(str(bam_file_path), "rb") as bam_file:
            reference_lengths = dict(zip(bam_file.references, bam_file.lengths))
            for read in bam_file:
                if read.is_unmapped:
                    continue
                read_quality = read.query_qualities
                # pysam returns None when the BAM quality field is "*".
                if read_quality is None:
                    median_read_quality = float("nan")
                else:
                    median_read_quality = float(np.median(read_quality))
                reference_length = reference_lengths.get(read.reference_name, float("nan"))
                # Mapped length = sum of aligned block lengths (excludes clips/insertions).
                mapped_length = sum(end - start for start, end in read.get_blocks())
                mapping_quality = float(read.mapping_quality)
                read_metrics[read.query_name] = [
                    float(read.query_length),
                    median_read_quality,
                    float(reference_length),
                    float(mapped_length),
                    mapping_quality,
                ]
        return read_metrics

    # CLI fallback: parse `samtools view` text output directly.
    bam_path = Path(bam_file_path)

    def _parse_reference_lengths(header_text: str) -> Dict[str, int]:
        # Pull SN (name) / LN (length) pairs out of @SQ header lines.
        ref_lengths: Dict[str, int] = {}
        for line in header_text.splitlines():
            if not line.startswith("@SQ"):
                continue
            fields = line.split("\t")
            name = None
            length = None
            for field in fields[1:]:
                if field.startswith("SN:"):
                    name = field.split(":", 1)[1]
                elif field.startswith("LN:"):
                    length = int(field.split(":", 1)[1])
            if name is not None and length is not None:
                ref_lengths[name] = length
        return ref_lengths

    def _mapped_length_from_cigar(cigar: str) -> int:
        # Count only sequence-aligned ops (M/=/X); clips, indels, etc. are skipped.
        mapped = 0
        for length_str, op in re.findall(r"(\d+)([MIDNSHP=XB])", cigar):
            length = int(length_str)
            if op in {"M", "=", "X"}:
                mapped += length
        return mapped

    header_cp = subprocess.run(
        ["samtools", "view", "-H", str(bam_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    if header_cp.returncode != 0:
        raise RuntimeError(
            f"samtools view -H failed (exit {header_cp.returncode}):\n{header_cp.stderr}"
        )
    reference_lengths = _parse_reference_lengths(header_cp.stdout)

    # -F 4 filters out unmapped reads, mirroring the pysam branch above.
    proc = subprocess.Popen(
        ["samtools", "view", "-F", "4", str(bam_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    assert proc.stdout is not None
    for line in proc.stdout:
        if not line.strip() or line.startswith("@"):
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 11:
            continue
        read_name = fields[0]
        reference_name = fields[2]
        mapping_quality = float(fields[4])
        cigar = fields[5]
        sequence = fields[9]
        quality = fields[10]
        # "*" means the field is absent in the SAM record.
        read_length = float("nan") if sequence == "*" else float(len(sequence))
        if quality == "*" or not quality:
            median_read_quality = float("nan")
        else:
            # Quality string is Phred+33 encoded.
            phreds = [ord(char) - 33 for char in quality]
            median_read_quality = float(np.median(phreds))
        reference_length = float(reference_lengths.get(reference_name, float("nan")))
        mapped_length = float(_mapped_length_from_cigar(cigar)) if cigar != "*" else 0.0
        read_metrics[read_name] = [
            read_length,
            median_read_quality,
            reference_length,
            mapped_length,
            mapping_quality,
        ]

    # Drain stderr BEFORE waiting: if samtools fills the stderr pipe while we
    # only consume stdout, both processes can deadlock.
    stderr_text = proc.stderr.read() if proc.stderr else ""
    rc = proc.wait()
    if rc != 0:
        raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr_text}")

    return read_metrics
1432
|
|
|
729
|
-
|
|
1433
|
+
|
|
1434
|
+
def extract_readnames_from_bam(aligned_BAM, samtools_backend: str | None = "auto"):
    """
    Takes a BAM and writes out a txt file containing read names from the BAM

    Parameters:
        aligned_BAM (str): Path to the BAM file; the output text file is written
            next to it with a ``_read_names.txt`` suffix.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        None
    """
    backend_choice = _resolve_samtools_backend(samtools_backend)
    # Make a text file of reads for the BAM
    txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        with (
            pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam,
            open(txt_output, "w", encoding="utf-8") as output_file,
        ):
            for read in bam:
                output_file.write(f"{read.query_name}\n")
        return

    samtools_view = subprocess.Popen(
        ["samtools", "view", aligned_BAM], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    assert samtools_view.stdout is not None
    with open(txt_output, "w", encoding="utf-8") as output_file:
        for line in samtools_view.stdout:
            if not line.strip():
                continue
            # QNAME is the first tab-separated field of a SAM record.
            qname = line.split("\t", 1)[0]
            output_file.write(f"{qname}\n")
    # Drain stderr before waiting so a full stderr pipe cannot block samtools.
    stderr_text = samtools_view.stderr.read() if samtools_view.stderr else ""
    rc = samtools_view.wait()
    if rc != 0:
        raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr_text}")
|
|
1473
|
+
|
|
1474
|
+
|
|
1475
|
+
def separate_bam_by_bc(
    input_bam, output_prefix, bam_suffix, split_dir, samtools_backend: str | None = "auto"
):
    """
    Separates an input BAM file on the BC SAM tag values.

    Parameters:
        input_bam (Path): Path to the BAM file to demultiplex.
        output_prefix (str): A prefix to append to the output BAM.
        bam_suffix (str): A suffix to add to the bam file.
        split_dir (Path): String indicating path to directory to split BAMs into
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        None
        Writes out split BAM files.
    """
    logger.debug("Demultiplexing BAM based on the BC tag")
    bam_base_minus_suffix = input_bam.stem

    backend_choice = _resolve_samtools_backend(samtools_backend)

    if backend_choice == "python":
        pysam_mod = _require_pysam()
        # Lazily opened output BAMs, keyed by BC tag value.
        output_files = {}
        with pysam_mod.AlignmentFile(str(input_bam), "rb") as bam:
            try:
                for read in bam:
                    try:
                        # SAM tag value; get_tag raises KeyError when absent.
                        bc_tag = read.get_tag("BC", with_value_type=True)[0]
                    except KeyError:
                        logger.warning(f"BC tag not present for read: {read.query_name}")
                        continue
                    if bc_tag not in output_files:
                        output_path = (
                            split_dir
                            / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
                        )
                        output_files[bc_tag] = pysam_mod.AlignmentFile(
                            str(output_path), "wb", header=bam.header
                        )
                    output_files[bc_tag].write(read)
            finally:
                # Always close split BAMs, even if iteration raises mid-way.
                for output_file in output_files.values():
                    output_file.close()
        return

    def _collect_bc_tags() -> set[str]:
        # First pass over the BAM: gather the distinct BC tag values.
        bc_tags: set[str] = set()
        proc = subprocess.Popen(
            ["samtools", "view", str(input_bam)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        assert proc.stdout is not None
        for line in proc.stdout:
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            # Optional tags start at field 12; format is TAG:TYPE:VALUE.
            for tag in fields[11:]:
                if tag.startswith("BC:"):
                    bc_tags.add(tag.split(":", 2)[2])
                    break
        # Drain stderr before waiting so a full stderr pipe cannot block samtools.
        stderr_text = proc.stderr.read() if proc.stderr else ""
        rc = proc.wait()
        if rc != 0:
            raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr_text}")
        return bc_tags

    bc_tags = _collect_bc_tags()
    if not bc_tags:
        logger.warning("No BC tags found in %s", input_bam)
        return

    for bc_tag in bc_tags:
        output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
        # `samtools view -d BC:<value>` keeps only reads carrying that exact tag value.
        cmd = ["samtools", "view", "-b", "-d", f"BC:{bc_tag}", "-o", str(output_path)]
        cmd.append(str(input_bam))
        cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
        if cp.returncode != 0:
            raise RuntimeError(
                f"samtools view failed for BC={bc_tag} (exit {cp.returncode}):\n{cp.stderr}"
            )
|
|
1564
|
+
|
|
1565
|
+
|
|
1566
|
+
def split_and_index_BAM(
    aligned_sorted_BAM, split_dir, bam_suffix, samtools_backend: str | None = "auto"
):
    """
    A wrapper function for splitting BAMS and indexing them.
    Parameters:
        aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
        split_dir (Path): A string representing the file path to the directory to split the BAMs into.
        bam_suffix (str): A suffix to add to the bam file.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        list[str]: Paths of the split BAM files that were indexed.
        Splits an input BAM file on barcode value and makes a BAM index file.
    """
    logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix
    file_prefix = date_string()
    # separate_bam_by_bc accesses .stem/.name on its input, so pass a Path,
    # not the plain string built above.
    separate_bam_by_bc(
        Path(aligned_sorted_output),
        file_prefix,
        bam_suffix,
        split_dir,
        samtools_backend=samtools_backend,
    )
    # Make a BAM index file for the BAMs in that directory
    bam_pattern = "*" + bam_suffix
    bam_files = glob.glob(str(split_dir / bam_pattern))
    # Skip any pre-existing index files that happen to match the suffix pattern.
    bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
    backend_choice = _resolve_samtools_backend(samtools_backend)
    for input_file in bam_files:
        if backend_choice == "python":
            _index_bam_with_pysam(input_file)
        else:
            _index_bam_with_samtools(input_file)

    return bam_files
|