smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,24 +1,55 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import glob
|
|
4
4
|
import os
|
|
5
|
+
import re
|
|
5
6
|
import subprocess
|
|
6
|
-
import glob
|
|
7
7
|
import time
|
|
8
|
-
from
|
|
9
|
-
import
|
|
8
|
+
from collections import Counter, defaultdict, deque
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
10
|
from itertools import zip_longest
|
|
11
|
-
import
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
12
13
|
|
|
13
14
|
import numpy as np
|
|
14
|
-
import
|
|
15
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
|
-
from concurrent.futures import ProcessPoolExecutor
|
|
17
|
-
|
|
15
|
+
import pysam
|
|
18
16
|
from tqdm import tqdm
|
|
19
|
-
from collections import defaultdict, Counter
|
|
20
17
|
|
|
21
|
-
from
|
|
18
|
+
from smftools.logging_utils import get_logger
|
|
19
|
+
|
|
20
|
+
from ..readwrite import date_string, time_string
|
|
21
|
+
|
|
22
|
+
logger = get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
|
|
25
|
+
_EMPTY_RE = re.compile(r"^\s*$")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _stream_dorado_logs(stderr_iter) -> None:
|
|
29
|
+
"""Stream dorado stderr and emit structured log messages.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
stderr_iter: Iterable of stderr lines.
|
|
33
|
+
"""
|
|
34
|
+
last_n: int | None = None
|
|
35
|
+
|
|
36
|
+
for raw in stderr_iter:
|
|
37
|
+
line = raw.rstrip("\n")
|
|
38
|
+
if _EMPTY_RE.match(line):
|
|
39
|
+
continue
|
|
40
|
+
|
|
41
|
+
m = _PROGRESS_RE.search(line)
|
|
42
|
+
if m:
|
|
43
|
+
n = int(m.group(1))
|
|
44
|
+
logger.debug("[dorado] Output records written: %d", n)
|
|
45
|
+
last_n = n
|
|
46
|
+
continue
|
|
47
|
+
|
|
48
|
+
logger.info("[dorado] %s", line)
|
|
49
|
+
|
|
50
|
+
if last_n is not None:
|
|
51
|
+
logger.info("[dorado] Final output records written: %d", last_n)
|
|
52
|
+
|
|
22
53
|
|
|
23
54
|
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
|
|
24
55
|
"""
|
|
@@ -26,7 +57,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
26
57
|
"""
|
|
27
58
|
bam_path = str(bam_path)
|
|
28
59
|
fastq_path = str(fastq_path)
|
|
29
|
-
|
|
60
|
+
|
|
61
|
+
logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
|
|
62
|
+
|
|
63
|
+
with (
|
|
64
|
+
pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
|
|
65
|
+
open(fastq_path, "w", encoding="utf-8") as fq,
|
|
66
|
+
):
|
|
30
67
|
for r in bam.fetch(until_eof=True):
|
|
31
68
|
# Optionally skip secondary/supplementary:
|
|
32
69
|
# if r.is_secondary or r.is_supplementary:
|
|
@@ -45,14 +82,22 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
45
82
|
# q is an array/list of ints (Phred scores).
|
|
46
83
|
# Convert to FASTQ string with Phred+33 encoding,
|
|
47
84
|
# clamping to sane range [0, 93] to stay in printable ASCII.
|
|
48
|
-
qual_str = "".join(
|
|
49
|
-
chr(min(max(int(qv), 0), 93) + 33)
|
|
50
|
-
for qv in q
|
|
51
|
-
)
|
|
85
|
+
qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
|
|
52
86
|
|
|
53
87
|
fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
|
|
54
88
|
|
|
55
|
-
|
|
89
|
+
|
|
90
|
+
def _sort_bam_with_pysam(
|
|
91
|
+
in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
|
|
92
|
+
) -> None:
|
|
93
|
+
"""Sort a BAM file using pysam.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
in_bam: Input BAM path.
|
|
97
|
+
out_bam: Output BAM path.
|
|
98
|
+
threads: Optional thread count.
|
|
99
|
+
"""
|
|
100
|
+
logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
|
|
56
101
|
in_bam, out_bam = str(in_bam), str(out_bam)
|
|
57
102
|
args = []
|
|
58
103
|
if threads:
|
|
@@ -60,21 +105,31 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
|
|
|
60
105
|
args += ["-o", out_bam, in_bam]
|
|
61
106
|
pysam.sort(*args)
|
|
62
107
|
|
|
108
|
+
|
|
63
109
|
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
110
|
+
"""Index a BAM file using pysam.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
bam_path: BAM path to index.
|
|
114
|
+
threads: Optional thread count.
|
|
115
|
+
"""
|
|
64
116
|
bam_path = str(bam_path)
|
|
117
|
+
logger.debug(f"Indexing BAM using _index_bam_with_pysam")
|
|
65
118
|
# pysam.index supports samtools-style args
|
|
66
119
|
if threads:
|
|
67
120
|
pysam.index("-@", str(threads), bam_path)
|
|
68
121
|
else:
|
|
69
122
|
pysam.index(bam_path)
|
|
70
123
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
124
|
+
|
|
125
|
+
def align_and_sort_BAM(
|
|
126
|
+
fasta,
|
|
127
|
+
input,
|
|
128
|
+
cfg,
|
|
74
129
|
):
|
|
75
130
|
"""
|
|
76
131
|
A wrapper for running dorado aligner and samtools functions
|
|
77
|
-
|
|
132
|
+
|
|
78
133
|
Parameters:
|
|
79
134
|
fasta (str): File path to the reference genome to align to.
|
|
80
135
|
input (str): File path to the basecalled file to align. Works for .bam and .fastq files
|
|
@@ -84,61 +139,95 @@ def align_and_sort_BAM(fasta,
|
|
|
84
139
|
None
|
|
85
140
|
The function writes out files for: 1) An aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
|
|
86
141
|
"""
|
|
142
|
+
logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
|
|
87
143
|
input_basename = input.name
|
|
88
144
|
input_suffix = input.suffix
|
|
89
|
-
input_as_fastq = input.with_name(input.stem +
|
|
145
|
+
input_as_fastq = input.with_name(input.stem + ".fastq")
|
|
90
146
|
|
|
91
147
|
output_path_minus_suffix = cfg.output_directory / input.stem
|
|
92
|
-
|
|
148
|
+
|
|
93
149
|
aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
|
|
94
150
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
95
|
-
aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
151
|
+
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
96
152
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
97
153
|
|
|
98
154
|
if cfg.threads:
|
|
99
155
|
threads = str(cfg.threads)
|
|
100
156
|
else:
|
|
101
157
|
threads = None
|
|
102
|
-
|
|
103
|
-
if cfg.aligner ==
|
|
158
|
+
|
|
159
|
+
if cfg.aligner == "minimap2":
|
|
104
160
|
if not cfg.align_from_bam:
|
|
105
|
-
|
|
161
|
+
logger.debug(f"Converting BAM to FASTQ: {input}")
|
|
106
162
|
_bam_to_fastq_with_pysam(input, input_as_fastq)
|
|
107
|
-
|
|
163
|
+
logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
|
|
108
164
|
mm_input = input_as_fastq
|
|
109
|
-
else:
|
|
110
|
-
|
|
165
|
+
else:
|
|
166
|
+
logger.debug(f"Aligning BAM to Reference: {input}")
|
|
111
167
|
mm_input = input
|
|
112
168
|
|
|
113
169
|
if threads:
|
|
114
|
-
minimap_command =
|
|
170
|
+
minimap_command = (
|
|
171
|
+
["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
|
|
172
|
+
)
|
|
115
173
|
else:
|
|
116
|
-
minimap_command = [
|
|
117
|
-
|
|
174
|
+
minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
|
|
175
|
+
|
|
176
|
+
with open(aligned_output, "wb") as out:
|
|
177
|
+
proc = subprocess.Popen(
|
|
178
|
+
minimap_command,
|
|
179
|
+
stdout=out,
|
|
180
|
+
stderr=subprocess.PIPE,
|
|
181
|
+
text=True,
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
assert proc.stderr is not None
|
|
185
|
+
for line in proc.stderr:
|
|
186
|
+
logger.info("[minimap2] %s", line.rstrip())
|
|
187
|
+
|
|
188
|
+
ret = proc.wait()
|
|
189
|
+
if ret != 0:
|
|
190
|
+
raise RuntimeError(f"minimap2 failed with exit code {ret}")
|
|
118
191
|
|
|
119
192
|
if not cfg.align_from_bam:
|
|
120
193
|
os.remove(input_as_fastq)
|
|
121
194
|
|
|
122
|
-
elif cfg.aligner ==
|
|
195
|
+
elif cfg.aligner == "dorado":
|
|
123
196
|
# Run dorado aligner
|
|
124
197
|
print(f"Aligning BAM to Reference: {input}")
|
|
125
198
|
if threads:
|
|
126
|
-
alignment_command =
|
|
199
|
+
alignment_command = (
|
|
200
|
+
["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
|
|
201
|
+
)
|
|
127
202
|
else:
|
|
128
203
|
alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
|
|
129
|
-
subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
|
|
130
204
|
|
|
205
|
+
with open(aligned_output, "wb") as out:
|
|
206
|
+
proc = subprocess.Popen(
|
|
207
|
+
alignment_command,
|
|
208
|
+
stdout=out,
|
|
209
|
+
stderr=subprocess.PIPE,
|
|
210
|
+
text=True,
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
assert proc.stderr is not None
|
|
214
|
+
_stream_dorado_logs(proc.stderr)
|
|
215
|
+
ret = proc.wait()
|
|
216
|
+
|
|
217
|
+
if ret != 0:
|
|
218
|
+
raise RuntimeError(f"dorado failed with exit code {ret}")
|
|
131
219
|
else:
|
|
132
|
-
|
|
220
|
+
logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
|
|
133
221
|
return
|
|
134
|
-
|
|
222
|
+
|
|
135
223
|
# --- Sort & Index with pysam ---
|
|
136
|
-
|
|
224
|
+
logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
|
|
137
225
|
_sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
|
|
138
226
|
|
|
139
|
-
|
|
227
|
+
logger.debug(f"Indexing: {aligned_sorted_output}")
|
|
140
228
|
_index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
141
229
|
|
|
230
|
+
|
|
142
231
|
def bam_qc(
|
|
143
232
|
bam_files: Iterable[str | Path],
|
|
144
233
|
bam_qc_dir: str | Path,
|
|
@@ -153,133 +242,154 @@ def bam_qc(
|
|
|
153
242
|
Prefers pysam; falls back to `samtools` if needed.
|
|
154
243
|
Runs BAMs in parallel (up to `threads`, default serial).
|
|
155
244
|
"""
|
|
156
|
-
import subprocess
|
|
157
245
|
import shutil
|
|
246
|
+
import subprocess
|
|
247
|
+
|
|
248
|
+
logger.debug("Performing BAM QC using bam_qc")
|
|
158
249
|
|
|
159
250
|
# Try to import pysam once
|
|
160
251
|
try:
|
|
161
|
-
import pysam
|
|
162
|
-
|
|
252
|
+
import pysam # type: ignore
|
|
253
|
+
|
|
254
|
+
have_pysam = True
|
|
163
255
|
except Exception:
|
|
164
|
-
|
|
256
|
+
pysam = None # type: ignore
|
|
257
|
+
have_pysam = False
|
|
165
258
|
|
|
166
259
|
bam_qc_dir = Path(bam_qc_dir)
|
|
167
260
|
bam_qc_dir.mkdir(parents=True, exist_ok=True)
|
|
168
261
|
|
|
169
|
-
|
|
262
|
+
bam_paths = [Path(b) for b in bam_files]
|
|
170
263
|
|
|
171
264
|
def _has_index(p: Path) -> bool:
|
|
172
|
-
if
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
return bai.exists() or
|
|
176
|
-
if
|
|
177
|
-
|
|
178
|
-
return crai.exists()
|
|
265
|
+
"""Return True if a BAM/CRAM index exists for the path."""
|
|
266
|
+
suf = p.suffix.lower()
|
|
267
|
+
if suf == ".bam":
|
|
268
|
+
return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
|
|
269
|
+
if suf == ".cram":
|
|
270
|
+
return Path(str(p) + ".crai").exists()
|
|
179
271
|
return False
|
|
180
272
|
|
|
181
273
|
def _ensure_index(p: Path) -> None:
|
|
274
|
+
"""Ensure a BAM/CRAM index exists, creating one if needed."""
|
|
182
275
|
if _has_index(p):
|
|
183
276
|
return
|
|
184
|
-
if
|
|
185
|
-
|
|
186
|
-
pysam.index(str(p))
|
|
277
|
+
if have_pysam:
|
|
278
|
+
assert pysam is not None
|
|
279
|
+
pysam.index(str(p)) # supports BAM & CRAM
|
|
187
280
|
else:
|
|
281
|
+
if not shutil.which("samtools"):
|
|
282
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
188
283
|
cmd = ["samtools", "index", str(p)]
|
|
189
|
-
|
|
284
|
+
# capture text so errors are readable; raise on failure
|
|
285
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
286
|
+
if cp.returncode != 0:
|
|
287
|
+
raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
288
|
+
|
|
289
|
+
def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
|
|
290
|
+
"""
|
|
291
|
+
Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
|
|
292
|
+
"""
|
|
293
|
+
last_err = deque(maxlen=80)
|
|
294
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
295
|
+
|
|
296
|
+
with open(out_path, "w") as fh:
|
|
297
|
+
proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
|
|
298
|
+
assert proc.stderr is not None
|
|
299
|
+
for line in proc.stderr:
|
|
300
|
+
line = line.rstrip()
|
|
301
|
+
if line:
|
|
302
|
+
last_err.append(line)
|
|
303
|
+
logger.info("[%s][%s] %s", tag, bam.name, line)
|
|
304
|
+
rc = proc.wait()
|
|
305
|
+
|
|
306
|
+
if rc != 0:
|
|
307
|
+
tail = "\n".join(last_err)
|
|
308
|
+
raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
|
|
309
|
+
return rc
|
|
310
|
+
|
|
311
|
+
def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
|
|
312
|
+
"""Run stats/flagstat/idxstats for a single BAM.
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
bam: Path to the BAM file.
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
Tuple of (bam_path, list of (stage, return_code)).
|
|
319
|
+
"""
|
|
320
|
+
import subprocess
|
|
190
321
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
results: List[Tuple[str, int]] = []
|
|
194
|
-
base = bam.stem # filename without .bam
|
|
322
|
+
results: list[tuple[str, int]] = []
|
|
323
|
+
base = bam.stem # e.g. sample.bam -> sample
|
|
195
324
|
out_stats = bam_qc_dir / f"{base}_stats.txt"
|
|
196
325
|
out_flag = bam_qc_dir / f"{base}_flagstat.txt"
|
|
197
|
-
out_idx
|
|
326
|
+
out_idx = bam_qc_dir / f"{base}_idxstats.txt"
|
|
198
327
|
|
|
199
|
-
# Make sure index exists (
|
|
328
|
+
# Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
|
|
200
329
|
try:
|
|
201
330
|
_ensure_index(bam)
|
|
202
331
|
except Exception as e:
|
|
203
|
-
# Still attempt stats/flagstat if requested
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
332
|
+
# Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
|
|
333
|
+
logger.warning("Indexing failed for %s: %s", bam, e)
|
|
334
|
+
|
|
335
|
+
if not have_pysam:
|
|
336
|
+
import shutil
|
|
337
|
+
|
|
338
|
+
if not shutil.which("samtools"):
|
|
339
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
340
|
+
|
|
341
|
+
# --- stats ---
|
|
342
|
+
if stats:
|
|
343
|
+
if have_pysam and pysam is not None and hasattr(pysam, "stats"):
|
|
211
344
|
txt = pysam.stats(str(bam))
|
|
212
345
|
out_stats.write_text(txt)
|
|
213
346
|
results.append(("stats(pysam)", 0))
|
|
214
347
|
else:
|
|
215
348
|
cmd = ["samtools", "stats", str(bam)]
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
def run_flagstat():
|
|
223
|
-
if not flagstats:
|
|
224
|
-
return
|
|
225
|
-
if HAVE_PYSAM and hasattr(pysam, "flagstat"):
|
|
349
|
+
rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
|
|
350
|
+
results.append(("stats(samtools)", rc))
|
|
351
|
+
|
|
352
|
+
# --- flagstat ---
|
|
353
|
+
if flagstats:
|
|
354
|
+
if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
|
|
226
355
|
txt = pysam.flagstat(str(bam))
|
|
227
356
|
out_flag.write_text(txt)
|
|
228
357
|
results.append(("flagstat(pysam)", 0))
|
|
229
358
|
else:
|
|
230
359
|
cmd = ["samtools", "flagstat", str(bam)]
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def run_idxstats():
|
|
238
|
-
if not idxstats:
|
|
239
|
-
return
|
|
240
|
-
if HAVE_PYSAM and hasattr(pysam, "idxstats"):
|
|
360
|
+
rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
|
|
361
|
+
results.append(("flagstat(samtools)", rc))
|
|
362
|
+
|
|
363
|
+
# --- idxstats ---
|
|
364
|
+
if idxstats:
|
|
365
|
+
if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
|
|
241
366
|
txt = pysam.idxstats(str(bam))
|
|
242
367
|
out_idx.write_text(txt)
|
|
243
368
|
results.append(("idxstats(pysam)", 0))
|
|
244
369
|
else:
|
|
245
370
|
cmd = ["samtools", "idxstats", str(bam)]
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
results.append(("idxstats(samtools)", cp.returncode))
|
|
249
|
-
if cp.returncode != 0:
|
|
250
|
-
raise RuntimeError(cp.stderr.decode(errors="replace"))
|
|
251
|
-
|
|
252
|
-
# Sanity: ensure samtools exists if pysam missing
|
|
253
|
-
if not HAVE_PYSAM:
|
|
254
|
-
if not shutil.which("samtools"):
|
|
255
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
371
|
+
rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
|
|
372
|
+
results.append(("idxstats(samtools)", rc))
|
|
256
373
|
|
|
257
|
-
# Execute tasks (serial per file; parallelized across files)
|
|
258
|
-
run_stats()
|
|
259
|
-
run_flagstat()
|
|
260
|
-
run_idxstats()
|
|
261
374
|
return bam, results
|
|
262
375
|
|
|
263
|
-
# Parallel across BAMs
|
|
264
376
|
max_workers = int(threads) if threads and int(threads) > 0 else 1
|
|
265
|
-
futures = []
|
|
266
|
-
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
|
267
|
-
for b in bam_files:
|
|
268
|
-
futures.append(ex.submit(_run_one, b))
|
|
269
377
|
|
|
270
|
-
|
|
378
|
+
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
|
379
|
+
futs = [ex.submit(_run_one, b) for b in bam_paths]
|
|
380
|
+
for fut in as_completed(futs):
|
|
271
381
|
try:
|
|
272
382
|
bam, res = fut.result()
|
|
273
383
|
summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
|
|
274
|
-
|
|
384
|
+
logger.info("[qc] %s: %s", bam.name, summary)
|
|
275
385
|
except Exception as e:
|
|
276
|
-
|
|
386
|
+
logger.exception("QC failed: %s", e)
|
|
387
|
+
|
|
388
|
+
if modality not in {"conversion", "direct", "deaminase"}:
|
|
389
|
+
logger.warning("Unknown modality '%s', continuing.", modality)
|
|
277
390
|
|
|
278
|
-
|
|
279
|
-
if modality not in {"conversion", "direct"}:
|
|
280
|
-
print(f"[warn] Unknown modality '{modality}', continuing.")
|
|
391
|
+
logger.info("QC processing completed.")
|
|
281
392
|
|
|
282
|
-
print("QC processing completed.")
|
|
283
393
|
|
|
284
394
|
def concatenate_fastqs_to_bam(
|
|
285
395
|
fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
|
|
@@ -326,12 +436,29 @@ def concatenate_fastqs_to_bam(
|
|
|
326
436
|
"""
|
|
327
437
|
name = p.name
|
|
328
438
|
lowers = name.lower()
|
|
329
|
-
for ext in (
|
|
439
|
+
for ext in (
|
|
440
|
+
".fastq.gz",
|
|
441
|
+
".fq.gz",
|
|
442
|
+
".fastq.bz2",
|
|
443
|
+
".fq.bz2",
|
|
444
|
+
".fastq.xz",
|
|
445
|
+
".fq.xz",
|
|
446
|
+
".fastq",
|
|
447
|
+
".fq",
|
|
448
|
+
):
|
|
330
449
|
if lowers.endswith(ext):
|
|
331
450
|
return name[: -len(ext)]
|
|
332
451
|
return p.stem # fallback: remove last suffix only
|
|
333
452
|
|
|
334
453
|
def _extract_barcode_from_filename(p: Path) -> str:
|
|
454
|
+
"""Extract a barcode token from a FASTQ filename.
|
|
455
|
+
|
|
456
|
+
Args:
|
|
457
|
+
p: FASTQ path.
|
|
458
|
+
|
|
459
|
+
Returns:
|
|
460
|
+
Barcode token string.
|
|
461
|
+
"""
|
|
335
462
|
stem = _strip_fastq_ext(p)
|
|
336
463
|
if "_" in stem:
|
|
337
464
|
token = stem.split("_")[-1]
|
|
@@ -340,10 +467,18 @@ def concatenate_fastqs_to_bam(
|
|
|
340
467
|
return stem
|
|
341
468
|
|
|
342
469
|
def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
|
|
470
|
+
"""Classify a FASTQ filename stem into (prefix, read_number).
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
stem: Filename stem.
|
|
474
|
+
|
|
475
|
+
Returns:
|
|
476
|
+
Tuple of (prefix, read_number) or (None, None) if not matched.
|
|
477
|
+
"""
|
|
343
478
|
# return (prefix, readnum) if matches; else (None, None)
|
|
344
479
|
patterns = [
|
|
345
|
-
r"(?i)(.*?)[._-]r?([12])$",
|
|
346
|
-
r"(?i)(.*?)[._-]read[_-]?([12])$",
|
|
480
|
+
r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
|
|
481
|
+
r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
|
|
347
482
|
]
|
|
348
483
|
for pat in patterns:
|
|
349
484
|
m = re.match(pat, stem)
|
|
@@ -352,6 +487,14 @@ def concatenate_fastqs_to_bam(
|
|
|
352
487
|
return None, None
|
|
353
488
|
|
|
354
489
|
def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
|
|
490
|
+
"""Pair FASTQ files based on filename conventions.
|
|
491
|
+
|
|
492
|
+
Args:
|
|
493
|
+
paths: FASTQ paths to pair.
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
Tuple of (paired list, leftover list).
|
|
497
|
+
"""
|
|
355
498
|
pref_map: Dict[str, Dict[int, Path]] = {}
|
|
356
499
|
unpaired: List[Path] = []
|
|
357
500
|
for pth in paths:
|
|
@@ -373,6 +516,14 @@ def concatenate_fastqs_to_bam(
|
|
|
373
516
|
return pairs, leftovers
|
|
374
517
|
|
|
375
518
|
def _fastq_iter(p: Path):
|
|
519
|
+
"""Yield FASTQ records using pysam.FastxFile.
|
|
520
|
+
|
|
521
|
+
Args:
|
|
522
|
+
p: FASTQ path.
|
|
523
|
+
|
|
524
|
+
Yields:
|
|
525
|
+
Pysam Fastx records.
|
|
526
|
+
"""
|
|
376
527
|
# pysam.FastxFile handles compressed extensions transparently
|
|
377
528
|
with pysam.FastxFile(str(p)) as fx:
|
|
378
529
|
for rec in fx:
|
|
@@ -386,6 +537,19 @@ def concatenate_fastqs_to_bam(
|
|
|
386
537
|
read1: bool,
|
|
387
538
|
read2: bool,
|
|
388
539
|
) -> pysam.AlignedSegment:
|
|
540
|
+
"""Construct an unaligned pysam.AlignedSegment.
|
|
541
|
+
|
|
542
|
+
Args:
|
|
543
|
+
name: Read name.
|
|
544
|
+
seq: Read sequence.
|
|
545
|
+
qual: FASTQ quality string.
|
|
546
|
+
bc: Barcode string.
|
|
547
|
+
read1: Whether this is read 1.
|
|
548
|
+
read2: Whether this is read 2.
|
|
549
|
+
|
|
550
|
+
Returns:
|
|
551
|
+
Unaligned pysam.AlignedSegment.
|
|
552
|
+
"""
|
|
389
553
|
a = pysam.AlignedSegment()
|
|
390
554
|
a.query_name = name
|
|
391
555
|
a.query_sequence = seq
|
|
@@ -408,6 +572,7 @@ def concatenate_fastqs_to_bam(
|
|
|
408
572
|
|
|
409
573
|
# ---------- normalize inputs to Path ----------
|
|
410
574
|
def _to_path_pair(x) -> Tuple[Path, Path]:
|
|
575
|
+
"""Convert a tuple of path-like objects to Path instances."""
|
|
411
576
|
a, b = x
|
|
412
577
|
return Path(a), Path(b)
|
|
413
578
|
|
|
@@ -450,7 +615,10 @@ def concatenate_fastqs_to_bam(
|
|
|
450
615
|
# ---------- BAM header ----------
|
|
451
616
|
header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
|
|
452
617
|
if add_read_group:
|
|
453
|
-
header["RG"] = [
|
|
618
|
+
header["RG"] = [
|
|
619
|
+
{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
|
|
620
|
+
for bc in barcodes_in_order
|
|
621
|
+
]
|
|
454
622
|
header.setdefault("PG", []).append(
|
|
455
623
|
{"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
|
|
456
624
|
)
|
|
@@ -476,7 +644,9 @@ def concatenate_fastqs_to_bam(
|
|
|
476
644
|
it2 = _fastq_iter(r2_path)
|
|
477
645
|
|
|
478
646
|
for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
|
|
647
|
+
|
|
479
648
|
def _clean(n: Optional[str]) -> Optional[str]:
|
|
649
|
+
"""Normalize FASTQ read names by trimming read suffixes."""
|
|
480
650
|
if n is None:
|
|
481
651
|
return None
|
|
482
652
|
return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
|
|
@@ -489,12 +659,16 @@ def concatenate_fastqs_to_bam(
|
|
|
489
659
|
)
|
|
490
660
|
|
|
491
661
|
if rec1 is not None:
|
|
492
|
-
a1 = _make_unaligned_segment(
|
|
662
|
+
a1 = _make_unaligned_segment(
|
|
663
|
+
name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
|
|
664
|
+
)
|
|
493
665
|
bam_out.write(a1)
|
|
494
666
|
per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
|
|
495
667
|
total_written += 1
|
|
496
668
|
if rec2 is not None:
|
|
497
|
-
a2 = _make_unaligned_segment(
|
|
669
|
+
a2 = _make_unaligned_segment(
|
|
670
|
+
name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
|
|
671
|
+
)
|
|
498
672
|
bam_out.write(a2)
|
|
499
673
|
per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
|
|
500
674
|
total_written += 1
|
|
@@ -516,7 +690,9 @@ def concatenate_fastqs_to_bam(
|
|
|
516
690
|
raise FileNotFoundError(pth)
|
|
517
691
|
bc = per_path_barcode.get(pth, "barcode")
|
|
518
692
|
for rec in _fastq_iter(pth):
|
|
519
|
-
a = _make_unaligned_segment(
|
|
693
|
+
a = _make_unaligned_segment(
|
|
694
|
+
rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
|
|
695
|
+
)
|
|
520
696
|
bam_out.write(a)
|
|
521
697
|
per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
|
|
522
698
|
total_written += 1
|
|
@@ -530,20 +706,21 @@ def concatenate_fastqs_to_bam(
|
|
|
530
706
|
"barcodes": barcodes_in_order,
|
|
531
707
|
}
|
|
532
708
|
|
|
709
|
+
|
|
533
710
|
def count_aligned_reads(bam_file):
|
|
534
711
|
"""
|
|
535
712
|
Counts the number of aligned reads in a bam file that map to each reference record.
|
|
536
|
-
|
|
713
|
+
|
|
537
714
|
Parameters:
|
|
538
715
|
bam_file (str): A string representing the path to an aligned BAM file.
|
|
539
|
-
|
|
716
|
+
|
|
540
717
|
Returns:
|
|
541
718
|
aligned_reads_count (int): The total number of reads aligned in the BAM.
|
|
542
719
|
unaligned_reads_count (int): The total number of reads not aligned in the BAM.
|
|
543
720
|
record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
|
|
544
721
|
|
|
545
722
|
"""
|
|
546
|
-
print(
|
|
723
|
+
print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
|
|
547
724
|
aligned_reads_count = 0
|
|
548
725
|
unaligned_reads_count = 0
|
|
549
726
|
# Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
|
|
@@ -552,12 +729,14 @@ def count_aligned_reads(bam_file):
|
|
|
552
729
|
with pysam.AlignmentFile(str(bam_file), "rb") as bam:
|
|
553
730
|
total_reads = bam.mapped + bam.unmapped
|
|
554
731
|
# Iterate over reads to get the total mapped read counts and the reads that map to each reference
|
|
555
|
-
for read in tqdm(bam, desc=
|
|
732
|
+
for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
|
|
556
733
|
if read.is_unmapped:
|
|
557
734
|
unaligned_reads_count += 1
|
|
558
735
|
else:
|
|
559
736
|
aligned_reads_count += 1
|
|
560
|
-
record_counts[read.reference_name] +=
|
|
737
|
+
record_counts[read.reference_name] += (
|
|
738
|
+
1 # Automatically increments if key exists, adds if not
|
|
739
|
+
)
|
|
561
740
|
|
|
562
741
|
# reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
|
|
563
742
|
for reference in record_counts:
|
|
@@ -566,7 +745,10 @@ def count_aligned_reads(bam_file):
|
|
|
566
745
|
|
|
567
746
|
return aligned_reads_count, unaligned_reads_count, dict(record_counts)
|
|
568
747
|
|
|
569
|
-
|
|
748
|
+
|
|
749
|
+
def demux_and_index_BAM(
|
|
750
|
+
aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
|
|
751
|
+
):
|
|
570
752
|
"""
|
|
571
753
|
A wrapper function for splitting BAMS and indexing them.
|
|
572
754
|
Parameters:
|
|
@@ -577,11 +759,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
577
759
|
barcode_both_ends (bool): Whether to require both ends to be barcoded.
|
|
578
760
|
trim (bool): Whether to trim off barcodes after demultiplexing.
|
|
579
761
|
threads (int): Number of threads to use.
|
|
580
|
-
|
|
762
|
+
|
|
581
763
|
Returns:
|
|
582
764
|
bam_files (list): List of split BAM file path strings
|
|
583
765
|
Splits an input BAM file on barcode value and makes a BAM index file.
|
|
584
766
|
"""
|
|
767
|
+
|
|
585
768
|
input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
|
|
586
769
|
command = ["dorado", "demux", "--kit-name", barcode_kit]
|
|
587
770
|
if barcode_both_ends:
|
|
@@ -594,25 +777,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
594
777
|
pass
|
|
595
778
|
command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
|
|
596
779
|
command.append(str(input_bam))
|
|
597
|
-
command_string =
|
|
598
|
-
|
|
599
|
-
|
|
780
|
+
command_string = " ".join(command)
|
|
781
|
+
logger.info("Running dorado demux: %s", " ".join(command))
|
|
782
|
+
|
|
783
|
+
proc = subprocess.Popen(
|
|
784
|
+
command,
|
|
785
|
+
stdout=subprocess.PIPE,
|
|
786
|
+
stderr=subprocess.PIPE,
|
|
787
|
+
text=True,
|
|
788
|
+
)
|
|
789
|
+
|
|
790
|
+
assert proc.stderr is not None
|
|
791
|
+
_stream_dorado_logs(proc.stderr)
|
|
792
|
+
rc = proc.wait()
|
|
793
|
+
|
|
794
|
+
if rc != 0:
|
|
795
|
+
raise RuntimeError(f"dorado demux failed with exit code {rc}")
|
|
600
796
|
|
|
601
797
|
bam_files = sorted(
|
|
602
|
-
p for p in split_dir.glob(f"*{bam_suffix}")
|
|
603
|
-
if p.is_file() and p.suffix == bam_suffix
|
|
798
|
+
p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
|
|
604
799
|
)
|
|
605
800
|
|
|
606
801
|
if not bam_files:
|
|
607
802
|
raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
|
|
608
|
-
|
|
803
|
+
|
|
609
804
|
# ---- Optional renaming with prefix ----
|
|
610
805
|
renamed_bams = []
|
|
611
806
|
prefix = "de" if barcode_both_ends else "se"
|
|
612
807
|
|
|
613
808
|
for bam in bam_files:
|
|
614
809
|
bam = Path(bam)
|
|
615
|
-
bai = bam.with_suffix(bam_suffix + ".bai")
|
|
810
|
+
bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
|
|
616
811
|
|
|
617
812
|
if prefix:
|
|
618
813
|
new_name = f"{prefix}_{bam.name}"
|
|
@@ -628,9 +823,10 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
628
823
|
bai.rename(new_bai)
|
|
629
824
|
|
|
630
825
|
renamed_bams.append(new_bam)
|
|
631
|
-
|
|
826
|
+
|
|
632
827
|
return renamed_bams
|
|
633
828
|
|
|
829
|
+
|
|
634
830
|
def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
|
|
635
831
|
"""
|
|
636
832
|
Efficiently extracts base identities from mapped reads with reference coordinates.
|
|
@@ -646,14 +842,15 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
646
842
|
dict: Base identities from forward mapped reads.
|
|
647
843
|
dict: Base identities from reverse mapped reads.
|
|
648
844
|
"""
|
|
845
|
+
logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
|
|
649
846
|
timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
|
|
650
847
|
|
|
651
848
|
positions = set(positions)
|
|
652
|
-
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length,
|
|
653
|
-
rev_base_identities = defaultdict(lambda: np.full(max_reference_length,
|
|
849
|
+
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
850
|
+
rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
654
851
|
mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
|
|
655
852
|
|
|
656
|
-
#print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
853
|
+
# print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
657
854
|
with pysam.AlignmentFile(str(bam_file), "rb") as bam:
|
|
658
855
|
total_reads = bam.mapped
|
|
659
856
|
ref_seq = sequence.upper()
|
|
@@ -676,7 +873,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
676
873
|
base_dict[read_name][reference_position] = read_base
|
|
677
874
|
|
|
678
875
|
# Track mismatches (excluding Ns)
|
|
679
|
-
if read_base != ref_base and read_base !=
|
|
876
|
+
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
680
877
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
681
878
|
|
|
682
879
|
# Determine C→T vs G→A dominance per read
|
|
@@ -694,7 +891,13 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
694
891
|
else:
|
|
695
892
|
mismatch_trend_per_read[read_name] = "none"
|
|
696
893
|
|
|
697
|
-
return
|
|
894
|
+
return (
|
|
895
|
+
dict(fwd_base_identities),
|
|
896
|
+
dict(rev_base_identities),
|
|
897
|
+
dict(mismatch_counts_per_read),
|
|
898
|
+
mismatch_trend_per_read,
|
|
899
|
+
)
|
|
900
|
+
|
|
698
901
|
|
|
699
902
|
def extract_read_features_from_bam(bam_file_path):
|
|
700
903
|
"""
|
|
@@ -705,7 +908,9 @@ def extract_read_features_from_bam(bam_file_path):
|
|
|
705
908
|
read_metrics (dict)
|
|
706
909
|
"""
|
|
707
910
|
# Open the BAM file
|
|
708
|
-
|
|
911
|
+
logger.debug(
|
|
912
|
+
f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
|
|
913
|
+
)
|
|
709
914
|
with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
|
|
710
915
|
read_metrics = {}
|
|
711
916
|
reference_lengths = bam_file.lengths # List of lengths for each reference (chromosome)
|
|
@@ -722,10 +927,17 @@ def extract_read_features_from_bam(bam_file_path):
|
|
|
722
927
|
reference_length = reference_lengths[reference_index]
|
|
723
928
|
mapped_length = sum(end - start for start, end in read.get_blocks())
|
|
724
929
|
mapping_quality = read.mapping_quality # Phred-scaled MAPQ
|
|
725
|
-
read_metrics[read.query_name] = [
|
|
930
|
+
read_metrics[read.query_name] = [
|
|
931
|
+
read.query_length,
|
|
932
|
+
median_read_quality,
|
|
933
|
+
reference_length,
|
|
934
|
+
mapped_length,
|
|
935
|
+
mapping_quality,
|
|
936
|
+
]
|
|
726
937
|
|
|
727
938
|
return read_metrics
|
|
728
939
|
|
|
940
|
+
|
|
729
941
|
def extract_readnames_from_bam(aligned_BAM):
    """
    Takes a BAM and writes out a txt file containing read names from the BAM.

    Runs ``samtools view <aligned_BAM>`` and pipes its output through ``cut -f1`` so that
    only the first tab-separated field of every record is written to the text file.

    Parameters:
        aligned_BAM (str | Path): Path to the BAM file to read.

    Returns:
        None
            Writes ``<aligned_BAM without its trailing ".bam">_read_names.txt``.

    Raises:
        RuntimeError: If ``samtools view`` or ``cut`` exits with a non-zero status.
    """
    import subprocess

    # Accept both str and Path inputs, and strip only a *trailing* ".bam": splitting on the
    # first ".bam" anywhere in the path would truncate paths such as "runs.bam_dir/sample.bam".
    bam_path = str(aligned_BAM)
    base_path = bam_path[: -len(".bam")] if bam_path.endswith(".bam") else bam_path
    txt_output = base_path + "_read_names.txt"

    # Open the output first so that no child process is spawned if the file cannot be created.
    with open(txt_output, "w") as output_file:
        samtools_view = subprocess.Popen(["samtools", "view", bam_path], stdout=subprocess.PIPE)
        cut_process = subprocess.Popen(
            ["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
        )
        # Drop the parent's handle on the pipe so samtools gets SIGPIPE if cut exits early.
        samtools_view.stdout.close()
        cut_rc = cut_process.wait()
        view_rc = samtools_view.wait()

    # Surface pipeline failures instead of silently leaving an empty or partial file behind.
    if view_rc != 0:
        raise RuntimeError(f"samtools view failed with exit code {view_rc} for {bam_path}")
    if cut_rc != 0:
        raise RuntimeError(f"cut failed with exit code {cut_rc} while writing {txt_output}")
|
|
749
964
|
|
|
965
|
+
|
|
750
966
|
def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
751
967
|
"""
|
|
752
968
|
Separates an input BAM file on the BC SAM tag values.
|
|
@@ -756,11 +972,12 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
|
756
972
|
output_prefix (str): A prefix to append to the output BAM.
|
|
757
973
|
bam_suffix (str): A suffix to add to the bam file.
|
|
758
974
|
split_dir (str): String indicating path to directory to split BAMs into
|
|
759
|
-
|
|
975
|
+
|
|
760
976
|
Returns:
|
|
761
977
|
None
|
|
762
978
|
Writes out split BAM files.
|
|
763
979
|
"""
|
|
980
|
+
logger.debug("Demultiplexing BAM based on the BC tag")
|
|
764
981
|
bam_base = input_bam.name
|
|
765
982
|
bam_base_minus_suffix = input_bam.stem
|
|
766
983
|
|
|
@@ -773,19 +990,24 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
|
773
990
|
try:
|
|
774
991
|
# Get the barcode tag value
|
|
775
992
|
bc_tag = read.get_tag("BC", with_value_type=True)[0]
|
|
776
|
-
#bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
993
|
+
# bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
777
994
|
# Open the output BAM file corresponding to the barcode
|
|
778
995
|
if bc_tag not in output_files:
|
|
779
|
-
output_path =
|
|
780
|
-
|
|
996
|
+
output_path = (
|
|
997
|
+
split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
|
|
998
|
+
)
|
|
999
|
+
output_files[bc_tag] = pysam.AlignmentFile(
|
|
1000
|
+
str(output_path), "wb", header=bam.header
|
|
1001
|
+
)
|
|
781
1002
|
# Write the read to the corresponding output BAM file
|
|
782
1003
|
output_files[bc_tag].write(read)
|
|
783
1004
|
except KeyError:
|
|
784
|
-
|
|
1005
|
+
logger.warning(f"BC tag not present for read: {read.query_name}")
|
|
785
1006
|
# Close all output BAM files
|
|
786
1007
|
for output_file in output_files.values():
|
|
787
1008
|
output_file.close()
|
|
788
1009
|
|
|
1010
|
+
|
|
789
1011
|
def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
    """
    A wrapper function for splitting BAMS and indexing them.

    Splits an input BAM file on barcode (BC tag) value via ``separate_bam_by_bc`` and makes a
    BAM index file for every BAM found in ``split_dir`` afterwards.

    Parameters:
        aligned_sorted_BAM (str | Path): File path of the aligned_sorted BAM file, without the suffix.
        split_dir (str | Path): File path to the directory to split the BAMs into.
        bam_suffix (str): A suffix to add to the bam file.

    Returns:
        bam_files (list): Sorted list of BAM file path strings in ``split_dir`` that were indexed.
    """
    from pathlib import Path

    logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")

    # separate_bam_by_bc reads ``input_bam.name`` / ``.stem`` and joins ``split_dir / ...``,
    # so both must be Path objects regardless of whether the caller passed str or Path.
    split_dir = Path(split_dir)
    aligned_sorted_output = Path(str(aligned_sorted_BAM) + bam_suffix)

    file_prefix = date_string()
    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)

    # Make a BAM index file for the BAMs in that directory.
    # Path.glob replaces glob.glob(split_dir / pattern), which cannot take a Path (and a
    # str split_dir cannot be joined with "/"). Index files are excluded by file name only,
    # so a ".bai" elsewhere in the directory path does not hide real BAMs.
    bam_pattern = "*" + bam_suffix
    bam_files = sorted(
        str(bam) for bam in split_dir.glob(bam_pattern) if ".bai" not in bam.name
    )
    for input_file in bam_files:
        pysam.index(input_file)

    return bam_files
|