smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,24 +1,55 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
import glob
|
|
4
4
|
import os
|
|
5
|
+
import re
|
|
5
6
|
import subprocess
|
|
6
|
-
import glob
|
|
7
7
|
import time
|
|
8
|
-
from
|
|
9
|
-
import
|
|
8
|
+
from collections import Counter, defaultdict, deque
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
10
|
from itertools import zip_longest
|
|
11
|
-
import
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
12
13
|
|
|
13
14
|
import numpy as np
|
|
14
|
-
import
|
|
15
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
|
-
from concurrent.futures import ProcessPoolExecutor
|
|
17
|
-
|
|
15
|
+
import pysam
|
|
18
16
|
from tqdm import tqdm
|
|
19
|
-
from collections import defaultdict, Counter
|
|
20
17
|
|
|
21
|
-
from
|
|
18
|
+
from smftools.logging_utils import get_logger
|
|
19
|
+
|
|
20
|
+
from ..readwrite import date_string, time_string
|
|
21
|
+
|
|
22
|
+
logger = get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
_PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
|
|
25
|
+
_EMPTY_RE = re.compile(r"^\s*$")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _stream_dorado_logs(stderr_iter) -> None:
|
|
29
|
+
"""Stream dorado stderr and emit structured log messages.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
stderr_iter: Iterable of stderr lines.
|
|
33
|
+
"""
|
|
34
|
+
last_n: int | None = None
|
|
35
|
+
|
|
36
|
+
for raw in stderr_iter:
|
|
37
|
+
line = raw.rstrip("\n")
|
|
38
|
+
if _EMPTY_RE.match(line):
|
|
39
|
+
continue
|
|
40
|
+
|
|
41
|
+
m = _PROGRESS_RE.search(line)
|
|
42
|
+
if m:
|
|
43
|
+
n = int(m.group(1))
|
|
44
|
+
logger.debug("[dorado] Output records written: %d", n)
|
|
45
|
+
last_n = n
|
|
46
|
+
continue
|
|
47
|
+
|
|
48
|
+
logger.info("[dorado] %s", line)
|
|
49
|
+
|
|
50
|
+
if last_n is not None:
|
|
51
|
+
logger.info("[dorado] Final output records written: %d", last_n)
|
|
52
|
+
|
|
22
53
|
|
|
23
54
|
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
|
|
24
55
|
"""
|
|
@@ -26,7 +57,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
26
57
|
"""
|
|
27
58
|
bam_path = str(bam_path)
|
|
28
59
|
fastq_path = str(fastq_path)
|
|
29
|
-
|
|
60
|
+
|
|
61
|
+
logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
|
|
62
|
+
|
|
63
|
+
with (
|
|
64
|
+
pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
|
|
65
|
+
open(fastq_path, "w", encoding="utf-8") as fq,
|
|
66
|
+
):
|
|
30
67
|
for r in bam.fetch(until_eof=True):
|
|
31
68
|
# Optionally skip secondary/supplementary:
|
|
32
69
|
# if r.is_secondary or r.is_supplementary:
|
|
@@ -45,14 +82,22 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
|
|
|
45
82
|
# q is an array/list of ints (Phred scores).
|
|
46
83
|
# Convert to FASTQ string with Phred+33 encoding,
|
|
47
84
|
# clamping to sane range [0, 93] to stay in printable ASCII.
|
|
48
|
-
qual_str = "".join(
|
|
49
|
-
chr(min(max(int(qv), 0), 93) + 33)
|
|
50
|
-
for qv in q
|
|
51
|
-
)
|
|
85
|
+
qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
|
|
52
86
|
|
|
53
87
|
fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
|
|
54
88
|
|
|
55
|
-
|
|
89
|
+
|
|
90
|
+
def _sort_bam_with_pysam(
|
|
91
|
+
in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
|
|
92
|
+
) -> None:
|
|
93
|
+
"""Sort a BAM file using pysam.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
in_bam: Input BAM path.
|
|
97
|
+
out_bam: Output BAM path.
|
|
98
|
+
threads: Optional thread count.
|
|
99
|
+
"""
|
|
100
|
+
logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
|
|
56
101
|
in_bam, out_bam = str(in_bam), str(out_bam)
|
|
57
102
|
args = []
|
|
58
103
|
if threads:
|
|
@@ -60,86 +105,129 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
|
|
|
60
105
|
args += ["-o", out_bam, in_bam]
|
|
61
106
|
pysam.sort(*args)
|
|
62
107
|
|
|
108
|
+
|
|
63
109
|
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
|
|
110
|
+
"""Index a BAM file using pysam.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
bam_path: BAM path to index.
|
|
114
|
+
threads: Optional thread count.
|
|
115
|
+
"""
|
|
64
116
|
bam_path = str(bam_path)
|
|
117
|
+
logger.debug(f"Indexing BAM using _index_bam_with_pysam")
|
|
65
118
|
# pysam.index supports samtools-style args
|
|
66
119
|
if threads:
|
|
67
120
|
pysam.index("-@", str(threads), bam_path)
|
|
68
121
|
else:
|
|
69
122
|
pysam.index(bam_path)
|
|
70
123
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
aligner='minimap2',
|
|
78
|
-
aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
|
|
124
|
+
|
|
125
|
+
def align_and_sort_BAM(
|
|
126
|
+
fasta,
|
|
127
|
+
input,
|
|
128
|
+
cfg,
|
|
129
|
+
):
|
|
79
130
|
"""
|
|
80
131
|
A wrapper for running dorado aligner and samtools functions
|
|
81
|
-
|
|
132
|
+
|
|
82
133
|
Parameters:
|
|
83
134
|
fasta (str): File path to the reference genome to align to.
|
|
84
135
|
input (str): File path to the basecalled file to align. Works for .bam and .fastq files
|
|
85
|
-
|
|
86
|
-
output_directory (str): A file path to the directory to output all the analyses.
|
|
87
|
-
make_bigwigs (bool): Whether to make bigwigs
|
|
88
|
-
threads (int): Number of additional threads to use
|
|
89
|
-
aligner (str): Aligner to use. minimap2 and dorado options
|
|
90
|
-
aligner_args (list): list of optional parameters to use for the alignment
|
|
136
|
+
cfg: The configuration object
|
|
91
137
|
|
|
92
138
|
Returns:
|
|
93
139
|
None
|
|
94
140
|
The function writes out files for: 1) An aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
|
|
95
141
|
"""
|
|
142
|
+
logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
|
|
96
143
|
input_basename = input.name
|
|
97
144
|
input_suffix = input.suffix
|
|
98
|
-
input_as_fastq = input.with_name(input.stem +
|
|
145
|
+
input_as_fastq = input.with_name(input.stem + ".fastq")
|
|
146
|
+
|
|
147
|
+
output_path_minus_suffix = cfg.output_directory / input.stem
|
|
99
148
|
|
|
100
|
-
output_path_minus_suffix = output_directory / input.stem
|
|
101
|
-
|
|
102
149
|
aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
|
|
103
|
-
aligned_output = aligned_BAM.with_suffix(bam_suffix)
|
|
104
|
-
aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
105
|
-
aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)
|
|
150
|
+
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
151
|
+
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
152
|
+
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
106
153
|
|
|
107
|
-
if threads:
|
|
108
|
-
threads = str(threads)
|
|
154
|
+
if cfg.threads:
|
|
155
|
+
threads = str(cfg.threads)
|
|
109
156
|
else:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
if aligner ==
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
157
|
+
threads = None
|
|
158
|
+
|
|
159
|
+
if cfg.aligner == "minimap2":
|
|
160
|
+
if not cfg.align_from_bam:
|
|
161
|
+
logger.debug(f"Converting BAM to FASTQ: {input}")
|
|
162
|
+
_bam_to_fastq_with_pysam(input, input_as_fastq)
|
|
163
|
+
logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
|
|
164
|
+
mm_input = input_as_fastq
|
|
165
|
+
else:
|
|
166
|
+
logger.debug(f"Aligning BAM to Reference: {input}")
|
|
167
|
+
mm_input = input
|
|
168
|
+
|
|
116
169
|
if threads:
|
|
117
|
-
minimap_command =
|
|
170
|
+
minimap_command = (
|
|
171
|
+
["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
|
|
172
|
+
)
|
|
118
173
|
else:
|
|
119
|
-
minimap_command = [
|
|
120
|
-
|
|
121
|
-
|
|
174
|
+
minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
|
|
175
|
+
|
|
176
|
+
with open(aligned_output, "wb") as out:
|
|
177
|
+
proc = subprocess.Popen(
|
|
178
|
+
minimap_command,
|
|
179
|
+
stdout=out,
|
|
180
|
+
stderr=subprocess.PIPE,
|
|
181
|
+
text=True,
|
|
182
|
+
)
|
|
122
183
|
|
|
123
|
-
|
|
184
|
+
assert proc.stderr is not None
|
|
185
|
+
for line in proc.stderr:
|
|
186
|
+
logger.info("[minimap2] %s", line.rstrip())
|
|
187
|
+
|
|
188
|
+
ret = proc.wait()
|
|
189
|
+
if ret != 0:
|
|
190
|
+
raise RuntimeError(f"minimap2 failed with exit code {ret}")
|
|
191
|
+
|
|
192
|
+
if not cfg.align_from_bam:
|
|
193
|
+
os.remove(input_as_fastq)
|
|
194
|
+
|
|
195
|
+
elif cfg.aligner == "dorado":
|
|
124
196
|
# Run dorado aligner
|
|
125
197
|
print(f"Aligning BAM to Reference: {input}")
|
|
126
198
|
if threads:
|
|
127
|
-
alignment_command =
|
|
199
|
+
alignment_command = (
|
|
200
|
+
["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
|
|
201
|
+
)
|
|
128
202
|
else:
|
|
129
|
-
alignment_command = ["dorado", "aligner"] + aligner_args + [str(fasta), str(input)]
|
|
130
|
-
|
|
131
|
-
|
|
203
|
+
alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
|
|
204
|
+
|
|
205
|
+
with open(aligned_output, "wb") as out:
|
|
206
|
+
proc = subprocess.Popen(
|
|
207
|
+
alignment_command,
|
|
208
|
+
stdout=out,
|
|
209
|
+
stderr=subprocess.PIPE,
|
|
210
|
+
text=True,
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
assert proc.stderr is not None
|
|
214
|
+
_stream_dorado_logs(proc.stderr)
|
|
215
|
+
ret = proc.wait()
|
|
216
|
+
|
|
217
|
+
if ret != 0:
|
|
218
|
+
raise RuntimeError(f"dorado failed with exit code {ret}")
|
|
132
219
|
else:
|
|
133
|
-
|
|
220
|
+
logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
|
|
134
221
|
return
|
|
135
|
-
|
|
222
|
+
|
|
136
223
|
# --- Sort & Index with pysam ---
|
|
137
|
-
|
|
224
|
+
logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
|
|
138
225
|
_sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
|
|
139
226
|
|
|
140
|
-
|
|
227
|
+
logger.debug(f"Indexing: {aligned_sorted_output}")
|
|
141
228
|
_index_bam_with_pysam(aligned_sorted_output, threads=threads)
|
|
142
229
|
|
|
230
|
+
|
|
143
231
|
def bam_qc(
|
|
144
232
|
bam_files: Iterable[str | Path],
|
|
145
233
|
bam_qc_dir: str | Path,
|
|
@@ -154,133 +242,154 @@ def bam_qc(
|
|
|
154
242
|
Prefers pysam; falls back to `samtools` if needed.
|
|
155
243
|
Runs BAMs in parallel (up to `threads`, default serial).
|
|
156
244
|
"""
|
|
157
|
-
import subprocess
|
|
158
245
|
import shutil
|
|
246
|
+
import subprocess
|
|
247
|
+
|
|
248
|
+
logger.debug("Performing BAM QC using bam_qc")
|
|
159
249
|
|
|
160
250
|
# Try to import pysam once
|
|
161
251
|
try:
|
|
162
|
-
import pysam
|
|
163
|
-
|
|
252
|
+
import pysam # type: ignore
|
|
253
|
+
|
|
254
|
+
have_pysam = True
|
|
164
255
|
except Exception:
|
|
165
|
-
|
|
256
|
+
pysam = None # type: ignore
|
|
257
|
+
have_pysam = False
|
|
166
258
|
|
|
167
259
|
bam_qc_dir = Path(bam_qc_dir)
|
|
168
260
|
bam_qc_dir.mkdir(parents=True, exist_ok=True)
|
|
169
261
|
|
|
170
|
-
|
|
262
|
+
bam_paths = [Path(b) for b in bam_files]
|
|
171
263
|
|
|
172
264
|
def _has_index(p: Path) -> bool:
|
|
173
|
-
if
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
return bai.exists() or
|
|
177
|
-
if
|
|
178
|
-
|
|
179
|
-
return crai.exists()
|
|
265
|
+
"""Return True if a BAM/CRAM index exists for the path."""
|
|
266
|
+
suf = p.suffix.lower()
|
|
267
|
+
if suf == ".bam":
|
|
268
|
+
return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
|
|
269
|
+
if suf == ".cram":
|
|
270
|
+
return Path(str(p) + ".crai").exists()
|
|
180
271
|
return False
|
|
181
272
|
|
|
182
273
|
def _ensure_index(p: Path) -> None:
|
|
274
|
+
"""Ensure a BAM/CRAM index exists, creating one if needed."""
|
|
183
275
|
if _has_index(p):
|
|
184
276
|
return
|
|
185
|
-
if
|
|
186
|
-
|
|
187
|
-
pysam.index(str(p))
|
|
277
|
+
if have_pysam:
|
|
278
|
+
assert pysam is not None
|
|
279
|
+
pysam.index(str(p)) # supports BAM & CRAM
|
|
188
280
|
else:
|
|
281
|
+
if not shutil.which("samtools"):
|
|
282
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
189
283
|
cmd = ["samtools", "index", str(p)]
|
|
190
|
-
|
|
284
|
+
# capture text so errors are readable; raise on failure
|
|
285
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
286
|
+
if cp.returncode != 0:
|
|
287
|
+
raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
191
288
|
|
|
192
|
-
def
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
289
|
+
def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
|
|
290
|
+
"""
|
|
291
|
+
Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
|
|
292
|
+
"""
|
|
293
|
+
last_err = deque(maxlen=80)
|
|
294
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
295
|
+
|
|
296
|
+
with open(out_path, "w") as fh:
|
|
297
|
+
proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
|
|
298
|
+
assert proc.stderr is not None
|
|
299
|
+
for line in proc.stderr:
|
|
300
|
+
line = line.rstrip()
|
|
301
|
+
if line:
|
|
302
|
+
last_err.append(line)
|
|
303
|
+
logger.info("[%s][%s] %s", tag, bam.name, line)
|
|
304
|
+
rc = proc.wait()
|
|
305
|
+
|
|
306
|
+
if rc != 0:
|
|
307
|
+
tail = "\n".join(last_err)
|
|
308
|
+
raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
|
|
309
|
+
return rc
|
|
310
|
+
|
|
311
|
+
def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
|
|
312
|
+
"""Run stats/flagstat/idxstats for a single BAM.
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
bam: Path to the BAM file.
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
Tuple of (bam_path, list of (stage, return_code)).
|
|
319
|
+
"""
|
|
320
|
+
import subprocess
|
|
321
|
+
|
|
322
|
+
results: list[tuple[str, int]] = []
|
|
323
|
+
base = bam.stem # e.g. sample.bam -> sample
|
|
196
324
|
out_stats = bam_qc_dir / f"{base}_stats.txt"
|
|
197
325
|
out_flag = bam_qc_dir / f"{base}_flagstat.txt"
|
|
198
|
-
out_idx
|
|
326
|
+
out_idx = bam_qc_dir / f"{base}_idxstats.txt"
|
|
199
327
|
|
|
200
|
-
# Make sure index exists (
|
|
328
|
+
# Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
|
|
201
329
|
try:
|
|
202
330
|
_ensure_index(bam)
|
|
203
331
|
except Exception as e:
|
|
204
|
-
# Still attempt stats/flagstat if requested
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
332
|
+
# Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
|
|
333
|
+
logger.warning("Indexing failed for %s: %s", bam, e)
|
|
334
|
+
|
|
335
|
+
if not have_pysam:
|
|
336
|
+
import shutil
|
|
337
|
+
|
|
338
|
+
if not shutil.which("samtools"):
|
|
339
|
+
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
340
|
+
|
|
341
|
+
# --- stats ---
|
|
342
|
+
if stats:
|
|
343
|
+
if have_pysam and pysam is not None and hasattr(pysam, "stats"):
|
|
212
344
|
txt = pysam.stats(str(bam))
|
|
213
345
|
out_stats.write_text(txt)
|
|
214
346
|
results.append(("stats(pysam)", 0))
|
|
215
347
|
else:
|
|
216
348
|
cmd = ["samtools", "stats", str(bam)]
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
def run_flagstat():
|
|
224
|
-
if not flagstats:
|
|
225
|
-
return
|
|
226
|
-
if HAVE_PYSAM and hasattr(pysam, "flagstat"):
|
|
349
|
+
rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
|
|
350
|
+
results.append(("stats(samtools)", rc))
|
|
351
|
+
|
|
352
|
+
# --- flagstat ---
|
|
353
|
+
if flagstats:
|
|
354
|
+
if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
|
|
227
355
|
txt = pysam.flagstat(str(bam))
|
|
228
356
|
out_flag.write_text(txt)
|
|
229
357
|
results.append(("flagstat(pysam)", 0))
|
|
230
358
|
else:
|
|
231
359
|
cmd = ["samtools", "flagstat", str(bam)]
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def run_idxstats():
|
|
239
|
-
if not idxstats:
|
|
240
|
-
return
|
|
241
|
-
if HAVE_PYSAM and hasattr(pysam, "idxstats"):
|
|
360
|
+
rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
|
|
361
|
+
results.append(("flagstat(samtools)", rc))
|
|
362
|
+
|
|
363
|
+
# --- idxstats ---
|
|
364
|
+
if idxstats:
|
|
365
|
+
if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
|
|
242
366
|
txt = pysam.idxstats(str(bam))
|
|
243
367
|
out_idx.write_text(txt)
|
|
244
368
|
results.append(("idxstats(pysam)", 0))
|
|
245
369
|
else:
|
|
246
370
|
cmd = ["samtools", "idxstats", str(bam)]
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
results.append(("idxstats(samtools)", cp.returncode))
|
|
250
|
-
if cp.returncode != 0:
|
|
251
|
-
raise RuntimeError(cp.stderr.decode(errors="replace"))
|
|
252
|
-
|
|
253
|
-
# Sanity: ensure samtools exists if pysam missing
|
|
254
|
-
if not HAVE_PYSAM:
|
|
255
|
-
if not shutil.which("samtools"):
|
|
256
|
-
raise RuntimeError("Neither pysam nor samtools is available in PATH.")
|
|
371
|
+
rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
|
|
372
|
+
results.append(("idxstats(samtools)", rc))
|
|
257
373
|
|
|
258
|
-
# Execute tasks (serial per file; parallelized across files)
|
|
259
|
-
run_stats()
|
|
260
|
-
run_flagstat()
|
|
261
|
-
run_idxstats()
|
|
262
374
|
return bam, results
|
|
263
375
|
|
|
264
|
-
# Parallel across BAMs
|
|
265
376
|
max_workers = int(threads) if threads and int(threads) > 0 else 1
|
|
266
|
-
futures = []
|
|
267
|
-
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
|
268
|
-
for b in bam_files:
|
|
269
|
-
futures.append(ex.submit(_run_one, b))
|
|
270
377
|
|
|
271
|
-
|
|
378
|
+
with ThreadPoolExecutor(max_workers=max_workers) as ex:
|
|
379
|
+
futs = [ex.submit(_run_one, b) for b in bam_paths]
|
|
380
|
+
for fut in as_completed(futs):
|
|
272
381
|
try:
|
|
273
382
|
bam, res = fut.result()
|
|
274
383
|
summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
|
|
275
|
-
|
|
384
|
+
logger.info("[qc] %s: %s", bam.name, summary)
|
|
276
385
|
except Exception as e:
|
|
277
|
-
|
|
386
|
+
logger.exception("QC failed: %s", e)
|
|
278
387
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
388
|
+
if modality not in {"conversion", "direct", "deaminase"}:
|
|
389
|
+
logger.warning("Unknown modality '%s', continuing.", modality)
|
|
390
|
+
|
|
391
|
+
logger.info("QC processing completed.")
|
|
282
392
|
|
|
283
|
-
print("QC processing completed.")
|
|
284
393
|
|
|
285
394
|
def concatenate_fastqs_to_bam(
|
|
286
395
|
fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
|
|
@@ -327,12 +436,29 @@ def concatenate_fastqs_to_bam(
|
|
|
327
436
|
"""
|
|
328
437
|
name = p.name
|
|
329
438
|
lowers = name.lower()
|
|
330
|
-
for ext in (
|
|
439
|
+
for ext in (
|
|
440
|
+
".fastq.gz",
|
|
441
|
+
".fq.gz",
|
|
442
|
+
".fastq.bz2",
|
|
443
|
+
".fq.bz2",
|
|
444
|
+
".fastq.xz",
|
|
445
|
+
".fq.xz",
|
|
446
|
+
".fastq",
|
|
447
|
+
".fq",
|
|
448
|
+
):
|
|
331
449
|
if lowers.endswith(ext):
|
|
332
450
|
return name[: -len(ext)]
|
|
333
451
|
return p.stem # fallback: remove last suffix only
|
|
334
452
|
|
|
335
453
|
def _extract_barcode_from_filename(p: Path) -> str:
|
|
454
|
+
"""Extract a barcode token from a FASTQ filename.
|
|
455
|
+
|
|
456
|
+
Args:
|
|
457
|
+
p: FASTQ path.
|
|
458
|
+
|
|
459
|
+
Returns:
|
|
460
|
+
Barcode token string.
|
|
461
|
+
"""
|
|
336
462
|
stem = _strip_fastq_ext(p)
|
|
337
463
|
if "_" in stem:
|
|
338
464
|
token = stem.split("_")[-1]
|
|
@@ -341,10 +467,18 @@ def concatenate_fastqs_to_bam(
|
|
|
341
467
|
return stem
|
|
342
468
|
|
|
343
469
|
def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
|
|
470
|
+
"""Classify a FASTQ filename stem into (prefix, read_number).
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
stem: Filename stem.
|
|
474
|
+
|
|
475
|
+
Returns:
|
|
476
|
+
Tuple of (prefix, read_number) or (None, None) if not matched.
|
|
477
|
+
"""
|
|
344
478
|
# return (prefix, readnum) if matches; else (None, None)
|
|
345
479
|
patterns = [
|
|
346
|
-
r"(?i)(.*?)[._-]r?([12])$",
|
|
347
|
-
r"(?i)(.*?)[._-]read[_-]?([12])$",
|
|
480
|
+
r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
|
|
481
|
+
r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
|
|
348
482
|
]
|
|
349
483
|
for pat in patterns:
|
|
350
484
|
m = re.match(pat, stem)
|
|
@@ -353,6 +487,14 @@ def concatenate_fastqs_to_bam(
|
|
|
353
487
|
return None, None
|
|
354
488
|
|
|
355
489
|
def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
|
|
490
|
+
"""Pair FASTQ files based on filename conventions.
|
|
491
|
+
|
|
492
|
+
Args:
|
|
493
|
+
paths: FASTQ paths to pair.
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
Tuple of (paired list, leftover list).
|
|
497
|
+
"""
|
|
356
498
|
pref_map: Dict[str, Dict[int, Path]] = {}
|
|
357
499
|
unpaired: List[Path] = []
|
|
358
500
|
for pth in paths:
|
|
@@ -374,6 +516,14 @@ def concatenate_fastqs_to_bam(
|
|
|
374
516
|
return pairs, leftovers
|
|
375
517
|
|
|
376
518
|
def _fastq_iter(p: Path):
|
|
519
|
+
"""Yield FASTQ records using pysam.FastxFile.
|
|
520
|
+
|
|
521
|
+
Args:
|
|
522
|
+
p: FASTQ path.
|
|
523
|
+
|
|
524
|
+
Yields:
|
|
525
|
+
Pysam Fastx records.
|
|
526
|
+
"""
|
|
377
527
|
# pysam.FastxFile handles compressed extensions transparently
|
|
378
528
|
with pysam.FastxFile(str(p)) as fx:
|
|
379
529
|
for rec in fx:
|
|
@@ -387,6 +537,19 @@ def concatenate_fastqs_to_bam(
|
|
|
387
537
|
read1: bool,
|
|
388
538
|
read2: bool,
|
|
389
539
|
) -> pysam.AlignedSegment:
|
|
540
|
+
"""Construct an unaligned pysam.AlignedSegment.
|
|
541
|
+
|
|
542
|
+
Args:
|
|
543
|
+
name: Read name.
|
|
544
|
+
seq: Read sequence.
|
|
545
|
+
qual: FASTQ quality string.
|
|
546
|
+
bc: Barcode string.
|
|
547
|
+
read1: Whether this is read 1.
|
|
548
|
+
read2: Whether this is read 2.
|
|
549
|
+
|
|
550
|
+
Returns:
|
|
551
|
+
Unaligned pysam.AlignedSegment.
|
|
552
|
+
"""
|
|
390
553
|
a = pysam.AlignedSegment()
|
|
391
554
|
a.query_name = name
|
|
392
555
|
a.query_sequence = seq
|
|
@@ -409,6 +572,7 @@ def concatenate_fastqs_to_bam(
|
|
|
409
572
|
|
|
410
573
|
# ---------- normalize inputs to Path ----------
|
|
411
574
|
def _to_path_pair(x) -> Tuple[Path, Path]:
|
|
575
|
+
"""Convert a tuple of path-like objects to Path instances."""
|
|
412
576
|
a, b = x
|
|
413
577
|
return Path(a), Path(b)
|
|
414
578
|
|
|
@@ -451,7 +615,10 @@ def concatenate_fastqs_to_bam(
|
|
|
451
615
|
# ---------- BAM header ----------
|
|
452
616
|
header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
|
|
453
617
|
if add_read_group:
|
|
454
|
-
header["RG"] = [
|
|
618
|
+
header["RG"] = [
|
|
619
|
+
{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
|
|
620
|
+
for bc in barcodes_in_order
|
|
621
|
+
]
|
|
455
622
|
header.setdefault("PG", []).append(
|
|
456
623
|
{"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
|
|
457
624
|
)
|
|
@@ -477,7 +644,9 @@ def concatenate_fastqs_to_bam(
|
|
|
477
644
|
it2 = _fastq_iter(r2_path)
|
|
478
645
|
|
|
479
646
|
for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
|
|
647
|
+
|
|
480
648
|
def _clean(n: Optional[str]) -> Optional[str]:
|
|
649
|
+
"""Normalize FASTQ read names by trimming read suffixes."""
|
|
481
650
|
if n is None:
|
|
482
651
|
return None
|
|
483
652
|
return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
|
|
@@ -490,12 +659,16 @@ def concatenate_fastqs_to_bam(
|
|
|
490
659
|
)
|
|
491
660
|
|
|
492
661
|
if rec1 is not None:
|
|
493
|
-
a1 = _make_unaligned_segment(
|
|
662
|
+
a1 = _make_unaligned_segment(
|
|
663
|
+
name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
|
|
664
|
+
)
|
|
494
665
|
bam_out.write(a1)
|
|
495
666
|
per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
|
|
496
667
|
total_written += 1
|
|
497
668
|
if rec2 is not None:
|
|
498
|
-
a2 = _make_unaligned_segment(
|
|
669
|
+
a2 = _make_unaligned_segment(
|
|
670
|
+
name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
|
|
671
|
+
)
|
|
499
672
|
bam_out.write(a2)
|
|
500
673
|
per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
|
|
501
674
|
total_written += 1
|
|
@@ -517,7 +690,9 @@ def concatenate_fastqs_to_bam(
|
|
|
517
690
|
raise FileNotFoundError(pth)
|
|
518
691
|
bc = per_path_barcode.get(pth, "barcode")
|
|
519
692
|
for rec in _fastq_iter(pth):
|
|
520
|
-
a = _make_unaligned_segment(
|
|
693
|
+
a = _make_unaligned_segment(
|
|
694
|
+
rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
|
|
695
|
+
)
|
|
521
696
|
bam_out.write(a)
|
|
522
697
|
per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
|
|
523
698
|
total_written += 1
|
|
@@ -531,20 +706,21 @@ def concatenate_fastqs_to_bam(
|
|
|
531
706
|
"barcodes": barcodes_in_order,
|
|
532
707
|
}
|
|
533
708
|
|
|
709
|
+
|
|
534
710
|
def count_aligned_reads(bam_file):
|
|
535
711
|
"""
|
|
536
712
|
Counts the number of aligned reads in a bam file that map to each reference record.
|
|
537
|
-
|
|
713
|
+
|
|
538
714
|
Parameters:
|
|
539
715
|
bam_file (str): A string representing the path to an aligned BAM file.
|
|
540
|
-
|
|
716
|
+
|
|
541
717
|
Returns:
|
|
542
718
|
aligned_reads_count (int): The total number of reads aligned in the BAM.
|
|
543
719
|
unaligned_reads_count (int): The total number of reads not aligned in the BAM.
|
|
544
720
|
record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
|
|
545
721
|
|
|
546
722
|
"""
|
|
547
|
-
print(
|
|
723
|
+
print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
|
|
548
724
|
aligned_reads_count = 0
|
|
549
725
|
unaligned_reads_count = 0
|
|
550
726
|
# Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
|
|
@@ -553,12 +729,14 @@ def count_aligned_reads(bam_file):
|
|
|
553
729
|
with pysam.AlignmentFile(str(bam_file), "rb") as bam:
|
|
554
730
|
total_reads = bam.mapped + bam.unmapped
|
|
555
731
|
# Iterate over reads to get the total mapped read counts and the reads that map to each reference
|
|
556
|
-
for read in tqdm(bam, desc=
|
|
732
|
+
for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
|
|
557
733
|
if read.is_unmapped:
|
|
558
734
|
unaligned_reads_count += 1
|
|
559
735
|
else:
|
|
560
736
|
aligned_reads_count += 1
|
|
561
|
-
record_counts[read.reference_name] +=
|
|
737
|
+
record_counts[read.reference_name] += (
|
|
738
|
+
1 # Automatically increments if key exists, adds if not
|
|
739
|
+
)
|
|
562
740
|
|
|
563
741
|
# reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
|
|
564
742
|
for reference in record_counts:
|
|
@@ -567,7 +745,10 @@ def count_aligned_reads(bam_file):
|
|
|
567
745
|
|
|
568
746
|
return aligned_reads_count, unaligned_reads_count, dict(record_counts)
|
|
569
747
|
|
|
570
|
-
|
|
748
|
+
|
|
749
|
+
def demux_and_index_BAM(
|
|
750
|
+
aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
|
|
751
|
+
):
|
|
571
752
|
"""
|
|
572
753
|
A wrapper function for splitting BAMS and indexing them.
|
|
573
754
|
Parameters:
|
|
@@ -578,11 +759,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
578
759
|
barcode_both_ends (bool): Whether to require both ends to be barcoded.
|
|
579
760
|
trim (bool): Whether to trim off barcodes after demultiplexing.
|
|
580
761
|
threads (int): Number of threads to use.
|
|
581
|
-
|
|
762
|
+
|
|
582
763
|
Returns:
|
|
583
764
|
bam_files (list): List of split BAM file path strings
|
|
584
765
|
Splits an input BAM file on barcode value and makes a BAM index file.
|
|
585
766
|
"""
|
|
767
|
+
|
|
586
768
|
input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
|
|
587
769
|
command = ["dorado", "demux", "--kit-name", barcode_kit]
|
|
588
770
|
if barcode_both_ends:
|
|
@@ -595,25 +777,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
595
777
|
pass
|
|
596
778
|
command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
|
|
597
779
|
command.append(str(input_bam))
|
|
598
|
-
command_string =
|
|
599
|
-
|
|
600
|
-
|
|
780
|
+
command_string = " ".join(command)
|
|
781
|
+
logger.info("Running dorado demux: %s", " ".join(command))
|
|
782
|
+
|
|
783
|
+
proc = subprocess.Popen(
|
|
784
|
+
command,
|
|
785
|
+
stdout=subprocess.PIPE,
|
|
786
|
+
stderr=subprocess.PIPE,
|
|
787
|
+
text=True,
|
|
788
|
+
)
|
|
789
|
+
|
|
790
|
+
assert proc.stderr is not None
|
|
791
|
+
_stream_dorado_logs(proc.stderr)
|
|
792
|
+
rc = proc.wait()
|
|
793
|
+
|
|
794
|
+
if rc != 0:
|
|
795
|
+
raise RuntimeError(f"dorado demux failed with exit code {rc}")
|
|
601
796
|
|
|
602
797
|
bam_files = sorted(
|
|
603
|
-
p for p in split_dir.glob(f"*{bam_suffix}")
|
|
604
|
-
if p.is_file() and p.suffix == bam_suffix
|
|
798
|
+
p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
|
|
605
799
|
)
|
|
606
800
|
|
|
607
801
|
if not bam_files:
|
|
608
802
|
raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
|
|
609
|
-
|
|
803
|
+
|
|
610
804
|
# ---- Optional renaming with prefix ----
|
|
611
805
|
renamed_bams = []
|
|
612
806
|
prefix = "de" if barcode_both_ends else "se"
|
|
613
807
|
|
|
614
808
|
for bam in bam_files:
|
|
615
809
|
bam = Path(bam)
|
|
616
|
-
bai = bam.with_suffix(bam_suffix + ".bai")
|
|
810
|
+
bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
|
|
617
811
|
|
|
618
812
|
if prefix:
|
|
619
813
|
new_name = f"{prefix}_{bam.name}"
|
|
@@ -629,9 +823,10 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
|
|
|
629
823
|
bai.rename(new_bai)
|
|
630
824
|
|
|
631
825
|
renamed_bams.append(new_bam)
|
|
632
|
-
|
|
826
|
+
|
|
633
827
|
return renamed_bams
|
|
634
828
|
|
|
829
|
+
|
|
635
830
|
def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
|
|
636
831
|
"""
|
|
637
832
|
Efficiently extracts base identities from mapped reads with reference coordinates.
|
|
@@ -647,14 +842,15 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
647
842
|
dict: Base identities from forward mapped reads.
|
|
648
843
|
dict: Base identities from reverse mapped reads.
|
|
649
844
|
"""
|
|
845
|
+
logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
|
|
650
846
|
timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
|
|
651
847
|
|
|
652
848
|
positions = set(positions)
|
|
653
|
-
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length,
|
|
654
|
-
rev_base_identities = defaultdict(lambda: np.full(max_reference_length,
|
|
849
|
+
fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
850
|
+
rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
|
|
655
851
|
mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
|
|
656
852
|
|
|
657
|
-
#print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
853
|
+
# print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
|
|
658
854
|
with pysam.AlignmentFile(str(bam_file), "rb") as bam:
|
|
659
855
|
total_reads = bam.mapped
|
|
660
856
|
ref_seq = sequence.upper()
|
|
@@ -677,7 +873,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
677
873
|
base_dict[read_name][reference_position] = read_base
|
|
678
874
|
|
|
679
875
|
# Track mismatches (excluding Ns)
|
|
680
|
-
if read_base != ref_base and read_base !=
|
|
876
|
+
if read_base != ref_base and read_base != "N" and ref_base != "N":
|
|
681
877
|
mismatch_counts_per_read[read_name][ref_base][read_base] += 1
|
|
682
878
|
|
|
683
879
|
# Determine C→T vs G→A dominance per read
|
|
@@ -695,7 +891,13 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
|
|
|
695
891
|
else:
|
|
696
892
|
mismatch_trend_per_read[read_name] = "none"
|
|
697
893
|
|
|
698
|
-
return
|
|
894
|
+
return (
|
|
895
|
+
dict(fwd_base_identities),
|
|
896
|
+
dict(rev_base_identities),
|
|
897
|
+
dict(mismatch_counts_per_read),
|
|
898
|
+
mismatch_trend_per_read,
|
|
899
|
+
)
|
|
900
|
+
|
|
699
901
|
|
|
700
902
|
def extract_read_features_from_bam(bam_file_path):
|
|
701
903
|
"""
|
|
@@ -706,7 +908,9 @@ def extract_read_features_from_bam(bam_file_path):
|
|
|
706
908
|
read_metrics (dict)
|
|
707
909
|
"""
|
|
708
910
|
# Open the BAM file
|
|
709
|
-
|
|
911
|
+
logger.debug(
|
|
912
|
+
f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
|
|
913
|
+
)
|
|
710
914
|
with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
|
|
711
915
|
read_metrics = {}
|
|
712
916
|
reference_lengths = bam_file.lengths # List of lengths for each reference (chromosome)
|
|
@@ -723,10 +927,17 @@ def extract_read_features_from_bam(bam_file_path):
|
|
|
723
927
|
reference_length = reference_lengths[reference_index]
|
|
724
928
|
mapped_length = sum(end - start for start, end in read.get_blocks())
|
|
725
929
|
mapping_quality = read.mapping_quality # Phred-scaled MAPQ
|
|
726
|
-
read_metrics[read.query_name] = [
|
|
930
|
+
read_metrics[read.query_name] = [
|
|
931
|
+
read.query_length,
|
|
932
|
+
median_read_quality,
|
|
933
|
+
reference_length,
|
|
934
|
+
mapped_length,
|
|
935
|
+
mapping_quality,
|
|
936
|
+
]
|
|
727
937
|
|
|
728
938
|
return read_metrics
|
|
729
939
|
|
|
940
|
+
|
|
730
941
|
def extract_readnames_from_bam(aligned_BAM):
|
|
731
942
|
"""
|
|
732
943
|
Takes a BAM and writes out a txt file containing read names from the BAM
|
|
@@ -739,15 +950,19 @@ def extract_readnames_from_bam(aligned_BAM):
|
|
|
739
950
|
|
|
740
951
|
"""
|
|
741
952
|
import subprocess
|
|
953
|
+
|
|
742
954
|
# Make a text file of reads for the BAM
|
|
743
|
-
txt_output = aligned_BAM.split(
|
|
955
|
+
txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
|
|
744
956
|
samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
|
|
745
957
|
with open(txt_output, "w") as output_file:
|
|
746
|
-
cut_process = subprocess.Popen(
|
|
958
|
+
cut_process = subprocess.Popen(
|
|
959
|
+
["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
|
|
960
|
+
)
|
|
747
961
|
samtools_view.stdout.close()
|
|
748
962
|
cut_process.wait()
|
|
749
963
|
samtools_view.wait()
|
|
750
964
|
|
|
965
|
+
|
|
751
966
|
def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
752
967
|
"""
|
|
753
968
|
Separates an input BAM file on the BC SAM tag values.
|
|
@@ -757,11 +972,12 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
|
757
972
|
output_prefix (str): A prefix to prepend to the output BAM file name.
|
|
758
973
|
bam_suffix (str): A suffix to add to the bam file.
|
|
759
974
|
split_dir (str): String indicating path to directory to split BAMs into
|
|
760
|
-
|
|
975
|
+
|
|
761
976
|
Returns:
|
|
762
977
|
None
|
|
763
978
|
Writes out split BAM files.
|
|
764
979
|
"""
|
|
980
|
+
logger.debug("Demultiplexing BAM based on the BC tag")
|
|
765
981
|
bam_base = input_bam.name
|
|
766
982
|
bam_base_minus_suffix = input_bam.stem
|
|
767
983
|
|
|
@@ -774,19 +990,24 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
|
|
|
774
990
|
try:
|
|
775
991
|
# Get the barcode tag value
|
|
776
992
|
bc_tag = read.get_tag("BC", with_value_type=True)[0]
|
|
777
|
-
#bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
993
|
+
# bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
778
994
|
# Open the output BAM file corresponding to the barcode
|
|
779
995
|
if bc_tag not in output_files:
|
|
780
|
-
output_path =
|
|
781
|
-
|
|
996
|
+
output_path = (
|
|
997
|
+
split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
|
|
998
|
+
)
|
|
999
|
+
output_files[bc_tag] = pysam.AlignmentFile(
|
|
1000
|
+
str(output_path), "wb", header=bam.header
|
|
1001
|
+
)
|
|
782
1002
|
# Write the read to the corresponding output BAM file
|
|
783
1003
|
output_files[bc_tag].write(read)
|
|
784
1004
|
except KeyError:
|
|
785
|
-
|
|
1005
|
+
logger.warning(f"BC tag not present for read: {read.query_name}")
|
|
786
1006
|
# Close all output BAM files
|
|
787
1007
|
for output_file in output_files.values():
|
|
788
1008
|
output_file.close()
|
|
789
1009
|
|
|
1010
|
+
|
|
790
1011
|
def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
|
|
791
1012
|
"""
|
|
792
1013
|
A wrapper function for splitting BAMS and indexing them.
|
|
@@ -794,19 +1015,20 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
|
|
|
794
1015
|
aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
|
|
795
1016
|
split_dir (str): A string representing the file path to the directory to split the BAMs into.
|
|
796
1017
|
bam_suffix (str): A suffix to add to the bam file.
|
|
797
|
-
|
|
1018
|
+
|
|
798
1019
|
Returns:
|
|
799
1020
|
None
|
|
800
1021
|
Splits an input BAM file on barcode value and makes a BAM index file.
|
|
801
1022
|
"""
|
|
1023
|
+
logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
|
|
802
1024
|
aligned_sorted_output = aligned_sorted_BAM + bam_suffix
|
|
803
1025
|
file_prefix = date_string()
|
|
804
1026
|
separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
|
|
805
1027
|
# Make a BAM index file for the BAMs in that directory
|
|
806
|
-
bam_pattern =
|
|
1028
|
+
bam_pattern = "*" + bam_suffix
|
|
807
1029
|
bam_files = glob.glob(split_dir / bam_pattern)
|
|
808
|
-
bam_files = [str(bam) for bam in bam_files if
|
|
1030
|
+
bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
|
|
809
1031
|
for input_file in bam_files:
|
|
810
1032
|
pysam.index(input_file)
|
|
811
1033
|
|
|
812
|
-
return bam_files
|
|
1034
|
+
return bam_files
|