smftools 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +2 -0
- smftools/cli/hmm_adata.py +7 -2
- smftools/cli/load_adata.py +130 -98
- smftools/cli/preprocess_adata.py +2 -0
- smftools/cli/spatial_adata.py +5 -1
- smftools/cli_entry.py +26 -1
- smftools/config/__init__.py +2 -0
- smftools/config/default.yaml +4 -1
- smftools/config/experiment_config.py +6 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +9 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +53 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +737 -170
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +66 -22
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +8 -2
- smftools/informatics/modkit_extract_to_adata.py +16 -6
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +6 -3
- smftools/plotting/hmm_plotting.py +12 -2
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/tools/__init__.py +26 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_umap.py +3 -1
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/METADATA +67 -33
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/bed_functions.py

@@ -1,23 +1,134 @@
+from __future__ import annotations
+
 import concurrent.futures
 import os
+import shutil
+import subprocess
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
+from typing import TYPE_CHECKING

-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import pybedtools
-import pyBigWig
-import pysam

 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require

 from ..readwrite import make_dirs

 logger = get_logger(__name__)

-
-
+if TYPE_CHECKING:
+    import pybedtools as pybedtools_types
+    import pyBigWig as pybigwig_types
+    import pysam as pysam_types
+
+try:
+    import pybedtools
+except Exception:
+    pybedtools = None  # type: ignore
+
+try:
+    import pyBigWig
+except Exception:
+    pyBigWig = None  # type: ignore
+
+try:
+    import pysam
+except Exception:
+    pysam = None  # type: ignore
+
+
+def _require_pybedtools() -> "pybedtools_types":
+    if pybedtools is not None:
+        return pybedtools
+    return require("pybedtools", extra="pybedtools", purpose="bedtools Python backend")
+
+
+def _require_pybigwig() -> "pybigwig_types":
+    if pyBigWig is not None:
+        return pyBigWig
+    return require("pyBigWig", extra="pybigwig", purpose="BigWig Python backend")
+
+
+def _require_pysam() -> "pysam_types":
+    if pysam is not None:
+        return pysam
+    return require("pysam", extra="pysam", purpose="FASTA indexing")
+
+
+def _resolve_backend(
+    backend: str | None, *, tool: str, python_available: bool, cli_name: str
+) -> str:
+    choice = (backend or "auto").strip().lower()
+    if choice not in {"auto", "python", "cli"}:
+        raise ValueError(f"{tool}_backend must be one of: auto, python, cli")
+    if choice == "python":
+        if not python_available:
+            raise RuntimeError(
+                f"{tool}_backend=python requires the Python package to be installed."
+            )
+        return "python"
+    if choice == "cli":
+        if not shutil.which(cli_name):
+            raise RuntimeError(f"{tool}_backend=cli requires {cli_name} in PATH.")
+        return "cli"
+    if shutil.which(cli_name):
+        return "cli"
+    if python_available:
+        return "python"
+    raise RuntimeError(f"Neither Python nor CLI backend is available for {tool}.")
+
+
+def _read_chrom_sizes(chrom_sizes: Path) -> list[tuple[str, int]]:
+    sizes: list[tuple[str, int]] = []
+    with chrom_sizes.open() as f:
+        for line in f:
+            chrom, size = line.split()[:2]
+            sizes.append((chrom, int(size)))
+    return sizes
+
+
+def _ensure_fasta_index(fasta: Path) -> Path:
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if fai.exists():
+        return fai
+    if shutil.which("samtools"):
+        cp = subprocess.run(
+            ["samtools", "faidx", str(fasta)],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if cp.returncode != 0:
+            raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+        return fai
+    if pysam is not None:
+        pysam_mod = _require_pysam()
+        pysam_mod.faidx(str(fasta))
+        return fai
+    raise RuntimeError("FASTA indexing requires pysam or samtools in PATH.")
+
+
+def _ensure_chrom_sizes(fasta: Path) -> Path:
+    fai = _ensure_fasta_index(fasta)
+    chrom_sizes = fasta.with_suffix(".chrom.sizes")
+    if chrom_sizes.exists():
+        return chrom_sizes
+    with fai.open() as f_in, chrom_sizes.open("w") as out:
+        for line in f_in:
+            chrom, size = line.split()[:2]
+            out.write(f"{chrom}\t{size}\n")
+    return chrom_sizes
+
+
+def _bed_to_bigwig(
+    fasta: str,
+    bed: str,
+    *,
+    bedtools_backend: str | None = "auto",
+    bigwig_backend: str | None = "auto",
+) -> str:
     """
     BED → bedGraph → bigWig
     Requires:
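
Note: the hunk above replaces unconditional imports of pybedtools, pyBigWig, and pysam with guarded imports plus a _resolve_backend helper whose order of preference is: explicit choice, then the CLI tool on PATH, then the Python package. A condensed mirror of that resolution order (returning None where the real helper raises), for illustration only:

import shutil

def resolve(backend, python_available, cli_name):
    # Explicit "python"/"cli" win; "auto" prefers the CLI on PATH,
    # then falls back to the Python package.
    choice = (backend or "auto").strip().lower()
    if choice == "python":
        return "python" if python_available else None
    if choice == "cli":
        return "cli" if shutil.which(cli_name) else None
    if shutil.which(cli_name):
        return "cli"
    return "python" if python_available else None

# Without bedtools on PATH, "auto" resolves to the Python backend.
print(resolve("auto", python_available=True, cli_name="bedtools"))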
@@ -28,40 +139,70 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
     fa = Path(fasta)  # path to .fa
     parent = bed.parent
     stem = bed.stem
-
-    fai = parent / f"{fa_stem}.fai"
+    chrom_sizes = _ensure_chrom_sizes(fa)

     bedgraph = parent / f"{stem}.bedgraph"
     bigwig = parent / f"{stem}.bw"

     # 1) Compute coverage → bedGraph
-
-
-
-
-
+    bedtools_choice = _resolve_backend(
+        bedtools_backend,
+        tool="bedtools",
+        python_available=pybedtools is not None,
+        cli_name="bedtools",
+    )
+    if bedtools_choice == "python":
+        logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
+        pybedtools_mod = _require_pybedtools()
+        bt = pybedtools_mod.BedTool(str(bed))
+        # bedtools genomecov -bg
+        coverage = bt.genome_coverage(bg=True, genome=str(chrom_sizes))
+        coverage.saveas(str(bedgraph))
+    else:
+        if not shutil.which("bedtools"):
+            raise RuntimeError("bedtools is required but not available in PATH.")
+        cmd = [
+            "bedtools",
+            "genomecov",
+            "-i",
+            str(bed),
+            "-g",
+            str(chrom_sizes),
+            "-bg",
+        ]
+        logger.debug("[bedtools] generating coverage bedgraph: %s", " ".join(cmd))
+        with bedgraph.open("w") as out:
+            cp = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(f"bedtools genomecov failed (exit {cp.returncode}):\n{cp.stderr}")

     # 2) Convert bedGraph → BigWig via pyBigWig
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    bigwig_choice = _resolve_backend(
+        bigwig_backend,
+        tool="bigwig",
+        python_available=pyBigWig is not None,
+        cli_name="bedGraphToBigWig",
+    )
+    if bigwig_choice == "python":
+        logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+        pybigwig_mod = _require_pybigwig()
+        bw = pybigwig_mod.open(str(bigwig), "w")
+        bw.addHeader(_read_chrom_sizes(chrom_sizes))
+
+        with bedgraph.open() as f:
+            for line in f:
+                chrom, start, end, coverage = line.strip().split()
+                bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+        bw.close()
+    else:
+        if not shutil.which("bedGraphToBigWig"):
+            raise RuntimeError("bedGraphToBigWig is required but not available in PATH.")
+        cmd = ["bedGraphToBigWig", str(bedgraph), str(chrom_sizes), str(bigwig)]
+        logger.debug("[bedGraphToBigWig] converting bedgraph → bigwig: %s", " ".join(cmd))
+        cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(f"bedGraphToBigWig failed (exit {cp.returncode}):\n{cp.stderr}")

     logger.debug(f"BigWig written: {bigwig}")
     return str(bigwig)
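
Note: the CLI branch above is equivalent to running bedtools genomecov -bg followed by bedGraphToBigWig. A hypothetical call (paths invented; note that although the bed parameter is annotated str, the body uses bed.parent and bed.stem, so a Path is expected):

from pathlib import Path

bw_path = _bed_to_bigwig(
    "ref.fa",                  # chrom sizes are derived from ref.fa.fai
    Path("reads.bed"),
    bedtools_backend="auto",   # prefer the bedtools CLI, else pybedtools
    bigwig_backend="cli",      # force bedGraphToBigWig from PATH
)
print(bw_path)  # .../reads.bw, written next to reads.bed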
@@ -113,6 +254,8 @@ def _plot_bed_histograms(
     coordinate_mode : {"one_based","zero_based"}
         One-based, inclusive (your file) vs BED-standard zero-based, half-open.
     """
+    plt = require("matplotlib.pyplot", extra="plotting", purpose="plotting BED histograms")
+
     os.makedirs(plotting_directory, exist_ok=True)

     bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
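
Note: matplotlib is now pulled in through require() at call time instead of at module import. smftools' actual helper lives in smftools/optional_imports.py (added in this release, +31 lines); the sketch below is a generic reconstruction of the pattern, not the package's implementation:

import importlib

def require(module: str, *, extra: str, purpose: str):
    # Import lazily; on failure, name the extra that provides the dependency.
    try:
        return importlib.import_module(module)
    except ImportError as exc:
        raise ImportError(
            f"{module} is required for {purpose}; try pip install 'smftools[{extra}]'"
        ) from exc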
@@ -167,7 +310,8 @@ def _plot_bed_histograms(
         return np.clip(x, lo, hi)

     # Load chromosome order/lengths from FASTA
-
+    pysam_mod = _require_pysam()
+    with pysam_mod.FastaFile(fasta) as fa:
         ref_names = list(fa.references)
         ref_lengths = dict(zip(ref_names, fa.lengths))

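
Note: pysam.FastaFile reads reference names and lengths from the FASTA's .fai index (building the index on open if it is missing and the location is writable). A standalone sketch with a hypothetical ref.fa:

import pysam

with pysam.FastaFile("ref.fa") as fa:
    ref_lengths = dict(zip(fa.references, fa.lengths))
print(ref_lengths)  # e.g. {"chr1": 248956422, ...}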
@@ -292,7 +436,17 @@ def _plot_bed_histograms(
     logger.debug("[plot_bed_histograms] Done.")


-def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+def aligned_BAM_to_bed(
+    aligned_BAM,
+    out_dir,
+    fasta,
+    make_bigwigs,
+    threads=None,
+    *,
+    samtools_backend: str | None = "auto",
+    bedtools_backend: str | None = "auto",
+    bigwig_backend: str | None = "auto",
+):
     """
     Takes an aligned BAM as input and writes a BED file of reads as output.
     Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
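
Note: the new backend selectors are keyword-only, so existing positional call sites keep working. A hypothetical invocation (paths invented):

aligned_BAM_to_bed(
    "sample.aligned.bam",
    "out",
    "ref.fa",
    make_bigwigs=True,
    samtools_backend="cli",   # stream reads via samtools view
    bigwig_backend="python",  # write the bigWig via pyBigWig
)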
@@ -318,31 +472,79 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):

     logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    backend_choice = _resolve_backend(
+        samtools_backend,
+        tool="samtools",
+        python_available=pysam is not None,
+        cli_name="samtools",
+    )
+    with open(bed_output, "w") as out:
+        if backend_choice == "python":
+            pysam_mod = _require_pysam()
+            with pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam:
+                for read in bam.fetch(until_eof=True):
+                    if read.is_unmapped:
+                        chrom = "*"
+                        start1 = 1
+                        rl = read.query_length or 0
+                        mapq = 0
+                    else:
+                        chrom = bam.get_reference_name(read.reference_id)
+                        # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                        start1 = int(read.reference_start) + 1
+                        rl = read.query_length or 0
+                        mapq = int(read.mapping_quality)
+
+                    # End position in 1-based inclusive coords
+                    end1 = start1 + (rl or 0) - 1
+
+                    qname = read.query_name
+                    quals = read.query_qualities
+                    if quals is None or rl == 0:
+                        avg_q = float("nan")
+                    else:
+                        avg_q = float(np.mean(quals))
+
+                    out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+        else:
+            samtools_view = subprocess.Popen(
+                ["samtools", "view", str(aligned_BAM)],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            assert samtools_view.stdout is not None
+            for line in samtools_view.stdout:
+                if not line.strip():
+                    continue
+                fields = line.rstrip("\n").split("\t")
+                if len(fields) < 11:
+                    continue
+                qname = fields[0]
+                flag = int(fields[1])
+                chrom = fields[2]
+                pos = int(fields[3])
+                mapq = int(fields[4])
+                seq = fields[9]
+                qual = fields[10]
+                rl = 0 if seq == "*" else len(seq)
+                is_unmapped = bool(flag & 0x4) or chrom == "*"
+                if is_unmapped:
+                    chrom = "*"
+                    start1 = 1
+                    mapq = 0
+                else:
+                    start1 = pos
+                end1 = start1 + (rl or 0) - 1
+                if qual == "*" or rl == 0:
+                    avg_q = float("nan")
+                else:
+                    avg_q = float(np.mean([ord(ch) - 33 for ch in qual]))
+                out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+            rc = samtools_view.wait()
+            if rc != 0:
+                stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
+                raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")

     logger.debug(f"BED-like file created: {bed_output}")

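
Note: both branches above emit identical rows; the pysam branch converts pysam's 0-based reference_start to a 1-based start, and the CLI branch decodes Phred+33 quality strings by hand. The arithmetic, worked on hypothetical values:

# 1-based, inclusive interval from a 0-based start and a read length
start0, read_len = 1499, 100           # pysam reference_start is 0-based
start1 = start0 + 1                    # 1500
end1 = start1 + read_len - 1           # 1599, inclusive end

# Phred+33 decoding as done in the samtools-view branch
qual = "IIII?"
phred = [ord(ch) - 33 for ch in qual]  # [40, 40, 40, 40, 30]
print(start1, end1, sum(phred) / len(phred))  # 1500 1599 38.0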
@@ -368,7 +570,15 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
         futures = []
         futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
         if make_bigwigs:
-            futures.append(
+            futures.append(
+                executor.submit(
+                    _bed_to_bigwig,
+                    fasta,
+                    aligned_bed,
+                    bedtools_backend=bedtools_backend,
+                    bigwig_backend=bigwig_backend,
+                )
+            )
         concurrent.futures.wait(futures)

     logger.debug("Processing completed successfully.")
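
Note: executor.submit(fn, *args, **kwargs) forwards keyword arguments to the callable, which is how the backend choices reach _bed_to_bigwig inside the worker. A minimal standalone sketch:

from concurrent.futures import ProcessPoolExecutor

def work(bed, *, bigwig_backend="auto"):
    return f"{bed} via {bigwig_backend}"

if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        fut = executor.submit(work, "reads.bed", bigwig_backend="python")
        print(fut.result())  # reads.bed via python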

smftools/informatics/converted_BAM_to_adata.py

@@ -1,18 +1,20 @@
+from __future__ import annotations
+
 import gc
-import
+import logging
 import shutil
 import time
 import traceback
 from multiprocessing import Manager, Pool, current_process
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import TYPE_CHECKING, Iterable, Optional, Union

 import anndata as ad
 import numpy as np
 import pandas as pd
-import torch

-from smftools.logging_utils import get_logger
+from smftools.logging_utils import get_logger, setup_logging
+from smftools.optional_imports import require

 from ..readwrite import make_dirs
 from .bam_functions import count_aligned_reads, extract_base_identities
@@ -22,8 +24,10 @@ from .ohe import ohe_batching

 logger = get_logger(__name__)

-if
-
+if TYPE_CHECKING:
+    import torch
+
+torch = require("torch", extra="torch", purpose="converted BAM processing")


 def converted_BAM_to_adata(
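
Note: the hunk keeps a real import torch under TYPE_CHECKING for static type checkers, while the runtime binding goes through require(), so a missing torch fails with an install hint rather than a bare ImportError. A generic sketch of the pattern, with a stand-in require:

import importlib
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import torch  # visible to type checkers only; never executed at runtime

def require(module, *, extra, purpose):
    # Stand-in for smftools.optional_imports.require.
    return importlib.import_module(module)

torch = require("torch", extra="torch", purpose="converted BAM processing")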
@@ -40,6 +44,7 @@ def converted_BAM_to_adata(
     deaminase_footprinting: bool = False,
     delete_intermediates: bool = True,
     double_barcoded_path: Path | None = None,
+    samtools_backend: str | None = "auto",
 ) -> tuple[ad.AnnData | None, Path]:
     """Convert BAM files into an AnnData object by binarizing modified base identities.

@@ -89,7 +94,9 @@ def converted_BAM_to_adata(
     )

     bam_path_list = bam_files
-
+
+    bam_names = [bam.name for bam in bam_files]
+    logger.info(f"Found {len(bam_files)} BAM files within {split_dir}: {bam_names}")

     ## Process Conversion Sites
     max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(
@@ -98,7 +105,7 @@

     ## Filter BAM Files by Mapping Threshold
     records_to_analyze = filter_bams_by_mapping_threshold(
-        bam_path_list, bam_files, mapping_threshold
+        bam_path_list, bam_files, mapping_threshold, samtools_backend
     )

     ## Process BAMs in Parallel
@@ -113,6 +120,7 @@
         max_reference_length,
         device,
         deaminase_footprinting,
+        samtools_backend,
     )

     final_adata.uns["References"] = {}
@@ -240,14 +248,14 @@ def process_conversion_sites(
     return max_reference_length, record_FASTA_dict, chromosome_FASTA_dict


-def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold):
+def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold, samtools_backend):
     """Filters BAM files based on mapping threshold."""
     records_to_analyze = set()

     for i, bam in enumerate(bam_path_list):
-        aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam)
+        aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam, samtools_backend)
         aligned_percent = aligned_reads * 100 / (aligned_reads + unaligned_reads)
-
+        logger.info(f"{aligned_percent:.2f}% of reads in {bam_files[i].name} aligned successfully.")

         for record, (count, percent) in record_counts.items():
             if percent >= mapping_threshold:
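
Note: a quick check of the percentage arithmetic in the hunk above (it divides by the total read count, so a BAM with zero reads would raise ZeroDivisionError):

aligned_reads, unaligned_reads = 950, 50
aligned_percent = aligned_reads * 100 / (aligned_reads + unaligned_reads)
print(f"{aligned_percent:.2f}%")  # 95.00%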
@@ -267,6 +275,7 @@ def process_single_bam(
     max_reference_length,
     device,
     deaminase_footprinting,
+    samtools_backend,
 ):
     """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
     adata_list = []
@@ -281,7 +290,7 @@
         # Extract Base Identities
         fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = (
             extract_base_identities(
-                bam, record, range(current_length), max_reference_length, sequence
+                bam, record, range(current_length), max_reference_length, sequence, samtools_backend
             )
         )
         mismatch_trend_series = pd.Series(mismatch_trend_per_read)
@@ -433,9 +442,13 @@ def worker_function(
     max_reference_length,
     device,
     deaminase_footprinting,
+    samtools_backend,
     progress_queue,
+    log_level,
+    log_file,
 ):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
+    _ensure_worker_logging(log_level, log_file)
     worker_id = current_process().pid  # Get worker process ID
     sample = bam.stem

@@ -471,6 +484,7 @@
         max_reference_length,
         device,
         deaminase_footprinting,
+        samtools_backend,
     )

     if adata is not None:
@@ -501,19 +515,13 @@ def process_bams_parallel(
     max_reference_length,
     device,
     deaminase_footprinting,
+    samtools_backend,
 ):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
     make_dirs(h5_dir)  # Ensure h5_dir exists

     logger.info(f"Starting parallel BAM processing with {num_threads} threads...")
-
-    # Ensure macOS uses forkserver to avoid spawning issues
-    try:
-        import multiprocessing
-
-        multiprocessing.set_start_method("forkserver", force=True)
-    except RuntimeError:
-        logger.warning(f"Multiprocessing context already set. Skipping set_start_method.")
+    log_level, log_file = _get_logger_config()

     with Manager() as manager:
         progress_queue = manager.Queue()
@@ -534,13 +542,16 @@
                     max_reference_length,
                     device,
                     deaminase_footprinting,
+                    samtools_backend,
                     progress_queue,
+                    log_level,
+                    log_file,
                 ),
             )
             for i, bam in enumerate(bam_path_list)
         ]

-        logger.info(f"
+        logger.info(f"Submitting {len(results)} BAMs for processing.")

         # Track completed BAMs
         completed_bams = set()
@@ -550,15 +561,18 @@
                 completed_bams.add(processed_bam)
             except Exception as e:
                 logger.error(f"Timeout waiting for worker process. Possible crash? {e}")
+                _log_async_result_errors(results, bam_path_list)

         pool.close()
         pool.join()  # Ensure all workers finish

+        _log_async_result_errors(results, bam_path_list)
+
         # Final Concatenation Step
         h5ad_files = [f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]

         if not h5ad_files:
-            logger.
+            logger.warning(f"No valid H5AD files generated. Exiting.")
             return None

         logger.info(f"Concatenating {len(h5ad_files)} H5AD files into final output...")
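
Note: _log_async_result_errors (defined in the next hunk) relies on multiprocessing's AsyncResult.get() re-raising the exception that killed the worker. A minimal standalone demonstration:

from multiprocessing import Pool

def boom(bam):
    raise ValueError(f"bad input: {bam}")

if __name__ == "__main__":
    with Pool(1) as pool:
        res = pool.apply_async(boom, ("sample.bam",))
        res.wait()
        try:
            res.get()
        except Exception as exc:
            print("worker failed:", exc)  # worker failed: bad input: sample.bam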
@@ -568,6 +582,36 @@
     return final_adata


+def _log_async_result_errors(results, bam_path_list):
+    """Log worker failures captured by multiprocessing AsyncResult objects."""
+    for bam, result in zip(bam_path_list, results):
+        if not result.ready():
+            continue
+        try:
+            result.get()
+        except Exception as exc:
+            logger.error("Worker process failed for %s: %s", bam, exc)
+
+
+def _get_logger_config() -> tuple[int, Path | None]:
+    smftools_logger = logging.getLogger("smftools")
+    level = smftools_logger.level
+    if level == logging.NOTSET:
+        level = logging.INFO
+    log_file: Path | None = None
+    for handler in smftools_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            log_file = Path(handler.baseFilename)
+            break
+    return level, log_file
+
+
+def _ensure_worker_logging(log_level: int, log_file: Path | None) -> None:
+    smftools_logger = logging.getLogger("smftools")
+    if not smftools_logger.handlers:
+        setup_logging(level=log_level, log_file=log_file)
+
+
 def delete_intermediate_h5ads_and_tmpdir(
     h5_dir: Union[str, Path, Iterable[str], None],
     tmp_dir: Optional[Union[str, Path]] = None,