smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,23 +1,134 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import concurrent.futures
|
|
2
4
|
import os
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
3
7
|
from concurrent.futures import ProcessPoolExecutor
|
|
4
8
|
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
5
10
|
|
|
6
|
-
import matplotlib.pyplot as plt
|
|
7
11
|
import numpy as np
|
|
8
12
|
import pandas as pd
|
|
9
|
-
import pybedtools
|
|
10
|
-
import pyBigWig
|
|
11
|
-
import pysam
|
|
12
13
|
|
|
13
14
|
from smftools.logging_utils import get_logger
|
|
15
|
+
from smftools.optional_imports import require
|
|
14
16
|
|
|
15
17
|
from ..readwrite import make_dirs
|
|
16
18
|
|
|
17
19
|
logger = get_logger(__name__)
|
|
18
20
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
import pybedtools as pybedtools_types
|
|
23
|
+
import pyBigWig as pybigwig_types
|
|
24
|
+
import pysam as pysam_types
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
import pybedtools
|
|
28
|
+
except Exception:
|
|
29
|
+
pybedtools = None # type: ignore
|
|
30
|
+
|
|
31
|
+
try:
|
|
32
|
+
import pyBigWig
|
|
33
|
+
except Exception:
|
|
34
|
+
pyBigWig = None # type: ignore
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
import pysam
|
|
38
|
+
except Exception:
|
|
39
|
+
pysam = None # type: ignore
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _require_pybedtools() -> "pybedtools_types":
    """Return the ``pybedtools`` module used as the bedtools Python backend.

    The module-level optional import leaves ``pybedtools`` set to ``None``
    when the package could not be imported. In that case the lookup is
    delegated to ``require`` (from ``smftools.optional_imports``; presumably
    it imports the package or raises an install hint -- confirm there).
    """
    if pybedtools is None:
        return require("pybedtools", extra="pybedtools", purpose="bedtools Python backend")
    return pybedtools
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _require_pybigwig() -> "pybigwig_types":
    """Return the ``pyBigWig`` module used as the BigWig Python backend.

    The module-level optional import leaves ``pyBigWig`` set to ``None`` when
    the package could not be imported. In that case the lookup is delegated
    to ``require`` (from ``smftools.optional_imports``; presumably it imports
    the package or raises an install hint -- confirm there).
    """
    if pyBigWig is None:
        return require("pyBigWig", extra="pybigwig", purpose="BigWig Python backend")
    return pyBigWig
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _require_pysam() -> "pysam_types":
    """Return the ``pysam`` module.

    The module-level optional import leaves ``pysam`` set to ``None`` when the
    package could not be imported. In that case the lookup is delegated to
    ``require`` (from ``smftools.optional_imports``; presumably it imports the
    package or raises an install hint -- confirm there).
    """
    if pysam is None:
        return require("pysam", extra="pysam", purpose="FASTA indexing")
    return pysam
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _resolve_backend(
|
|
61
|
+
backend: str | None, *, tool: str, python_available: bool, cli_name: str
|
|
62
|
+
) -> str:
|
|
63
|
+
choice = (backend or "auto").strip().lower()
|
|
64
|
+
if choice not in {"auto", "python", "cli"}:
|
|
65
|
+
raise ValueError(f"{tool}_backend must be one of: auto, python, cli")
|
|
66
|
+
if choice == "python":
|
|
67
|
+
if not python_available:
|
|
68
|
+
raise RuntimeError(
|
|
69
|
+
f"{tool}_backend=python requires the Python package to be installed."
|
|
70
|
+
)
|
|
71
|
+
return "python"
|
|
72
|
+
if choice == "cli":
|
|
73
|
+
if not shutil.which(cli_name):
|
|
74
|
+
raise RuntimeError(f"{tool}_backend=cli requires {cli_name} in PATH.")
|
|
75
|
+
return "cli"
|
|
76
|
+
if shutil.which(cli_name):
|
|
77
|
+
return "cli"
|
|
78
|
+
if python_available:
|
|
79
|
+
return "python"
|
|
80
|
+
raise RuntimeError(f"Neither Python nor CLI backend is available for {tool}.")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _read_chrom_sizes(chrom_sizes: Path) -> list[tuple[str, int]]:
|
|
84
|
+
sizes: list[tuple[str, int]] = []
|
|
85
|
+
with chrom_sizes.open() as f:
|
|
86
|
+
for line in f:
|
|
87
|
+
chrom, size = line.split()[:2]
|
|
88
|
+
sizes.append((chrom, int(size)))
|
|
89
|
+
return sizes
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _ensure_fasta_index(fasta: Path) -> Path:
|
|
93
|
+
fai = fasta.with_suffix(fasta.suffix + ".fai")
|
|
94
|
+
if fai.exists():
|
|
95
|
+
return fai
|
|
96
|
+
if shutil.which("samtools"):
|
|
97
|
+
cp = subprocess.run(
|
|
98
|
+
["samtools", "faidx", str(fasta)],
|
|
99
|
+
stdout=subprocess.DEVNULL,
|
|
100
|
+
stderr=subprocess.PIPE,
|
|
101
|
+
text=True,
|
|
102
|
+
)
|
|
103
|
+
if cp.returncode != 0:
|
|
104
|
+
raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
105
|
+
return fai
|
|
106
|
+
if pysam is not None:
|
|
107
|
+
pysam_mod = _require_pysam()
|
|
108
|
+
pysam_mod.faidx(str(fasta))
|
|
109
|
+
return fai
|
|
110
|
+
raise RuntimeError("FASTA indexing requires pysam or samtools in PATH.")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _ensure_chrom_sizes(fasta: Path) -> Path:
    """Return a two-column chromosome-sizes file derived from the FASTA index.

    The output path is the FASTA path with its last suffix replaced by
    ``.chrom.sizes``. An existing file at that path is reused as is.
    Otherwise the first two whitespace-separated fields of every non-blank
    ``.fai`` line are written as ``name<TAB>size``.

    The index is parsed completely before anything is written, and the result
    is written to a temporary sibling file that is moved into place with
    ``os.replace``. A malformed index or an interrupted write therefore never
    leaves a truncated ``.chrom.sizes`` behind for later calls to reuse.

    Parameters
    ----------
    fasta
        Path to the FASTA file.

    Returns
    -------
    Path
        Path to the chromosome-sizes file.

    Raises
    ------
    RuntimeError
        Propagated from ``_ensure_fasta_index`` when the index cannot be built.
    ValueError
        If a non-blank index line has fewer than two fields.
    """
    fai = _ensure_fasta_index(fasta)
    chrom_sizes = fasta.with_suffix(".chrom.sizes")
    if chrom_sizes.exists():
        return chrom_sizes

    # Parse first so a bad index line fails before any output file exists.
    rows: list[str] = []
    with fai.open() as f_in:
        for line in f_in:
            fields = line.split()
            if not fields:
                continue
            chrom, size = fields[:2]
            rows.append(f"{chrom}\t{size}\n")

    # A per-process temp name keeps concurrent writers from clobbering each other.
    tmp = chrom_sizes.with_name(f"{chrom_sizes.name}.{os.getpid()}.tmp")
    try:
        with tmp.open("w") as out:
            out.writelines(rows)
        os.replace(tmp, chrom_sizes)
    finally:
        # After a successful replace the temp file is already gone.
        tmp.unlink(missing_ok=True)
    return chrom_sizes
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _bed_to_bigwig(
|
|
126
|
+
fasta: str,
|
|
127
|
+
bed: str,
|
|
128
|
+
*,
|
|
129
|
+
bedtools_backend: str | None = "auto",
|
|
130
|
+
bigwig_backend: str | None = "auto",
|
|
131
|
+
) -> str:
|
|
21
132
|
"""
|
|
22
133
|
BED → bedGraph → bigWig
|
|
23
134
|
Requires:
|
|
@@ -28,40 +139,70 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
|
|
|
28
139
|
fa = Path(fasta) # path to .fa
|
|
29
140
|
parent = bed.parent
|
|
30
141
|
stem = bed.stem
|
|
31
|
-
|
|
32
|
-
fai = parent / f"{fa_stem}.fai"
|
|
142
|
+
chrom_sizes = _ensure_chrom_sizes(fa)
|
|
33
143
|
|
|
34
144
|
bedgraph = parent / f"{stem}.bedgraph"
|
|
35
145
|
bigwig = parent / f"{stem}.bw"
|
|
36
146
|
|
|
37
147
|
# 1) Compute coverage → bedGraph
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
148
|
+
bedtools_choice = _resolve_backend(
|
|
149
|
+
bedtools_backend,
|
|
150
|
+
tool="bedtools",
|
|
151
|
+
python_available=pybedtools is not None,
|
|
152
|
+
cli_name="bedtools",
|
|
153
|
+
)
|
|
154
|
+
if bedtools_choice == "python":
|
|
155
|
+
logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
|
|
156
|
+
pybedtools_mod = _require_pybedtools()
|
|
157
|
+
bt = pybedtools_mod.BedTool(str(bed))
|
|
158
|
+
# bedtools genomecov -bg
|
|
159
|
+
coverage = bt.genome_coverage(bg=True, genome=str(chrom_sizes))
|
|
160
|
+
coverage.saveas(str(bedgraph))
|
|
161
|
+
else:
|
|
162
|
+
if not shutil.which("bedtools"):
|
|
163
|
+
raise RuntimeError("bedtools is required but not available in PATH.")
|
|
164
|
+
cmd = [
|
|
165
|
+
"bedtools",
|
|
166
|
+
"genomecov",
|
|
167
|
+
"-i",
|
|
168
|
+
str(bed),
|
|
169
|
+
"-g",
|
|
170
|
+
str(chrom_sizes),
|
|
171
|
+
"-bg",
|
|
172
|
+
]
|
|
173
|
+
logger.debug("[bedtools] generating coverage bedgraph: %s", " ".join(cmd))
|
|
174
|
+
with bedgraph.open("w") as out:
|
|
175
|
+
cp = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
|
|
176
|
+
if cp.returncode != 0:
|
|
177
|
+
raise RuntimeError(f"bedtools genomecov failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
43
178
|
|
|
44
179
|
# 2) Convert bedGraph → BigWig via pyBigWig
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
180
|
+
bigwig_choice = _resolve_backend(
|
|
181
|
+
bigwig_backend,
|
|
182
|
+
tool="bigwig",
|
|
183
|
+
python_available=pyBigWig is not None,
|
|
184
|
+
cli_name="bedGraphToBigWig",
|
|
185
|
+
)
|
|
186
|
+
if bigwig_choice == "python":
|
|
187
|
+
logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
|
|
188
|
+
pybigwig_mod = _require_pybigwig()
|
|
189
|
+
bw = pybigwig_mod.open(str(bigwig), "w")
|
|
190
|
+
bw.addHeader(_read_chrom_sizes(chrom_sizes))
|
|
191
|
+
|
|
192
|
+
with bedgraph.open() as f:
|
|
193
|
+
for line in f:
|
|
194
|
+
chrom, start, end, coverage = line.strip().split()
|
|
195
|
+
bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
|
|
196
|
+
|
|
197
|
+
bw.close()
|
|
198
|
+
else:
|
|
199
|
+
if not shutil.which("bedGraphToBigWig"):
|
|
200
|
+
raise RuntimeError("bedGraphToBigWig is required but not available in PATH.")
|
|
201
|
+
cmd = ["bedGraphToBigWig", str(bedgraph), str(chrom_sizes), str(bigwig)]
|
|
202
|
+
logger.debug("[bedGraphToBigWig] converting bedgraph → bigwig: %s", " ".join(cmd))
|
|
203
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
204
|
+
if cp.returncode != 0:
|
|
205
|
+
raise RuntimeError(f"bedGraphToBigWig failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
65
206
|
|
|
66
207
|
logger.debug(f"BigWig written: {bigwig}")
|
|
67
208
|
return str(bigwig)
|
|
@@ -113,6 +254,8 @@ def _plot_bed_histograms(
|
|
|
113
254
|
coordinate_mode : {"one_based","zero_based"}
|
|
114
255
|
One-based, inclusive (your file) vs BED-standard zero-based, half-open.
|
|
115
256
|
"""
|
|
257
|
+
plt = require("matplotlib.pyplot", extra="plotting", purpose="plotting BED histograms")
|
|
258
|
+
|
|
116
259
|
os.makedirs(plotting_directory, exist_ok=True)
|
|
117
260
|
|
|
118
261
|
bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
|
|
@@ -167,7 +310,8 @@ def _plot_bed_histograms(
|
|
|
167
310
|
return np.clip(x, lo, hi)
|
|
168
311
|
|
|
169
312
|
# Load chromosome order/lengths from FASTA
|
|
170
|
-
|
|
313
|
+
pysam_mod = _require_pysam()
|
|
314
|
+
with pysam_mod.FastaFile(fasta) as fa:
|
|
171
315
|
ref_names = list(fa.references)
|
|
172
316
|
ref_lengths = dict(zip(ref_names, fa.lengths))
|
|
173
317
|
|
|
@@ -292,7 +436,17 @@ def _plot_bed_histograms(
|
|
|
292
436
|
logger.debug("[plot_bed_histograms] Done.")
|
|
293
437
|
|
|
294
438
|
|
|
295
|
-
def aligned_BAM_to_bed(
|
|
439
|
+
def aligned_BAM_to_bed(
|
|
440
|
+
aligned_BAM,
|
|
441
|
+
out_dir,
|
|
442
|
+
fasta,
|
|
443
|
+
make_bigwigs,
|
|
444
|
+
threads=None,
|
|
445
|
+
*,
|
|
446
|
+
samtools_backend: str | None = "auto",
|
|
447
|
+
bedtools_backend: str | None = "auto",
|
|
448
|
+
bigwig_backend: str | None = "auto",
|
|
449
|
+
):
|
|
296
450
|
"""
|
|
297
451
|
Takes an aligned BAM as input and writes a BED file of reads as output.
|
|
298
452
|
Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
|
|
@@ -318,31 +472,79 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
318
472
|
|
|
319
473
|
logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
|
|
320
474
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
475
|
+
backend_choice = _resolve_backend(
|
|
476
|
+
samtools_backend,
|
|
477
|
+
tool="samtools",
|
|
478
|
+
python_available=pysam is not None,
|
|
479
|
+
cli_name="samtools",
|
|
480
|
+
)
|
|
481
|
+
with open(bed_output, "w") as out:
|
|
482
|
+
if backend_choice == "python":
|
|
483
|
+
pysam_mod = _require_pysam()
|
|
484
|
+
with pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam:
|
|
485
|
+
for read in bam.fetch(until_eof=True):
|
|
486
|
+
if read.is_unmapped:
|
|
487
|
+
chrom = "*"
|
|
488
|
+
start1 = 1
|
|
489
|
+
rl = read.query_length or 0
|
|
490
|
+
mapq = 0
|
|
491
|
+
else:
|
|
492
|
+
chrom = bam.get_reference_name(read.reference_id)
|
|
493
|
+
# pysam reference_start is 0-based → +1 for 1-based SAM-like start
|
|
494
|
+
start1 = int(read.reference_start) + 1
|
|
495
|
+
rl = read.query_length or 0
|
|
496
|
+
mapq = int(read.mapping_quality)
|
|
497
|
+
|
|
498
|
+
# End position in 1-based inclusive coords
|
|
499
|
+
end1 = start1 + (rl or 0) - 1
|
|
500
|
+
|
|
501
|
+
qname = read.query_name
|
|
502
|
+
quals = read.query_qualities
|
|
503
|
+
if quals is None or rl == 0:
|
|
504
|
+
avg_q = float("nan")
|
|
505
|
+
else:
|
|
506
|
+
avg_q = float(np.mean(quals))
|
|
507
|
+
|
|
508
|
+
out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
|
|
509
|
+
else:
|
|
510
|
+
samtools_view = subprocess.Popen(
|
|
511
|
+
["samtools", "view", str(aligned_BAM)],
|
|
512
|
+
stdout=subprocess.PIPE,
|
|
513
|
+
stderr=subprocess.PIPE,
|
|
514
|
+
text=True,
|
|
515
|
+
)
|
|
516
|
+
assert samtools_view.stdout is not None
|
|
517
|
+
for line in samtools_view.stdout:
|
|
518
|
+
if not line.strip():
|
|
519
|
+
continue
|
|
520
|
+
fields = line.rstrip("\n").split("\t")
|
|
521
|
+
if len(fields) < 11:
|
|
522
|
+
continue
|
|
523
|
+
qname = fields[0]
|
|
524
|
+
flag = int(fields[1])
|
|
525
|
+
chrom = fields[2]
|
|
526
|
+
pos = int(fields[3])
|
|
527
|
+
mapq = int(fields[4])
|
|
528
|
+
seq = fields[9]
|
|
529
|
+
qual = fields[10]
|
|
530
|
+
rl = 0 if seq == "*" else len(seq)
|
|
531
|
+
is_unmapped = bool(flag & 0x4) or chrom == "*"
|
|
532
|
+
if is_unmapped:
|
|
533
|
+
chrom = "*"
|
|
534
|
+
start1 = 1
|
|
535
|
+
mapq = 0
|
|
536
|
+
else:
|
|
537
|
+
start1 = pos
|
|
538
|
+
end1 = start1 + (rl or 0) - 1
|
|
539
|
+
if qual == "*" or rl == 0:
|
|
540
|
+
avg_q = float("nan")
|
|
541
|
+
else:
|
|
542
|
+
avg_q = float(np.mean([ord(ch) - 33 for ch in qual]))
|
|
543
|
+
out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
|
|
544
|
+
rc = samtools_view.wait()
|
|
545
|
+
if rc != 0:
|
|
546
|
+
stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
|
|
547
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
346
548
|
|
|
347
549
|
logger.debug(f"BED-like file created: {bed_output}")
|
|
348
550
|
|
|
@@ -368,7 +570,15 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
368
570
|
futures = []
|
|
369
571
|
futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
|
|
370
572
|
if make_bigwigs:
|
|
371
|
-
futures.append(
|
|
573
|
+
futures.append(
|
|
574
|
+
executor.submit(
|
|
575
|
+
_bed_to_bigwig,
|
|
576
|
+
fasta,
|
|
577
|
+
aligned_bed,
|
|
578
|
+
bedtools_backend=bedtools_backend,
|
|
579
|
+
bigwig_backend=bigwig_backend,
|
|
580
|
+
)
|
|
581
|
+
)
|
|
372
582
|
concurrent.futures.wait(futures)
|
|
373
583
|
|
|
374
584
|
logger.debug("Processing completed successfully.")
|