smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,21 +1,134 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import concurrent.futures
|
|
2
4
|
import os
|
|
5
|
+
import shutil
|
|
3
6
|
import subprocess
|
|
4
|
-
from
|
|
5
|
-
import
|
|
6
|
-
import
|
|
7
|
-
import pyBigWig
|
|
7
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
8
10
|
|
|
9
11
|
import numpy as np
|
|
10
12
|
import pandas as pd
|
|
11
|
-
import concurrent.futures
|
|
12
|
-
from concurrent.futures import ProcessPoolExecutor
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
from smftools.logging_utils import get_logger
|
|
15
|
+
from smftools.optional_imports import require
|
|
15
16
|
|
|
16
17
|
from ..readwrite import make_dirs
|
|
17
18
|
|
|
18
|
-
|
|
19
|
+
logger = get_logger(__name__)
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
import pybedtools as pybedtools_types
|
|
23
|
+
import pyBigWig as pybigwig_types
|
|
24
|
+
import pysam as pysam_types
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
import pybedtools
|
|
28
|
+
except Exception:
|
|
29
|
+
pybedtools = None # type: ignore
|
|
30
|
+
|
|
31
|
+
try:
|
|
32
|
+
import pyBigWig
|
|
33
|
+
except Exception:
|
|
34
|
+
pyBigWig = None # type: ignore
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
import pysam
|
|
38
|
+
except Exception:
|
|
39
|
+
pysam = None # type: ignore
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _require_pybedtools() -> "pybedtools_types":
    """Return the ``pybedtools`` module for the bedtools Python backend.

    The module-level ``pybedtools`` name is ``None`` when the guarded import
    at the top of this file failed. In that case the lookup is delegated to
    ``require`` and whatever it returns is handed back to the caller.
    """
    if pybedtools is None:
        return require("pybedtools", extra="pybedtools", purpose="bedtools Python backend")
    return pybedtools
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _require_pybigwig() -> "pybigwig_types":
    """Return the ``pyBigWig`` module for the BigWig Python backend.

    The module-level ``pyBigWig`` name is ``None`` when the guarded import at
    the top of this file failed. In that case the lookup is delegated to
    ``require`` and whatever it returns is handed back to the caller.
    """
    if pyBigWig is None:
        return require("pyBigWig", extra="pybigwig", purpose="BigWig Python backend")
    return pyBigWig
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _require_pysam() -> "pysam_types":
    """Return the ``pysam`` module.

    The module-level ``pysam`` name is ``None`` when the guarded import at the
    top of this file failed. In that case the lookup is delegated to
    ``require`` and whatever it returns is handed back to the caller.
    """
    if pysam is None:
        return require("pysam", extra="pysam", purpose="FASTA indexing")
    return pysam
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _resolve_backend(
|
|
61
|
+
backend: str | None, *, tool: str, python_available: bool, cli_name: str
|
|
62
|
+
) -> str:
|
|
63
|
+
choice = (backend or "auto").strip().lower()
|
|
64
|
+
if choice not in {"auto", "python", "cli"}:
|
|
65
|
+
raise ValueError(f"{tool}_backend must be one of: auto, python, cli")
|
|
66
|
+
if choice == "python":
|
|
67
|
+
if not python_available:
|
|
68
|
+
raise RuntimeError(
|
|
69
|
+
f"{tool}_backend=python requires the Python package to be installed."
|
|
70
|
+
)
|
|
71
|
+
return "python"
|
|
72
|
+
if choice == "cli":
|
|
73
|
+
if not shutil.which(cli_name):
|
|
74
|
+
raise RuntimeError(f"{tool}_backend=cli requires {cli_name} in PATH.")
|
|
75
|
+
return "cli"
|
|
76
|
+
if shutil.which(cli_name):
|
|
77
|
+
return "cli"
|
|
78
|
+
if python_available:
|
|
79
|
+
return "python"
|
|
80
|
+
raise RuntimeError(f"Neither Python nor CLI backend is available for {tool}.")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _read_chrom_sizes(chrom_sizes: Path) -> list[tuple[str, int]]:
|
|
84
|
+
sizes: list[tuple[str, int]] = []
|
|
85
|
+
with chrom_sizes.open() as f:
|
|
86
|
+
for line in f:
|
|
87
|
+
chrom, size = line.split()[:2]
|
|
88
|
+
sizes.append((chrom, int(size)))
|
|
89
|
+
return sizes
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _ensure_fasta_index(fasta: Path) -> Path:
|
|
93
|
+
fai = fasta.with_suffix(fasta.suffix + ".fai")
|
|
94
|
+
if fai.exists():
|
|
95
|
+
return fai
|
|
96
|
+
if shutil.which("samtools"):
|
|
97
|
+
cp = subprocess.run(
|
|
98
|
+
["samtools", "faidx", str(fasta)],
|
|
99
|
+
stdout=subprocess.DEVNULL,
|
|
100
|
+
stderr=subprocess.PIPE,
|
|
101
|
+
text=True,
|
|
102
|
+
)
|
|
103
|
+
if cp.returncode != 0:
|
|
104
|
+
raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
105
|
+
return fai
|
|
106
|
+
if pysam is not None:
|
|
107
|
+
pysam_mod = _require_pysam()
|
|
108
|
+
pysam_mod.faidx(str(fasta))
|
|
109
|
+
return fai
|
|
110
|
+
raise RuntimeError("FASTA indexing requires pysam or samtools in PATH.")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _ensure_chrom_sizes(fasta: Path) -> Path:
    """Return a chrom.sizes file for ``fasta``, deriving it from the FASTA index.

    The output path is the FASTA path with its suffix replaced by
    ``.chrom.sizes``. An existing file is reused without being rewritten.
    Otherwise the first two whitespace-separated fields of every ``.fai``
    line are copied out as ``<chrom>\\t<size>``.

    Args:
        fasta: Path to the FASTA file.

    Returns:
        Path of the chrom.sizes file.
    """
    # The index is ensured first, even when the sizes file already exists.
    index_path = _ensure_fasta_index(fasta)
    sizes_path = fasta.with_suffix(".chrom.sizes")
    if not sizes_path.exists():
        with index_path.open() as src, sizes_path.open("w") as dst:
            for record in src:
                name, length = record.split()[:2]
                dst.write(f"{name}\t{length}\n")
    return sizes_path
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _bed_to_bigwig(
|
|
126
|
+
fasta: str,
|
|
127
|
+
bed: str,
|
|
128
|
+
*,
|
|
129
|
+
bedtools_backend: str | None = "auto",
|
|
130
|
+
bigwig_backend: str | None = "auto",
|
|
131
|
+
) -> str:
|
|
19
132
|
"""
|
|
20
133
|
BED → bedGraph → bigWig
|
|
21
134
|
Requires:
|
|
@@ -26,44 +139,75 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
|
|
|
26
139
|
fa = Path(fasta) # path to .fa
|
|
27
140
|
parent = bed.parent
|
|
28
141
|
stem = bed.stem
|
|
29
|
-
|
|
30
|
-
fai = parent / f"{fa_stem}.fai"
|
|
142
|
+
chrom_sizes = _ensure_chrom_sizes(fa)
|
|
31
143
|
|
|
32
144
|
bedgraph = parent / f"{stem}.bedgraph"
|
|
33
145
|
bigwig = parent / f"{stem}.bw"
|
|
34
146
|
|
|
35
147
|
# 1) Compute coverage → bedGraph
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
148
|
+
bedtools_choice = _resolve_backend(
|
|
149
|
+
bedtools_backend,
|
|
150
|
+
tool="bedtools",
|
|
151
|
+
python_available=pybedtools is not None,
|
|
152
|
+
cli_name="bedtools",
|
|
153
|
+
)
|
|
154
|
+
if bedtools_choice == "python":
|
|
155
|
+
logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
|
|
156
|
+
pybedtools_mod = _require_pybedtools()
|
|
157
|
+
bt = pybedtools_mod.BedTool(str(bed))
|
|
158
|
+
# bedtools genomecov -bg
|
|
159
|
+
coverage = bt.genome_coverage(bg=True, genome=str(chrom_sizes))
|
|
160
|
+
coverage.saveas(str(bedgraph))
|
|
161
|
+
else:
|
|
162
|
+
if not shutil.which("bedtools"):
|
|
163
|
+
raise RuntimeError("bedtools is required but not available in PATH.")
|
|
164
|
+
cmd = [
|
|
165
|
+
"bedtools",
|
|
166
|
+
"genomecov",
|
|
167
|
+
"-i",
|
|
168
|
+
str(bed),
|
|
169
|
+
"-g",
|
|
170
|
+
str(chrom_sizes),
|
|
171
|
+
"-bg",
|
|
172
|
+
]
|
|
173
|
+
logger.debug("[bedtools] generating coverage bedgraph: %s", " ".join(cmd))
|
|
174
|
+
with bedgraph.open("w") as out:
|
|
175
|
+
cp = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
|
|
176
|
+
if cp.returncode != 0:
|
|
177
|
+
raise RuntimeError(f"bedtools genomecov failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
41
178
|
|
|
42
179
|
# 2) Convert bedGraph → BigWig via pyBigWig
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
180
|
+
bigwig_choice = _resolve_backend(
|
|
181
|
+
bigwig_backend,
|
|
182
|
+
tool="bigwig",
|
|
183
|
+
python_available=pyBigWig is not None,
|
|
184
|
+
cli_name="bedGraphToBigWig",
|
|
185
|
+
)
|
|
186
|
+
if bigwig_choice == "python":
|
|
187
|
+
logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
|
|
188
|
+
pybigwig_mod = _require_pybigwig()
|
|
189
|
+
bw = pybigwig_mod.open(str(bigwig), "w")
|
|
190
|
+
bw.addHeader(_read_chrom_sizes(chrom_sizes))
|
|
191
|
+
|
|
192
|
+
with bedgraph.open() as f:
|
|
193
|
+
for line in f:
|
|
194
|
+
chrom, start, end, coverage = line.strip().split()
|
|
195
|
+
bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
|
|
196
|
+
|
|
197
|
+
bw.close()
|
|
198
|
+
else:
|
|
199
|
+
if not shutil.which("bedGraphToBigWig"):
|
|
200
|
+
raise RuntimeError("bedGraphToBigWig is required but not available in PATH.")
|
|
201
|
+
cmd = ["bedGraphToBigWig", str(bedgraph), str(chrom_sizes), str(bigwig)]
|
|
202
|
+
logger.debug("[bedGraphToBigWig] converting bedgraph → bigwig: %s", " ".join(cmd))
|
|
203
|
+
cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
|
204
|
+
if cp.returncode != 0:
|
|
205
|
+
raise RuntimeError(f"bedGraphToBigWig failed (exit {cp.returncode}):\n{cp.stderr}")
|
|
206
|
+
|
|
207
|
+
logger.debug(f"BigWig written: {bigwig}")
|
|
65
208
|
return str(bigwig)
|
|
66
209
|
|
|
210
|
+
|
|
67
211
|
def _plot_bed_histograms(
|
|
68
212
|
bed_file,
|
|
69
213
|
plotting_directory,
|
|
@@ -71,9 +215,9 @@ def _plot_bed_histograms(
|
|
|
71
215
|
*,
|
|
72
216
|
bins=60,
|
|
73
217
|
clip_quantiles=(0.0, 0.995),
|
|
74
|
-
cov_bin_size=1000,
|
|
75
|
-
rows_per_fig=6,
|
|
76
|
-
include_mapq_quality=True,
|
|
218
|
+
cov_bin_size=1000, # coverage bin size in bp
|
|
219
|
+
rows_per_fig=6, # paginate if many chromosomes
|
|
220
|
+
include_mapq_quality=True, # add MAPQ + avg read quality columns to grid
|
|
77
221
|
coordinate_mode="one_based", # "one_based" (your BED-like) or "zero_based"
|
|
78
222
|
):
|
|
79
223
|
"""
|
|
@@ -110,22 +254,35 @@ def _plot_bed_histograms(
|
|
|
110
254
|
coordinate_mode : {"one_based","zero_based"}
|
|
111
255
|
One-based, inclusive (your file) vs BED-standard zero-based, half-open.
|
|
112
256
|
"""
|
|
257
|
+
plt = require("matplotlib.pyplot", extra="plotting", purpose="plotting BED histograms")
|
|
258
|
+
|
|
113
259
|
os.makedirs(plotting_directory, exist_ok=True)
|
|
114
260
|
|
|
115
261
|
bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
|
|
116
|
-
|
|
262
|
+
logger.debug(f"[plot_bed_histograms] Loading: {bed_file}")
|
|
117
263
|
|
|
118
264
|
# Load BED-like table
|
|
119
|
-
cols = [
|
|
120
|
-
df = pd.read_csv(
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
265
|
+
cols = ["chrom", "start", "end", "read_len", "qname", "mapq", "avg_q"]
|
|
266
|
+
df = pd.read_csv(
|
|
267
|
+
bed_file,
|
|
268
|
+
sep="\t",
|
|
269
|
+
header=None,
|
|
270
|
+
names=cols,
|
|
271
|
+
dtype={
|
|
272
|
+
"chrom": str,
|
|
273
|
+
"start": int,
|
|
274
|
+
"end": int,
|
|
275
|
+
"read_len": int,
|
|
276
|
+
"qname": str,
|
|
277
|
+
"mapq": float,
|
|
278
|
+
"avg_q": float,
|
|
279
|
+
},
|
|
280
|
+
)
|
|
124
281
|
|
|
125
282
|
# Drop unaligned records (chrom == '*') if present
|
|
126
|
-
df = df[df[
|
|
283
|
+
df = df[df["chrom"] != "*"].copy()
|
|
127
284
|
if df.empty:
|
|
128
|
-
|
|
285
|
+
logger.debug("[plot_bed_histograms] No aligned reads found; nothing to plot.")
|
|
129
286
|
return
|
|
130
287
|
|
|
131
288
|
# Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
|
|
@@ -135,15 +292,16 @@ def _plot_bed_histograms(
|
|
|
135
292
|
|
|
136
293
|
if coordinate_mode == "one_based":
|
|
137
294
|
# convert to 0-based half-open [start0, end0)
|
|
138
|
-
start0 = df[
|
|
139
|
-
end0
|
|
295
|
+
start0 = df["start"].to_numpy() - 1
|
|
296
|
+
end0 = df["end"].to_numpy() # inclusive in input -> +1 already handled by not subtracting
|
|
140
297
|
else:
|
|
141
298
|
# already 0-based half-open (assumption)
|
|
142
|
-
start0 = df[
|
|
143
|
-
end0
|
|
299
|
+
start0 = df["start"].to_numpy()
|
|
300
|
+
end0 = df["end"].to_numpy()
|
|
144
301
|
|
|
145
302
|
# Clip helper for hist tails
|
|
146
303
|
def _clip_series(s, q=(0.0, 0.995)):
|
|
304
|
+
"""Clip a Series to quantile bounds for plotting."""
|
|
147
305
|
if q is None:
|
|
148
306
|
return s.to_numpy()
|
|
149
307
|
lo = s.quantile(q[0]) if q[0] is not None else s.min()
|
|
@@ -152,47 +310,48 @@ def _plot_bed_histograms(
|
|
|
152
310
|
return np.clip(x, lo, hi)
|
|
153
311
|
|
|
154
312
|
# Load chromosome order/lengths from FASTA
|
|
155
|
-
|
|
313
|
+
pysam_mod = _require_pysam()
|
|
314
|
+
with pysam_mod.FastaFile(fasta) as fa:
|
|
156
315
|
ref_names = list(fa.references)
|
|
157
316
|
ref_lengths = dict(zip(ref_names, fa.lengths))
|
|
158
317
|
|
|
159
318
|
# Keep only chroms present in FASTA and with at least one read
|
|
160
|
-
chroms = [c for c in df[
|
|
319
|
+
chroms = [c for c in df["chrom"].unique() if c in ref_lengths]
|
|
161
320
|
# Order chromosomes by FASTA order
|
|
162
321
|
chrom_order = [c for c in ref_names if c in chroms]
|
|
163
322
|
|
|
164
323
|
if not chrom_order:
|
|
165
|
-
|
|
324
|
+
logger.debug(
|
|
325
|
+
"[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting."
|
|
326
|
+
)
|
|
166
327
|
return
|
|
167
328
|
|
|
168
329
|
# Pagination
|
|
169
330
|
def _sanitize(name: str) -> str:
|
|
331
|
+
"""Sanitize a string for use in filenames."""
|
|
170
332
|
return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
|
|
171
333
|
|
|
172
334
|
cols_per_fig = 4 if include_mapq_quality else 2
|
|
173
335
|
|
|
174
336
|
for start_idx in range(0, len(chrom_order), rows_per_fig):
|
|
175
|
-
chunk = chrom_order[start_idx:start_idx + rows_per_fig]
|
|
337
|
+
chunk = chrom_order[start_idx : start_idx + rows_per_fig]
|
|
176
338
|
nrows = len(chunk)
|
|
177
339
|
ncols = cols_per_fig
|
|
178
340
|
|
|
179
341
|
fig, axes = plt.subplots(
|
|
180
|
-
nrows=nrows, ncols=ncols,
|
|
181
|
-
figsize=(4.0 * ncols, 2.6 * nrows),
|
|
182
|
-
dpi=160,
|
|
183
|
-
squeeze=False
|
|
342
|
+
nrows=nrows, ncols=ncols, figsize=(4.0 * ncols, 2.6 * nrows), dpi=160, squeeze=False
|
|
184
343
|
)
|
|
185
344
|
|
|
186
345
|
for r, chrom in enumerate(chunk):
|
|
187
346
|
chrom_len = ref_lengths[chrom]
|
|
188
|
-
mask =
|
|
347
|
+
mask = df["chrom"].to_numpy() == chrom
|
|
189
348
|
|
|
190
349
|
# Slice per-chrom arrays for speed
|
|
191
350
|
s0 = start0[mask]
|
|
192
351
|
e0 = end0[mask]
|
|
193
|
-
len_arr = df.loc[mask,
|
|
194
|
-
mapq_arr = df.loc[mask,
|
|
195
|
-
q_arr = df.loc[mask,
|
|
352
|
+
len_arr = df.loc[mask, "read_len"]
|
|
353
|
+
mapq_arr = df.loc[mask, "mapq"]
|
|
354
|
+
q_arr = df.loc[mask, "avg_q"]
|
|
196
355
|
|
|
197
356
|
# --- Col 1: Read length histogram (clipped) ---
|
|
198
357
|
ax = axes[r, 0]
|
|
@@ -222,7 +381,7 @@ def _plot_bed_histograms(
|
|
|
222
381
|
|
|
223
382
|
# Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
|
|
224
383
|
for lo, hi in zip(b_lo, b_hi):
|
|
225
|
-
cov[lo:hi + 1] += 1
|
|
384
|
+
cov[lo : hi + 1] += 1
|
|
226
385
|
|
|
227
386
|
x_mid = (edges[:-1] + edges[1:]) / 2.0
|
|
228
387
|
ax.plot(x_mid, cov)
|
|
@@ -237,7 +396,12 @@ def _plot_bed_histograms(
|
|
|
237
396
|
# --- Col 3: MAPQ ---
|
|
238
397
|
ax = axes[r, 2]
|
|
239
398
|
# Clip MAPQ upper tail if needed (usually 60)
|
|
240
|
-
ax.hist(
|
|
399
|
+
ax.hist(
|
|
400
|
+
_clip_series(mapq_arr.fillna(0), clip_quantiles),
|
|
401
|
+
bins=bins,
|
|
402
|
+
edgecolor="black",
|
|
403
|
+
alpha=0.7,
|
|
404
|
+
)
|
|
241
405
|
if r == 0:
|
|
242
406
|
ax.set_title("MAPQ")
|
|
243
407
|
ax.set_xlabel("MAPQ")
|
|
@@ -245,7 +409,12 @@ def _plot_bed_histograms(
|
|
|
245
409
|
|
|
246
410
|
# --- Col 4: Avg base quality ---
|
|
247
411
|
ax = axes[r, 3]
|
|
248
|
-
ax.hist(
|
|
412
|
+
ax.hist(
|
|
413
|
+
_clip_series(q_arr.fillna(np.nan), clip_quantiles),
|
|
414
|
+
bins=bins,
|
|
415
|
+
edgecolor="black",
|
|
416
|
+
alpha=0.7,
|
|
417
|
+
)
|
|
249
418
|
if r == 0:
|
|
250
419
|
ax.set_title("Avg base qual")
|
|
251
420
|
ax.set_xlabel("Phred")
|
|
@@ -254,7 +423,8 @@ def _plot_bed_histograms(
|
|
|
254
423
|
fig.suptitle(
|
|
255
424
|
f"{bed_basename} — per-chromosome QC "
|
|
256
425
|
f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
|
|
257
|
-
y=0.995,
|
|
426
|
+
y=0.995,
|
|
427
|
+
fontsize=11,
|
|
258
428
|
)
|
|
259
429
|
fig.tight_layout(rect=[0, 0, 1, 0.98])
|
|
260
430
|
|
|
@@ -263,9 +433,20 @@ def _plot_bed_histograms(
|
|
|
263
433
|
plt.savefig(out_png, bbox_inches="tight")
|
|
264
434
|
plt.close(fig)
|
|
265
435
|
|
|
266
|
-
|
|
436
|
+
logger.debug("[plot_bed_histograms] Done.")
|
|
267
437
|
|
|
268
|
-
|
|
438
|
+
|
|
439
|
+
def aligned_BAM_to_bed(
|
|
440
|
+
aligned_BAM,
|
|
441
|
+
out_dir,
|
|
442
|
+
fasta,
|
|
443
|
+
make_bigwigs,
|
|
444
|
+
threads=None,
|
|
445
|
+
*,
|
|
446
|
+
samtools_backend: str | None = "auto",
|
|
447
|
+
bedtools_backend: str | None = "auto",
|
|
448
|
+
bigwig_backend: str | None = "auto",
|
|
449
|
+
):
|
|
269
450
|
"""
|
|
270
451
|
Takes an aligned BAM as input and writes a BED file of reads as output.
|
|
271
452
|
Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
|
|
@@ -287,60 +468,121 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
287
468
|
bed_dir = out_dir / "beds"
|
|
288
469
|
make_dirs([plotting_dir, bed_dir])
|
|
289
470
|
|
|
290
|
-
bed_output = bed_dir /
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
471
|
+
bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
|
|
472
|
+
|
|
473
|
+
logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
|
|
474
|
+
|
|
475
|
+
backend_choice = _resolve_backend(
|
|
476
|
+
samtools_backend,
|
|
477
|
+
tool="samtools",
|
|
478
|
+
python_available=pysam is not None,
|
|
479
|
+
cli_name="samtools",
|
|
480
|
+
)
|
|
481
|
+
with open(bed_output, "w") as out:
|
|
482
|
+
if backend_choice == "python":
|
|
483
|
+
pysam_mod = _require_pysam()
|
|
484
|
+
with pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam:
|
|
485
|
+
for read in bam.fetch(until_eof=True):
|
|
486
|
+
if read.is_unmapped:
|
|
487
|
+
chrom = "*"
|
|
488
|
+
start1 = 1
|
|
489
|
+
rl = read.query_length or 0
|
|
490
|
+
mapq = 0
|
|
491
|
+
else:
|
|
492
|
+
chrom = bam.get_reference_name(read.reference_id)
|
|
493
|
+
# pysam reference_start is 0-based → +1 for 1-based SAM-like start
|
|
494
|
+
start1 = int(read.reference_start) + 1
|
|
495
|
+
rl = read.query_length or 0
|
|
496
|
+
mapq = int(read.mapping_quality)
|
|
497
|
+
|
|
498
|
+
# End position in 1-based inclusive coords
|
|
499
|
+
end1 = start1 + (rl or 0) - 1
|
|
500
|
+
|
|
501
|
+
qname = read.query_name
|
|
502
|
+
quals = read.query_qualities
|
|
503
|
+
if quals is None or rl == 0:
|
|
504
|
+
avg_q = float("nan")
|
|
505
|
+
else:
|
|
506
|
+
avg_q = float(np.mean(quals))
|
|
507
|
+
|
|
508
|
+
out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
|
|
509
|
+
else:
|
|
510
|
+
samtools_view = subprocess.Popen(
|
|
511
|
+
["samtools", "view", str(aligned_BAM)],
|
|
512
|
+
stdout=subprocess.PIPE,
|
|
513
|
+
stderr=subprocess.PIPE,
|
|
514
|
+
text=True,
|
|
515
|
+
)
|
|
516
|
+
assert samtools_view.stdout is not None
|
|
517
|
+
for line in samtools_view.stdout:
|
|
518
|
+
if not line.strip():
|
|
519
|
+
continue
|
|
520
|
+
fields = line.rstrip("\n").split("\t")
|
|
521
|
+
if len(fields) < 11:
|
|
522
|
+
continue
|
|
523
|
+
qname = fields[0]
|
|
524
|
+
flag = int(fields[1])
|
|
525
|
+
chrom = fields[2]
|
|
526
|
+
pos = int(fields[3])
|
|
527
|
+
mapq = int(fields[4])
|
|
528
|
+
seq = fields[9]
|
|
529
|
+
qual = fields[10]
|
|
530
|
+
rl = 0 if seq == "*" else len(seq)
|
|
531
|
+
is_unmapped = bool(flag & 0x4) or chrom == "*"
|
|
532
|
+
if is_unmapped:
|
|
533
|
+
chrom = "*"
|
|
534
|
+
start1 = 1
|
|
535
|
+
mapq = 0
|
|
536
|
+
else:
|
|
537
|
+
start1 = pos
|
|
538
|
+
end1 = start1 + (rl or 0) - 1
|
|
539
|
+
if qual == "*" or rl == 0:
|
|
540
|
+
avg_q = float("nan")
|
|
541
|
+
else:
|
|
542
|
+
avg_q = float(np.mean([ord(ch) - 33 for ch in qual]))
|
|
543
|
+
out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
|
|
544
|
+
rc = samtools_view.wait()
|
|
545
|
+
if rc != 0:
|
|
546
|
+
stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
|
|
547
|
+
raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
|
|
548
|
+
|
|
549
|
+
logger.debug(f"BED-like file created: {bed_output}")
|
|
321
550
|
|
|
322
551
|
def split_bed(bed):
    """Split a BED-like file into aligned and unaligned reads (chrom == '*').

    Lines starting with ``*`` followed by a tab are written to
    ``<stem>_unaligned.bed``; every other line goes to ``<stem>_aligned.bed``.
    The input file is removed once both outputs are written.

    Args:
        bed: Path (``str`` or path-like) of the BED-like file to split.

    Returns:
        The path of the aligned output file, as ``str``.
    """
    bed = str(bed)
    # Rewrite only the trailing extension. A blanket str.replace(".bed", ...)
    # also mangles directory components containing ".bed", and for a path not
    # ending in ".bed" it yields output == input, truncating the input file.
    suffix = ".bed"
    stem = bed[: -len(suffix)] if bed.endswith(suffix) else bed
    aligned = f"{stem}_aligned.bed"
    unaligned = f"{stem}_unaligned.bed"
    with (
        open(bed, "r") as infile,
        open(aligned, "w") as aligned_out,
        open(unaligned, "w") as unaligned_out,
    ):
        for line in infile:
            target = unaligned_out if line.startswith("*\t") else aligned_out
            target.write(line)
    os.remove(bed)
    return aligned
|
|
332
565
|
|
|
333
|
-
|
|
566
|
+
logger.debug(f"Splitting: {bed_output}")
|
|
334
567
|
aligned_bed = split_bed(bed_output)
|
|
335
568
|
|
|
336
569
|
with ProcessPoolExecutor() as executor:
|
|
337
570
|
futures = []
|
|
338
571
|
futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
|
|
339
572
|
if make_bigwigs:
|
|
340
|
-
futures.append(
|
|
573
|
+
futures.append(
|
|
574
|
+
executor.submit(
|
|
575
|
+
_bed_to_bigwig,
|
|
576
|
+
fasta,
|
|
577
|
+
aligned_bed,
|
|
578
|
+
bedtools_backend=bedtools_backend,
|
|
579
|
+
bigwig_backend=bigwig_backend,
|
|
580
|
+
)
|
|
581
|
+
)
|
|
341
582
|
concurrent.futures.wait(futures)
|
|
342
583
|
|
|
343
|
-
|
|
584
|
+
logger.debug("Processing completed successfully.")
|
|
585
|
+
|
|
344
586
|
|
|
345
587
|
def extract_read_lengths_from_bed(file_path):
|
|
346
588
|
"""
|
|
@@ -352,15 +594,16 @@ def extract_read_lengths_from_bed(file_path):
|
|
|
352
594
|
read_dict (dict)
|
|
353
595
|
"""
|
|
354
596
|
import pandas as pd
|
|
355
|
-
|
|
356
|
-
|
|
597
|
+
|
|
598
|
+
columns = ["chrom", "start", "end", "length", "name"]
|
|
599
|
+
df = pd.read_csv(file_path, sep="\t", header=None, names=columns, comment="#")
|
|
357
600
|
read_dict = {}
|
|
358
601
|
for _, row in df.iterrows():
|
|
359
|
-
chrom = row[
|
|
360
|
-
start = row[
|
|
361
|
-
end = row[
|
|
362
|
-
name = row[
|
|
363
|
-
length = row[
|
|
602
|
+
chrom = row["chrom"]
|
|
603
|
+
start = row["start"]
|
|
604
|
+
end = row["end"]
|
|
605
|
+
name = row["name"]
|
|
606
|
+
length = row["length"]
|
|
364
607
|
read_dict[name] = length
|
|
365
608
|
|
|
366
|
-
return read_dict
|
|
609
|
+
return read_dict
|