smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py
CHANGED
@@ -1,20 +1,52 @@
 """smftools"""

+from __future__ import annotations
+
 import logging
 import warnings
+from importlib import import_module
 from importlib.metadata import version
+from typing import TYPE_CHECKING

-from . import
-from . import informatics as inform
-from . import machine_learning as ml
-from . import plotting as pl
-from . import preprocessing as pp
-from . import tools as tl
-from .readwrite import adata_to_df, merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad
+from .readwrite import adata_to_df, safe_read_h5ad, safe_write_h5ad

 package_name = "smftools"
 __version__ = version(package_name)

+if TYPE_CHECKING:
+    from smftools import (
+        cli,
+        config,
+        datasets,
+        hmm,
+        informatics,
+        machine_learning,
+        plotting,
+        preprocessing,
+        tools,
+    )
+
+_LAZY_MODULES = {
+    "cli": "smftools.cli",
+    "config": "smftools.config",
+    "datasets": "smftools.datasets",
+    "hmm": "smftools.hmm",
+    "inform": "smftools.informatics",
+    "ml": "smftools.machine_learning",
+    "pl": "smftools.plotting",
+    "pp": "smftools.preprocessing",
+    "tl": "smftools.tools",
+}
+
+
+def __getattr__(name: str):
+    if name in _LAZY_MODULES:
+        module = import_module(_LAZY_MODULES[name])
+        globals()[name] = module
+        return module
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
 __all__ = [
     "adata_to_df",
     "inform",
smftools/_settings.py
CHANGED
smftools/_version.py
CHANGED
smftools/cli/__init__.py
CHANGED
@@ -0,0 +1 @@
+from __future__ import annotations
smftools/cli/helpers.py
CHANGED
@@ -1,8 +1,12 @@
+from __future__ import annotations
+
 from dataclasses import dataclass
 from pathlib import Path

 import anndata as ad

+from smftools.constants import H5_DIR, HMM_DIR, LATENT_DIR, LOAD_DIR, PREPROCESS_DIR, SPATIAL_DIR
+
 from ..metadata import write_runtime_schema_yaml
 from ..readwrite import safe_write_h5ad

@@ -14,28 +18,35 @@ class AdataPaths:
     pp_dedup: Path
     spatial: Path
     hmm: Path
+    latent: Path


 def get_adata_paths(cfg) -> AdataPaths:
     """
     Central helper: given cfg, compute all standard AnnData paths.
     """
-
+    output_directory = Path(cfg.output_directory)

-    raw =
+    raw = output_directory / LOAD_DIR / H5_DIR / f"{cfg.experiment_name}.h5ad.gz"

-    pp =
+    pp = output_directory / PREPROCESS_DIR / H5_DIR / f"{cfg.experiment_name}_preprocessed.h5ad.gz"

     if cfg.smf_modality == "direct":
         # direct SMF: duplicate-removed path is just preprocessed path
         pp_dedup = pp
     else:
-        pp_dedup =
+        pp_dedup = (
+            output_directory
+            / PREPROCESS_DIR
+            / H5_DIR
+            / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
+        )

     pp_dedup_base = pp_dedup.name.removesuffix(".h5ad.gz")

-    spatial =
-    hmm =
+    spatial = output_directory / SPATIAL_DIR / H5_DIR / f"{pp_dedup_base}_spatial.h5ad.gz"
+    hmm = output_directory / HMM_DIR / H5_DIR / f"{pp_dedup_base}_hmm.h5ad.gz"
+    latent = output_directory / LATENT_DIR / H5_DIR / f"{pp_dedup_base}_latent.h5ad.gz"

     return AdataPaths(
         raw=raw,
@@ -43,7 +54,24 @@ def get_adata_paths(cfg) -> AdataPaths:
         pp_dedup=pp_dedup,
         spatial=spatial,
         hmm=hmm,
+        latent=latent,
+    )
+
+
+def load_experiment_config(config_path: str):
+    """Load ExperimentConfig without invoking any pipeline stages."""
+    from datetime import datetime
+    from importlib import resources
+
+    from ..config import ExperimentConfig, LoadExperimentConfig
+
+    date_str = datetime.today().strftime("%y%m%d")
+    loader = LoadExperimentConfig(config_path)
+    defaults_dir = resources.files("smftools").joinpath("config")
+    cfg, _ = ExperimentConfig.from_var_dict(
+        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
     )
+    return cfg


 def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
smftools/cli/hmm_adata.py
CHANGED
@@ -1,25 +1,152 @@
 from __future__ import annotations

 import copy
+import logging
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union

 import numpy as np
-import torch

-from smftools.
+from smftools.constants import HMM_DIR, LOGGING_DIR
+from smftools.logging_utils import get_logger, setup_logging
+from smftools.optional_imports import require

 # FIX: import _to_dense_np to avoid NameError
 from ..hmm.HMM import _safe_int_coords, _to_dense_np, create_hmm, normalize_hmm_feature_sets

 logger = get_logger(__name__)

+if TYPE_CHECKING:
+    import torch as torch_types
+
+torch = require("torch", extra="torch", purpose="HMM CLI")
+mpl = require("matplotlib", extra="plotting", purpose="HMM plotting")
+mpl_colors = require("matplotlib.colors", extra="plotting", purpose="HMM plotting")
+
 # =============================================================================
 # Helpers: extracting training arrays
 # =============================================================================


+def _strip_hmm_layer_prefix(layer: str) -> str:
+    """Strip methbase prefixes and length suffixes from an HMM layer name.
+
+    Args:
+        layer: Full layer name (e.g., "GpC_small_accessible_patch_lengths").
+
+    Returns:
+        The base layer name without methbase prefixes or length suffixes.
+    """
+    base = layer
+    for prefix in ("Combined_", "GpC_", "CpG_", "C_", "A_"):
+        if base.startswith(prefix):
+            base = base[len(prefix) :]
+            break
+    if base.endswith("_lengths"):
+        base = base[: -len("_lengths")]
+    if base.endswith("_merged"):
+        base = base[: -len("_merged")]
+    return base
+
+
+def _resolve_feature_colormap(layer: str, cfg, default_cmap: str) -> Any:
+    """Resolve a colormap for a given HMM layer.
+
+    Args:
+        layer: Full layer name.
+        cfg: Experiment config.
+        default_cmap: Fallback colormap name.
+
+    Returns:
+        A matplotlib colormap or colormap name.
+    """
+    feature_maps = getattr(cfg, "hmm_feature_colormaps", {}) or {}
+    if not isinstance(feature_maps, dict):
+        feature_maps = {}
+
+    base = _strip_hmm_layer_prefix(layer)
+    value = feature_maps.get(layer, feature_maps.get(base))
+    if value is None:
+        return default_cmap
+
+    if isinstance(value, (list, tuple)):
+        return mpl_colors.ListedColormap(list(value))
+
+    if isinstance(value, str):
+        try:
+            mpl.colormaps.get_cmap(value)
+            return value
+        except Exception:
+            return mpl_colors.LinearSegmentedColormap.from_list(
+                f"hmm_{base}_cmap", ["#ffffff", value]
+            )
+
+    return default_cmap
+
+
+def _resolve_feature_color(layer: str, cfg, fallback_cmap: str, idx: int, total: int) -> Any:
+    """Resolve a line color for a given HMM layer."""
+    feature_maps = getattr(cfg, "hmm_feature_colormaps", {}) or {}
+    if not isinstance(feature_maps, dict):
+        feature_maps = {}
+
+    base = _strip_hmm_layer_prefix(layer)
+    value = feature_maps.get(layer, feature_maps.get(base))
+    if isinstance(value, str):
+        try:
+            mpl.colormaps.get_cmap(value)
+        except Exception:
+            return value
+        return mpl.colormaps.get_cmap(value)(0.75)
+    if isinstance(value, (list, tuple)) and value:
+        return value[-1]
+
+    cmap_obj = mpl.colormaps.get_cmap(fallback_cmap)
+    if total <= 1:
+        return cmap_obj(0.5)
+    return cmap_obj(idx / (total - 1))
+
+
+def _resolve_length_feature_ranges(
+    layer: str, cfg, default_cmap: str
+) -> List[Tuple[int, int, Any]]:
+    """Resolve length-based feature ranges to colors for size contour overlays."""
+    base = _strip_hmm_layer_prefix(layer)
+    feature_sets = getattr(cfg, "hmm_feature_sets", {}) or {}
+    if not isinstance(feature_sets, dict):
+        return []
+
+    feature_key = None
+    if "accessible" in base:
+        feature_key = "accessible"
+    elif "footprint" in base:
+        feature_key = "footprint"
+
+    if feature_key is None:
+        return []
+
+    features = feature_sets.get(feature_key, {}).get("features", {})
+    if not isinstance(features, dict):
+        return []
+
+    ranges: List[Tuple[int, int, Any]] = []
+    for feature_name, bounds in features.items():
+        if not isinstance(bounds, (list, tuple)) or len(bounds) != 2:
+            continue
+        min_len, max_len = bounds
+        if max_len is None or (isinstance(max_len, (float, int)) and np.isinf(max_len)):
+            max_len = int(1e9)
+        try:
+            min_len_int = int(min_len)
+            max_len_int = int(max_len)
+        except (TypeError, ValueError):
+            continue
+        color = _resolve_feature_color(feature_name, cfg, default_cmap, 0, 1)
+        ranges.append((min_len_int, max_len_int, color))
+    return ranges
+
+
 def _get_training_matrix(
     subset, cols_mask: np.ndarray, smf_modality: Optional[str], cfg
 ) -> Tuple[np.ndarray, Optional[str]]:
@@ -440,31 +567,25 @@ def hmm_adata(config_path: str):
     - Call hmm_adata_core(cfg, adata, paths)
     """
     from ..readwrite import safe_read_h5ad
-    from .helpers import get_adata_paths
-    from .load_adata import load_adata
-    from .preprocess_adata import preprocess_adata
-    from .spatial_adata import spatial_adata
+    from .helpers import get_adata_paths, load_experiment_config

     # 1) load cfg / stage paths
-
-    paths = get_adata_paths(cfg)
+    cfg = load_experiment_config(config_path)

-
-    preprocess_adata(config_path)
-    spatial_ad, spatial_path = spatial_adata(config_path)
+    paths = get_adata_paths(cfg)

-    #
+    # 2) choose starting AnnData
     # Prefer:
     # - existing HMM h5ad if not forcing redo
     # - in-memory spatial_ad from wrapper call
     # - saved spatial / pp_dedup / pp / raw on disk
     if paths.hmm.exists() and not (cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply):
-
-        return
+        logger.debug(f"Skipping hmm. HMM AnnData found: {paths.hmm}")
+        return None

-    if
-        adata =
-        source_path =
+    if paths.hmm.exists():
+        adata, _ = safe_read_h5ad(paths.hmm)
+        source_path = paths.hmm
     elif paths.spatial.exists():
         adata, _ = safe_read_h5ad(paths.spatial)
         source_path = paths.spatial
@@ -511,11 +632,14 @@ def hmm_adata_core(
     Does NOT decide which h5ad to start from – that is the wrapper's job.
     """

+    from datetime import datetime
+
     import numpy as np

     from ..hmm import call_hmm_peaks
     from ..metadata import record_smftools_metadata
     from ..plotting import (
+        combined_hmm_length_clustermap,
         combined_hmm_raw_clustermap,
         plot_hmm_layers_rolling_by_sample_ref,
         plot_hmm_size_contours,
@@ -523,18 +647,33 @@ def hmm_adata_core(
     from ..readwrite import make_dirs
     from .helpers import write_gz_h5ad

+    date_str = datetime.today().strftime("%y%m%d")
+    now = datetime.now()
+    time_str = now.strftime("%H%M%S")
+
+    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
+
     smf_modality = cfg.smf_modality
     deaminase = smf_modality == "deaminase"

     output_directory = Path(cfg.output_directory)
-
+    hmm_directory = output_directory / HMM_DIR
+    logging_directory = hmm_directory / LOGGING_DIR
+
+    make_dirs([output_directory, hmm_directory])
+
+    if cfg.emit_log_file:
+        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
+        make_dirs([logging_directory])
+    else:
+        log_file = None

-
+    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

     # ---------------------------- HMM annotate stage ----------------------------
     if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
-        hmm_models_dir =
-        make_dirs([
+        hmm_models_dir = hmm_directory / "10_hmm_models"
+        make_dirs([hmm_directory, hmm_models_dir])

         # Standard bookkeeping
         uns_key = "hmm_appended_layers"
@@ -738,6 +877,8 @@ def hmm_adata_core(
             uns_key=uns_key,
             uns_flag="hmm_annotated_combined",
             force_redo=force_apply,
+            mask_to_read_span=True,
+            mask_use_original_var_names=True,
         )

         for core_layer, dist in (
@@ -850,11 +991,11 @@ def hmm_adata_core(
     logger.info(f"HMM appended layers: {hmm_layers}")

     # ---------------------------- HMM peak calling stage ----------------------------
-    hmm_dir =
+    hmm_dir = hmm_directory / "11_hmm_peak_calling"
     if hmm_dir.is_dir():
         pass
     else:
-        make_dirs([
+        make_dirs([hmm_directory, hmm_dir])

     call_hmm_peaks(
         adata,
@@ -883,8 +1024,8 @@ def hmm_adata_core(

     ############################################### HMM based feature plotting ###############################################

-    hmm_dir =
-    make_dirs([
+    hmm_dir = hmm_directory / "12_hmm_clustermaps"
+    make_dirs([hmm_directory, hmm_dir])

     layers: list[str] = []

@@ -909,6 +1050,7 @@ def hmm_adata_core(
             pass
         else:
             make_dirs([hmm_cluster_save_dir])
+        hmm_cmap = _resolve_feature_colormap(layer, cfg, cfg.clustermap_cmap_hmm)

         combined_hmm_raw_clustermap(
             adata,
@@ -919,7 +1061,7 @@ def hmm_adata_core(
             layer_cpg=cfg.layer_for_clustermap_plotting,
             layer_c=cfg.layer_for_clustermap_plotting,
             layer_a=cfg.layer_for_clustermap_plotting,
-            cmap_hmm=
+            cmap_hmm=hmm_cmap,
             cmap_gpc=cfg.clustermap_cmap_gpc,
             cmap_cpg=cfg.clustermap_cmap_cpg,
             cmap_c=cfg.clustermap_cmap_c,
@@ -930,7 +1072,7 @@ def hmm_adata_core(
                 0
             ],
             min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
-            demux_types=
+            demux_types=cfg.clustermap_demux_types_to_plot,
             save_path=hmm_cluster_save_dir,
             normalize_hmm=False,
             sort_by=cfg.hmm_clustermap_sortby,  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
@@ -940,12 +1082,68 @@ def hmm_adata_core(
             index_col_suffix=cfg.reindexed_var_suffix,
         )

-
+    hmm_length_dir = hmm_directory / "12b_hmm_length_clustermaps"
+    make_dirs([hmm_directory, hmm_length_dir])
+
+    length_layers: list[str] = []
+    length_layer_roots = list(
+        getattr(cfg, "hmm_clustermap_length_layers", cfg.hmm_clustermap_feature_layers)
+    )
+
+    for base in cfg.hmm_methbases:
+        length_layers.extend([f"{base}_{layer}_lengths" for layer in length_layer_roots])
+
+    if getattr(cfg, "hmm_run_multichannel", True) and len(cfg.hmm_methbases) >= 2:
+        length_layers.extend([f"Combined_{layer}_lengths" for layer in length_layer_roots])
+
+    if cfg.cpg:
+        length_layers.extend(["CpG_cpg_patch_lengths"])
+
+    for layer in length_layers:
+        hmm_cluster_save_dir = hmm_length_dir / layer
+        if hmm_cluster_save_dir.is_dir():
+            pass
+        else:
+            make_dirs([hmm_cluster_save_dir])
+        length_cmap = _resolve_feature_colormap(layer, cfg, "Greens")
+        length_feature_ranges = _resolve_length_feature_ranges(layer, cfg, "Greens")
+
+        combined_hmm_length_clustermap(
+            adata,
+            sample_col=cfg.sample_name_col_for_plotting,
+            reference_col=cfg.reference_column,
+            length_layer=layer,
+            layer_gpc=cfg.layer_for_clustermap_plotting,
+            layer_cpg=cfg.layer_for_clustermap_plotting,
+            layer_c=cfg.layer_for_clustermap_plotting,
+            layer_a=cfg.layer_for_clustermap_plotting,
+            cmap_lengths=length_cmap,
+            cmap_gpc=cfg.clustermap_cmap_gpc,
+            cmap_cpg=cfg.clustermap_cmap_cpg,
+            cmap_c=cfg.clustermap_cmap_c,
+            cmap_a=cfg.clustermap_cmap_a,
+            min_quality=cfg.read_quality_filter_thresholds[0],
+            min_length=cfg.read_len_filter_thresholds[0],
+            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
+                0
+            ],
+            min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
+            demux_types=cfg.clustermap_demux_types_to_plot,
+            save_path=hmm_cluster_save_dir,
+            sort_by=cfg.hmm_clustermap_sortby,
+            bins=None,
+            deaminase=deaminase,
+            min_signal=0,
+            index_col_suffix=cfg.reindexed_var_suffix,
+            length_feature_ranges=length_feature_ranges,
+        )
+
+    hmm_dir = hmm_directory / "13_hmm_bulk_traces"

     if hmm_dir.is_dir():
         logger.debug(f"{hmm_dir} already exists.")
     else:
-        make_dirs([
+        make_dirs([hmm_directory, hmm_dir])
     from ..plotting import plot_hmm_layers_rolling_by_sample_ref

     bulk_hmm_layers = [
@@ -953,6 +1151,10 @@ def hmm_adata_core(
         for layer in hmm_layers
         if not any(s in layer for s in ("_lengths", "_states", "_posterior"))
     ]
+    layer_colors = {
+        layer: _resolve_feature_color(layer, cfg, "tab20", idx, len(bulk_hmm_layers))
+        for idx, layer in enumerate(bulk_hmm_layers)
+    }
     saved = plot_hmm_layers_rolling_by_sample_ref(
         adata,
         layers=bulk_hmm_layers,
@@ -964,14 +1166,15 @@ def hmm_adata_core(
         output_dir=hmm_dir,
         save=True,
         show_raw=False,
+        layer_colors=layer_colors,
     )

-    hmm_dir =
+    hmm_dir = hmm_directory / "14_hmm_fragment_distributions"

     if hmm_dir.is_dir():
         logger.debug(f"{hmm_dir} already exists.")
     else:
-        make_dirs([
+        make_dirs([hmm_directory, hmm_dir])
     from ..plotting import plot_hmm_size_contours

     if smf_modality == "deaminase":
@@ -996,6 +1199,8 @@ def hmm_adata_core(
     for layer, max in fragments:
         save_path = hmm_dir / layer
         make_dirs([save_path])
+        layer_cmap = _resolve_feature_colormap(layer, cfg, "Greens")
+        feature_ranges = _resolve_length_feature_ranges(layer, cfg, "Greens")

         figs = plot_hmm_size_contours(
             adata,
@@ -1011,8 +1216,9 @@ def hmm_adata_core(
             dpi=200,
             smoothing_sigma=(10, 10),
             normalize_after_smoothing=True,
-            cmap=
+            cmap=layer_cmap,
             log_scale_z=True,
+            feature_ranges=tuple(feature_ranges),
         )
         ########################################################################################################################

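The per-feature colormap logic added to hmm_adata.py keys cfg.hmm_feature_colormaps off a normalized layer name produced by _strip_hmm_layer_prefix. A hedged sketch of that normalization, using example layer names only (importing the module requires the optional torch and matplotlib extras because of the require(...) calls shown above):

from smftools.cli.hmm_adata import _strip_hmm_layer_prefix

# Methbase prefix and _lengths/_merged suffixes are stripped before colormap lookup.
assert _strip_hmm_layer_prefix("GpC_small_accessible_patch_lengths") == "small_accessible_patch"
assert _strip_hmm_layer_prefix("Combined_footprint_merged") == "footprint"
assert _strip_hmm_layer_prefix("CpG_cpg_patch") == "cpg_patch"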